diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,246436 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 35199, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.522969402539845e-05, + "grad_norm": 250.39899745024, + "learning_rate": 0.0, + "loss": 3.4203, + "step": 1 + }, + { + "epoch": 0.0001704593880507969, + "grad_norm": 865.7330776558471, + "learning_rate": 2.840909090909091e-09, + "loss": 7.3037, + "step": 2 + }, + { + "epoch": 0.00025568908207619537, + "grad_norm": 703.3154433533495, + "learning_rate": 5.681818181818182e-09, + "loss": 3.9548, + "step": 3 + }, + { + "epoch": 0.0003409187761015938, + "grad_norm": 284.8908320610401, + "learning_rate": 8.522727272727273e-09, + "loss": 4.8316, + "step": 4 + }, + { + "epoch": 0.00042614847012699224, + "grad_norm": 269.14347436597205, + "learning_rate": 1.1363636363636364e-08, + "loss": 4.7586, + "step": 5 + }, + { + "epoch": 0.0005113781641523907, + "grad_norm": 184.24809562846914, + "learning_rate": 1.4204545454545455e-08, + "loss": 3.4821, + "step": 6 + }, + { + "epoch": 0.0005966078581777891, + "grad_norm": 428.10043255477746, + "learning_rate": 1.7045454545454546e-08, + "loss": 3.4866, + "step": 7 + }, + { + "epoch": 0.0006818375522031876, + "grad_norm": 501.5018563850846, + "learning_rate": 1.9886363636363638e-08, + "loss": 4.1641, + "step": 8 + }, + { + "epoch": 0.000767067246228586, + "grad_norm": 765.1035066197936, + "learning_rate": 2.272727272727273e-08, + "loss": 4.9533, + "step": 9 + }, + { + "epoch": 0.0008522969402539845, + "grad_norm": 174.90445480653963, + "learning_rate": 2.556818181818182e-08, + "loss": 3.0987, + "step": 10 + }, + { + "epoch": 0.000937526634279383, + "grad_norm": 524.6314426556058, + "learning_rate": 2.840909090909091e-08, + "loss": 3.1033, + "step": 11 + }, + { + "epoch": 0.0010227563283047815, + "grad_norm": 335.7889264784564, + "learning_rate": 3.1250000000000005e-08, + "loss": 4.7237, + "step": 12 + }, + { + "epoch": 0.0011079860223301798, + "grad_norm": 276.33698808570415, + "learning_rate": 3.409090909090909e-08, + "loss": 4.1447, + "step": 13 + }, + { + "epoch": 0.0011932157163555782, + "grad_norm": 145.17281158963226, + "learning_rate": 3.693181818181819e-08, + "loss": 3.173, + "step": 14 + }, + { + "epoch": 0.0012784454103809768, + "grad_norm": 349.3439201482225, + "learning_rate": 3.9772727272727275e-08, + "loss": 2.6048, + "step": 15 + }, + { + "epoch": 0.0013636751044063752, + "grad_norm": 196.56176833383043, + "learning_rate": 4.261363636363636e-08, + "loss": 4.562, + "step": 16 + }, + { + "epoch": 0.0014489047984317736, + "grad_norm": 563.267853026754, + "learning_rate": 4.545454545454546e-08, + "loss": 4.4595, + "step": 17 + }, + { + "epoch": 0.001534134492457172, + "grad_norm": 173.32725342388815, + "learning_rate": 4.8295454545454545e-08, + "loss": 3.6125, + "step": 18 + }, + { + "epoch": 0.0016193641864825706, + "grad_norm": 562.5327261494056, + "learning_rate": 5.113636363636364e-08, + "loss": 3.178, + "step": 19 + }, + { + "epoch": 0.001704593880507969, + "grad_norm": 237.19947559966369, + "learning_rate": 5.3977272727272734e-08, + "loss": 4.4753, + "step": 20 + }, + { + "epoch": 0.0017898235745333673, + "grad_norm": 178.65934619254696, + "learning_rate": 5.681818181818182e-08, + "loss": 4.2248, + "step": 21 + }, + { + "epoch": 0.001875053268558766, + "grad_norm": 254.86407528923914, + "learning_rate": 5.965909090909092e-08, + "loss": 4.6877, + "step": 22 + }, + { + "epoch": 0.001960282962584164, + "grad_norm": 150.34498755726975, + "learning_rate": 6.250000000000001e-08, + "loss": 2.6796, + "step": 23 + }, + { + "epoch": 0.002045512656609563, + "grad_norm": 465.828120071364, + "learning_rate": 6.53409090909091e-08, + "loss": 4.0644, + "step": 24 + }, + { + "epoch": 0.0021307423506349613, + "grad_norm": 819.220348337823, + "learning_rate": 6.818181818181819e-08, + "loss": 5.3169, + "step": 25 + }, + { + "epoch": 0.0022159720446603597, + "grad_norm": 348.7896045532364, + "learning_rate": 7.102272727272727e-08, + "loss": 3.7007, + "step": 26 + }, + { + "epoch": 0.002301201738685758, + "grad_norm": 354.0469040244825, + "learning_rate": 7.386363636363637e-08, + "loss": 3.0501, + "step": 27 + }, + { + "epoch": 0.0023864314327111565, + "grad_norm": 414.6043622640884, + "learning_rate": 7.670454545454546e-08, + "loss": 4.6156, + "step": 28 + }, + { + "epoch": 0.002471661126736555, + "grad_norm": 298.0689176019552, + "learning_rate": 7.954545454545455e-08, + "loss": 4.4496, + "step": 29 + }, + { + "epoch": 0.0025568908207619537, + "grad_norm": 134.51653610340216, + "learning_rate": 8.238636363636364e-08, + "loss": 3.2626, + "step": 30 + }, + { + "epoch": 0.002642120514787352, + "grad_norm": 240.02437828396145, + "learning_rate": 8.522727272727273e-08, + "loss": 3.2362, + "step": 31 + }, + { + "epoch": 0.0027273502088127504, + "grad_norm": 148.27797597890276, + "learning_rate": 8.806818181818181e-08, + "loss": 2.2487, + "step": 32 + }, + { + "epoch": 0.002812579902838149, + "grad_norm": 154.71158899730403, + "learning_rate": 9.090909090909091e-08, + "loss": 3.0125, + "step": 33 + }, + { + "epoch": 0.002897809596863547, + "grad_norm": 197.61627280959524, + "learning_rate": 9.375e-08, + "loss": 3.1855, + "step": 34 + }, + { + "epoch": 0.0029830392908889456, + "grad_norm": 192.78031941313122, + "learning_rate": 9.659090909090909e-08, + "loss": 4.0776, + "step": 35 + }, + { + "epoch": 0.003068268984914344, + "grad_norm": 111.67433737127301, + "learning_rate": 9.943181818181819e-08, + "loss": 1.8621, + "step": 36 + }, + { + "epoch": 0.0031534986789397428, + "grad_norm": 104.55280713632509, + "learning_rate": 1.0227272727272728e-07, + "loss": 3.2286, + "step": 37 + }, + { + "epoch": 0.003238728372965141, + "grad_norm": 155.05853473167554, + "learning_rate": 1.0511363636363637e-07, + "loss": 2.5236, + "step": 38 + }, + { + "epoch": 0.0033239580669905395, + "grad_norm": 274.0352423700115, + "learning_rate": 1.0795454545454547e-07, + "loss": 4.8314, + "step": 39 + }, + { + "epoch": 0.003409187761015938, + "grad_norm": 479.50574461940346, + "learning_rate": 1.1079545454545456e-07, + "loss": 5.4615, + "step": 40 + }, + { + "epoch": 0.0034944174550413363, + "grad_norm": 180.20433572688123, + "learning_rate": 1.1363636363636364e-07, + "loss": 3.0576, + "step": 41 + }, + { + "epoch": 0.0035796471490667347, + "grad_norm": 141.83810278608001, + "learning_rate": 1.1647727272727274e-07, + "loss": 2.7546, + "step": 42 + }, + { + "epoch": 0.0036648768430921335, + "grad_norm": 1042.3440724642874, + "learning_rate": 1.1931818181818185e-07, + "loss": 4.3743, + "step": 43 + }, + { + "epoch": 0.003750106537117532, + "grad_norm": 312.19057399955403, + "learning_rate": 1.2215909090909093e-07, + "loss": 3.1987, + "step": 44 + }, + { + "epoch": 0.0038353362311429303, + "grad_norm": 135.391154988186, + "learning_rate": 1.2500000000000002e-07, + "loss": 3.1263, + "step": 45 + }, + { + "epoch": 0.003920565925168328, + "grad_norm": 870.8726666278995, + "learning_rate": 1.278409090909091e-07, + "loss": 5.0233, + "step": 46 + }, + { + "epoch": 0.004005795619193727, + "grad_norm": 246.20236466062025, + "learning_rate": 1.306818181818182e-07, + "loss": 3.4927, + "step": 47 + }, + { + "epoch": 0.004091025313219126, + "grad_norm": 319.73195877632025, + "learning_rate": 1.3352272727272728e-07, + "loss": 5.9589, + "step": 48 + }, + { + "epoch": 0.004176255007244524, + "grad_norm": 293.0426389717749, + "learning_rate": 1.3636363636363637e-07, + "loss": 2.2941, + "step": 49 + }, + { + "epoch": 0.004261484701269923, + "grad_norm": 457.10815885877133, + "learning_rate": 1.3920454545454546e-07, + "loss": 5.5087, + "step": 50 + }, + { + "epoch": 0.004346714395295321, + "grad_norm": 165.6234042454497, + "learning_rate": 1.4204545454545455e-07, + "loss": 2.2669, + "step": 51 + }, + { + "epoch": 0.004431944089320719, + "grad_norm": 566.8132853778014, + "learning_rate": 1.4488636363636364e-07, + "loss": 4.2228, + "step": 52 + }, + { + "epoch": 0.004517173783346118, + "grad_norm": 216.0244419506028, + "learning_rate": 1.4772727272727275e-07, + "loss": 3.6335, + "step": 53 + }, + { + "epoch": 0.004602403477371516, + "grad_norm": 220.45761258993448, + "learning_rate": 1.5056818181818184e-07, + "loss": 3.5245, + "step": 54 + }, + { + "epoch": 0.004687633171396915, + "grad_norm": 201.82422139578733, + "learning_rate": 1.5340909090909093e-07, + "loss": 3.5122, + "step": 55 + }, + { + "epoch": 0.004772862865422313, + "grad_norm": 472.2823612540938, + "learning_rate": 1.5625e-07, + "loss": 4.5874, + "step": 56 + }, + { + "epoch": 0.004858092559447712, + "grad_norm": 178.59301199623857, + "learning_rate": 1.590909090909091e-07, + "loss": 2.7256, + "step": 57 + }, + { + "epoch": 0.00494332225347311, + "grad_norm": 171.03001732880563, + "learning_rate": 1.619318181818182e-07, + "loss": 3.7058, + "step": 58 + }, + { + "epoch": 0.0050285519474985085, + "grad_norm": 154.81312046113513, + "learning_rate": 1.6477272727272728e-07, + "loss": 2.8401, + "step": 59 + }, + { + "epoch": 0.005113781641523907, + "grad_norm": 300.6615171532456, + "learning_rate": 1.676136363636364e-07, + "loss": 4.9595, + "step": 60 + }, + { + "epoch": 0.005199011335549305, + "grad_norm": 211.21436633537988, + "learning_rate": 1.7045454545454545e-07, + "loss": 3.801, + "step": 61 + }, + { + "epoch": 0.005284241029574704, + "grad_norm": 209.55177069671475, + "learning_rate": 1.7329545454545457e-07, + "loss": 4.293, + "step": 62 + }, + { + "epoch": 0.005369470723600102, + "grad_norm": 108.31799977374901, + "learning_rate": 1.7613636363636363e-07, + "loss": 2.5204, + "step": 63 + }, + { + "epoch": 0.005454700417625501, + "grad_norm": 296.17881012715486, + "learning_rate": 1.7897727272727274e-07, + "loss": 4.1312, + "step": 64 + }, + { + "epoch": 0.005539930111650899, + "grad_norm": 137.2389701027683, + "learning_rate": 1.8181818181818183e-07, + "loss": 3.6796, + "step": 65 + }, + { + "epoch": 0.005625159805676298, + "grad_norm": 111.55343366678075, + "learning_rate": 1.8465909090909094e-07, + "loss": 2.2025, + "step": 66 + }, + { + "epoch": 0.005710389499701696, + "grad_norm": 364.8890174610969, + "learning_rate": 1.875e-07, + "loss": 5.374, + "step": 67 + }, + { + "epoch": 0.005795619193727094, + "grad_norm": 452.4817388878714, + "learning_rate": 1.9034090909090912e-07, + "loss": 4.1876, + "step": 68 + }, + { + "epoch": 0.005880848887752493, + "grad_norm": 209.38600113895117, + "learning_rate": 1.9318181818181818e-07, + "loss": 4.2656, + "step": 69 + }, + { + "epoch": 0.005966078581777891, + "grad_norm": 219.65826002523457, + "learning_rate": 1.960227272727273e-07, + "loss": 3.5851, + "step": 70 + }, + { + "epoch": 0.00605130827580329, + "grad_norm": 663.9785048977195, + "learning_rate": 1.9886363636363638e-07, + "loss": 6.6065, + "step": 71 + }, + { + "epoch": 0.006136537969828688, + "grad_norm": 129.3406060572297, + "learning_rate": 2.017045454545455e-07, + "loss": 2.9051, + "step": 72 + }, + { + "epoch": 0.006221767663854087, + "grad_norm": 360.8468419173071, + "learning_rate": 2.0454545454545456e-07, + "loss": 3.8005, + "step": 73 + }, + { + "epoch": 0.0063069973578794855, + "grad_norm": 302.3027301227285, + "learning_rate": 2.0738636363636367e-07, + "loss": 3.5402, + "step": 74 + }, + { + "epoch": 0.0063922270519048835, + "grad_norm": 156.29814213127165, + "learning_rate": 2.1022727272727273e-07, + "loss": 1.6033, + "step": 75 + }, + { + "epoch": 0.006477456745930282, + "grad_norm": 452.43099661016277, + "learning_rate": 2.1306818181818182e-07, + "loss": 4.6577, + "step": 76 + }, + { + "epoch": 0.00656268643995568, + "grad_norm": 140.821522611596, + "learning_rate": 2.1590909090909094e-07, + "loss": 2.8536, + "step": 77 + }, + { + "epoch": 0.006647916133981079, + "grad_norm": 213.75071466316834, + "learning_rate": 2.1875e-07, + "loss": 4.9042, + "step": 78 + }, + { + "epoch": 0.006733145828006477, + "grad_norm": 131.34083624120163, + "learning_rate": 2.215909090909091e-07, + "loss": 3.2104, + "step": 79 + }, + { + "epoch": 0.006818375522031876, + "grad_norm": 781.6205106720345, + "learning_rate": 2.244318181818182e-07, + "loss": 3.7495, + "step": 80 + }, + { + "epoch": 0.006903605216057275, + "grad_norm": 242.82543545463085, + "learning_rate": 2.2727272727272729e-07, + "loss": 4.0819, + "step": 81 + }, + { + "epoch": 0.006988834910082673, + "grad_norm": 77.82138260200352, + "learning_rate": 2.3011363636363637e-07, + "loss": 2.3084, + "step": 82 + }, + { + "epoch": 0.007074064604108071, + "grad_norm": 77.4364134641956, + "learning_rate": 2.329545454545455e-07, + "loss": 1.903, + "step": 83 + }, + { + "epoch": 0.007159294298133469, + "grad_norm": 117.28452561016898, + "learning_rate": 2.3579545454545455e-07, + "loss": 3.6482, + "step": 84 + }, + { + "epoch": 0.007244523992158868, + "grad_norm": 184.54072362268366, + "learning_rate": 2.386363636363637e-07, + "loss": 4.8453, + "step": 85 + }, + { + "epoch": 0.007329753686184267, + "grad_norm": 157.74148797807763, + "learning_rate": 2.414772727272727e-07, + "loss": 3.713, + "step": 86 + }, + { + "epoch": 0.007414983380209665, + "grad_norm": 2238.0704430334918, + "learning_rate": 2.4431818181818187e-07, + "loss": 3.9335, + "step": 87 + }, + { + "epoch": 0.007500213074235064, + "grad_norm": 490.3739075548347, + "learning_rate": 2.471590909090909e-07, + "loss": 3.7453, + "step": 88 + }, + { + "epoch": 0.007585442768260462, + "grad_norm": 141.14080036354113, + "learning_rate": 2.5000000000000004e-07, + "loss": 3.0281, + "step": 89 + }, + { + "epoch": 0.0076706724622858605, + "grad_norm": 2157.6408157114743, + "learning_rate": 2.5284090909090913e-07, + "loss": 3.9894, + "step": 90 + }, + { + "epoch": 0.0077559021563112585, + "grad_norm": 486.6547306669673, + "learning_rate": 2.556818181818182e-07, + "loss": 6.0302, + "step": 91 + }, + { + "epoch": 0.007841131850336656, + "grad_norm": 158.61415469094806, + "learning_rate": 2.585227272727273e-07, + "loss": 2.6344, + "step": 92 + }, + { + "epoch": 0.007926361544362055, + "grad_norm": 488.4048964881571, + "learning_rate": 2.613636363636364e-07, + "loss": 4.422, + "step": 93 + }, + { + "epoch": 0.008011591238387454, + "grad_norm": 189.3151185226685, + "learning_rate": 2.642045454545455e-07, + "loss": 4.5739, + "step": 94 + }, + { + "epoch": 0.008096820932412853, + "grad_norm": 135.76824051355268, + "learning_rate": 2.6704545454545457e-07, + "loss": 2.364, + "step": 95 + }, + { + "epoch": 0.008182050626438252, + "grad_norm": 54.51346868111074, + "learning_rate": 2.6988636363636366e-07, + "loss": 2.3769, + "step": 96 + }, + { + "epoch": 0.008267280320463649, + "grad_norm": 256.39811549715995, + "learning_rate": 2.7272727272727274e-07, + "loss": 4.3316, + "step": 97 + }, + { + "epoch": 0.008352510014489048, + "grad_norm": 171.55619229877723, + "learning_rate": 2.7556818181818183e-07, + "loss": 3.8809, + "step": 98 + }, + { + "epoch": 0.008437739708514446, + "grad_norm": 112.46654049443256, + "learning_rate": 2.784090909090909e-07, + "loss": 2.2481, + "step": 99 + }, + { + "epoch": 0.008522969402539845, + "grad_norm": 271.9575227550222, + "learning_rate": 2.8125e-07, + "loss": 4.7522, + "step": 100 + }, + { + "epoch": 0.008608199096565244, + "grad_norm": 128.61592192949286, + "learning_rate": 2.840909090909091e-07, + "loss": 2.7311, + "step": 101 + }, + { + "epoch": 0.008693428790590641, + "grad_norm": 206.33279141417196, + "learning_rate": 2.8693181818181824e-07, + "loss": 3.2797, + "step": 102 + }, + { + "epoch": 0.00877865848461604, + "grad_norm": 145.88047255681246, + "learning_rate": 2.8977272727272727e-07, + "loss": 4.0818, + "step": 103 + }, + { + "epoch": 0.008863888178641439, + "grad_norm": 206.64820251233053, + "learning_rate": 2.926136363636364e-07, + "loss": 3.2925, + "step": 104 + }, + { + "epoch": 0.008949117872666838, + "grad_norm": 88.71664122767285, + "learning_rate": 2.954545454545455e-07, + "loss": 3.565, + "step": 105 + }, + { + "epoch": 0.009034347566692236, + "grad_norm": 304.1213404166656, + "learning_rate": 2.982954545454546e-07, + "loss": 3.631, + "step": 106 + }, + { + "epoch": 0.009119577260717633, + "grad_norm": 516.053562920911, + "learning_rate": 3.011363636363637e-07, + "loss": 3.7459, + "step": 107 + }, + { + "epoch": 0.009204806954743032, + "grad_norm": 127.73745109482283, + "learning_rate": 3.0397727272727276e-07, + "loss": 4.4899, + "step": 108 + }, + { + "epoch": 0.009290036648768431, + "grad_norm": 128.69222886311763, + "learning_rate": 3.0681818181818185e-07, + "loss": 2.616, + "step": 109 + }, + { + "epoch": 0.00937526634279383, + "grad_norm": 371.4918142479049, + "learning_rate": 3.0965909090909094e-07, + "loss": 4.2648, + "step": 110 + }, + { + "epoch": 0.009460496036819227, + "grad_norm": 156.5778977155587, + "learning_rate": 3.125e-07, + "loss": 2.9905, + "step": 111 + }, + { + "epoch": 0.009545725730844626, + "grad_norm": 480.9793736390648, + "learning_rate": 3.153409090909091e-07, + "loss": 3.5398, + "step": 112 + }, + { + "epoch": 0.009630955424870025, + "grad_norm": 120.05919201605693, + "learning_rate": 3.181818181818182e-07, + "loss": 3.7334, + "step": 113 + }, + { + "epoch": 0.009716185118895423, + "grad_norm": 213.27395149726877, + "learning_rate": 3.2102272727272734e-07, + "loss": 5.7413, + "step": 114 + }, + { + "epoch": 0.009801414812920822, + "grad_norm": 137.77860569520428, + "learning_rate": 3.238636363636364e-07, + "loss": 3.0277, + "step": 115 + }, + { + "epoch": 0.00988664450694622, + "grad_norm": 177.56771977113104, + "learning_rate": 3.2670454545454546e-07, + "loss": 3.3676, + "step": 116 + }, + { + "epoch": 0.009971874200971618, + "grad_norm": 105.72252289469823, + "learning_rate": 3.2954545454545455e-07, + "loss": 2.8491, + "step": 117 + }, + { + "epoch": 0.010057103894997017, + "grad_norm": 160.4422192436026, + "learning_rate": 3.323863636363637e-07, + "loss": 3.6212, + "step": 118 + }, + { + "epoch": 0.010142333589022416, + "grad_norm": 134.5184903994595, + "learning_rate": 3.352272727272728e-07, + "loss": 2.8532, + "step": 119 + }, + { + "epoch": 0.010227563283047815, + "grad_norm": 127.26853386097503, + "learning_rate": 3.380681818181818e-07, + "loss": 2.808, + "step": 120 + }, + { + "epoch": 0.010312792977073212, + "grad_norm": 87.60979661289592, + "learning_rate": 3.409090909090909e-07, + "loss": 2.76, + "step": 121 + }, + { + "epoch": 0.01039802267109861, + "grad_norm": 1833.8719919590235, + "learning_rate": 3.4375000000000004e-07, + "loss": 3.4267, + "step": 122 + }, + { + "epoch": 0.01048325236512401, + "grad_norm": 123.08554759351154, + "learning_rate": 3.4659090909090913e-07, + "loss": 2.9201, + "step": 123 + }, + { + "epoch": 0.010568482059149408, + "grad_norm": 213.34581393247834, + "learning_rate": 3.494318181818182e-07, + "loss": 3.6621, + "step": 124 + }, + { + "epoch": 0.010653711753174805, + "grad_norm": 237.22849010905216, + "learning_rate": 3.5227272727272725e-07, + "loss": 4.8206, + "step": 125 + }, + { + "epoch": 0.010738941447200204, + "grad_norm": 116.38128682785414, + "learning_rate": 3.5511363636363645e-07, + "loss": 4.3699, + "step": 126 + }, + { + "epoch": 0.010824171141225603, + "grad_norm": 339.9245794022992, + "learning_rate": 3.579545454545455e-07, + "loss": 5.8987, + "step": 127 + }, + { + "epoch": 0.010909400835251002, + "grad_norm": 637.4881839456355, + "learning_rate": 3.6079545454545457e-07, + "loss": 4.0648, + "step": 128 + }, + { + "epoch": 0.0109946305292764, + "grad_norm": 273.18276934943333, + "learning_rate": 3.6363636363636366e-07, + "loss": 4.7066, + "step": 129 + }, + { + "epoch": 0.011079860223301798, + "grad_norm": 579.2669887701163, + "learning_rate": 3.6647727272727275e-07, + "loss": 4.5571, + "step": 130 + }, + { + "epoch": 0.011165089917327196, + "grad_norm": 193.08694693814633, + "learning_rate": 3.693181818181819e-07, + "loss": 2.8432, + "step": 131 + }, + { + "epoch": 0.011250319611352595, + "grad_norm": 184.38194620519573, + "learning_rate": 3.721590909090909e-07, + "loss": 4.1222, + "step": 132 + }, + { + "epoch": 0.011335549305377994, + "grad_norm": 190.88328772802413, + "learning_rate": 3.75e-07, + "loss": 3.7417, + "step": 133 + }, + { + "epoch": 0.011420778999403393, + "grad_norm": 90.3847720426298, + "learning_rate": 3.778409090909091e-07, + "loss": 3.2545, + "step": 134 + }, + { + "epoch": 0.01150600869342879, + "grad_norm": 71.82773976472583, + "learning_rate": 3.8068181818181824e-07, + "loss": 2.071, + "step": 135 + }, + { + "epoch": 0.011591238387454189, + "grad_norm": 161.95995879048314, + "learning_rate": 3.835227272727273e-07, + "loss": 4.2778, + "step": 136 + }, + { + "epoch": 0.011676468081479588, + "grad_norm": 187.48825905733386, + "learning_rate": 3.8636363636363636e-07, + "loss": 4.3622, + "step": 137 + }, + { + "epoch": 0.011761697775504986, + "grad_norm": 124.41058801411569, + "learning_rate": 3.8920454545454545e-07, + "loss": 3.1838, + "step": 138 + }, + { + "epoch": 0.011846927469530385, + "grad_norm": 185.63263845707172, + "learning_rate": 3.920454545454546e-07, + "loss": 4.814, + "step": 139 + }, + { + "epoch": 0.011932157163555782, + "grad_norm": 102.69556187102026, + "learning_rate": 3.948863636363637e-07, + "loss": 3.3637, + "step": 140 + }, + { + "epoch": 0.012017386857581181, + "grad_norm": 136.86986154209265, + "learning_rate": 3.9772727272727276e-07, + "loss": 2.7404, + "step": 141 + }, + { + "epoch": 0.01210261655160658, + "grad_norm": 213.19471148456836, + "learning_rate": 4.0056818181818185e-07, + "loss": 6.0191, + "step": 142 + }, + { + "epoch": 0.012187846245631979, + "grad_norm": 211.81190797985306, + "learning_rate": 4.03409090909091e-07, + "loss": 4.305, + "step": 143 + }, + { + "epoch": 0.012273075939657376, + "grad_norm": 113.18040702893447, + "learning_rate": 4.0625000000000003e-07, + "loss": 2.356, + "step": 144 + }, + { + "epoch": 0.012358305633682775, + "grad_norm": 226.46182284944328, + "learning_rate": 4.090909090909091e-07, + "loss": 3.8956, + "step": 145 + }, + { + "epoch": 0.012443535327708173, + "grad_norm": 113.32808769565996, + "learning_rate": 4.119318181818182e-07, + "loss": 3.3759, + "step": 146 + }, + { + "epoch": 0.012528765021733572, + "grad_norm": 154.2450311934641, + "learning_rate": 4.1477272727272734e-07, + "loss": 2.5956, + "step": 147 + }, + { + "epoch": 0.012613994715758971, + "grad_norm": 96.4977331392678, + "learning_rate": 4.1761363636363643e-07, + "loss": 1.7419, + "step": 148 + }, + { + "epoch": 0.012699224409784368, + "grad_norm": 98.74182216050134, + "learning_rate": 4.2045454545454547e-07, + "loss": 2.9918, + "step": 149 + }, + { + "epoch": 0.012784454103809767, + "grad_norm": 106.50842172969575, + "learning_rate": 4.2329545454545455e-07, + "loss": 2.5216, + "step": 150 + }, + { + "epoch": 0.012869683797835166, + "grad_norm": 158.19242505931095, + "learning_rate": 4.2613636363636364e-07, + "loss": 3.0126, + "step": 151 + }, + { + "epoch": 0.012954913491860565, + "grad_norm": 188.24605310489204, + "learning_rate": 4.289772727272728e-07, + "loss": 3.3236, + "step": 152 + }, + { + "epoch": 0.013040143185885963, + "grad_norm": 103.90852816167765, + "learning_rate": 4.3181818181818187e-07, + "loss": 3.4941, + "step": 153 + }, + { + "epoch": 0.01312537287991136, + "grad_norm": 130.37165824134095, + "learning_rate": 4.3465909090909096e-07, + "loss": 2.9359, + "step": 154 + }, + { + "epoch": 0.01321060257393676, + "grad_norm": 298.132041460007, + "learning_rate": 4.375e-07, + "loss": 3.9998, + "step": 155 + }, + { + "epoch": 0.013295832267962158, + "grad_norm": 101.37329572768402, + "learning_rate": 4.4034090909090913e-07, + "loss": 3.6757, + "step": 156 + }, + { + "epoch": 0.013381061961987557, + "grad_norm": 131.76094302229205, + "learning_rate": 4.431818181818182e-07, + "loss": 3.3352, + "step": 157 + }, + { + "epoch": 0.013466291656012954, + "grad_norm": 115.44103687535636, + "learning_rate": 4.460227272727273e-07, + "loss": 2.3719, + "step": 158 + }, + { + "epoch": 0.013551521350038353, + "grad_norm": 168.6520102536206, + "learning_rate": 4.488636363636364e-07, + "loss": 3.0622, + "step": 159 + }, + { + "epoch": 0.013636751044063752, + "grad_norm": 78.93609514270247, + "learning_rate": 4.5170454545454554e-07, + "loss": 1.7851, + "step": 160 + }, + { + "epoch": 0.01372198073808915, + "grad_norm": 109.5069583244409, + "learning_rate": 4.5454545454545457e-07, + "loss": 3.5207, + "step": 161 + }, + { + "epoch": 0.01380721043211455, + "grad_norm": 112.77535267763146, + "learning_rate": 4.5738636363636366e-07, + "loss": 2.8013, + "step": 162 + }, + { + "epoch": 0.013892440126139946, + "grad_norm": 132.17202380331707, + "learning_rate": 4.6022727272727275e-07, + "loss": 3.2489, + "step": 163 + }, + { + "epoch": 0.013977669820165345, + "grad_norm": 129.18610099978332, + "learning_rate": 4.630681818181819e-07, + "loss": 3.7647, + "step": 164 + }, + { + "epoch": 0.014062899514190744, + "grad_norm": 230.8525537471209, + "learning_rate": 4.65909090909091e-07, + "loss": 4.208, + "step": 165 + }, + { + "epoch": 0.014148129208216143, + "grad_norm": 191.69690628077757, + "learning_rate": 4.6875000000000006e-07, + "loss": 4.2222, + "step": 166 + }, + { + "epoch": 0.014233358902241542, + "grad_norm": 104.63203683758915, + "learning_rate": 4.715909090909091e-07, + "loss": 3.1872, + "step": 167 + }, + { + "epoch": 0.014318588596266939, + "grad_norm": 222.699510414806, + "learning_rate": 4.744318181818182e-07, + "loss": 5.5839, + "step": 168 + }, + { + "epoch": 0.014403818290292338, + "grad_norm": 69.14515201992734, + "learning_rate": 4.772727272727274e-07, + "loss": 3.0932, + "step": 169 + }, + { + "epoch": 0.014489047984317736, + "grad_norm": 319.5665857475932, + "learning_rate": 4.801136363636364e-07, + "loss": 4.6559, + "step": 170 + }, + { + "epoch": 0.014574277678343135, + "grad_norm": 146.86698165713503, + "learning_rate": 4.829545454545455e-07, + "loss": 3.3999, + "step": 171 + }, + { + "epoch": 0.014659507372368534, + "grad_norm": 125.67015383318824, + "learning_rate": 4.857954545454546e-07, + "loss": 2.9174, + "step": 172 + }, + { + "epoch": 0.014744737066393931, + "grad_norm": 89.50920998901266, + "learning_rate": 4.886363636363637e-07, + "loss": 2.955, + "step": 173 + }, + { + "epoch": 0.01482996676041933, + "grad_norm": 118.67530584049535, + "learning_rate": 4.914772727272728e-07, + "loss": 2.5027, + "step": 174 + }, + { + "epoch": 0.014915196454444729, + "grad_norm": 101.61686584660058, + "learning_rate": 4.943181818181818e-07, + "loss": 3.9979, + "step": 175 + }, + { + "epoch": 0.015000426148470128, + "grad_norm": 132.57156819212196, + "learning_rate": 4.971590909090909e-07, + "loss": 3.6953, + "step": 176 + }, + { + "epoch": 0.015085655842495525, + "grad_norm": 87.68556337929829, + "learning_rate": 5.000000000000001e-07, + "loss": 2.9968, + "step": 177 + }, + { + "epoch": 0.015170885536520923, + "grad_norm": 119.89483499911049, + "learning_rate": 5.028409090909091e-07, + "loss": 3.3074, + "step": 178 + }, + { + "epoch": 0.015256115230546322, + "grad_norm": 73.30859869893742, + "learning_rate": 5.056818181818183e-07, + "loss": 3.0008, + "step": 179 + }, + { + "epoch": 0.015341344924571721, + "grad_norm": 93.4157760449364, + "learning_rate": 5.085227272727273e-07, + "loss": 3.3114, + "step": 180 + }, + { + "epoch": 0.01542657461859712, + "grad_norm": 115.11408515510475, + "learning_rate": 5.113636363636364e-07, + "loss": 3.0375, + "step": 181 + }, + { + "epoch": 0.015511804312622517, + "grad_norm": 173.25293097125837, + "learning_rate": 5.142045454545455e-07, + "loss": 3.1403, + "step": 182 + }, + { + "epoch": 0.015597034006647916, + "grad_norm": 87.36145341917525, + "learning_rate": 5.170454545454546e-07, + "loss": 2.8344, + "step": 183 + }, + { + "epoch": 0.015682263700673313, + "grad_norm": 169.83859138487253, + "learning_rate": 5.198863636363636e-07, + "loss": 4.9563, + "step": 184 + }, + { + "epoch": 0.01576749339469871, + "grad_norm": 173.75913790294277, + "learning_rate": 5.227272727272728e-07, + "loss": 4.4477, + "step": 185 + }, + { + "epoch": 0.01585272308872411, + "grad_norm": 141.69159419996714, + "learning_rate": 5.255681818181819e-07, + "loss": 3.2411, + "step": 186 + }, + { + "epoch": 0.01593795278274951, + "grad_norm": 82.75724077329609, + "learning_rate": 5.28409090909091e-07, + "loss": 3.3473, + "step": 187 + }, + { + "epoch": 0.016023182476774908, + "grad_norm": 145.88817387840004, + "learning_rate": 5.3125e-07, + "loss": 3.635, + "step": 188 + }, + { + "epoch": 0.016108412170800307, + "grad_norm": 158.60488240359567, + "learning_rate": 5.340909090909091e-07, + "loss": 4.2251, + "step": 189 + }, + { + "epoch": 0.016193641864825706, + "grad_norm": 86.97743322534704, + "learning_rate": 5.369318181818183e-07, + "loss": 3.0065, + "step": 190 + }, + { + "epoch": 0.016278871558851105, + "grad_norm": 120.65916616067665, + "learning_rate": 5.397727272727273e-07, + "loss": 3.1866, + "step": 191 + }, + { + "epoch": 0.016364101252876503, + "grad_norm": 209.10707864246, + "learning_rate": 5.426136363636363e-07, + "loss": 4.107, + "step": 192 + }, + { + "epoch": 0.016449330946901902, + "grad_norm": 156.37179043021646, + "learning_rate": 5.454545454545455e-07, + "loss": 2.6414, + "step": 193 + }, + { + "epoch": 0.016534560640927298, + "grad_norm": 132.04199013519926, + "learning_rate": 5.482954545454546e-07, + "loss": 2.7152, + "step": 194 + }, + { + "epoch": 0.016619790334952696, + "grad_norm": 120.86916803187114, + "learning_rate": 5.511363636363637e-07, + "loss": 2.8124, + "step": 195 + }, + { + "epoch": 0.016705020028978095, + "grad_norm": 258.1585010233339, + "learning_rate": 5.539772727272728e-07, + "loss": 4.7897, + "step": 196 + }, + { + "epoch": 0.016790249723003494, + "grad_norm": 162.05053510844445, + "learning_rate": 5.568181818181818e-07, + "loss": 2.9154, + "step": 197 + }, + { + "epoch": 0.016875479417028893, + "grad_norm": 118.13450169739554, + "learning_rate": 5.59659090909091e-07, + "loss": 2.643, + "step": 198 + }, + { + "epoch": 0.01696070911105429, + "grad_norm": 71.23599685754459, + "learning_rate": 5.625e-07, + "loss": 1.8593, + "step": 199 + }, + { + "epoch": 0.01704593880507969, + "grad_norm": 105.28376534343533, + "learning_rate": 5.653409090909092e-07, + "loss": 2.5148, + "step": 200 + }, + { + "epoch": 0.01713116849910509, + "grad_norm": 106.76770571020883, + "learning_rate": 5.681818181818182e-07, + "loss": 2.9666, + "step": 201 + }, + { + "epoch": 0.017216398193130488, + "grad_norm": 116.60794484055636, + "learning_rate": 5.710227272727273e-07, + "loss": 2.3055, + "step": 202 + }, + { + "epoch": 0.017301627887155883, + "grad_norm": 83.14166853997868, + "learning_rate": 5.738636363636365e-07, + "loss": 3.9487, + "step": 203 + }, + { + "epoch": 0.017386857581181282, + "grad_norm": 62.995458880103904, + "learning_rate": 5.767045454545455e-07, + "loss": 1.799, + "step": 204 + }, + { + "epoch": 0.01747208727520668, + "grad_norm": 157.9459133430821, + "learning_rate": 5.795454545454545e-07, + "loss": 3.5182, + "step": 205 + }, + { + "epoch": 0.01755731696923208, + "grad_norm": 155.01872989565928, + "learning_rate": 5.823863636363637e-07, + "loss": 2.8654, + "step": 206 + }, + { + "epoch": 0.01764254666325748, + "grad_norm": 89.46911848293267, + "learning_rate": 5.852272727272728e-07, + "loss": 2.606, + "step": 207 + }, + { + "epoch": 0.017727776357282878, + "grad_norm": 160.58021892039184, + "learning_rate": 5.880681818181819e-07, + "loss": 5.0747, + "step": 208 + }, + { + "epoch": 0.017813006051308276, + "grad_norm": 134.77652878003556, + "learning_rate": 5.90909090909091e-07, + "loss": 2.9496, + "step": 209 + }, + { + "epoch": 0.017898235745333675, + "grad_norm": 116.34893571543978, + "learning_rate": 5.9375e-07, + "loss": 3.5302, + "step": 210 + }, + { + "epoch": 0.017983465439359074, + "grad_norm": 91.08364097010406, + "learning_rate": 5.965909090909092e-07, + "loss": 2.9671, + "step": 211 + }, + { + "epoch": 0.018068695133384473, + "grad_norm": 198.68674014121265, + "learning_rate": 5.994318181818182e-07, + "loss": 3.1672, + "step": 212 + }, + { + "epoch": 0.018153924827409868, + "grad_norm": 166.831588592127, + "learning_rate": 6.022727272727273e-07, + "loss": 3.7037, + "step": 213 + }, + { + "epoch": 0.018239154521435267, + "grad_norm": 157.249535194454, + "learning_rate": 6.051136363636364e-07, + "loss": 3.0456, + "step": 214 + }, + { + "epoch": 0.018324384215460666, + "grad_norm": 75.54951985141956, + "learning_rate": 6.079545454545455e-07, + "loss": 1.8695, + "step": 215 + }, + { + "epoch": 0.018409613909486065, + "grad_norm": 97.9508268498845, + "learning_rate": 6.107954545454546e-07, + "loss": 2.762, + "step": 216 + }, + { + "epoch": 0.018494843603511463, + "grad_norm": 107.84701320365579, + "learning_rate": 6.136363636363637e-07, + "loss": 3.1795, + "step": 217 + }, + { + "epoch": 0.018580073297536862, + "grad_norm": 98.28612107183328, + "learning_rate": 6.164772727272727e-07, + "loss": 2.8586, + "step": 218 + }, + { + "epoch": 0.01866530299156226, + "grad_norm": 136.29619857403745, + "learning_rate": 6.193181818181819e-07, + "loss": 3.1864, + "step": 219 + }, + { + "epoch": 0.01875053268558766, + "grad_norm": 83.89771569386207, + "learning_rate": 6.22159090909091e-07, + "loss": 3.8209, + "step": 220 + }, + { + "epoch": 0.01883576237961306, + "grad_norm": 99.56113216645842, + "learning_rate": 6.25e-07, + "loss": 1.8849, + "step": 221 + }, + { + "epoch": 0.018920992073638454, + "grad_norm": 90.83546602770525, + "learning_rate": 6.278409090909092e-07, + "loss": 3.5464, + "step": 222 + }, + { + "epoch": 0.019006221767663853, + "grad_norm": 116.13299638824736, + "learning_rate": 6.306818181818182e-07, + "loss": 3.0903, + "step": 223 + }, + { + "epoch": 0.01909145146168925, + "grad_norm": 234.9559058387995, + "learning_rate": 6.335227272727274e-07, + "loss": 4.2137, + "step": 224 + }, + { + "epoch": 0.01917668115571465, + "grad_norm": 176.36715301762678, + "learning_rate": 6.363636363636364e-07, + "loss": 4.2306, + "step": 225 + }, + { + "epoch": 0.01926191084974005, + "grad_norm": 117.50755065349809, + "learning_rate": 6.392045454545455e-07, + "loss": 3.4698, + "step": 226 + }, + { + "epoch": 0.019347140543765448, + "grad_norm": 108.0108516298747, + "learning_rate": 6.420454545454547e-07, + "loss": 2.7793, + "step": 227 + }, + { + "epoch": 0.019432370237790847, + "grad_norm": 54.75734897075883, + "learning_rate": 6.448863636363636e-07, + "loss": 1.7847, + "step": 228 + }, + { + "epoch": 0.019517599931816246, + "grad_norm": 89.20957211253213, + "learning_rate": 6.477272727272728e-07, + "loss": 3.0873, + "step": 229 + }, + { + "epoch": 0.019602829625841645, + "grad_norm": 152.82549243248533, + "learning_rate": 6.505681818181819e-07, + "loss": 4.4137, + "step": 230 + }, + { + "epoch": 0.019688059319867043, + "grad_norm": 110.93119895402604, + "learning_rate": 6.534090909090909e-07, + "loss": 3.6103, + "step": 231 + }, + { + "epoch": 0.01977328901389244, + "grad_norm": 69.43607725605754, + "learning_rate": 6.562500000000001e-07, + "loss": 2.2269, + "step": 232 + }, + { + "epoch": 0.019858518707917838, + "grad_norm": 802.9420671983883, + "learning_rate": 6.590909090909091e-07, + "loss": 4.6882, + "step": 233 + }, + { + "epoch": 0.019943748401943236, + "grad_norm": 97.54810094924865, + "learning_rate": 6.619318181818182e-07, + "loss": 3.5406, + "step": 234 + }, + { + "epoch": 0.020028978095968635, + "grad_norm": 99.20129871851842, + "learning_rate": 6.647727272727274e-07, + "loss": 2.5636, + "step": 235 + }, + { + "epoch": 0.020114207789994034, + "grad_norm": 167.0447320355863, + "learning_rate": 6.676136363636364e-07, + "loss": 3.6537, + "step": 236 + }, + { + "epoch": 0.020199437484019433, + "grad_norm": 150.41299278875604, + "learning_rate": 6.704545454545456e-07, + "loss": 3.5528, + "step": 237 + }, + { + "epoch": 0.02028466717804483, + "grad_norm": 135.67008069106157, + "learning_rate": 6.732954545454547e-07, + "loss": 2.34, + "step": 238 + }, + { + "epoch": 0.02036989687207023, + "grad_norm": 99.92390256898186, + "learning_rate": 6.761363636363636e-07, + "loss": 2.8297, + "step": 239 + }, + { + "epoch": 0.02045512656609563, + "grad_norm": 88.07616421123917, + "learning_rate": 6.789772727272728e-07, + "loss": 2.6479, + "step": 240 + }, + { + "epoch": 0.020540356260121025, + "grad_norm": 113.41573391083305, + "learning_rate": 6.818181818181818e-07, + "loss": 2.4778, + "step": 241 + }, + { + "epoch": 0.020625585954146423, + "grad_norm": 111.84753307656433, + "learning_rate": 6.84659090909091e-07, + "loss": 2.2514, + "step": 242 + }, + { + "epoch": 0.020710815648171822, + "grad_norm": 180.77427357382007, + "learning_rate": 6.875000000000001e-07, + "loss": 3.328, + "step": 243 + }, + { + "epoch": 0.02079604534219722, + "grad_norm": 205.89379588992253, + "learning_rate": 6.903409090909091e-07, + "loss": 3.3767, + "step": 244 + }, + { + "epoch": 0.02088127503622262, + "grad_norm": 108.54451059354766, + "learning_rate": 6.931818181818183e-07, + "loss": 2.7011, + "step": 245 + }, + { + "epoch": 0.02096650473024802, + "grad_norm": 122.34411316611391, + "learning_rate": 6.960227272727273e-07, + "loss": 3.6, + "step": 246 + }, + { + "epoch": 0.021051734424273418, + "grad_norm": 118.04813855457634, + "learning_rate": 6.988636363636364e-07, + "loss": 3.0139, + "step": 247 + }, + { + "epoch": 0.021136964118298816, + "grad_norm": 78.93696606988901, + "learning_rate": 7.017045454545456e-07, + "loss": 3.03, + "step": 248 + }, + { + "epoch": 0.021222193812324215, + "grad_norm": 84.32254565656888, + "learning_rate": 7.045454545454545e-07, + "loss": 3.3966, + "step": 249 + }, + { + "epoch": 0.02130742350634961, + "grad_norm": 557.1054063864275, + "learning_rate": 7.073863636363638e-07, + "loss": 4.397, + "step": 250 + }, + { + "epoch": 0.02139265320037501, + "grad_norm": 154.97503596262754, + "learning_rate": 7.102272727272729e-07, + "loss": 3.4117, + "step": 251 + }, + { + "epoch": 0.021477882894400408, + "grad_norm": 133.95070097634658, + "learning_rate": 7.130681818181818e-07, + "loss": 3.6256, + "step": 252 + }, + { + "epoch": 0.021563112588425807, + "grad_norm": 102.33663253987766, + "learning_rate": 7.15909090909091e-07, + "loss": 2.3831, + "step": 253 + }, + { + "epoch": 0.021648342282451206, + "grad_norm": 137.81550696551574, + "learning_rate": 7.1875e-07, + "loss": 3.3265, + "step": 254 + }, + { + "epoch": 0.021733571976476605, + "grad_norm": 255.99090520392008, + "learning_rate": 7.215909090909091e-07, + "loss": 3.6007, + "step": 255 + }, + { + "epoch": 0.021818801670502003, + "grad_norm": 105.22126504212339, + "learning_rate": 7.244318181818183e-07, + "loss": 3.2848, + "step": 256 + }, + { + "epoch": 0.021904031364527402, + "grad_norm": 188.57079712485722, + "learning_rate": 7.272727272727273e-07, + "loss": 4.4714, + "step": 257 + }, + { + "epoch": 0.0219892610585528, + "grad_norm": 220.80776853319188, + "learning_rate": 7.301136363636365e-07, + "loss": 2.9887, + "step": 258 + }, + { + "epoch": 0.0220744907525782, + "grad_norm": 108.51283701061449, + "learning_rate": 7.329545454545455e-07, + "loss": 3.1548, + "step": 259 + }, + { + "epoch": 0.022159720446603595, + "grad_norm": 101.8426877891466, + "learning_rate": 7.357954545454546e-07, + "loss": 2.9659, + "step": 260 + }, + { + "epoch": 0.022244950140628994, + "grad_norm": 106.01963092187486, + "learning_rate": 7.386363636363638e-07, + "loss": 2.3498, + "step": 261 + }, + { + "epoch": 0.022330179834654393, + "grad_norm": 117.00546569684656, + "learning_rate": 7.414772727272727e-07, + "loss": 2.5858, + "step": 262 + }, + { + "epoch": 0.02241540952867979, + "grad_norm": 112.62822265406098, + "learning_rate": 7.443181818181818e-07, + "loss": 3.0584, + "step": 263 + }, + { + "epoch": 0.02250063922270519, + "grad_norm": 114.27390702055794, + "learning_rate": 7.47159090909091e-07, + "loss": 3.4998, + "step": 264 + }, + { + "epoch": 0.02258586891673059, + "grad_norm": 78.9166411689089, + "learning_rate": 7.5e-07, + "loss": 3.2728, + "step": 265 + }, + { + "epoch": 0.022671098610755988, + "grad_norm": 285.0862407784301, + "learning_rate": 7.528409090909092e-07, + "loss": 4.4491, + "step": 266 + }, + { + "epoch": 0.022756328304781387, + "grad_norm": 93.68396559657094, + "learning_rate": 7.556818181818182e-07, + "loss": 2.4694, + "step": 267 + }, + { + "epoch": 0.022841557998806786, + "grad_norm": 97.09768505486525, + "learning_rate": 7.585227272727273e-07, + "loss": 3.2937, + "step": 268 + }, + { + "epoch": 0.02292678769283218, + "grad_norm": 317.95479378199315, + "learning_rate": 7.613636363636365e-07, + "loss": 5.0972, + "step": 269 + }, + { + "epoch": 0.02301201738685758, + "grad_norm": 274.5015348056683, + "learning_rate": 7.642045454545455e-07, + "loss": 3.2461, + "step": 270 + }, + { + "epoch": 0.02309724708088298, + "grad_norm": 145.78952423352203, + "learning_rate": 7.670454545454547e-07, + "loss": 3.6922, + "step": 271 + }, + { + "epoch": 0.023182476774908378, + "grad_norm": 127.50937838717667, + "learning_rate": 7.698863636363638e-07, + "loss": 3.2375, + "step": 272 + }, + { + "epoch": 0.023267706468933776, + "grad_norm": 106.18523431475656, + "learning_rate": 7.727272727272727e-07, + "loss": 2.7136, + "step": 273 + }, + { + "epoch": 0.023352936162959175, + "grad_norm": 117.20193924375774, + "learning_rate": 7.755681818181819e-07, + "loss": 3.4523, + "step": 274 + }, + { + "epoch": 0.023438165856984574, + "grad_norm": 80.40918305227156, + "learning_rate": 7.784090909090909e-07, + "loss": 2.3453, + "step": 275 + }, + { + "epoch": 0.023523395551009973, + "grad_norm": 91.91123336832406, + "learning_rate": 7.8125e-07, + "loss": 3.291, + "step": 276 + }, + { + "epoch": 0.02360862524503537, + "grad_norm": 119.38442538905844, + "learning_rate": 7.840909090909092e-07, + "loss": 3.6034, + "step": 277 + }, + { + "epoch": 0.02369385493906077, + "grad_norm": 171.61145645952098, + "learning_rate": 7.869318181818182e-07, + "loss": 3.0601, + "step": 278 + }, + { + "epoch": 0.023779084633086166, + "grad_norm": 230.02195892956274, + "learning_rate": 7.897727272727274e-07, + "loss": 4.024, + "step": 279 + }, + { + "epoch": 0.023864314327111565, + "grad_norm": 187.59041525037634, + "learning_rate": 7.926136363636364e-07, + "loss": 3.7658, + "step": 280 + }, + { + "epoch": 0.023949544021136963, + "grad_norm": 139.45836780751756, + "learning_rate": 7.954545454545455e-07, + "loss": 3.1615, + "step": 281 + }, + { + "epoch": 0.024034773715162362, + "grad_norm": 80.73637757928243, + "learning_rate": 7.982954545454547e-07, + "loss": 2.5919, + "step": 282 + }, + { + "epoch": 0.02412000340918776, + "grad_norm": 93.83715686505812, + "learning_rate": 8.011363636363637e-07, + "loss": 3.0516, + "step": 283 + }, + { + "epoch": 0.02420523310321316, + "grad_norm": 83.58578799479228, + "learning_rate": 8.039772727272728e-07, + "loss": 3.5779, + "step": 284 + }, + { + "epoch": 0.02429046279723856, + "grad_norm": 132.56457711432637, + "learning_rate": 8.06818181818182e-07, + "loss": 2.8612, + "step": 285 + }, + { + "epoch": 0.024375692491263957, + "grad_norm": 112.1928892435923, + "learning_rate": 8.096590909090909e-07, + "loss": 3.3557, + "step": 286 + }, + { + "epoch": 0.024460922185289356, + "grad_norm": 115.394164354446, + "learning_rate": 8.125000000000001e-07, + "loss": 2.8302, + "step": 287 + }, + { + "epoch": 0.02454615187931475, + "grad_norm": 141.26927115279668, + "learning_rate": 8.153409090909091e-07, + "loss": 3.6229, + "step": 288 + }, + { + "epoch": 0.02463138157334015, + "grad_norm": 77.8172827844611, + "learning_rate": 8.181818181818182e-07, + "loss": 2.0106, + "step": 289 + }, + { + "epoch": 0.02471661126736555, + "grad_norm": 118.68707468198437, + "learning_rate": 8.210227272727274e-07, + "loss": 3.3857, + "step": 290 + }, + { + "epoch": 0.024801840961390948, + "grad_norm": 142.78946352438675, + "learning_rate": 8.238636363636364e-07, + "loss": 3.68, + "step": 291 + }, + { + "epoch": 0.024887070655416347, + "grad_norm": 165.84826793001233, + "learning_rate": 8.267045454545455e-07, + "loss": 3.1377, + "step": 292 + }, + { + "epoch": 0.024972300349441746, + "grad_norm": 104.5909981997227, + "learning_rate": 8.295454545454547e-07, + "loss": 2.6229, + "step": 293 + }, + { + "epoch": 0.025057530043467145, + "grad_norm": 141.0423516721485, + "learning_rate": 8.323863636363637e-07, + "loss": 2.7321, + "step": 294 + }, + { + "epoch": 0.025142759737492543, + "grad_norm": 101.57602371699429, + "learning_rate": 8.352272727272729e-07, + "loss": 2.6513, + "step": 295 + }, + { + "epoch": 0.025227989431517942, + "grad_norm": 87.44861458340363, + "learning_rate": 8.380681818181818e-07, + "loss": 2.8351, + "step": 296 + }, + { + "epoch": 0.02531321912554334, + "grad_norm": 65.71808928442108, + "learning_rate": 8.409090909090909e-07, + "loss": 2.7239, + "step": 297 + }, + { + "epoch": 0.025398448819568736, + "grad_norm": 142.06205196407333, + "learning_rate": 8.437500000000001e-07, + "loss": 3.1547, + "step": 298 + }, + { + "epoch": 0.025483678513594135, + "grad_norm": 132.1969885417548, + "learning_rate": 8.465909090909091e-07, + "loss": 3.1638, + "step": 299 + }, + { + "epoch": 0.025568908207619534, + "grad_norm": 144.17049119736512, + "learning_rate": 8.494318181818182e-07, + "loss": 3.2337, + "step": 300 + }, + { + "epoch": 0.025654137901644933, + "grad_norm": 325.5899332693174, + "learning_rate": 8.522727272727273e-07, + "loss": 4.1324, + "step": 301 + }, + { + "epoch": 0.02573936759567033, + "grad_norm": 145.62202537223223, + "learning_rate": 8.551136363636364e-07, + "loss": 3.7925, + "step": 302 + }, + { + "epoch": 0.02582459728969573, + "grad_norm": 142.3040657555028, + "learning_rate": 8.579545454545456e-07, + "loss": 3.7486, + "step": 303 + }, + { + "epoch": 0.02590982698372113, + "grad_norm": 108.6959352738307, + "learning_rate": 8.607954545454546e-07, + "loss": 2.6565, + "step": 304 + }, + { + "epoch": 0.025995056677746528, + "grad_norm": 104.60909775663275, + "learning_rate": 8.636363636363637e-07, + "loss": 3.5538, + "step": 305 + }, + { + "epoch": 0.026080286371771927, + "grad_norm": 82.01908650637719, + "learning_rate": 8.664772727272729e-07, + "loss": 2.3359, + "step": 306 + }, + { + "epoch": 0.026165516065797322, + "grad_norm": 114.4941056578326, + "learning_rate": 8.693181818181819e-07, + "loss": 3.4768, + "step": 307 + }, + { + "epoch": 0.02625074575982272, + "grad_norm": 74.23187035372301, + "learning_rate": 8.721590909090911e-07, + "loss": 1.1381, + "step": 308 + }, + { + "epoch": 0.02633597545384812, + "grad_norm": 70.74213483482625, + "learning_rate": 8.75e-07, + "loss": 3.1576, + "step": 309 + }, + { + "epoch": 0.02642120514787352, + "grad_norm": 125.41863450208736, + "learning_rate": 8.778409090909091e-07, + "loss": 3.2448, + "step": 310 + }, + { + "epoch": 0.026506434841898917, + "grad_norm": 88.34512555792544, + "learning_rate": 8.806818181818183e-07, + "loss": 2.211, + "step": 311 + }, + { + "epoch": 0.026591664535924316, + "grad_norm": 165.38604064192657, + "learning_rate": 8.835227272727273e-07, + "loss": 3.2701, + "step": 312 + }, + { + "epoch": 0.026676894229949715, + "grad_norm": 74.6703945649058, + "learning_rate": 8.863636363636364e-07, + "loss": 3.3581, + "step": 313 + }, + { + "epoch": 0.026762123923975114, + "grad_norm": 98.903324397902, + "learning_rate": 8.892045454545455e-07, + "loss": 3.0373, + "step": 314 + }, + { + "epoch": 0.026847353618000513, + "grad_norm": 131.80373017585617, + "learning_rate": 8.920454545454546e-07, + "loss": 3.4115, + "step": 315 + }, + { + "epoch": 0.026932583312025908, + "grad_norm": 142.81099540513853, + "learning_rate": 8.948863636363638e-07, + "loss": 2.9646, + "step": 316 + }, + { + "epoch": 0.027017813006051307, + "grad_norm": 158.5414316889018, + "learning_rate": 8.977272727272728e-07, + "loss": 3.2701, + "step": 317 + }, + { + "epoch": 0.027103042700076706, + "grad_norm": 127.41921835292213, + "learning_rate": 9.005681818181819e-07, + "loss": 3.7891, + "step": 318 + }, + { + "epoch": 0.027188272394102105, + "grad_norm": 82.75920484296299, + "learning_rate": 9.034090909090911e-07, + "loss": 3.0876, + "step": 319 + }, + { + "epoch": 0.027273502088127503, + "grad_norm": 73.3465723414968, + "learning_rate": 9.0625e-07, + "loss": 3.6987, + "step": 320 + }, + { + "epoch": 0.027358731782152902, + "grad_norm": 92.41073283425968, + "learning_rate": 9.090909090909091e-07, + "loss": 1.6272, + "step": 321 + }, + { + "epoch": 0.0274439614761783, + "grad_norm": 148.31615878335919, + "learning_rate": 9.119318181818182e-07, + "loss": 2.967, + "step": 322 + }, + { + "epoch": 0.0275291911702037, + "grad_norm": 78.0176611647675, + "learning_rate": 9.147727272727273e-07, + "loss": 2.5284, + "step": 323 + }, + { + "epoch": 0.0276144208642291, + "grad_norm": 101.50059027967846, + "learning_rate": 9.176136363636365e-07, + "loss": 2.4962, + "step": 324 + }, + { + "epoch": 0.027699650558254497, + "grad_norm": 327.78909766014607, + "learning_rate": 9.204545454545455e-07, + "loss": 3.9303, + "step": 325 + }, + { + "epoch": 0.027784880252279893, + "grad_norm": 207.88852419160145, + "learning_rate": 9.232954545454546e-07, + "loss": 4.0321, + "step": 326 + }, + { + "epoch": 0.02787010994630529, + "grad_norm": 75.47546138555121, + "learning_rate": 9.261363636363638e-07, + "loss": 2.3519, + "step": 327 + }, + { + "epoch": 0.02795533964033069, + "grad_norm": 163.16115776167615, + "learning_rate": 9.289772727272728e-07, + "loss": 2.7964, + "step": 328 + }, + { + "epoch": 0.02804056933435609, + "grad_norm": 154.59192126858736, + "learning_rate": 9.31818181818182e-07, + "loss": 2.9267, + "step": 329 + }, + { + "epoch": 0.028125799028381488, + "grad_norm": 128.5888342408266, + "learning_rate": 9.346590909090909e-07, + "loss": 2.796, + "step": 330 + }, + { + "epoch": 0.028211028722406887, + "grad_norm": 189.25934765572285, + "learning_rate": 9.375000000000001e-07, + "loss": 3.7468, + "step": 331 + }, + { + "epoch": 0.028296258416432286, + "grad_norm": 128.0634843490702, + "learning_rate": 9.403409090909093e-07, + "loss": 3.6611, + "step": 332 + }, + { + "epoch": 0.028381488110457685, + "grad_norm": 158.3399540930656, + "learning_rate": 9.431818181818182e-07, + "loss": 3.4109, + "step": 333 + }, + { + "epoch": 0.028466717804483083, + "grad_norm": 87.84604610339598, + "learning_rate": 9.460227272727273e-07, + "loss": 3.2052, + "step": 334 + }, + { + "epoch": 0.02855194749850848, + "grad_norm": 103.5507621085292, + "learning_rate": 9.488636363636364e-07, + "loss": 2.2144, + "step": 335 + }, + { + "epoch": 0.028637177192533878, + "grad_norm": 109.72823533545005, + "learning_rate": 9.517045454545455e-07, + "loss": 3.1045, + "step": 336 + }, + { + "epoch": 0.028722406886559276, + "grad_norm": 113.82192254686895, + "learning_rate": 9.545454545454548e-07, + "loss": 3.3263, + "step": 337 + }, + { + "epoch": 0.028807636580584675, + "grad_norm": 168.58900488644073, + "learning_rate": 9.573863636363637e-07, + "loss": 3.3636, + "step": 338 + }, + { + "epoch": 0.028892866274610074, + "grad_norm": 167.68177053929108, + "learning_rate": 9.602272727272728e-07, + "loss": 3.042, + "step": 339 + }, + { + "epoch": 0.028978095968635473, + "grad_norm": 75.29502614147667, + "learning_rate": 9.63068181818182e-07, + "loss": 3.2673, + "step": 340 + }, + { + "epoch": 0.02906332566266087, + "grad_norm": 58.390531558996585, + "learning_rate": 9.65909090909091e-07, + "loss": 2.1155, + "step": 341 + }, + { + "epoch": 0.02914855535668627, + "grad_norm": 142.29542536827014, + "learning_rate": 9.6875e-07, + "loss": 3.846, + "step": 342 + }, + { + "epoch": 0.02923378505071167, + "grad_norm": 86.96737689658268, + "learning_rate": 9.715909090909092e-07, + "loss": 2.9035, + "step": 343 + }, + { + "epoch": 0.029319014744737068, + "grad_norm": 158.40042405935404, + "learning_rate": 9.744318181818183e-07, + "loss": 2.5641, + "step": 344 + }, + { + "epoch": 0.029404244438762463, + "grad_norm": 108.5142114096278, + "learning_rate": 9.772727272727275e-07, + "loss": 3.7319, + "step": 345 + }, + { + "epoch": 0.029489474132787862, + "grad_norm": 96.48692195331144, + "learning_rate": 9.801136363636364e-07, + "loss": 1.8064, + "step": 346 + }, + { + "epoch": 0.02957470382681326, + "grad_norm": 286.0201458838856, + "learning_rate": 9.829545454545455e-07, + "loss": 3.5679, + "step": 347 + }, + { + "epoch": 0.02965993352083866, + "grad_norm": 100.66754635152245, + "learning_rate": 9.857954545454547e-07, + "loss": 3.348, + "step": 348 + }, + { + "epoch": 0.02974516321486406, + "grad_norm": 146.8876967652738, + "learning_rate": 9.886363636363636e-07, + "loss": 3.7963, + "step": 349 + }, + { + "epoch": 0.029830392908889457, + "grad_norm": 98.57015200536863, + "learning_rate": 9.914772727272727e-07, + "loss": 1.9003, + "step": 350 + }, + { + "epoch": 0.029915622602914856, + "grad_norm": 100.96850755404878, + "learning_rate": 9.943181818181819e-07, + "loss": 1.8169, + "step": 351 + }, + { + "epoch": 0.030000852296940255, + "grad_norm": 122.79046341032272, + "learning_rate": 9.97159090909091e-07, + "loss": 2.715, + "step": 352 + }, + { + "epoch": 0.030086081990965654, + "grad_norm": 95.31546923569914, + "learning_rate": 1.0000000000000002e-06, + "loss": 3.3321, + "step": 353 + }, + { + "epoch": 0.03017131168499105, + "grad_norm": 210.17761478045074, + "learning_rate": 1.002840909090909e-06, + "loss": 3.8507, + "step": 354 + }, + { + "epoch": 0.030256541379016448, + "grad_norm": 163.48692435351472, + "learning_rate": 1.0056818181818182e-06, + "loss": 3.9665, + "step": 355 + }, + { + "epoch": 0.030341771073041847, + "grad_norm": 143.5661726394433, + "learning_rate": 1.0085227272727274e-06, + "loss": 3.4834, + "step": 356 + }, + { + "epoch": 0.030427000767067246, + "grad_norm": 117.07547223268246, + "learning_rate": 1.0113636363636365e-06, + "loss": 3.2566, + "step": 357 + }, + { + "epoch": 0.030512230461092645, + "grad_norm": 170.218546699598, + "learning_rate": 1.0142045454545457e-06, + "loss": 3.6, + "step": 358 + }, + { + "epoch": 0.030597460155118043, + "grad_norm": 91.02698959852204, + "learning_rate": 1.0170454545454546e-06, + "loss": 2.5194, + "step": 359 + }, + { + "epoch": 0.030682689849143442, + "grad_norm": 133.4586891387088, + "learning_rate": 1.0198863636363637e-06, + "loss": 3.1893, + "step": 360 + }, + { + "epoch": 0.03076791954316884, + "grad_norm": 207.9368991344518, + "learning_rate": 1.0227272727272729e-06, + "loss": 3.294, + "step": 361 + }, + { + "epoch": 0.03085314923719424, + "grad_norm": 120.42073825852239, + "learning_rate": 1.0255681818181818e-06, + "loss": 3.9033, + "step": 362 + }, + { + "epoch": 0.03093837893121964, + "grad_norm": 123.84070352467293, + "learning_rate": 1.028409090909091e-06, + "loss": 3.5625, + "step": 363 + }, + { + "epoch": 0.031023608625245034, + "grad_norm": 79.16605910815214, + "learning_rate": 1.03125e-06, + "loss": 3.6956, + "step": 364 + }, + { + "epoch": 0.031108838319270433, + "grad_norm": 133.19935774274833, + "learning_rate": 1.0340909090909092e-06, + "loss": 3.2381, + "step": 365 + }, + { + "epoch": 0.03119406801329583, + "grad_norm": 88.79647914205017, + "learning_rate": 1.0369318181818184e-06, + "loss": 3.1733, + "step": 366 + }, + { + "epoch": 0.031279297707321234, + "grad_norm": 106.19319338772343, + "learning_rate": 1.0397727272727273e-06, + "loss": 2.6366, + "step": 367 + }, + { + "epoch": 0.031364527401346626, + "grad_norm": 105.85779023510985, + "learning_rate": 1.0426136363636364e-06, + "loss": 2.7268, + "step": 368 + }, + { + "epoch": 0.031449757095372025, + "grad_norm": 105.91661855501745, + "learning_rate": 1.0454545454545456e-06, + "loss": 3.0484, + "step": 369 + }, + { + "epoch": 0.03153498678939742, + "grad_norm": 80.31490047241128, + "learning_rate": 1.0482954545454547e-06, + "loss": 2.1112, + "step": 370 + }, + { + "epoch": 0.03162021648342282, + "grad_norm": 217.455343240135, + "learning_rate": 1.0511363636363639e-06, + "loss": 5.6551, + "step": 371 + }, + { + "epoch": 0.03170544617744822, + "grad_norm": 101.06639443138, + "learning_rate": 1.0539772727272728e-06, + "loss": 3.452, + "step": 372 + }, + { + "epoch": 0.03179067587147362, + "grad_norm": 92.51905860387185, + "learning_rate": 1.056818181818182e-06, + "loss": 3.2069, + "step": 373 + }, + { + "epoch": 0.03187590556549902, + "grad_norm": 250.39988261037087, + "learning_rate": 1.059659090909091e-06, + "loss": 5.152, + "step": 374 + }, + { + "epoch": 0.03196113525952442, + "grad_norm": 183.8861241937409, + "learning_rate": 1.0625e-06, + "loss": 3.9282, + "step": 375 + }, + { + "epoch": 0.032046364953549816, + "grad_norm": 165.50250521356708, + "learning_rate": 1.0653409090909091e-06, + "loss": 2.7592, + "step": 376 + }, + { + "epoch": 0.032131594647575215, + "grad_norm": 91.60357321947846, + "learning_rate": 1.0681818181818183e-06, + "loss": 3.3994, + "step": 377 + }, + { + "epoch": 0.032216824341600614, + "grad_norm": 166.85092723738586, + "learning_rate": 1.0710227272727274e-06, + "loss": 3.1432, + "step": 378 + }, + { + "epoch": 0.03230205403562601, + "grad_norm": 96.36923894794101, + "learning_rate": 1.0738636363636366e-06, + "loss": 3.3087, + "step": 379 + }, + { + "epoch": 0.03238728372965141, + "grad_norm": 155.45882304182362, + "learning_rate": 1.0767045454545455e-06, + "loss": 2.7738, + "step": 380 + }, + { + "epoch": 0.03247251342367681, + "grad_norm": 85.62821377699017, + "learning_rate": 1.0795454545454546e-06, + "loss": 2.6076, + "step": 381 + }, + { + "epoch": 0.03255774311770221, + "grad_norm": 148.6311081380207, + "learning_rate": 1.0823863636363638e-06, + "loss": 3.5389, + "step": 382 + }, + { + "epoch": 0.03264297281172761, + "grad_norm": 115.77930616793503, + "learning_rate": 1.0852272727272727e-06, + "loss": 2.7094, + "step": 383 + }, + { + "epoch": 0.03272820250575301, + "grad_norm": 143.81678876067758, + "learning_rate": 1.0880681818181818e-06, + "loss": 3.4275, + "step": 384 + }, + { + "epoch": 0.032813432199778406, + "grad_norm": 115.41766435979964, + "learning_rate": 1.090909090909091e-06, + "loss": 2.5751, + "step": 385 + }, + { + "epoch": 0.032898661893803804, + "grad_norm": 116.09789178067341, + "learning_rate": 1.0937500000000001e-06, + "loss": 2.8098, + "step": 386 + }, + { + "epoch": 0.032983891587829196, + "grad_norm": 140.3990582454365, + "learning_rate": 1.0965909090909093e-06, + "loss": 2.6269, + "step": 387 + }, + { + "epoch": 0.033069121281854595, + "grad_norm": 181.4148329487386, + "learning_rate": 1.0994318181818182e-06, + "loss": 3.4867, + "step": 388 + }, + { + "epoch": 0.033154350975879994, + "grad_norm": 126.42736865178449, + "learning_rate": 1.1022727272727273e-06, + "loss": 3.3089, + "step": 389 + }, + { + "epoch": 0.03323958066990539, + "grad_norm": 169.99753916382934, + "learning_rate": 1.1051136363636365e-06, + "loss": 4.2904, + "step": 390 + }, + { + "epoch": 0.03332481036393079, + "grad_norm": 115.09015251340175, + "learning_rate": 1.1079545454545456e-06, + "loss": 3.3855, + "step": 391 + }, + { + "epoch": 0.03341004005795619, + "grad_norm": 155.98498055400944, + "learning_rate": 1.1107954545454547e-06, + "loss": 3.2454, + "step": 392 + }, + { + "epoch": 0.03349526975198159, + "grad_norm": 129.63087268732863, + "learning_rate": 1.1136363636363637e-06, + "loss": 2.5969, + "step": 393 + }, + { + "epoch": 0.03358049944600699, + "grad_norm": 162.2052138671942, + "learning_rate": 1.1164772727272728e-06, + "loss": 3.8417, + "step": 394 + }, + { + "epoch": 0.03366572914003239, + "grad_norm": 96.76780212983478, + "learning_rate": 1.119318181818182e-06, + "loss": 2.6767, + "step": 395 + }, + { + "epoch": 0.033750958834057786, + "grad_norm": 97.15492621632546, + "learning_rate": 1.1221590909090909e-06, + "loss": 3.6547, + "step": 396 + }, + { + "epoch": 0.033836188528083185, + "grad_norm": 129.75578781822097, + "learning_rate": 1.125e-06, + "loss": 3.9387, + "step": 397 + }, + { + "epoch": 0.03392141822210858, + "grad_norm": 110.16008169318667, + "learning_rate": 1.1278409090909092e-06, + "loss": 2.2426, + "step": 398 + }, + { + "epoch": 0.03400664791613398, + "grad_norm": 156.70370074611336, + "learning_rate": 1.1306818181818183e-06, + "loss": 2.6186, + "step": 399 + }, + { + "epoch": 0.03409187761015938, + "grad_norm": 104.99825590729507, + "learning_rate": 1.1335227272727275e-06, + "loss": 2.2304, + "step": 400 + }, + { + "epoch": 0.03417710730418478, + "grad_norm": 81.71571073212318, + "learning_rate": 1.1363636363636364e-06, + "loss": 2.8419, + "step": 401 + }, + { + "epoch": 0.03426233699821018, + "grad_norm": 112.58721336017108, + "learning_rate": 1.1392045454545455e-06, + "loss": 3.8819, + "step": 402 + }, + { + "epoch": 0.03434756669223558, + "grad_norm": 132.23303720545658, + "learning_rate": 1.1420454545454547e-06, + "loss": 3.134, + "step": 403 + }, + { + "epoch": 0.034432796386260976, + "grad_norm": 105.21309757132612, + "learning_rate": 1.1448863636363638e-06, + "loss": 2.1023, + "step": 404 + }, + { + "epoch": 0.034518026080286375, + "grad_norm": 232.3595363100561, + "learning_rate": 1.147727272727273e-06, + "loss": 4.4904, + "step": 405 + }, + { + "epoch": 0.03460325577431177, + "grad_norm": 79.21497877330088, + "learning_rate": 1.1505681818181819e-06, + "loss": 2.9437, + "step": 406 + }, + { + "epoch": 0.034688485468337166, + "grad_norm": 182.98272366635703, + "learning_rate": 1.153409090909091e-06, + "loss": 2.5617, + "step": 407 + }, + { + "epoch": 0.034773715162362565, + "grad_norm": 105.77371470112266, + "learning_rate": 1.1562500000000002e-06, + "loss": 2.8925, + "step": 408 + }, + { + "epoch": 0.03485894485638796, + "grad_norm": 106.41858392399851, + "learning_rate": 1.159090909090909e-06, + "loss": 2.5307, + "step": 409 + }, + { + "epoch": 0.03494417455041336, + "grad_norm": 146.52398449230807, + "learning_rate": 1.1619318181818182e-06, + "loss": 3.9708, + "step": 410 + }, + { + "epoch": 0.03502940424443876, + "grad_norm": 168.01007512359226, + "learning_rate": 1.1647727272727274e-06, + "loss": 3.3144, + "step": 411 + }, + { + "epoch": 0.03511463393846416, + "grad_norm": 107.65213015494179, + "learning_rate": 1.1676136363636365e-06, + "loss": 2.1883, + "step": 412 + }, + { + "epoch": 0.03519986363248956, + "grad_norm": 90.34051165541518, + "learning_rate": 1.1704545454545456e-06, + "loss": 3.1992, + "step": 413 + }, + { + "epoch": 0.03528509332651496, + "grad_norm": 152.58017457484675, + "learning_rate": 1.1732954545454546e-06, + "loss": 3.912, + "step": 414 + }, + { + "epoch": 0.035370323020540356, + "grad_norm": 347.4497567854035, + "learning_rate": 1.1761363636363637e-06, + "loss": 3.4308, + "step": 415 + }, + { + "epoch": 0.035455552714565755, + "grad_norm": 145.6890993413056, + "learning_rate": 1.1789772727272729e-06, + "loss": 2.4895, + "step": 416 + }, + { + "epoch": 0.035540782408591154, + "grad_norm": 176.17426099871946, + "learning_rate": 1.181818181818182e-06, + "loss": 3.5028, + "step": 417 + }, + { + "epoch": 0.03562601210261655, + "grad_norm": 186.10854578054955, + "learning_rate": 1.1846590909090911e-06, + "loss": 4.2727, + "step": 418 + }, + { + "epoch": 0.03571124179664195, + "grad_norm": 106.69865552265135, + "learning_rate": 1.1875e-06, + "loss": 3.6194, + "step": 419 + }, + { + "epoch": 0.03579647149066735, + "grad_norm": 103.1347262982565, + "learning_rate": 1.1903409090909092e-06, + "loss": 3.4249, + "step": 420 + }, + { + "epoch": 0.03588170118469275, + "grad_norm": 84.97419427781833, + "learning_rate": 1.1931818181818183e-06, + "loss": 2.5884, + "step": 421 + }, + { + "epoch": 0.03596693087871815, + "grad_norm": 136.4339601176936, + "learning_rate": 1.1960227272727273e-06, + "loss": 3.5608, + "step": 422 + }, + { + "epoch": 0.03605216057274355, + "grad_norm": 100.65005474243547, + "learning_rate": 1.1988636363636364e-06, + "loss": 2.3132, + "step": 423 + }, + { + "epoch": 0.036137390266768946, + "grad_norm": 152.32767091290268, + "learning_rate": 1.2017045454545456e-06, + "loss": 3.0044, + "step": 424 + }, + { + "epoch": 0.03622261996079434, + "grad_norm": 111.84582494087954, + "learning_rate": 1.2045454545454547e-06, + "loss": 3.7519, + "step": 425 + }, + { + "epoch": 0.036307849654819736, + "grad_norm": 76.46598310623764, + "learning_rate": 1.2073863636363638e-06, + "loss": 2.8639, + "step": 426 + }, + { + "epoch": 0.036393079348845135, + "grad_norm": 90.50298291524022, + "learning_rate": 1.2102272727272728e-06, + "loss": 2.4239, + "step": 427 + }, + { + "epoch": 0.036478309042870534, + "grad_norm": 93.12699722881295, + "learning_rate": 1.213068181818182e-06, + "loss": 2.4692, + "step": 428 + }, + { + "epoch": 0.03656353873689593, + "grad_norm": 180.54913510938906, + "learning_rate": 1.215909090909091e-06, + "loss": 2.8508, + "step": 429 + }, + { + "epoch": 0.03664876843092133, + "grad_norm": 132.68820518662497, + "learning_rate": 1.21875e-06, + "loss": 3.0773, + "step": 430 + }, + { + "epoch": 0.03673399812494673, + "grad_norm": 101.23963322526882, + "learning_rate": 1.2215909090909091e-06, + "loss": 2.9381, + "step": 431 + }, + { + "epoch": 0.03681922781897213, + "grad_norm": 121.97276347883977, + "learning_rate": 1.2244318181818183e-06, + "loss": 3.4428, + "step": 432 + }, + { + "epoch": 0.03690445751299753, + "grad_norm": 242.38946480243956, + "learning_rate": 1.2272727272727274e-06, + "loss": 3.3491, + "step": 433 + }, + { + "epoch": 0.03698968720702293, + "grad_norm": 164.85936682321233, + "learning_rate": 1.2301136363636365e-06, + "loss": 3.1647, + "step": 434 + }, + { + "epoch": 0.037074916901048326, + "grad_norm": 68.21545265056058, + "learning_rate": 1.2329545454545455e-06, + "loss": 3.2197, + "step": 435 + }, + { + "epoch": 0.037160146595073724, + "grad_norm": 94.96059254887417, + "learning_rate": 1.2357954545454546e-06, + "loss": 2.6256, + "step": 436 + }, + { + "epoch": 0.03724537628909912, + "grad_norm": 167.080179098943, + "learning_rate": 1.2386363636363638e-06, + "loss": 3.2865, + "step": 437 + }, + { + "epoch": 0.03733060598312452, + "grad_norm": 101.21526682245796, + "learning_rate": 1.2414772727272729e-06, + "loss": 2.4349, + "step": 438 + }, + { + "epoch": 0.03741583567714992, + "grad_norm": 89.40930388095018, + "learning_rate": 1.244318181818182e-06, + "loss": 2.7375, + "step": 439 + }, + { + "epoch": 0.03750106537117532, + "grad_norm": 180.88211054630634, + "learning_rate": 1.247159090909091e-06, + "loss": 4.2461, + "step": 440 + }, + { + "epoch": 0.03758629506520072, + "grad_norm": 102.53189292617766, + "learning_rate": 1.25e-06, + "loss": 1.5348, + "step": 441 + }, + { + "epoch": 0.03767152475922612, + "grad_norm": 125.57388939121266, + "learning_rate": 1.252840909090909e-06, + "loss": 2.9198, + "step": 442 + }, + { + "epoch": 0.037756754453251516, + "grad_norm": 112.34418677582634, + "learning_rate": 1.2556818181818184e-06, + "loss": 2.6834, + "step": 443 + }, + { + "epoch": 0.03784198414727691, + "grad_norm": 164.56748283524033, + "learning_rate": 1.2585227272727273e-06, + "loss": 3.5841, + "step": 444 + }, + { + "epoch": 0.03792721384130231, + "grad_norm": 86.06582855326627, + "learning_rate": 1.2613636363636365e-06, + "loss": 2.9098, + "step": 445 + }, + { + "epoch": 0.038012443535327706, + "grad_norm": 226.49966469648268, + "learning_rate": 1.2642045454545456e-06, + "loss": 3.3922, + "step": 446 + }, + { + "epoch": 0.038097673229353105, + "grad_norm": 103.25352529114251, + "learning_rate": 1.2670454545454547e-06, + "loss": 3.4442, + "step": 447 + }, + { + "epoch": 0.0381829029233785, + "grad_norm": 93.54919241102047, + "learning_rate": 1.2698863636363637e-06, + "loss": 2.9764, + "step": 448 + }, + { + "epoch": 0.0382681326174039, + "grad_norm": 100.89706856397002, + "learning_rate": 1.2727272727272728e-06, + "loss": 2.6636, + "step": 449 + }, + { + "epoch": 0.0383533623114293, + "grad_norm": 99.6190866420591, + "learning_rate": 1.275568181818182e-06, + "loss": 2.9441, + "step": 450 + }, + { + "epoch": 0.0384385920054547, + "grad_norm": 230.25778114755943, + "learning_rate": 1.278409090909091e-06, + "loss": 4.4724, + "step": 451 + }, + { + "epoch": 0.0385238216994801, + "grad_norm": 85.20664007329206, + "learning_rate": 1.28125e-06, + "loss": 2.8798, + "step": 452 + }, + { + "epoch": 0.0386090513935055, + "grad_norm": 123.44243008644615, + "learning_rate": 1.2840909090909094e-06, + "loss": 3.2412, + "step": 453 + }, + { + "epoch": 0.038694281087530896, + "grad_norm": 184.40366151687553, + "learning_rate": 1.2869318181818183e-06, + "loss": 2.6445, + "step": 454 + }, + { + "epoch": 0.038779510781556295, + "grad_norm": 97.94323623608108, + "learning_rate": 1.2897727272727272e-06, + "loss": 2.7275, + "step": 455 + }, + { + "epoch": 0.038864740475581694, + "grad_norm": 135.68016135236817, + "learning_rate": 1.2926136363636366e-06, + "loss": 2.4887, + "step": 456 + }, + { + "epoch": 0.03894997016960709, + "grad_norm": 215.28730452590386, + "learning_rate": 1.2954545454545455e-06, + "loss": 3.0503, + "step": 457 + }, + { + "epoch": 0.03903519986363249, + "grad_norm": 246.7547008608558, + "learning_rate": 1.2982954545454546e-06, + "loss": 2.5294, + "step": 458 + }, + { + "epoch": 0.03912042955765789, + "grad_norm": 112.03499926048421, + "learning_rate": 1.3011363636363638e-06, + "loss": 3.1029, + "step": 459 + }, + { + "epoch": 0.03920565925168329, + "grad_norm": 140.63497854963876, + "learning_rate": 1.303977272727273e-06, + "loss": 3.5499, + "step": 460 + }, + { + "epoch": 0.03929088894570869, + "grad_norm": 101.37062204130387, + "learning_rate": 1.3068181818181819e-06, + "loss": 2.3847, + "step": 461 + }, + { + "epoch": 0.03937611863973409, + "grad_norm": 107.77648651340064, + "learning_rate": 1.3096590909090908e-06, + "loss": 2.6121, + "step": 462 + }, + { + "epoch": 0.03946134833375948, + "grad_norm": 98.34050510754598, + "learning_rate": 1.3125000000000001e-06, + "loss": 2.7134, + "step": 463 + }, + { + "epoch": 0.03954657802778488, + "grad_norm": 252.339944765737, + "learning_rate": 1.315340909090909e-06, + "loss": 3.5469, + "step": 464 + }, + { + "epoch": 0.039631807721810276, + "grad_norm": 219.89541272511036, + "learning_rate": 1.3181818181818182e-06, + "loss": 3.647, + "step": 465 + }, + { + "epoch": 0.039717037415835675, + "grad_norm": 76.06745164370824, + "learning_rate": 1.3210227272727276e-06, + "loss": 2.1674, + "step": 466 + }, + { + "epoch": 0.039802267109861074, + "grad_norm": 92.88728537608017, + "learning_rate": 1.3238636363636365e-06, + "loss": 2.0411, + "step": 467 + }, + { + "epoch": 0.03988749680388647, + "grad_norm": 140.5764397965794, + "learning_rate": 1.3267045454545454e-06, + "loss": 3.2837, + "step": 468 + }, + { + "epoch": 0.03997272649791187, + "grad_norm": 93.98553480167014, + "learning_rate": 1.3295454545454548e-06, + "loss": 2.2613, + "step": 469 + }, + { + "epoch": 0.04005795619193727, + "grad_norm": 165.55763048146545, + "learning_rate": 1.3323863636363637e-06, + "loss": 3.4823, + "step": 470 + }, + { + "epoch": 0.04014318588596267, + "grad_norm": 117.96466736509056, + "learning_rate": 1.3352272727272728e-06, + "loss": 3.7087, + "step": 471 + }, + { + "epoch": 0.04022841557998807, + "grad_norm": 126.39952294037882, + "learning_rate": 1.338068181818182e-06, + "loss": 2.2129, + "step": 472 + }, + { + "epoch": 0.04031364527401347, + "grad_norm": 197.21708027156976, + "learning_rate": 1.3409090909090911e-06, + "loss": 3.1752, + "step": 473 + }, + { + "epoch": 0.040398874968038866, + "grad_norm": 97.53051899059884, + "learning_rate": 1.34375e-06, + "loss": 3.1598, + "step": 474 + }, + { + "epoch": 0.040484104662064264, + "grad_norm": 92.8917951458134, + "learning_rate": 1.3465909090909094e-06, + "loss": 2.3626, + "step": 475 + }, + { + "epoch": 0.04056933435608966, + "grad_norm": 138.70500702609388, + "learning_rate": 1.3494318181818183e-06, + "loss": 2.5192, + "step": 476 + }, + { + "epoch": 0.04065456405011506, + "grad_norm": 76.18872837555884, + "learning_rate": 1.3522727272727273e-06, + "loss": 2.0256, + "step": 477 + }, + { + "epoch": 0.04073979374414046, + "grad_norm": 112.21435032574291, + "learning_rate": 1.3551136363636364e-06, + "loss": 3.08, + "step": 478 + }, + { + "epoch": 0.04082502343816586, + "grad_norm": 294.9206213573849, + "learning_rate": 1.3579545454545455e-06, + "loss": 3.6563, + "step": 479 + }, + { + "epoch": 0.04091025313219126, + "grad_norm": 149.06296434085047, + "learning_rate": 1.3607954545454547e-06, + "loss": 2.8919, + "step": 480 + }, + { + "epoch": 0.04099548282621666, + "grad_norm": 228.0633767895611, + "learning_rate": 1.3636363636363636e-06, + "loss": 3.2205, + "step": 481 + }, + { + "epoch": 0.04108071252024205, + "grad_norm": 263.39261792799465, + "learning_rate": 1.366477272727273e-06, + "loss": 3.1976, + "step": 482 + }, + { + "epoch": 0.04116594221426745, + "grad_norm": 133.76128086452405, + "learning_rate": 1.369318181818182e-06, + "loss": 2.8317, + "step": 483 + }, + { + "epoch": 0.04125117190829285, + "grad_norm": 175.9328863671943, + "learning_rate": 1.372159090909091e-06, + "loss": 3.6252, + "step": 484 + }, + { + "epoch": 0.041336401602318246, + "grad_norm": 259.73456941427594, + "learning_rate": 1.3750000000000002e-06, + "loss": 3.5902, + "step": 485 + }, + { + "epoch": 0.041421631296343644, + "grad_norm": 178.23329597563634, + "learning_rate": 1.3778409090909093e-06, + "loss": 3.0183, + "step": 486 + }, + { + "epoch": 0.04150686099036904, + "grad_norm": 107.49238096171862, + "learning_rate": 1.3806818181818182e-06, + "loss": 2.7579, + "step": 487 + }, + { + "epoch": 0.04159209068439444, + "grad_norm": 141.06901735143126, + "learning_rate": 1.3835227272727276e-06, + "loss": 2.8479, + "step": 488 + }, + { + "epoch": 0.04167732037841984, + "grad_norm": 115.14025620448626, + "learning_rate": 1.3863636363636365e-06, + "loss": 3.3019, + "step": 489 + }, + { + "epoch": 0.04176255007244524, + "grad_norm": 229.4992273531305, + "learning_rate": 1.3892045454545455e-06, + "loss": 3.2566, + "step": 490 + }, + { + "epoch": 0.04184777976647064, + "grad_norm": 107.4572204453799, + "learning_rate": 1.3920454545454546e-06, + "loss": 3.1631, + "step": 491 + }, + { + "epoch": 0.04193300946049604, + "grad_norm": 258.4975870288423, + "learning_rate": 1.3948863636363637e-06, + "loss": 3.5772, + "step": 492 + }, + { + "epoch": 0.042018239154521436, + "grad_norm": 160.9516509365985, + "learning_rate": 1.3977272727272729e-06, + "loss": 3.6284, + "step": 493 + }, + { + "epoch": 0.042103468848546835, + "grad_norm": 157.53622022661727, + "learning_rate": 1.4005681818181818e-06, + "loss": 1.9971, + "step": 494 + }, + { + "epoch": 0.042188698542572234, + "grad_norm": 104.50507005929988, + "learning_rate": 1.4034090909090912e-06, + "loss": 2.4896, + "step": 495 + }, + { + "epoch": 0.04227392823659763, + "grad_norm": 143.80455060082573, + "learning_rate": 1.40625e-06, + "loss": 2.611, + "step": 496 + }, + { + "epoch": 0.04235915793062303, + "grad_norm": 201.27087806938087, + "learning_rate": 1.409090909090909e-06, + "loss": 2.5812, + "step": 497 + }, + { + "epoch": 0.04244438762464843, + "grad_norm": 141.11693313786543, + "learning_rate": 1.4119318181818184e-06, + "loss": 3.23, + "step": 498 + }, + { + "epoch": 0.04252961731867383, + "grad_norm": 90.17344426042126, + "learning_rate": 1.4147727272727275e-06, + "loss": 1.5111, + "step": 499 + }, + { + "epoch": 0.04261484701269922, + "grad_norm": 141.61623331291025, + "learning_rate": 1.4176136363636364e-06, + "loss": 2.9268, + "step": 500 + }, + { + "epoch": 0.04270007670672462, + "grad_norm": 124.6213387886414, + "learning_rate": 1.4204545454545458e-06, + "loss": 3.648, + "step": 501 + }, + { + "epoch": 0.04278530640075002, + "grad_norm": 84.73037971709324, + "learning_rate": 1.4232954545454547e-06, + "loss": 2.6278, + "step": 502 + }, + { + "epoch": 0.04287053609477542, + "grad_norm": 152.91332389418798, + "learning_rate": 1.4261363636363637e-06, + "loss": 3.425, + "step": 503 + }, + { + "epoch": 0.042955765788800816, + "grad_norm": 396.84846486909845, + "learning_rate": 1.4289772727272728e-06, + "loss": 4.0078, + "step": 504 + }, + { + "epoch": 0.043040995482826215, + "grad_norm": 190.67138324705317, + "learning_rate": 1.431818181818182e-06, + "loss": 3.1992, + "step": 505 + }, + { + "epoch": 0.043126225176851614, + "grad_norm": 93.26092382800812, + "learning_rate": 1.434659090909091e-06, + "loss": 3.1198, + "step": 506 + }, + { + "epoch": 0.04321145487087701, + "grad_norm": 68.78961431846474, + "learning_rate": 1.4375e-06, + "loss": 2.2497, + "step": 507 + }, + { + "epoch": 0.04329668456490241, + "grad_norm": 110.45730690898812, + "learning_rate": 1.4403409090909094e-06, + "loss": 2.3915, + "step": 508 + }, + { + "epoch": 0.04338191425892781, + "grad_norm": 142.7273387320924, + "learning_rate": 1.4431818181818183e-06, + "loss": 2.8424, + "step": 509 + }, + { + "epoch": 0.04346714395295321, + "grad_norm": 71.72337732955918, + "learning_rate": 1.4460227272727272e-06, + "loss": 2.5923, + "step": 510 + }, + { + "epoch": 0.04355237364697861, + "grad_norm": 103.29158736617056, + "learning_rate": 1.4488636363636366e-06, + "loss": 2.9514, + "step": 511 + }, + { + "epoch": 0.04363760334100401, + "grad_norm": 128.75129116314474, + "learning_rate": 1.4517045454545455e-06, + "loss": 3.1481, + "step": 512 + }, + { + "epoch": 0.043722833035029406, + "grad_norm": 144.260775683893, + "learning_rate": 1.4545454545454546e-06, + "loss": 3.5697, + "step": 513 + }, + { + "epoch": 0.043808062729054804, + "grad_norm": 281.14885456618947, + "learning_rate": 1.4573863636363638e-06, + "loss": 4.0452, + "step": 514 + }, + { + "epoch": 0.0438932924230802, + "grad_norm": 145.27867079443783, + "learning_rate": 1.460227272727273e-06, + "loss": 3.6667, + "step": 515 + }, + { + "epoch": 0.0439785221171056, + "grad_norm": 73.49713129145496, + "learning_rate": 1.4630681818181818e-06, + "loss": 3.0543, + "step": 516 + }, + { + "epoch": 0.044063751811131, + "grad_norm": 79.40212454341497, + "learning_rate": 1.465909090909091e-06, + "loss": 2.0464, + "step": 517 + }, + { + "epoch": 0.0441489815051564, + "grad_norm": 159.38620810729043, + "learning_rate": 1.4687500000000001e-06, + "loss": 2.6476, + "step": 518 + }, + { + "epoch": 0.04423421119918179, + "grad_norm": 99.4591664511014, + "learning_rate": 1.4715909090909093e-06, + "loss": 2.2616, + "step": 519 + }, + { + "epoch": 0.04431944089320719, + "grad_norm": 140.62460552594175, + "learning_rate": 1.4744318181818182e-06, + "loss": 2.4923, + "step": 520 + }, + { + "epoch": 0.04440467058723259, + "grad_norm": 169.25164477618554, + "learning_rate": 1.4772727272727275e-06, + "loss": 3.0913, + "step": 521 + }, + { + "epoch": 0.04448990028125799, + "grad_norm": 118.96298749566955, + "learning_rate": 1.4801136363636365e-06, + "loss": 2.9902, + "step": 522 + }, + { + "epoch": 0.04457512997528339, + "grad_norm": 109.11791832891795, + "learning_rate": 1.4829545454545454e-06, + "loss": 2.3078, + "step": 523 + }, + { + "epoch": 0.044660359669308786, + "grad_norm": 103.60578935811932, + "learning_rate": 1.4857954545454548e-06, + "loss": 2.6465, + "step": 524 + }, + { + "epoch": 0.044745589363334184, + "grad_norm": 246.8295885447885, + "learning_rate": 1.4886363636363637e-06, + "loss": 4.0275, + "step": 525 + }, + { + "epoch": 0.04483081905735958, + "grad_norm": 134.86947372907838, + "learning_rate": 1.4914772727272728e-06, + "loss": 3.1869, + "step": 526 + }, + { + "epoch": 0.04491604875138498, + "grad_norm": 339.6766419999818, + "learning_rate": 1.494318181818182e-06, + "loss": 3.6384, + "step": 527 + }, + { + "epoch": 0.04500127844541038, + "grad_norm": 75.06174246544327, + "learning_rate": 1.4971590909090911e-06, + "loss": 2.2329, + "step": 528 + }, + { + "epoch": 0.04508650813943578, + "grad_norm": 89.62371393712381, + "learning_rate": 1.5e-06, + "loss": 3.0713, + "step": 529 + }, + { + "epoch": 0.04517173783346118, + "grad_norm": 156.70891971016033, + "learning_rate": 1.5028409090909094e-06, + "loss": 3.1219, + "step": 530 + }, + { + "epoch": 0.04525696752748658, + "grad_norm": 96.55915284519654, + "learning_rate": 1.5056818181818183e-06, + "loss": 2.7865, + "step": 531 + }, + { + "epoch": 0.045342197221511976, + "grad_norm": 93.47409762964989, + "learning_rate": 1.5085227272727275e-06, + "loss": 3.7655, + "step": 532 + }, + { + "epoch": 0.045427426915537375, + "grad_norm": 136.30202922041107, + "learning_rate": 1.5113636363636364e-06, + "loss": 3.6437, + "step": 533 + }, + { + "epoch": 0.045512656609562774, + "grad_norm": 164.72342515989897, + "learning_rate": 1.5142045454545457e-06, + "loss": 3.9377, + "step": 534 + }, + { + "epoch": 0.04559788630358817, + "grad_norm": 233.42373304214428, + "learning_rate": 1.5170454545454547e-06, + "loss": 3.5019, + "step": 535 + }, + { + "epoch": 0.04568311599761357, + "grad_norm": 222.69431595356778, + "learning_rate": 1.5198863636363636e-06, + "loss": 3.1268, + "step": 536 + }, + { + "epoch": 0.04576834569163897, + "grad_norm": 86.35997258860644, + "learning_rate": 1.522727272727273e-06, + "loss": 2.5965, + "step": 537 + }, + { + "epoch": 0.04585357538566436, + "grad_norm": 252.1037987938117, + "learning_rate": 1.5255681818181819e-06, + "loss": 3.4333, + "step": 538 + }, + { + "epoch": 0.04593880507968976, + "grad_norm": 231.5527745384785, + "learning_rate": 1.528409090909091e-06, + "loss": 4.3868, + "step": 539 + }, + { + "epoch": 0.04602403477371516, + "grad_norm": 121.5911742894043, + "learning_rate": 1.5312500000000002e-06, + "loss": 3.5783, + "step": 540 + }, + { + "epoch": 0.04610926446774056, + "grad_norm": 87.45977768953992, + "learning_rate": 1.5340909090909093e-06, + "loss": 3.2103, + "step": 541 + }, + { + "epoch": 0.04619449416176596, + "grad_norm": 100.76543682292099, + "learning_rate": 1.5369318181818182e-06, + "loss": 3.1689, + "step": 542 + }, + { + "epoch": 0.046279723855791356, + "grad_norm": 80.26249093821622, + "learning_rate": 1.5397727272727276e-06, + "loss": 3.0615, + "step": 543 + }, + { + "epoch": 0.046364953549816755, + "grad_norm": 131.28845622023456, + "learning_rate": 1.5426136363636365e-06, + "loss": 3.115, + "step": 544 + }, + { + "epoch": 0.046450183243842154, + "grad_norm": 164.08304038292897, + "learning_rate": 1.5454545454545454e-06, + "loss": 2.5795, + "step": 545 + }, + { + "epoch": 0.04653541293786755, + "grad_norm": 79.49176305459149, + "learning_rate": 1.5482954545454546e-06, + "loss": 3.0678, + "step": 546 + }, + { + "epoch": 0.04662064263189295, + "grad_norm": 211.95462834480435, + "learning_rate": 1.5511363636363637e-06, + "loss": 2.5654, + "step": 547 + }, + { + "epoch": 0.04670587232591835, + "grad_norm": 104.74778601047551, + "learning_rate": 1.5539772727272729e-06, + "loss": 3.6323, + "step": 548 + }, + { + "epoch": 0.04679110201994375, + "grad_norm": 148.09920775836963, + "learning_rate": 1.5568181818181818e-06, + "loss": 3.2265, + "step": 549 + }, + { + "epoch": 0.04687633171396915, + "grad_norm": 146.6107208112997, + "learning_rate": 1.5596590909090911e-06, + "loss": 2.2726, + "step": 550 + }, + { + "epoch": 0.04696156140799455, + "grad_norm": 180.8599549288953, + "learning_rate": 1.5625e-06, + "loss": 3.6685, + "step": 551 + }, + { + "epoch": 0.047046791102019946, + "grad_norm": 142.48648296751236, + "learning_rate": 1.5653409090909092e-06, + "loss": 3.0951, + "step": 552 + }, + { + "epoch": 0.047132020796045344, + "grad_norm": 145.1874884354685, + "learning_rate": 1.5681818181818184e-06, + "loss": 3.2651, + "step": 553 + }, + { + "epoch": 0.04721725049007074, + "grad_norm": 146.58894691265246, + "learning_rate": 1.5710227272727275e-06, + "loss": 3.0545, + "step": 554 + }, + { + "epoch": 0.04730248018409614, + "grad_norm": 106.71808956443556, + "learning_rate": 1.5738636363636364e-06, + "loss": 2.7637, + "step": 555 + }, + { + "epoch": 0.04738770987812154, + "grad_norm": 88.95228667530573, + "learning_rate": 1.5767045454545458e-06, + "loss": 2.6928, + "step": 556 + }, + { + "epoch": 0.04747293957214693, + "grad_norm": 144.3326161150151, + "learning_rate": 1.5795454545454547e-06, + "loss": 3.5854, + "step": 557 + }, + { + "epoch": 0.04755816926617233, + "grad_norm": 267.463546214926, + "learning_rate": 1.5823863636363636e-06, + "loss": 4.266, + "step": 558 + }, + { + "epoch": 0.04764339896019773, + "grad_norm": 206.6607839266905, + "learning_rate": 1.5852272727272728e-06, + "loss": 3.1704, + "step": 559 + }, + { + "epoch": 0.04772862865422313, + "grad_norm": 203.29042575162873, + "learning_rate": 1.588068181818182e-06, + "loss": 3.394, + "step": 560 + }, + { + "epoch": 0.04781385834824853, + "grad_norm": 100.32771519755147, + "learning_rate": 1.590909090909091e-06, + "loss": 3.0575, + "step": 561 + }, + { + "epoch": 0.04789908804227393, + "grad_norm": 152.08587268706262, + "learning_rate": 1.59375e-06, + "loss": 2.5724, + "step": 562 + }, + { + "epoch": 0.047984317736299326, + "grad_norm": 111.03526647054531, + "learning_rate": 1.5965909090909093e-06, + "loss": 1.2004, + "step": 563 + }, + { + "epoch": 0.048069547430324724, + "grad_norm": 139.32954533220394, + "learning_rate": 1.5994318181818183e-06, + "loss": 2.9405, + "step": 564 + }, + { + "epoch": 0.04815477712435012, + "grad_norm": 161.49939828688454, + "learning_rate": 1.6022727272727274e-06, + "loss": 3.2765, + "step": 565 + }, + { + "epoch": 0.04824000681837552, + "grad_norm": 88.16482192437829, + "learning_rate": 1.6051136363636366e-06, + "loss": 2.4917, + "step": 566 + }, + { + "epoch": 0.04832523651240092, + "grad_norm": 101.00920517063335, + "learning_rate": 1.6079545454545457e-06, + "loss": 2.8197, + "step": 567 + }, + { + "epoch": 0.04841046620642632, + "grad_norm": 70.83850924488992, + "learning_rate": 1.6107954545454546e-06, + "loss": 2.3334, + "step": 568 + }, + { + "epoch": 0.04849569590045172, + "grad_norm": 159.66851281990827, + "learning_rate": 1.613636363636364e-06, + "loss": 2.9268, + "step": 569 + }, + { + "epoch": 0.04858092559447712, + "grad_norm": 351.2831757826699, + "learning_rate": 1.616477272727273e-06, + "loss": 4.0198, + "step": 570 + }, + { + "epoch": 0.048666155288502516, + "grad_norm": 98.45589852290159, + "learning_rate": 1.6193181818181818e-06, + "loss": 3.2048, + "step": 571 + }, + { + "epoch": 0.048751384982527915, + "grad_norm": 215.04175395357103, + "learning_rate": 1.622159090909091e-06, + "loss": 3.0093, + "step": 572 + }, + { + "epoch": 0.048836614676553314, + "grad_norm": 255.62488480779717, + "learning_rate": 1.6250000000000001e-06, + "loss": 3.2949, + "step": 573 + }, + { + "epoch": 0.04892184437057871, + "grad_norm": 151.22194411487266, + "learning_rate": 1.6278409090909093e-06, + "loss": 2.742, + "step": 574 + }, + { + "epoch": 0.04900707406460411, + "grad_norm": 203.37178436448397, + "learning_rate": 1.6306818181818182e-06, + "loss": 3.173, + "step": 575 + }, + { + "epoch": 0.0490923037586295, + "grad_norm": 85.76256817798799, + "learning_rate": 1.6335227272727275e-06, + "loss": 2.646, + "step": 576 + }, + { + "epoch": 0.0491775334526549, + "grad_norm": 90.58317778295338, + "learning_rate": 1.6363636363636365e-06, + "loss": 3.3929, + "step": 577 + }, + { + "epoch": 0.0492627631466803, + "grad_norm": 133.21090406246554, + "learning_rate": 1.6392045454545454e-06, + "loss": 3.5198, + "step": 578 + }, + { + "epoch": 0.0493479928407057, + "grad_norm": 80.75014223302392, + "learning_rate": 1.6420454545454547e-06, + "loss": 2.8934, + "step": 579 + }, + { + "epoch": 0.0494332225347311, + "grad_norm": 133.89199470570333, + "learning_rate": 1.6448863636363639e-06, + "loss": 2.1825, + "step": 580 + }, + { + "epoch": 0.0495184522287565, + "grad_norm": 115.94176682863385, + "learning_rate": 1.6477272727272728e-06, + "loss": 2.4941, + "step": 581 + }, + { + "epoch": 0.049603681922781896, + "grad_norm": 230.50372103845382, + "learning_rate": 1.6505681818181822e-06, + "loss": 3.2877, + "step": 582 + }, + { + "epoch": 0.049688911616807295, + "grad_norm": 378.9202927793399, + "learning_rate": 1.653409090909091e-06, + "loss": 3.1323, + "step": 583 + }, + { + "epoch": 0.049774141310832694, + "grad_norm": 189.02908827941073, + "learning_rate": 1.65625e-06, + "loss": 3.3981, + "step": 584 + }, + { + "epoch": 0.04985937100485809, + "grad_norm": 209.81473663768142, + "learning_rate": 1.6590909090909094e-06, + "loss": 2.8993, + "step": 585 + }, + { + "epoch": 0.04994460069888349, + "grad_norm": 150.75265050926615, + "learning_rate": 1.6619318181818183e-06, + "loss": 3.9464, + "step": 586 + }, + { + "epoch": 0.05002983039290889, + "grad_norm": 169.21324834961467, + "learning_rate": 1.6647727272727274e-06, + "loss": 3.8052, + "step": 587 + }, + { + "epoch": 0.05011506008693429, + "grad_norm": 110.48175831231896, + "learning_rate": 1.6676136363636364e-06, + "loss": 2.7772, + "step": 588 + }, + { + "epoch": 0.05020028978095969, + "grad_norm": 68.72247321281397, + "learning_rate": 1.6704545454545457e-06, + "loss": 1.3411, + "step": 589 + }, + { + "epoch": 0.05028551947498509, + "grad_norm": 202.84219075784878, + "learning_rate": 1.6732954545454547e-06, + "loss": 3.2313, + "step": 590 + }, + { + "epoch": 0.050370749169010486, + "grad_norm": 118.97864726034997, + "learning_rate": 1.6761363636363636e-06, + "loss": 3.2432, + "step": 591 + }, + { + "epoch": 0.050455978863035884, + "grad_norm": 100.39971481554203, + "learning_rate": 1.678977272727273e-06, + "loss": 2.1818, + "step": 592 + }, + { + "epoch": 0.05054120855706128, + "grad_norm": 117.41160447680204, + "learning_rate": 1.6818181818181819e-06, + "loss": 3.5429, + "step": 593 + }, + { + "epoch": 0.05062643825108668, + "grad_norm": 239.42587160885014, + "learning_rate": 1.684659090909091e-06, + "loss": 3.4358, + "step": 594 + }, + { + "epoch": 0.050711667945112074, + "grad_norm": 540.5324870564154, + "learning_rate": 1.6875000000000001e-06, + "loss": 4.0371, + "step": 595 + }, + { + "epoch": 0.05079689763913747, + "grad_norm": 108.81942483846349, + "learning_rate": 1.6903409090909093e-06, + "loss": 2.3583, + "step": 596 + }, + { + "epoch": 0.05088212733316287, + "grad_norm": 84.78315574377856, + "learning_rate": 1.6931818181818182e-06, + "loss": 2.3229, + "step": 597 + }, + { + "epoch": 0.05096735702718827, + "grad_norm": 92.99324458917866, + "learning_rate": 1.6960227272727276e-06, + "loss": 2.8125, + "step": 598 + }, + { + "epoch": 0.05105258672121367, + "grad_norm": 81.12625897086768, + "learning_rate": 1.6988636363636365e-06, + "loss": 2.7491, + "step": 599 + }, + { + "epoch": 0.05113781641523907, + "grad_norm": 151.14786804984678, + "learning_rate": 1.7017045454545456e-06, + "loss": 2.937, + "step": 600 + }, + { + "epoch": 0.05122304610926447, + "grad_norm": 167.13766628108286, + "learning_rate": 1.7045454545454546e-06, + "loss": 3.1967, + "step": 601 + }, + { + "epoch": 0.051308275803289866, + "grad_norm": 92.27542832502121, + "learning_rate": 1.707386363636364e-06, + "loss": 3.4793, + "step": 602 + }, + { + "epoch": 0.051393505497315264, + "grad_norm": 88.16171474894897, + "learning_rate": 1.7102272727272729e-06, + "loss": 1.9996, + "step": 603 + }, + { + "epoch": 0.05147873519134066, + "grad_norm": 86.03237715513592, + "learning_rate": 1.7130681818181818e-06, + "loss": 2.203, + "step": 604 + }, + { + "epoch": 0.05156396488536606, + "grad_norm": 314.77076479592955, + "learning_rate": 1.7159090909090911e-06, + "loss": 3.5319, + "step": 605 + }, + { + "epoch": 0.05164919457939146, + "grad_norm": 287.8177344715754, + "learning_rate": 1.71875e-06, + "loss": 2.6442, + "step": 606 + }, + { + "epoch": 0.05173442427341686, + "grad_norm": 134.48197976006347, + "learning_rate": 1.7215909090909092e-06, + "loss": 3.2689, + "step": 607 + }, + { + "epoch": 0.05181965396744226, + "grad_norm": 103.38789908668626, + "learning_rate": 1.7244318181818183e-06, + "loss": 2.7293, + "step": 608 + }, + { + "epoch": 0.05190488366146766, + "grad_norm": 185.01916723809327, + "learning_rate": 1.7272727272727275e-06, + "loss": 2.1342, + "step": 609 + }, + { + "epoch": 0.051990113355493056, + "grad_norm": 92.20140211256039, + "learning_rate": 1.7301136363636364e-06, + "loss": 3.0265, + "step": 610 + }, + { + "epoch": 0.052075343049518455, + "grad_norm": 282.45516708478203, + "learning_rate": 1.7329545454545458e-06, + "loss": 2.9205, + "step": 611 + }, + { + "epoch": 0.052160572743543854, + "grad_norm": 108.7035024637814, + "learning_rate": 1.7357954545454547e-06, + "loss": 2.73, + "step": 612 + }, + { + "epoch": 0.05224580243756925, + "grad_norm": 75.50508258081435, + "learning_rate": 1.7386363636363638e-06, + "loss": 2.2166, + "step": 613 + }, + { + "epoch": 0.052331032131594644, + "grad_norm": 137.78637354037255, + "learning_rate": 1.7414772727272728e-06, + "loss": 2.7991, + "step": 614 + }, + { + "epoch": 0.05241626182562004, + "grad_norm": 169.12702340115808, + "learning_rate": 1.7443181818181821e-06, + "loss": 2.866, + "step": 615 + }, + { + "epoch": 0.05250149151964544, + "grad_norm": 122.97399874357292, + "learning_rate": 1.747159090909091e-06, + "loss": 2.7378, + "step": 616 + }, + { + "epoch": 0.05258672121367084, + "grad_norm": 196.9613549067064, + "learning_rate": 1.75e-06, + "loss": 3.8905, + "step": 617 + }, + { + "epoch": 0.05267195090769624, + "grad_norm": 142.91330422845812, + "learning_rate": 1.7528409090909093e-06, + "loss": 3.8859, + "step": 618 + }, + { + "epoch": 0.05275718060172164, + "grad_norm": 249.169841328107, + "learning_rate": 1.7556818181818183e-06, + "loss": 2.7953, + "step": 619 + }, + { + "epoch": 0.05284241029574704, + "grad_norm": 134.4905640019115, + "learning_rate": 1.7585227272727274e-06, + "loss": 3.4011, + "step": 620 + }, + { + "epoch": 0.052927639989772436, + "grad_norm": 293.56579839673054, + "learning_rate": 1.7613636363636365e-06, + "loss": 3.5261, + "step": 621 + }, + { + "epoch": 0.053012869683797835, + "grad_norm": 296.0683916537008, + "learning_rate": 1.7642045454545457e-06, + "loss": 2.9268, + "step": 622 + }, + { + "epoch": 0.053098099377823234, + "grad_norm": 117.22566378606271, + "learning_rate": 1.7670454545454546e-06, + "loss": 2.8392, + "step": 623 + }, + { + "epoch": 0.05318332907184863, + "grad_norm": 121.42152035964865, + "learning_rate": 1.769886363636364e-06, + "loss": 1.686, + "step": 624 + }, + { + "epoch": 0.05326855876587403, + "grad_norm": 318.55174295243893, + "learning_rate": 1.7727272727272729e-06, + "loss": 4.0349, + "step": 625 + }, + { + "epoch": 0.05335378845989943, + "grad_norm": 244.91396734014666, + "learning_rate": 1.7755681818181818e-06, + "loss": 3.5294, + "step": 626 + }, + { + "epoch": 0.05343901815392483, + "grad_norm": 170.17947042723455, + "learning_rate": 1.778409090909091e-06, + "loss": 3.0313, + "step": 627 + }, + { + "epoch": 0.05352424784795023, + "grad_norm": 153.57665215113798, + "learning_rate": 1.78125e-06, + "loss": 3.1912, + "step": 628 + }, + { + "epoch": 0.05360947754197563, + "grad_norm": 107.17443206425132, + "learning_rate": 1.7840909090909092e-06, + "loss": 3.1267, + "step": 629 + }, + { + "epoch": 0.053694707236001026, + "grad_norm": 231.69993566059256, + "learning_rate": 1.7869318181818182e-06, + "loss": 3.513, + "step": 630 + }, + { + "epoch": 0.053779936930026424, + "grad_norm": 68.40674286902372, + "learning_rate": 1.7897727272727275e-06, + "loss": 2.0268, + "step": 631 + }, + { + "epoch": 0.053865166624051816, + "grad_norm": 224.3355423139148, + "learning_rate": 1.7926136363636364e-06, + "loss": 3.5682, + "step": 632 + }, + { + "epoch": 0.053950396318077215, + "grad_norm": 351.7809778387707, + "learning_rate": 1.7954545454545456e-06, + "loss": 2.9339, + "step": 633 + }, + { + "epoch": 0.054035626012102614, + "grad_norm": 103.04737845064513, + "learning_rate": 1.7982954545454547e-06, + "loss": 2.3759, + "step": 634 + }, + { + "epoch": 0.05412085570612801, + "grad_norm": 362.6290145755373, + "learning_rate": 1.8011363636363639e-06, + "loss": 3.3751, + "step": 635 + }, + { + "epoch": 0.05420608540015341, + "grad_norm": 502.02466303856573, + "learning_rate": 1.8039772727272728e-06, + "loss": 3.7222, + "step": 636 + }, + { + "epoch": 0.05429131509417881, + "grad_norm": 111.66239342271703, + "learning_rate": 1.8068181818181822e-06, + "loss": 3.2566, + "step": 637 + }, + { + "epoch": 0.05437654478820421, + "grad_norm": 201.9468164292666, + "learning_rate": 1.809659090909091e-06, + "loss": 2.4924, + "step": 638 + }, + { + "epoch": 0.05446177448222961, + "grad_norm": 212.146810360993, + "learning_rate": 1.8125e-06, + "loss": 3.2202, + "step": 639 + }, + { + "epoch": 0.05454700417625501, + "grad_norm": 325.9984640027333, + "learning_rate": 1.8153409090909094e-06, + "loss": 3.0447, + "step": 640 + }, + { + "epoch": 0.054632233870280406, + "grad_norm": 701.855796427334, + "learning_rate": 1.8181818181818183e-06, + "loss": 4.0576, + "step": 641 + }, + { + "epoch": 0.054717463564305804, + "grad_norm": 337.10987838723577, + "learning_rate": 1.8210227272727274e-06, + "loss": 3.3987, + "step": 642 + }, + { + "epoch": 0.0548026932583312, + "grad_norm": 69.39387473074609, + "learning_rate": 1.8238636363636364e-06, + "loss": 1.8697, + "step": 643 + }, + { + "epoch": 0.0548879229523566, + "grad_norm": 124.1531100268806, + "learning_rate": 1.8267045454545457e-06, + "loss": 2.5264, + "step": 644 + }, + { + "epoch": 0.054973152646382, + "grad_norm": 99.93052318091566, + "learning_rate": 1.8295454545454546e-06, + "loss": 3.1018, + "step": 645 + }, + { + "epoch": 0.0550583823404074, + "grad_norm": 84.58696076159715, + "learning_rate": 1.8323863636363638e-06, + "loss": 2.9571, + "step": 646 + }, + { + "epoch": 0.0551436120344328, + "grad_norm": 93.99042607017661, + "learning_rate": 1.835227272727273e-06, + "loss": 2.6024, + "step": 647 + }, + { + "epoch": 0.0552288417284582, + "grad_norm": 272.13862764923164, + "learning_rate": 1.838068181818182e-06, + "loss": 3.1271, + "step": 648 + }, + { + "epoch": 0.055314071422483596, + "grad_norm": 103.630154070562, + "learning_rate": 1.840909090909091e-06, + "loss": 2.6693, + "step": 649 + }, + { + "epoch": 0.055399301116508995, + "grad_norm": 570.5210721374958, + "learning_rate": 1.8437500000000003e-06, + "loss": 3.914, + "step": 650 + }, + { + "epoch": 0.05548453081053439, + "grad_norm": 100.39078050302747, + "learning_rate": 1.8465909090909093e-06, + "loss": 3.2804, + "step": 651 + }, + { + "epoch": 0.055569760504559786, + "grad_norm": 151.22096642772266, + "learning_rate": 1.8494318181818182e-06, + "loss": 2.7697, + "step": 652 + }, + { + "epoch": 0.055654990198585184, + "grad_norm": 237.26833374063244, + "learning_rate": 1.8522727272727276e-06, + "loss": 2.8453, + "step": 653 + }, + { + "epoch": 0.05574021989261058, + "grad_norm": 95.83734672684601, + "learning_rate": 1.8551136363636365e-06, + "loss": 2.1953, + "step": 654 + }, + { + "epoch": 0.05582544958663598, + "grad_norm": 104.41050438539779, + "learning_rate": 1.8579545454545456e-06, + "loss": 2.8912, + "step": 655 + }, + { + "epoch": 0.05591067928066138, + "grad_norm": 98.79390605660858, + "learning_rate": 1.8607954545454546e-06, + "loss": 2.9571, + "step": 656 + }, + { + "epoch": 0.05599590897468678, + "grad_norm": 109.08408122817562, + "learning_rate": 1.863636363636364e-06, + "loss": 3.245, + "step": 657 + }, + { + "epoch": 0.05608113866871218, + "grad_norm": 273.8517173820915, + "learning_rate": 1.8664772727272728e-06, + "loss": 2.752, + "step": 658 + }, + { + "epoch": 0.05616636836273758, + "grad_norm": 209.62096947625872, + "learning_rate": 1.8693181818181818e-06, + "loss": 2.551, + "step": 659 + }, + { + "epoch": 0.056251598056762976, + "grad_norm": 180.66667645333672, + "learning_rate": 1.8721590909090911e-06, + "loss": 3.459, + "step": 660 + }, + { + "epoch": 0.056336827750788375, + "grad_norm": 159.7240513565266, + "learning_rate": 1.8750000000000003e-06, + "loss": 3.0915, + "step": 661 + }, + { + "epoch": 0.056422057444813774, + "grad_norm": 86.4797306293979, + "learning_rate": 1.8778409090909092e-06, + "loss": 2.9995, + "step": 662 + }, + { + "epoch": 0.05650728713883917, + "grad_norm": 192.37139693808567, + "learning_rate": 1.8806818181818185e-06, + "loss": 3.4586, + "step": 663 + }, + { + "epoch": 0.05659251683286457, + "grad_norm": 100.64197493585782, + "learning_rate": 1.8835227272727275e-06, + "loss": 2.755, + "step": 664 + }, + { + "epoch": 0.05667774652688997, + "grad_norm": 129.31448662586936, + "learning_rate": 1.8863636363636364e-06, + "loss": 3.0019, + "step": 665 + }, + { + "epoch": 0.05676297622091537, + "grad_norm": 115.94487626305168, + "learning_rate": 1.8892045454545458e-06, + "loss": 3.314, + "step": 666 + }, + { + "epoch": 0.05684820591494077, + "grad_norm": 155.24263179885585, + "learning_rate": 1.8920454545454547e-06, + "loss": 2.7567, + "step": 667 + }, + { + "epoch": 0.05693343560896617, + "grad_norm": 99.64210082203476, + "learning_rate": 1.8948863636363638e-06, + "loss": 3.1076, + "step": 668 + }, + { + "epoch": 0.057018665302991565, + "grad_norm": 84.12122480375898, + "learning_rate": 1.8977272727272727e-06, + "loss": 2.5114, + "step": 669 + }, + { + "epoch": 0.05710389499701696, + "grad_norm": 111.44113153848936, + "learning_rate": 1.900568181818182e-06, + "loss": 2.8893, + "step": 670 + }, + { + "epoch": 0.057189124691042356, + "grad_norm": 91.32797156636913, + "learning_rate": 1.903409090909091e-06, + "loss": 1.9039, + "step": 671 + }, + { + "epoch": 0.057274354385067755, + "grad_norm": 94.3754654675019, + "learning_rate": 1.90625e-06, + "loss": 3.2669, + "step": 672 + }, + { + "epoch": 0.057359584079093154, + "grad_norm": 97.0112146411636, + "learning_rate": 1.9090909090909095e-06, + "loss": 3.2047, + "step": 673 + }, + { + "epoch": 0.05744481377311855, + "grad_norm": 139.38127997741464, + "learning_rate": 1.9119318181818185e-06, + "loss": 2.7982, + "step": 674 + }, + { + "epoch": 0.05753004346714395, + "grad_norm": 134.50978266363106, + "learning_rate": 1.9147727272727274e-06, + "loss": 3.1724, + "step": 675 + }, + { + "epoch": 0.05761527316116935, + "grad_norm": 118.67421509901695, + "learning_rate": 1.9176136363636367e-06, + "loss": 1.9564, + "step": 676 + }, + { + "epoch": 0.05770050285519475, + "grad_norm": 226.9957755386141, + "learning_rate": 1.9204545454545457e-06, + "loss": 3.0788, + "step": 677 + }, + { + "epoch": 0.05778573254922015, + "grad_norm": 416.3559663153018, + "learning_rate": 1.9232954545454546e-06, + "loss": 3.8418, + "step": 678 + }, + { + "epoch": 0.05787096224324555, + "grad_norm": 96.25629897616096, + "learning_rate": 1.926136363636364e-06, + "loss": 2.6422, + "step": 679 + }, + { + "epoch": 0.057956191937270946, + "grad_norm": 124.38609154991805, + "learning_rate": 1.928977272727273e-06, + "loss": 2.4819, + "step": 680 + }, + { + "epoch": 0.058041421631296344, + "grad_norm": 93.79656758482787, + "learning_rate": 1.931818181818182e-06, + "loss": 3.4113, + "step": 681 + }, + { + "epoch": 0.05812665132532174, + "grad_norm": 125.33188800653063, + "learning_rate": 1.9346590909090907e-06, + "loss": 2.6617, + "step": 682 + }, + { + "epoch": 0.05821188101934714, + "grad_norm": 150.40408451327107, + "learning_rate": 1.9375e-06, + "loss": 3.5331, + "step": 683 + }, + { + "epoch": 0.05829711071337254, + "grad_norm": 85.80322810675051, + "learning_rate": 1.940340909090909e-06, + "loss": 2.3076, + "step": 684 + }, + { + "epoch": 0.05838234040739794, + "grad_norm": 80.19307861322767, + "learning_rate": 1.9431818181818184e-06, + "loss": 3.0792, + "step": 685 + }, + { + "epoch": 0.05846757010142334, + "grad_norm": 210.3310240389992, + "learning_rate": 1.9460227272727277e-06, + "loss": 3.662, + "step": 686 + }, + { + "epoch": 0.05855279979544874, + "grad_norm": 92.89215684209125, + "learning_rate": 1.9488636363636366e-06, + "loss": 2.2559, + "step": 687 + }, + { + "epoch": 0.058638029489474136, + "grad_norm": 79.35865479695829, + "learning_rate": 1.9517045454545456e-06, + "loss": 2.9808, + "step": 688 + }, + { + "epoch": 0.05872325918349953, + "grad_norm": 104.38942395887877, + "learning_rate": 1.954545454545455e-06, + "loss": 2.8975, + "step": 689 + }, + { + "epoch": 0.05880848887752493, + "grad_norm": 75.79598037186831, + "learning_rate": 1.957386363636364e-06, + "loss": 3.3207, + "step": 690 + }, + { + "epoch": 0.058893718571550326, + "grad_norm": 134.50038863026066, + "learning_rate": 1.9602272727272728e-06, + "loss": 3.6455, + "step": 691 + }, + { + "epoch": 0.058978948265575724, + "grad_norm": 95.51269036433153, + "learning_rate": 1.963068181818182e-06, + "loss": 2.7537, + "step": 692 + }, + { + "epoch": 0.05906417795960112, + "grad_norm": 89.12864356647934, + "learning_rate": 1.965909090909091e-06, + "loss": 3.0772, + "step": 693 + }, + { + "epoch": 0.05914940765362652, + "grad_norm": 118.97393068031985, + "learning_rate": 1.96875e-06, + "loss": 3.011, + "step": 694 + }, + { + "epoch": 0.05923463734765192, + "grad_norm": 126.10576027240839, + "learning_rate": 1.9715909090909093e-06, + "loss": 3.8335, + "step": 695 + }, + { + "epoch": 0.05931986704167732, + "grad_norm": 122.23503578011534, + "learning_rate": 1.9744318181818183e-06, + "loss": 1.5561, + "step": 696 + }, + { + "epoch": 0.05940509673570272, + "grad_norm": 93.31079682932447, + "learning_rate": 1.977272727272727e-06, + "loss": 2.2052, + "step": 697 + }, + { + "epoch": 0.05949032642972812, + "grad_norm": 203.52303442121013, + "learning_rate": 1.9801136363636366e-06, + "loss": 2.2283, + "step": 698 + }, + { + "epoch": 0.059575556123753516, + "grad_norm": 182.96166032631865, + "learning_rate": 1.9829545454545455e-06, + "loss": 3.6015, + "step": 699 + }, + { + "epoch": 0.059660785817778915, + "grad_norm": 244.63132930637306, + "learning_rate": 1.985795454545455e-06, + "loss": 4.533, + "step": 700 + }, + { + "epoch": 0.059746015511804314, + "grad_norm": 92.2744392921235, + "learning_rate": 1.9886363636363638e-06, + "loss": 2.9714, + "step": 701 + }, + { + "epoch": 0.05983124520582971, + "grad_norm": 76.64570101490904, + "learning_rate": 1.991477272727273e-06, + "loss": 2.2828, + "step": 702 + }, + { + "epoch": 0.05991647489985511, + "grad_norm": 116.64787470013063, + "learning_rate": 1.994318181818182e-06, + "loss": 2.3846, + "step": 703 + }, + { + "epoch": 0.06000170459388051, + "grad_norm": 177.03867411543212, + "learning_rate": 1.997159090909091e-06, + "loss": 3.3915, + "step": 704 + }, + { + "epoch": 0.06008693428790591, + "grad_norm": 74.67284744092473, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.9187, + "step": 705 + }, + { + "epoch": 0.06017216398193131, + "grad_norm": 132.2981945709264, + "learning_rate": 2.0028409090909093e-06, + "loss": 2.8524, + "step": 706 + }, + { + "epoch": 0.06025739367595671, + "grad_norm": 69.58206297555499, + "learning_rate": 2.005681818181818e-06, + "loss": 1.946, + "step": 707 + }, + { + "epoch": 0.0603426233699821, + "grad_norm": 128.23235669605415, + "learning_rate": 2.0085227272727275e-06, + "loss": 3.4079, + "step": 708 + }, + { + "epoch": 0.0604278530640075, + "grad_norm": 220.55848705572106, + "learning_rate": 2.0113636363636365e-06, + "loss": 3.3475, + "step": 709 + }, + { + "epoch": 0.060513082758032896, + "grad_norm": 93.68664540977792, + "learning_rate": 2.0142045454545454e-06, + "loss": 2.8832, + "step": 710 + }, + { + "epoch": 0.060598312452058295, + "grad_norm": 127.3995991808329, + "learning_rate": 2.0170454545454548e-06, + "loss": 2.9606, + "step": 711 + }, + { + "epoch": 0.060683542146083694, + "grad_norm": 140.8747959806414, + "learning_rate": 2.0198863636363637e-06, + "loss": 2.6641, + "step": 712 + }, + { + "epoch": 0.06076877184010909, + "grad_norm": 88.27416156283081, + "learning_rate": 2.022727272727273e-06, + "loss": 2.8727, + "step": 713 + }, + { + "epoch": 0.06085400153413449, + "grad_norm": 184.84951802383813, + "learning_rate": 2.025568181818182e-06, + "loss": 3.2773, + "step": 714 + }, + { + "epoch": 0.06093923122815989, + "grad_norm": 76.92099997156289, + "learning_rate": 2.0284090909090913e-06, + "loss": 2.3403, + "step": 715 + }, + { + "epoch": 0.06102446092218529, + "grad_norm": 128.12544480169103, + "learning_rate": 2.0312500000000002e-06, + "loss": 2.9844, + "step": 716 + }, + { + "epoch": 0.06110969061621069, + "grad_norm": 112.4009514860719, + "learning_rate": 2.034090909090909e-06, + "loss": 3.9743, + "step": 717 + }, + { + "epoch": 0.06119492031023609, + "grad_norm": 97.46255311521499, + "learning_rate": 2.0369318181818185e-06, + "loss": 2.5596, + "step": 718 + }, + { + "epoch": 0.061280150004261486, + "grad_norm": 81.12846742760951, + "learning_rate": 2.0397727272727275e-06, + "loss": 2.0996, + "step": 719 + }, + { + "epoch": 0.061365379698286884, + "grad_norm": 128.11359062618226, + "learning_rate": 2.0426136363636364e-06, + "loss": 3.1086, + "step": 720 + }, + { + "epoch": 0.06145060939231228, + "grad_norm": 227.57176401145367, + "learning_rate": 2.0454545454545457e-06, + "loss": 2.4235, + "step": 721 + }, + { + "epoch": 0.06153583908633768, + "grad_norm": 424.71700763039155, + "learning_rate": 2.0482954545454547e-06, + "loss": 4.1135, + "step": 722 + }, + { + "epoch": 0.06162106878036308, + "grad_norm": 103.12470093204881, + "learning_rate": 2.0511363636363636e-06, + "loss": 2.6761, + "step": 723 + }, + { + "epoch": 0.06170629847438848, + "grad_norm": 67.59696372050068, + "learning_rate": 2.053977272727273e-06, + "loss": 2.2752, + "step": 724 + }, + { + "epoch": 0.06179152816841388, + "grad_norm": 281.58250357396037, + "learning_rate": 2.056818181818182e-06, + "loss": 3.936, + "step": 725 + }, + { + "epoch": 0.06187675786243928, + "grad_norm": 102.51210267266669, + "learning_rate": 2.0596590909090912e-06, + "loss": 2.7404, + "step": 726 + }, + { + "epoch": 0.06196198755646467, + "grad_norm": 91.68331110860395, + "learning_rate": 2.0625e-06, + "loss": 3.3426, + "step": 727 + }, + { + "epoch": 0.06204721725049007, + "grad_norm": 346.6017429091905, + "learning_rate": 2.0653409090909095e-06, + "loss": 3.415, + "step": 728 + }, + { + "epoch": 0.06213244694451547, + "grad_norm": 250.2888491792013, + "learning_rate": 2.0681818181818184e-06, + "loss": 3.1777, + "step": 729 + }, + { + "epoch": 0.062217676638540866, + "grad_norm": 180.3823568103876, + "learning_rate": 2.0710227272727274e-06, + "loss": 3.7463, + "step": 730 + }, + { + "epoch": 0.062302906332566264, + "grad_norm": 194.91448379759973, + "learning_rate": 2.0738636363636367e-06, + "loss": 3.0192, + "step": 731 + }, + { + "epoch": 0.06238813602659166, + "grad_norm": 81.71709526440577, + "learning_rate": 2.0767045454545456e-06, + "loss": 2.9043, + "step": 732 + }, + { + "epoch": 0.06247336572061706, + "grad_norm": 73.22770119148352, + "learning_rate": 2.0795454545454546e-06, + "loss": 3.0531, + "step": 733 + }, + { + "epoch": 0.06255859541464247, + "grad_norm": 225.30389148795325, + "learning_rate": 2.082386363636364e-06, + "loss": 4.0511, + "step": 734 + }, + { + "epoch": 0.06264382510866787, + "grad_norm": 154.43855830949764, + "learning_rate": 2.085227272727273e-06, + "loss": 2.526, + "step": 735 + }, + { + "epoch": 0.06272905480269325, + "grad_norm": 293.2920137385253, + "learning_rate": 2.088068181818182e-06, + "loss": 2.1601, + "step": 736 + }, + { + "epoch": 0.06281428449671865, + "grad_norm": 126.41191200556041, + "learning_rate": 2.090909090909091e-06, + "loss": 2.431, + "step": 737 + }, + { + "epoch": 0.06289951419074405, + "grad_norm": 103.50114983718008, + "learning_rate": 2.09375e-06, + "loss": 2.7851, + "step": 738 + }, + { + "epoch": 0.06298474388476945, + "grad_norm": 137.36313504225322, + "learning_rate": 2.0965909090909094e-06, + "loss": 3.9344, + "step": 739 + }, + { + "epoch": 0.06306997357879485, + "grad_norm": 76.60464165796903, + "learning_rate": 2.0994318181818184e-06, + "loss": 1.994, + "step": 740 + }, + { + "epoch": 0.06315520327282025, + "grad_norm": 198.4371886636514, + "learning_rate": 2.1022727272727277e-06, + "loss": 3.892, + "step": 741 + }, + { + "epoch": 0.06324043296684564, + "grad_norm": 130.37099438812675, + "learning_rate": 2.1051136363636366e-06, + "loss": 3.7942, + "step": 742 + }, + { + "epoch": 0.06332566266087104, + "grad_norm": 113.73152361711635, + "learning_rate": 2.1079545454545456e-06, + "loss": 2.4221, + "step": 743 + }, + { + "epoch": 0.06341089235489644, + "grad_norm": 168.05809585323595, + "learning_rate": 2.110795454545455e-06, + "loss": 2.7329, + "step": 744 + }, + { + "epoch": 0.06349612204892184, + "grad_norm": 95.88360735148757, + "learning_rate": 2.113636363636364e-06, + "loss": 2.7909, + "step": 745 + }, + { + "epoch": 0.06358135174294724, + "grad_norm": 130.66988952824533, + "learning_rate": 2.1164772727272728e-06, + "loss": 2.3101, + "step": 746 + }, + { + "epoch": 0.06366658143697264, + "grad_norm": 143.33339095817314, + "learning_rate": 2.119318181818182e-06, + "loss": 3.4952, + "step": 747 + }, + { + "epoch": 0.06375181113099804, + "grad_norm": 294.65389792894274, + "learning_rate": 2.122159090909091e-06, + "loss": 3.3898, + "step": 748 + }, + { + "epoch": 0.06383704082502344, + "grad_norm": 177.49364334278104, + "learning_rate": 2.125e-06, + "loss": 3.0845, + "step": 749 + }, + { + "epoch": 0.06392227051904883, + "grad_norm": 112.26877540148526, + "learning_rate": 2.1278409090909093e-06, + "loss": 2.9358, + "step": 750 + }, + { + "epoch": 0.06400750021307423, + "grad_norm": 72.37197509361678, + "learning_rate": 2.1306818181818183e-06, + "loss": 2.8618, + "step": 751 + }, + { + "epoch": 0.06409272990709963, + "grad_norm": 107.03848408983795, + "learning_rate": 2.1335227272727276e-06, + "loss": 3.5571, + "step": 752 + }, + { + "epoch": 0.06417795960112503, + "grad_norm": 81.55433612526946, + "learning_rate": 2.1363636363636365e-06, + "loss": 2.5884, + "step": 753 + }, + { + "epoch": 0.06426318929515043, + "grad_norm": 200.80277633076994, + "learning_rate": 2.139204545454546e-06, + "loss": 2.9492, + "step": 754 + }, + { + "epoch": 0.06434841898917583, + "grad_norm": 355.80448613483725, + "learning_rate": 2.142045454545455e-06, + "loss": 3.721, + "step": 755 + }, + { + "epoch": 0.06443364868320123, + "grad_norm": 159.19181975539837, + "learning_rate": 2.1448863636363638e-06, + "loss": 3.183, + "step": 756 + }, + { + "epoch": 0.06451887837722663, + "grad_norm": 121.0966589123692, + "learning_rate": 2.147727272727273e-06, + "loss": 3.9756, + "step": 757 + }, + { + "epoch": 0.06460410807125203, + "grad_norm": 98.65756383409283, + "learning_rate": 2.150568181818182e-06, + "loss": 3.1994, + "step": 758 + }, + { + "epoch": 0.06468933776527742, + "grad_norm": 117.08813187057346, + "learning_rate": 2.153409090909091e-06, + "loss": 2.5817, + "step": 759 + }, + { + "epoch": 0.06477456745930282, + "grad_norm": 130.57884678023487, + "learning_rate": 2.1562500000000003e-06, + "loss": 3.1371, + "step": 760 + }, + { + "epoch": 0.06485979715332822, + "grad_norm": 84.29597543036861, + "learning_rate": 2.1590909090909092e-06, + "loss": 2.3181, + "step": 761 + }, + { + "epoch": 0.06494502684735362, + "grad_norm": 238.86896093721822, + "learning_rate": 2.161931818181818e-06, + "loss": 2.8487, + "step": 762 + }, + { + "epoch": 0.06503025654137902, + "grad_norm": 300.4796063601937, + "learning_rate": 2.1647727272727275e-06, + "loss": 4.7535, + "step": 763 + }, + { + "epoch": 0.06511548623540442, + "grad_norm": 98.17392225406664, + "learning_rate": 2.1676136363636365e-06, + "loss": 1.6438, + "step": 764 + }, + { + "epoch": 0.06520071592942982, + "grad_norm": 320.6589743857015, + "learning_rate": 2.1704545454545454e-06, + "loss": 3.4764, + "step": 765 + }, + { + "epoch": 0.06528594562345522, + "grad_norm": 197.47939339732616, + "learning_rate": 2.1732954545454547e-06, + "loss": 3.6222, + "step": 766 + }, + { + "epoch": 0.06537117531748061, + "grad_norm": 73.2747893868519, + "learning_rate": 2.1761363636363637e-06, + "loss": 2.9822, + "step": 767 + }, + { + "epoch": 0.06545640501150601, + "grad_norm": 93.31945482225413, + "learning_rate": 2.178977272727273e-06, + "loss": 2.5796, + "step": 768 + }, + { + "epoch": 0.06554163470553141, + "grad_norm": 83.29555783511445, + "learning_rate": 2.181818181818182e-06, + "loss": 2.975, + "step": 769 + }, + { + "epoch": 0.06562686439955681, + "grad_norm": 119.27280591356572, + "learning_rate": 2.1846590909090913e-06, + "loss": 3.2529, + "step": 770 + }, + { + "epoch": 0.06571209409358221, + "grad_norm": 85.78230933050071, + "learning_rate": 2.1875000000000002e-06, + "loss": 2.5014, + "step": 771 + }, + { + "epoch": 0.06579732378760761, + "grad_norm": 269.05785657870285, + "learning_rate": 2.190340909090909e-06, + "loss": 3.7994, + "step": 772 + }, + { + "epoch": 0.06588255348163301, + "grad_norm": 134.14001882898447, + "learning_rate": 2.1931818181818185e-06, + "loss": 3.4308, + "step": 773 + }, + { + "epoch": 0.06596778317565839, + "grad_norm": 92.9857322497726, + "learning_rate": 2.1960227272727274e-06, + "loss": 2.6723, + "step": 774 + }, + { + "epoch": 0.06605301286968379, + "grad_norm": 123.00560139996175, + "learning_rate": 2.1988636363636364e-06, + "loss": 2.9152, + "step": 775 + }, + { + "epoch": 0.06613824256370919, + "grad_norm": 101.29038850365527, + "learning_rate": 2.2017045454545457e-06, + "loss": 3.2626, + "step": 776 + }, + { + "epoch": 0.06622347225773459, + "grad_norm": 160.35813888161718, + "learning_rate": 2.2045454545454547e-06, + "loss": 3.0431, + "step": 777 + }, + { + "epoch": 0.06630870195175999, + "grad_norm": 203.7512260206057, + "learning_rate": 2.2073863636363636e-06, + "loss": 2.3285, + "step": 778 + }, + { + "epoch": 0.06639393164578539, + "grad_norm": 177.55787186674118, + "learning_rate": 2.210227272727273e-06, + "loss": 3.5051, + "step": 779 + }, + { + "epoch": 0.06647916133981079, + "grad_norm": 313.50468788241716, + "learning_rate": 2.213068181818182e-06, + "loss": 3.8857, + "step": 780 + }, + { + "epoch": 0.06656439103383618, + "grad_norm": 158.2712090430371, + "learning_rate": 2.2159090909090912e-06, + "loss": 2.5373, + "step": 781 + }, + { + "epoch": 0.06664962072786158, + "grad_norm": 138.75773082970247, + "learning_rate": 2.21875e-06, + "loss": 2.6563, + "step": 782 + }, + { + "epoch": 0.06673485042188698, + "grad_norm": 90.66231428577817, + "learning_rate": 2.2215909090909095e-06, + "loss": 2.8064, + "step": 783 + }, + { + "epoch": 0.06682008011591238, + "grad_norm": 83.44550390005918, + "learning_rate": 2.2244318181818184e-06, + "loss": 2.6869, + "step": 784 + }, + { + "epoch": 0.06690530980993778, + "grad_norm": 104.0344661051804, + "learning_rate": 2.2272727272727274e-06, + "loss": 3.1629, + "step": 785 + }, + { + "epoch": 0.06699053950396318, + "grad_norm": 241.10908318021438, + "learning_rate": 2.2301136363636367e-06, + "loss": 3.1677, + "step": 786 + }, + { + "epoch": 0.06707576919798858, + "grad_norm": 80.14867800980899, + "learning_rate": 2.2329545454545456e-06, + "loss": 2.3642, + "step": 787 + }, + { + "epoch": 0.06716099889201398, + "grad_norm": 138.547238319344, + "learning_rate": 2.2357954545454546e-06, + "loss": 3.1626, + "step": 788 + }, + { + "epoch": 0.06724622858603937, + "grad_norm": 340.62681045832994, + "learning_rate": 2.238636363636364e-06, + "loss": 5.2369, + "step": 789 + }, + { + "epoch": 0.06733145828006477, + "grad_norm": 131.71455538360266, + "learning_rate": 2.241477272727273e-06, + "loss": 3.4046, + "step": 790 + }, + { + "epoch": 0.06741668797409017, + "grad_norm": 140.04491005968447, + "learning_rate": 2.2443181818181818e-06, + "loss": 2.8986, + "step": 791 + }, + { + "epoch": 0.06750191766811557, + "grad_norm": 65.30134468780508, + "learning_rate": 2.247159090909091e-06, + "loss": 2.6501, + "step": 792 + }, + { + "epoch": 0.06758714736214097, + "grad_norm": 97.5495393902277, + "learning_rate": 2.25e-06, + "loss": 2.4095, + "step": 793 + }, + { + "epoch": 0.06767237705616637, + "grad_norm": 68.97114565134363, + "learning_rate": 2.2528409090909094e-06, + "loss": 4.039, + "step": 794 + }, + { + "epoch": 0.06775760675019177, + "grad_norm": 83.92193249119406, + "learning_rate": 2.2556818181818183e-06, + "loss": 2.287, + "step": 795 + }, + { + "epoch": 0.06784283644421717, + "grad_norm": 101.26092854106082, + "learning_rate": 2.2585227272727277e-06, + "loss": 2.4266, + "step": 796 + }, + { + "epoch": 0.06792806613824257, + "grad_norm": 101.26517540400351, + "learning_rate": 2.2613636363636366e-06, + "loss": 3.2706, + "step": 797 + }, + { + "epoch": 0.06801329583226796, + "grad_norm": 111.69777819500247, + "learning_rate": 2.2642045454545455e-06, + "loss": 2.2806, + "step": 798 + }, + { + "epoch": 0.06809852552629336, + "grad_norm": 87.05078896179495, + "learning_rate": 2.267045454545455e-06, + "loss": 3.2644, + "step": 799 + }, + { + "epoch": 0.06818375522031876, + "grad_norm": 104.69825051402906, + "learning_rate": 2.269886363636364e-06, + "loss": 2.8523, + "step": 800 + }, + { + "epoch": 0.06826898491434416, + "grad_norm": 149.6644770859519, + "learning_rate": 2.2727272727272728e-06, + "loss": 2.5292, + "step": 801 + }, + { + "epoch": 0.06835421460836956, + "grad_norm": 80.1340630878674, + "learning_rate": 2.275568181818182e-06, + "loss": 3.1055, + "step": 802 + }, + { + "epoch": 0.06843944430239496, + "grad_norm": 134.53265813844448, + "learning_rate": 2.278409090909091e-06, + "loss": 3.3309, + "step": 803 + }, + { + "epoch": 0.06852467399642036, + "grad_norm": 112.71851629177299, + "learning_rate": 2.28125e-06, + "loss": 3.1313, + "step": 804 + }, + { + "epoch": 0.06860990369044576, + "grad_norm": 84.4447264576877, + "learning_rate": 2.2840909090909093e-06, + "loss": 2.2132, + "step": 805 + }, + { + "epoch": 0.06869513338447115, + "grad_norm": 138.60957277710725, + "learning_rate": 2.2869318181818183e-06, + "loss": 3.0337, + "step": 806 + }, + { + "epoch": 0.06878036307849655, + "grad_norm": 160.02016976718792, + "learning_rate": 2.2897727272727276e-06, + "loss": 3.4037, + "step": 807 + }, + { + "epoch": 0.06886559277252195, + "grad_norm": 229.21789638653382, + "learning_rate": 2.2926136363636365e-06, + "loss": 3.2154, + "step": 808 + }, + { + "epoch": 0.06895082246654735, + "grad_norm": 131.1804092461035, + "learning_rate": 2.295454545454546e-06, + "loss": 3.723, + "step": 809 + }, + { + "epoch": 0.06903605216057275, + "grad_norm": 226.72415568103247, + "learning_rate": 2.298295454545455e-06, + "loss": 4.7227, + "step": 810 + }, + { + "epoch": 0.06912128185459815, + "grad_norm": 141.9635163959684, + "learning_rate": 2.3011363636363637e-06, + "loss": 3.4792, + "step": 811 + }, + { + "epoch": 0.06920651154862353, + "grad_norm": 250.1908049325916, + "learning_rate": 2.303977272727273e-06, + "loss": 3.6493, + "step": 812 + }, + { + "epoch": 0.06929174124264893, + "grad_norm": 69.57223119007774, + "learning_rate": 2.306818181818182e-06, + "loss": 1.8585, + "step": 813 + }, + { + "epoch": 0.06937697093667433, + "grad_norm": 114.27950881711398, + "learning_rate": 2.309659090909091e-06, + "loss": 3.2557, + "step": 814 + }, + { + "epoch": 0.06946220063069973, + "grad_norm": 125.38240488110007, + "learning_rate": 2.3125000000000003e-06, + "loss": 3.2324, + "step": 815 + }, + { + "epoch": 0.06954743032472513, + "grad_norm": 122.76080731877045, + "learning_rate": 2.3153409090909092e-06, + "loss": 3.4005, + "step": 816 + }, + { + "epoch": 0.06963266001875053, + "grad_norm": 131.36448611336817, + "learning_rate": 2.318181818181818e-06, + "loss": 2.3671, + "step": 817 + }, + { + "epoch": 0.06971788971277593, + "grad_norm": 96.24242511438659, + "learning_rate": 2.3210227272727275e-06, + "loss": 1.99, + "step": 818 + }, + { + "epoch": 0.06980311940680133, + "grad_norm": 136.51600630175417, + "learning_rate": 2.3238636363636364e-06, + "loss": 3.265, + "step": 819 + }, + { + "epoch": 0.06988834910082672, + "grad_norm": 78.92784227169044, + "learning_rate": 2.326704545454546e-06, + "loss": 3.1466, + "step": 820 + }, + { + "epoch": 0.06997357879485212, + "grad_norm": 65.52613689938426, + "learning_rate": 2.3295454545454547e-06, + "loss": 2.1434, + "step": 821 + }, + { + "epoch": 0.07005880848887752, + "grad_norm": 82.867941732871, + "learning_rate": 2.332386363636364e-06, + "loss": 2.7243, + "step": 822 + }, + { + "epoch": 0.07014403818290292, + "grad_norm": 84.5141781191738, + "learning_rate": 2.335227272727273e-06, + "loss": 3.0669, + "step": 823 + }, + { + "epoch": 0.07022926787692832, + "grad_norm": 142.60496742736942, + "learning_rate": 2.338068181818182e-06, + "loss": 2.8998, + "step": 824 + }, + { + "epoch": 0.07031449757095372, + "grad_norm": 97.33083323328, + "learning_rate": 2.3409090909090913e-06, + "loss": 2.9114, + "step": 825 + }, + { + "epoch": 0.07039972726497912, + "grad_norm": 484.40175537777907, + "learning_rate": 2.3437500000000002e-06, + "loss": 4.5001, + "step": 826 + }, + { + "epoch": 0.07048495695900452, + "grad_norm": 158.6700249618076, + "learning_rate": 2.346590909090909e-06, + "loss": 3.3318, + "step": 827 + }, + { + "epoch": 0.07057018665302991, + "grad_norm": 93.59931250960295, + "learning_rate": 2.3494318181818185e-06, + "loss": 3.0756, + "step": 828 + }, + { + "epoch": 0.07065541634705531, + "grad_norm": 288.07772352217074, + "learning_rate": 2.3522727272727274e-06, + "loss": 3.7461, + "step": 829 + }, + { + "epoch": 0.07074064604108071, + "grad_norm": 91.59090878135996, + "learning_rate": 2.3551136363636364e-06, + "loss": 2.8974, + "step": 830 + }, + { + "epoch": 0.07082587573510611, + "grad_norm": 82.18480342666206, + "learning_rate": 2.3579545454545457e-06, + "loss": 1.8994, + "step": 831 + }, + { + "epoch": 0.07091110542913151, + "grad_norm": 186.22401268532863, + "learning_rate": 2.3607954545454546e-06, + "loss": 4.2763, + "step": 832 + }, + { + "epoch": 0.07099633512315691, + "grad_norm": 128.5311345358553, + "learning_rate": 2.363636363636364e-06, + "loss": 2.1268, + "step": 833 + }, + { + "epoch": 0.07108156481718231, + "grad_norm": 97.1410702917148, + "learning_rate": 2.366477272727273e-06, + "loss": 2.6387, + "step": 834 + }, + { + "epoch": 0.0711667945112077, + "grad_norm": 198.4684783628603, + "learning_rate": 2.3693181818181823e-06, + "loss": 4.1703, + "step": 835 + }, + { + "epoch": 0.0712520242052331, + "grad_norm": 188.72291675006488, + "learning_rate": 2.372159090909091e-06, + "loss": 2.4265, + "step": 836 + }, + { + "epoch": 0.0713372538992585, + "grad_norm": 161.07110843398, + "learning_rate": 2.375e-06, + "loss": 3.1328, + "step": 837 + }, + { + "epoch": 0.0714224835932839, + "grad_norm": 90.81761310090272, + "learning_rate": 2.3778409090909095e-06, + "loss": 3.0356, + "step": 838 + }, + { + "epoch": 0.0715077132873093, + "grad_norm": 124.29162013645337, + "learning_rate": 2.3806818181818184e-06, + "loss": 2.9231, + "step": 839 + }, + { + "epoch": 0.0715929429813347, + "grad_norm": 115.98975845055915, + "learning_rate": 2.3835227272727273e-06, + "loss": 2.3815, + "step": 840 + }, + { + "epoch": 0.0716781726753601, + "grad_norm": 102.67457702256362, + "learning_rate": 2.3863636363636367e-06, + "loss": 2.6878, + "step": 841 + }, + { + "epoch": 0.0717634023693855, + "grad_norm": 89.58401269883475, + "learning_rate": 2.3892045454545456e-06, + "loss": 2.8299, + "step": 842 + }, + { + "epoch": 0.0718486320634109, + "grad_norm": 183.17668772075817, + "learning_rate": 2.3920454545454546e-06, + "loss": 4.0433, + "step": 843 + }, + { + "epoch": 0.0719338617574363, + "grad_norm": 67.64479951891147, + "learning_rate": 2.394886363636364e-06, + "loss": 2.0526, + "step": 844 + }, + { + "epoch": 0.0720190914514617, + "grad_norm": 75.27648047931048, + "learning_rate": 2.397727272727273e-06, + "loss": 2.6604, + "step": 845 + }, + { + "epoch": 0.0721043211454871, + "grad_norm": 272.4635278234866, + "learning_rate": 2.4005681818181818e-06, + "loss": 3.7261, + "step": 846 + }, + { + "epoch": 0.07218955083951249, + "grad_norm": 83.69940995738429, + "learning_rate": 2.403409090909091e-06, + "loss": 3.0895, + "step": 847 + }, + { + "epoch": 0.07227478053353789, + "grad_norm": 214.97637700584028, + "learning_rate": 2.40625e-06, + "loss": 3.2565, + "step": 848 + }, + { + "epoch": 0.07236001022756328, + "grad_norm": 117.17148785494027, + "learning_rate": 2.4090909090909094e-06, + "loss": 3.456, + "step": 849 + }, + { + "epoch": 0.07244523992158867, + "grad_norm": 122.84044682393916, + "learning_rate": 2.4119318181818183e-06, + "loss": 3.8836, + "step": 850 + }, + { + "epoch": 0.07253046961561407, + "grad_norm": 154.15599034064502, + "learning_rate": 2.4147727272727277e-06, + "loss": 3.1692, + "step": 851 + }, + { + "epoch": 0.07261569930963947, + "grad_norm": 135.57390448317594, + "learning_rate": 2.4176136363636366e-06, + "loss": 3.4612, + "step": 852 + }, + { + "epoch": 0.07270092900366487, + "grad_norm": 135.1309580528141, + "learning_rate": 2.4204545454545455e-06, + "loss": 2.8955, + "step": 853 + }, + { + "epoch": 0.07278615869769027, + "grad_norm": 90.55420200201513, + "learning_rate": 2.423295454545455e-06, + "loss": 2.3527, + "step": 854 + }, + { + "epoch": 0.07287138839171567, + "grad_norm": 145.68615713390713, + "learning_rate": 2.426136363636364e-06, + "loss": 3.3684, + "step": 855 + }, + { + "epoch": 0.07295661808574107, + "grad_norm": 128.48690097319837, + "learning_rate": 2.4289772727272727e-06, + "loss": 3.2395, + "step": 856 + }, + { + "epoch": 0.07304184777976647, + "grad_norm": 95.77073558969366, + "learning_rate": 2.431818181818182e-06, + "loss": 3.1725, + "step": 857 + }, + { + "epoch": 0.07312707747379187, + "grad_norm": 91.92827505070217, + "learning_rate": 2.434659090909091e-06, + "loss": 2.7927, + "step": 858 + }, + { + "epoch": 0.07321230716781726, + "grad_norm": 90.35757496556795, + "learning_rate": 2.4375e-06, + "loss": 2.8985, + "step": 859 + }, + { + "epoch": 0.07329753686184266, + "grad_norm": 105.36519570654785, + "learning_rate": 2.4403409090909093e-06, + "loss": 2.7602, + "step": 860 + }, + { + "epoch": 0.07338276655586806, + "grad_norm": 239.75406940367046, + "learning_rate": 2.4431818181818182e-06, + "loss": 4.2888, + "step": 861 + }, + { + "epoch": 0.07346799624989346, + "grad_norm": 100.65920072313138, + "learning_rate": 2.4460227272727276e-06, + "loss": 2.7473, + "step": 862 + }, + { + "epoch": 0.07355322594391886, + "grad_norm": 174.69920557989067, + "learning_rate": 2.4488636363636365e-06, + "loss": 2.9297, + "step": 863 + }, + { + "epoch": 0.07363845563794426, + "grad_norm": 221.85754880201029, + "learning_rate": 2.451704545454546e-06, + "loss": 3.8704, + "step": 864 + }, + { + "epoch": 0.07372368533196966, + "grad_norm": 147.69059231208976, + "learning_rate": 2.454545454545455e-06, + "loss": 3.229, + "step": 865 + }, + { + "epoch": 0.07380891502599506, + "grad_norm": 171.7770226905314, + "learning_rate": 2.4573863636363637e-06, + "loss": 3.6001, + "step": 866 + }, + { + "epoch": 0.07389414472002045, + "grad_norm": 101.64914321059337, + "learning_rate": 2.460227272727273e-06, + "loss": 2.8826, + "step": 867 + }, + { + "epoch": 0.07397937441404585, + "grad_norm": 106.26933420576532, + "learning_rate": 2.463068181818182e-06, + "loss": 3.4257, + "step": 868 + }, + { + "epoch": 0.07406460410807125, + "grad_norm": 65.26854443135, + "learning_rate": 2.465909090909091e-06, + "loss": 2.2466, + "step": 869 + }, + { + "epoch": 0.07414983380209665, + "grad_norm": 198.2004475275843, + "learning_rate": 2.4687500000000003e-06, + "loss": 3.439, + "step": 870 + }, + { + "epoch": 0.07423506349612205, + "grad_norm": 65.5374016646667, + "learning_rate": 2.4715909090909092e-06, + "loss": 2.4933, + "step": 871 + }, + { + "epoch": 0.07432029319014745, + "grad_norm": 85.29337104488405, + "learning_rate": 2.474431818181818e-06, + "loss": 2.9731, + "step": 872 + }, + { + "epoch": 0.07440552288417285, + "grad_norm": 343.46986291774965, + "learning_rate": 2.4772727272727275e-06, + "loss": 2.4818, + "step": 873 + }, + { + "epoch": 0.07449075257819825, + "grad_norm": 180.77444796105976, + "learning_rate": 2.4801136363636364e-06, + "loss": 3.2907, + "step": 874 + }, + { + "epoch": 0.07457598227222365, + "grad_norm": 161.9388791639908, + "learning_rate": 2.4829545454545458e-06, + "loss": 3.5436, + "step": 875 + }, + { + "epoch": 0.07466121196624904, + "grad_norm": 158.8102773457251, + "learning_rate": 2.4857954545454547e-06, + "loss": 3.466, + "step": 876 + }, + { + "epoch": 0.07474644166027444, + "grad_norm": 71.47836058684854, + "learning_rate": 2.488636363636364e-06, + "loss": 2.8405, + "step": 877 + }, + { + "epoch": 0.07483167135429984, + "grad_norm": 74.92463255842343, + "learning_rate": 2.491477272727273e-06, + "loss": 3.1018, + "step": 878 + }, + { + "epoch": 0.07491690104832524, + "grad_norm": 104.2492139553263, + "learning_rate": 2.494318181818182e-06, + "loss": 2.188, + "step": 879 + }, + { + "epoch": 0.07500213074235064, + "grad_norm": 288.9528971288814, + "learning_rate": 2.4971590909090913e-06, + "loss": 3.9507, + "step": 880 + }, + { + "epoch": 0.07508736043637604, + "grad_norm": 90.88825874243014, + "learning_rate": 2.5e-06, + "loss": 3.3947, + "step": 881 + }, + { + "epoch": 0.07517259013040144, + "grad_norm": 102.76855835573062, + "learning_rate": 2.5028409090909096e-06, + "loss": 2.7583, + "step": 882 + }, + { + "epoch": 0.07525781982442684, + "grad_norm": 60.17113648283112, + "learning_rate": 2.505681818181818e-06, + "loss": 2.1379, + "step": 883 + }, + { + "epoch": 0.07534304951845223, + "grad_norm": 134.7496668697273, + "learning_rate": 2.5085227272727274e-06, + "loss": 2.0725, + "step": 884 + }, + { + "epoch": 0.07542827921247763, + "grad_norm": 92.4132327817515, + "learning_rate": 2.5113636363636368e-06, + "loss": 2.7426, + "step": 885 + }, + { + "epoch": 0.07551350890650303, + "grad_norm": 84.68090710413918, + "learning_rate": 2.5142045454545457e-06, + "loss": 2.2008, + "step": 886 + }, + { + "epoch": 0.07559873860052842, + "grad_norm": 96.50232274695657, + "learning_rate": 2.5170454545454546e-06, + "loss": 3.2405, + "step": 887 + }, + { + "epoch": 0.07568396829455382, + "grad_norm": 228.45984263400788, + "learning_rate": 2.519886363636364e-06, + "loss": 2.837, + "step": 888 + }, + { + "epoch": 0.07576919798857921, + "grad_norm": 90.69679627651442, + "learning_rate": 2.522727272727273e-06, + "loss": 2.3983, + "step": 889 + }, + { + "epoch": 0.07585442768260461, + "grad_norm": 75.61677453440974, + "learning_rate": 2.5255681818181823e-06, + "loss": 2.2538, + "step": 890 + }, + { + "epoch": 0.07593965737663001, + "grad_norm": 95.33364035789475, + "learning_rate": 2.528409090909091e-06, + "loss": 2.9897, + "step": 891 + }, + { + "epoch": 0.07602488707065541, + "grad_norm": 194.2412367774104, + "learning_rate": 2.53125e-06, + "loss": 3.1089, + "step": 892 + }, + { + "epoch": 0.07611011676468081, + "grad_norm": 130.73852076171343, + "learning_rate": 2.5340909090909095e-06, + "loss": 2.9252, + "step": 893 + }, + { + "epoch": 0.07619534645870621, + "grad_norm": 359.4643355969633, + "learning_rate": 2.536931818181819e-06, + "loss": 4.1163, + "step": 894 + }, + { + "epoch": 0.07628057615273161, + "grad_norm": 71.18397649650503, + "learning_rate": 2.5397727272727273e-06, + "loss": 2.0571, + "step": 895 + }, + { + "epoch": 0.076365805846757, + "grad_norm": 147.94254701982362, + "learning_rate": 2.5426136363636367e-06, + "loss": 3.8025, + "step": 896 + }, + { + "epoch": 0.0764510355407824, + "grad_norm": 108.60685482115957, + "learning_rate": 2.5454545454545456e-06, + "loss": 3.7173, + "step": 897 + }, + { + "epoch": 0.0765362652348078, + "grad_norm": 66.53222078725362, + "learning_rate": 2.5482954545454545e-06, + "loss": 2.3655, + "step": 898 + }, + { + "epoch": 0.0766214949288332, + "grad_norm": 164.39203593776233, + "learning_rate": 2.551136363636364e-06, + "loss": 4.0933, + "step": 899 + }, + { + "epoch": 0.0767067246228586, + "grad_norm": 119.99696476828119, + "learning_rate": 2.553977272727273e-06, + "loss": 3.4904, + "step": 900 + }, + { + "epoch": 0.076791954316884, + "grad_norm": 91.60098730776589, + "learning_rate": 2.556818181818182e-06, + "loss": 2.3697, + "step": 901 + }, + { + "epoch": 0.0768771840109094, + "grad_norm": 235.0782991969982, + "learning_rate": 2.559659090909091e-06, + "loss": 2.9057, + "step": 902 + }, + { + "epoch": 0.0769624137049348, + "grad_norm": 191.01766770674433, + "learning_rate": 2.5625e-06, + "loss": 3.6471, + "step": 903 + }, + { + "epoch": 0.0770476433989602, + "grad_norm": 82.80534871978138, + "learning_rate": 2.5653409090909094e-06, + "loss": 2.6645, + "step": 904 + }, + { + "epoch": 0.0771328730929856, + "grad_norm": 332.90136092836093, + "learning_rate": 2.5681818181818187e-06, + "loss": 2.959, + "step": 905 + }, + { + "epoch": 0.077218102787011, + "grad_norm": 60.10753797939084, + "learning_rate": 2.5710227272727272e-06, + "loss": 2.2328, + "step": 906 + }, + { + "epoch": 0.0773033324810364, + "grad_norm": 80.5942780563471, + "learning_rate": 2.5738636363636366e-06, + "loss": 2.0359, + "step": 907 + }, + { + "epoch": 0.07738856217506179, + "grad_norm": 105.36040824023131, + "learning_rate": 2.576704545454546e-06, + "loss": 3.6854, + "step": 908 + }, + { + "epoch": 0.07747379186908719, + "grad_norm": 77.88370122578775, + "learning_rate": 2.5795454545454545e-06, + "loss": 2.832, + "step": 909 + }, + { + "epoch": 0.07755902156311259, + "grad_norm": 348.460440526766, + "learning_rate": 2.582386363636364e-06, + "loss": 4.2588, + "step": 910 + }, + { + "epoch": 0.07764425125713799, + "grad_norm": 95.82995117150202, + "learning_rate": 2.585227272727273e-06, + "loss": 3.2683, + "step": 911 + }, + { + "epoch": 0.07772948095116339, + "grad_norm": 151.05688443139826, + "learning_rate": 2.5880681818181817e-06, + "loss": 3.0094, + "step": 912 + }, + { + "epoch": 0.07781471064518879, + "grad_norm": 116.54059549917375, + "learning_rate": 2.590909090909091e-06, + "loss": 3.2842, + "step": 913 + }, + { + "epoch": 0.07789994033921419, + "grad_norm": 122.79447149526189, + "learning_rate": 2.5937500000000004e-06, + "loss": 2.9783, + "step": 914 + }, + { + "epoch": 0.07798517003323958, + "grad_norm": 207.26192354702383, + "learning_rate": 2.5965909090909093e-06, + "loss": 3.8026, + "step": 915 + }, + { + "epoch": 0.07807039972726498, + "grad_norm": 176.93582180522432, + "learning_rate": 2.5994318181818186e-06, + "loss": 3.5299, + "step": 916 + }, + { + "epoch": 0.07815562942129038, + "grad_norm": 100.43555698459113, + "learning_rate": 2.6022727272727276e-06, + "loss": 2.7614, + "step": 917 + }, + { + "epoch": 0.07824085911531578, + "grad_norm": 96.67033014415786, + "learning_rate": 2.6051136363636365e-06, + "loss": 3.0788, + "step": 918 + }, + { + "epoch": 0.07832608880934118, + "grad_norm": 236.08059873639476, + "learning_rate": 2.607954545454546e-06, + "loss": 3.2777, + "step": 919 + }, + { + "epoch": 0.07841131850336658, + "grad_norm": 83.00380589628034, + "learning_rate": 2.610795454545455e-06, + "loss": 2.9028, + "step": 920 + }, + { + "epoch": 0.07849654819739198, + "grad_norm": 87.3377439757927, + "learning_rate": 2.6136363636363637e-06, + "loss": 3.4452, + "step": 921 + }, + { + "epoch": 0.07858177789141738, + "grad_norm": 226.08096996157377, + "learning_rate": 2.616477272727273e-06, + "loss": 3.8619, + "step": 922 + }, + { + "epoch": 0.07866700758544277, + "grad_norm": 272.3086359963776, + "learning_rate": 2.6193181818181816e-06, + "loss": 4.3131, + "step": 923 + }, + { + "epoch": 0.07875223727946817, + "grad_norm": 84.40200270845719, + "learning_rate": 2.622159090909091e-06, + "loss": 2.2586, + "step": 924 + }, + { + "epoch": 0.07883746697349356, + "grad_norm": 112.69354294297318, + "learning_rate": 2.6250000000000003e-06, + "loss": 3.1458, + "step": 925 + }, + { + "epoch": 0.07892269666751896, + "grad_norm": 98.79841248779226, + "learning_rate": 2.627840909090909e-06, + "loss": 2.5209, + "step": 926 + }, + { + "epoch": 0.07900792636154436, + "grad_norm": 104.25548287061659, + "learning_rate": 2.630681818181818e-06, + "loss": 3.4535, + "step": 927 + }, + { + "epoch": 0.07909315605556975, + "grad_norm": 97.1536386276211, + "learning_rate": 2.6335227272727275e-06, + "loss": 2.7571, + "step": 928 + }, + { + "epoch": 0.07917838574959515, + "grad_norm": 211.2643647441358, + "learning_rate": 2.6363636363636364e-06, + "loss": 4.3219, + "step": 929 + }, + { + "epoch": 0.07926361544362055, + "grad_norm": 192.6227183133035, + "learning_rate": 2.6392045454545458e-06, + "loss": 2.6047, + "step": 930 + }, + { + "epoch": 0.07934884513764595, + "grad_norm": 94.77411040352051, + "learning_rate": 2.642045454545455e-06, + "loss": 2.6935, + "step": 931 + }, + { + "epoch": 0.07943407483167135, + "grad_norm": 190.38886372578125, + "learning_rate": 2.6448863636363636e-06, + "loss": 3.8807, + "step": 932 + }, + { + "epoch": 0.07951930452569675, + "grad_norm": 64.43245711845574, + "learning_rate": 2.647727272727273e-06, + "loss": 1.3533, + "step": 933 + }, + { + "epoch": 0.07960453421972215, + "grad_norm": 97.61323559489733, + "learning_rate": 2.6505681818181823e-06, + "loss": 2.684, + "step": 934 + }, + { + "epoch": 0.07968976391374755, + "grad_norm": 155.42312968047017, + "learning_rate": 2.653409090909091e-06, + "loss": 2.6391, + "step": 935 + }, + { + "epoch": 0.07977499360777295, + "grad_norm": 178.30048150568405, + "learning_rate": 2.65625e-06, + "loss": 2.3657, + "step": 936 + }, + { + "epoch": 0.07986022330179834, + "grad_norm": 116.64389477972722, + "learning_rate": 2.6590909090909095e-06, + "loss": 2.7322, + "step": 937 + }, + { + "epoch": 0.07994545299582374, + "grad_norm": 160.24777183316186, + "learning_rate": 2.661931818181818e-06, + "loss": 3.3887, + "step": 938 + }, + { + "epoch": 0.08003068268984914, + "grad_norm": 104.0502996872612, + "learning_rate": 2.6647727272727274e-06, + "loss": 3.2344, + "step": 939 + }, + { + "epoch": 0.08011591238387454, + "grad_norm": 75.07016907404407, + "learning_rate": 2.6676136363636368e-06, + "loss": 2.5803, + "step": 940 + }, + { + "epoch": 0.08020114207789994, + "grad_norm": 147.83555218309698, + "learning_rate": 2.6704545454545457e-06, + "loss": 3.418, + "step": 941 + }, + { + "epoch": 0.08028637177192534, + "grad_norm": 61.19864851265594, + "learning_rate": 2.6732954545454546e-06, + "loss": 3.2177, + "step": 942 + }, + { + "epoch": 0.08037160146595074, + "grad_norm": 116.84925288493181, + "learning_rate": 2.676136363636364e-06, + "loss": 3.0021, + "step": 943 + }, + { + "epoch": 0.08045683115997614, + "grad_norm": 103.707097629356, + "learning_rate": 2.678977272727273e-06, + "loss": 2.6853, + "step": 944 + }, + { + "epoch": 0.08054206085400153, + "grad_norm": 172.5439853239382, + "learning_rate": 2.6818181818181822e-06, + "loss": 2.9358, + "step": 945 + }, + { + "epoch": 0.08062729054802693, + "grad_norm": 137.49456372801805, + "learning_rate": 2.684659090909091e-06, + "loss": 2.2017, + "step": 946 + }, + { + "epoch": 0.08071252024205233, + "grad_norm": 116.57316129154536, + "learning_rate": 2.6875e-06, + "loss": 2.3842, + "step": 947 + }, + { + "epoch": 0.08079774993607773, + "grad_norm": 88.77030238119903, + "learning_rate": 2.6903409090909095e-06, + "loss": 2.7085, + "step": 948 + }, + { + "epoch": 0.08088297963010313, + "grad_norm": 50.54361649698155, + "learning_rate": 2.693181818181819e-06, + "loss": 1.4591, + "step": 949 + }, + { + "epoch": 0.08096820932412853, + "grad_norm": 120.45986782454757, + "learning_rate": 2.6960227272727273e-06, + "loss": 3.1865, + "step": 950 + }, + { + "epoch": 0.08105343901815393, + "grad_norm": 143.78296407039173, + "learning_rate": 2.6988636363636367e-06, + "loss": 3.4066, + "step": 951 + }, + { + "epoch": 0.08113866871217933, + "grad_norm": 97.98209459461765, + "learning_rate": 2.7017045454545456e-06, + "loss": 3.186, + "step": 952 + }, + { + "epoch": 0.08122389840620473, + "grad_norm": 113.6021795609555, + "learning_rate": 2.7045454545454545e-06, + "loss": 2.8602, + "step": 953 + }, + { + "epoch": 0.08130912810023012, + "grad_norm": 173.69672033400948, + "learning_rate": 2.707386363636364e-06, + "loss": 4.5156, + "step": 954 + }, + { + "epoch": 0.08139435779425552, + "grad_norm": 82.42122290237151, + "learning_rate": 2.710227272727273e-06, + "loss": 2.9177, + "step": 955 + }, + { + "epoch": 0.08147958748828092, + "grad_norm": 70.71837246825638, + "learning_rate": 2.713068181818182e-06, + "loss": 2.9533, + "step": 956 + }, + { + "epoch": 0.08156481718230632, + "grad_norm": 98.96760557341545, + "learning_rate": 2.715909090909091e-06, + "loss": 2.4319, + "step": 957 + }, + { + "epoch": 0.08165004687633172, + "grad_norm": 145.16544302576852, + "learning_rate": 2.71875e-06, + "loss": 3.3864, + "step": 958 + }, + { + "epoch": 0.08173527657035712, + "grad_norm": 220.47018999027662, + "learning_rate": 2.7215909090909094e-06, + "loss": 4.4105, + "step": 959 + }, + { + "epoch": 0.08182050626438252, + "grad_norm": 94.12901686466425, + "learning_rate": 2.7244318181818187e-06, + "loss": 3.3942, + "step": 960 + }, + { + "epoch": 0.08190573595840792, + "grad_norm": 138.31728197861455, + "learning_rate": 2.7272727272727272e-06, + "loss": 2.5418, + "step": 961 + }, + { + "epoch": 0.08199096565243331, + "grad_norm": 106.21515364198409, + "learning_rate": 2.7301136363636366e-06, + "loss": 3.1601, + "step": 962 + }, + { + "epoch": 0.0820761953464587, + "grad_norm": 204.66428687632694, + "learning_rate": 2.732954545454546e-06, + "loss": 2.9918, + "step": 963 + }, + { + "epoch": 0.0821614250404841, + "grad_norm": 245.64999420493655, + "learning_rate": 2.7357954545454544e-06, + "loss": 3.6134, + "step": 964 + }, + { + "epoch": 0.0822466547345095, + "grad_norm": 79.20871479551417, + "learning_rate": 2.738636363636364e-06, + "loss": 2.9358, + "step": 965 + }, + { + "epoch": 0.0823318844285349, + "grad_norm": 141.25907484991265, + "learning_rate": 2.741477272727273e-06, + "loss": 3.5883, + "step": 966 + }, + { + "epoch": 0.0824171141225603, + "grad_norm": 74.64020375137063, + "learning_rate": 2.744318181818182e-06, + "loss": 2.6955, + "step": 967 + }, + { + "epoch": 0.0825023438165857, + "grad_norm": 281.0718876241155, + "learning_rate": 2.747159090909091e-06, + "loss": 3.3711, + "step": 968 + }, + { + "epoch": 0.08258757351061109, + "grad_norm": 316.3958834891779, + "learning_rate": 2.7500000000000004e-06, + "loss": 3.3908, + "step": 969 + }, + { + "epoch": 0.08267280320463649, + "grad_norm": 158.39570434397294, + "learning_rate": 2.7528409090909093e-06, + "loss": 3.0315, + "step": 970 + }, + { + "epoch": 0.08275803289866189, + "grad_norm": 170.19965957283452, + "learning_rate": 2.7556818181818186e-06, + "loss": 4.1763, + "step": 971 + }, + { + "epoch": 0.08284326259268729, + "grad_norm": 60.756314101081415, + "learning_rate": 2.7585227272727276e-06, + "loss": 1.6085, + "step": 972 + }, + { + "epoch": 0.08292849228671269, + "grad_norm": 338.2648158963574, + "learning_rate": 2.7613636363636365e-06, + "loss": 4.4415, + "step": 973 + }, + { + "epoch": 0.08301372198073809, + "grad_norm": 160.41391306873606, + "learning_rate": 2.764204545454546e-06, + "loss": 3.1441, + "step": 974 + }, + { + "epoch": 0.08309895167476349, + "grad_norm": 99.14385195973608, + "learning_rate": 2.767045454545455e-06, + "loss": 2.5108, + "step": 975 + }, + { + "epoch": 0.08318418136878888, + "grad_norm": 135.4763330247815, + "learning_rate": 2.7698863636363637e-06, + "loss": 2.7826, + "step": 976 + }, + { + "epoch": 0.08326941106281428, + "grad_norm": 261.54066525620783, + "learning_rate": 2.772727272727273e-06, + "loss": 3.5251, + "step": 977 + }, + { + "epoch": 0.08335464075683968, + "grad_norm": 110.74890718116677, + "learning_rate": 2.775568181818182e-06, + "loss": 3.4147, + "step": 978 + }, + { + "epoch": 0.08343987045086508, + "grad_norm": 384.58522364510196, + "learning_rate": 2.778409090909091e-06, + "loss": 4.47, + "step": 979 + }, + { + "epoch": 0.08352510014489048, + "grad_norm": 85.97766970651138, + "learning_rate": 2.7812500000000003e-06, + "loss": 3.2916, + "step": 980 + }, + { + "epoch": 0.08361032983891588, + "grad_norm": 112.40166878586002, + "learning_rate": 2.784090909090909e-06, + "loss": 2.8384, + "step": 981 + }, + { + "epoch": 0.08369555953294128, + "grad_norm": 104.57081686602982, + "learning_rate": 2.7869318181818185e-06, + "loss": 3.0143, + "step": 982 + }, + { + "epoch": 0.08378078922696668, + "grad_norm": 217.4216899852579, + "learning_rate": 2.7897727272727275e-06, + "loss": 3.0912, + "step": 983 + }, + { + "epoch": 0.08386601892099207, + "grad_norm": 111.4303915987023, + "learning_rate": 2.7926136363636364e-06, + "loss": 4.2472, + "step": 984 + }, + { + "epoch": 0.08395124861501747, + "grad_norm": 89.49304314371696, + "learning_rate": 2.7954545454545458e-06, + "loss": 2.5202, + "step": 985 + }, + { + "epoch": 0.08403647830904287, + "grad_norm": 98.97408043308523, + "learning_rate": 2.798295454545455e-06, + "loss": 2.6407, + "step": 986 + }, + { + "epoch": 0.08412170800306827, + "grad_norm": 105.43902173895883, + "learning_rate": 2.8011363636363636e-06, + "loss": 3.2888, + "step": 987 + }, + { + "epoch": 0.08420693769709367, + "grad_norm": 171.98351702737807, + "learning_rate": 2.803977272727273e-06, + "loss": 3.6075, + "step": 988 + }, + { + "epoch": 0.08429216739111907, + "grad_norm": 112.39002387611275, + "learning_rate": 2.8068181818181823e-06, + "loss": 2.876, + "step": 989 + }, + { + "epoch": 0.08437739708514447, + "grad_norm": 92.20900144484753, + "learning_rate": 2.809659090909091e-06, + "loss": 3.2029, + "step": 990 + }, + { + "epoch": 0.08446262677916987, + "grad_norm": 68.26724378155595, + "learning_rate": 2.8125e-06, + "loss": 2.7556, + "step": 991 + }, + { + "epoch": 0.08454785647319527, + "grad_norm": 80.06198965851975, + "learning_rate": 2.8153409090909095e-06, + "loss": 2.9644, + "step": 992 + }, + { + "epoch": 0.08463308616722066, + "grad_norm": 112.43868497026858, + "learning_rate": 2.818181818181818e-06, + "loss": 2.6083, + "step": 993 + }, + { + "epoch": 0.08471831586124606, + "grad_norm": 98.92447154860581, + "learning_rate": 2.8210227272727274e-06, + "loss": 2.85, + "step": 994 + }, + { + "epoch": 0.08480354555527146, + "grad_norm": 97.52928408197221, + "learning_rate": 2.8238636363636367e-06, + "loss": 2.6301, + "step": 995 + }, + { + "epoch": 0.08488877524929686, + "grad_norm": 71.7427145223073, + "learning_rate": 2.8267045454545457e-06, + "loss": 2.4327, + "step": 996 + }, + { + "epoch": 0.08497400494332226, + "grad_norm": 75.37916676675515, + "learning_rate": 2.829545454545455e-06, + "loss": 2.6902, + "step": 997 + }, + { + "epoch": 0.08505923463734766, + "grad_norm": 139.6104410236198, + "learning_rate": 2.832386363636364e-06, + "loss": 3.5768, + "step": 998 + }, + { + "epoch": 0.08514446433137306, + "grad_norm": 114.07809954777038, + "learning_rate": 2.835227272727273e-06, + "loss": 2.8911, + "step": 999 + }, + { + "epoch": 0.08522969402539844, + "grad_norm": 92.5179850191385, + "learning_rate": 2.8380681818181822e-06, + "loss": 2.1325, + "step": 1000 + }, + { + "epoch": 0.08531492371942384, + "grad_norm": 71.6367334905359, + "learning_rate": 2.8409090909090916e-06, + "loss": 2.801, + "step": 1001 + }, + { + "epoch": 0.08540015341344924, + "grad_norm": 134.0734124201893, + "learning_rate": 2.84375e-06, + "loss": 2.9373, + "step": 1002 + }, + { + "epoch": 0.08548538310747464, + "grad_norm": 97.95477352271179, + "learning_rate": 2.8465909090909094e-06, + "loss": 2.8016, + "step": 1003 + }, + { + "epoch": 0.08557061280150004, + "grad_norm": 125.42568763808157, + "learning_rate": 2.849431818181819e-06, + "loss": 3.0013, + "step": 1004 + }, + { + "epoch": 0.08565584249552544, + "grad_norm": 104.13277140202669, + "learning_rate": 2.8522727272727273e-06, + "loss": 3.387, + "step": 1005 + }, + { + "epoch": 0.08574107218955083, + "grad_norm": 93.43200771203566, + "learning_rate": 2.8551136363636367e-06, + "loss": 2.7416, + "step": 1006 + }, + { + "epoch": 0.08582630188357623, + "grad_norm": 115.55062907763048, + "learning_rate": 2.8579545454545456e-06, + "loss": 2.7874, + "step": 1007 + }, + { + "epoch": 0.08591153157760163, + "grad_norm": 90.88467069174553, + "learning_rate": 2.8607954545454545e-06, + "loss": 2.9426, + "step": 1008 + }, + { + "epoch": 0.08599676127162703, + "grad_norm": 105.01785797830036, + "learning_rate": 2.863636363636364e-06, + "loss": 2.9281, + "step": 1009 + }, + { + "epoch": 0.08608199096565243, + "grad_norm": 213.32345582588076, + "learning_rate": 2.866477272727273e-06, + "loss": 3.0433, + "step": 1010 + }, + { + "epoch": 0.08616722065967783, + "grad_norm": 75.16719695941639, + "learning_rate": 2.869318181818182e-06, + "loss": 2.3713, + "step": 1011 + }, + { + "epoch": 0.08625245035370323, + "grad_norm": 137.6604795115238, + "learning_rate": 2.8721590909090915e-06, + "loss": 2.8972, + "step": 1012 + }, + { + "epoch": 0.08633768004772863, + "grad_norm": 80.37521366747163, + "learning_rate": 2.875e-06, + "loss": 3.1729, + "step": 1013 + }, + { + "epoch": 0.08642290974175403, + "grad_norm": 118.12183960885469, + "learning_rate": 2.8778409090909094e-06, + "loss": 3.6182, + "step": 1014 + }, + { + "epoch": 0.08650813943577942, + "grad_norm": 328.59729567400063, + "learning_rate": 2.8806818181818187e-06, + "loss": 3.6023, + "step": 1015 + }, + { + "epoch": 0.08659336912980482, + "grad_norm": 189.23526597358614, + "learning_rate": 2.8835227272727272e-06, + "loss": 3.7619, + "step": 1016 + }, + { + "epoch": 0.08667859882383022, + "grad_norm": 89.79867049697805, + "learning_rate": 2.8863636363636366e-06, + "loss": 2.798, + "step": 1017 + }, + { + "epoch": 0.08676382851785562, + "grad_norm": 131.8412564282441, + "learning_rate": 2.889204545454546e-06, + "loss": 2.8403, + "step": 1018 + }, + { + "epoch": 0.08684905821188102, + "grad_norm": 290.97350343023095, + "learning_rate": 2.8920454545454544e-06, + "loss": 3.4444, + "step": 1019 + }, + { + "epoch": 0.08693428790590642, + "grad_norm": 133.24209943641011, + "learning_rate": 2.8948863636363638e-06, + "loss": 3.1761, + "step": 1020 + }, + { + "epoch": 0.08701951759993182, + "grad_norm": 115.98794660437562, + "learning_rate": 2.897727272727273e-06, + "loss": 2.7752, + "step": 1021 + }, + { + "epoch": 0.08710474729395722, + "grad_norm": 203.56984616326181, + "learning_rate": 2.900568181818182e-06, + "loss": 3.629, + "step": 1022 + }, + { + "epoch": 0.08718997698798261, + "grad_norm": 89.05117641108353, + "learning_rate": 2.903409090909091e-06, + "loss": 3.474, + "step": 1023 + }, + { + "epoch": 0.08727520668200801, + "grad_norm": 90.5244148375639, + "learning_rate": 2.9062500000000003e-06, + "loss": 3.0265, + "step": 1024 + }, + { + "epoch": 0.08736043637603341, + "grad_norm": 103.49373553302891, + "learning_rate": 2.9090909090909093e-06, + "loss": 2.1412, + "step": 1025 + }, + { + "epoch": 0.08744566607005881, + "grad_norm": 90.21319362574886, + "learning_rate": 2.9119318181818186e-06, + "loss": 2.6291, + "step": 1026 + }, + { + "epoch": 0.08753089576408421, + "grad_norm": 143.58129765423027, + "learning_rate": 2.9147727272727275e-06, + "loss": 3.5003, + "step": 1027 + }, + { + "epoch": 0.08761612545810961, + "grad_norm": 75.79987894713653, + "learning_rate": 2.9176136363636365e-06, + "loss": 2.5136, + "step": 1028 + }, + { + "epoch": 0.08770135515213501, + "grad_norm": 104.27695501346585, + "learning_rate": 2.920454545454546e-06, + "loss": 3.2728, + "step": 1029 + }, + { + "epoch": 0.0877865848461604, + "grad_norm": 107.33022644623723, + "learning_rate": 2.923295454545455e-06, + "loss": 1.8884, + "step": 1030 + }, + { + "epoch": 0.0878718145401858, + "grad_norm": 171.8405391854115, + "learning_rate": 2.9261363636363637e-06, + "loss": 2.7989, + "step": 1031 + }, + { + "epoch": 0.0879570442342112, + "grad_norm": 96.43441764668584, + "learning_rate": 2.928977272727273e-06, + "loss": 3.2724, + "step": 1032 + }, + { + "epoch": 0.0880422739282366, + "grad_norm": 159.10684635057348, + "learning_rate": 2.931818181818182e-06, + "loss": 2.8118, + "step": 1033 + }, + { + "epoch": 0.088127503622262, + "grad_norm": 90.08678719038066, + "learning_rate": 2.934659090909091e-06, + "loss": 3.4966, + "step": 1034 + }, + { + "epoch": 0.0882127333162874, + "grad_norm": 116.86603879856419, + "learning_rate": 2.9375000000000003e-06, + "loss": 3.6097, + "step": 1035 + }, + { + "epoch": 0.0882979630103128, + "grad_norm": 123.57078775419943, + "learning_rate": 2.940340909090909e-06, + "loss": 2.7887, + "step": 1036 + }, + { + "epoch": 0.0883831927043382, + "grad_norm": 86.7994462561539, + "learning_rate": 2.9431818181818185e-06, + "loss": 3.3358, + "step": 1037 + }, + { + "epoch": 0.08846842239836358, + "grad_norm": 74.71574098626367, + "learning_rate": 2.9460227272727275e-06, + "loss": 3.1164, + "step": 1038 + }, + { + "epoch": 0.08855365209238898, + "grad_norm": 137.66310706112418, + "learning_rate": 2.9488636363636364e-06, + "loss": 2.8081, + "step": 1039 + }, + { + "epoch": 0.08863888178641438, + "grad_norm": 99.63689109430231, + "learning_rate": 2.9517045454545457e-06, + "loss": 3.5088, + "step": 1040 + }, + { + "epoch": 0.08872411148043978, + "grad_norm": 232.69408219634542, + "learning_rate": 2.954545454545455e-06, + "loss": 3.505, + "step": 1041 + }, + { + "epoch": 0.08880934117446518, + "grad_norm": 104.21582068597144, + "learning_rate": 2.9573863636363636e-06, + "loss": 3.2987, + "step": 1042 + }, + { + "epoch": 0.08889457086849058, + "grad_norm": 201.14238522098233, + "learning_rate": 2.960227272727273e-06, + "loss": 4.6737, + "step": 1043 + }, + { + "epoch": 0.08897980056251598, + "grad_norm": 96.2126717594776, + "learning_rate": 2.9630681818181823e-06, + "loss": 3.0673, + "step": 1044 + }, + { + "epoch": 0.08906503025654137, + "grad_norm": 154.5580328756325, + "learning_rate": 2.965909090909091e-06, + "loss": 3.0697, + "step": 1045 + }, + { + "epoch": 0.08915025995056677, + "grad_norm": 173.81732987984248, + "learning_rate": 2.96875e-06, + "loss": 3.6949, + "step": 1046 + }, + { + "epoch": 0.08923548964459217, + "grad_norm": 82.27946085397383, + "learning_rate": 2.9715909090909095e-06, + "loss": 2.8281, + "step": 1047 + }, + { + "epoch": 0.08932071933861757, + "grad_norm": 104.63817799992604, + "learning_rate": 2.9744318181818184e-06, + "loss": 3.2844, + "step": 1048 + }, + { + "epoch": 0.08940594903264297, + "grad_norm": 76.25230994330343, + "learning_rate": 2.9772727272727274e-06, + "loss": 3.2519, + "step": 1049 + }, + { + "epoch": 0.08949117872666837, + "grad_norm": 176.49395346731288, + "learning_rate": 2.9801136363636367e-06, + "loss": 3.4041, + "step": 1050 + }, + { + "epoch": 0.08957640842069377, + "grad_norm": 87.90604653640314, + "learning_rate": 2.9829545454545457e-06, + "loss": 2.8628, + "step": 1051 + }, + { + "epoch": 0.08966163811471917, + "grad_norm": 187.32678631134013, + "learning_rate": 2.985795454545455e-06, + "loss": 3.7474, + "step": 1052 + }, + { + "epoch": 0.08974686780874457, + "grad_norm": 74.31929154145682, + "learning_rate": 2.988636363636364e-06, + "loss": 3.1586, + "step": 1053 + }, + { + "epoch": 0.08983209750276996, + "grad_norm": 61.28947873404979, + "learning_rate": 2.991477272727273e-06, + "loss": 2.2715, + "step": 1054 + }, + { + "epoch": 0.08991732719679536, + "grad_norm": 80.78369067495491, + "learning_rate": 2.9943181818181822e-06, + "loss": 2.7956, + "step": 1055 + }, + { + "epoch": 0.09000255689082076, + "grad_norm": 74.7030290725216, + "learning_rate": 2.9971590909090916e-06, + "loss": 2.9926, + "step": 1056 + }, + { + "epoch": 0.09008778658484616, + "grad_norm": 240.43845581193008, + "learning_rate": 3e-06, + "loss": 3.9416, + "step": 1057 + }, + { + "epoch": 0.09017301627887156, + "grad_norm": 84.74582601585814, + "learning_rate": 3.0028409090909094e-06, + "loss": 3.2838, + "step": 1058 + }, + { + "epoch": 0.09025824597289696, + "grad_norm": 227.47280993585522, + "learning_rate": 3.0056818181818188e-06, + "loss": 2.9051, + "step": 1059 + }, + { + "epoch": 0.09034347566692236, + "grad_norm": 188.2255293786382, + "learning_rate": 3.0085227272727273e-06, + "loss": 3.4428, + "step": 1060 + }, + { + "epoch": 0.09042870536094776, + "grad_norm": 448.1099082593104, + "learning_rate": 3.0113636363636366e-06, + "loss": 4.1135, + "step": 1061 + }, + { + "epoch": 0.09051393505497315, + "grad_norm": 72.07130742804867, + "learning_rate": 3.0142045454545456e-06, + "loss": 2.5433, + "step": 1062 + }, + { + "epoch": 0.09059916474899855, + "grad_norm": 58.118798396939404, + "learning_rate": 3.017045454545455e-06, + "loss": 2.6844, + "step": 1063 + }, + { + "epoch": 0.09068439444302395, + "grad_norm": 85.5791810545278, + "learning_rate": 3.019886363636364e-06, + "loss": 2.9333, + "step": 1064 + }, + { + "epoch": 0.09076962413704935, + "grad_norm": 93.94200656389914, + "learning_rate": 3.0227272727272728e-06, + "loss": 2.1285, + "step": 1065 + }, + { + "epoch": 0.09085485383107475, + "grad_norm": 89.73340352382402, + "learning_rate": 3.025568181818182e-06, + "loss": 2.4889, + "step": 1066 + }, + { + "epoch": 0.09094008352510015, + "grad_norm": 132.2106152323808, + "learning_rate": 3.0284090909090915e-06, + "loss": 3.1384, + "step": 1067 + }, + { + "epoch": 0.09102531321912555, + "grad_norm": 117.24885480664986, + "learning_rate": 3.03125e-06, + "loss": 2.8328, + "step": 1068 + }, + { + "epoch": 0.09111054291315095, + "grad_norm": 81.9817785873202, + "learning_rate": 3.0340909090909093e-06, + "loss": 2.5836, + "step": 1069 + }, + { + "epoch": 0.09119577260717635, + "grad_norm": 108.95056950204142, + "learning_rate": 3.0369318181818187e-06, + "loss": 3.4389, + "step": 1070 + }, + { + "epoch": 0.09128100230120174, + "grad_norm": 156.87287505792065, + "learning_rate": 3.039772727272727e-06, + "loss": 4.0302, + "step": 1071 + }, + { + "epoch": 0.09136623199522714, + "grad_norm": 149.72487314247508, + "learning_rate": 3.0426136363636366e-06, + "loss": 1.8699, + "step": 1072 + }, + { + "epoch": 0.09145146168925254, + "grad_norm": 77.02813148067723, + "learning_rate": 3.045454545454546e-06, + "loss": 2.6147, + "step": 1073 + }, + { + "epoch": 0.09153669138327794, + "grad_norm": 116.30340236136665, + "learning_rate": 3.0482954545454544e-06, + "loss": 2.5783, + "step": 1074 + }, + { + "epoch": 0.09162192107730334, + "grad_norm": 134.19798235654062, + "learning_rate": 3.0511363636363638e-06, + "loss": 2.0401, + "step": 1075 + }, + { + "epoch": 0.09170715077132872, + "grad_norm": 328.9052371972952, + "learning_rate": 3.053977272727273e-06, + "loss": 4.0826, + "step": 1076 + }, + { + "epoch": 0.09179238046535412, + "grad_norm": 119.72025242244096, + "learning_rate": 3.056818181818182e-06, + "loss": 3.6098, + "step": 1077 + }, + { + "epoch": 0.09187761015937952, + "grad_norm": 117.11233901967489, + "learning_rate": 3.0596590909090914e-06, + "loss": 3.1483, + "step": 1078 + }, + { + "epoch": 0.09196283985340492, + "grad_norm": 76.74264906489454, + "learning_rate": 3.0625000000000003e-06, + "loss": 3.2719, + "step": 1079 + }, + { + "epoch": 0.09204806954743032, + "grad_norm": 61.18374932019411, + "learning_rate": 3.0653409090909093e-06, + "loss": 1.639, + "step": 1080 + }, + { + "epoch": 0.09213329924145572, + "grad_norm": 65.6465402946173, + "learning_rate": 3.0681818181818186e-06, + "loss": 2.039, + "step": 1081 + }, + { + "epoch": 0.09221852893548112, + "grad_norm": 285.46881404047457, + "learning_rate": 3.071022727272728e-06, + "loss": 5.5099, + "step": 1082 + }, + { + "epoch": 0.09230375862950652, + "grad_norm": 154.14514269665472, + "learning_rate": 3.0738636363636365e-06, + "loss": 3.9071, + "step": 1083 + }, + { + "epoch": 0.09238898832353191, + "grad_norm": 83.48023568380133, + "learning_rate": 3.076704545454546e-06, + "loss": 3.0013, + "step": 1084 + }, + { + "epoch": 0.09247421801755731, + "grad_norm": 35.323160500173366, + "learning_rate": 3.079545454545455e-06, + "loss": 1.1336, + "step": 1085 + }, + { + "epoch": 0.09255944771158271, + "grad_norm": 82.04763718486006, + "learning_rate": 3.0823863636363637e-06, + "loss": 2.9429, + "step": 1086 + }, + { + "epoch": 0.09264467740560811, + "grad_norm": 84.54779876176379, + "learning_rate": 3.085227272727273e-06, + "loss": 3.1944, + "step": 1087 + }, + { + "epoch": 0.09272990709963351, + "grad_norm": 127.28824943536247, + "learning_rate": 3.088068181818182e-06, + "loss": 3.1581, + "step": 1088 + }, + { + "epoch": 0.09281513679365891, + "grad_norm": 46.46011067198935, + "learning_rate": 3.090909090909091e-06, + "loss": 1.7718, + "step": 1089 + }, + { + "epoch": 0.09290036648768431, + "grad_norm": 134.61673633474882, + "learning_rate": 3.0937500000000002e-06, + "loss": 3.3115, + "step": 1090 + }, + { + "epoch": 0.0929855961817097, + "grad_norm": 80.43482054098189, + "learning_rate": 3.096590909090909e-06, + "loss": 2.9086, + "step": 1091 + }, + { + "epoch": 0.0930708258757351, + "grad_norm": 122.94662330955958, + "learning_rate": 3.0994318181818185e-06, + "loss": 3.1891, + "step": 1092 + }, + { + "epoch": 0.0931560555697605, + "grad_norm": 55.77129618236803, + "learning_rate": 3.1022727272727274e-06, + "loss": 2.0373, + "step": 1093 + }, + { + "epoch": 0.0932412852637859, + "grad_norm": 174.87156412756397, + "learning_rate": 3.1051136363636364e-06, + "loss": 3.3079, + "step": 1094 + }, + { + "epoch": 0.0933265149578113, + "grad_norm": 106.67498885996251, + "learning_rate": 3.1079545454545457e-06, + "loss": 3.3718, + "step": 1095 + }, + { + "epoch": 0.0934117446518367, + "grad_norm": 104.90198595530978, + "learning_rate": 3.110795454545455e-06, + "loss": 2.806, + "step": 1096 + }, + { + "epoch": 0.0934969743458621, + "grad_norm": 149.01754434286528, + "learning_rate": 3.1136363636363636e-06, + "loss": 2.8689, + "step": 1097 + }, + { + "epoch": 0.0935822040398875, + "grad_norm": 64.78287903003007, + "learning_rate": 3.116477272727273e-06, + "loss": 3.0795, + "step": 1098 + }, + { + "epoch": 0.0936674337339129, + "grad_norm": 93.5138860656651, + "learning_rate": 3.1193181818181823e-06, + "loss": 2.8288, + "step": 1099 + }, + { + "epoch": 0.0937526634279383, + "grad_norm": 74.10639558226904, + "learning_rate": 3.122159090909091e-06, + "loss": 2.9891, + "step": 1100 + }, + { + "epoch": 0.0938378931219637, + "grad_norm": 135.24474971652197, + "learning_rate": 3.125e-06, + "loss": 2.9916, + "step": 1101 + }, + { + "epoch": 0.0939231228159891, + "grad_norm": 95.20328687492903, + "learning_rate": 3.1278409090909095e-06, + "loss": 3.8672, + "step": 1102 + }, + { + "epoch": 0.09400835251001449, + "grad_norm": 78.61521452438649, + "learning_rate": 3.1306818181818184e-06, + "loss": 3.3665, + "step": 1103 + }, + { + "epoch": 0.09409358220403989, + "grad_norm": 117.90747534331102, + "learning_rate": 3.1335227272727274e-06, + "loss": 4.031, + "step": 1104 + }, + { + "epoch": 0.09417881189806529, + "grad_norm": 143.72642758212055, + "learning_rate": 3.1363636363636367e-06, + "loss": 3.2997, + "step": 1105 + }, + { + "epoch": 0.09426404159209069, + "grad_norm": 141.52400516715605, + "learning_rate": 3.1392045454545456e-06, + "loss": 2.8019, + "step": 1106 + }, + { + "epoch": 0.09434927128611609, + "grad_norm": 194.91310693142847, + "learning_rate": 3.142045454545455e-06, + "loss": 2.5882, + "step": 1107 + }, + { + "epoch": 0.09443450098014149, + "grad_norm": 155.92996631612058, + "learning_rate": 3.144886363636364e-06, + "loss": 3.4796, + "step": 1108 + }, + { + "epoch": 0.09451973067416689, + "grad_norm": 92.8578590165742, + "learning_rate": 3.147727272727273e-06, + "loss": 3.2008, + "step": 1109 + }, + { + "epoch": 0.09460496036819228, + "grad_norm": 86.26374922145985, + "learning_rate": 3.150568181818182e-06, + "loss": 3.1854, + "step": 1110 + }, + { + "epoch": 0.09469019006221768, + "grad_norm": 103.71988785052115, + "learning_rate": 3.1534090909090916e-06, + "loss": 3.569, + "step": 1111 + }, + { + "epoch": 0.09477541975624308, + "grad_norm": 90.13625782653841, + "learning_rate": 3.15625e-06, + "loss": 2.4098, + "step": 1112 + }, + { + "epoch": 0.09486064945026847, + "grad_norm": 68.39364762542938, + "learning_rate": 3.1590909090909094e-06, + "loss": 3.2029, + "step": 1113 + }, + { + "epoch": 0.09494587914429387, + "grad_norm": 69.64449422452073, + "learning_rate": 3.1619318181818188e-06, + "loss": 3.2228, + "step": 1114 + }, + { + "epoch": 0.09503110883831926, + "grad_norm": 82.86420687320785, + "learning_rate": 3.1647727272727273e-06, + "loss": 3.1393, + "step": 1115 + }, + { + "epoch": 0.09511633853234466, + "grad_norm": 92.30076240197275, + "learning_rate": 3.1676136363636366e-06, + "loss": 3.0898, + "step": 1116 + }, + { + "epoch": 0.09520156822637006, + "grad_norm": 111.8627207003252, + "learning_rate": 3.1704545454545456e-06, + "loss": 2.9514, + "step": 1117 + }, + { + "epoch": 0.09528679792039546, + "grad_norm": 119.77831412517618, + "learning_rate": 3.173295454545455e-06, + "loss": 3.3759, + "step": 1118 + }, + { + "epoch": 0.09537202761442086, + "grad_norm": 70.87788976097899, + "learning_rate": 3.176136363636364e-06, + "loss": 2.9855, + "step": 1119 + }, + { + "epoch": 0.09545725730844626, + "grad_norm": 89.30036205827507, + "learning_rate": 3.1789772727272728e-06, + "loss": 3.4203, + "step": 1120 + }, + { + "epoch": 0.09554248700247166, + "grad_norm": 100.67721058602028, + "learning_rate": 3.181818181818182e-06, + "loss": 2.8158, + "step": 1121 + }, + { + "epoch": 0.09562771669649706, + "grad_norm": 211.42514488904638, + "learning_rate": 3.1846590909090915e-06, + "loss": 2.7379, + "step": 1122 + }, + { + "epoch": 0.09571294639052245, + "grad_norm": 118.10409656171666, + "learning_rate": 3.1875e-06, + "loss": 2.9575, + "step": 1123 + }, + { + "epoch": 0.09579817608454785, + "grad_norm": 227.40503667752125, + "learning_rate": 3.1903409090909093e-06, + "loss": 4.289, + "step": 1124 + }, + { + "epoch": 0.09588340577857325, + "grad_norm": 62.83386671986698, + "learning_rate": 3.1931818181818187e-06, + "loss": 2.1581, + "step": 1125 + }, + { + "epoch": 0.09596863547259865, + "grad_norm": 199.34833360242516, + "learning_rate": 3.196022727272727e-06, + "loss": 3.9504, + "step": 1126 + }, + { + "epoch": 0.09605386516662405, + "grad_norm": 419.4436046757169, + "learning_rate": 3.1988636363636365e-06, + "loss": 5.1954, + "step": 1127 + }, + { + "epoch": 0.09613909486064945, + "grad_norm": 100.89704628194536, + "learning_rate": 3.201704545454546e-06, + "loss": 3.165, + "step": 1128 + }, + { + "epoch": 0.09622432455467485, + "grad_norm": 231.27074511122782, + "learning_rate": 3.204545454545455e-06, + "loss": 4.0715, + "step": 1129 + }, + { + "epoch": 0.09630955424870025, + "grad_norm": 83.2053576279226, + "learning_rate": 3.2073863636363637e-06, + "loss": 3.1782, + "step": 1130 + }, + { + "epoch": 0.09639478394272565, + "grad_norm": 197.5718186447759, + "learning_rate": 3.210227272727273e-06, + "loss": 4.0472, + "step": 1131 + }, + { + "epoch": 0.09648001363675104, + "grad_norm": 97.70347317421086, + "learning_rate": 3.213068181818182e-06, + "loss": 3.1697, + "step": 1132 + }, + { + "epoch": 0.09656524333077644, + "grad_norm": 164.57520045769243, + "learning_rate": 3.2159090909090914e-06, + "loss": 3.4124, + "step": 1133 + }, + { + "epoch": 0.09665047302480184, + "grad_norm": 94.18064061093114, + "learning_rate": 3.2187500000000003e-06, + "loss": 2.8038, + "step": 1134 + }, + { + "epoch": 0.09673570271882724, + "grad_norm": 104.59300537606363, + "learning_rate": 3.2215909090909092e-06, + "loss": 3.3229, + "step": 1135 + }, + { + "epoch": 0.09682093241285264, + "grad_norm": 85.41975081226022, + "learning_rate": 3.2244318181818186e-06, + "loss": 2.8212, + "step": 1136 + }, + { + "epoch": 0.09690616210687804, + "grad_norm": 117.54209040078824, + "learning_rate": 3.227272727272728e-06, + "loss": 3.6435, + "step": 1137 + }, + { + "epoch": 0.09699139180090344, + "grad_norm": 120.82026851178281, + "learning_rate": 3.2301136363636365e-06, + "loss": 3.7065, + "step": 1138 + }, + { + "epoch": 0.09707662149492884, + "grad_norm": 347.1839575611193, + "learning_rate": 3.232954545454546e-06, + "loss": 4.382, + "step": 1139 + }, + { + "epoch": 0.09716185118895423, + "grad_norm": 311.2889198361093, + "learning_rate": 3.235795454545455e-06, + "loss": 3.9256, + "step": 1140 + }, + { + "epoch": 0.09724708088297963, + "grad_norm": 193.1669179983387, + "learning_rate": 3.2386363636363637e-06, + "loss": 3.5167, + "step": 1141 + }, + { + "epoch": 0.09733231057700503, + "grad_norm": 241.5220614509825, + "learning_rate": 3.241477272727273e-06, + "loss": 3.2656, + "step": 1142 + }, + { + "epoch": 0.09741754027103043, + "grad_norm": 84.6428004938358, + "learning_rate": 3.244318181818182e-06, + "loss": 2.8757, + "step": 1143 + }, + { + "epoch": 0.09750276996505583, + "grad_norm": 187.88400503942373, + "learning_rate": 3.2471590909090913e-06, + "loss": 3.5482, + "step": 1144 + }, + { + "epoch": 0.09758799965908123, + "grad_norm": 72.80802101519791, + "learning_rate": 3.2500000000000002e-06, + "loss": 2.9452, + "step": 1145 + }, + { + "epoch": 0.09767322935310663, + "grad_norm": 110.66956420904553, + "learning_rate": 3.252840909090909e-06, + "loss": 3.4192, + "step": 1146 + }, + { + "epoch": 0.09775845904713203, + "grad_norm": 105.98228102697273, + "learning_rate": 3.2556818181818185e-06, + "loss": 3.7429, + "step": 1147 + }, + { + "epoch": 0.09784368874115743, + "grad_norm": 157.53055349225582, + "learning_rate": 3.258522727272728e-06, + "loss": 3.6668, + "step": 1148 + }, + { + "epoch": 0.09792891843518282, + "grad_norm": 179.518421260964, + "learning_rate": 3.2613636363636364e-06, + "loss": 3.1693, + "step": 1149 + }, + { + "epoch": 0.09801414812920822, + "grad_norm": 80.88740782308689, + "learning_rate": 3.2642045454545457e-06, + "loss": 2.6439, + "step": 1150 + }, + { + "epoch": 0.09809937782323361, + "grad_norm": 87.42649858990677, + "learning_rate": 3.267045454545455e-06, + "loss": 1.7915, + "step": 1151 + }, + { + "epoch": 0.098184607517259, + "grad_norm": 193.9610336999912, + "learning_rate": 3.2698863636363636e-06, + "loss": 3.2002, + "step": 1152 + }, + { + "epoch": 0.0982698372112844, + "grad_norm": 384.6124875440586, + "learning_rate": 3.272727272727273e-06, + "loss": 4.0548, + "step": 1153 + }, + { + "epoch": 0.0983550669053098, + "grad_norm": 126.5939463376886, + "learning_rate": 3.2755681818181823e-06, + "loss": 3.1977, + "step": 1154 + }, + { + "epoch": 0.0984402965993352, + "grad_norm": 97.08470696111678, + "learning_rate": 3.2784090909090908e-06, + "loss": 2.7582, + "step": 1155 + }, + { + "epoch": 0.0985255262933606, + "grad_norm": 115.48153544713529, + "learning_rate": 3.28125e-06, + "loss": 3.4634, + "step": 1156 + }, + { + "epoch": 0.098610755987386, + "grad_norm": 97.47655644966338, + "learning_rate": 3.2840909090909095e-06, + "loss": 3.1618, + "step": 1157 + }, + { + "epoch": 0.0986959856814114, + "grad_norm": 303.57388981098285, + "learning_rate": 3.2869318181818184e-06, + "loss": 4.1443, + "step": 1158 + }, + { + "epoch": 0.0987812153754368, + "grad_norm": 227.65569374993055, + "learning_rate": 3.2897727272727278e-06, + "loss": 2.9475, + "step": 1159 + }, + { + "epoch": 0.0988664450694622, + "grad_norm": 92.64366135053278, + "learning_rate": 3.2926136363636367e-06, + "loss": 3.5584, + "step": 1160 + }, + { + "epoch": 0.0989516747634876, + "grad_norm": 199.6713945610002, + "learning_rate": 3.2954545454545456e-06, + "loss": 4.1747, + "step": 1161 + }, + { + "epoch": 0.099036904457513, + "grad_norm": 138.79649423778153, + "learning_rate": 3.298295454545455e-06, + "loss": 2.8406, + "step": 1162 + }, + { + "epoch": 0.0991221341515384, + "grad_norm": 186.06202560929452, + "learning_rate": 3.3011363636363643e-06, + "loss": 3.4754, + "step": 1163 + }, + { + "epoch": 0.09920736384556379, + "grad_norm": 186.43849947657463, + "learning_rate": 3.303977272727273e-06, + "loss": 3.7635, + "step": 1164 + }, + { + "epoch": 0.09929259353958919, + "grad_norm": 123.95579687110418, + "learning_rate": 3.306818181818182e-06, + "loss": 3.1434, + "step": 1165 + }, + { + "epoch": 0.09937782323361459, + "grad_norm": 170.98415449321683, + "learning_rate": 3.3096590909090915e-06, + "loss": 3.0935, + "step": 1166 + }, + { + "epoch": 0.09946305292763999, + "grad_norm": 89.9946558084069, + "learning_rate": 3.3125e-06, + "loss": 2.7892, + "step": 1167 + }, + { + "epoch": 0.09954828262166539, + "grad_norm": 77.43116039620334, + "learning_rate": 3.3153409090909094e-06, + "loss": 2.581, + "step": 1168 + }, + { + "epoch": 0.09963351231569079, + "grad_norm": 87.84830483882101, + "learning_rate": 3.3181818181818188e-06, + "loss": 2.919, + "step": 1169 + }, + { + "epoch": 0.09971874200971619, + "grad_norm": 79.31859497189193, + "learning_rate": 3.3210227272727273e-06, + "loss": 2.3989, + "step": 1170 + }, + { + "epoch": 0.09980397170374158, + "grad_norm": 286.259560291707, + "learning_rate": 3.3238636363636366e-06, + "loss": 3.449, + "step": 1171 + }, + { + "epoch": 0.09988920139776698, + "grad_norm": 89.30679036666757, + "learning_rate": 3.3267045454545455e-06, + "loss": 2.9661, + "step": 1172 + }, + { + "epoch": 0.09997443109179238, + "grad_norm": 72.46127172807508, + "learning_rate": 3.329545454545455e-06, + "loss": 3.2116, + "step": 1173 + }, + { + "epoch": 0.10005966078581778, + "grad_norm": 139.59455016673732, + "learning_rate": 3.332386363636364e-06, + "loss": 2.4865, + "step": 1174 + }, + { + "epoch": 0.10014489047984318, + "grad_norm": 88.17991199389907, + "learning_rate": 3.3352272727272728e-06, + "loss": 2.9419, + "step": 1175 + }, + { + "epoch": 0.10023012017386858, + "grad_norm": 100.4110528965119, + "learning_rate": 3.338068181818182e-06, + "loss": 3.1972, + "step": 1176 + }, + { + "epoch": 0.10031534986789398, + "grad_norm": 122.06362908564864, + "learning_rate": 3.3409090909090915e-06, + "loss": 2.9605, + "step": 1177 + }, + { + "epoch": 0.10040057956191938, + "grad_norm": 184.39242006406502, + "learning_rate": 3.34375e-06, + "loss": 3.7842, + "step": 1178 + }, + { + "epoch": 0.10048580925594477, + "grad_norm": 109.59996546294295, + "learning_rate": 3.3465909090909093e-06, + "loss": 2.5516, + "step": 1179 + }, + { + "epoch": 0.10057103894997017, + "grad_norm": 159.0000161746763, + "learning_rate": 3.3494318181818187e-06, + "loss": 2.9593, + "step": 1180 + }, + { + "epoch": 0.10065626864399557, + "grad_norm": 134.0049503314019, + "learning_rate": 3.352272727272727e-06, + "loss": 2.253, + "step": 1181 + }, + { + "epoch": 0.10074149833802097, + "grad_norm": 92.48401937647274, + "learning_rate": 3.3551136363636365e-06, + "loss": 3.3613, + "step": 1182 + }, + { + "epoch": 0.10082672803204637, + "grad_norm": 90.84348535413879, + "learning_rate": 3.357954545454546e-06, + "loss": 2.1062, + "step": 1183 + }, + { + "epoch": 0.10091195772607177, + "grad_norm": 109.30736384750945, + "learning_rate": 3.360795454545455e-06, + "loss": 3.8084, + "step": 1184 + }, + { + "epoch": 0.10099718742009717, + "grad_norm": 96.10586592565926, + "learning_rate": 3.3636363636363637e-06, + "loss": 3.4208, + "step": 1185 + }, + { + "epoch": 0.10108241711412257, + "grad_norm": 138.07825929896507, + "learning_rate": 3.366477272727273e-06, + "loss": 2.9005, + "step": 1186 + }, + { + "epoch": 0.10116764680814797, + "grad_norm": 145.12224986994266, + "learning_rate": 3.369318181818182e-06, + "loss": 2.8904, + "step": 1187 + }, + { + "epoch": 0.10125287650217336, + "grad_norm": 170.00512375978388, + "learning_rate": 3.3721590909090914e-06, + "loss": 3.7182, + "step": 1188 + }, + { + "epoch": 0.10133810619619875, + "grad_norm": 120.94275966129464, + "learning_rate": 3.3750000000000003e-06, + "loss": 2.2278, + "step": 1189 + }, + { + "epoch": 0.10142333589022415, + "grad_norm": 200.33776489035876, + "learning_rate": 3.3778409090909092e-06, + "loss": 3.6326, + "step": 1190 + }, + { + "epoch": 0.10150856558424955, + "grad_norm": 53.635293570008294, + "learning_rate": 3.3806818181818186e-06, + "loss": 1.6496, + "step": 1191 + }, + { + "epoch": 0.10159379527827495, + "grad_norm": 122.39142693034466, + "learning_rate": 3.383522727272728e-06, + "loss": 3.6246, + "step": 1192 + }, + { + "epoch": 0.10167902497230034, + "grad_norm": 205.49304409300916, + "learning_rate": 3.3863636363636364e-06, + "loss": 5.0368, + "step": 1193 + }, + { + "epoch": 0.10176425466632574, + "grad_norm": 104.42110968925515, + "learning_rate": 3.389204545454546e-06, + "loss": 2.8224, + "step": 1194 + }, + { + "epoch": 0.10184948436035114, + "grad_norm": 230.7069467544212, + "learning_rate": 3.392045454545455e-06, + "loss": 3.4569, + "step": 1195 + }, + { + "epoch": 0.10193471405437654, + "grad_norm": 161.79216000111597, + "learning_rate": 3.3948863636363636e-06, + "loss": 3.0616, + "step": 1196 + }, + { + "epoch": 0.10201994374840194, + "grad_norm": 60.44326986105123, + "learning_rate": 3.397727272727273e-06, + "loss": 2.5098, + "step": 1197 + }, + { + "epoch": 0.10210517344242734, + "grad_norm": 58.95242523117896, + "learning_rate": 3.400568181818182e-06, + "loss": 1.9869, + "step": 1198 + }, + { + "epoch": 0.10219040313645274, + "grad_norm": 156.2770106166637, + "learning_rate": 3.4034090909090913e-06, + "loss": 3.1816, + "step": 1199 + }, + { + "epoch": 0.10227563283047814, + "grad_norm": 111.68892263003815, + "learning_rate": 3.40625e-06, + "loss": 2.7652, + "step": 1200 + }, + { + "epoch": 0.10236086252450353, + "grad_norm": 93.2143992118315, + "learning_rate": 3.409090909090909e-06, + "loss": 2.0297, + "step": 1201 + }, + { + "epoch": 0.10244609221852893, + "grad_norm": 98.01503760052829, + "learning_rate": 3.4119318181818185e-06, + "loss": 2.7871, + "step": 1202 + }, + { + "epoch": 0.10253132191255433, + "grad_norm": 85.33116060076944, + "learning_rate": 3.414772727272728e-06, + "loss": 2.8754, + "step": 1203 + }, + { + "epoch": 0.10261655160657973, + "grad_norm": 86.0725167626916, + "learning_rate": 3.4176136363636363e-06, + "loss": 2.8852, + "step": 1204 + }, + { + "epoch": 0.10270178130060513, + "grad_norm": 108.50722016675704, + "learning_rate": 3.4204545454545457e-06, + "loss": 3.2045, + "step": 1205 + }, + { + "epoch": 0.10278701099463053, + "grad_norm": 176.89849266690794, + "learning_rate": 3.423295454545455e-06, + "loss": 4.1703, + "step": 1206 + }, + { + "epoch": 0.10287224068865593, + "grad_norm": 190.9259648540823, + "learning_rate": 3.4261363636363636e-06, + "loss": 3.7888, + "step": 1207 + }, + { + "epoch": 0.10295747038268133, + "grad_norm": 104.34334412900591, + "learning_rate": 3.428977272727273e-06, + "loss": 3.4234, + "step": 1208 + }, + { + "epoch": 0.10304270007670673, + "grad_norm": 334.2565606518354, + "learning_rate": 3.4318181818181823e-06, + "loss": 3.5767, + "step": 1209 + }, + { + "epoch": 0.10312792977073212, + "grad_norm": 155.80814951379887, + "learning_rate": 3.434659090909091e-06, + "loss": 2.5518, + "step": 1210 + }, + { + "epoch": 0.10321315946475752, + "grad_norm": 144.2100017247856, + "learning_rate": 3.4375e-06, + "loss": 2.6425, + "step": 1211 + }, + { + "epoch": 0.10329838915878292, + "grad_norm": 73.7374207068338, + "learning_rate": 3.4403409090909095e-06, + "loss": 2.47, + "step": 1212 + }, + { + "epoch": 0.10338361885280832, + "grad_norm": 94.7761538202155, + "learning_rate": 3.4431818181818184e-06, + "loss": 3.3754, + "step": 1213 + }, + { + "epoch": 0.10346884854683372, + "grad_norm": 102.89932769646211, + "learning_rate": 3.4460227272727278e-06, + "loss": 3.1963, + "step": 1214 + }, + { + "epoch": 0.10355407824085912, + "grad_norm": 234.34956539688068, + "learning_rate": 3.4488636363636367e-06, + "loss": 4.8669, + "step": 1215 + }, + { + "epoch": 0.10363930793488452, + "grad_norm": 179.58618923556728, + "learning_rate": 3.4517045454545456e-06, + "loss": 3.0671, + "step": 1216 + }, + { + "epoch": 0.10372453762890992, + "grad_norm": 229.54317783828947, + "learning_rate": 3.454545454545455e-06, + "loss": 3.5515, + "step": 1217 + }, + { + "epoch": 0.10380976732293531, + "grad_norm": 90.56120164934292, + "learning_rate": 3.4573863636363643e-06, + "loss": 2.2828, + "step": 1218 + }, + { + "epoch": 0.10389499701696071, + "grad_norm": 95.86234490731822, + "learning_rate": 3.460227272727273e-06, + "loss": 3.2258, + "step": 1219 + }, + { + "epoch": 0.10398022671098611, + "grad_norm": 177.86464606835156, + "learning_rate": 3.463068181818182e-06, + "loss": 3.8359, + "step": 1220 + }, + { + "epoch": 0.10406545640501151, + "grad_norm": 74.8368140006128, + "learning_rate": 3.4659090909090915e-06, + "loss": 2.9985, + "step": 1221 + }, + { + "epoch": 0.10415068609903691, + "grad_norm": 124.22881348284021, + "learning_rate": 3.46875e-06, + "loss": 3.289, + "step": 1222 + }, + { + "epoch": 0.10423591579306231, + "grad_norm": 86.28899833231722, + "learning_rate": 3.4715909090909094e-06, + "loss": 2.842, + "step": 1223 + }, + { + "epoch": 0.10432114548708771, + "grad_norm": 64.77199027365455, + "learning_rate": 3.4744318181818187e-06, + "loss": 2.0304, + "step": 1224 + }, + { + "epoch": 0.1044063751811131, + "grad_norm": 89.2423209221669, + "learning_rate": 3.4772727272727277e-06, + "loss": 3.5193, + "step": 1225 + }, + { + "epoch": 0.1044916048751385, + "grad_norm": 113.67100921861112, + "learning_rate": 3.4801136363636366e-06, + "loss": 2.8331, + "step": 1226 + }, + { + "epoch": 0.10457683456916389, + "grad_norm": 137.10382454737424, + "learning_rate": 3.4829545454545455e-06, + "loss": 3.3032, + "step": 1227 + }, + { + "epoch": 0.10466206426318929, + "grad_norm": 66.48098428421905, + "learning_rate": 3.485795454545455e-06, + "loss": 3.0048, + "step": 1228 + }, + { + "epoch": 0.10474729395721469, + "grad_norm": 131.0877148581104, + "learning_rate": 3.4886363636363642e-06, + "loss": 2.8794, + "step": 1229 + }, + { + "epoch": 0.10483252365124009, + "grad_norm": 96.16113675562903, + "learning_rate": 3.4914772727272727e-06, + "loss": 2.775, + "step": 1230 + }, + { + "epoch": 0.10491775334526549, + "grad_norm": 79.44237912919489, + "learning_rate": 3.494318181818182e-06, + "loss": 2.6909, + "step": 1231 + }, + { + "epoch": 0.10500298303929088, + "grad_norm": 154.1967133341986, + "learning_rate": 3.4971590909090914e-06, + "loss": 3.6038, + "step": 1232 + }, + { + "epoch": 0.10508821273331628, + "grad_norm": 95.23273627294182, + "learning_rate": 3.5e-06, + "loss": 2.9048, + "step": 1233 + }, + { + "epoch": 0.10517344242734168, + "grad_norm": 101.94019592395786, + "learning_rate": 3.5028409090909093e-06, + "loss": 3.4733, + "step": 1234 + }, + { + "epoch": 0.10525867212136708, + "grad_norm": 144.7408513935955, + "learning_rate": 3.5056818181818187e-06, + "loss": 3.1248, + "step": 1235 + }, + { + "epoch": 0.10534390181539248, + "grad_norm": 89.67246797303548, + "learning_rate": 3.508522727272727e-06, + "loss": 3.2799, + "step": 1236 + }, + { + "epoch": 0.10542913150941788, + "grad_norm": 140.1408100551008, + "learning_rate": 3.5113636363636365e-06, + "loss": 2.6777, + "step": 1237 + }, + { + "epoch": 0.10551436120344328, + "grad_norm": 112.60040550938531, + "learning_rate": 3.514204545454546e-06, + "loss": 4.1345, + "step": 1238 + }, + { + "epoch": 0.10559959089746868, + "grad_norm": 133.34285493134809, + "learning_rate": 3.517045454545455e-06, + "loss": 2.6489, + "step": 1239 + }, + { + "epoch": 0.10568482059149407, + "grad_norm": 82.12191513648831, + "learning_rate": 3.519886363636364e-06, + "loss": 2.726, + "step": 1240 + }, + { + "epoch": 0.10577005028551947, + "grad_norm": 166.90373298506367, + "learning_rate": 3.522727272727273e-06, + "loss": 4.1774, + "step": 1241 + }, + { + "epoch": 0.10585527997954487, + "grad_norm": 122.832359290391, + "learning_rate": 3.525568181818182e-06, + "loss": 3.097, + "step": 1242 + }, + { + "epoch": 0.10594050967357027, + "grad_norm": 114.7796112569593, + "learning_rate": 3.5284090909090914e-06, + "loss": 3.2436, + "step": 1243 + }, + { + "epoch": 0.10602573936759567, + "grad_norm": 93.69387881970694, + "learning_rate": 3.5312500000000007e-06, + "loss": 3.505, + "step": 1244 + }, + { + "epoch": 0.10611096906162107, + "grad_norm": 58.8176480506675, + "learning_rate": 3.5340909090909092e-06, + "loss": 2.5669, + "step": 1245 + }, + { + "epoch": 0.10619619875564647, + "grad_norm": 98.95319095396032, + "learning_rate": 3.5369318181818186e-06, + "loss": 3.9273, + "step": 1246 + }, + { + "epoch": 0.10628142844967187, + "grad_norm": 113.61104948191459, + "learning_rate": 3.539772727272728e-06, + "loss": 3.3222, + "step": 1247 + }, + { + "epoch": 0.10636665814369727, + "grad_norm": 83.81549222410808, + "learning_rate": 3.5426136363636364e-06, + "loss": 3.3619, + "step": 1248 + }, + { + "epoch": 0.10645188783772266, + "grad_norm": 91.73700240132092, + "learning_rate": 3.5454545454545458e-06, + "loss": 3.1491, + "step": 1249 + }, + { + "epoch": 0.10653711753174806, + "grad_norm": 78.6582229158151, + "learning_rate": 3.548295454545455e-06, + "loss": 3.4857, + "step": 1250 + }, + { + "epoch": 0.10662234722577346, + "grad_norm": 123.56710931825343, + "learning_rate": 3.5511363636363636e-06, + "loss": 3.2614, + "step": 1251 + }, + { + "epoch": 0.10670757691979886, + "grad_norm": 206.92058677357173, + "learning_rate": 3.553977272727273e-06, + "loss": 3.7334, + "step": 1252 + }, + { + "epoch": 0.10679280661382426, + "grad_norm": 165.86095131653718, + "learning_rate": 3.556818181818182e-06, + "loss": 3.7035, + "step": 1253 + }, + { + "epoch": 0.10687803630784966, + "grad_norm": 244.79551993754575, + "learning_rate": 3.5596590909090913e-06, + "loss": 3.5434, + "step": 1254 + }, + { + "epoch": 0.10696326600187506, + "grad_norm": 71.52534471146005, + "learning_rate": 3.5625e-06, + "loss": 2.0894, + "step": 1255 + }, + { + "epoch": 0.10704849569590046, + "grad_norm": 88.19304951247021, + "learning_rate": 3.565340909090909e-06, + "loss": 3.4202, + "step": 1256 + }, + { + "epoch": 0.10713372538992585, + "grad_norm": 87.94400478388029, + "learning_rate": 3.5681818181818185e-06, + "loss": 3.0816, + "step": 1257 + }, + { + "epoch": 0.10721895508395125, + "grad_norm": 82.47945250305422, + "learning_rate": 3.571022727272728e-06, + "loss": 2.9593, + "step": 1258 + }, + { + "epoch": 0.10730418477797665, + "grad_norm": 220.1769693601156, + "learning_rate": 3.5738636363636363e-06, + "loss": 4.1635, + "step": 1259 + }, + { + "epoch": 0.10738941447200205, + "grad_norm": 141.8987966130459, + "learning_rate": 3.5767045454545457e-06, + "loss": 3.0977, + "step": 1260 + }, + { + "epoch": 0.10747464416602745, + "grad_norm": 154.8702287723961, + "learning_rate": 3.579545454545455e-06, + "loss": 4.2532, + "step": 1261 + }, + { + "epoch": 0.10755987386005285, + "grad_norm": 120.7150975538768, + "learning_rate": 3.5823863636363635e-06, + "loss": 2.4223, + "step": 1262 + }, + { + "epoch": 0.10764510355407825, + "grad_norm": 186.00801407700067, + "learning_rate": 3.585227272727273e-06, + "loss": 3.5208, + "step": 1263 + }, + { + "epoch": 0.10773033324810363, + "grad_norm": 73.6912309561681, + "learning_rate": 3.5880681818181823e-06, + "loss": 3.3309, + "step": 1264 + }, + { + "epoch": 0.10781556294212903, + "grad_norm": 160.68821527473068, + "learning_rate": 3.590909090909091e-06, + "loss": 4.0316, + "step": 1265 + }, + { + "epoch": 0.10790079263615443, + "grad_norm": 114.69666425951682, + "learning_rate": 3.59375e-06, + "loss": 3.6975, + "step": 1266 + }, + { + "epoch": 0.10798602233017983, + "grad_norm": 78.3342096231345, + "learning_rate": 3.5965909090909095e-06, + "loss": 2.439, + "step": 1267 + }, + { + "epoch": 0.10807125202420523, + "grad_norm": 112.90283039228979, + "learning_rate": 3.5994318181818184e-06, + "loss": 3.0025, + "step": 1268 + }, + { + "epoch": 0.10815648171823063, + "grad_norm": 1132.6674786169676, + "learning_rate": 3.6022727272727277e-06, + "loss": 4.3969, + "step": 1269 + }, + { + "epoch": 0.10824171141225603, + "grad_norm": 73.54134535025177, + "learning_rate": 3.6051136363636367e-06, + "loss": 3.0017, + "step": 1270 + }, + { + "epoch": 0.10832694110628142, + "grad_norm": 141.69400737786887, + "learning_rate": 3.6079545454545456e-06, + "loss": 2.9929, + "step": 1271 + }, + { + "epoch": 0.10841217080030682, + "grad_norm": 127.1358144939968, + "learning_rate": 3.610795454545455e-06, + "loss": 3.1926, + "step": 1272 + }, + { + "epoch": 0.10849740049433222, + "grad_norm": 238.57907989569335, + "learning_rate": 3.6136363636363643e-06, + "loss": 3.2343, + "step": 1273 + }, + { + "epoch": 0.10858263018835762, + "grad_norm": 335.81899965171465, + "learning_rate": 3.616477272727273e-06, + "loss": 3.8567, + "step": 1274 + }, + { + "epoch": 0.10866785988238302, + "grad_norm": 83.37756940957598, + "learning_rate": 3.619318181818182e-06, + "loss": 2.9148, + "step": 1275 + }, + { + "epoch": 0.10875308957640842, + "grad_norm": 129.16049699896308, + "learning_rate": 3.6221590909090915e-06, + "loss": 3.9301, + "step": 1276 + }, + { + "epoch": 0.10883831927043382, + "grad_norm": 138.6254323534345, + "learning_rate": 3.625e-06, + "loss": 3.1899, + "step": 1277 + }, + { + "epoch": 0.10892354896445922, + "grad_norm": 118.6075651522683, + "learning_rate": 3.6278409090909094e-06, + "loss": 3.6046, + "step": 1278 + }, + { + "epoch": 0.10900877865848461, + "grad_norm": 71.2637007983754, + "learning_rate": 3.6306818181818187e-06, + "loss": 2.4541, + "step": 1279 + }, + { + "epoch": 0.10909400835251001, + "grad_norm": 84.78991497442331, + "learning_rate": 3.6335227272727277e-06, + "loss": 2.604, + "step": 1280 + }, + { + "epoch": 0.10917923804653541, + "grad_norm": 531.3595688909447, + "learning_rate": 3.6363636363636366e-06, + "loss": 4.3251, + "step": 1281 + }, + { + "epoch": 0.10926446774056081, + "grad_norm": 85.41586231864702, + "learning_rate": 3.6392045454545455e-06, + "loss": 3.024, + "step": 1282 + }, + { + "epoch": 0.10934969743458621, + "grad_norm": 108.31394621577134, + "learning_rate": 3.642045454545455e-06, + "loss": 2.6711, + "step": 1283 + }, + { + "epoch": 0.10943492712861161, + "grad_norm": 136.86161348705753, + "learning_rate": 3.6448863636363642e-06, + "loss": 2.2917, + "step": 1284 + }, + { + "epoch": 0.10952015682263701, + "grad_norm": 110.5815268560128, + "learning_rate": 3.6477272727272727e-06, + "loss": 3.0909, + "step": 1285 + }, + { + "epoch": 0.1096053865166624, + "grad_norm": 90.06951337917164, + "learning_rate": 3.650568181818182e-06, + "loss": 3.3103, + "step": 1286 + }, + { + "epoch": 0.1096906162106878, + "grad_norm": 235.32536287523325, + "learning_rate": 3.6534090909090914e-06, + "loss": 3.5575, + "step": 1287 + }, + { + "epoch": 0.1097758459047132, + "grad_norm": 48.21743035091666, + "learning_rate": 3.65625e-06, + "loss": 2.0517, + "step": 1288 + }, + { + "epoch": 0.1098610755987386, + "grad_norm": 117.20418488019303, + "learning_rate": 3.6590909090909093e-06, + "loss": 2.3464, + "step": 1289 + }, + { + "epoch": 0.109946305292764, + "grad_norm": 69.82280529036505, + "learning_rate": 3.6619318181818186e-06, + "loss": 3.0854, + "step": 1290 + }, + { + "epoch": 0.1100315349867894, + "grad_norm": 129.07308085232663, + "learning_rate": 3.6647727272727276e-06, + "loss": 3.4409, + "step": 1291 + }, + { + "epoch": 0.1101167646808148, + "grad_norm": 264.21045190967305, + "learning_rate": 3.6676136363636365e-06, + "loss": 3.8564, + "step": 1292 + }, + { + "epoch": 0.1102019943748402, + "grad_norm": 106.80061351809906, + "learning_rate": 3.670454545454546e-06, + "loss": 3.3521, + "step": 1293 + }, + { + "epoch": 0.1102872240688656, + "grad_norm": 164.54982051353443, + "learning_rate": 3.6732954545454548e-06, + "loss": 2.958, + "step": 1294 + }, + { + "epoch": 0.110372453762891, + "grad_norm": 112.18846919139716, + "learning_rate": 3.676136363636364e-06, + "loss": 3.1339, + "step": 1295 + }, + { + "epoch": 0.1104576834569164, + "grad_norm": 94.43752902627779, + "learning_rate": 3.678977272727273e-06, + "loss": 2.5828, + "step": 1296 + }, + { + "epoch": 0.1105429131509418, + "grad_norm": 195.08723191434237, + "learning_rate": 3.681818181818182e-06, + "loss": 4.0944, + "step": 1297 + }, + { + "epoch": 0.11062814284496719, + "grad_norm": 105.74779026825523, + "learning_rate": 3.6846590909090913e-06, + "loss": 1.7358, + "step": 1298 + }, + { + "epoch": 0.11071337253899259, + "grad_norm": 68.78422606976298, + "learning_rate": 3.6875000000000007e-06, + "loss": 2.1592, + "step": 1299 + }, + { + "epoch": 0.11079860223301799, + "grad_norm": 116.91940306679757, + "learning_rate": 3.690340909090909e-06, + "loss": 3.5207, + "step": 1300 + }, + { + "epoch": 0.11088383192704339, + "grad_norm": 131.58801101174993, + "learning_rate": 3.6931818181818186e-06, + "loss": 2.9987, + "step": 1301 + }, + { + "epoch": 0.11096906162106877, + "grad_norm": 87.49078309875908, + "learning_rate": 3.696022727272728e-06, + "loss": 3.0905, + "step": 1302 + }, + { + "epoch": 0.11105429131509417, + "grad_norm": 244.4950415161008, + "learning_rate": 3.6988636363636364e-06, + "loss": 4.9488, + "step": 1303 + }, + { + "epoch": 0.11113952100911957, + "grad_norm": 67.83291338352201, + "learning_rate": 3.7017045454545458e-06, + "loss": 2.5314, + "step": 1304 + }, + { + "epoch": 0.11122475070314497, + "grad_norm": 84.78211843839188, + "learning_rate": 3.704545454545455e-06, + "loss": 3.2503, + "step": 1305 + }, + { + "epoch": 0.11130998039717037, + "grad_norm": 179.36673536201178, + "learning_rate": 3.707386363636364e-06, + "loss": 4.041, + "step": 1306 + }, + { + "epoch": 0.11139521009119577, + "grad_norm": 55.766869658261506, + "learning_rate": 3.710227272727273e-06, + "loss": 2.2486, + "step": 1307 + }, + { + "epoch": 0.11148043978522117, + "grad_norm": 79.15707667577726, + "learning_rate": 3.713068181818182e-06, + "loss": 2.7676, + "step": 1308 + }, + { + "epoch": 0.11156566947924657, + "grad_norm": 78.31122775656075, + "learning_rate": 3.7159090909090913e-06, + "loss": 3.1993, + "step": 1309 + }, + { + "epoch": 0.11165089917327196, + "grad_norm": 190.46459516500553, + "learning_rate": 3.7187500000000006e-06, + "loss": 4.3633, + "step": 1310 + }, + { + "epoch": 0.11173612886729736, + "grad_norm": 127.77817415864098, + "learning_rate": 3.721590909090909e-06, + "loss": 3.2153, + "step": 1311 + }, + { + "epoch": 0.11182135856132276, + "grad_norm": 72.12292857138213, + "learning_rate": 3.7244318181818185e-06, + "loss": 2.5761, + "step": 1312 + }, + { + "epoch": 0.11190658825534816, + "grad_norm": 108.59833076674099, + "learning_rate": 3.727272727272728e-06, + "loss": 3.6959, + "step": 1313 + }, + { + "epoch": 0.11199181794937356, + "grad_norm": 115.99525075868102, + "learning_rate": 3.7301136363636363e-06, + "loss": 3.4649, + "step": 1314 + }, + { + "epoch": 0.11207704764339896, + "grad_norm": 189.2111858819128, + "learning_rate": 3.7329545454545457e-06, + "loss": 3.8777, + "step": 1315 + }, + { + "epoch": 0.11216227733742436, + "grad_norm": 78.9731570882107, + "learning_rate": 3.735795454545455e-06, + "loss": 3.3086, + "step": 1316 + }, + { + "epoch": 0.11224750703144976, + "grad_norm": 77.8545062902044, + "learning_rate": 3.7386363636363635e-06, + "loss": 3.0563, + "step": 1317 + }, + { + "epoch": 0.11233273672547515, + "grad_norm": 104.19531921992923, + "learning_rate": 3.741477272727273e-06, + "loss": 3.5854, + "step": 1318 + }, + { + "epoch": 0.11241796641950055, + "grad_norm": 117.19082695104802, + "learning_rate": 3.7443181818181822e-06, + "loss": 3.1192, + "step": 1319 + }, + { + "epoch": 0.11250319611352595, + "grad_norm": 122.80851219405383, + "learning_rate": 3.747159090909091e-06, + "loss": 3.5484, + "step": 1320 + }, + { + "epoch": 0.11258842580755135, + "grad_norm": 145.81879157058418, + "learning_rate": 3.7500000000000005e-06, + "loss": 3.4907, + "step": 1321 + }, + { + "epoch": 0.11267365550157675, + "grad_norm": 148.28005989207728, + "learning_rate": 3.7528409090909094e-06, + "loss": 3.6643, + "step": 1322 + }, + { + "epoch": 0.11275888519560215, + "grad_norm": 90.65793450784794, + "learning_rate": 3.7556818181818184e-06, + "loss": 2.1596, + "step": 1323 + }, + { + "epoch": 0.11284411488962755, + "grad_norm": 104.20716219538534, + "learning_rate": 3.7585227272727277e-06, + "loss": 3.2877, + "step": 1324 + }, + { + "epoch": 0.11292934458365295, + "grad_norm": 111.6282832775841, + "learning_rate": 3.761363636363637e-06, + "loss": 3.0596, + "step": 1325 + }, + { + "epoch": 0.11301457427767835, + "grad_norm": 100.00099059369906, + "learning_rate": 3.7642045454545456e-06, + "loss": 2.641, + "step": 1326 + }, + { + "epoch": 0.11309980397170374, + "grad_norm": 158.295557146595, + "learning_rate": 3.767045454545455e-06, + "loss": 3.3554, + "step": 1327 + }, + { + "epoch": 0.11318503366572914, + "grad_norm": 157.9015890530165, + "learning_rate": 3.7698863636363643e-06, + "loss": 3.0405, + "step": 1328 + }, + { + "epoch": 0.11327026335975454, + "grad_norm": 118.20218313723072, + "learning_rate": 3.772727272727273e-06, + "loss": 3.7974, + "step": 1329 + }, + { + "epoch": 0.11335549305377994, + "grad_norm": 79.18540425331425, + "learning_rate": 3.775568181818182e-06, + "loss": 3.1172, + "step": 1330 + }, + { + "epoch": 0.11344072274780534, + "grad_norm": 247.3668475667921, + "learning_rate": 3.7784090909090915e-06, + "loss": 2.6214, + "step": 1331 + }, + { + "epoch": 0.11352595244183074, + "grad_norm": 122.541609658505, + "learning_rate": 3.78125e-06, + "loss": 4.0249, + "step": 1332 + }, + { + "epoch": 0.11361118213585614, + "grad_norm": 82.07313941236843, + "learning_rate": 3.7840909090909094e-06, + "loss": 3.2244, + "step": 1333 + }, + { + "epoch": 0.11369641182988154, + "grad_norm": 142.46374003494074, + "learning_rate": 3.7869318181818187e-06, + "loss": 3.1673, + "step": 1334 + }, + { + "epoch": 0.11378164152390693, + "grad_norm": 107.48226730282049, + "learning_rate": 3.7897727272727276e-06, + "loss": 3.2085, + "step": 1335 + }, + { + "epoch": 0.11386687121793233, + "grad_norm": 103.14314967833431, + "learning_rate": 3.7926136363636366e-06, + "loss": 3.1411, + "step": 1336 + }, + { + "epoch": 0.11395210091195773, + "grad_norm": 116.64114881016121, + "learning_rate": 3.7954545454545455e-06, + "loss": 3.2905, + "step": 1337 + }, + { + "epoch": 0.11403733060598313, + "grad_norm": 69.37721305808078, + "learning_rate": 3.798295454545455e-06, + "loss": 2.3278, + "step": 1338 + }, + { + "epoch": 0.11412256030000853, + "grad_norm": 54.72220913011939, + "learning_rate": 3.801136363636364e-06, + "loss": 2.7916, + "step": 1339 + }, + { + "epoch": 0.11420778999403391, + "grad_norm": 73.69920220193642, + "learning_rate": 3.8039772727272727e-06, + "loss": 2.2136, + "step": 1340 + }, + { + "epoch": 0.11429301968805931, + "grad_norm": 75.24874923985752, + "learning_rate": 3.806818181818182e-06, + "loss": 3.1305, + "step": 1341 + }, + { + "epoch": 0.11437824938208471, + "grad_norm": 247.72721658630658, + "learning_rate": 3.8096590909090914e-06, + "loss": 3.0681, + "step": 1342 + }, + { + "epoch": 0.11446347907611011, + "grad_norm": 176.07860855612347, + "learning_rate": 3.8125e-06, + "loss": 3.1727, + "step": 1343 + }, + { + "epoch": 0.11454870877013551, + "grad_norm": 134.57304802931736, + "learning_rate": 3.815340909090909e-06, + "loss": 3.1935, + "step": 1344 + }, + { + "epoch": 0.11463393846416091, + "grad_norm": 63.348954860182346, + "learning_rate": 3.818181818181819e-06, + "loss": 2.8209, + "step": 1345 + }, + { + "epoch": 0.11471916815818631, + "grad_norm": 94.33502664050296, + "learning_rate": 3.821022727272727e-06, + "loss": 3.3687, + "step": 1346 + }, + { + "epoch": 0.1148043978522117, + "grad_norm": 111.37592005145868, + "learning_rate": 3.823863636363637e-06, + "loss": 2.5487, + "step": 1347 + }, + { + "epoch": 0.1148896275462371, + "grad_norm": 159.71058695270068, + "learning_rate": 3.826704545454546e-06, + "loss": 3.4012, + "step": 1348 + }, + { + "epoch": 0.1149748572402625, + "grad_norm": 104.16499112142945, + "learning_rate": 3.829545454545455e-06, + "loss": 3.4035, + "step": 1349 + }, + { + "epoch": 0.1150600869342879, + "grad_norm": 134.3459360093547, + "learning_rate": 3.832386363636364e-06, + "loss": 2.8579, + "step": 1350 + }, + { + "epoch": 0.1151453166283133, + "grad_norm": 228.94709005983685, + "learning_rate": 3.8352272727272735e-06, + "loss": 3.7723, + "step": 1351 + }, + { + "epoch": 0.1152305463223387, + "grad_norm": 68.15219035483415, + "learning_rate": 3.8380681818181816e-06, + "loss": 2.8958, + "step": 1352 + }, + { + "epoch": 0.1153157760163641, + "grad_norm": 270.74439945198293, + "learning_rate": 3.840909090909091e-06, + "loss": 4.1699, + "step": 1353 + }, + { + "epoch": 0.1154010057103895, + "grad_norm": 77.66928329864126, + "learning_rate": 3.84375e-06, + "loss": 2.9646, + "step": 1354 + }, + { + "epoch": 0.1154862354044149, + "grad_norm": 68.3695206788178, + "learning_rate": 3.846590909090909e-06, + "loss": 2.8892, + "step": 1355 + }, + { + "epoch": 0.1155714650984403, + "grad_norm": 96.5867534279561, + "learning_rate": 3.849431818181819e-06, + "loss": 2.6388, + "step": 1356 + }, + { + "epoch": 0.1156566947924657, + "grad_norm": 75.04542684319858, + "learning_rate": 3.852272727272728e-06, + "loss": 2.3338, + "step": 1357 + }, + { + "epoch": 0.1157419244864911, + "grad_norm": 144.94472623877476, + "learning_rate": 3.855113636363637e-06, + "loss": 3.5721, + "step": 1358 + }, + { + "epoch": 0.11582715418051649, + "grad_norm": 183.3563922044586, + "learning_rate": 3.857954545454546e-06, + "loss": 3.4172, + "step": 1359 + }, + { + "epoch": 0.11591238387454189, + "grad_norm": 214.35461727051006, + "learning_rate": 3.8607954545454555e-06, + "loss": 3.9053, + "step": 1360 + }, + { + "epoch": 0.11599761356856729, + "grad_norm": 90.28340132836077, + "learning_rate": 3.863636363636364e-06, + "loss": 3.2754, + "step": 1361 + }, + { + "epoch": 0.11608284326259269, + "grad_norm": 161.44733873342577, + "learning_rate": 3.866477272727273e-06, + "loss": 4.4183, + "step": 1362 + }, + { + "epoch": 0.11616807295661809, + "grad_norm": 376.8076746430527, + "learning_rate": 3.8693181818181815e-06, + "loss": 4.1115, + "step": 1363 + }, + { + "epoch": 0.11625330265064349, + "grad_norm": 136.61982119847, + "learning_rate": 3.872159090909091e-06, + "loss": 2.6602, + "step": 1364 + }, + { + "epoch": 0.11633853234466889, + "grad_norm": 160.8418642027409, + "learning_rate": 3.875e-06, + "loss": 3.6681, + "step": 1365 + }, + { + "epoch": 0.11642376203869428, + "grad_norm": 75.63978362532471, + "learning_rate": 3.877840909090909e-06, + "loss": 2.0441, + "step": 1366 + }, + { + "epoch": 0.11650899173271968, + "grad_norm": 73.18449998863167, + "learning_rate": 3.880681818181818e-06, + "loss": 2.4437, + "step": 1367 + }, + { + "epoch": 0.11659422142674508, + "grad_norm": 63.798551626755895, + "learning_rate": 3.883522727272728e-06, + "loss": 2.9989, + "step": 1368 + }, + { + "epoch": 0.11667945112077048, + "grad_norm": 67.58957418981471, + "learning_rate": 3.886363636363637e-06, + "loss": 3.2333, + "step": 1369 + }, + { + "epoch": 0.11676468081479588, + "grad_norm": 143.72601599013393, + "learning_rate": 3.889204545454546e-06, + "loss": 4.0645, + "step": 1370 + }, + { + "epoch": 0.11684991050882128, + "grad_norm": 122.10596491089387, + "learning_rate": 3.8920454545454554e-06, + "loss": 3.1587, + "step": 1371 + }, + { + "epoch": 0.11693514020284668, + "grad_norm": 111.09920087696689, + "learning_rate": 3.8948863636363635e-06, + "loss": 2.8298, + "step": 1372 + }, + { + "epoch": 0.11702036989687208, + "grad_norm": 231.60035355460911, + "learning_rate": 3.897727272727273e-06, + "loss": 4.5979, + "step": 1373 + }, + { + "epoch": 0.11710559959089747, + "grad_norm": 62.14494626757722, + "learning_rate": 3.900568181818182e-06, + "loss": 2.8014, + "step": 1374 + }, + { + "epoch": 0.11719082928492287, + "grad_norm": 194.13087412163, + "learning_rate": 3.903409090909091e-06, + "loss": 3.8999, + "step": 1375 + }, + { + "epoch": 0.11727605897894827, + "grad_norm": 64.83336412642832, + "learning_rate": 3.90625e-06, + "loss": 2.8252, + "step": 1376 + }, + { + "epoch": 0.11736128867297367, + "grad_norm": 105.87271651063216, + "learning_rate": 3.90909090909091e-06, + "loss": 3.4691, + "step": 1377 + }, + { + "epoch": 0.11744651836699906, + "grad_norm": 73.27446470596891, + "learning_rate": 3.911931818181818e-06, + "loss": 3.4446, + "step": 1378 + }, + { + "epoch": 0.11753174806102445, + "grad_norm": 70.2952019348431, + "learning_rate": 3.914772727272728e-06, + "loss": 1.9149, + "step": 1379 + }, + { + "epoch": 0.11761697775504985, + "grad_norm": 253.39205245530653, + "learning_rate": 3.917613636363637e-06, + "loss": 3.9519, + "step": 1380 + }, + { + "epoch": 0.11770220744907525, + "grad_norm": 183.32626073559413, + "learning_rate": 3.9204545454545456e-06, + "loss": 3.9061, + "step": 1381 + }, + { + "epoch": 0.11778743714310065, + "grad_norm": 91.77987620370556, + "learning_rate": 3.9232954545454545e-06, + "loss": 2.7632, + "step": 1382 + }, + { + "epoch": 0.11787266683712605, + "grad_norm": 68.30322110878825, + "learning_rate": 3.926136363636364e-06, + "loss": 2.4265, + "step": 1383 + }, + { + "epoch": 0.11795789653115145, + "grad_norm": 88.32233611092545, + "learning_rate": 3.928977272727273e-06, + "loss": 2.0851, + "step": 1384 + }, + { + "epoch": 0.11804312622517685, + "grad_norm": 322.28667128707644, + "learning_rate": 3.931818181818182e-06, + "loss": 3.5614, + "step": 1385 + }, + { + "epoch": 0.11812835591920225, + "grad_norm": 138.56271348220739, + "learning_rate": 3.934659090909091e-06, + "loss": 4.6757, + "step": 1386 + }, + { + "epoch": 0.11821358561322765, + "grad_norm": 115.47277405713268, + "learning_rate": 3.9375e-06, + "loss": 3.2385, + "step": 1387 + }, + { + "epoch": 0.11829881530725304, + "grad_norm": 75.31448830796222, + "learning_rate": 3.94034090909091e-06, + "loss": 2.5901, + "step": 1388 + }, + { + "epoch": 0.11838404500127844, + "grad_norm": 133.11447885496696, + "learning_rate": 3.943181818181819e-06, + "loss": 3.5372, + "step": 1389 + }, + { + "epoch": 0.11846927469530384, + "grad_norm": 115.41370337932283, + "learning_rate": 3.946022727272728e-06, + "loss": 3.0808, + "step": 1390 + }, + { + "epoch": 0.11855450438932924, + "grad_norm": 61.281222329617606, + "learning_rate": 3.9488636363636366e-06, + "loss": 2.4943, + "step": 1391 + }, + { + "epoch": 0.11863973408335464, + "grad_norm": 150.13426297710996, + "learning_rate": 3.9517045454545455e-06, + "loss": 3.2107, + "step": 1392 + }, + { + "epoch": 0.11872496377738004, + "grad_norm": 73.98452046050978, + "learning_rate": 3.954545454545454e-06, + "loss": 2.7374, + "step": 1393 + }, + { + "epoch": 0.11881019347140544, + "grad_norm": 174.67903426489724, + "learning_rate": 3.957386363636364e-06, + "loss": 3.6535, + "step": 1394 + }, + { + "epoch": 0.11889542316543084, + "grad_norm": 137.23739881477937, + "learning_rate": 3.960227272727273e-06, + "loss": 3.0388, + "step": 1395 + }, + { + "epoch": 0.11898065285945623, + "grad_norm": 77.6075247112342, + "learning_rate": 3.963068181818182e-06, + "loss": 2.3281, + "step": 1396 + }, + { + "epoch": 0.11906588255348163, + "grad_norm": 72.80480509749876, + "learning_rate": 3.965909090909091e-06, + "loss": 2.6198, + "step": 1397 + }, + { + "epoch": 0.11915111224750703, + "grad_norm": 72.03944374049166, + "learning_rate": 3.96875e-06, + "loss": 2.7836, + "step": 1398 + }, + { + "epoch": 0.11923634194153243, + "grad_norm": 103.00289242061525, + "learning_rate": 3.97159090909091e-06, + "loss": 2.577, + "step": 1399 + }, + { + "epoch": 0.11932157163555783, + "grad_norm": 112.63805462287256, + "learning_rate": 3.974431818181819e-06, + "loss": 3.3963, + "step": 1400 + }, + { + "epoch": 0.11940680132958323, + "grad_norm": 202.99506603623036, + "learning_rate": 3.9772727272727275e-06, + "loss": 2.7376, + "step": 1401 + }, + { + "epoch": 0.11949203102360863, + "grad_norm": 84.80850283107165, + "learning_rate": 3.9801136363636365e-06, + "loss": 3.0047, + "step": 1402 + }, + { + "epoch": 0.11957726071763403, + "grad_norm": 136.9301998597916, + "learning_rate": 3.982954545454546e-06, + "loss": 2.9657, + "step": 1403 + }, + { + "epoch": 0.11966249041165943, + "grad_norm": 339.6345746955641, + "learning_rate": 3.985795454545454e-06, + "loss": 2.5897, + "step": 1404 + }, + { + "epoch": 0.11974772010568482, + "grad_norm": 71.22143468710433, + "learning_rate": 3.988636363636364e-06, + "loss": 3.1222, + "step": 1405 + }, + { + "epoch": 0.11983294979971022, + "grad_norm": 80.47677159596627, + "learning_rate": 3.991477272727273e-06, + "loss": 2.6343, + "step": 1406 + }, + { + "epoch": 0.11991817949373562, + "grad_norm": 147.1832492736939, + "learning_rate": 3.994318181818182e-06, + "loss": 3.968, + "step": 1407 + }, + { + "epoch": 0.12000340918776102, + "grad_norm": 229.10744296407415, + "learning_rate": 3.997159090909091e-06, + "loss": 4.6786, + "step": 1408 + }, + { + "epoch": 0.12008863888178642, + "grad_norm": 99.74809365457685, + "learning_rate": 4.000000000000001e-06, + "loss": 3.1386, + "step": 1409 + }, + { + "epoch": 0.12017386857581182, + "grad_norm": 236.15285181921712, + "learning_rate": 4.00284090909091e-06, + "loss": 3.9664, + "step": 1410 + }, + { + "epoch": 0.12025909826983722, + "grad_norm": 115.13710257922851, + "learning_rate": 4.0056818181818185e-06, + "loss": 3.1581, + "step": 1411 + }, + { + "epoch": 0.12034432796386262, + "grad_norm": 129.2535635926545, + "learning_rate": 4.0085227272727275e-06, + "loss": 3.401, + "step": 1412 + }, + { + "epoch": 0.12042955765788801, + "grad_norm": 177.81136267622875, + "learning_rate": 4.011363636363636e-06, + "loss": 3.5081, + "step": 1413 + }, + { + "epoch": 0.12051478735191341, + "grad_norm": 197.5086035251755, + "learning_rate": 4.014204545454546e-06, + "loss": 2.5098, + "step": 1414 + }, + { + "epoch": 0.1206000170459388, + "grad_norm": 81.70875969102211, + "learning_rate": 4.017045454545455e-06, + "loss": 3.3258, + "step": 1415 + }, + { + "epoch": 0.1206852467399642, + "grad_norm": 129.73318254119408, + "learning_rate": 4.019886363636364e-06, + "loss": 4.127, + "step": 1416 + }, + { + "epoch": 0.1207704764339896, + "grad_norm": 166.17975029177586, + "learning_rate": 4.022727272727273e-06, + "loss": 2.9363, + "step": 1417 + }, + { + "epoch": 0.120855706128015, + "grad_norm": 114.36674276234216, + "learning_rate": 4.025568181818182e-06, + "loss": 3.456, + "step": 1418 + }, + { + "epoch": 0.1209409358220404, + "grad_norm": 419.66439727343607, + "learning_rate": 4.028409090909091e-06, + "loss": 4.1152, + "step": 1419 + }, + { + "epoch": 0.12102616551606579, + "grad_norm": 100.3268877760118, + "learning_rate": 4.031250000000001e-06, + "loss": 3.7806, + "step": 1420 + }, + { + "epoch": 0.12111139521009119, + "grad_norm": 129.95325142073068, + "learning_rate": 4.0340909090909095e-06, + "loss": 3.2185, + "step": 1421 + }, + { + "epoch": 0.12119662490411659, + "grad_norm": 136.43557895285815, + "learning_rate": 4.0369318181818184e-06, + "loss": 3.6687, + "step": 1422 + }, + { + "epoch": 0.12128185459814199, + "grad_norm": 168.99074864533142, + "learning_rate": 4.039772727272727e-06, + "loss": 3.1225, + "step": 1423 + }, + { + "epoch": 0.12136708429216739, + "grad_norm": 85.09338157851653, + "learning_rate": 4.042613636363636e-06, + "loss": 3.1622, + "step": 1424 + }, + { + "epoch": 0.12145231398619279, + "grad_norm": 163.77899880304537, + "learning_rate": 4.045454545454546e-06, + "loss": 4.0257, + "step": 1425 + }, + { + "epoch": 0.12153754368021819, + "grad_norm": 147.84688425750895, + "learning_rate": 4.048295454545455e-06, + "loss": 3.2666, + "step": 1426 + }, + { + "epoch": 0.12162277337424358, + "grad_norm": 98.25707344085525, + "learning_rate": 4.051136363636364e-06, + "loss": 2.6957, + "step": 1427 + }, + { + "epoch": 0.12170800306826898, + "grad_norm": 62.859285612051245, + "learning_rate": 4.053977272727273e-06, + "loss": 2.004, + "step": 1428 + }, + { + "epoch": 0.12179323276229438, + "grad_norm": 78.21409894431686, + "learning_rate": 4.056818181818183e-06, + "loss": 3.0091, + "step": 1429 + }, + { + "epoch": 0.12187846245631978, + "grad_norm": 78.10927933612705, + "learning_rate": 4.059659090909091e-06, + "loss": 3.0535, + "step": 1430 + }, + { + "epoch": 0.12196369215034518, + "grad_norm": 90.18529448951274, + "learning_rate": 4.0625000000000005e-06, + "loss": 3.3236, + "step": 1431 + }, + { + "epoch": 0.12204892184437058, + "grad_norm": 99.42523476014071, + "learning_rate": 4.065340909090909e-06, + "loss": 2.6469, + "step": 1432 + }, + { + "epoch": 0.12213415153839598, + "grad_norm": 70.39473420907956, + "learning_rate": 4.068181818181818e-06, + "loss": 2.8921, + "step": 1433 + }, + { + "epoch": 0.12221938123242138, + "grad_norm": 104.27187284706153, + "learning_rate": 4.071022727272727e-06, + "loss": 2.9477, + "step": 1434 + }, + { + "epoch": 0.12230461092644677, + "grad_norm": 117.87859901435148, + "learning_rate": 4.073863636363637e-06, + "loss": 3.4096, + "step": 1435 + }, + { + "epoch": 0.12238984062047217, + "grad_norm": 66.15426854970164, + "learning_rate": 4.076704545454546e-06, + "loss": 2.6602, + "step": 1436 + }, + { + "epoch": 0.12247507031449757, + "grad_norm": 89.86685194161986, + "learning_rate": 4.079545454545455e-06, + "loss": 3.3589, + "step": 1437 + }, + { + "epoch": 0.12256030000852297, + "grad_norm": 132.11511923338847, + "learning_rate": 4.082386363636364e-06, + "loss": 3.3373, + "step": 1438 + }, + { + "epoch": 0.12264552970254837, + "grad_norm": 359.6989456321349, + "learning_rate": 4.085227272727273e-06, + "loss": 4.8977, + "step": 1439 + }, + { + "epoch": 0.12273075939657377, + "grad_norm": 171.72305638004005, + "learning_rate": 4.0880681818181825e-06, + "loss": 3.2689, + "step": 1440 + }, + { + "epoch": 0.12281598909059917, + "grad_norm": 251.4785323445732, + "learning_rate": 4.0909090909090915e-06, + "loss": 3.5563, + "step": 1441 + }, + { + "epoch": 0.12290121878462457, + "grad_norm": 131.82884077957942, + "learning_rate": 4.09375e-06, + "loss": 3.1838, + "step": 1442 + }, + { + "epoch": 0.12298644847864997, + "grad_norm": 136.54831239260955, + "learning_rate": 4.096590909090909e-06, + "loss": 3.1696, + "step": 1443 + }, + { + "epoch": 0.12307167817267536, + "grad_norm": 149.46798334391596, + "learning_rate": 4.099431818181819e-06, + "loss": 2.8574, + "step": 1444 + }, + { + "epoch": 0.12315690786670076, + "grad_norm": 131.2296365521725, + "learning_rate": 4.102272727272727e-06, + "loss": 2.7899, + "step": 1445 + }, + { + "epoch": 0.12324213756072616, + "grad_norm": 71.3885751041622, + "learning_rate": 4.105113636363637e-06, + "loss": 2.6229, + "step": 1446 + }, + { + "epoch": 0.12332736725475156, + "grad_norm": 87.67997687910878, + "learning_rate": 4.107954545454546e-06, + "loss": 2.7763, + "step": 1447 + }, + { + "epoch": 0.12341259694877696, + "grad_norm": 224.84193268442365, + "learning_rate": 4.110795454545455e-06, + "loss": 4.4387, + "step": 1448 + }, + { + "epoch": 0.12349782664280236, + "grad_norm": 86.32022233481143, + "learning_rate": 4.113636363636364e-06, + "loss": 3.0093, + "step": 1449 + }, + { + "epoch": 0.12358305633682776, + "grad_norm": 75.96311266401744, + "learning_rate": 4.116477272727273e-06, + "loss": 2.9993, + "step": 1450 + }, + { + "epoch": 0.12366828603085316, + "grad_norm": 83.83595466493271, + "learning_rate": 4.1193181818181825e-06, + "loss": 2.7541, + "step": 1451 + }, + { + "epoch": 0.12375351572487855, + "grad_norm": 91.93052366607922, + "learning_rate": 4.122159090909091e-06, + "loss": 3.3898, + "step": 1452 + }, + { + "epoch": 0.12383874541890394, + "grad_norm": 331.17926332548456, + "learning_rate": 4.125e-06, + "loss": 4.932, + "step": 1453 + }, + { + "epoch": 0.12392397511292934, + "grad_norm": 100.12541782977195, + "learning_rate": 4.127840909090909e-06, + "loss": 2.9548, + "step": 1454 + }, + { + "epoch": 0.12400920480695474, + "grad_norm": 130.10431012976503, + "learning_rate": 4.130681818181819e-06, + "loss": 3.0139, + "step": 1455 + }, + { + "epoch": 0.12409443450098014, + "grad_norm": 129.90725140584854, + "learning_rate": 4.133522727272727e-06, + "loss": 3.3057, + "step": 1456 + }, + { + "epoch": 0.12417966419500553, + "grad_norm": 76.9297205916914, + "learning_rate": 4.136363636363637e-06, + "loss": 3.0157, + "step": 1457 + }, + { + "epoch": 0.12426489388903093, + "grad_norm": 109.60520224640098, + "learning_rate": 4.139204545454546e-06, + "loss": 3.4257, + "step": 1458 + }, + { + "epoch": 0.12435012358305633, + "grad_norm": 356.9672517938369, + "learning_rate": 4.142045454545455e-06, + "loss": 4.3438, + "step": 1459 + }, + { + "epoch": 0.12443535327708173, + "grad_norm": 101.74531212705674, + "learning_rate": 4.144886363636364e-06, + "loss": 3.4667, + "step": 1460 + }, + { + "epoch": 0.12452058297110713, + "grad_norm": 48.89219025852351, + "learning_rate": 4.1477272727272734e-06, + "loss": 1.9585, + "step": 1461 + }, + { + "epoch": 0.12460581266513253, + "grad_norm": 110.20734403976066, + "learning_rate": 4.150568181818182e-06, + "loss": 3.462, + "step": 1462 + }, + { + "epoch": 0.12469104235915793, + "grad_norm": 65.24226072274489, + "learning_rate": 4.153409090909091e-06, + "loss": 2.7647, + "step": 1463 + }, + { + "epoch": 0.12477627205318333, + "grad_norm": 200.25203147543283, + "learning_rate": 4.15625e-06, + "loss": 3.6931, + "step": 1464 + }, + { + "epoch": 0.12486150174720873, + "grad_norm": 214.25107580179088, + "learning_rate": 4.159090909090909e-06, + "loss": 3.7758, + "step": 1465 + }, + { + "epoch": 0.12494673144123412, + "grad_norm": 87.78969254506387, + "learning_rate": 4.161931818181819e-06, + "loss": 3.7753, + "step": 1466 + }, + { + "epoch": 0.12503196113525952, + "grad_norm": 117.18886061559222, + "learning_rate": 4.164772727272728e-06, + "loss": 3.2544, + "step": 1467 + }, + { + "epoch": 0.12511719082928494, + "grad_norm": 75.8224014382089, + "learning_rate": 4.167613636363637e-06, + "loss": 3.7587, + "step": 1468 + }, + { + "epoch": 0.12520242052331032, + "grad_norm": 384.14373697951174, + "learning_rate": 4.170454545454546e-06, + "loss": 3.6958, + "step": 1469 + }, + { + "epoch": 0.12528765021733573, + "grad_norm": 151.18813288597096, + "learning_rate": 4.1732954545454555e-06, + "loss": 3.1883, + "step": 1470 + }, + { + "epoch": 0.12537287991136112, + "grad_norm": 218.12000161859012, + "learning_rate": 4.176136363636364e-06, + "loss": 3.7475, + "step": 1471 + }, + { + "epoch": 0.1254581096053865, + "grad_norm": 194.83215875043277, + "learning_rate": 4.178977272727273e-06, + "loss": 4.674, + "step": 1472 + }, + { + "epoch": 0.12554333929941192, + "grad_norm": 143.2160782225397, + "learning_rate": 4.181818181818182e-06, + "loss": 3.2315, + "step": 1473 + }, + { + "epoch": 0.1256285689934373, + "grad_norm": 76.49279353686633, + "learning_rate": 4.184659090909091e-06, + "loss": 3.3149, + "step": 1474 + }, + { + "epoch": 0.1257137986874627, + "grad_norm": 107.34919805757713, + "learning_rate": 4.1875e-06, + "loss": 3.0584, + "step": 1475 + }, + { + "epoch": 0.1257990283814881, + "grad_norm": 77.86613622926, + "learning_rate": 4.190340909090909e-06, + "loss": 3.0645, + "step": 1476 + }, + { + "epoch": 0.1258842580755135, + "grad_norm": 90.4693680724182, + "learning_rate": 4.193181818181819e-06, + "loss": 3.3996, + "step": 1477 + }, + { + "epoch": 0.1259694877695389, + "grad_norm": 86.2975968180733, + "learning_rate": 4.196022727272728e-06, + "loss": 2.2261, + "step": 1478 + }, + { + "epoch": 0.1260547174635643, + "grad_norm": 86.04632767530747, + "learning_rate": 4.198863636363637e-06, + "loss": 2.541, + "step": 1479 + }, + { + "epoch": 0.1261399471575897, + "grad_norm": 88.14747934827432, + "learning_rate": 4.201704545454546e-06, + "loss": 3.8088, + "step": 1480 + }, + { + "epoch": 0.1262251768516151, + "grad_norm": 144.09830294226958, + "learning_rate": 4.204545454545455e-06, + "loss": 3.0869, + "step": 1481 + }, + { + "epoch": 0.1263104065456405, + "grad_norm": 86.00315783792126, + "learning_rate": 4.2073863636363635e-06, + "loss": 1.8927, + "step": 1482 + }, + { + "epoch": 0.1263956362396659, + "grad_norm": 154.7545490141611, + "learning_rate": 4.210227272727273e-06, + "loss": 3.0794, + "step": 1483 + }, + { + "epoch": 0.1264808659336913, + "grad_norm": 221.4085972532358, + "learning_rate": 4.213068181818182e-06, + "loss": 3.328, + "step": 1484 + }, + { + "epoch": 0.1265660956277167, + "grad_norm": 109.78588183949992, + "learning_rate": 4.215909090909091e-06, + "loss": 2.9151, + "step": 1485 + }, + { + "epoch": 0.1266513253217421, + "grad_norm": 152.7295421857017, + "learning_rate": 4.21875e-06, + "loss": 3.3597, + "step": 1486 + }, + { + "epoch": 0.1267365550157675, + "grad_norm": 97.75875076688493, + "learning_rate": 4.22159090909091e-06, + "loss": 2.5859, + "step": 1487 + }, + { + "epoch": 0.12682178470979288, + "grad_norm": 225.91278878797138, + "learning_rate": 4.224431818181819e-06, + "loss": 3.9328, + "step": 1488 + }, + { + "epoch": 0.1269070144038183, + "grad_norm": 227.76403740813637, + "learning_rate": 4.227272727272728e-06, + "loss": 4.6544, + "step": 1489 + }, + { + "epoch": 0.12699224409784368, + "grad_norm": 95.69625137554162, + "learning_rate": 4.230113636363637e-06, + "loss": 3.3193, + "step": 1490 + }, + { + "epoch": 0.1270774737918691, + "grad_norm": 100.38188531966081, + "learning_rate": 4.2329545454545455e-06, + "loss": 3.6122, + "step": 1491 + }, + { + "epoch": 0.12716270348589448, + "grad_norm": 72.47721878272074, + "learning_rate": 4.235795454545455e-06, + "loss": 2.3485, + "step": 1492 + }, + { + "epoch": 0.1272479331799199, + "grad_norm": 87.76643825285798, + "learning_rate": 4.238636363636364e-06, + "loss": 3.2766, + "step": 1493 + }, + { + "epoch": 0.12733316287394528, + "grad_norm": 106.79580874509142, + "learning_rate": 4.241477272727273e-06, + "loss": 3.3507, + "step": 1494 + }, + { + "epoch": 0.1274183925679707, + "grad_norm": 105.43660411355864, + "learning_rate": 4.244318181818182e-06, + "loss": 3.7275, + "step": 1495 + }, + { + "epoch": 0.12750362226199607, + "grad_norm": 123.0229863823248, + "learning_rate": 4.247159090909092e-06, + "loss": 3.7117, + "step": 1496 + }, + { + "epoch": 0.1275888519560215, + "grad_norm": 167.38944408052396, + "learning_rate": 4.25e-06, + "loss": 3.2549, + "step": 1497 + }, + { + "epoch": 0.12767408165004687, + "grad_norm": 72.76645271326589, + "learning_rate": 4.25284090909091e-06, + "loss": 2.793, + "step": 1498 + }, + { + "epoch": 0.12775931134407228, + "grad_norm": 265.86536998070886, + "learning_rate": 4.255681818181819e-06, + "loss": 4.3916, + "step": 1499 + }, + { + "epoch": 0.12784454103809767, + "grad_norm": 105.92667406472628, + "learning_rate": 4.258522727272728e-06, + "loss": 3.4306, + "step": 1500 + }, + { + "epoch": 0.12792977073212308, + "grad_norm": 47.93316105938203, + "learning_rate": 4.2613636363636365e-06, + "loss": 2.0306, + "step": 1501 + }, + { + "epoch": 0.12801500042614847, + "grad_norm": 100.84029831785588, + "learning_rate": 4.2642045454545455e-06, + "loss": 2.6883, + "step": 1502 + }, + { + "epoch": 0.12810023012017388, + "grad_norm": 118.88095543109719, + "learning_rate": 4.267045454545455e-06, + "loss": 3.4635, + "step": 1503 + }, + { + "epoch": 0.12818545981419927, + "grad_norm": 142.70683098320686, + "learning_rate": 4.269886363636364e-06, + "loss": 2.9255, + "step": 1504 + }, + { + "epoch": 0.12827068950822468, + "grad_norm": 76.05755325207922, + "learning_rate": 4.272727272727273e-06, + "loss": 2.4652, + "step": 1505 + }, + { + "epoch": 0.12835591920225006, + "grad_norm": 128.45203129955547, + "learning_rate": 4.275568181818182e-06, + "loss": 3.8416, + "step": 1506 + }, + { + "epoch": 0.12844114889627548, + "grad_norm": 136.22194496637744, + "learning_rate": 4.278409090909092e-06, + "loss": 3.2631, + "step": 1507 + }, + { + "epoch": 0.12852637859030086, + "grad_norm": 75.97131387924287, + "learning_rate": 4.28125e-06, + "loss": 2.9916, + "step": 1508 + }, + { + "epoch": 0.12861160828432627, + "grad_norm": 131.27465217775486, + "learning_rate": 4.28409090909091e-06, + "loss": 4.2396, + "step": 1509 + }, + { + "epoch": 0.12869683797835166, + "grad_norm": 79.97960172668017, + "learning_rate": 4.286931818181819e-06, + "loss": 3.1449, + "step": 1510 + }, + { + "epoch": 0.12878206767237704, + "grad_norm": 501.895578203162, + "learning_rate": 4.2897727272727275e-06, + "loss": 5.0089, + "step": 1511 + }, + { + "epoch": 0.12886729736640246, + "grad_norm": 82.08661684766194, + "learning_rate": 4.2926136363636364e-06, + "loss": 2.6942, + "step": 1512 + }, + { + "epoch": 0.12895252706042784, + "grad_norm": 89.49218870899558, + "learning_rate": 4.295454545454546e-06, + "loss": 3.0498, + "step": 1513 + }, + { + "epoch": 0.12903775675445325, + "grad_norm": 356.1561753846768, + "learning_rate": 4.298295454545454e-06, + "loss": 2.7127, + "step": 1514 + }, + { + "epoch": 0.12912298644847864, + "grad_norm": 144.42214646505116, + "learning_rate": 4.301136363636364e-06, + "loss": 3.7396, + "step": 1515 + }, + { + "epoch": 0.12920821614250405, + "grad_norm": 108.16047495218952, + "learning_rate": 4.303977272727273e-06, + "loss": 2.7266, + "step": 1516 + }, + { + "epoch": 0.12929344583652944, + "grad_norm": 62.312674627396234, + "learning_rate": 4.306818181818182e-06, + "loss": 2.9555, + "step": 1517 + }, + { + "epoch": 0.12937867553055485, + "grad_norm": 106.22654725971985, + "learning_rate": 4.309659090909092e-06, + "loss": 3.5933, + "step": 1518 + }, + { + "epoch": 0.12946390522458023, + "grad_norm": 116.55676672298549, + "learning_rate": 4.312500000000001e-06, + "loss": 3.5176, + "step": 1519 + }, + { + "epoch": 0.12954913491860565, + "grad_norm": 106.39871567588844, + "learning_rate": 4.3153409090909096e-06, + "loss": 3.5923, + "step": 1520 + }, + { + "epoch": 0.12963436461263103, + "grad_norm": 105.86629964584257, + "learning_rate": 4.3181818181818185e-06, + "loss": 3.2229, + "step": 1521 + }, + { + "epoch": 0.12971959430665644, + "grad_norm": 99.48497010186792, + "learning_rate": 4.321022727272728e-06, + "loss": 2.5115, + "step": 1522 + }, + { + "epoch": 0.12980482400068183, + "grad_norm": 81.63781567552712, + "learning_rate": 4.323863636363636e-06, + "loss": 3.0299, + "step": 1523 + }, + { + "epoch": 0.12989005369470724, + "grad_norm": 56.296115430822226, + "learning_rate": 4.326704545454546e-06, + "loss": 2.5582, + "step": 1524 + }, + { + "epoch": 0.12997528338873263, + "grad_norm": 102.55874875480609, + "learning_rate": 4.329545454545455e-06, + "loss": 3.3001, + "step": 1525 + }, + { + "epoch": 0.13006051308275804, + "grad_norm": 73.65690510116158, + "learning_rate": 4.332386363636364e-06, + "loss": 3.2531, + "step": 1526 + }, + { + "epoch": 0.13014574277678342, + "grad_norm": 121.96860515121524, + "learning_rate": 4.335227272727273e-06, + "loss": 3.1863, + "step": 1527 + }, + { + "epoch": 0.13023097247080884, + "grad_norm": 159.91981515658304, + "learning_rate": 4.338068181818182e-06, + "loss": 2.9262, + "step": 1528 + }, + { + "epoch": 0.13031620216483422, + "grad_norm": 100.56522780981462, + "learning_rate": 4.340909090909091e-06, + "loss": 2.4689, + "step": 1529 + }, + { + "epoch": 0.13040143185885963, + "grad_norm": 87.14661771832297, + "learning_rate": 4.3437500000000006e-06, + "loss": 3.1459, + "step": 1530 + }, + { + "epoch": 0.13048666155288502, + "grad_norm": 162.95178754073714, + "learning_rate": 4.3465909090909095e-06, + "loss": 3.9608, + "step": 1531 + }, + { + "epoch": 0.13057189124691043, + "grad_norm": 265.33776966289554, + "learning_rate": 4.349431818181818e-06, + "loss": 4.1082, + "step": 1532 + }, + { + "epoch": 0.13065712094093582, + "grad_norm": 128.87361695264198, + "learning_rate": 4.352272727272727e-06, + "loss": 3.2002, + "step": 1533 + }, + { + "epoch": 0.13074235063496123, + "grad_norm": 628.7042392933407, + "learning_rate": 4.355113636363636e-06, + "loss": 4.2425, + "step": 1534 + }, + { + "epoch": 0.13082758032898661, + "grad_norm": 140.19895527871375, + "learning_rate": 4.357954545454546e-06, + "loss": 3.4004, + "step": 1535 + }, + { + "epoch": 0.13091281002301203, + "grad_norm": 100.5682850847055, + "learning_rate": 4.360795454545455e-06, + "loss": 3.8702, + "step": 1536 + }, + { + "epoch": 0.1309980397170374, + "grad_norm": 100.17525448549746, + "learning_rate": 4.363636363636364e-06, + "loss": 3.2161, + "step": 1537 + }, + { + "epoch": 0.13108326941106282, + "grad_norm": 407.3472599324173, + "learning_rate": 4.366477272727273e-06, + "loss": 3.7235, + "step": 1538 + }, + { + "epoch": 0.1311684991050882, + "grad_norm": 115.1394887635429, + "learning_rate": 4.369318181818183e-06, + "loss": 3.0339, + "step": 1539 + }, + { + "epoch": 0.13125372879911362, + "grad_norm": 136.341709774432, + "learning_rate": 4.372159090909091e-06, + "loss": 3.5066, + "step": 1540 + }, + { + "epoch": 0.131338958493139, + "grad_norm": 85.80157650100269, + "learning_rate": 4.3750000000000005e-06, + "loss": 3.6226, + "step": 1541 + }, + { + "epoch": 0.13142418818716442, + "grad_norm": 173.18548973724089, + "learning_rate": 4.377840909090909e-06, + "loss": 3.7113, + "step": 1542 + }, + { + "epoch": 0.1315094178811898, + "grad_norm": 70.81235938794741, + "learning_rate": 4.380681818181818e-06, + "loss": 3.2189, + "step": 1543 + }, + { + "epoch": 0.13159464757521522, + "grad_norm": 229.23595638902316, + "learning_rate": 4.383522727272727e-06, + "loss": 4.6883, + "step": 1544 + }, + { + "epoch": 0.1316798772692406, + "grad_norm": 135.24725597211315, + "learning_rate": 4.386363636363637e-06, + "loss": 2.589, + "step": 1545 + }, + { + "epoch": 0.13176510696326602, + "grad_norm": 75.64417180746764, + "learning_rate": 4.389204545454546e-06, + "loss": 3.2987, + "step": 1546 + }, + { + "epoch": 0.1318503366572914, + "grad_norm": 136.94105619333598, + "learning_rate": 4.392045454545455e-06, + "loss": 3.111, + "step": 1547 + }, + { + "epoch": 0.13193556635131679, + "grad_norm": 104.64137380902879, + "learning_rate": 4.394886363636364e-06, + "loss": 2.9749, + "step": 1548 + }, + { + "epoch": 0.1320207960453422, + "grad_norm": 75.86317303340903, + "learning_rate": 4.397727272727273e-06, + "loss": 3.1336, + "step": 1549 + }, + { + "epoch": 0.13210602573936758, + "grad_norm": 90.93238495782686, + "learning_rate": 4.4005681818181825e-06, + "loss": 3.1867, + "step": 1550 + }, + { + "epoch": 0.132191255433393, + "grad_norm": 126.65384993764825, + "learning_rate": 4.4034090909090914e-06, + "loss": 3.7444, + "step": 1551 + }, + { + "epoch": 0.13227648512741838, + "grad_norm": 214.48896912060664, + "learning_rate": 4.40625e-06, + "loss": 4.6855, + "step": 1552 + }, + { + "epoch": 0.1323617148214438, + "grad_norm": 62.88557668793207, + "learning_rate": 4.409090909090909e-06, + "loss": 2.7758, + "step": 1553 + }, + { + "epoch": 0.13244694451546918, + "grad_norm": 115.07232594780278, + "learning_rate": 4.411931818181819e-06, + "loss": 2.8903, + "step": 1554 + }, + { + "epoch": 0.1325321742094946, + "grad_norm": 292.16724001984136, + "learning_rate": 4.414772727272727e-06, + "loss": 3.2751, + "step": 1555 + }, + { + "epoch": 0.13261740390351998, + "grad_norm": 117.91888868283402, + "learning_rate": 4.417613636363637e-06, + "loss": 3.7405, + "step": 1556 + }, + { + "epoch": 0.1327026335975454, + "grad_norm": 208.53046796281436, + "learning_rate": 4.420454545454546e-06, + "loss": 2.4897, + "step": 1557 + }, + { + "epoch": 0.13278786329157077, + "grad_norm": 90.24085621746859, + "learning_rate": 4.423295454545455e-06, + "loss": 2.8016, + "step": 1558 + }, + { + "epoch": 0.1328730929855962, + "grad_norm": 171.93978083710883, + "learning_rate": 4.426136363636364e-06, + "loss": 3.6659, + "step": 1559 + }, + { + "epoch": 0.13295832267962157, + "grad_norm": 195.70238724496227, + "learning_rate": 4.428977272727273e-06, + "loss": 4.0936, + "step": 1560 + }, + { + "epoch": 0.13304355237364698, + "grad_norm": 197.98385691631103, + "learning_rate": 4.4318181818181824e-06, + "loss": 3.0656, + "step": 1561 + }, + { + "epoch": 0.13312878206767237, + "grad_norm": 104.9332197373583, + "learning_rate": 4.434659090909091e-06, + "loss": 2.6123, + "step": 1562 + }, + { + "epoch": 0.13321401176169778, + "grad_norm": 211.764090037702, + "learning_rate": 4.4375e-06, + "loss": 3.5065, + "step": 1563 + }, + { + "epoch": 0.13329924145572317, + "grad_norm": 185.62927920415316, + "learning_rate": 4.440340909090909e-06, + "loss": 3.2592, + "step": 1564 + }, + { + "epoch": 0.13338447114974858, + "grad_norm": 151.27471540346153, + "learning_rate": 4.443181818181819e-06, + "loss": 2.5, + "step": 1565 + }, + { + "epoch": 0.13346970084377396, + "grad_norm": 264.17413235208807, + "learning_rate": 4.446022727272727e-06, + "loss": 2.643, + "step": 1566 + }, + { + "epoch": 0.13355493053779938, + "grad_norm": 161.22820518017102, + "learning_rate": 4.448863636363637e-06, + "loss": 3.6605, + "step": 1567 + }, + { + "epoch": 0.13364016023182476, + "grad_norm": 94.68962211179007, + "learning_rate": 4.451704545454546e-06, + "loss": 3.0714, + "step": 1568 + }, + { + "epoch": 0.13372538992585017, + "grad_norm": 114.29420160827163, + "learning_rate": 4.454545454545455e-06, + "loss": 3.5919, + "step": 1569 + }, + { + "epoch": 0.13381061961987556, + "grad_norm": 284.3020809121997, + "learning_rate": 4.457386363636364e-06, + "loss": 3.3326, + "step": 1570 + }, + { + "epoch": 0.13389584931390097, + "grad_norm": 115.85038312235409, + "learning_rate": 4.460227272727273e-06, + "loss": 2.931, + "step": 1571 + }, + { + "epoch": 0.13398107900792636, + "grad_norm": 274.98432334602563, + "learning_rate": 4.463068181818182e-06, + "loss": 4.2246, + "step": 1572 + }, + { + "epoch": 0.13406630870195177, + "grad_norm": 234.7663315804467, + "learning_rate": 4.465909090909091e-06, + "loss": 4.3626, + "step": 1573 + }, + { + "epoch": 0.13415153839597715, + "grad_norm": 202.46524219062385, + "learning_rate": 4.46875e-06, + "loss": 4.0065, + "step": 1574 + }, + { + "epoch": 0.13423676809000257, + "grad_norm": 95.30571567855463, + "learning_rate": 4.471590909090909e-06, + "loss": 3.1089, + "step": 1575 + }, + { + "epoch": 0.13432199778402795, + "grad_norm": 115.26895303489341, + "learning_rate": 4.474431818181819e-06, + "loss": 3.3434, + "step": 1576 + }, + { + "epoch": 0.13440722747805336, + "grad_norm": 153.82221836711898, + "learning_rate": 4.477272727272728e-06, + "loss": 3.9372, + "step": 1577 + }, + { + "epoch": 0.13449245717207875, + "grad_norm": 92.24707257667765, + "learning_rate": 4.480113636363637e-06, + "loss": 3.2217, + "step": 1578 + }, + { + "epoch": 0.13457768686610416, + "grad_norm": 153.41957003777196, + "learning_rate": 4.482954545454546e-06, + "loss": 3.2839, + "step": 1579 + }, + { + "epoch": 0.13466291656012955, + "grad_norm": 100.70735148749736, + "learning_rate": 4.4857954545454555e-06, + "loss": 3.2553, + "step": 1580 + }, + { + "epoch": 0.13474814625415496, + "grad_norm": 137.71455193344534, + "learning_rate": 4.4886363636363636e-06, + "loss": 3.1657, + "step": 1581 + }, + { + "epoch": 0.13483337594818035, + "grad_norm": 91.89571014706921, + "learning_rate": 4.491477272727273e-06, + "loss": 3.2366, + "step": 1582 + }, + { + "epoch": 0.13491860564220576, + "grad_norm": 112.75336172277666, + "learning_rate": 4.494318181818182e-06, + "loss": 4.0728, + "step": 1583 + }, + { + "epoch": 0.13500383533623114, + "grad_norm": 74.67923413716355, + "learning_rate": 4.497159090909091e-06, + "loss": 2.305, + "step": 1584 + }, + { + "epoch": 0.13508906503025653, + "grad_norm": 141.6012863672979, + "learning_rate": 4.5e-06, + "loss": 3.6562, + "step": 1585 + }, + { + "epoch": 0.13517429472428194, + "grad_norm": 194.30421806123286, + "learning_rate": 4.502840909090909e-06, + "loss": 3.7695, + "step": 1586 + }, + { + "epoch": 0.13525952441830733, + "grad_norm": 89.7140111122289, + "learning_rate": 4.505681818181819e-06, + "loss": 3.0549, + "step": 1587 + }, + { + "epoch": 0.13534475411233274, + "grad_norm": 65.8178808743917, + "learning_rate": 4.508522727272728e-06, + "loss": 2.0877, + "step": 1588 + }, + { + "epoch": 0.13542998380635812, + "grad_norm": 139.93193079276813, + "learning_rate": 4.511363636363637e-06, + "loss": 2.8345, + "step": 1589 + }, + { + "epoch": 0.13551521350038354, + "grad_norm": 69.70841203826753, + "learning_rate": 4.514204545454546e-06, + "loss": 2.5405, + "step": 1590 + }, + { + "epoch": 0.13560044319440892, + "grad_norm": 85.56534085114413, + "learning_rate": 4.517045454545455e-06, + "loss": 2.7697, + "step": 1591 + }, + { + "epoch": 0.13568567288843433, + "grad_norm": 120.67511887441978, + "learning_rate": 4.5198863636363635e-06, + "loss": 4.1358, + "step": 1592 + }, + { + "epoch": 0.13577090258245972, + "grad_norm": 129.97781860128177, + "learning_rate": 4.522727272727273e-06, + "loss": 3.8978, + "step": 1593 + }, + { + "epoch": 0.13585613227648513, + "grad_norm": 106.63347514328932, + "learning_rate": 4.525568181818182e-06, + "loss": 3.3418, + "step": 1594 + }, + { + "epoch": 0.13594136197051052, + "grad_norm": 87.15137795658829, + "learning_rate": 4.528409090909091e-06, + "loss": 3.0211, + "step": 1595 + }, + { + "epoch": 0.13602659166453593, + "grad_norm": 70.36561177999175, + "learning_rate": 4.53125e-06, + "loss": 3.0574, + "step": 1596 + }, + { + "epoch": 0.1361118213585613, + "grad_norm": 363.71485122510654, + "learning_rate": 4.53409090909091e-06, + "loss": 2.8652, + "step": 1597 + }, + { + "epoch": 0.13619705105258673, + "grad_norm": 299.911517715755, + "learning_rate": 4.536931818181819e-06, + "loss": 3.3265, + "step": 1598 + }, + { + "epoch": 0.1362822807466121, + "grad_norm": 141.23951007236087, + "learning_rate": 4.539772727272728e-06, + "loss": 3.6447, + "step": 1599 + }, + { + "epoch": 0.13636751044063752, + "grad_norm": 85.01563497511535, + "learning_rate": 4.542613636363637e-06, + "loss": 2.8083, + "step": 1600 + }, + { + "epoch": 0.1364527401346629, + "grad_norm": 131.3176243032913, + "learning_rate": 4.5454545454545455e-06, + "loss": 3.4984, + "step": 1601 + }, + { + "epoch": 0.13653796982868832, + "grad_norm": 237.11994303038313, + "learning_rate": 4.548295454545455e-06, + "loss": 4.3016, + "step": 1602 + }, + { + "epoch": 0.1366231995227137, + "grad_norm": 833.2405774317875, + "learning_rate": 4.551136363636364e-06, + "loss": 3.2063, + "step": 1603 + }, + { + "epoch": 0.13670842921673912, + "grad_norm": 160.6355482699458, + "learning_rate": 4.553977272727273e-06, + "loss": 3.6941, + "step": 1604 + }, + { + "epoch": 0.1367936589107645, + "grad_norm": 101.39831911014036, + "learning_rate": 4.556818181818182e-06, + "loss": 3.0484, + "step": 1605 + }, + { + "epoch": 0.13687888860478992, + "grad_norm": 78.8094463128741, + "learning_rate": 4.559659090909092e-06, + "loss": 2.9892, + "step": 1606 + }, + { + "epoch": 0.1369641182988153, + "grad_norm": 47.56909799804802, + "learning_rate": 4.5625e-06, + "loss": 1.4693, + "step": 1607 + }, + { + "epoch": 0.13704934799284071, + "grad_norm": 110.12436865652067, + "learning_rate": 4.56534090909091e-06, + "loss": 3.136, + "step": 1608 + }, + { + "epoch": 0.1371345776868661, + "grad_norm": 150.79523601822348, + "learning_rate": 4.568181818181819e-06, + "loss": 4.0916, + "step": 1609 + }, + { + "epoch": 0.1372198073808915, + "grad_norm": 80.33770364872744, + "learning_rate": 4.5710227272727276e-06, + "loss": 2.9086, + "step": 1610 + }, + { + "epoch": 0.1373050370749169, + "grad_norm": 113.45786786431005, + "learning_rate": 4.5738636363636365e-06, + "loss": 3.7366, + "step": 1611 + }, + { + "epoch": 0.1373902667689423, + "grad_norm": 192.9040433176929, + "learning_rate": 4.5767045454545454e-06, + "loss": 2.9, + "step": 1612 + }, + { + "epoch": 0.1374754964629677, + "grad_norm": 167.7676095249412, + "learning_rate": 4.579545454545455e-06, + "loss": 2.8522, + "step": 1613 + }, + { + "epoch": 0.1375607261569931, + "grad_norm": 65.51889471375524, + "learning_rate": 4.582386363636364e-06, + "loss": 2.646, + "step": 1614 + }, + { + "epoch": 0.1376459558510185, + "grad_norm": 82.17841228644781, + "learning_rate": 4.585227272727273e-06, + "loss": 3.5714, + "step": 1615 + }, + { + "epoch": 0.1377311855450439, + "grad_norm": 116.39743603954953, + "learning_rate": 4.588068181818182e-06, + "loss": 2.3676, + "step": 1616 + }, + { + "epoch": 0.1378164152390693, + "grad_norm": 109.44238273535493, + "learning_rate": 4.590909090909092e-06, + "loss": 3.3161, + "step": 1617 + }, + { + "epoch": 0.1379016449330947, + "grad_norm": 180.42776857936337, + "learning_rate": 4.59375e-06, + "loss": 3.5848, + "step": 1618 + }, + { + "epoch": 0.1379868746271201, + "grad_norm": 75.432639479698, + "learning_rate": 4.59659090909091e-06, + "loss": 3.1172, + "step": 1619 + }, + { + "epoch": 0.1380721043211455, + "grad_norm": 97.6146022582692, + "learning_rate": 4.5994318181818186e-06, + "loss": 2.9853, + "step": 1620 + }, + { + "epoch": 0.13815733401517089, + "grad_norm": 120.31209782093431, + "learning_rate": 4.6022727272727275e-06, + "loss": 3.2358, + "step": 1621 + }, + { + "epoch": 0.1382425637091963, + "grad_norm": 79.40916785284482, + "learning_rate": 4.605113636363636e-06, + "loss": 3.4591, + "step": 1622 + }, + { + "epoch": 0.13832779340322168, + "grad_norm": 255.73160350053658, + "learning_rate": 4.607954545454546e-06, + "loss": 4.0103, + "step": 1623 + }, + { + "epoch": 0.13841302309724707, + "grad_norm": 185.97727944420748, + "learning_rate": 4.610795454545455e-06, + "loss": 4.3966, + "step": 1624 + }, + { + "epoch": 0.13849825279127248, + "grad_norm": 161.59372615831384, + "learning_rate": 4.613636363636364e-06, + "loss": 4.1058, + "step": 1625 + }, + { + "epoch": 0.13858348248529787, + "grad_norm": 72.56851448158041, + "learning_rate": 4.616477272727273e-06, + "loss": 3.6098, + "step": 1626 + }, + { + "epoch": 0.13866871217932328, + "grad_norm": 99.02445241572691, + "learning_rate": 4.619318181818182e-06, + "loss": 3.252, + "step": 1627 + }, + { + "epoch": 0.13875394187334866, + "grad_norm": 75.09435143373128, + "learning_rate": 4.622159090909092e-06, + "loss": 3.4637, + "step": 1628 + }, + { + "epoch": 0.13883917156737408, + "grad_norm": 142.941042636128, + "learning_rate": 4.625000000000001e-06, + "loss": 3.89, + "step": 1629 + }, + { + "epoch": 0.13892440126139946, + "grad_norm": 60.32878796681618, + "learning_rate": 4.6278409090909095e-06, + "loss": 3.1511, + "step": 1630 + }, + { + "epoch": 0.13900963095542487, + "grad_norm": 89.26680653112487, + "learning_rate": 4.6306818181818185e-06, + "loss": 2.8599, + "step": 1631 + }, + { + "epoch": 0.13909486064945026, + "grad_norm": 88.35154444388107, + "learning_rate": 4.633522727272728e-06, + "loss": 3.0236, + "step": 1632 + }, + { + "epoch": 0.13918009034347567, + "grad_norm": 119.17248249037485, + "learning_rate": 4.636363636363636e-06, + "loss": 4.4784, + "step": 1633 + }, + { + "epoch": 0.13926532003750106, + "grad_norm": 141.62641883082392, + "learning_rate": 4.639204545454546e-06, + "loss": 2.9936, + "step": 1634 + }, + { + "epoch": 0.13935054973152647, + "grad_norm": 94.03605077836394, + "learning_rate": 4.642045454545455e-06, + "loss": 3.0567, + "step": 1635 + }, + { + "epoch": 0.13943577942555185, + "grad_norm": 111.54315522694218, + "learning_rate": 4.644886363636364e-06, + "loss": 3.259, + "step": 1636 + }, + { + "epoch": 0.13952100911957727, + "grad_norm": 157.92945162557567, + "learning_rate": 4.647727272727273e-06, + "loss": 3.6143, + "step": 1637 + }, + { + "epoch": 0.13960623881360265, + "grad_norm": 69.97477142248395, + "learning_rate": 4.650568181818182e-06, + "loss": 3.1709, + "step": 1638 + }, + { + "epoch": 0.13969146850762806, + "grad_norm": 155.95751360171062, + "learning_rate": 4.653409090909092e-06, + "loss": 3.6632, + "step": 1639 + }, + { + "epoch": 0.13977669820165345, + "grad_norm": 116.86696523779494, + "learning_rate": 4.6562500000000005e-06, + "loss": 3.4014, + "step": 1640 + }, + { + "epoch": 0.13986192789567886, + "grad_norm": 282.4837388775807, + "learning_rate": 4.6590909090909095e-06, + "loss": 4.482, + "step": 1641 + }, + { + "epoch": 0.13994715758970425, + "grad_norm": 87.47405505515852, + "learning_rate": 4.661931818181818e-06, + "loss": 2.4165, + "step": 1642 + }, + { + "epoch": 0.14003238728372966, + "grad_norm": 77.04683980052758, + "learning_rate": 4.664772727272728e-06, + "loss": 2.929, + "step": 1643 + }, + { + "epoch": 0.14011761697775504, + "grad_norm": 145.08990271814096, + "learning_rate": 4.667613636363636e-06, + "loss": 3.7897, + "step": 1644 + }, + { + "epoch": 0.14020284667178046, + "grad_norm": 115.13789236418802, + "learning_rate": 4.670454545454546e-06, + "loss": 4.3998, + "step": 1645 + }, + { + "epoch": 0.14028807636580584, + "grad_norm": 101.85623994804652, + "learning_rate": 4.673295454545455e-06, + "loss": 2.7818, + "step": 1646 + }, + { + "epoch": 0.14037330605983125, + "grad_norm": 142.91475628967072, + "learning_rate": 4.676136363636364e-06, + "loss": 3.1544, + "step": 1647 + }, + { + "epoch": 0.14045853575385664, + "grad_norm": 90.79306278809347, + "learning_rate": 4.678977272727273e-06, + "loss": 3.0465, + "step": 1648 + }, + { + "epoch": 0.14054376544788205, + "grad_norm": 51.78564216964463, + "learning_rate": 4.681818181818183e-06, + "loss": 2.1728, + "step": 1649 + }, + { + "epoch": 0.14062899514190744, + "grad_norm": 114.54551122983727, + "learning_rate": 4.6846590909090915e-06, + "loss": 3.0029, + "step": 1650 + }, + { + "epoch": 0.14071422483593285, + "grad_norm": 118.4135581113063, + "learning_rate": 4.6875000000000004e-06, + "loss": 3.3812, + "step": 1651 + }, + { + "epoch": 0.14079945452995823, + "grad_norm": 67.74735594789426, + "learning_rate": 4.690340909090909e-06, + "loss": 2.6986, + "step": 1652 + }, + { + "epoch": 0.14088468422398365, + "grad_norm": 140.18342404836363, + "learning_rate": 4.693181818181818e-06, + "loss": 3.3304, + "step": 1653 + }, + { + "epoch": 0.14096991391800903, + "grad_norm": 125.87798379401494, + "learning_rate": 4.696022727272728e-06, + "loss": 2.8477, + "step": 1654 + }, + { + "epoch": 0.14105514361203444, + "grad_norm": 71.20362035838797, + "learning_rate": 4.698863636363637e-06, + "loss": 3.1956, + "step": 1655 + }, + { + "epoch": 0.14114037330605983, + "grad_norm": 97.20308129143721, + "learning_rate": 4.701704545454546e-06, + "loss": 3.1211, + "step": 1656 + }, + { + "epoch": 0.14122560300008524, + "grad_norm": 171.41602287468226, + "learning_rate": 4.704545454545455e-06, + "loss": 3.8245, + "step": 1657 + }, + { + "epoch": 0.14131083269411063, + "grad_norm": 108.68113779347037, + "learning_rate": 4.707386363636365e-06, + "loss": 3.122, + "step": 1658 + }, + { + "epoch": 0.14139606238813604, + "grad_norm": 84.82328676609211, + "learning_rate": 4.710227272727273e-06, + "loss": 3.5098, + "step": 1659 + }, + { + "epoch": 0.14148129208216143, + "grad_norm": 91.83026944865226, + "learning_rate": 4.7130681818181825e-06, + "loss": 3.316, + "step": 1660 + }, + { + "epoch": 0.1415665217761868, + "grad_norm": 197.91423289588843, + "learning_rate": 4.715909090909091e-06, + "loss": 4.2144, + "step": 1661 + }, + { + "epoch": 0.14165175147021222, + "grad_norm": 177.2345579041629, + "learning_rate": 4.71875e-06, + "loss": 3.253, + "step": 1662 + }, + { + "epoch": 0.1417369811642376, + "grad_norm": 107.64228771280006, + "learning_rate": 4.721590909090909e-06, + "loss": 3.0266, + "step": 1663 + }, + { + "epoch": 0.14182221085826302, + "grad_norm": 60.98045315282506, + "learning_rate": 4.724431818181819e-06, + "loss": 1.7957, + "step": 1664 + }, + { + "epoch": 0.1419074405522884, + "grad_norm": 168.26330952537373, + "learning_rate": 4.727272727272728e-06, + "loss": 3.0786, + "step": 1665 + }, + { + "epoch": 0.14199267024631382, + "grad_norm": 138.42995047533856, + "learning_rate": 4.730113636363637e-06, + "loss": 4.0864, + "step": 1666 + }, + { + "epoch": 0.1420778999403392, + "grad_norm": 189.42300638231555, + "learning_rate": 4.732954545454546e-06, + "loss": 4.0323, + "step": 1667 + }, + { + "epoch": 0.14216312963436462, + "grad_norm": 99.14198406178222, + "learning_rate": 4.735795454545455e-06, + "loss": 3.4611, + "step": 1668 + }, + { + "epoch": 0.14224835932839, + "grad_norm": 71.18939950559451, + "learning_rate": 4.7386363636363645e-06, + "loss": 3.3861, + "step": 1669 + }, + { + "epoch": 0.1423335890224154, + "grad_norm": 105.02691017288812, + "learning_rate": 4.741477272727273e-06, + "loss": 3.435, + "step": 1670 + }, + { + "epoch": 0.1424188187164408, + "grad_norm": 132.4886134668689, + "learning_rate": 4.744318181818182e-06, + "loss": 3.8165, + "step": 1671 + }, + { + "epoch": 0.1425040484104662, + "grad_norm": 176.07919644881835, + "learning_rate": 4.747159090909091e-06, + "loss": 3.7407, + "step": 1672 + }, + { + "epoch": 0.1425892781044916, + "grad_norm": 145.5938435356572, + "learning_rate": 4.75e-06, + "loss": 3.7087, + "step": 1673 + }, + { + "epoch": 0.142674507798517, + "grad_norm": 242.68214315053882, + "learning_rate": 4.752840909090909e-06, + "loss": 3.3884, + "step": 1674 + }, + { + "epoch": 0.1427597374925424, + "grad_norm": 238.06809211427338, + "learning_rate": 4.755681818181819e-06, + "loss": 3.9928, + "step": 1675 + }, + { + "epoch": 0.1428449671865678, + "grad_norm": 104.91477107116586, + "learning_rate": 4.758522727272727e-06, + "loss": 3.1451, + "step": 1676 + }, + { + "epoch": 0.1429301968805932, + "grad_norm": 82.25326949350755, + "learning_rate": 4.761363636363637e-06, + "loss": 3.1144, + "step": 1677 + }, + { + "epoch": 0.1430154265746186, + "grad_norm": 98.49618709053337, + "learning_rate": 4.764204545454546e-06, + "loss": 3.8186, + "step": 1678 + }, + { + "epoch": 0.143100656268644, + "grad_norm": 104.96364546880594, + "learning_rate": 4.767045454545455e-06, + "loss": 3.3676, + "step": 1679 + }, + { + "epoch": 0.1431858859626694, + "grad_norm": 134.59508018704497, + "learning_rate": 4.7698863636363645e-06, + "loss": 3.4726, + "step": 1680 + }, + { + "epoch": 0.1432711156566948, + "grad_norm": 211.17592123112797, + "learning_rate": 4.772727272727273e-06, + "loss": 4.4695, + "step": 1681 + }, + { + "epoch": 0.1433563453507202, + "grad_norm": 78.81046074217839, + "learning_rate": 4.775568181818182e-06, + "loss": 2.951, + "step": 1682 + }, + { + "epoch": 0.14344157504474558, + "grad_norm": 64.53419129928004, + "learning_rate": 4.778409090909091e-06, + "loss": 2.69, + "step": 1683 + }, + { + "epoch": 0.143526804738771, + "grad_norm": 145.5968218480987, + "learning_rate": 4.781250000000001e-06, + "loss": 4.0914, + "step": 1684 + }, + { + "epoch": 0.14361203443279638, + "grad_norm": 191.48339890125808, + "learning_rate": 4.784090909090909e-06, + "loss": 3.1163, + "step": 1685 + }, + { + "epoch": 0.1436972641268218, + "grad_norm": 70.15600704938151, + "learning_rate": 4.786931818181819e-06, + "loss": 2.5306, + "step": 1686 + }, + { + "epoch": 0.14378249382084718, + "grad_norm": 94.21552192081909, + "learning_rate": 4.789772727272728e-06, + "loss": 2.4971, + "step": 1687 + }, + { + "epoch": 0.1438677235148726, + "grad_norm": 72.92782048101195, + "learning_rate": 4.792613636363637e-06, + "loss": 2.3203, + "step": 1688 + }, + { + "epoch": 0.14395295320889798, + "grad_norm": 73.5821573645008, + "learning_rate": 4.795454545454546e-06, + "loss": 3.2387, + "step": 1689 + }, + { + "epoch": 0.1440381829029234, + "grad_norm": 156.20912441542134, + "learning_rate": 4.7982954545454554e-06, + "loss": 3.1047, + "step": 1690 + }, + { + "epoch": 0.14412341259694877, + "grad_norm": 82.92605715352357, + "learning_rate": 4.8011363636363635e-06, + "loss": 3.1743, + "step": 1691 + }, + { + "epoch": 0.1442086422909742, + "grad_norm": 318.13539689866974, + "learning_rate": 4.803977272727273e-06, + "loss": 4.4351, + "step": 1692 + }, + { + "epoch": 0.14429387198499957, + "grad_norm": 123.09970686137416, + "learning_rate": 4.806818181818182e-06, + "loss": 3.856, + "step": 1693 + }, + { + "epoch": 0.14437910167902498, + "grad_norm": 87.90627020510225, + "learning_rate": 4.809659090909091e-06, + "loss": 3.2898, + "step": 1694 + }, + { + "epoch": 0.14446433137305037, + "grad_norm": 125.73863522693887, + "learning_rate": 4.8125e-06, + "loss": 3.5592, + "step": 1695 + }, + { + "epoch": 0.14454956106707578, + "grad_norm": 65.60471396001104, + "learning_rate": 4.815340909090909e-06, + "loss": 2.8304, + "step": 1696 + }, + { + "epoch": 0.14463479076110117, + "grad_norm": 103.49148022792528, + "learning_rate": 4.818181818181819e-06, + "loss": 2.8466, + "step": 1697 + }, + { + "epoch": 0.14472002045512655, + "grad_norm": 112.22684270462945, + "learning_rate": 4.821022727272728e-06, + "loss": 3.8706, + "step": 1698 + }, + { + "epoch": 0.14480525014915197, + "grad_norm": 63.39567224953927, + "learning_rate": 4.823863636363637e-06, + "loss": 3.127, + "step": 1699 + }, + { + "epoch": 0.14489047984317735, + "grad_norm": 90.24497798007144, + "learning_rate": 4.826704545454546e-06, + "loss": 1.8825, + "step": 1700 + }, + { + "epoch": 0.14497570953720276, + "grad_norm": 111.61465633001028, + "learning_rate": 4.829545454545455e-06, + "loss": 3.1734, + "step": 1701 + }, + { + "epoch": 0.14506093923122815, + "grad_norm": 100.2063035776916, + "learning_rate": 4.8323863636363634e-06, + "loss": 3.4033, + "step": 1702 + }, + { + "epoch": 0.14514616892525356, + "grad_norm": 72.7169558507569, + "learning_rate": 4.835227272727273e-06, + "loss": 2.8556, + "step": 1703 + }, + { + "epoch": 0.14523139861927895, + "grad_norm": 139.91991602463358, + "learning_rate": 4.838068181818182e-06, + "loss": 5.4776, + "step": 1704 + }, + { + "epoch": 0.14531662831330436, + "grad_norm": 83.62355088785624, + "learning_rate": 4.840909090909091e-06, + "loss": 3.2778, + "step": 1705 + }, + { + "epoch": 0.14540185800732974, + "grad_norm": 97.87343116362605, + "learning_rate": 4.84375e-06, + "loss": 3.8913, + "step": 1706 + }, + { + "epoch": 0.14548708770135516, + "grad_norm": 121.5036825962143, + "learning_rate": 4.84659090909091e-06, + "loss": 3.4473, + "step": 1707 + }, + { + "epoch": 0.14557231739538054, + "grad_norm": 203.4176650755348, + "learning_rate": 4.849431818181819e-06, + "loss": 4.5557, + "step": 1708 + }, + { + "epoch": 0.14565754708940595, + "grad_norm": 187.76988177169724, + "learning_rate": 4.852272727272728e-06, + "loss": 3.176, + "step": 1709 + }, + { + "epoch": 0.14574277678343134, + "grad_norm": 76.49395991147561, + "learning_rate": 4.8551136363636366e-06, + "loss": 3.2944, + "step": 1710 + }, + { + "epoch": 0.14582800647745675, + "grad_norm": 66.85248075114389, + "learning_rate": 4.8579545454545455e-06, + "loss": 3.0945, + "step": 1711 + }, + { + "epoch": 0.14591323617148214, + "grad_norm": 155.33664966122586, + "learning_rate": 4.860795454545455e-06, + "loss": 4.0981, + "step": 1712 + }, + { + "epoch": 0.14599846586550755, + "grad_norm": 77.1054170411114, + "learning_rate": 4.863636363636364e-06, + "loss": 2.7951, + "step": 1713 + }, + { + "epoch": 0.14608369555953293, + "grad_norm": 77.82132470098315, + "learning_rate": 4.866477272727273e-06, + "loss": 3.1009, + "step": 1714 + }, + { + "epoch": 0.14616892525355835, + "grad_norm": 65.77885630739593, + "learning_rate": 4.869318181818182e-06, + "loss": 3.4223, + "step": 1715 + }, + { + "epoch": 0.14625415494758373, + "grad_norm": 110.59950765044923, + "learning_rate": 4.872159090909092e-06, + "loss": 3.6922, + "step": 1716 + }, + { + "epoch": 0.14633938464160914, + "grad_norm": 85.34123074952846, + "learning_rate": 4.875e-06, + "loss": 3.2674, + "step": 1717 + }, + { + "epoch": 0.14642461433563453, + "grad_norm": 140.76860681028413, + "learning_rate": 4.87784090909091e-06, + "loss": 3.5816, + "step": 1718 + }, + { + "epoch": 0.14650984402965994, + "grad_norm": 159.95814346370443, + "learning_rate": 4.880681818181819e-06, + "loss": 4.652, + "step": 1719 + }, + { + "epoch": 0.14659507372368533, + "grad_norm": 88.31860133483316, + "learning_rate": 4.8835227272727275e-06, + "loss": 3.2422, + "step": 1720 + }, + { + "epoch": 0.14668030341771074, + "grad_norm": 153.39533954348067, + "learning_rate": 4.8863636363636365e-06, + "loss": 4.0488, + "step": 1721 + }, + { + "epoch": 0.14676553311173612, + "grad_norm": 248.46031650321538, + "learning_rate": 4.889204545454545e-06, + "loss": 5.0797, + "step": 1722 + }, + { + "epoch": 0.14685076280576154, + "grad_norm": 80.4269317471089, + "learning_rate": 4.892045454545455e-06, + "loss": 2.9589, + "step": 1723 + }, + { + "epoch": 0.14693599249978692, + "grad_norm": 72.0773289482377, + "learning_rate": 4.894886363636364e-06, + "loss": 1.5816, + "step": 1724 + }, + { + "epoch": 0.14702122219381233, + "grad_norm": 162.69318191058457, + "learning_rate": 4.897727272727273e-06, + "loss": 3.5969, + "step": 1725 + }, + { + "epoch": 0.14710645188783772, + "grad_norm": 254.24385721679266, + "learning_rate": 4.900568181818182e-06, + "loss": 3.1479, + "step": 1726 + }, + { + "epoch": 0.14719168158186313, + "grad_norm": 103.47837125337352, + "learning_rate": 4.903409090909092e-06, + "loss": 3.5316, + "step": 1727 + }, + { + "epoch": 0.14727691127588852, + "grad_norm": 117.4816226437779, + "learning_rate": 4.90625e-06, + "loss": 3.6463, + "step": 1728 + }, + { + "epoch": 0.14736214096991393, + "grad_norm": 112.46701322490753, + "learning_rate": 4.90909090909091e-06, + "loss": 3.2741, + "step": 1729 + }, + { + "epoch": 0.14744737066393931, + "grad_norm": 80.27113277415766, + "learning_rate": 4.9119318181818185e-06, + "loss": 3.1435, + "step": 1730 + }, + { + "epoch": 0.14753260035796473, + "grad_norm": 83.57795354996871, + "learning_rate": 4.9147727272727275e-06, + "loss": 3.406, + "step": 1731 + }, + { + "epoch": 0.1476178300519901, + "grad_norm": 132.2631736316698, + "learning_rate": 4.917613636363636e-06, + "loss": 3.0379, + "step": 1732 + }, + { + "epoch": 0.14770305974601552, + "grad_norm": 116.63153080755232, + "learning_rate": 4.920454545454546e-06, + "loss": 2.851, + "step": 1733 + }, + { + "epoch": 0.1477882894400409, + "grad_norm": 86.41942939080296, + "learning_rate": 4.923295454545455e-06, + "loss": 3.1343, + "step": 1734 + }, + { + "epoch": 0.14787351913406632, + "grad_norm": 227.00518705340914, + "learning_rate": 4.926136363636364e-06, + "loss": 4.7463, + "step": 1735 + }, + { + "epoch": 0.1479587488280917, + "grad_norm": 147.01655742880797, + "learning_rate": 4.928977272727273e-06, + "loss": 4.1312, + "step": 1736 + }, + { + "epoch": 0.1480439785221171, + "grad_norm": 117.54555010054742, + "learning_rate": 4.931818181818182e-06, + "loss": 3.1647, + "step": 1737 + }, + { + "epoch": 0.1481292082161425, + "grad_norm": 93.24059459321812, + "learning_rate": 4.934659090909092e-06, + "loss": 2.9809, + "step": 1738 + }, + { + "epoch": 0.1482144379101679, + "grad_norm": 111.07029715974035, + "learning_rate": 4.937500000000001e-06, + "loss": 4.0908, + "step": 1739 + }, + { + "epoch": 0.1482996676041933, + "grad_norm": 81.68941183059309, + "learning_rate": 4.9403409090909095e-06, + "loss": 3.5518, + "step": 1740 + }, + { + "epoch": 0.1483848972982187, + "grad_norm": 180.81935943502415, + "learning_rate": 4.9431818181818184e-06, + "loss": 3.8593, + "step": 1741 + }, + { + "epoch": 0.1484701269922441, + "grad_norm": 114.82921552239517, + "learning_rate": 4.946022727272728e-06, + "loss": 3.9575, + "step": 1742 + }, + { + "epoch": 0.14855535668626949, + "grad_norm": 137.14539511233008, + "learning_rate": 4.948863636363636e-06, + "loss": 3.7994, + "step": 1743 + }, + { + "epoch": 0.1486405863802949, + "grad_norm": 79.35763091011472, + "learning_rate": 4.951704545454546e-06, + "loss": 3.5918, + "step": 1744 + }, + { + "epoch": 0.14872581607432028, + "grad_norm": 76.38745062691906, + "learning_rate": 4.954545454545455e-06, + "loss": 3.4355, + "step": 1745 + }, + { + "epoch": 0.1488110457683457, + "grad_norm": 87.6930999823706, + "learning_rate": 4.957386363636364e-06, + "loss": 2.831, + "step": 1746 + }, + { + "epoch": 0.14889627546237108, + "grad_norm": 71.59586330177436, + "learning_rate": 4.960227272727273e-06, + "loss": 3.1104, + "step": 1747 + }, + { + "epoch": 0.1489815051563965, + "grad_norm": 142.64405279151984, + "learning_rate": 4.963068181818182e-06, + "loss": 3.6804, + "step": 1748 + }, + { + "epoch": 0.14906673485042188, + "grad_norm": 72.33166104368367, + "learning_rate": 4.9659090909090916e-06, + "loss": 3.072, + "step": 1749 + }, + { + "epoch": 0.1491519645444473, + "grad_norm": 126.32519372178824, + "learning_rate": 4.9687500000000005e-06, + "loss": 3.9437, + "step": 1750 + }, + { + "epoch": 0.14923719423847268, + "grad_norm": 151.72456948842097, + "learning_rate": 4.9715909090909094e-06, + "loss": 3.5217, + "step": 1751 + }, + { + "epoch": 0.1493224239324981, + "grad_norm": 81.98232701094841, + "learning_rate": 4.974431818181818e-06, + "loss": 2.4379, + "step": 1752 + }, + { + "epoch": 0.14940765362652347, + "grad_norm": 78.98530452573054, + "learning_rate": 4.977272727272728e-06, + "loss": 2.9244, + "step": 1753 + }, + { + "epoch": 0.14949288332054889, + "grad_norm": 143.02684662930744, + "learning_rate": 4.980113636363636e-06, + "loss": 3.2507, + "step": 1754 + }, + { + "epoch": 0.14957811301457427, + "grad_norm": 71.341049601393, + "learning_rate": 4.982954545454546e-06, + "loss": 2.8488, + "step": 1755 + }, + { + "epoch": 0.14966334270859968, + "grad_norm": 81.93693345108788, + "learning_rate": 4.985795454545455e-06, + "loss": 3.1548, + "step": 1756 + }, + { + "epoch": 0.14974857240262507, + "grad_norm": 90.99039485245125, + "learning_rate": 4.988636363636364e-06, + "loss": 3.3299, + "step": 1757 + }, + { + "epoch": 0.14983380209665048, + "grad_norm": 192.38840123602785, + "learning_rate": 4.991477272727273e-06, + "loss": 3.8249, + "step": 1758 + }, + { + "epoch": 0.14991903179067587, + "grad_norm": 109.21031395034329, + "learning_rate": 4.9943181818181826e-06, + "loss": 3.2205, + "step": 1759 + }, + { + "epoch": 0.15000426148470128, + "grad_norm": 47.33901600164348, + "learning_rate": 4.9971590909090915e-06, + "loss": 2.3197, + "step": 1760 + }, + { + "epoch": 0.15008949117872666, + "grad_norm": 122.72424840670602, + "learning_rate": 5e-06, + "loss": 3.9677, + "step": 1761 + }, + { + "epoch": 0.15017472087275208, + "grad_norm": 66.084720767093, + "learning_rate": 5.002840909090909e-06, + "loss": 3.1998, + "step": 1762 + }, + { + "epoch": 0.15025995056677746, + "grad_norm": 51.68602854257747, + "learning_rate": 5.005681818181819e-06, + "loss": 2.6143, + "step": 1763 + }, + { + "epoch": 0.15034518026080287, + "grad_norm": 67.99700370192856, + "learning_rate": 5.008522727272728e-06, + "loss": 3.2513, + "step": 1764 + }, + { + "epoch": 0.15043040995482826, + "grad_norm": 97.15523861649486, + "learning_rate": 5.011363636363636e-06, + "loss": 3.1679, + "step": 1765 + }, + { + "epoch": 0.15051563964885367, + "grad_norm": 142.43185518300186, + "learning_rate": 5.014204545454546e-06, + "loss": 3.8987, + "step": 1766 + }, + { + "epoch": 0.15060086934287906, + "grad_norm": 118.33160376441236, + "learning_rate": 5.017045454545455e-06, + "loss": 4.2203, + "step": 1767 + }, + { + "epoch": 0.15068609903690447, + "grad_norm": 79.06075410281875, + "learning_rate": 5.019886363636364e-06, + "loss": 2.6171, + "step": 1768 + }, + { + "epoch": 0.15077132873092985, + "grad_norm": 93.41630660713454, + "learning_rate": 5.0227272727272735e-06, + "loss": 3.091, + "step": 1769 + }, + { + "epoch": 0.15085655842495527, + "grad_norm": 205.69071495756748, + "learning_rate": 5.0255681818181825e-06, + "loss": 3.3094, + "step": 1770 + }, + { + "epoch": 0.15094178811898065, + "grad_norm": 130.60884884268629, + "learning_rate": 5.028409090909091e-06, + "loss": 3.098, + "step": 1771 + }, + { + "epoch": 0.15102701781300606, + "grad_norm": 185.78529112345672, + "learning_rate": 5.031250000000001e-06, + "loss": 3.94, + "step": 1772 + }, + { + "epoch": 0.15111224750703145, + "grad_norm": 261.0131127658274, + "learning_rate": 5.034090909090909e-06, + "loss": 3.5393, + "step": 1773 + }, + { + "epoch": 0.15119747720105683, + "grad_norm": 46.07076024168808, + "learning_rate": 5.036931818181818e-06, + "loss": 2.4211, + "step": 1774 + }, + { + "epoch": 0.15128270689508225, + "grad_norm": 84.95028421001048, + "learning_rate": 5.039772727272728e-06, + "loss": 3.1981, + "step": 1775 + }, + { + "epoch": 0.15136793658910763, + "grad_norm": 75.11483920736424, + "learning_rate": 5.042613636363637e-06, + "loss": 3.5326, + "step": 1776 + }, + { + "epoch": 0.15145316628313304, + "grad_norm": 139.52822438583024, + "learning_rate": 5.045454545454546e-06, + "loss": 3.3035, + "step": 1777 + }, + { + "epoch": 0.15153839597715843, + "grad_norm": 76.92960163274532, + "learning_rate": 5.048295454545456e-06, + "loss": 3.6273, + "step": 1778 + }, + { + "epoch": 0.15162362567118384, + "grad_norm": 86.00578868698413, + "learning_rate": 5.0511363636363645e-06, + "loss": 2.9972, + "step": 1779 + }, + { + "epoch": 0.15170885536520923, + "grad_norm": 106.02339758454593, + "learning_rate": 5.053977272727273e-06, + "loss": 3.2897, + "step": 1780 + }, + { + "epoch": 0.15179408505923464, + "grad_norm": 148.23016508431138, + "learning_rate": 5.056818181818182e-06, + "loss": 4.1812, + "step": 1781 + }, + { + "epoch": 0.15187931475326003, + "grad_norm": 79.19868191118962, + "learning_rate": 5.059659090909091e-06, + "loss": 2.3154, + "step": 1782 + }, + { + "epoch": 0.15196454444728544, + "grad_norm": 80.76078987399352, + "learning_rate": 5.0625e-06, + "loss": 3.3915, + "step": 1783 + }, + { + "epoch": 0.15204977414131082, + "grad_norm": 141.1561634350937, + "learning_rate": 5.06534090909091e-06, + "loss": 2.7433, + "step": 1784 + }, + { + "epoch": 0.15213500383533624, + "grad_norm": 206.5859518113536, + "learning_rate": 5.068181818181819e-06, + "loss": 4.1873, + "step": 1785 + }, + { + "epoch": 0.15222023352936162, + "grad_norm": 71.82366461141677, + "learning_rate": 5.071022727272728e-06, + "loss": 1.7739, + "step": 1786 + }, + { + "epoch": 0.15230546322338703, + "grad_norm": 78.6203878795793, + "learning_rate": 5.073863636363638e-06, + "loss": 3.0716, + "step": 1787 + }, + { + "epoch": 0.15239069291741242, + "grad_norm": 63.37942602698678, + "learning_rate": 5.076704545454546e-06, + "loss": 2.9388, + "step": 1788 + }, + { + "epoch": 0.15247592261143783, + "grad_norm": 171.80709598398306, + "learning_rate": 5.079545454545455e-06, + "loss": 3.3816, + "step": 1789 + }, + { + "epoch": 0.15256115230546322, + "grad_norm": 57.40450025217871, + "learning_rate": 5.082386363636364e-06, + "loss": 2.4666, + "step": 1790 + }, + { + "epoch": 0.15264638199948863, + "grad_norm": 153.2617195192066, + "learning_rate": 5.085227272727273e-06, + "loss": 2.9914, + "step": 1791 + }, + { + "epoch": 0.152731611693514, + "grad_norm": 102.82263780935195, + "learning_rate": 5.088068181818182e-06, + "loss": 3.2061, + "step": 1792 + }, + { + "epoch": 0.15281684138753943, + "grad_norm": 114.74240869946031, + "learning_rate": 5.090909090909091e-06, + "loss": 3.6025, + "step": 1793 + }, + { + "epoch": 0.1529020710815648, + "grad_norm": 133.14422441813716, + "learning_rate": 5.093750000000001e-06, + "loss": 4.0884, + "step": 1794 + }, + { + "epoch": 0.15298730077559022, + "grad_norm": 86.66795890556128, + "learning_rate": 5.096590909090909e-06, + "loss": 2.6369, + "step": 1795 + }, + { + "epoch": 0.1530725304696156, + "grad_norm": 84.84848476057425, + "learning_rate": 5.099431818181818e-06, + "loss": 3.6771, + "step": 1796 + }, + { + "epoch": 0.15315776016364102, + "grad_norm": 97.90812720594533, + "learning_rate": 5.102272727272728e-06, + "loss": 2.9871, + "step": 1797 + }, + { + "epoch": 0.1532429898576664, + "grad_norm": 136.33061977571302, + "learning_rate": 5.105113636363637e-06, + "loss": 3.0826, + "step": 1798 + }, + { + "epoch": 0.15332821955169182, + "grad_norm": 195.86465515666785, + "learning_rate": 5.107954545454546e-06, + "loss": 4.0935, + "step": 1799 + }, + { + "epoch": 0.1534134492457172, + "grad_norm": 101.3562291092811, + "learning_rate": 5.110795454545455e-06, + "loss": 2.9213, + "step": 1800 + }, + { + "epoch": 0.15349867893974262, + "grad_norm": 133.4959034360437, + "learning_rate": 5.113636363636364e-06, + "loss": 2.8177, + "step": 1801 + }, + { + "epoch": 0.153583908633768, + "grad_norm": 108.18960178983005, + "learning_rate": 5.1164772727272724e-06, + "loss": 3.3185, + "step": 1802 + }, + { + "epoch": 0.15366913832779341, + "grad_norm": 88.15051413882802, + "learning_rate": 5.119318181818182e-06, + "loss": 3.3729, + "step": 1803 + }, + { + "epoch": 0.1537543680218188, + "grad_norm": 60.69473497018971, + "learning_rate": 5.122159090909091e-06, + "loss": 2.6933, + "step": 1804 + }, + { + "epoch": 0.1538395977158442, + "grad_norm": 137.08910998967292, + "learning_rate": 5.125e-06, + "loss": 3.0112, + "step": 1805 + }, + { + "epoch": 0.1539248274098696, + "grad_norm": 75.53406649040272, + "learning_rate": 5.12784090909091e-06, + "loss": 3.3116, + "step": 1806 + }, + { + "epoch": 0.154010057103895, + "grad_norm": 95.02580455212393, + "learning_rate": 5.130681818181819e-06, + "loss": 3.4581, + "step": 1807 + }, + { + "epoch": 0.1540952867979204, + "grad_norm": 84.89017004727611, + "learning_rate": 5.133522727272727e-06, + "loss": 3.025, + "step": 1808 + }, + { + "epoch": 0.1541805164919458, + "grad_norm": 94.29920406908832, + "learning_rate": 5.1363636363636375e-06, + "loss": 2.3855, + "step": 1809 + }, + { + "epoch": 0.1542657461859712, + "grad_norm": 100.34432022272962, + "learning_rate": 5.1392045454545456e-06, + "loss": 3.4142, + "step": 1810 + }, + { + "epoch": 0.15435097587999658, + "grad_norm": 121.66595996160365, + "learning_rate": 5.1420454545454545e-06, + "loss": 3.6066, + "step": 1811 + }, + { + "epoch": 0.154436205574022, + "grad_norm": 226.35342285636017, + "learning_rate": 5.144886363636364e-06, + "loss": 3.8433, + "step": 1812 + }, + { + "epoch": 0.15452143526804737, + "grad_norm": 70.25403793498023, + "learning_rate": 5.147727272727273e-06, + "loss": 1.7764, + "step": 1813 + }, + { + "epoch": 0.1546066649620728, + "grad_norm": 189.66436599021748, + "learning_rate": 5.150568181818182e-06, + "loss": 3.5379, + "step": 1814 + }, + { + "epoch": 0.15469189465609817, + "grad_norm": 67.26690991359645, + "learning_rate": 5.153409090909092e-06, + "loss": 3.5537, + "step": 1815 + }, + { + "epoch": 0.15477712435012358, + "grad_norm": 95.50839231718433, + "learning_rate": 5.156250000000001e-06, + "loss": 2.5261, + "step": 1816 + }, + { + "epoch": 0.15486235404414897, + "grad_norm": 72.99305046135203, + "learning_rate": 5.159090909090909e-06, + "loss": 2.9345, + "step": 1817 + }, + { + "epoch": 0.15494758373817438, + "grad_norm": 65.89672179380246, + "learning_rate": 5.161931818181819e-06, + "loss": 2.928, + "step": 1818 + }, + { + "epoch": 0.15503281343219977, + "grad_norm": 138.03469934179844, + "learning_rate": 5.164772727272728e-06, + "loss": 3.2052, + "step": 1819 + }, + { + "epoch": 0.15511804312622518, + "grad_norm": 74.28447406238944, + "learning_rate": 5.1676136363636365e-06, + "loss": 2.6411, + "step": 1820 + }, + { + "epoch": 0.15520327282025057, + "grad_norm": 402.4021136758565, + "learning_rate": 5.170454545454546e-06, + "loss": 4.2999, + "step": 1821 + }, + { + "epoch": 0.15528850251427598, + "grad_norm": 57.90067353428665, + "learning_rate": 5.173295454545455e-06, + "loss": 3.1447, + "step": 1822 + }, + { + "epoch": 0.15537373220830136, + "grad_norm": 210.9372582426665, + "learning_rate": 5.176136363636363e-06, + "loss": 4.1413, + "step": 1823 + }, + { + "epoch": 0.15545896190232678, + "grad_norm": 69.5674689040682, + "learning_rate": 5.178977272727274e-06, + "loss": 2.6444, + "step": 1824 + }, + { + "epoch": 0.15554419159635216, + "grad_norm": 110.6080474978393, + "learning_rate": 5.181818181818182e-06, + "loss": 2.7317, + "step": 1825 + }, + { + "epoch": 0.15562942129037757, + "grad_norm": 72.10556364817103, + "learning_rate": 5.184659090909091e-06, + "loss": 2.9135, + "step": 1826 + }, + { + "epoch": 0.15571465098440296, + "grad_norm": 112.1237648135776, + "learning_rate": 5.187500000000001e-06, + "loss": 3.0818, + "step": 1827 + }, + { + "epoch": 0.15579988067842837, + "grad_norm": 134.69491978152632, + "learning_rate": 5.19034090909091e-06, + "loss": 4.4074, + "step": 1828 + }, + { + "epoch": 0.15588511037245376, + "grad_norm": 243.07267738839792, + "learning_rate": 5.193181818181819e-06, + "loss": 4.8142, + "step": 1829 + }, + { + "epoch": 0.15597034006647917, + "grad_norm": 106.28014092885488, + "learning_rate": 5.196022727272728e-06, + "loss": 3.008, + "step": 1830 + }, + { + "epoch": 0.15605556976050455, + "grad_norm": 375.80820582217615, + "learning_rate": 5.198863636363637e-06, + "loss": 2.4244, + "step": 1831 + }, + { + "epoch": 0.15614079945452997, + "grad_norm": 107.86699034498169, + "learning_rate": 5.201704545454545e-06, + "loss": 2.6524, + "step": 1832 + }, + { + "epoch": 0.15622602914855535, + "grad_norm": 94.54269614744008, + "learning_rate": 5.204545454545455e-06, + "loss": 3.3149, + "step": 1833 + }, + { + "epoch": 0.15631125884258076, + "grad_norm": 66.40558262560535, + "learning_rate": 5.207386363636364e-06, + "loss": 3.0119, + "step": 1834 + }, + { + "epoch": 0.15639648853660615, + "grad_norm": 46.692400414227095, + "learning_rate": 5.210227272727273e-06, + "loss": 2.1639, + "step": 1835 + }, + { + "epoch": 0.15648171823063156, + "grad_norm": 85.7357300473509, + "learning_rate": 5.213068181818183e-06, + "loss": 2.471, + "step": 1836 + }, + { + "epoch": 0.15656694792465695, + "grad_norm": 140.05821880776102, + "learning_rate": 5.215909090909092e-06, + "loss": 3.2834, + "step": 1837 + }, + { + "epoch": 0.15665217761868236, + "grad_norm": 185.540895712163, + "learning_rate": 5.21875e-06, + "loss": 2.942, + "step": 1838 + }, + { + "epoch": 0.15673740731270774, + "grad_norm": 111.53029483030036, + "learning_rate": 5.22159090909091e-06, + "loss": 2.9481, + "step": 1839 + }, + { + "epoch": 0.15682263700673316, + "grad_norm": 112.5808923059975, + "learning_rate": 5.2244318181818185e-06, + "loss": 2.9053, + "step": 1840 + }, + { + "epoch": 0.15690786670075854, + "grad_norm": 171.52536289354475, + "learning_rate": 5.2272727272727274e-06, + "loss": 4.5307, + "step": 1841 + }, + { + "epoch": 0.15699309639478395, + "grad_norm": 105.5178935490287, + "learning_rate": 5.230113636363637e-06, + "loss": 3.1347, + "step": 1842 + }, + { + "epoch": 0.15707832608880934, + "grad_norm": 79.25356536953349, + "learning_rate": 5.232954545454546e-06, + "loss": 2.797, + "step": 1843 + }, + { + "epoch": 0.15716355578283475, + "grad_norm": 76.09107876476757, + "learning_rate": 5.235795454545455e-06, + "loss": 3.292, + "step": 1844 + }, + { + "epoch": 0.15724878547686014, + "grad_norm": 97.36774851762061, + "learning_rate": 5.238636363636363e-06, + "loss": 3.3534, + "step": 1845 + }, + { + "epoch": 0.15733401517088555, + "grad_norm": 82.34833405449373, + "learning_rate": 5.241477272727274e-06, + "loss": 3.7249, + "step": 1846 + }, + { + "epoch": 0.15741924486491093, + "grad_norm": 146.62462522071334, + "learning_rate": 5.244318181818182e-06, + "loss": 4.1356, + "step": 1847 + }, + { + "epoch": 0.15750447455893635, + "grad_norm": 142.50085093124468, + "learning_rate": 5.247159090909091e-06, + "loss": 3.1532, + "step": 1848 + }, + { + "epoch": 0.15758970425296173, + "grad_norm": 76.12648826552956, + "learning_rate": 5.2500000000000006e-06, + "loss": 3.4979, + "step": 1849 + }, + { + "epoch": 0.15767493394698712, + "grad_norm": 363.9794915354472, + "learning_rate": 5.2528409090909095e-06, + "loss": 3.4045, + "step": 1850 + }, + { + "epoch": 0.15776016364101253, + "grad_norm": 81.1487801445979, + "learning_rate": 5.255681818181818e-06, + "loss": 4.0162, + "step": 1851 + }, + { + "epoch": 0.15784539333503791, + "grad_norm": 56.376390277980434, + "learning_rate": 5.258522727272728e-06, + "loss": 3.0948, + "step": 1852 + }, + { + "epoch": 0.15793062302906333, + "grad_norm": 96.13273336578604, + "learning_rate": 5.261363636363636e-06, + "loss": 2.9038, + "step": 1853 + }, + { + "epoch": 0.1580158527230887, + "grad_norm": 162.62993972643588, + "learning_rate": 5.264204545454545e-06, + "loss": 3.0312, + "step": 1854 + }, + { + "epoch": 0.15810108241711412, + "grad_norm": 465.0875876910905, + "learning_rate": 5.267045454545455e-06, + "loss": 4.2135, + "step": 1855 + }, + { + "epoch": 0.1581863121111395, + "grad_norm": 166.91836179189602, + "learning_rate": 5.269886363636364e-06, + "loss": 3.5773, + "step": 1856 + }, + { + "epoch": 0.15827154180516492, + "grad_norm": 78.73145659057752, + "learning_rate": 5.272727272727273e-06, + "loss": 3.1824, + "step": 1857 + }, + { + "epoch": 0.1583567714991903, + "grad_norm": 71.9442139014642, + "learning_rate": 5.275568181818183e-06, + "loss": 2.9165, + "step": 1858 + }, + { + "epoch": 0.15844200119321572, + "grad_norm": 102.9507195925425, + "learning_rate": 5.2784090909090915e-06, + "loss": 3.6232, + "step": 1859 + }, + { + "epoch": 0.1585272308872411, + "grad_norm": 141.63960738479088, + "learning_rate": 5.28125e-06, + "loss": 3.4591, + "step": 1860 + }, + { + "epoch": 0.15861246058126652, + "grad_norm": 141.6477444684188, + "learning_rate": 5.28409090909091e-06, + "loss": 3.8435, + "step": 1861 + }, + { + "epoch": 0.1586976902752919, + "grad_norm": 154.82673991982338, + "learning_rate": 5.286931818181818e-06, + "loss": 3.2294, + "step": 1862 + }, + { + "epoch": 0.15878291996931732, + "grad_norm": 131.00223723052216, + "learning_rate": 5.289772727272727e-06, + "loss": 3.9695, + "step": 1863 + }, + { + "epoch": 0.1588681496633427, + "grad_norm": 143.61535468135514, + "learning_rate": 5.292613636363637e-06, + "loss": 2.7444, + "step": 1864 + }, + { + "epoch": 0.1589533793573681, + "grad_norm": 107.95794291292336, + "learning_rate": 5.295454545454546e-06, + "loss": 3.4322, + "step": 1865 + }, + { + "epoch": 0.1590386090513935, + "grad_norm": 56.697270660102966, + "learning_rate": 5.298295454545455e-06, + "loss": 2.9164, + "step": 1866 + }, + { + "epoch": 0.1591238387454189, + "grad_norm": 82.95808695619634, + "learning_rate": 5.301136363636365e-06, + "loss": 2.5618, + "step": 1867 + }, + { + "epoch": 0.1592090684394443, + "grad_norm": 129.52709217179202, + "learning_rate": 5.303977272727273e-06, + "loss": 3.919, + "step": 1868 + }, + { + "epoch": 0.1592942981334697, + "grad_norm": 111.6691146574534, + "learning_rate": 5.306818181818182e-06, + "loss": 3.0115, + "step": 1869 + }, + { + "epoch": 0.1593795278274951, + "grad_norm": 78.47261723272963, + "learning_rate": 5.3096590909090915e-06, + "loss": 3.2285, + "step": 1870 + }, + { + "epoch": 0.1594647575215205, + "grad_norm": 111.57536125040657, + "learning_rate": 5.3125e-06, + "loss": 3.4927, + "step": 1871 + }, + { + "epoch": 0.1595499872155459, + "grad_norm": 177.5589885380517, + "learning_rate": 5.315340909090909e-06, + "loss": 4.2868, + "step": 1872 + }, + { + "epoch": 0.1596352169095713, + "grad_norm": 175.86089049001802, + "learning_rate": 5.318181818181819e-06, + "loss": 3.0444, + "step": 1873 + }, + { + "epoch": 0.1597204466035967, + "grad_norm": 205.82065095423343, + "learning_rate": 5.321022727272728e-06, + "loss": 4.6519, + "step": 1874 + }, + { + "epoch": 0.1598056762976221, + "grad_norm": 92.74696331697245, + "learning_rate": 5.323863636363636e-06, + "loss": 3.9217, + "step": 1875 + }, + { + "epoch": 0.1598909059916475, + "grad_norm": 217.7639002714063, + "learning_rate": 5.326704545454546e-06, + "loss": 4.4102, + "step": 1876 + }, + { + "epoch": 0.1599761356856729, + "grad_norm": 87.98189809681766, + "learning_rate": 5.329545454545455e-06, + "loss": 3.8735, + "step": 1877 + }, + { + "epoch": 0.16006136537969828, + "grad_norm": 68.53392402972906, + "learning_rate": 5.332386363636364e-06, + "loss": 2.6966, + "step": 1878 + }, + { + "epoch": 0.1601465950737237, + "grad_norm": 57.18763633128511, + "learning_rate": 5.3352272727272735e-06, + "loss": 2.5538, + "step": 1879 + }, + { + "epoch": 0.16023182476774908, + "grad_norm": 108.53151912887172, + "learning_rate": 5.3380681818181824e-06, + "loss": 3.7791, + "step": 1880 + }, + { + "epoch": 0.1603170544617745, + "grad_norm": 309.1446990089572, + "learning_rate": 5.340909090909091e-06, + "loss": 2.0623, + "step": 1881 + }, + { + "epoch": 0.16040228415579988, + "grad_norm": 64.14532588623035, + "learning_rate": 5.343750000000001e-06, + "loss": 2.9976, + "step": 1882 + }, + { + "epoch": 0.1604875138498253, + "grad_norm": 155.71597724552979, + "learning_rate": 5.346590909090909e-06, + "loss": 3.3079, + "step": 1883 + }, + { + "epoch": 0.16057274354385068, + "grad_norm": 98.1920176119401, + "learning_rate": 5.349431818181818e-06, + "loss": 3.4863, + "step": 1884 + }, + { + "epoch": 0.1606579732378761, + "grad_norm": 65.75990670335811, + "learning_rate": 5.352272727272728e-06, + "loss": 3.1998, + "step": 1885 + }, + { + "epoch": 0.16074320293190147, + "grad_norm": 108.83711431548558, + "learning_rate": 5.355113636363637e-06, + "loss": 4.1299, + "step": 1886 + }, + { + "epoch": 0.16082843262592686, + "grad_norm": 97.12071799229625, + "learning_rate": 5.357954545454546e-06, + "loss": 3.3467, + "step": 1887 + }, + { + "epoch": 0.16091366231995227, + "grad_norm": 282.44585863663525, + "learning_rate": 5.3607954545454556e-06, + "loss": 4.7088, + "step": 1888 + }, + { + "epoch": 0.16099889201397766, + "grad_norm": 160.82008811640657, + "learning_rate": 5.3636363636363645e-06, + "loss": 3.9899, + "step": 1889 + }, + { + "epoch": 0.16108412170800307, + "grad_norm": 69.02167161009355, + "learning_rate": 5.3664772727272726e-06, + "loss": 3.2585, + "step": 1890 + }, + { + "epoch": 0.16116935140202845, + "grad_norm": 70.39708420613765, + "learning_rate": 5.369318181818182e-06, + "loss": 2.5308, + "step": 1891 + }, + { + "epoch": 0.16125458109605387, + "grad_norm": 118.31474580666728, + "learning_rate": 5.372159090909091e-06, + "loss": 3.6581, + "step": 1892 + }, + { + "epoch": 0.16133981079007925, + "grad_norm": 66.387582007677, + "learning_rate": 5.375e-06, + "loss": 2.9489, + "step": 1893 + }, + { + "epoch": 0.16142504048410466, + "grad_norm": 111.57129847557009, + "learning_rate": 5.37784090909091e-06, + "loss": 4.1187, + "step": 1894 + }, + { + "epoch": 0.16151027017813005, + "grad_norm": 305.16774787935384, + "learning_rate": 5.380681818181819e-06, + "loss": 3.7711, + "step": 1895 + }, + { + "epoch": 0.16159549987215546, + "grad_norm": 62.570714364019295, + "learning_rate": 5.383522727272728e-06, + "loss": 2.664, + "step": 1896 + }, + { + "epoch": 0.16168072956618085, + "grad_norm": 94.09291683622982, + "learning_rate": 5.386363636363638e-06, + "loss": 3.1784, + "step": 1897 + }, + { + "epoch": 0.16176595926020626, + "grad_norm": 113.53206181760144, + "learning_rate": 5.389204545454546e-06, + "loss": 3.6146, + "step": 1898 + }, + { + "epoch": 0.16185118895423165, + "grad_norm": 220.89394710780348, + "learning_rate": 5.392045454545455e-06, + "loss": 3.4694, + "step": 1899 + }, + { + "epoch": 0.16193641864825706, + "grad_norm": 142.12765622780014, + "learning_rate": 5.3948863636363636e-06, + "loss": 3.6588, + "step": 1900 + }, + { + "epoch": 0.16202164834228244, + "grad_norm": 77.2982790401099, + "learning_rate": 5.397727272727273e-06, + "loss": 3.3114, + "step": 1901 + }, + { + "epoch": 0.16210687803630786, + "grad_norm": 88.78857229395483, + "learning_rate": 5.400568181818182e-06, + "loss": 3.5098, + "step": 1902 + }, + { + "epoch": 0.16219210773033324, + "grad_norm": 60.593773611482206, + "learning_rate": 5.403409090909091e-06, + "loss": 2.8407, + "step": 1903 + }, + { + "epoch": 0.16227733742435865, + "grad_norm": 107.32390787268827, + "learning_rate": 5.406250000000001e-06, + "loss": 3.3921, + "step": 1904 + }, + { + "epoch": 0.16236256711838404, + "grad_norm": 99.19909584744418, + "learning_rate": 5.409090909090909e-06, + "loss": 2.4914, + "step": 1905 + }, + { + "epoch": 0.16244779681240945, + "grad_norm": 102.9403935290656, + "learning_rate": 5.411931818181818e-06, + "loss": 2.7231, + "step": 1906 + }, + { + "epoch": 0.16253302650643484, + "grad_norm": 58.498089446390004, + "learning_rate": 5.414772727272728e-06, + "loss": 3.3444, + "step": 1907 + }, + { + "epoch": 0.16261825620046025, + "grad_norm": 97.91478072625458, + "learning_rate": 5.417613636363637e-06, + "loss": 3.686, + "step": 1908 + }, + { + "epoch": 0.16270348589448563, + "grad_norm": 134.50513805639824, + "learning_rate": 5.420454545454546e-06, + "loss": 3.9117, + "step": 1909 + }, + { + "epoch": 0.16278871558851105, + "grad_norm": 116.23159559716137, + "learning_rate": 5.423295454545455e-06, + "loss": 3.7665, + "step": 1910 + }, + { + "epoch": 0.16287394528253643, + "grad_norm": 179.21058976476746, + "learning_rate": 5.426136363636364e-06, + "loss": 3.4932, + "step": 1911 + }, + { + "epoch": 0.16295917497656184, + "grad_norm": 213.99126609849318, + "learning_rate": 5.428977272727272e-06, + "loss": 4.3296, + "step": 1912 + }, + { + "epoch": 0.16304440467058723, + "grad_norm": 124.34817223218182, + "learning_rate": 5.431818181818182e-06, + "loss": 3.9657, + "step": 1913 + }, + { + "epoch": 0.16312963436461264, + "grad_norm": 161.93689845493026, + "learning_rate": 5.434659090909091e-06, + "loss": 2.3238, + "step": 1914 + }, + { + "epoch": 0.16321486405863803, + "grad_norm": 123.09808701171649, + "learning_rate": 5.4375e-06, + "loss": 3.1894, + "step": 1915 + }, + { + "epoch": 0.16330009375266344, + "grad_norm": 57.262534270038756, + "learning_rate": 5.44034090909091e-06, + "loss": 2.2567, + "step": 1916 + }, + { + "epoch": 0.16338532344668882, + "grad_norm": 102.35300649102335, + "learning_rate": 5.443181818181819e-06, + "loss": 3.9315, + "step": 1917 + }, + { + "epoch": 0.16347055314071424, + "grad_norm": 68.47635478586254, + "learning_rate": 5.446022727272728e-06, + "loss": 3.2921, + "step": 1918 + }, + { + "epoch": 0.16355578283473962, + "grad_norm": 153.0583034662128, + "learning_rate": 5.4488636363636374e-06, + "loss": 4.0733, + "step": 1919 + }, + { + "epoch": 0.16364101252876503, + "grad_norm": 101.56775741149843, + "learning_rate": 5.4517045454545455e-06, + "loss": 2.5545, + "step": 1920 + }, + { + "epoch": 0.16372624222279042, + "grad_norm": 126.55413089486814, + "learning_rate": 5.4545454545454545e-06, + "loss": 3.6287, + "step": 1921 + }, + { + "epoch": 0.16381147191681583, + "grad_norm": 67.31208577628033, + "learning_rate": 5.457386363636364e-06, + "loss": 3.1173, + "step": 1922 + }, + { + "epoch": 0.16389670161084122, + "grad_norm": 72.52574269573213, + "learning_rate": 5.460227272727273e-06, + "loss": 3.8098, + "step": 1923 + }, + { + "epoch": 0.16398193130486663, + "grad_norm": 231.78623072088044, + "learning_rate": 5.463068181818182e-06, + "loss": 3.4985, + "step": 1924 + }, + { + "epoch": 0.16406716099889201, + "grad_norm": 125.13777176513919, + "learning_rate": 5.465909090909092e-06, + "loss": 3.8162, + "step": 1925 + }, + { + "epoch": 0.1641523906929174, + "grad_norm": 68.08283700150068, + "learning_rate": 5.468750000000001e-06, + "loss": 3.8698, + "step": 1926 + }, + { + "epoch": 0.1642376203869428, + "grad_norm": 121.69397914300019, + "learning_rate": 5.471590909090909e-06, + "loss": 3.8707, + "step": 1927 + }, + { + "epoch": 0.1643228500809682, + "grad_norm": 98.5632447383988, + "learning_rate": 5.474431818181819e-06, + "loss": 3.362, + "step": 1928 + }, + { + "epoch": 0.1644080797749936, + "grad_norm": 130.3872968287418, + "learning_rate": 5.477272727272728e-06, + "loss": 3.8459, + "step": 1929 + }, + { + "epoch": 0.164493309469019, + "grad_norm": 106.86274876017106, + "learning_rate": 5.4801136363636365e-06, + "loss": 3.1798, + "step": 1930 + }, + { + "epoch": 0.1645785391630444, + "grad_norm": 81.30816010593499, + "learning_rate": 5.482954545454546e-06, + "loss": 2.6747, + "step": 1931 + }, + { + "epoch": 0.1646637688570698, + "grad_norm": 79.84564895330158, + "learning_rate": 5.485795454545455e-06, + "loss": 2.8742, + "step": 1932 + }, + { + "epoch": 0.1647489985510952, + "grad_norm": 100.24620638053472, + "learning_rate": 5.488636363636364e-06, + "loss": 3.1913, + "step": 1933 + }, + { + "epoch": 0.1648342282451206, + "grad_norm": 299.5339274812267, + "learning_rate": 5.491477272727274e-06, + "loss": 3.4548, + "step": 1934 + }, + { + "epoch": 0.164919457939146, + "grad_norm": 450.3146802424454, + "learning_rate": 5.494318181818182e-06, + "loss": 4.2685, + "step": 1935 + }, + { + "epoch": 0.1650046876331714, + "grad_norm": 140.5685039986956, + "learning_rate": 5.497159090909091e-06, + "loss": 3.7073, + "step": 1936 + }, + { + "epoch": 0.1650899173271968, + "grad_norm": 224.27106418631502, + "learning_rate": 5.500000000000001e-06, + "loss": 4.3809, + "step": 1937 + }, + { + "epoch": 0.16517514702122219, + "grad_norm": 225.50508564897677, + "learning_rate": 5.50284090909091e-06, + "loss": 2.899, + "step": 1938 + }, + { + "epoch": 0.1652603767152476, + "grad_norm": 100.45816997787121, + "learning_rate": 5.5056818181818186e-06, + "loss": 3.7668, + "step": 1939 + }, + { + "epoch": 0.16534560640927298, + "grad_norm": 53.9744035077739, + "learning_rate": 5.508522727272728e-06, + "loss": 2.8955, + "step": 1940 + }, + { + "epoch": 0.1654308361032984, + "grad_norm": 112.70106734717024, + "learning_rate": 5.511363636363637e-06, + "loss": 2.4765, + "step": 1941 + }, + { + "epoch": 0.16551606579732378, + "grad_norm": 121.06364507352227, + "learning_rate": 5.514204545454545e-06, + "loss": 4.3512, + "step": 1942 + }, + { + "epoch": 0.1656012954913492, + "grad_norm": 144.42634955918894, + "learning_rate": 5.517045454545455e-06, + "loss": 3.7399, + "step": 1943 + }, + { + "epoch": 0.16568652518537458, + "grad_norm": 97.99146208626017, + "learning_rate": 5.519886363636364e-06, + "loss": 3.6169, + "step": 1944 + }, + { + "epoch": 0.1657717548794, + "grad_norm": 123.4655671748095, + "learning_rate": 5.522727272727273e-06, + "loss": 3.9832, + "step": 1945 + }, + { + "epoch": 0.16585698457342538, + "grad_norm": 86.70686147819814, + "learning_rate": 5.525568181818183e-06, + "loss": 3.2685, + "step": 1946 + }, + { + "epoch": 0.1659422142674508, + "grad_norm": 112.41027451812927, + "learning_rate": 5.528409090909092e-06, + "loss": 3.2945, + "step": 1947 + }, + { + "epoch": 0.16602744396147617, + "grad_norm": 93.75791849318513, + "learning_rate": 5.531250000000001e-06, + "loss": 4.0213, + "step": 1948 + }, + { + "epoch": 0.16611267365550159, + "grad_norm": 82.50882828109563, + "learning_rate": 5.53409090909091e-06, + "loss": 3.6531, + "step": 1949 + }, + { + "epoch": 0.16619790334952697, + "grad_norm": 171.6365867126143, + "learning_rate": 5.5369318181818185e-06, + "loss": 4.2701, + "step": 1950 + }, + { + "epoch": 0.16628313304355238, + "grad_norm": 73.71872919167208, + "learning_rate": 5.539772727272727e-06, + "loss": 3.0051, + "step": 1951 + }, + { + "epoch": 0.16636836273757777, + "grad_norm": 221.53332175735844, + "learning_rate": 5.542613636363637e-06, + "loss": 5.9937, + "step": 1952 + }, + { + "epoch": 0.16645359243160318, + "grad_norm": 94.43632252774643, + "learning_rate": 5.545454545454546e-06, + "loss": 3.7595, + "step": 1953 + }, + { + "epoch": 0.16653882212562857, + "grad_norm": 125.96489384212086, + "learning_rate": 5.548295454545455e-06, + "loss": 3.7266, + "step": 1954 + }, + { + "epoch": 0.16662405181965398, + "grad_norm": 44.897507262550775, + "learning_rate": 5.551136363636364e-06, + "loss": 3.0612, + "step": 1955 + }, + { + "epoch": 0.16670928151367936, + "grad_norm": 149.95064988662696, + "learning_rate": 5.553977272727274e-06, + "loss": 3.2365, + "step": 1956 + }, + { + "epoch": 0.16679451120770478, + "grad_norm": 69.95700857723233, + "learning_rate": 5.556818181818182e-06, + "loss": 3.2098, + "step": 1957 + }, + { + "epoch": 0.16687974090173016, + "grad_norm": 114.37094077476263, + "learning_rate": 5.559659090909091e-06, + "loss": 2.7326, + "step": 1958 + }, + { + "epoch": 0.16696497059575557, + "grad_norm": 59.76882961828661, + "learning_rate": 5.5625000000000005e-06, + "loss": 3.395, + "step": 1959 + }, + { + "epoch": 0.16705020028978096, + "grad_norm": 171.54639976406438, + "learning_rate": 5.5653409090909095e-06, + "loss": 4.7538, + "step": 1960 + }, + { + "epoch": 0.16713542998380637, + "grad_norm": 144.0139320391528, + "learning_rate": 5.568181818181818e-06, + "loss": 3.6117, + "step": 1961 + }, + { + "epoch": 0.16722065967783176, + "grad_norm": 114.03556582116167, + "learning_rate": 5.571022727272728e-06, + "loss": 2.6611, + "step": 1962 + }, + { + "epoch": 0.16730588937185714, + "grad_norm": 93.68609017467487, + "learning_rate": 5.573863636363637e-06, + "loss": 3.6929, + "step": 1963 + }, + { + "epoch": 0.16739111906588255, + "grad_norm": 79.1048778567079, + "learning_rate": 5.576704545454545e-06, + "loss": 3.499, + "step": 1964 + }, + { + "epoch": 0.16747634875990794, + "grad_norm": 42.53416371569771, + "learning_rate": 5.579545454545455e-06, + "loss": 2.6543, + "step": 1965 + }, + { + "epoch": 0.16756157845393335, + "grad_norm": 135.90100296348618, + "learning_rate": 5.582386363636364e-06, + "loss": 3.6955, + "step": 1966 + }, + { + "epoch": 0.16764680814795874, + "grad_norm": 85.19351615407896, + "learning_rate": 5.585227272727273e-06, + "loss": 2.2412, + "step": 1967 + }, + { + "epoch": 0.16773203784198415, + "grad_norm": 85.86031360210023, + "learning_rate": 5.588068181818183e-06, + "loss": 2.9086, + "step": 1968 + }, + { + "epoch": 0.16781726753600953, + "grad_norm": 164.59406053483585, + "learning_rate": 5.5909090909090915e-06, + "loss": 3.3693, + "step": 1969 + }, + { + "epoch": 0.16790249723003495, + "grad_norm": 58.72992084776697, + "learning_rate": 5.59375e-06, + "loss": 3.2909, + "step": 1970 + }, + { + "epoch": 0.16798772692406033, + "grad_norm": 90.40772281571367, + "learning_rate": 5.59659090909091e-06, + "loss": 3.158, + "step": 1971 + }, + { + "epoch": 0.16807295661808574, + "grad_norm": 82.65824997492528, + "learning_rate": 5.599431818181818e-06, + "loss": 2.8581, + "step": 1972 + }, + { + "epoch": 0.16815818631211113, + "grad_norm": 105.46743529587079, + "learning_rate": 5.602272727272727e-06, + "loss": 3.6265, + "step": 1973 + }, + { + "epoch": 0.16824341600613654, + "grad_norm": 88.61370044227641, + "learning_rate": 5.605113636363637e-06, + "loss": 2.2804, + "step": 1974 + }, + { + "epoch": 0.16832864570016193, + "grad_norm": 43.824187214445836, + "learning_rate": 5.607954545454546e-06, + "loss": 2.1266, + "step": 1975 + }, + { + "epoch": 0.16841387539418734, + "grad_norm": 81.23501406099773, + "learning_rate": 5.610795454545455e-06, + "loss": 3.0238, + "step": 1976 + }, + { + "epoch": 0.16849910508821273, + "grad_norm": 65.22360959746703, + "learning_rate": 5.613636363636365e-06, + "loss": 3.3308, + "step": 1977 + }, + { + "epoch": 0.16858433478223814, + "grad_norm": 234.12748076156464, + "learning_rate": 5.6164772727272736e-06, + "loss": 4.9229, + "step": 1978 + }, + { + "epoch": 0.16866956447626352, + "grad_norm": 75.6434879009168, + "learning_rate": 5.619318181818182e-06, + "loss": 2.921, + "step": 1979 + }, + { + "epoch": 0.16875479417028894, + "grad_norm": 110.95002083795413, + "learning_rate": 5.6221590909090914e-06, + "loss": 3.0315, + "step": 1980 + }, + { + "epoch": 0.16884002386431432, + "grad_norm": 170.9950431392088, + "learning_rate": 5.625e-06, + "loss": 3.4062, + "step": 1981 + }, + { + "epoch": 0.16892525355833973, + "grad_norm": 84.56891856131628, + "learning_rate": 5.627840909090909e-06, + "loss": 2.7938, + "step": 1982 + }, + { + "epoch": 0.16901048325236512, + "grad_norm": 133.33276029393244, + "learning_rate": 5.630681818181819e-06, + "loss": 3.9897, + "step": 1983 + }, + { + "epoch": 0.16909571294639053, + "grad_norm": 168.47420804417877, + "learning_rate": 5.633522727272728e-06, + "loss": 3.7508, + "step": 1984 + }, + { + "epoch": 0.16918094264041592, + "grad_norm": 94.57501743085763, + "learning_rate": 5.636363636363636e-06, + "loss": 3.5372, + "step": 1985 + }, + { + "epoch": 0.16926617233444133, + "grad_norm": 247.53453063147583, + "learning_rate": 5.639204545454547e-06, + "loss": 4.3312, + "step": 1986 + }, + { + "epoch": 0.1693514020284667, + "grad_norm": 50.17580963759968, + "learning_rate": 5.642045454545455e-06, + "loss": 2.3209, + "step": 1987 + }, + { + "epoch": 0.16943663172249213, + "grad_norm": 59.10055927096425, + "learning_rate": 5.644886363636364e-06, + "loss": 3.0515, + "step": 1988 + }, + { + "epoch": 0.1695218614165175, + "grad_norm": 72.8395965264244, + "learning_rate": 5.6477272727272735e-06, + "loss": 3.3499, + "step": 1989 + }, + { + "epoch": 0.16960709111054292, + "grad_norm": 130.63614886671544, + "learning_rate": 5.650568181818182e-06, + "loss": 3.697, + "step": 1990 + }, + { + "epoch": 0.1696923208045683, + "grad_norm": 126.37833839588455, + "learning_rate": 5.653409090909091e-06, + "loss": 2.7765, + "step": 1991 + }, + { + "epoch": 0.16977755049859372, + "grad_norm": 178.58927786925886, + "learning_rate": 5.656250000000001e-06, + "loss": 3.7837, + "step": 1992 + }, + { + "epoch": 0.1698627801926191, + "grad_norm": 79.3650231295696, + "learning_rate": 5.65909090909091e-06, + "loss": 3.1699, + "step": 1993 + }, + { + "epoch": 0.16994800988664452, + "grad_norm": 49.086392371731996, + "learning_rate": 5.661931818181818e-06, + "loss": 2.312, + "step": 1994 + }, + { + "epoch": 0.1700332395806699, + "grad_norm": 47.40776205180969, + "learning_rate": 5.664772727272728e-06, + "loss": 2.9562, + "step": 1995 + }, + { + "epoch": 0.17011846927469532, + "grad_norm": 103.22086934536765, + "learning_rate": 5.667613636363637e-06, + "loss": 3.6197, + "step": 1996 + }, + { + "epoch": 0.1702036989687207, + "grad_norm": 575.3798061244446, + "learning_rate": 5.670454545454546e-06, + "loss": 4.0525, + "step": 1997 + }, + { + "epoch": 0.17028892866274611, + "grad_norm": 181.02089572234362, + "learning_rate": 5.6732954545454555e-06, + "loss": 3.7393, + "step": 1998 + }, + { + "epoch": 0.1703741583567715, + "grad_norm": 87.93907966317829, + "learning_rate": 5.6761363636363645e-06, + "loss": 2.6959, + "step": 1999 + }, + { + "epoch": 0.17045938805079688, + "grad_norm": 124.86662596821455, + "learning_rate": 5.6789772727272725e-06, + "loss": 3.7239, + "step": 2000 + }, + { + "epoch": 0.1705446177448223, + "grad_norm": 82.73430115928782, + "learning_rate": 5.681818181818183e-06, + "loss": 2.8689, + "step": 2001 + }, + { + "epoch": 0.17062984743884768, + "grad_norm": 116.85645820090781, + "learning_rate": 5.684659090909091e-06, + "loss": 2.8652, + "step": 2002 + }, + { + "epoch": 0.1707150771328731, + "grad_norm": 64.6699974761657, + "learning_rate": 5.6875e-06, + "loss": 3.6783, + "step": 2003 + }, + { + "epoch": 0.17080030682689848, + "grad_norm": 66.81926849667894, + "learning_rate": 5.69034090909091e-06, + "loss": 2.894, + "step": 2004 + }, + { + "epoch": 0.1708855365209239, + "grad_norm": 77.96772507479591, + "learning_rate": 5.693181818181819e-06, + "loss": 3.8023, + "step": 2005 + }, + { + "epoch": 0.17097076621494928, + "grad_norm": 91.84033656481344, + "learning_rate": 5.696022727272728e-06, + "loss": 4.0787, + "step": 2006 + }, + { + "epoch": 0.1710559959089747, + "grad_norm": 141.42575078870055, + "learning_rate": 5.698863636363638e-06, + "loss": 4.4352, + "step": 2007 + }, + { + "epoch": 0.17114122560300007, + "grad_norm": 121.09419318848651, + "learning_rate": 5.7017045454545465e-06, + "loss": 3.7628, + "step": 2008 + }, + { + "epoch": 0.1712264552970255, + "grad_norm": 48.19347242611962, + "learning_rate": 5.704545454545455e-06, + "loss": 2.9092, + "step": 2009 + }, + { + "epoch": 0.17131168499105087, + "grad_norm": 238.1630727894455, + "learning_rate": 5.7073863636363635e-06, + "loss": 2.831, + "step": 2010 + }, + { + "epoch": 0.17139691468507628, + "grad_norm": 73.42635020979017, + "learning_rate": 5.710227272727273e-06, + "loss": 3.1143, + "step": 2011 + }, + { + "epoch": 0.17148214437910167, + "grad_norm": 82.29032398099874, + "learning_rate": 5.713068181818182e-06, + "loss": 3.3938, + "step": 2012 + }, + { + "epoch": 0.17156737407312708, + "grad_norm": 137.00543468421012, + "learning_rate": 5.715909090909091e-06, + "loss": 3.846, + "step": 2013 + }, + { + "epoch": 0.17165260376715247, + "grad_norm": 791.0839922166091, + "learning_rate": 5.718750000000001e-06, + "loss": 3.8321, + "step": 2014 + }, + { + "epoch": 0.17173783346117788, + "grad_norm": 63.52321855106094, + "learning_rate": 5.721590909090909e-06, + "loss": 3.7604, + "step": 2015 + }, + { + "epoch": 0.17182306315520327, + "grad_norm": 530.6766411205448, + "learning_rate": 5.724431818181818e-06, + "loss": 2.7884, + "step": 2016 + }, + { + "epoch": 0.17190829284922868, + "grad_norm": 64.17008295625051, + "learning_rate": 5.727272727272728e-06, + "loss": 3.3987, + "step": 2017 + }, + { + "epoch": 0.17199352254325406, + "grad_norm": 145.5735937075718, + "learning_rate": 5.730113636363637e-06, + "loss": 3.1573, + "step": 2018 + }, + { + "epoch": 0.17207875223727948, + "grad_norm": 44.48196138793298, + "learning_rate": 5.732954545454546e-06, + "loss": 2.9762, + "step": 2019 + }, + { + "epoch": 0.17216398193130486, + "grad_norm": 90.80788384836356, + "learning_rate": 5.735795454545455e-06, + "loss": 3.6144, + "step": 2020 + }, + { + "epoch": 0.17224921162533027, + "grad_norm": 82.8874587487279, + "learning_rate": 5.738636363636364e-06, + "loss": 3.0533, + "step": 2021 + }, + { + "epoch": 0.17233444131935566, + "grad_norm": 83.46284929565023, + "learning_rate": 5.741477272727272e-06, + "loss": 3.0323, + "step": 2022 + }, + { + "epoch": 0.17241967101338107, + "grad_norm": 118.16411946994722, + "learning_rate": 5.744318181818183e-06, + "loss": 3.4836, + "step": 2023 + }, + { + "epoch": 0.17250490070740646, + "grad_norm": 67.15165260528728, + "learning_rate": 5.747159090909091e-06, + "loss": 2.9874, + "step": 2024 + }, + { + "epoch": 0.17259013040143187, + "grad_norm": 99.36189885188256, + "learning_rate": 5.75e-06, + "loss": 3.6817, + "step": 2025 + }, + { + "epoch": 0.17267536009545725, + "grad_norm": 71.63614492518334, + "learning_rate": 5.75284090909091e-06, + "loss": 3.3335, + "step": 2026 + }, + { + "epoch": 0.17276058978948267, + "grad_norm": 94.41472067686254, + "learning_rate": 5.755681818181819e-06, + "loss": 3.3996, + "step": 2027 + }, + { + "epoch": 0.17284581948350805, + "grad_norm": 99.55447644983957, + "learning_rate": 5.758522727272728e-06, + "loss": 3.8562, + "step": 2028 + }, + { + "epoch": 0.17293104917753346, + "grad_norm": 46.88441822656839, + "learning_rate": 5.761363636363637e-06, + "loss": 2.7166, + "step": 2029 + }, + { + "epoch": 0.17301627887155885, + "grad_norm": 98.32270753930554, + "learning_rate": 5.7642045454545455e-06, + "loss": 3.7411, + "step": 2030 + }, + { + "epoch": 0.17310150856558426, + "grad_norm": 142.8966062977535, + "learning_rate": 5.7670454545454544e-06, + "loss": 2.6452, + "step": 2031 + }, + { + "epoch": 0.17318673825960965, + "grad_norm": 155.41690712353227, + "learning_rate": 5.769886363636364e-06, + "loss": 4.1411, + "step": 2032 + }, + { + "epoch": 0.17327196795363506, + "grad_norm": 258.0687076527562, + "learning_rate": 5.772727272727273e-06, + "loss": 4.373, + "step": 2033 + }, + { + "epoch": 0.17335719764766044, + "grad_norm": 147.8947194138643, + "learning_rate": 5.775568181818182e-06, + "loss": 4.3896, + "step": 2034 + }, + { + "epoch": 0.17344242734168586, + "grad_norm": 57.504662899773855, + "learning_rate": 5.778409090909092e-06, + "loss": 3.3557, + "step": 2035 + }, + { + "epoch": 0.17352765703571124, + "grad_norm": 137.57881085471766, + "learning_rate": 5.781250000000001e-06, + "loss": 4.0094, + "step": 2036 + }, + { + "epoch": 0.17361288672973665, + "grad_norm": 287.15518328364135, + "learning_rate": 5.784090909090909e-06, + "loss": 3.1709, + "step": 2037 + }, + { + "epoch": 0.17369811642376204, + "grad_norm": 74.40573482996209, + "learning_rate": 5.786931818181819e-06, + "loss": 3.1476, + "step": 2038 + }, + { + "epoch": 0.17378334611778742, + "grad_norm": 116.58100264508936, + "learning_rate": 5.7897727272727276e-06, + "loss": 3.7694, + "step": 2039 + }, + { + "epoch": 0.17386857581181284, + "grad_norm": 104.7172195224555, + "learning_rate": 5.7926136363636365e-06, + "loss": 2.9208, + "step": 2040 + }, + { + "epoch": 0.17395380550583822, + "grad_norm": 82.65230994771177, + "learning_rate": 5.795454545454546e-06, + "loss": 2.8104, + "step": 2041 + }, + { + "epoch": 0.17403903519986363, + "grad_norm": 91.47224611719147, + "learning_rate": 5.798295454545455e-06, + "loss": 3.8766, + "step": 2042 + }, + { + "epoch": 0.17412426489388902, + "grad_norm": 80.62051261948729, + "learning_rate": 5.801136363636364e-06, + "loss": 2.4581, + "step": 2043 + }, + { + "epoch": 0.17420949458791443, + "grad_norm": 84.38184819134779, + "learning_rate": 5.803977272727274e-06, + "loss": 3.328, + "step": 2044 + }, + { + "epoch": 0.17429472428193982, + "grad_norm": 172.4416771666494, + "learning_rate": 5.806818181818182e-06, + "loss": 3.0187, + "step": 2045 + }, + { + "epoch": 0.17437995397596523, + "grad_norm": 127.70398932205441, + "learning_rate": 5.809659090909091e-06, + "loss": 2.961, + "step": 2046 + }, + { + "epoch": 0.17446518366999061, + "grad_norm": 245.16121042660214, + "learning_rate": 5.812500000000001e-06, + "loss": 4.0717, + "step": 2047 + }, + { + "epoch": 0.17455041336401603, + "grad_norm": 101.66914839954597, + "learning_rate": 5.81534090909091e-06, + "loss": 4.218, + "step": 2048 + }, + { + "epoch": 0.1746356430580414, + "grad_norm": 55.1612344490449, + "learning_rate": 5.8181818181818185e-06, + "loss": 3.5541, + "step": 2049 + }, + { + "epoch": 0.17472087275206682, + "grad_norm": 40.180388299205966, + "learning_rate": 5.821022727272728e-06, + "loss": 1.5936, + "step": 2050 + }, + { + "epoch": 0.1748061024460922, + "grad_norm": 76.38265695798087, + "learning_rate": 5.823863636363637e-06, + "loss": 3.6771, + "step": 2051 + }, + { + "epoch": 0.17489133214011762, + "grad_norm": 124.39005761621983, + "learning_rate": 5.826704545454545e-06, + "loss": 3.6095, + "step": 2052 + }, + { + "epoch": 0.174976561834143, + "grad_norm": 115.19978661720974, + "learning_rate": 5.829545454545455e-06, + "loss": 2.838, + "step": 2053 + }, + { + "epoch": 0.17506179152816842, + "grad_norm": 59.43788803778266, + "learning_rate": 5.832386363636364e-06, + "loss": 3.5225, + "step": 2054 + }, + { + "epoch": 0.1751470212221938, + "grad_norm": 105.84974357920457, + "learning_rate": 5.835227272727273e-06, + "loss": 3.4397, + "step": 2055 + }, + { + "epoch": 0.17523225091621922, + "grad_norm": 119.83629645176715, + "learning_rate": 5.838068181818183e-06, + "loss": 3.9481, + "step": 2056 + }, + { + "epoch": 0.1753174806102446, + "grad_norm": 98.86348387128898, + "learning_rate": 5.840909090909092e-06, + "loss": 3.9272, + "step": 2057 + }, + { + "epoch": 0.17540271030427002, + "grad_norm": 105.04423732062313, + "learning_rate": 5.843750000000001e-06, + "loss": 3.8813, + "step": 2058 + }, + { + "epoch": 0.1754879399982954, + "grad_norm": 107.33420437649357, + "learning_rate": 5.84659090909091e-06, + "loss": 2.6132, + "step": 2059 + }, + { + "epoch": 0.1755731696923208, + "grad_norm": 125.865791388516, + "learning_rate": 5.8494318181818184e-06, + "loss": 2.8395, + "step": 2060 + }, + { + "epoch": 0.1756583993863462, + "grad_norm": 215.5896038360586, + "learning_rate": 5.852272727272727e-06, + "loss": 3.4946, + "step": 2061 + }, + { + "epoch": 0.1757436290803716, + "grad_norm": 191.86129386450068, + "learning_rate": 5.855113636363637e-06, + "loss": 3.4535, + "step": 2062 + }, + { + "epoch": 0.175828858774397, + "grad_norm": 86.97690384064089, + "learning_rate": 5.857954545454546e-06, + "loss": 2.9879, + "step": 2063 + }, + { + "epoch": 0.1759140884684224, + "grad_norm": 87.7257951102111, + "learning_rate": 5.860795454545455e-06, + "loss": 3.6125, + "step": 2064 + }, + { + "epoch": 0.1759993181624478, + "grad_norm": 103.76630741119257, + "learning_rate": 5.863636363636364e-06, + "loss": 3.2721, + "step": 2065 + }, + { + "epoch": 0.1760845478564732, + "grad_norm": 107.52315710040739, + "learning_rate": 5.866477272727274e-06, + "loss": 4.3121, + "step": 2066 + }, + { + "epoch": 0.1761697775504986, + "grad_norm": 110.62712334484958, + "learning_rate": 5.869318181818182e-06, + "loss": 3.0313, + "step": 2067 + }, + { + "epoch": 0.176255007244524, + "grad_norm": 65.74551825653822, + "learning_rate": 5.872159090909091e-06, + "loss": 3.5874, + "step": 2068 + }, + { + "epoch": 0.1763402369385494, + "grad_norm": 127.56115286783155, + "learning_rate": 5.8750000000000005e-06, + "loss": 3.5678, + "step": 2069 + }, + { + "epoch": 0.1764254666325748, + "grad_norm": 110.01935916850957, + "learning_rate": 5.8778409090909094e-06, + "loss": 4.7994, + "step": 2070 + }, + { + "epoch": 0.17651069632660019, + "grad_norm": 91.70360431718903, + "learning_rate": 5.880681818181818e-06, + "loss": 3.281, + "step": 2071 + }, + { + "epoch": 0.1765959260206256, + "grad_norm": 144.2588653986909, + "learning_rate": 5.883522727272728e-06, + "loss": 3.1606, + "step": 2072 + }, + { + "epoch": 0.17668115571465098, + "grad_norm": 57.7773885065342, + "learning_rate": 5.886363636363637e-06, + "loss": 2.9706, + "step": 2073 + }, + { + "epoch": 0.1767663854086764, + "grad_norm": 70.87633351953147, + "learning_rate": 5.889204545454545e-06, + "loss": 3.853, + "step": 2074 + }, + { + "epoch": 0.17685161510270178, + "grad_norm": 83.5560661496805, + "learning_rate": 5.892045454545455e-06, + "loss": 3.2564, + "step": 2075 + }, + { + "epoch": 0.17693684479672717, + "grad_norm": 83.32542651484248, + "learning_rate": 5.894886363636364e-06, + "loss": 3.1443, + "step": 2076 + }, + { + "epoch": 0.17702207449075258, + "grad_norm": 111.19808699174648, + "learning_rate": 5.897727272727273e-06, + "loss": 3.4711, + "step": 2077 + }, + { + "epoch": 0.17710730418477796, + "grad_norm": 91.6885030123419, + "learning_rate": 5.9005681818181826e-06, + "loss": 3.5624, + "step": 2078 + }, + { + "epoch": 0.17719253387880338, + "grad_norm": 109.36169676653545, + "learning_rate": 5.9034090909090915e-06, + "loss": 3.4865, + "step": 2079 + }, + { + "epoch": 0.17727776357282876, + "grad_norm": 114.55579478453735, + "learning_rate": 5.90625e-06, + "loss": 3.8473, + "step": 2080 + }, + { + "epoch": 0.17736299326685417, + "grad_norm": 77.37208619341675, + "learning_rate": 5.90909090909091e-06, + "loss": 1.8703, + "step": 2081 + }, + { + "epoch": 0.17744822296087956, + "grad_norm": 133.34794127608, + "learning_rate": 5.911931818181818e-06, + "loss": 3.6173, + "step": 2082 + }, + { + "epoch": 0.17753345265490497, + "grad_norm": 149.83420701702994, + "learning_rate": 5.914772727272727e-06, + "loss": 3.828, + "step": 2083 + }, + { + "epoch": 0.17761868234893036, + "grad_norm": 104.66420391555503, + "learning_rate": 5.917613636363637e-06, + "loss": 4.0314, + "step": 2084 + }, + { + "epoch": 0.17770391204295577, + "grad_norm": 113.79498337393665, + "learning_rate": 5.920454545454546e-06, + "loss": 3.8749, + "step": 2085 + }, + { + "epoch": 0.17778914173698115, + "grad_norm": 87.14416721159894, + "learning_rate": 5.923295454545455e-06, + "loss": 2.9816, + "step": 2086 + }, + { + "epoch": 0.17787437143100657, + "grad_norm": 117.2712911103091, + "learning_rate": 5.926136363636365e-06, + "loss": 4.2248, + "step": 2087 + }, + { + "epoch": 0.17795960112503195, + "grad_norm": 86.38957948490742, + "learning_rate": 5.9289772727272735e-06, + "loss": 4.0067, + "step": 2088 + }, + { + "epoch": 0.17804483081905736, + "grad_norm": 76.20397326464385, + "learning_rate": 5.931818181818182e-06, + "loss": 3.3343, + "step": 2089 + }, + { + "epoch": 0.17813006051308275, + "grad_norm": 105.04380383352597, + "learning_rate": 5.934659090909091e-06, + "loss": 2.8353, + "step": 2090 + }, + { + "epoch": 0.17821529020710816, + "grad_norm": 123.12648735233907, + "learning_rate": 5.9375e-06, + "loss": 3.4602, + "step": 2091 + }, + { + "epoch": 0.17830051990113355, + "grad_norm": 76.62312341990545, + "learning_rate": 5.940340909090909e-06, + "loss": 3.0814, + "step": 2092 + }, + { + "epoch": 0.17838574959515896, + "grad_norm": 232.49910989581326, + "learning_rate": 5.943181818181819e-06, + "loss": 3.5861, + "step": 2093 + }, + { + "epoch": 0.17847097928918434, + "grad_norm": 162.18984399766512, + "learning_rate": 5.946022727272728e-06, + "loss": 2.8095, + "step": 2094 + }, + { + "epoch": 0.17855620898320976, + "grad_norm": 138.67627432376003, + "learning_rate": 5.948863636363637e-06, + "loss": 3.6416, + "step": 2095 + }, + { + "epoch": 0.17864143867723514, + "grad_norm": 84.73340701753422, + "learning_rate": 5.951704545454547e-06, + "loss": 3.7917, + "step": 2096 + }, + { + "epoch": 0.17872666837126056, + "grad_norm": 90.92617773357084, + "learning_rate": 5.954545454545455e-06, + "loss": 3.7256, + "step": 2097 + }, + { + "epoch": 0.17881189806528594, + "grad_norm": 76.42205186851035, + "learning_rate": 5.957386363636364e-06, + "loss": 2.7761, + "step": 2098 + }, + { + "epoch": 0.17889712775931135, + "grad_norm": 279.66418697257706, + "learning_rate": 5.9602272727272735e-06, + "loss": 4.4716, + "step": 2099 + }, + { + "epoch": 0.17898235745333674, + "grad_norm": 64.68435764300787, + "learning_rate": 5.963068181818182e-06, + "loss": 2.848, + "step": 2100 + }, + { + "epoch": 0.17906758714736215, + "grad_norm": 92.63440939892428, + "learning_rate": 5.965909090909091e-06, + "loss": 3.5569, + "step": 2101 + }, + { + "epoch": 0.17915281684138754, + "grad_norm": 178.20365759859496, + "learning_rate": 5.968750000000001e-06, + "loss": 4.2459, + "step": 2102 + }, + { + "epoch": 0.17923804653541295, + "grad_norm": 243.71489027812316, + "learning_rate": 5.97159090909091e-06, + "loss": 4.657, + "step": 2103 + }, + { + "epoch": 0.17932327622943833, + "grad_norm": 80.22502936228287, + "learning_rate": 5.974431818181818e-06, + "loss": 3.1094, + "step": 2104 + }, + { + "epoch": 0.17940850592346375, + "grad_norm": 204.35395325695535, + "learning_rate": 5.977272727272728e-06, + "loss": 3.926, + "step": 2105 + }, + { + "epoch": 0.17949373561748913, + "grad_norm": 65.66833346679877, + "learning_rate": 5.980113636363637e-06, + "loss": 3.3261, + "step": 2106 + }, + { + "epoch": 0.17957896531151454, + "grad_norm": 132.86978377676803, + "learning_rate": 5.982954545454546e-06, + "loss": 3.0857, + "step": 2107 + }, + { + "epoch": 0.17966419500553993, + "grad_norm": 148.7654792387899, + "learning_rate": 5.9857954545454555e-06, + "loss": 3.4817, + "step": 2108 + }, + { + "epoch": 0.17974942469956534, + "grad_norm": 105.37157788890734, + "learning_rate": 5.9886363636363644e-06, + "loss": 3.533, + "step": 2109 + }, + { + "epoch": 0.17983465439359073, + "grad_norm": 138.21964759271685, + "learning_rate": 5.991477272727273e-06, + "loss": 3.2262, + "step": 2110 + }, + { + "epoch": 0.17991988408761614, + "grad_norm": 67.68646355704742, + "learning_rate": 5.994318181818183e-06, + "loss": 3.1154, + "step": 2111 + }, + { + "epoch": 0.18000511378164152, + "grad_norm": 163.38986914501092, + "learning_rate": 5.997159090909091e-06, + "loss": 4.0644, + "step": 2112 + }, + { + "epoch": 0.1800903434756669, + "grad_norm": 84.63018954299041, + "learning_rate": 6e-06, + "loss": 3.4855, + "step": 2113 + }, + { + "epoch": 0.18017557316969232, + "grad_norm": 55.70262592785156, + "learning_rate": 6.00284090909091e-06, + "loss": 2.7551, + "step": 2114 + }, + { + "epoch": 0.1802608028637177, + "grad_norm": 70.01311527763117, + "learning_rate": 6.005681818181819e-06, + "loss": 3.0442, + "step": 2115 + }, + { + "epoch": 0.18034603255774312, + "grad_norm": 221.44227093805435, + "learning_rate": 6.008522727272728e-06, + "loss": 5.7601, + "step": 2116 + }, + { + "epoch": 0.1804312622517685, + "grad_norm": 123.18402462595769, + "learning_rate": 6.0113636363636376e-06, + "loss": 3.8804, + "step": 2117 + }, + { + "epoch": 0.18051649194579392, + "grad_norm": 167.20666798236647, + "learning_rate": 6.0142045454545465e-06, + "loss": 3.7256, + "step": 2118 + }, + { + "epoch": 0.1806017216398193, + "grad_norm": 91.04664964049653, + "learning_rate": 6.0170454545454546e-06, + "loss": 4.1175, + "step": 2119 + }, + { + "epoch": 0.18068695133384471, + "grad_norm": 125.43715667532312, + "learning_rate": 6.0198863636363635e-06, + "loss": 3.2288, + "step": 2120 + }, + { + "epoch": 0.1807721810278701, + "grad_norm": 118.60211572449492, + "learning_rate": 6.022727272727273e-06, + "loss": 4.217, + "step": 2121 + }, + { + "epoch": 0.1808574107218955, + "grad_norm": 65.27016966597256, + "learning_rate": 6.025568181818182e-06, + "loss": 3.2348, + "step": 2122 + }, + { + "epoch": 0.1809426404159209, + "grad_norm": 155.32695273526465, + "learning_rate": 6.028409090909091e-06, + "loss": 4.0307, + "step": 2123 + }, + { + "epoch": 0.1810278701099463, + "grad_norm": 64.80502827875894, + "learning_rate": 6.031250000000001e-06, + "loss": 2.2672, + "step": 2124 + }, + { + "epoch": 0.1811130998039717, + "grad_norm": 180.33974270342154, + "learning_rate": 6.03409090909091e-06, + "loss": 3.9142, + "step": 2125 + }, + { + "epoch": 0.1811983294979971, + "grad_norm": 88.27952617177905, + "learning_rate": 6.036931818181818e-06, + "loss": 3.6935, + "step": 2126 + }, + { + "epoch": 0.1812835591920225, + "grad_norm": 102.55975602564628, + "learning_rate": 6.039772727272728e-06, + "loss": 3.4739, + "step": 2127 + }, + { + "epoch": 0.1813687888860479, + "grad_norm": 70.73590077860045, + "learning_rate": 6.042613636363637e-06, + "loss": 2.9469, + "step": 2128 + }, + { + "epoch": 0.1814540185800733, + "grad_norm": 95.68684064845147, + "learning_rate": 6.0454545454545456e-06, + "loss": 3.6425, + "step": 2129 + }, + { + "epoch": 0.1815392482740987, + "grad_norm": 62.41786589953618, + "learning_rate": 6.048295454545455e-06, + "loss": 3.4575, + "step": 2130 + }, + { + "epoch": 0.1816244779681241, + "grad_norm": 74.98507267779439, + "learning_rate": 6.051136363636364e-06, + "loss": 3.1296, + "step": 2131 + }, + { + "epoch": 0.1817097076621495, + "grad_norm": 66.05278418947228, + "learning_rate": 6.053977272727272e-06, + "loss": 3.4366, + "step": 2132 + }, + { + "epoch": 0.18179493735617488, + "grad_norm": 101.36027626783289, + "learning_rate": 6.056818181818183e-06, + "loss": 3.599, + "step": 2133 + }, + { + "epoch": 0.1818801670502003, + "grad_norm": 83.20424023676141, + "learning_rate": 6.059659090909091e-06, + "loss": 3.1726, + "step": 2134 + }, + { + "epoch": 0.18196539674422568, + "grad_norm": 80.83466371140926, + "learning_rate": 6.0625e-06, + "loss": 3.0616, + "step": 2135 + }, + { + "epoch": 0.1820506264382511, + "grad_norm": 70.05167780640998, + "learning_rate": 6.06534090909091e-06, + "loss": 3.265, + "step": 2136 + }, + { + "epoch": 0.18213585613227648, + "grad_norm": 140.6662863413854, + "learning_rate": 6.068181818181819e-06, + "loss": 3.9357, + "step": 2137 + }, + { + "epoch": 0.1822210858263019, + "grad_norm": 94.74780179667842, + "learning_rate": 6.071022727272728e-06, + "loss": 3.06, + "step": 2138 + }, + { + "epoch": 0.18230631552032728, + "grad_norm": 122.33799532468318, + "learning_rate": 6.073863636363637e-06, + "loss": 2.9332, + "step": 2139 + }, + { + "epoch": 0.1823915452143527, + "grad_norm": 104.33815583150891, + "learning_rate": 6.076704545454546e-06, + "loss": 3.9721, + "step": 2140 + }, + { + "epoch": 0.18247677490837808, + "grad_norm": 91.89870756915165, + "learning_rate": 6.079545454545454e-06, + "loss": 3.7846, + "step": 2141 + }, + { + "epoch": 0.1825620046024035, + "grad_norm": 75.6517984353422, + "learning_rate": 6.082386363636364e-06, + "loss": 3.2601, + "step": 2142 + }, + { + "epoch": 0.18264723429642887, + "grad_norm": 118.78838160216561, + "learning_rate": 6.085227272727273e-06, + "loss": 3.8469, + "step": 2143 + }, + { + "epoch": 0.18273246399045429, + "grad_norm": 129.14358017523446, + "learning_rate": 6.088068181818182e-06, + "loss": 3.4172, + "step": 2144 + }, + { + "epoch": 0.18281769368447967, + "grad_norm": 67.93565986626959, + "learning_rate": 6.090909090909092e-06, + "loss": 2.73, + "step": 2145 + }, + { + "epoch": 0.18290292337850508, + "grad_norm": 95.54070052166763, + "learning_rate": 6.093750000000001e-06, + "loss": 3.5823, + "step": 2146 + }, + { + "epoch": 0.18298815307253047, + "grad_norm": 101.73636109279941, + "learning_rate": 6.096590909090909e-06, + "loss": 3.8246, + "step": 2147 + }, + { + "epoch": 0.18307338276655588, + "grad_norm": 53.7288159731596, + "learning_rate": 6.0994318181818194e-06, + "loss": 3.3898, + "step": 2148 + }, + { + "epoch": 0.18315861246058127, + "grad_norm": 171.202030704218, + "learning_rate": 6.1022727272727275e-06, + "loss": 2.643, + "step": 2149 + }, + { + "epoch": 0.18324384215460668, + "grad_norm": 131.63749020427727, + "learning_rate": 6.1051136363636365e-06, + "loss": 3.9355, + "step": 2150 + }, + { + "epoch": 0.18332907184863206, + "grad_norm": 112.62715771376722, + "learning_rate": 6.107954545454546e-06, + "loss": 3.8409, + "step": 2151 + }, + { + "epoch": 0.18341430154265745, + "grad_norm": 73.66815770593641, + "learning_rate": 6.110795454545455e-06, + "loss": 2.6064, + "step": 2152 + }, + { + "epoch": 0.18349953123668286, + "grad_norm": 95.21621020930081, + "learning_rate": 6.113636363636364e-06, + "loss": 3.056, + "step": 2153 + }, + { + "epoch": 0.18358476093070825, + "grad_norm": 79.16838001517144, + "learning_rate": 6.116477272727274e-06, + "loss": 3.6293, + "step": 2154 + }, + { + "epoch": 0.18366999062473366, + "grad_norm": 65.3115426410358, + "learning_rate": 6.119318181818183e-06, + "loss": 2.6601, + "step": 2155 + }, + { + "epoch": 0.18375522031875904, + "grad_norm": 69.36845194287416, + "learning_rate": 6.122159090909091e-06, + "loss": 3.306, + "step": 2156 + }, + { + "epoch": 0.18384045001278446, + "grad_norm": 63.65790569245756, + "learning_rate": 6.125000000000001e-06, + "loss": 2.4379, + "step": 2157 + }, + { + "epoch": 0.18392567970680984, + "grad_norm": 55.30772994130881, + "learning_rate": 6.12784090909091e-06, + "loss": 3.0914, + "step": 2158 + }, + { + "epoch": 0.18401090940083525, + "grad_norm": 93.49086625399737, + "learning_rate": 6.1306818181818185e-06, + "loss": 4.3984, + "step": 2159 + }, + { + "epoch": 0.18409613909486064, + "grad_norm": 89.46624566548869, + "learning_rate": 6.133522727272728e-06, + "loss": 3.5936, + "step": 2160 + }, + { + "epoch": 0.18418136878888605, + "grad_norm": 89.77035803148934, + "learning_rate": 6.136363636363637e-06, + "loss": 3.1941, + "step": 2161 + }, + { + "epoch": 0.18426659848291144, + "grad_norm": 180.4544545520886, + "learning_rate": 6.139204545454545e-06, + "loss": 4.9302, + "step": 2162 + }, + { + "epoch": 0.18435182817693685, + "grad_norm": 75.72445093569145, + "learning_rate": 6.142045454545456e-06, + "loss": 3.1661, + "step": 2163 + }, + { + "epoch": 0.18443705787096223, + "grad_norm": 77.27191612324629, + "learning_rate": 6.144886363636364e-06, + "loss": 3.4004, + "step": 2164 + }, + { + "epoch": 0.18452228756498765, + "grad_norm": 85.89802078711196, + "learning_rate": 6.147727272727273e-06, + "loss": 2.3471, + "step": 2165 + }, + { + "epoch": 0.18460751725901303, + "grad_norm": 351.0312110909898, + "learning_rate": 6.150568181818183e-06, + "loss": 4.2509, + "step": 2166 + }, + { + "epoch": 0.18469274695303844, + "grad_norm": 112.95438982513382, + "learning_rate": 6.153409090909092e-06, + "loss": 3.629, + "step": 2167 + }, + { + "epoch": 0.18477797664706383, + "grad_norm": 59.867375126405506, + "learning_rate": 6.1562500000000006e-06, + "loss": 3.3594, + "step": 2168 + }, + { + "epoch": 0.18486320634108924, + "grad_norm": 76.77255144739573, + "learning_rate": 6.15909090909091e-06, + "loss": 3.8044, + "step": 2169 + }, + { + "epoch": 0.18494843603511463, + "grad_norm": 79.905489121039, + "learning_rate": 6.161931818181819e-06, + "loss": 3.2818, + "step": 2170 + }, + { + "epoch": 0.18503366572914004, + "grad_norm": 149.48083937785302, + "learning_rate": 6.164772727272727e-06, + "loss": 4.0291, + "step": 2171 + }, + { + "epoch": 0.18511889542316542, + "grad_norm": 69.79620085274894, + "learning_rate": 6.167613636363637e-06, + "loss": 3.8514, + "step": 2172 + }, + { + "epoch": 0.18520412511719084, + "grad_norm": 83.64584981165748, + "learning_rate": 6.170454545454546e-06, + "loss": 2.5636, + "step": 2173 + }, + { + "epoch": 0.18528935481121622, + "grad_norm": 154.39884748047237, + "learning_rate": 6.173295454545455e-06, + "loss": 5.1604, + "step": 2174 + }, + { + "epoch": 0.18537458450524164, + "grad_norm": 74.23657403416496, + "learning_rate": 6.176136363636364e-06, + "loss": 3.4315, + "step": 2175 + }, + { + "epoch": 0.18545981419926702, + "grad_norm": 133.01753134064407, + "learning_rate": 6.178977272727274e-06, + "loss": 3.7548, + "step": 2176 + }, + { + "epoch": 0.18554504389329243, + "grad_norm": 111.01288275296449, + "learning_rate": 6.181818181818182e-06, + "loss": 3.6851, + "step": 2177 + }, + { + "epoch": 0.18563027358731782, + "grad_norm": 166.8367095134269, + "learning_rate": 6.184659090909091e-06, + "loss": 3.3748, + "step": 2178 + }, + { + "epoch": 0.18571550328134323, + "grad_norm": 71.97001292072518, + "learning_rate": 6.1875000000000005e-06, + "loss": 3.5952, + "step": 2179 + }, + { + "epoch": 0.18580073297536862, + "grad_norm": 157.54531846085047, + "learning_rate": 6.190340909090909e-06, + "loss": 3.278, + "step": 2180 + }, + { + "epoch": 0.18588596266939403, + "grad_norm": 101.1345096613944, + "learning_rate": 6.193181818181818e-06, + "loss": 3.5178, + "step": 2181 + }, + { + "epoch": 0.1859711923634194, + "grad_norm": 117.54448275673253, + "learning_rate": 6.196022727272728e-06, + "loss": 4.5492, + "step": 2182 + }, + { + "epoch": 0.18605642205744483, + "grad_norm": 50.30663761573239, + "learning_rate": 6.198863636363637e-06, + "loss": 3.3903, + "step": 2183 + }, + { + "epoch": 0.1861416517514702, + "grad_norm": 127.97010826279171, + "learning_rate": 6.201704545454545e-06, + "loss": 4.2459, + "step": 2184 + }, + { + "epoch": 0.18622688144549562, + "grad_norm": 110.81895745801602, + "learning_rate": 6.204545454545455e-06, + "loss": 3.2299, + "step": 2185 + }, + { + "epoch": 0.186312111139521, + "grad_norm": 55.516723876379416, + "learning_rate": 6.207386363636364e-06, + "loss": 3.6588, + "step": 2186 + }, + { + "epoch": 0.18639734083354642, + "grad_norm": 56.77355214584069, + "learning_rate": 6.210227272727273e-06, + "loss": 3.3551, + "step": 2187 + }, + { + "epoch": 0.1864825705275718, + "grad_norm": 104.7255413120695, + "learning_rate": 6.2130681818181825e-06, + "loss": 3.7466, + "step": 2188 + }, + { + "epoch": 0.1865678002215972, + "grad_norm": 198.29717117188474, + "learning_rate": 6.2159090909090915e-06, + "loss": 4.9458, + "step": 2189 + }, + { + "epoch": 0.1866530299156226, + "grad_norm": 68.0032403668256, + "learning_rate": 6.21875e-06, + "loss": 3.4568, + "step": 2190 + }, + { + "epoch": 0.186738259609648, + "grad_norm": 157.10738657023373, + "learning_rate": 6.22159090909091e-06, + "loss": 3.875, + "step": 2191 + }, + { + "epoch": 0.1868234893036734, + "grad_norm": 105.1390950450694, + "learning_rate": 6.224431818181818e-06, + "loss": 2.9166, + "step": 2192 + }, + { + "epoch": 0.1869087189976988, + "grad_norm": 139.5631623898991, + "learning_rate": 6.227272727272727e-06, + "loss": 4.1318, + "step": 2193 + }, + { + "epoch": 0.1869939486917242, + "grad_norm": 113.99603433470494, + "learning_rate": 6.230113636363637e-06, + "loss": 2.6322, + "step": 2194 + }, + { + "epoch": 0.18707917838574958, + "grad_norm": 257.91378654472965, + "learning_rate": 6.232954545454546e-06, + "loss": 3.8249, + "step": 2195 + }, + { + "epoch": 0.187164408079775, + "grad_norm": 225.1171778999779, + "learning_rate": 6.235795454545455e-06, + "loss": 4.0284, + "step": 2196 + }, + { + "epoch": 0.18724963777380038, + "grad_norm": 78.2606988139878, + "learning_rate": 6.238636363636365e-06, + "loss": 3.1612, + "step": 2197 + }, + { + "epoch": 0.1873348674678258, + "grad_norm": 88.11237789460289, + "learning_rate": 6.2414772727272735e-06, + "loss": 4.0365, + "step": 2198 + }, + { + "epoch": 0.18742009716185118, + "grad_norm": 227.22393978033284, + "learning_rate": 6.244318181818182e-06, + "loss": 4.5589, + "step": 2199 + }, + { + "epoch": 0.1875053268558766, + "grad_norm": 85.33223241451206, + "learning_rate": 6.247159090909091e-06, + "loss": 2.9459, + "step": 2200 + }, + { + "epoch": 0.18759055654990198, + "grad_norm": 135.4767378609759, + "learning_rate": 6.25e-06, + "loss": 3.984, + "step": 2201 + }, + { + "epoch": 0.1876757862439274, + "grad_norm": 125.50571340240238, + "learning_rate": 6.252840909090909e-06, + "loss": 4.5887, + "step": 2202 + }, + { + "epoch": 0.18776101593795277, + "grad_norm": 51.716512563924155, + "learning_rate": 6.255681818181819e-06, + "loss": 3.1976, + "step": 2203 + }, + { + "epoch": 0.1878462456319782, + "grad_norm": 63.48708282814194, + "learning_rate": 6.258522727272728e-06, + "loss": 3.5735, + "step": 2204 + }, + { + "epoch": 0.18793147532600357, + "grad_norm": 65.95411061079588, + "learning_rate": 6.261363636363637e-06, + "loss": 3.2447, + "step": 2205 + }, + { + "epoch": 0.18801670502002898, + "grad_norm": 50.38222774920616, + "learning_rate": 6.264204545454547e-06, + "loss": 2.7906, + "step": 2206 + }, + { + "epoch": 0.18810193471405437, + "grad_norm": 238.0467688136721, + "learning_rate": 6.267045454545455e-06, + "loss": 4.5883, + "step": 2207 + }, + { + "epoch": 0.18818716440807978, + "grad_norm": 87.41013542450189, + "learning_rate": 6.269886363636364e-06, + "loss": 3.5304, + "step": 2208 + }, + { + "epoch": 0.18827239410210517, + "grad_norm": 68.28305495910288, + "learning_rate": 6.2727272727272734e-06, + "loss": 2.803, + "step": 2209 + }, + { + "epoch": 0.18835762379613058, + "grad_norm": 81.66659524101694, + "learning_rate": 6.275568181818182e-06, + "loss": 3.4273, + "step": 2210 + }, + { + "epoch": 0.18844285349015596, + "grad_norm": 116.19809401238288, + "learning_rate": 6.278409090909091e-06, + "loss": 3.9523, + "step": 2211 + }, + { + "epoch": 0.18852808318418138, + "grad_norm": 243.7272901980601, + "learning_rate": 6.281250000000001e-06, + "loss": 4.4293, + "step": 2212 + }, + { + "epoch": 0.18861331287820676, + "grad_norm": 134.43422384437287, + "learning_rate": 6.28409090909091e-06, + "loss": 3.5246, + "step": 2213 + }, + { + "epoch": 0.18869854257223218, + "grad_norm": 84.97773860810291, + "learning_rate": 6.286931818181818e-06, + "loss": 2.9845, + "step": 2214 + }, + { + "epoch": 0.18878377226625756, + "grad_norm": 74.57643667389767, + "learning_rate": 6.289772727272728e-06, + "loss": 3.2546, + "step": 2215 + }, + { + "epoch": 0.18886900196028297, + "grad_norm": 374.78456847446125, + "learning_rate": 6.292613636363637e-06, + "loss": 4.0737, + "step": 2216 + }, + { + "epoch": 0.18895423165430836, + "grad_norm": 154.63408746897244, + "learning_rate": 6.295454545454546e-06, + "loss": 3.158, + "step": 2217 + }, + { + "epoch": 0.18903946134833377, + "grad_norm": 102.59310012609973, + "learning_rate": 6.2982954545454555e-06, + "loss": 4.0074, + "step": 2218 + }, + { + "epoch": 0.18912469104235916, + "grad_norm": 71.724722380757, + "learning_rate": 6.301136363636364e-06, + "loss": 3.5863, + "step": 2219 + }, + { + "epoch": 0.18920992073638457, + "grad_norm": 72.98647291601021, + "learning_rate": 6.303977272727273e-06, + "loss": 3.2241, + "step": 2220 + }, + { + "epoch": 0.18929515043040995, + "grad_norm": 153.52756197449384, + "learning_rate": 6.306818181818183e-06, + "loss": 3.3814, + "step": 2221 + }, + { + "epoch": 0.18938038012443537, + "grad_norm": 108.02334100991281, + "learning_rate": 6.309659090909091e-06, + "loss": 4.1091, + "step": 2222 + }, + { + "epoch": 0.18946560981846075, + "grad_norm": 102.34222146431411, + "learning_rate": 6.3125e-06, + "loss": 4.046, + "step": 2223 + }, + { + "epoch": 0.18955083951248616, + "grad_norm": 124.52377194905708, + "learning_rate": 6.31534090909091e-06, + "loss": 3.5239, + "step": 2224 + }, + { + "epoch": 0.18963606920651155, + "grad_norm": 91.28987635224708, + "learning_rate": 6.318181818181819e-06, + "loss": 3.1475, + "step": 2225 + }, + { + "epoch": 0.18972129890053693, + "grad_norm": 88.38312562050406, + "learning_rate": 6.321022727272728e-06, + "loss": 4.1269, + "step": 2226 + }, + { + "epoch": 0.18980652859456235, + "grad_norm": 164.6595818320549, + "learning_rate": 6.3238636363636375e-06, + "loss": 3.6832, + "step": 2227 + }, + { + "epoch": 0.18989175828858773, + "grad_norm": 109.16044999891544, + "learning_rate": 6.3267045454545465e-06, + "loss": 3.764, + "step": 2228 + }, + { + "epoch": 0.18997698798261314, + "grad_norm": 152.99828370572104, + "learning_rate": 6.3295454545454545e-06, + "loss": 4.4069, + "step": 2229 + }, + { + "epoch": 0.19006221767663853, + "grad_norm": 96.08988187567999, + "learning_rate": 6.3323863636363635e-06, + "loss": 3.3652, + "step": 2230 + }, + { + "epoch": 0.19014744737066394, + "grad_norm": 109.58567886612707, + "learning_rate": 6.335227272727273e-06, + "loss": 3.6796, + "step": 2231 + }, + { + "epoch": 0.19023267706468933, + "grad_norm": 178.3711320432844, + "learning_rate": 6.338068181818182e-06, + "loss": 3.6708, + "step": 2232 + }, + { + "epoch": 0.19031790675871474, + "grad_norm": 384.0652282810376, + "learning_rate": 6.340909090909091e-06, + "loss": 3.1737, + "step": 2233 + }, + { + "epoch": 0.19040313645274012, + "grad_norm": 105.63142743752151, + "learning_rate": 6.343750000000001e-06, + "loss": 3.2756, + "step": 2234 + }, + { + "epoch": 0.19048836614676554, + "grad_norm": 90.28694697741592, + "learning_rate": 6.34659090909091e-06, + "loss": 3.7698, + "step": 2235 + }, + { + "epoch": 0.19057359584079092, + "grad_norm": 121.29757520468519, + "learning_rate": 6.349431818181818e-06, + "loss": 2.9945, + "step": 2236 + }, + { + "epoch": 0.19065882553481633, + "grad_norm": 113.40739787573048, + "learning_rate": 6.352272727272728e-06, + "loss": 3.2129, + "step": 2237 + }, + { + "epoch": 0.19074405522884172, + "grad_norm": 42.672795267120634, + "learning_rate": 6.355113636363637e-06, + "loss": 2.9431, + "step": 2238 + }, + { + "epoch": 0.19082928492286713, + "grad_norm": 120.48891633178775, + "learning_rate": 6.3579545454545455e-06, + "loss": 4.1392, + "step": 2239 + }, + { + "epoch": 0.19091451461689252, + "grad_norm": 158.67216789298132, + "learning_rate": 6.360795454545455e-06, + "loss": 4.1877, + "step": 2240 + }, + { + "epoch": 0.19099974431091793, + "grad_norm": 94.73078134161707, + "learning_rate": 6.363636363636364e-06, + "loss": 3.5095, + "step": 2241 + }, + { + "epoch": 0.19108497400494331, + "grad_norm": 250.0614334035192, + "learning_rate": 6.366477272727273e-06, + "loss": 4.8857, + "step": 2242 + }, + { + "epoch": 0.19117020369896873, + "grad_norm": 176.2472310485101, + "learning_rate": 6.369318181818183e-06, + "loss": 3.1678, + "step": 2243 + }, + { + "epoch": 0.1912554333929941, + "grad_norm": 103.9245003591287, + "learning_rate": 6.372159090909091e-06, + "loss": 3.5492, + "step": 2244 + }, + { + "epoch": 0.19134066308701952, + "grad_norm": 110.78342684417849, + "learning_rate": 6.375e-06, + "loss": 4.7692, + "step": 2245 + }, + { + "epoch": 0.1914258927810449, + "grad_norm": 68.02858237647368, + "learning_rate": 6.37784090909091e-06, + "loss": 3.4782, + "step": 2246 + }, + { + "epoch": 0.19151112247507032, + "grad_norm": 131.10068342166488, + "learning_rate": 6.380681818181819e-06, + "loss": 4.6015, + "step": 2247 + }, + { + "epoch": 0.1915963521690957, + "grad_norm": 118.72634843031327, + "learning_rate": 6.383522727272728e-06, + "loss": 3.86, + "step": 2248 + }, + { + "epoch": 0.19168158186312112, + "grad_norm": 87.7943228447823, + "learning_rate": 6.386363636363637e-06, + "loss": 3.3929, + "step": 2249 + }, + { + "epoch": 0.1917668115571465, + "grad_norm": 90.8378499310141, + "learning_rate": 6.389204545454546e-06, + "loss": 2.7648, + "step": 2250 + }, + { + "epoch": 0.19185204125117192, + "grad_norm": 99.33258478408884, + "learning_rate": 6.392045454545454e-06, + "loss": 3.433, + "step": 2251 + }, + { + "epoch": 0.1919372709451973, + "grad_norm": 79.15180373389971, + "learning_rate": 6.394886363636364e-06, + "loss": 3.7454, + "step": 2252 + }, + { + "epoch": 0.19202250063922272, + "grad_norm": 71.98531719419822, + "learning_rate": 6.397727272727273e-06, + "loss": 3.4997, + "step": 2253 + }, + { + "epoch": 0.1921077303332481, + "grad_norm": 81.22092218200709, + "learning_rate": 6.400568181818182e-06, + "loss": 1.9461, + "step": 2254 + }, + { + "epoch": 0.1921929600272735, + "grad_norm": 166.803167268248, + "learning_rate": 6.403409090909092e-06, + "loss": 3.6484, + "step": 2255 + }, + { + "epoch": 0.1922781897212989, + "grad_norm": 119.20064861346059, + "learning_rate": 6.406250000000001e-06, + "loss": 3.6896, + "step": 2256 + }, + { + "epoch": 0.1923634194153243, + "grad_norm": 89.08405215703674, + "learning_rate": 6.40909090909091e-06, + "loss": 4.0158, + "step": 2257 + }, + { + "epoch": 0.1924486491093497, + "grad_norm": 58.78188296079241, + "learning_rate": 6.411931818181819e-06, + "loss": 3.3528, + "step": 2258 + }, + { + "epoch": 0.1925338788033751, + "grad_norm": 91.82189468376326, + "learning_rate": 6.4147727272727275e-06, + "loss": 3.8508, + "step": 2259 + }, + { + "epoch": 0.1926191084974005, + "grad_norm": 90.12873200957316, + "learning_rate": 6.4176136363636364e-06, + "loss": 3.0472, + "step": 2260 + }, + { + "epoch": 0.1927043381914259, + "grad_norm": 94.01177384074134, + "learning_rate": 6.420454545454546e-06, + "loss": 3.457, + "step": 2261 + }, + { + "epoch": 0.1927895678854513, + "grad_norm": 137.25147316075532, + "learning_rate": 6.423295454545455e-06, + "loss": 4.4935, + "step": 2262 + }, + { + "epoch": 0.1928747975794767, + "grad_norm": 173.7465873543221, + "learning_rate": 6.426136363636364e-06, + "loss": 3.8887, + "step": 2263 + }, + { + "epoch": 0.1929600272735021, + "grad_norm": 63.584820244940474, + "learning_rate": 6.428977272727274e-06, + "loss": 3.5553, + "step": 2264 + }, + { + "epoch": 0.19304525696752747, + "grad_norm": 87.02918232691103, + "learning_rate": 6.431818181818183e-06, + "loss": 2.521, + "step": 2265 + }, + { + "epoch": 0.19313048666155289, + "grad_norm": 85.79953169825922, + "learning_rate": 6.434659090909091e-06, + "loss": 3.4646, + "step": 2266 + }, + { + "epoch": 0.19321571635557827, + "grad_norm": 66.88417079309, + "learning_rate": 6.437500000000001e-06, + "loss": 2.8907, + "step": 2267 + }, + { + "epoch": 0.19330094604960368, + "grad_norm": 56.812995702577666, + "learning_rate": 6.4403409090909096e-06, + "loss": 3.2487, + "step": 2268 + }, + { + "epoch": 0.19338617574362907, + "grad_norm": 83.63315694872358, + "learning_rate": 6.4431818181818185e-06, + "loss": 3.0294, + "step": 2269 + }, + { + "epoch": 0.19347140543765448, + "grad_norm": 169.68272189572588, + "learning_rate": 6.446022727272728e-06, + "loss": 3.1208, + "step": 2270 + }, + { + "epoch": 0.19355663513167987, + "grad_norm": 72.70251540429553, + "learning_rate": 6.448863636363637e-06, + "loss": 3.7973, + "step": 2271 + }, + { + "epoch": 0.19364186482570528, + "grad_norm": 77.639710906979, + "learning_rate": 6.451704545454546e-06, + "loss": 3.6787, + "step": 2272 + }, + { + "epoch": 0.19372709451973066, + "grad_norm": 116.7715169651414, + "learning_rate": 6.454545454545456e-06, + "loss": 3.9653, + "step": 2273 + }, + { + "epoch": 0.19381232421375608, + "grad_norm": 87.25400491662593, + "learning_rate": 6.457386363636364e-06, + "loss": 3.4503, + "step": 2274 + }, + { + "epoch": 0.19389755390778146, + "grad_norm": 234.7122461316696, + "learning_rate": 6.460227272727273e-06, + "loss": 4.4592, + "step": 2275 + }, + { + "epoch": 0.19398278360180687, + "grad_norm": 117.29876216687668, + "learning_rate": 6.463068181818183e-06, + "loss": 3.3747, + "step": 2276 + }, + { + "epoch": 0.19406801329583226, + "grad_norm": 92.51731483882745, + "learning_rate": 6.465909090909092e-06, + "loss": 3.0711, + "step": 2277 + }, + { + "epoch": 0.19415324298985767, + "grad_norm": 95.68804007649487, + "learning_rate": 6.4687500000000005e-06, + "loss": 4.6188, + "step": 2278 + }, + { + "epoch": 0.19423847268388306, + "grad_norm": 144.03767548618495, + "learning_rate": 6.47159090909091e-06, + "loss": 5.1201, + "step": 2279 + }, + { + "epoch": 0.19432370237790847, + "grad_norm": 53.66724185976653, + "learning_rate": 6.474431818181819e-06, + "loss": 2.4575, + "step": 2280 + }, + { + "epoch": 0.19440893207193385, + "grad_norm": 144.41045905051018, + "learning_rate": 6.477272727272727e-06, + "loss": 3.4313, + "step": 2281 + }, + { + "epoch": 0.19449416176595927, + "grad_norm": 184.94449020410548, + "learning_rate": 6.480113636363637e-06, + "loss": 4.8274, + "step": 2282 + }, + { + "epoch": 0.19457939145998465, + "grad_norm": 67.63503949907154, + "learning_rate": 6.482954545454546e-06, + "loss": 3.2961, + "step": 2283 + }, + { + "epoch": 0.19466462115401006, + "grad_norm": 203.30091502163896, + "learning_rate": 6.485795454545455e-06, + "loss": 3.2182, + "step": 2284 + }, + { + "epoch": 0.19474985084803545, + "grad_norm": 117.49195845191811, + "learning_rate": 6.488636363636364e-06, + "loss": 4.7423, + "step": 2285 + }, + { + "epoch": 0.19483508054206086, + "grad_norm": 238.5180047499615, + "learning_rate": 6.491477272727274e-06, + "loss": 4.7071, + "step": 2286 + }, + { + "epoch": 0.19492031023608625, + "grad_norm": 254.34231522116423, + "learning_rate": 6.494318181818183e-06, + "loss": 3.2437, + "step": 2287 + }, + { + "epoch": 0.19500553993011166, + "grad_norm": 145.65614132076306, + "learning_rate": 6.497159090909091e-06, + "loss": 5.0872, + "step": 2288 + }, + { + "epoch": 0.19509076962413704, + "grad_norm": 75.36861594960865, + "learning_rate": 6.5000000000000004e-06, + "loss": 3.2495, + "step": 2289 + }, + { + "epoch": 0.19517599931816246, + "grad_norm": 231.69389072658825, + "learning_rate": 6.502840909090909e-06, + "loss": 4.7251, + "step": 2290 + }, + { + "epoch": 0.19526122901218784, + "grad_norm": 87.46867867650299, + "learning_rate": 6.505681818181818e-06, + "loss": 3.7484, + "step": 2291 + }, + { + "epoch": 0.19534645870621326, + "grad_norm": 76.65285710486386, + "learning_rate": 6.508522727272728e-06, + "loss": 2.9203, + "step": 2292 + }, + { + "epoch": 0.19543168840023864, + "grad_norm": 230.9499076269397, + "learning_rate": 6.511363636363637e-06, + "loss": 3.6709, + "step": 2293 + }, + { + "epoch": 0.19551691809426405, + "grad_norm": 199.73678897916977, + "learning_rate": 6.514204545454545e-06, + "loss": 4.9405, + "step": 2294 + }, + { + "epoch": 0.19560214778828944, + "grad_norm": 88.6134358709167, + "learning_rate": 6.517045454545456e-06, + "loss": 3.1724, + "step": 2295 + }, + { + "epoch": 0.19568737748231485, + "grad_norm": 97.95506488166309, + "learning_rate": 6.519886363636364e-06, + "loss": 3.4634, + "step": 2296 + }, + { + "epoch": 0.19577260717634024, + "grad_norm": 55.00370102842095, + "learning_rate": 6.522727272727273e-06, + "loss": 3.2236, + "step": 2297 + }, + { + "epoch": 0.19585783687036565, + "grad_norm": 180.10953742440796, + "learning_rate": 6.5255681818181825e-06, + "loss": 2.6924, + "step": 2298 + }, + { + "epoch": 0.19594306656439103, + "grad_norm": 188.473873775219, + "learning_rate": 6.5284090909090914e-06, + "loss": 3.5234, + "step": 2299 + }, + { + "epoch": 0.19602829625841645, + "grad_norm": 104.01369457942519, + "learning_rate": 6.53125e-06, + "loss": 2.9951, + "step": 2300 + }, + { + "epoch": 0.19611352595244183, + "grad_norm": 72.08466848363656, + "learning_rate": 6.53409090909091e-06, + "loss": 3.0334, + "step": 2301 + }, + { + "epoch": 0.19619875564646722, + "grad_norm": 147.3635677001184, + "learning_rate": 6.536931818181819e-06, + "loss": 3.5926, + "step": 2302 + }, + { + "epoch": 0.19628398534049263, + "grad_norm": 96.27397771299417, + "learning_rate": 6.539772727272727e-06, + "loss": 3.5704, + "step": 2303 + }, + { + "epoch": 0.196369215034518, + "grad_norm": 93.0845352879712, + "learning_rate": 6.542613636363637e-06, + "loss": 3.5846, + "step": 2304 + }, + { + "epoch": 0.19645444472854343, + "grad_norm": 110.02510650733184, + "learning_rate": 6.545454545454546e-06, + "loss": 3.4724, + "step": 2305 + }, + { + "epoch": 0.1965396744225688, + "grad_norm": 172.0112352051539, + "learning_rate": 6.548295454545455e-06, + "loss": 3.9218, + "step": 2306 + }, + { + "epoch": 0.19662490411659422, + "grad_norm": 120.32881898515329, + "learning_rate": 6.5511363636363646e-06, + "loss": 3.9836, + "step": 2307 + }, + { + "epoch": 0.1967101338106196, + "grad_norm": 159.5004135673852, + "learning_rate": 6.5539772727272735e-06, + "loss": 3.5781, + "step": 2308 + }, + { + "epoch": 0.19679536350464502, + "grad_norm": 133.1027869497171, + "learning_rate": 6.5568181818181816e-06, + "loss": 3.3712, + "step": 2309 + }, + { + "epoch": 0.1968805931986704, + "grad_norm": 338.5372304078201, + "learning_rate": 6.559659090909092e-06, + "loss": 3.861, + "step": 2310 + }, + { + "epoch": 0.19696582289269582, + "grad_norm": 156.49247970110486, + "learning_rate": 6.5625e-06, + "loss": 4.2645, + "step": 2311 + }, + { + "epoch": 0.1970510525867212, + "grad_norm": 118.50787894973745, + "learning_rate": 6.565340909090909e-06, + "loss": 4.3165, + "step": 2312 + }, + { + "epoch": 0.19713628228074662, + "grad_norm": 121.75721246723784, + "learning_rate": 6.568181818181819e-06, + "loss": 3.9565, + "step": 2313 + }, + { + "epoch": 0.197221511974772, + "grad_norm": 63.44859682763639, + "learning_rate": 6.571022727272728e-06, + "loss": 3.1742, + "step": 2314 + }, + { + "epoch": 0.19730674166879741, + "grad_norm": 96.24298442960513, + "learning_rate": 6.573863636363637e-06, + "loss": 3.7033, + "step": 2315 + }, + { + "epoch": 0.1973919713628228, + "grad_norm": 75.01276366356355, + "learning_rate": 6.576704545454547e-06, + "loss": 3.4945, + "step": 2316 + }, + { + "epoch": 0.1974772010568482, + "grad_norm": 93.39536699786657, + "learning_rate": 6.5795454545454555e-06, + "loss": 2.8573, + "step": 2317 + }, + { + "epoch": 0.1975624307508736, + "grad_norm": 142.07370071094394, + "learning_rate": 6.582386363636364e-06, + "loss": 4.1167, + "step": 2318 + }, + { + "epoch": 0.197647660444899, + "grad_norm": 116.28012730668365, + "learning_rate": 6.585227272727273e-06, + "loss": 4.6662, + "step": 2319 + }, + { + "epoch": 0.1977328901389244, + "grad_norm": 238.75784639965102, + "learning_rate": 6.588068181818182e-06, + "loss": 3.9425, + "step": 2320 + }, + { + "epoch": 0.1978181198329498, + "grad_norm": 56.77857746343108, + "learning_rate": 6.590909090909091e-06, + "loss": 4.0094, + "step": 2321 + }, + { + "epoch": 0.1979033495269752, + "grad_norm": 75.96574150572181, + "learning_rate": 6.593750000000001e-06, + "loss": 3.0791, + "step": 2322 + }, + { + "epoch": 0.1979885792210006, + "grad_norm": 65.48364106160216, + "learning_rate": 6.59659090909091e-06, + "loss": 3.6213, + "step": 2323 + }, + { + "epoch": 0.198073808915026, + "grad_norm": 310.894737975072, + "learning_rate": 6.599431818181818e-06, + "loss": 3.6088, + "step": 2324 + }, + { + "epoch": 0.1981590386090514, + "grad_norm": 73.4490154423045, + "learning_rate": 6.602272727272729e-06, + "loss": 3.1299, + "step": 2325 + }, + { + "epoch": 0.1982442683030768, + "grad_norm": 39.35364741706838, + "learning_rate": 6.605113636363637e-06, + "loss": 2.8832, + "step": 2326 + }, + { + "epoch": 0.1983294979971022, + "grad_norm": 81.58175928988825, + "learning_rate": 6.607954545454546e-06, + "loss": 4.0316, + "step": 2327 + }, + { + "epoch": 0.19841472769112758, + "grad_norm": 84.92393959410047, + "learning_rate": 6.6107954545454555e-06, + "loss": 3.2182, + "step": 2328 + }, + { + "epoch": 0.198499957385153, + "grad_norm": 208.14023297104154, + "learning_rate": 6.613636363636364e-06, + "loss": 4.1044, + "step": 2329 + }, + { + "epoch": 0.19858518707917838, + "grad_norm": 113.00279432600576, + "learning_rate": 6.616477272727273e-06, + "loss": 3.178, + "step": 2330 + }, + { + "epoch": 0.1986704167732038, + "grad_norm": 48.94601476900091, + "learning_rate": 6.619318181818183e-06, + "loss": 3.0324, + "step": 2331 + }, + { + "epoch": 0.19875564646722918, + "grad_norm": 60.39226419265237, + "learning_rate": 6.622159090909092e-06, + "loss": 3.4793, + "step": 2332 + }, + { + "epoch": 0.1988408761612546, + "grad_norm": 53.47323149664931, + "learning_rate": 6.625e-06, + "loss": 2.9986, + "step": 2333 + }, + { + "epoch": 0.19892610585527998, + "grad_norm": 62.65635368518228, + "learning_rate": 6.62784090909091e-06, + "loss": 4.1521, + "step": 2334 + }, + { + "epoch": 0.1990113355493054, + "grad_norm": 73.7981491989783, + "learning_rate": 6.630681818181819e-06, + "loss": 3.0003, + "step": 2335 + }, + { + "epoch": 0.19909656524333078, + "grad_norm": 87.65744962068966, + "learning_rate": 6.633522727272728e-06, + "loss": 3.6505, + "step": 2336 + }, + { + "epoch": 0.1991817949373562, + "grad_norm": 84.11931955955676, + "learning_rate": 6.6363636363636375e-06, + "loss": 3.7215, + "step": 2337 + }, + { + "epoch": 0.19926702463138157, + "grad_norm": 86.62567895030492, + "learning_rate": 6.6392045454545464e-06, + "loss": 3.7433, + "step": 2338 + }, + { + "epoch": 0.19935225432540699, + "grad_norm": 108.51527313400143, + "learning_rate": 6.6420454545454545e-06, + "loss": 4.092, + "step": 2339 + }, + { + "epoch": 0.19943748401943237, + "grad_norm": 212.8377447361258, + "learning_rate": 6.6448863636363634e-06, + "loss": 5.1629, + "step": 2340 + }, + { + "epoch": 0.19952271371345776, + "grad_norm": 174.78190638371632, + "learning_rate": 6.647727272727273e-06, + "loss": 3.3852, + "step": 2341 + }, + { + "epoch": 0.19960794340748317, + "grad_norm": 92.54366257018981, + "learning_rate": 6.650568181818182e-06, + "loss": 3.2569, + "step": 2342 + }, + { + "epoch": 0.19969317310150855, + "grad_norm": 146.19772497122824, + "learning_rate": 6.653409090909091e-06, + "loss": 3.8574, + "step": 2343 + }, + { + "epoch": 0.19977840279553397, + "grad_norm": 105.27538659028738, + "learning_rate": 6.656250000000001e-06, + "loss": 3.4821, + "step": 2344 + }, + { + "epoch": 0.19986363248955935, + "grad_norm": 80.7855640046493, + "learning_rate": 6.65909090909091e-06, + "loss": 2.8064, + "step": 2345 + }, + { + "epoch": 0.19994886218358476, + "grad_norm": 208.18456512738993, + "learning_rate": 6.661931818181818e-06, + "loss": 3.6015, + "step": 2346 + }, + { + "epoch": 0.20003409187761015, + "grad_norm": 647.7990411401415, + "learning_rate": 6.664772727272728e-06, + "loss": 3.0986, + "step": 2347 + }, + { + "epoch": 0.20011932157163556, + "grad_norm": 262.11739174038905, + "learning_rate": 6.6676136363636366e-06, + "loss": 3.5884, + "step": 2348 + }, + { + "epoch": 0.20020455126566095, + "grad_norm": 89.25480730591134, + "learning_rate": 6.6704545454545455e-06, + "loss": 3.6456, + "step": 2349 + }, + { + "epoch": 0.20028978095968636, + "grad_norm": 86.43803318243111, + "learning_rate": 6.673295454545455e-06, + "loss": 4.1583, + "step": 2350 + }, + { + "epoch": 0.20037501065371174, + "grad_norm": 65.23659719701662, + "learning_rate": 6.676136363636364e-06, + "loss": 3.8067, + "step": 2351 + }, + { + "epoch": 0.20046024034773716, + "grad_norm": 75.32986985064244, + "learning_rate": 6.678977272727273e-06, + "loss": 3.0105, + "step": 2352 + }, + { + "epoch": 0.20054547004176254, + "grad_norm": 95.31327404642903, + "learning_rate": 6.681818181818183e-06, + "loss": 3.4463, + "step": 2353 + }, + { + "epoch": 0.20063069973578795, + "grad_norm": 94.11248237407672, + "learning_rate": 6.684659090909091e-06, + "loss": 3.1094, + "step": 2354 + }, + { + "epoch": 0.20071592942981334, + "grad_norm": 129.78628285233316, + "learning_rate": 6.6875e-06, + "loss": 3.8256, + "step": 2355 + }, + { + "epoch": 0.20080115912383875, + "grad_norm": 62.937137247501965, + "learning_rate": 6.69034090909091e-06, + "loss": 3.2114, + "step": 2356 + }, + { + "epoch": 0.20088638881786414, + "grad_norm": 42.40460879631251, + "learning_rate": 6.693181818181819e-06, + "loss": 2.088, + "step": 2357 + }, + { + "epoch": 0.20097161851188955, + "grad_norm": 147.02198095295498, + "learning_rate": 6.6960227272727276e-06, + "loss": 3.6927, + "step": 2358 + }, + { + "epoch": 0.20105684820591493, + "grad_norm": 91.45880132804, + "learning_rate": 6.698863636363637e-06, + "loss": 3.666, + "step": 2359 + }, + { + "epoch": 0.20114207789994035, + "grad_norm": 84.97204040964172, + "learning_rate": 6.701704545454546e-06, + "loss": 3.5961, + "step": 2360 + }, + { + "epoch": 0.20122730759396573, + "grad_norm": 101.04938600634256, + "learning_rate": 6.704545454545454e-06, + "loss": 4.219, + "step": 2361 + }, + { + "epoch": 0.20131253728799114, + "grad_norm": 67.1312032536706, + "learning_rate": 6.707386363636364e-06, + "loss": 3.5718, + "step": 2362 + }, + { + "epoch": 0.20139776698201653, + "grad_norm": 82.70928636221663, + "learning_rate": 6.710227272727273e-06, + "loss": 3.424, + "step": 2363 + }, + { + "epoch": 0.20148299667604194, + "grad_norm": 52.037398588050635, + "learning_rate": 6.713068181818182e-06, + "loss": 2.8214, + "step": 2364 + }, + { + "epoch": 0.20156822637006733, + "grad_norm": 170.54515066844263, + "learning_rate": 6.715909090909092e-06, + "loss": 3.4421, + "step": 2365 + }, + { + "epoch": 0.20165345606409274, + "grad_norm": 120.35146736507639, + "learning_rate": 6.718750000000001e-06, + "loss": 2.9615, + "step": 2366 + }, + { + "epoch": 0.20173868575811812, + "grad_norm": 73.06558406375336, + "learning_rate": 6.72159090909091e-06, + "loss": 2.6084, + "step": 2367 + }, + { + "epoch": 0.20182391545214354, + "grad_norm": 106.39608737972442, + "learning_rate": 6.724431818181819e-06, + "loss": 2.9209, + "step": 2368 + }, + { + "epoch": 0.20190914514616892, + "grad_norm": 74.4326390265908, + "learning_rate": 6.7272727272727275e-06, + "loss": 2.8575, + "step": 2369 + }, + { + "epoch": 0.20199437484019434, + "grad_norm": 75.04153960011973, + "learning_rate": 6.730113636363636e-06, + "loss": 3.2765, + "step": 2370 + }, + { + "epoch": 0.20207960453421972, + "grad_norm": 189.3155229685032, + "learning_rate": 6.732954545454546e-06, + "loss": 4.3038, + "step": 2371 + }, + { + "epoch": 0.20216483422824513, + "grad_norm": 87.12815618847657, + "learning_rate": 6.735795454545455e-06, + "loss": 3.6324, + "step": 2372 + }, + { + "epoch": 0.20225006392227052, + "grad_norm": 131.40049748156193, + "learning_rate": 6.738636363636364e-06, + "loss": 4.0592, + "step": 2373 + }, + { + "epoch": 0.20233529361629593, + "grad_norm": 213.4786482873789, + "learning_rate": 6.741477272727274e-06, + "loss": 4.1685, + "step": 2374 + }, + { + "epoch": 0.20242052331032132, + "grad_norm": 72.31053956113621, + "learning_rate": 6.744318181818183e-06, + "loss": 2.9903, + "step": 2375 + }, + { + "epoch": 0.20250575300434673, + "grad_norm": 173.24619193911528, + "learning_rate": 6.747159090909091e-06, + "loss": 4.8703, + "step": 2376 + }, + { + "epoch": 0.2025909826983721, + "grad_norm": 94.84874802115776, + "learning_rate": 6.750000000000001e-06, + "loss": 3.7502, + "step": 2377 + }, + { + "epoch": 0.2026762123923975, + "grad_norm": 79.28373799621288, + "learning_rate": 6.7528409090909095e-06, + "loss": 3.2411, + "step": 2378 + }, + { + "epoch": 0.2027614420864229, + "grad_norm": 132.1985690199149, + "learning_rate": 6.7556818181818185e-06, + "loss": 3.5575, + "step": 2379 + }, + { + "epoch": 0.2028466717804483, + "grad_norm": 110.23840313287738, + "learning_rate": 6.758522727272728e-06, + "loss": 3.7673, + "step": 2380 + }, + { + "epoch": 0.2029319014744737, + "grad_norm": 91.67605227134177, + "learning_rate": 6.761363636363637e-06, + "loss": 2.9171, + "step": 2381 + }, + { + "epoch": 0.2030171311684991, + "grad_norm": 159.18496023279812, + "learning_rate": 6.764204545454546e-06, + "loss": 3.7052, + "step": 2382 + }, + { + "epoch": 0.2031023608625245, + "grad_norm": 87.9563828860764, + "learning_rate": 6.767045454545456e-06, + "loss": 3.2944, + "step": 2383 + }, + { + "epoch": 0.2031875905565499, + "grad_norm": 60.152658447563574, + "learning_rate": 6.769886363636364e-06, + "loss": 2.0794, + "step": 2384 + }, + { + "epoch": 0.2032728202505753, + "grad_norm": 54.7545685874594, + "learning_rate": 6.772727272727273e-06, + "loss": 2.9785, + "step": 2385 + }, + { + "epoch": 0.2033580499446007, + "grad_norm": 166.94103172273694, + "learning_rate": 6.775568181818183e-06, + "loss": 4.1399, + "step": 2386 + }, + { + "epoch": 0.2034432796386261, + "grad_norm": 60.56748742751066, + "learning_rate": 6.778409090909092e-06, + "loss": 3.1173, + "step": 2387 + }, + { + "epoch": 0.20352850933265149, + "grad_norm": 190.55371610797044, + "learning_rate": 6.7812500000000005e-06, + "loss": 3.4525, + "step": 2388 + }, + { + "epoch": 0.2036137390266769, + "grad_norm": 114.13214134538985, + "learning_rate": 6.78409090909091e-06, + "loss": 3.7332, + "step": 2389 + }, + { + "epoch": 0.20369896872070228, + "grad_norm": 89.88104657363759, + "learning_rate": 6.786931818181819e-06, + "loss": 3.4018, + "step": 2390 + }, + { + "epoch": 0.2037841984147277, + "grad_norm": 94.91918703992128, + "learning_rate": 6.789772727272727e-06, + "loss": 3.4415, + "step": 2391 + }, + { + "epoch": 0.20386942810875308, + "grad_norm": 52.509283916057164, + "learning_rate": 6.792613636363637e-06, + "loss": 2.5911, + "step": 2392 + }, + { + "epoch": 0.2039546578027785, + "grad_norm": 145.9277129183053, + "learning_rate": 6.795454545454546e-06, + "loss": 4.4592, + "step": 2393 + }, + { + "epoch": 0.20403988749680388, + "grad_norm": 102.67969196822966, + "learning_rate": 6.798295454545455e-06, + "loss": 3.0902, + "step": 2394 + }, + { + "epoch": 0.2041251171908293, + "grad_norm": 98.2651583262112, + "learning_rate": 6.801136363636364e-06, + "loss": 3.78, + "step": 2395 + }, + { + "epoch": 0.20421034688485468, + "grad_norm": 84.6001971137604, + "learning_rate": 6.803977272727274e-06, + "loss": 2.241, + "step": 2396 + }, + { + "epoch": 0.2042955765788801, + "grad_norm": 59.1035136518153, + "learning_rate": 6.8068181818181826e-06, + "loss": 3.015, + "step": 2397 + }, + { + "epoch": 0.20438080627290547, + "grad_norm": 144.82012701510254, + "learning_rate": 6.809659090909091e-06, + "loss": 3.493, + "step": 2398 + }, + { + "epoch": 0.2044660359669309, + "grad_norm": 63.75815678627515, + "learning_rate": 6.8125e-06, + "loss": 3.0311, + "step": 2399 + }, + { + "epoch": 0.20455126566095627, + "grad_norm": 87.75253621656438, + "learning_rate": 6.815340909090909e-06, + "loss": 2.8271, + "step": 2400 + }, + { + "epoch": 0.20463649535498168, + "grad_norm": 117.74346761489328, + "learning_rate": 6.818181818181818e-06, + "loss": 3.5623, + "step": 2401 + }, + { + "epoch": 0.20472172504900707, + "grad_norm": 145.90229562048987, + "learning_rate": 6.821022727272728e-06, + "loss": 3.0126, + "step": 2402 + }, + { + "epoch": 0.20480695474303248, + "grad_norm": 104.52690234346996, + "learning_rate": 6.823863636363637e-06, + "loss": 2.71, + "step": 2403 + }, + { + "epoch": 0.20489218443705787, + "grad_norm": 144.09070053926533, + "learning_rate": 6.826704545454546e-06, + "loss": 4.4012, + "step": 2404 + }, + { + "epoch": 0.20497741413108328, + "grad_norm": 83.73035030456306, + "learning_rate": 6.829545454545456e-06, + "loss": 3.1138, + "step": 2405 + }, + { + "epoch": 0.20506264382510866, + "grad_norm": 88.95073202191715, + "learning_rate": 6.832386363636364e-06, + "loss": 3.6346, + "step": 2406 + }, + { + "epoch": 0.20514787351913408, + "grad_norm": 110.99417383345435, + "learning_rate": 6.835227272727273e-06, + "loss": 4.3124, + "step": 2407 + }, + { + "epoch": 0.20523310321315946, + "grad_norm": 75.1436347397471, + "learning_rate": 6.8380681818181825e-06, + "loss": 3.3844, + "step": 2408 + }, + { + "epoch": 0.20531833290718488, + "grad_norm": 90.46013618663997, + "learning_rate": 6.840909090909091e-06, + "loss": 3.3362, + "step": 2409 + }, + { + "epoch": 0.20540356260121026, + "grad_norm": 106.4377031799655, + "learning_rate": 6.84375e-06, + "loss": 3.3773, + "step": 2410 + }, + { + "epoch": 0.20548879229523567, + "grad_norm": 73.36696965010681, + "learning_rate": 6.84659090909091e-06, + "loss": 3.0351, + "step": 2411 + }, + { + "epoch": 0.20557402198926106, + "grad_norm": 56.81371493809059, + "learning_rate": 6.849431818181819e-06, + "loss": 2.9742, + "step": 2412 + }, + { + "epoch": 0.20565925168328647, + "grad_norm": 64.91560932364183, + "learning_rate": 6.852272727272727e-06, + "loss": 3.2893, + "step": 2413 + }, + { + "epoch": 0.20574448137731186, + "grad_norm": 519.4531516622362, + "learning_rate": 6.855113636363637e-06, + "loss": 4.105, + "step": 2414 + }, + { + "epoch": 0.20582971107133724, + "grad_norm": 64.18748184739843, + "learning_rate": 6.857954545454546e-06, + "loss": 3.498, + "step": 2415 + }, + { + "epoch": 0.20591494076536265, + "grad_norm": 248.00142408833256, + "learning_rate": 6.860795454545455e-06, + "loss": 4.5188, + "step": 2416 + }, + { + "epoch": 0.20600017045938804, + "grad_norm": 151.91619699206714, + "learning_rate": 6.8636363636363645e-06, + "loss": 4.1848, + "step": 2417 + }, + { + "epoch": 0.20608540015341345, + "grad_norm": 117.80821728413841, + "learning_rate": 6.8664772727272735e-06, + "loss": 2.951, + "step": 2418 + }, + { + "epoch": 0.20617062984743884, + "grad_norm": 102.45738725137822, + "learning_rate": 6.869318181818182e-06, + "loss": 2.8719, + "step": 2419 + }, + { + "epoch": 0.20625585954146425, + "grad_norm": 112.81542728335697, + "learning_rate": 6.872159090909092e-06, + "loss": 3.3543, + "step": 2420 + }, + { + "epoch": 0.20634108923548963, + "grad_norm": 64.52848707439892, + "learning_rate": 6.875e-06, + "loss": 3.2375, + "step": 2421 + }, + { + "epoch": 0.20642631892951505, + "grad_norm": 77.46108140277832, + "learning_rate": 6.877840909090909e-06, + "loss": 2.5669, + "step": 2422 + }, + { + "epoch": 0.20651154862354043, + "grad_norm": 65.51470776714534, + "learning_rate": 6.880681818181819e-06, + "loss": 2.1125, + "step": 2423 + }, + { + "epoch": 0.20659677831756584, + "grad_norm": 136.50219402815372, + "learning_rate": 6.883522727272728e-06, + "loss": 4.1342, + "step": 2424 + }, + { + "epoch": 0.20668200801159123, + "grad_norm": 63.712638702849745, + "learning_rate": 6.886363636363637e-06, + "loss": 2.6655, + "step": 2425 + }, + { + "epoch": 0.20676723770561664, + "grad_norm": 102.32243012951932, + "learning_rate": 6.889204545454547e-06, + "loss": 3.6458, + "step": 2426 + }, + { + "epoch": 0.20685246739964203, + "grad_norm": 68.0068899857826, + "learning_rate": 6.8920454545454555e-06, + "loss": 3.183, + "step": 2427 + }, + { + "epoch": 0.20693769709366744, + "grad_norm": 81.2510148383568, + "learning_rate": 6.894886363636364e-06, + "loss": 3.1466, + "step": 2428 + }, + { + "epoch": 0.20702292678769282, + "grad_norm": 83.69449918681572, + "learning_rate": 6.897727272727273e-06, + "loss": 3.2838, + "step": 2429 + }, + { + "epoch": 0.20710815648171824, + "grad_norm": 108.95011015838062, + "learning_rate": 6.900568181818182e-06, + "loss": 3.1744, + "step": 2430 + }, + { + "epoch": 0.20719338617574362, + "grad_norm": 101.94796064960808, + "learning_rate": 6.903409090909091e-06, + "loss": 3.0592, + "step": 2431 + }, + { + "epoch": 0.20727861586976903, + "grad_norm": 55.30299199166858, + "learning_rate": 6.906250000000001e-06, + "loss": 2.3568, + "step": 2432 + }, + { + "epoch": 0.20736384556379442, + "grad_norm": 155.7865778176695, + "learning_rate": 6.90909090909091e-06, + "loss": 4.517, + "step": 2433 + }, + { + "epoch": 0.20744907525781983, + "grad_norm": 115.16646809562957, + "learning_rate": 6.911931818181819e-06, + "loss": 3.5998, + "step": 2434 + }, + { + "epoch": 0.20753430495184522, + "grad_norm": 84.25374254874706, + "learning_rate": 6.914772727272729e-06, + "loss": 3.7096, + "step": 2435 + }, + { + "epoch": 0.20761953464587063, + "grad_norm": 71.03135707500002, + "learning_rate": 6.917613636363637e-06, + "loss": 3.6846, + "step": 2436 + }, + { + "epoch": 0.20770476433989601, + "grad_norm": 154.38457563919386, + "learning_rate": 6.920454545454546e-06, + "loss": 2.2835, + "step": 2437 + }, + { + "epoch": 0.20778999403392143, + "grad_norm": 82.65642039649121, + "learning_rate": 6.9232954545454554e-06, + "loss": 3.3925, + "step": 2438 + }, + { + "epoch": 0.2078752237279468, + "grad_norm": 101.17554444712282, + "learning_rate": 6.926136363636364e-06, + "loss": 3.4446, + "step": 2439 + }, + { + "epoch": 0.20796045342197222, + "grad_norm": 66.69295899190391, + "learning_rate": 6.928977272727273e-06, + "loss": 2.8992, + "step": 2440 + }, + { + "epoch": 0.2080456831159976, + "grad_norm": 65.24330863383011, + "learning_rate": 6.931818181818183e-06, + "loss": 3.3088, + "step": 2441 + }, + { + "epoch": 0.20813091281002302, + "grad_norm": 661.9801906029315, + "learning_rate": 6.934659090909092e-06, + "loss": 3.745, + "step": 2442 + }, + { + "epoch": 0.2082161425040484, + "grad_norm": 178.1880299326305, + "learning_rate": 6.9375e-06, + "loss": 4.1306, + "step": 2443 + }, + { + "epoch": 0.20830137219807382, + "grad_norm": 115.84916001481075, + "learning_rate": 6.94034090909091e-06, + "loss": 3.3331, + "step": 2444 + }, + { + "epoch": 0.2083866018920992, + "grad_norm": 535.6513646146017, + "learning_rate": 6.943181818181819e-06, + "loss": 4.4536, + "step": 2445 + }, + { + "epoch": 0.20847183158612462, + "grad_norm": 66.1899223464181, + "learning_rate": 6.946022727272728e-06, + "loss": 3.4415, + "step": 2446 + }, + { + "epoch": 0.20855706128015, + "grad_norm": 52.503936055557006, + "learning_rate": 6.9488636363636375e-06, + "loss": 2.5696, + "step": 2447 + }, + { + "epoch": 0.20864229097417542, + "grad_norm": 128.74572943018737, + "learning_rate": 6.951704545454546e-06, + "loss": 4.1559, + "step": 2448 + }, + { + "epoch": 0.2087275206682008, + "grad_norm": 105.90572596593435, + "learning_rate": 6.954545454545455e-06, + "loss": 4.9352, + "step": 2449 + }, + { + "epoch": 0.2088127503622262, + "grad_norm": 64.47757546836729, + "learning_rate": 6.957386363636363e-06, + "loss": 3.108, + "step": 2450 + }, + { + "epoch": 0.2088979800562516, + "grad_norm": 161.81390384735218, + "learning_rate": 6.960227272727273e-06, + "loss": 4.0587, + "step": 2451 + }, + { + "epoch": 0.208983209750277, + "grad_norm": 377.2203994127174, + "learning_rate": 6.963068181818182e-06, + "loss": 4.828, + "step": 2452 + }, + { + "epoch": 0.2090684394443024, + "grad_norm": 108.76315532461207, + "learning_rate": 6.965909090909091e-06, + "loss": 3.8538, + "step": 2453 + }, + { + "epoch": 0.20915366913832778, + "grad_norm": 323.6560378333802, + "learning_rate": 6.968750000000001e-06, + "loss": 2.7357, + "step": 2454 + }, + { + "epoch": 0.2092388988323532, + "grad_norm": 63.325641084185804, + "learning_rate": 6.97159090909091e-06, + "loss": 3.4963, + "step": 2455 + }, + { + "epoch": 0.20932412852637858, + "grad_norm": 52.378414159816614, + "learning_rate": 6.974431818181818e-06, + "loss": 2.9026, + "step": 2456 + }, + { + "epoch": 0.209409358220404, + "grad_norm": 69.25960931657423, + "learning_rate": 6.9772727272727285e-06, + "loss": 3.7657, + "step": 2457 + }, + { + "epoch": 0.20949458791442938, + "grad_norm": 161.83255672449502, + "learning_rate": 6.9801136363636365e-06, + "loss": 4.2693, + "step": 2458 + }, + { + "epoch": 0.2095798176084548, + "grad_norm": 94.94837522469268, + "learning_rate": 6.9829545454545455e-06, + "loss": 3.5601, + "step": 2459 + }, + { + "epoch": 0.20966504730248017, + "grad_norm": 73.96857314025263, + "learning_rate": 6.985795454545455e-06, + "loss": 2.7855, + "step": 2460 + }, + { + "epoch": 0.20975027699650559, + "grad_norm": 44.60240323778246, + "learning_rate": 6.988636363636364e-06, + "loss": 3.1636, + "step": 2461 + }, + { + "epoch": 0.20983550669053097, + "grad_norm": 83.10509542546113, + "learning_rate": 6.991477272727273e-06, + "loss": 2.8268, + "step": 2462 + }, + { + "epoch": 0.20992073638455638, + "grad_norm": 69.97521151598279, + "learning_rate": 6.994318181818183e-06, + "loss": 2.5749, + "step": 2463 + }, + { + "epoch": 0.21000596607858177, + "grad_norm": 63.44968619939029, + "learning_rate": 6.997159090909092e-06, + "loss": 2.4456, + "step": 2464 + }, + { + "epoch": 0.21009119577260718, + "grad_norm": 53.977843796923935, + "learning_rate": 7e-06, + "loss": 3.2283, + "step": 2465 + }, + { + "epoch": 0.21017642546663257, + "grad_norm": 34.53217507336127, + "learning_rate": 7.00284090909091e-06, + "loss": 1.5964, + "step": 2466 + }, + { + "epoch": 0.21026165516065798, + "grad_norm": 87.9596821087549, + "learning_rate": 7.005681818181819e-06, + "loss": 4.1572, + "step": 2467 + }, + { + "epoch": 0.21034688485468336, + "grad_norm": 82.0694855097698, + "learning_rate": 7.0085227272727275e-06, + "loss": 3.0863, + "step": 2468 + }, + { + "epoch": 0.21043211454870878, + "grad_norm": 59.37145997028306, + "learning_rate": 7.011363636363637e-06, + "loss": 3.0282, + "step": 2469 + }, + { + "epoch": 0.21051734424273416, + "grad_norm": 178.95241175241728, + "learning_rate": 7.014204545454546e-06, + "loss": 3.6815, + "step": 2470 + }, + { + "epoch": 0.21060257393675957, + "grad_norm": 69.96241863453213, + "learning_rate": 7.017045454545454e-06, + "loss": 2.6058, + "step": 2471 + }, + { + "epoch": 0.21068780363078496, + "grad_norm": 238.34382221831837, + "learning_rate": 7.019886363636365e-06, + "loss": 3.3819, + "step": 2472 + }, + { + "epoch": 0.21077303332481037, + "grad_norm": 100.77372839984248, + "learning_rate": 7.022727272727273e-06, + "loss": 3.8548, + "step": 2473 + }, + { + "epoch": 0.21085826301883576, + "grad_norm": 75.5500548516174, + "learning_rate": 7.025568181818182e-06, + "loss": 3.3609, + "step": 2474 + }, + { + "epoch": 0.21094349271286117, + "grad_norm": 81.37719202924255, + "learning_rate": 7.028409090909092e-06, + "loss": 2.8905, + "step": 2475 + }, + { + "epoch": 0.21102872240688655, + "grad_norm": 168.90076414509994, + "learning_rate": 7.031250000000001e-06, + "loss": 4.6591, + "step": 2476 + }, + { + "epoch": 0.21111395210091197, + "grad_norm": 137.3402310109873, + "learning_rate": 7.03409090909091e-06, + "loss": 4.3225, + "step": 2477 + }, + { + "epoch": 0.21119918179493735, + "grad_norm": 57.274651368476874, + "learning_rate": 7.036931818181819e-06, + "loss": 3.4215, + "step": 2478 + }, + { + "epoch": 0.21128441148896276, + "grad_norm": 115.17527872123334, + "learning_rate": 7.039772727272728e-06, + "loss": 3.6297, + "step": 2479 + }, + { + "epoch": 0.21136964118298815, + "grad_norm": 76.70187576798246, + "learning_rate": 7.042613636363636e-06, + "loss": 3.0899, + "step": 2480 + }, + { + "epoch": 0.21145487087701356, + "grad_norm": 109.18052728923045, + "learning_rate": 7.045454545454546e-06, + "loss": 2.6705, + "step": 2481 + }, + { + "epoch": 0.21154010057103895, + "grad_norm": 214.34732308264657, + "learning_rate": 7.048295454545455e-06, + "loss": 4.2298, + "step": 2482 + }, + { + "epoch": 0.21162533026506436, + "grad_norm": 58.2456500165294, + "learning_rate": 7.051136363636364e-06, + "loss": 2.4892, + "step": 2483 + }, + { + "epoch": 0.21171055995908974, + "grad_norm": 527.1057591092901, + "learning_rate": 7.053977272727274e-06, + "loss": 2.9406, + "step": 2484 + }, + { + "epoch": 0.21179578965311516, + "grad_norm": 66.5289499546701, + "learning_rate": 7.056818181818183e-06, + "loss": 3.7693, + "step": 2485 + }, + { + "epoch": 0.21188101934714054, + "grad_norm": 80.63787754805657, + "learning_rate": 7.059659090909091e-06, + "loss": 3.1882, + "step": 2486 + }, + { + "epoch": 0.21196624904116596, + "grad_norm": 112.29493903510097, + "learning_rate": 7.062500000000001e-06, + "loss": 4.1032, + "step": 2487 + }, + { + "epoch": 0.21205147873519134, + "grad_norm": 76.24429223889653, + "learning_rate": 7.0653409090909095e-06, + "loss": 2.5534, + "step": 2488 + }, + { + "epoch": 0.21213670842921675, + "grad_norm": 140.86688072116647, + "learning_rate": 7.0681818181818184e-06, + "loss": 4.2085, + "step": 2489 + }, + { + "epoch": 0.21222193812324214, + "grad_norm": 91.61618486158605, + "learning_rate": 7.071022727272728e-06, + "loss": 3.7784, + "step": 2490 + }, + { + "epoch": 0.21230716781726752, + "grad_norm": 125.24111726452062, + "learning_rate": 7.073863636363637e-06, + "loss": 3.3855, + "step": 2491 + }, + { + "epoch": 0.21239239751129294, + "grad_norm": 446.2927468475905, + "learning_rate": 7.076704545454546e-06, + "loss": 5.3589, + "step": 2492 + }, + { + "epoch": 0.21247762720531832, + "grad_norm": 91.84415616343531, + "learning_rate": 7.079545454545456e-06, + "loss": 3.5773, + "step": 2493 + }, + { + "epoch": 0.21256285689934373, + "grad_norm": 100.86403130110648, + "learning_rate": 7.082386363636364e-06, + "loss": 4.3832, + "step": 2494 + }, + { + "epoch": 0.21264808659336912, + "grad_norm": 101.7458071272961, + "learning_rate": 7.085227272727273e-06, + "loss": 4.4551, + "step": 2495 + }, + { + "epoch": 0.21273331628739453, + "grad_norm": 132.96883345505884, + "learning_rate": 7.088068181818183e-06, + "loss": 4.0355, + "step": 2496 + }, + { + "epoch": 0.21281854598141992, + "grad_norm": 76.52637653412681, + "learning_rate": 7.0909090909090916e-06, + "loss": 2.9856, + "step": 2497 + }, + { + "epoch": 0.21290377567544533, + "grad_norm": 194.58135877203952, + "learning_rate": 7.0937500000000005e-06, + "loss": 4.4149, + "step": 2498 + }, + { + "epoch": 0.2129890053694707, + "grad_norm": 138.60983807983263, + "learning_rate": 7.09659090909091e-06, + "loss": 4.3746, + "step": 2499 + }, + { + "epoch": 0.21307423506349613, + "grad_norm": 50.55252580035909, + "learning_rate": 7.099431818181819e-06, + "loss": 2.6554, + "step": 2500 + }, + { + "epoch": 0.2131594647575215, + "grad_norm": 102.86170210197577, + "learning_rate": 7.102272727272727e-06, + "loss": 2.6978, + "step": 2501 + }, + { + "epoch": 0.21324469445154692, + "grad_norm": 72.8485274966453, + "learning_rate": 7.105113636363638e-06, + "loss": 3.1833, + "step": 2502 + }, + { + "epoch": 0.2133299241455723, + "grad_norm": 105.65004739106034, + "learning_rate": 7.107954545454546e-06, + "loss": 2.6668, + "step": 2503 + }, + { + "epoch": 0.21341515383959772, + "grad_norm": 47.47021769533333, + "learning_rate": 7.110795454545455e-06, + "loss": 2.5797, + "step": 2504 + }, + { + "epoch": 0.2135003835336231, + "grad_norm": 46.664634474009844, + "learning_rate": 7.113636363636364e-06, + "loss": 2.753, + "step": 2505 + }, + { + "epoch": 0.21358561322764852, + "grad_norm": 73.5144101018347, + "learning_rate": 7.116477272727274e-06, + "loss": 3.1588, + "step": 2506 + }, + { + "epoch": 0.2136708429216739, + "grad_norm": 215.33648375057143, + "learning_rate": 7.1193181818181825e-06, + "loss": 4.8151, + "step": 2507 + }, + { + "epoch": 0.21375607261569932, + "grad_norm": 221.9341312242064, + "learning_rate": 7.122159090909091e-06, + "loss": 2.6428, + "step": 2508 + }, + { + "epoch": 0.2138413023097247, + "grad_norm": 134.29012979708526, + "learning_rate": 7.125e-06, + "loss": 4.1412, + "step": 2509 + }, + { + "epoch": 0.21392653200375011, + "grad_norm": 86.37477254565685, + "learning_rate": 7.127840909090909e-06, + "loss": 3.8308, + "step": 2510 + }, + { + "epoch": 0.2140117616977755, + "grad_norm": 101.20766283489381, + "learning_rate": 7.130681818181818e-06, + "loss": 3.8307, + "step": 2511 + }, + { + "epoch": 0.2140969913918009, + "grad_norm": 208.18952045829934, + "learning_rate": 7.133522727272728e-06, + "loss": 5.7237, + "step": 2512 + }, + { + "epoch": 0.2141822210858263, + "grad_norm": 160.09029564312232, + "learning_rate": 7.136363636363637e-06, + "loss": 3.7671, + "step": 2513 + }, + { + "epoch": 0.2142674507798517, + "grad_norm": 117.42758065405945, + "learning_rate": 7.139204545454546e-06, + "loss": 3.1848, + "step": 2514 + }, + { + "epoch": 0.2143526804738771, + "grad_norm": 104.76802063226025, + "learning_rate": 7.142045454545456e-06, + "loss": 3.9932, + "step": 2515 + }, + { + "epoch": 0.2144379101679025, + "grad_norm": 154.43720813747157, + "learning_rate": 7.144886363636364e-06, + "loss": 5.6884, + "step": 2516 + }, + { + "epoch": 0.2145231398619279, + "grad_norm": 171.46311071142787, + "learning_rate": 7.147727272727273e-06, + "loss": 3.4676, + "step": 2517 + }, + { + "epoch": 0.2146083695559533, + "grad_norm": 166.70106092288103, + "learning_rate": 7.1505681818181824e-06, + "loss": 3.7344, + "step": 2518 + }, + { + "epoch": 0.2146935992499787, + "grad_norm": 101.94393677666184, + "learning_rate": 7.153409090909091e-06, + "loss": 3.367, + "step": 2519 + }, + { + "epoch": 0.2147788289440041, + "grad_norm": 61.290231160853985, + "learning_rate": 7.15625e-06, + "loss": 3.4248, + "step": 2520 + }, + { + "epoch": 0.2148640586380295, + "grad_norm": 84.09836714533104, + "learning_rate": 7.15909090909091e-06, + "loss": 3.0171, + "step": 2521 + }, + { + "epoch": 0.2149492883320549, + "grad_norm": 170.79565025336683, + "learning_rate": 7.161931818181819e-06, + "loss": 3.7656, + "step": 2522 + }, + { + "epoch": 0.21503451802608028, + "grad_norm": 91.70925809701241, + "learning_rate": 7.164772727272727e-06, + "loss": 4.18, + "step": 2523 + }, + { + "epoch": 0.2151197477201057, + "grad_norm": 116.99460125887656, + "learning_rate": 7.167613636363637e-06, + "loss": 4.5727, + "step": 2524 + }, + { + "epoch": 0.21520497741413108, + "grad_norm": 130.3736091064011, + "learning_rate": 7.170454545454546e-06, + "loss": 4.6052, + "step": 2525 + }, + { + "epoch": 0.2152902071081565, + "grad_norm": 68.54069962202028, + "learning_rate": 7.173295454545455e-06, + "loss": 3.2418, + "step": 2526 + }, + { + "epoch": 0.21537543680218188, + "grad_norm": 120.85734099103426, + "learning_rate": 7.1761363636363645e-06, + "loss": 3.4975, + "step": 2527 + }, + { + "epoch": 0.21546066649620726, + "grad_norm": 123.85451561894946, + "learning_rate": 7.1789772727272734e-06, + "loss": 3.882, + "step": 2528 + }, + { + "epoch": 0.21554589619023268, + "grad_norm": 98.91050584692837, + "learning_rate": 7.181818181818182e-06, + "loss": 4.0914, + "step": 2529 + }, + { + "epoch": 0.21563112588425806, + "grad_norm": 96.40073279738012, + "learning_rate": 7.184659090909092e-06, + "loss": 4.1863, + "step": 2530 + }, + { + "epoch": 0.21571635557828348, + "grad_norm": 118.90328405036723, + "learning_rate": 7.1875e-06, + "loss": 3.959, + "step": 2531 + }, + { + "epoch": 0.21580158527230886, + "grad_norm": 113.9407885637741, + "learning_rate": 7.190340909090909e-06, + "loss": 3.6528, + "step": 2532 + }, + { + "epoch": 0.21588681496633427, + "grad_norm": 136.98642281110696, + "learning_rate": 7.193181818181819e-06, + "loss": 4.0586, + "step": 2533 + }, + { + "epoch": 0.21597204466035966, + "grad_norm": 103.77247180806017, + "learning_rate": 7.196022727272728e-06, + "loss": 3.1184, + "step": 2534 + }, + { + "epoch": 0.21605727435438507, + "grad_norm": 126.5406524528247, + "learning_rate": 7.198863636363637e-06, + "loss": 4.1183, + "step": 2535 + }, + { + "epoch": 0.21614250404841046, + "grad_norm": 147.07372157977866, + "learning_rate": 7.2017045454545466e-06, + "loss": 3.352, + "step": 2536 + }, + { + "epoch": 0.21622773374243587, + "grad_norm": 192.19487667257732, + "learning_rate": 7.2045454545454555e-06, + "loss": 3.4222, + "step": 2537 + }, + { + "epoch": 0.21631296343646125, + "grad_norm": 196.33672054420452, + "learning_rate": 7.2073863636363636e-06, + "loss": 5.2862, + "step": 2538 + }, + { + "epoch": 0.21639819313048667, + "grad_norm": 95.38912041119828, + "learning_rate": 7.210227272727273e-06, + "loss": 3.0494, + "step": 2539 + }, + { + "epoch": 0.21648342282451205, + "grad_norm": 74.06725975911309, + "learning_rate": 7.213068181818182e-06, + "loss": 3.5858, + "step": 2540 + }, + { + "epoch": 0.21656865251853746, + "grad_norm": 121.4625863806881, + "learning_rate": 7.215909090909091e-06, + "loss": 3.4998, + "step": 2541 + }, + { + "epoch": 0.21665388221256285, + "grad_norm": 247.10167188890233, + "learning_rate": 7.218750000000001e-06, + "loss": 4.4495, + "step": 2542 + }, + { + "epoch": 0.21673911190658826, + "grad_norm": 119.72987903100865, + "learning_rate": 7.22159090909091e-06, + "loss": 5.3331, + "step": 2543 + }, + { + "epoch": 0.21682434160061365, + "grad_norm": 133.61937825414734, + "learning_rate": 7.224431818181819e-06, + "loss": 4.0402, + "step": 2544 + }, + { + "epoch": 0.21690957129463906, + "grad_norm": 133.65414697715997, + "learning_rate": 7.227272727272729e-06, + "loss": 3.7099, + "step": 2545 + }, + { + "epoch": 0.21699480098866444, + "grad_norm": 126.85818510724653, + "learning_rate": 7.230113636363637e-06, + "loss": 4.1793, + "step": 2546 + }, + { + "epoch": 0.21708003068268986, + "grad_norm": 60.3639512598973, + "learning_rate": 7.232954545454546e-06, + "loss": 3.4211, + "step": 2547 + }, + { + "epoch": 0.21716526037671524, + "grad_norm": 110.10863791765253, + "learning_rate": 7.235795454545455e-06, + "loss": 3.9016, + "step": 2548 + }, + { + "epoch": 0.21725049007074065, + "grad_norm": 102.37349715510364, + "learning_rate": 7.238636363636364e-06, + "loss": 3.3665, + "step": 2549 + }, + { + "epoch": 0.21733571976476604, + "grad_norm": 108.43825515894345, + "learning_rate": 7.241477272727273e-06, + "loss": 3.1614, + "step": 2550 + }, + { + "epoch": 0.21742094945879145, + "grad_norm": 109.28076054504663, + "learning_rate": 7.244318181818183e-06, + "loss": 3.8252, + "step": 2551 + }, + { + "epoch": 0.21750617915281684, + "grad_norm": 95.59095575972837, + "learning_rate": 7.247159090909092e-06, + "loss": 3.5274, + "step": 2552 + }, + { + "epoch": 0.21759140884684225, + "grad_norm": 106.74164207140791, + "learning_rate": 7.25e-06, + "loss": 3.7886, + "step": 2553 + }, + { + "epoch": 0.21767663854086763, + "grad_norm": 143.76231830148555, + "learning_rate": 7.25284090909091e-06, + "loss": 3.9452, + "step": 2554 + }, + { + "epoch": 0.21776186823489305, + "grad_norm": 198.0740078725449, + "learning_rate": 7.255681818181819e-06, + "loss": 6.1072, + "step": 2555 + }, + { + "epoch": 0.21784709792891843, + "grad_norm": 183.55793565192587, + "learning_rate": 7.258522727272728e-06, + "loss": 4.4756, + "step": 2556 + }, + { + "epoch": 0.21793232762294384, + "grad_norm": 77.00133233465674, + "learning_rate": 7.2613636363636375e-06, + "loss": 3.1906, + "step": 2557 + }, + { + "epoch": 0.21801755731696923, + "grad_norm": 111.01194380712698, + "learning_rate": 7.264204545454546e-06, + "loss": 3.7951, + "step": 2558 + }, + { + "epoch": 0.21810278701099464, + "grad_norm": 84.30431957406418, + "learning_rate": 7.267045454545455e-06, + "loss": 3.4919, + "step": 2559 + }, + { + "epoch": 0.21818801670502003, + "grad_norm": 74.78297313009338, + "learning_rate": 7.269886363636363e-06, + "loss": 2.9968, + "step": 2560 + }, + { + "epoch": 0.21827324639904544, + "grad_norm": 73.68455186346108, + "learning_rate": 7.272727272727273e-06, + "loss": 4.4492, + "step": 2561 + }, + { + "epoch": 0.21835847609307082, + "grad_norm": 132.25589411557925, + "learning_rate": 7.275568181818182e-06, + "loss": 4.6725, + "step": 2562 + }, + { + "epoch": 0.21844370578709624, + "grad_norm": 91.04039389131269, + "learning_rate": 7.278409090909091e-06, + "loss": 3.5933, + "step": 2563 + }, + { + "epoch": 0.21852893548112162, + "grad_norm": 183.96643443040514, + "learning_rate": 7.281250000000001e-06, + "loss": 5.4707, + "step": 2564 + }, + { + "epoch": 0.21861416517514703, + "grad_norm": 102.10296488570287, + "learning_rate": 7.28409090909091e-06, + "loss": 4.0583, + "step": 2565 + }, + { + "epoch": 0.21869939486917242, + "grad_norm": 98.45164813970466, + "learning_rate": 7.286931818181819e-06, + "loss": 3.1167, + "step": 2566 + }, + { + "epoch": 0.2187846245631978, + "grad_norm": 334.9308452410238, + "learning_rate": 7.2897727272727284e-06, + "loss": 4.432, + "step": 2567 + }, + { + "epoch": 0.21886985425722322, + "grad_norm": 108.29781442517, + "learning_rate": 7.2926136363636365e-06, + "loss": 3.8042, + "step": 2568 + }, + { + "epoch": 0.2189550839512486, + "grad_norm": 69.91497849579832, + "learning_rate": 7.2954545454545454e-06, + "loss": 3.2652, + "step": 2569 + }, + { + "epoch": 0.21904031364527402, + "grad_norm": 86.38218601050562, + "learning_rate": 7.298295454545455e-06, + "loss": 3.1601, + "step": 2570 + }, + { + "epoch": 0.2191255433392994, + "grad_norm": 74.32978013072842, + "learning_rate": 7.301136363636364e-06, + "loss": 2.9013, + "step": 2571 + }, + { + "epoch": 0.2192107730333248, + "grad_norm": 110.18095664559961, + "learning_rate": 7.303977272727273e-06, + "loss": 3.8541, + "step": 2572 + }, + { + "epoch": 0.2192960027273502, + "grad_norm": 108.83254293341724, + "learning_rate": 7.306818181818183e-06, + "loss": 3.9162, + "step": 2573 + }, + { + "epoch": 0.2193812324213756, + "grad_norm": 620.0374007982829, + "learning_rate": 7.309659090909092e-06, + "loss": 3.6096, + "step": 2574 + }, + { + "epoch": 0.219466462115401, + "grad_norm": 68.37017561082612, + "learning_rate": 7.3125e-06, + "loss": 4.2803, + "step": 2575 + }, + { + "epoch": 0.2195516918094264, + "grad_norm": 121.42926911045204, + "learning_rate": 7.31534090909091e-06, + "loss": 3.7469, + "step": 2576 + }, + { + "epoch": 0.2196369215034518, + "grad_norm": 92.6647928248673, + "learning_rate": 7.3181818181818186e-06, + "loss": 2.6624, + "step": 2577 + }, + { + "epoch": 0.2197221511974772, + "grad_norm": 124.04190410886604, + "learning_rate": 7.3210227272727275e-06, + "loss": 3.7847, + "step": 2578 + }, + { + "epoch": 0.2198073808915026, + "grad_norm": 158.636299354994, + "learning_rate": 7.323863636363637e-06, + "loss": 3.297, + "step": 2579 + }, + { + "epoch": 0.219892610585528, + "grad_norm": 90.61127673035132, + "learning_rate": 7.326704545454546e-06, + "loss": 4.2993, + "step": 2580 + }, + { + "epoch": 0.2199778402795534, + "grad_norm": 52.51168557513741, + "learning_rate": 7.329545454545455e-06, + "loss": 3.2905, + "step": 2581 + }, + { + "epoch": 0.2200630699735788, + "grad_norm": 97.90386927189047, + "learning_rate": 7.332386363636365e-06, + "loss": 3.9517, + "step": 2582 + }, + { + "epoch": 0.22014829966760419, + "grad_norm": 79.35428299087535, + "learning_rate": 7.335227272727273e-06, + "loss": 3.9529, + "step": 2583 + }, + { + "epoch": 0.2202335293616296, + "grad_norm": 95.99144070644192, + "learning_rate": 7.338068181818182e-06, + "loss": 3.3114, + "step": 2584 + }, + { + "epoch": 0.22031875905565498, + "grad_norm": 90.34769590187119, + "learning_rate": 7.340909090909092e-06, + "loss": 3.4621, + "step": 2585 + }, + { + "epoch": 0.2204039887496804, + "grad_norm": 161.01953144671958, + "learning_rate": 7.343750000000001e-06, + "loss": 5.0606, + "step": 2586 + }, + { + "epoch": 0.22048921844370578, + "grad_norm": 98.21242083586905, + "learning_rate": 7.3465909090909096e-06, + "loss": 3.6991, + "step": 2587 + }, + { + "epoch": 0.2205744481377312, + "grad_norm": 66.29096732853694, + "learning_rate": 7.349431818181819e-06, + "loss": 2.5426, + "step": 2588 + }, + { + "epoch": 0.22065967783175658, + "grad_norm": 209.7070665519131, + "learning_rate": 7.352272727272728e-06, + "loss": 4.2476, + "step": 2589 + }, + { + "epoch": 0.220744907525782, + "grad_norm": 99.26458519895102, + "learning_rate": 7.355113636363636e-06, + "loss": 3.551, + "step": 2590 + }, + { + "epoch": 0.22083013721980738, + "grad_norm": 41.696550338228306, + "learning_rate": 7.357954545454546e-06, + "loss": 2.3746, + "step": 2591 + }, + { + "epoch": 0.2209153669138328, + "grad_norm": 54.29772646223744, + "learning_rate": 7.360795454545455e-06, + "loss": 2.9424, + "step": 2592 + }, + { + "epoch": 0.22100059660785817, + "grad_norm": 103.7630948363584, + "learning_rate": 7.363636363636364e-06, + "loss": 3.7617, + "step": 2593 + }, + { + "epoch": 0.2210858263018836, + "grad_norm": 68.96270064027534, + "learning_rate": 7.366477272727274e-06, + "loss": 2.4908, + "step": 2594 + }, + { + "epoch": 0.22117105599590897, + "grad_norm": 72.94150135838588, + "learning_rate": 7.369318181818183e-06, + "loss": 3.3033, + "step": 2595 + }, + { + "epoch": 0.22125628568993438, + "grad_norm": 82.03841048700042, + "learning_rate": 7.372159090909092e-06, + "loss": 3.2314, + "step": 2596 + }, + { + "epoch": 0.22134151538395977, + "grad_norm": 105.04859999455638, + "learning_rate": 7.375000000000001e-06, + "loss": 3.5632, + "step": 2597 + }, + { + "epoch": 0.22142674507798518, + "grad_norm": 62.77930581097341, + "learning_rate": 7.3778409090909095e-06, + "loss": 3.6748, + "step": 2598 + }, + { + "epoch": 0.22151197477201057, + "grad_norm": 135.70517069953362, + "learning_rate": 7.380681818181818e-06, + "loss": 4.4632, + "step": 2599 + }, + { + "epoch": 0.22159720446603598, + "grad_norm": 113.38440349160403, + "learning_rate": 7.383522727272728e-06, + "loss": 4.4481, + "step": 2600 + }, + { + "epoch": 0.22168243416006136, + "grad_norm": 96.29654269776955, + "learning_rate": 7.386363636363637e-06, + "loss": 3.3535, + "step": 2601 + }, + { + "epoch": 0.22176766385408678, + "grad_norm": 254.19991055410222, + "learning_rate": 7.389204545454546e-06, + "loss": 4.4115, + "step": 2602 + }, + { + "epoch": 0.22185289354811216, + "grad_norm": 122.64294897417439, + "learning_rate": 7.392045454545456e-06, + "loss": 3.8369, + "step": 2603 + }, + { + "epoch": 0.22193812324213755, + "grad_norm": 598.490750156817, + "learning_rate": 7.394886363636365e-06, + "loss": 3.7691, + "step": 2604 + }, + { + "epoch": 0.22202335293616296, + "grad_norm": 81.7306543006714, + "learning_rate": 7.397727272727273e-06, + "loss": 3.1378, + "step": 2605 + }, + { + "epoch": 0.22210858263018834, + "grad_norm": 68.26388053444732, + "learning_rate": 7.400568181818183e-06, + "loss": 2.9762, + "step": 2606 + }, + { + "epoch": 0.22219381232421376, + "grad_norm": 82.38955947053923, + "learning_rate": 7.4034090909090915e-06, + "loss": 4.114, + "step": 2607 + }, + { + "epoch": 0.22227904201823914, + "grad_norm": 108.01327303234883, + "learning_rate": 7.4062500000000005e-06, + "loss": 3.1246, + "step": 2608 + }, + { + "epoch": 0.22236427171226456, + "grad_norm": 65.89464648611373, + "learning_rate": 7.40909090909091e-06, + "loss": 3.3103, + "step": 2609 + }, + { + "epoch": 0.22244950140628994, + "grad_norm": 118.11563812384324, + "learning_rate": 7.411931818181819e-06, + "loss": 3.8339, + "step": 2610 + }, + { + "epoch": 0.22253473110031535, + "grad_norm": 64.91517024596396, + "learning_rate": 7.414772727272728e-06, + "loss": 3.3362, + "step": 2611 + }, + { + "epoch": 0.22261996079434074, + "grad_norm": 112.49889725946117, + "learning_rate": 7.417613636363638e-06, + "loss": 3.6049, + "step": 2612 + }, + { + "epoch": 0.22270519048836615, + "grad_norm": 100.71435232630384, + "learning_rate": 7.420454545454546e-06, + "loss": 3.2231, + "step": 2613 + }, + { + "epoch": 0.22279042018239154, + "grad_norm": 172.41040968845329, + "learning_rate": 7.423295454545455e-06, + "loss": 3.4614, + "step": 2614 + }, + { + "epoch": 0.22287564987641695, + "grad_norm": 133.0632127483462, + "learning_rate": 7.426136363636364e-06, + "loss": 3.3455, + "step": 2615 + }, + { + "epoch": 0.22296087957044233, + "grad_norm": 50.41201833792012, + "learning_rate": 7.428977272727274e-06, + "loss": 3.0292, + "step": 2616 + }, + { + "epoch": 0.22304610926446775, + "grad_norm": 82.92005569494907, + "learning_rate": 7.4318181818181825e-06, + "loss": 3.7693, + "step": 2617 + }, + { + "epoch": 0.22313133895849313, + "grad_norm": 91.61667510662704, + "learning_rate": 7.434659090909091e-06, + "loss": 3.7129, + "step": 2618 + }, + { + "epoch": 0.22321656865251854, + "grad_norm": 80.97936339982833, + "learning_rate": 7.437500000000001e-06, + "loss": 3.966, + "step": 2619 + }, + { + "epoch": 0.22330179834654393, + "grad_norm": 53.19865090358604, + "learning_rate": 7.440340909090909e-06, + "loss": 3.3371, + "step": 2620 + }, + { + "epoch": 0.22338702804056934, + "grad_norm": 84.23484449985813, + "learning_rate": 7.443181818181818e-06, + "loss": 2.5283, + "step": 2621 + }, + { + "epoch": 0.22347225773459473, + "grad_norm": 107.89891269340917, + "learning_rate": 7.446022727272728e-06, + "loss": 4.0496, + "step": 2622 + }, + { + "epoch": 0.22355748742862014, + "grad_norm": 136.8746754390815, + "learning_rate": 7.448863636363637e-06, + "loss": 4.3666, + "step": 2623 + }, + { + "epoch": 0.22364271712264552, + "grad_norm": 74.63391194108367, + "learning_rate": 7.451704545454546e-06, + "loss": 4.2607, + "step": 2624 + }, + { + "epoch": 0.22372794681667094, + "grad_norm": 112.46457737133451, + "learning_rate": 7.454545454545456e-06, + "loss": 5.1449, + "step": 2625 + }, + { + "epoch": 0.22381317651069632, + "grad_norm": 62.0794525165367, + "learning_rate": 7.4573863636363646e-06, + "loss": 3.4766, + "step": 2626 + }, + { + "epoch": 0.22389840620472173, + "grad_norm": 46.07493607287515, + "learning_rate": 7.460227272727273e-06, + "loss": 2.4817, + "step": 2627 + }, + { + "epoch": 0.22398363589874712, + "grad_norm": 128.67504489355107, + "learning_rate": 7.463068181818182e-06, + "loss": 3.6949, + "step": 2628 + }, + { + "epoch": 0.22406886559277253, + "grad_norm": 44.052135200286834, + "learning_rate": 7.465909090909091e-06, + "loss": 3.2203, + "step": 2629 + }, + { + "epoch": 0.22415409528679792, + "grad_norm": 52.98048410080949, + "learning_rate": 7.46875e-06, + "loss": 3.2432, + "step": 2630 + }, + { + "epoch": 0.22423932498082333, + "grad_norm": 66.63934382194152, + "learning_rate": 7.47159090909091e-06, + "loss": 3.5095, + "step": 2631 + }, + { + "epoch": 0.22432455467484871, + "grad_norm": 89.67171387316425, + "learning_rate": 7.474431818181819e-06, + "loss": 2.9994, + "step": 2632 + }, + { + "epoch": 0.22440978436887413, + "grad_norm": 107.58492207940114, + "learning_rate": 7.477272727272727e-06, + "loss": 3.9829, + "step": 2633 + }, + { + "epoch": 0.2244950140628995, + "grad_norm": 76.27365061061663, + "learning_rate": 7.480113636363638e-06, + "loss": 3.1336, + "step": 2634 + }, + { + "epoch": 0.22458024375692492, + "grad_norm": 85.18972065245556, + "learning_rate": 7.482954545454546e-06, + "loss": 3.3432, + "step": 2635 + }, + { + "epoch": 0.2246654734509503, + "grad_norm": 56.45022410357928, + "learning_rate": 7.485795454545455e-06, + "loss": 2.736, + "step": 2636 + }, + { + "epoch": 0.22475070314497572, + "grad_norm": 158.85270333323808, + "learning_rate": 7.4886363636363645e-06, + "loss": 3.7242, + "step": 2637 + }, + { + "epoch": 0.2248359328390011, + "grad_norm": 54.6543377803036, + "learning_rate": 7.491477272727273e-06, + "loss": 3.2389, + "step": 2638 + }, + { + "epoch": 0.22492116253302652, + "grad_norm": 45.56012146282541, + "learning_rate": 7.494318181818182e-06, + "loss": 2.9839, + "step": 2639 + }, + { + "epoch": 0.2250063922270519, + "grad_norm": 75.11989958682643, + "learning_rate": 7.497159090909092e-06, + "loss": 3.304, + "step": 2640 + }, + { + "epoch": 0.2250916219210773, + "grad_norm": 113.4713092919264, + "learning_rate": 7.500000000000001e-06, + "loss": 4.3668, + "step": 2641 + }, + { + "epoch": 0.2251768516151027, + "grad_norm": 234.39067537569065, + "learning_rate": 7.502840909090909e-06, + "loss": 4.264, + "step": 2642 + }, + { + "epoch": 0.2252620813091281, + "grad_norm": 120.27859898473093, + "learning_rate": 7.505681818181819e-06, + "loss": 3.4734, + "step": 2643 + }, + { + "epoch": 0.2253473110031535, + "grad_norm": 38.3957266277573, + "learning_rate": 7.508522727272728e-06, + "loss": 1.691, + "step": 2644 + }, + { + "epoch": 0.22543254069717888, + "grad_norm": 142.1088175760175, + "learning_rate": 7.511363636363637e-06, + "loss": 3.801, + "step": 2645 + }, + { + "epoch": 0.2255177703912043, + "grad_norm": 184.1846846389528, + "learning_rate": 7.5142045454545465e-06, + "loss": 4.6093, + "step": 2646 + }, + { + "epoch": 0.22560300008522968, + "grad_norm": 134.7805170865722, + "learning_rate": 7.5170454545454555e-06, + "loss": 3.8506, + "step": 2647 + }, + { + "epoch": 0.2256882297792551, + "grad_norm": 73.24609629677957, + "learning_rate": 7.5198863636363635e-06, + "loss": 3.402, + "step": 2648 + }, + { + "epoch": 0.22577345947328048, + "grad_norm": 72.29332037579556, + "learning_rate": 7.522727272727274e-06, + "loss": 2.3959, + "step": 2649 + }, + { + "epoch": 0.2258586891673059, + "grad_norm": 363.3693943870859, + "learning_rate": 7.525568181818182e-06, + "loss": 4.9584, + "step": 2650 + }, + { + "epoch": 0.22594391886133128, + "grad_norm": 127.20364253677624, + "learning_rate": 7.528409090909091e-06, + "loss": 3.9943, + "step": 2651 + }, + { + "epoch": 0.2260291485553567, + "grad_norm": 153.44098706192486, + "learning_rate": 7.531250000000001e-06, + "loss": 3.7824, + "step": 2652 + }, + { + "epoch": 0.22611437824938208, + "grad_norm": 81.51150393626645, + "learning_rate": 7.53409090909091e-06, + "loss": 3.6638, + "step": 2653 + }, + { + "epoch": 0.2261996079434075, + "grad_norm": 129.34946802696163, + "learning_rate": 7.536931818181819e-06, + "loss": 3.8301, + "step": 2654 + }, + { + "epoch": 0.22628483763743287, + "grad_norm": 111.92776869451288, + "learning_rate": 7.539772727272729e-06, + "loss": 3.5619, + "step": 2655 + }, + { + "epoch": 0.22637006733145829, + "grad_norm": 81.72975561120751, + "learning_rate": 7.542613636363637e-06, + "loss": 3.7424, + "step": 2656 + }, + { + "epoch": 0.22645529702548367, + "grad_norm": 88.48725721847966, + "learning_rate": 7.545454545454546e-06, + "loss": 3.2763, + "step": 2657 + }, + { + "epoch": 0.22654052671950908, + "grad_norm": 175.4765877017633, + "learning_rate": 7.548295454545455e-06, + "loss": 4.2379, + "step": 2658 + }, + { + "epoch": 0.22662575641353447, + "grad_norm": 76.54049472469528, + "learning_rate": 7.551136363636364e-06, + "loss": 3.414, + "step": 2659 + }, + { + "epoch": 0.22671098610755988, + "grad_norm": 97.09811147912787, + "learning_rate": 7.553977272727273e-06, + "loss": 3.3416, + "step": 2660 + }, + { + "epoch": 0.22679621580158527, + "grad_norm": 70.79450201817552, + "learning_rate": 7.556818181818183e-06, + "loss": 3.5458, + "step": 2661 + }, + { + "epoch": 0.22688144549561068, + "grad_norm": 227.04870140788128, + "learning_rate": 7.559659090909092e-06, + "loss": 4.4798, + "step": 2662 + }, + { + "epoch": 0.22696667518963606, + "grad_norm": 49.45749454957066, + "learning_rate": 7.5625e-06, + "loss": 3.1627, + "step": 2663 + }, + { + "epoch": 0.22705190488366148, + "grad_norm": 149.31883539796272, + "learning_rate": 7.565340909090911e-06, + "loss": 4.9632, + "step": 2664 + }, + { + "epoch": 0.22713713457768686, + "grad_norm": 84.24131045315593, + "learning_rate": 7.568181818181819e-06, + "loss": 3.3715, + "step": 2665 + }, + { + "epoch": 0.22722236427171227, + "grad_norm": 417.7686850811436, + "learning_rate": 7.571022727272728e-06, + "loss": 3.7877, + "step": 2666 + }, + { + "epoch": 0.22730759396573766, + "grad_norm": 121.32481153746915, + "learning_rate": 7.5738636363636374e-06, + "loss": 3.9857, + "step": 2667 + }, + { + "epoch": 0.22739282365976307, + "grad_norm": 79.03916228193435, + "learning_rate": 7.576704545454546e-06, + "loss": 2.0664, + "step": 2668 + }, + { + "epoch": 0.22747805335378846, + "grad_norm": 100.03357654989435, + "learning_rate": 7.579545454545455e-06, + "loss": 4.2956, + "step": 2669 + }, + { + "epoch": 0.22756328304781387, + "grad_norm": 72.97883986409182, + "learning_rate": 7.582386363636363e-06, + "loss": 3.3555, + "step": 2670 + }, + { + "epoch": 0.22764851274183925, + "grad_norm": 150.896673999148, + "learning_rate": 7.585227272727273e-06, + "loss": 4.6932, + "step": 2671 + }, + { + "epoch": 0.22773374243586467, + "grad_norm": 161.10098008941452, + "learning_rate": 7.588068181818182e-06, + "loss": 3.4078, + "step": 2672 + }, + { + "epoch": 0.22781897212989005, + "grad_norm": 111.32468001538756, + "learning_rate": 7.590909090909091e-06, + "loss": 4.4016, + "step": 2673 + }, + { + "epoch": 0.22790420182391546, + "grad_norm": 92.9137765729974, + "learning_rate": 7.593750000000001e-06, + "loss": 3.2422, + "step": 2674 + }, + { + "epoch": 0.22798943151794085, + "grad_norm": 131.96902012906165, + "learning_rate": 7.59659090909091e-06, + "loss": 4.2174, + "step": 2675 + }, + { + "epoch": 0.22807466121196626, + "grad_norm": 51.80155219639679, + "learning_rate": 7.599431818181819e-06, + "loss": 3.11, + "step": 2676 + }, + { + "epoch": 0.22815989090599165, + "grad_norm": 74.37147244625811, + "learning_rate": 7.602272727272728e-06, + "loss": 3.5221, + "step": 2677 + }, + { + "epoch": 0.22824512060001706, + "grad_norm": 66.5829567215677, + "learning_rate": 7.6051136363636365e-06, + "loss": 3.0873, + "step": 2678 + }, + { + "epoch": 0.22833035029404244, + "grad_norm": 60.878182715032, + "learning_rate": 7.607954545454545e-06, + "loss": 3.6778, + "step": 2679 + }, + { + "epoch": 0.22841557998806783, + "grad_norm": 139.92083159728352, + "learning_rate": 7.610795454545455e-06, + "loss": 5.216, + "step": 2680 + }, + { + "epoch": 0.22850080968209324, + "grad_norm": 92.27539811462721, + "learning_rate": 7.613636363636364e-06, + "loss": 3.5015, + "step": 2681 + }, + { + "epoch": 0.22858603937611863, + "grad_norm": 81.21990107973565, + "learning_rate": 7.616477272727273e-06, + "loss": 3.493, + "step": 2682 + }, + { + "epoch": 0.22867126907014404, + "grad_norm": 74.42828706816674, + "learning_rate": 7.619318181818183e-06, + "loss": 3.9039, + "step": 2683 + }, + { + "epoch": 0.22875649876416942, + "grad_norm": 138.42299831609554, + "learning_rate": 7.622159090909092e-06, + "loss": 3.2569, + "step": 2684 + }, + { + "epoch": 0.22884172845819484, + "grad_norm": 154.19689419800983, + "learning_rate": 7.625e-06, + "loss": 5.0706, + "step": 2685 + }, + { + "epoch": 0.22892695815222022, + "grad_norm": 58.781122048017984, + "learning_rate": 7.62784090909091e-06, + "loss": 3.2763, + "step": 2686 + }, + { + "epoch": 0.22901218784624564, + "grad_norm": 161.0388874978416, + "learning_rate": 7.630681818181819e-06, + "loss": 4.6018, + "step": 2687 + }, + { + "epoch": 0.22909741754027102, + "grad_norm": 92.33101229886097, + "learning_rate": 7.633522727272727e-06, + "loss": 3.4601, + "step": 2688 + }, + { + "epoch": 0.22918264723429643, + "grad_norm": 97.31068575369382, + "learning_rate": 7.636363636363638e-06, + "loss": 3.4256, + "step": 2689 + }, + { + "epoch": 0.22926787692832182, + "grad_norm": 53.50048359596107, + "learning_rate": 7.639204545454546e-06, + "loss": 2.6307, + "step": 2690 + }, + { + "epoch": 0.22935310662234723, + "grad_norm": 104.05424674042229, + "learning_rate": 7.642045454545454e-06, + "loss": 2.3206, + "step": 2691 + }, + { + "epoch": 0.22943833631637262, + "grad_norm": 76.89073823811074, + "learning_rate": 7.644886363636364e-06, + "loss": 3.6929, + "step": 2692 + }, + { + "epoch": 0.22952356601039803, + "grad_norm": 67.24052394823259, + "learning_rate": 7.647727272727274e-06, + "loss": 3.283, + "step": 2693 + }, + { + "epoch": 0.2296087957044234, + "grad_norm": 81.94882660510339, + "learning_rate": 7.650568181818182e-06, + "loss": 3.6627, + "step": 2694 + }, + { + "epoch": 0.22969402539844883, + "grad_norm": 124.57194668159883, + "learning_rate": 7.653409090909092e-06, + "loss": 4.4312, + "step": 2695 + }, + { + "epoch": 0.2297792550924742, + "grad_norm": 106.99781276274702, + "learning_rate": 7.656250000000001e-06, + "loss": 3.3345, + "step": 2696 + }, + { + "epoch": 0.22986448478649962, + "grad_norm": 65.91869545095416, + "learning_rate": 7.65909090909091e-06, + "loss": 3.8809, + "step": 2697 + }, + { + "epoch": 0.229949714480525, + "grad_norm": 95.19156546917361, + "learning_rate": 7.66193181818182e-06, + "loss": 3.9804, + "step": 2698 + }, + { + "epoch": 0.23003494417455042, + "grad_norm": 118.39020478943316, + "learning_rate": 7.664772727272727e-06, + "loss": 4.347, + "step": 2699 + }, + { + "epoch": 0.2301201738685758, + "grad_norm": 122.75712703288943, + "learning_rate": 7.667613636363637e-06, + "loss": 3.5389, + "step": 2700 + }, + { + "epoch": 0.23020540356260122, + "grad_norm": 82.19901535687463, + "learning_rate": 7.670454545454547e-06, + "loss": 3.0307, + "step": 2701 + }, + { + "epoch": 0.2302906332566266, + "grad_norm": 225.9930788906456, + "learning_rate": 7.673295454545455e-06, + "loss": 4.5467, + "step": 2702 + }, + { + "epoch": 0.23037586295065202, + "grad_norm": 131.97192637367272, + "learning_rate": 7.676136363636363e-06, + "loss": 4.0121, + "step": 2703 + }, + { + "epoch": 0.2304610926446774, + "grad_norm": 101.76615544648052, + "learning_rate": 7.678977272727275e-06, + "loss": 3.6303, + "step": 2704 + }, + { + "epoch": 0.2305463223387028, + "grad_norm": 117.25922435121785, + "learning_rate": 7.681818181818183e-06, + "loss": 3.9601, + "step": 2705 + }, + { + "epoch": 0.2306315520327282, + "grad_norm": 175.11800691489987, + "learning_rate": 7.68465909090909e-06, + "loss": 4.1887, + "step": 2706 + }, + { + "epoch": 0.2307167817267536, + "grad_norm": 129.49436909201938, + "learning_rate": 7.6875e-06, + "loss": 4.0736, + "step": 2707 + }, + { + "epoch": 0.230802011420779, + "grad_norm": 148.06380603901385, + "learning_rate": 7.69034090909091e-06, + "loss": 4.7263, + "step": 2708 + }, + { + "epoch": 0.2308872411148044, + "grad_norm": 63.16992886472539, + "learning_rate": 7.693181818181818e-06, + "loss": 3.6096, + "step": 2709 + }, + { + "epoch": 0.2309724708088298, + "grad_norm": 67.78440422957054, + "learning_rate": 7.696022727272728e-06, + "loss": 3.1141, + "step": 2710 + }, + { + "epoch": 0.2310577005028552, + "grad_norm": 61.00608894077478, + "learning_rate": 7.698863636363638e-06, + "loss": 3.3921, + "step": 2711 + }, + { + "epoch": 0.2311429301968806, + "grad_norm": 120.53464908146412, + "learning_rate": 7.701704545454546e-06, + "loss": 4.2948, + "step": 2712 + }, + { + "epoch": 0.231228159890906, + "grad_norm": 138.45634150036219, + "learning_rate": 7.704545454545456e-06, + "loss": 4.3493, + "step": 2713 + }, + { + "epoch": 0.2313133895849314, + "grad_norm": 108.9209294965003, + "learning_rate": 7.707386363636364e-06, + "loss": 4.7107, + "step": 2714 + }, + { + "epoch": 0.2313986192789568, + "grad_norm": 40.32340422637816, + "learning_rate": 7.710227272727274e-06, + "loss": 3.0904, + "step": 2715 + }, + { + "epoch": 0.2314838489729822, + "grad_norm": 73.33974719377123, + "learning_rate": 7.713068181818183e-06, + "loss": 4.0481, + "step": 2716 + }, + { + "epoch": 0.23156907866700757, + "grad_norm": 107.55819539441987, + "learning_rate": 7.715909090909091e-06, + "loss": 3.8503, + "step": 2717 + }, + { + "epoch": 0.23165430836103298, + "grad_norm": 166.16190248142718, + "learning_rate": 7.71875e-06, + "loss": 4.6533, + "step": 2718 + }, + { + "epoch": 0.23173953805505837, + "grad_norm": 109.61455916519454, + "learning_rate": 7.721590909090911e-06, + "loss": 3.6061, + "step": 2719 + }, + { + "epoch": 0.23182476774908378, + "grad_norm": 99.60102986343186, + "learning_rate": 7.724431818181819e-06, + "loss": 3.9115, + "step": 2720 + }, + { + "epoch": 0.23190999744310917, + "grad_norm": 161.8934233555223, + "learning_rate": 7.727272727272727e-06, + "loss": 5.3012, + "step": 2721 + }, + { + "epoch": 0.23199522713713458, + "grad_norm": 69.85747866630962, + "learning_rate": 7.730113636363637e-06, + "loss": 3.7161, + "step": 2722 + }, + { + "epoch": 0.23208045683115996, + "grad_norm": 50.96279012761702, + "learning_rate": 7.732954545454547e-06, + "loss": 3.2323, + "step": 2723 + }, + { + "epoch": 0.23216568652518538, + "grad_norm": 168.1927194553905, + "learning_rate": 7.735795454545455e-06, + "loss": 5.1151, + "step": 2724 + }, + { + "epoch": 0.23225091621921076, + "grad_norm": 125.21360318968043, + "learning_rate": 7.738636363636363e-06, + "loss": 4.3379, + "step": 2725 + }, + { + "epoch": 0.23233614591323618, + "grad_norm": 406.05446780358017, + "learning_rate": 7.741477272727274e-06, + "loss": 2.6921, + "step": 2726 + }, + { + "epoch": 0.23242137560726156, + "grad_norm": 61.749820994014094, + "learning_rate": 7.744318181818182e-06, + "loss": 2.9084, + "step": 2727 + }, + { + "epoch": 0.23250660530128697, + "grad_norm": 92.95501357551692, + "learning_rate": 7.74715909090909e-06, + "loss": 3.7335, + "step": 2728 + }, + { + "epoch": 0.23259183499531236, + "grad_norm": 77.28243496970826, + "learning_rate": 7.75e-06, + "loss": 3.3197, + "step": 2729 + }, + { + "epoch": 0.23267706468933777, + "grad_norm": 212.528147083281, + "learning_rate": 7.75284090909091e-06, + "loss": 5.7048, + "step": 2730 + }, + { + "epoch": 0.23276229438336316, + "grad_norm": 97.81462154174928, + "learning_rate": 7.755681818181818e-06, + "loss": 4.0107, + "step": 2731 + }, + { + "epoch": 0.23284752407738857, + "grad_norm": 301.13929207526985, + "learning_rate": 7.758522727272728e-06, + "loss": 3.3094, + "step": 2732 + }, + { + "epoch": 0.23293275377141395, + "grad_norm": 96.29562475558807, + "learning_rate": 7.761363636363636e-06, + "loss": 2.5529, + "step": 2733 + }, + { + "epoch": 0.23301798346543937, + "grad_norm": 152.3117911493153, + "learning_rate": 7.764204545454546e-06, + "loss": 3.8211, + "step": 2734 + }, + { + "epoch": 0.23310321315946475, + "grad_norm": 68.892510591783, + "learning_rate": 7.767045454545456e-06, + "loss": 3.3935, + "step": 2735 + }, + { + "epoch": 0.23318844285349016, + "grad_norm": 105.5012699625677, + "learning_rate": 7.769886363636364e-06, + "loss": 3.0343, + "step": 2736 + }, + { + "epoch": 0.23327367254751555, + "grad_norm": 57.89016763514784, + "learning_rate": 7.772727272727273e-06, + "loss": 3.2058, + "step": 2737 + }, + { + "epoch": 0.23335890224154096, + "grad_norm": 108.62812052431474, + "learning_rate": 7.775568181818183e-06, + "loss": 3.7385, + "step": 2738 + }, + { + "epoch": 0.23344413193556635, + "grad_norm": 46.91994682169576, + "learning_rate": 7.778409090909091e-06, + "loss": 3.5963, + "step": 2739 + }, + { + "epoch": 0.23352936162959176, + "grad_norm": 433.9124457455167, + "learning_rate": 7.78125e-06, + "loss": 5.3718, + "step": 2740 + }, + { + "epoch": 0.23361459132361714, + "grad_norm": 45.65911238795724, + "learning_rate": 7.784090909090911e-06, + "loss": 2.4078, + "step": 2741 + }, + { + "epoch": 0.23369982101764256, + "grad_norm": 133.4383328875499, + "learning_rate": 7.786931818181819e-06, + "loss": 3.8366, + "step": 2742 + }, + { + "epoch": 0.23378505071166794, + "grad_norm": 115.88056233337767, + "learning_rate": 7.789772727272727e-06, + "loss": 3.7542, + "step": 2743 + }, + { + "epoch": 0.23387028040569335, + "grad_norm": 99.85724573473003, + "learning_rate": 7.792613636363637e-06, + "loss": 3.8596, + "step": 2744 + }, + { + "epoch": 0.23395551009971874, + "grad_norm": 136.11985577463713, + "learning_rate": 7.795454545454547e-06, + "loss": 3.3418, + "step": 2745 + }, + { + "epoch": 0.23404073979374415, + "grad_norm": 69.524539713955, + "learning_rate": 7.798295454545455e-06, + "loss": 3.3445, + "step": 2746 + }, + { + "epoch": 0.23412596948776954, + "grad_norm": 55.58194946167449, + "learning_rate": 7.801136363636364e-06, + "loss": 3.5222, + "step": 2747 + }, + { + "epoch": 0.23421119918179495, + "grad_norm": 133.30937246996842, + "learning_rate": 7.803977272727273e-06, + "loss": 3.7821, + "step": 2748 + }, + { + "epoch": 0.23429642887582033, + "grad_norm": 111.12970849369007, + "learning_rate": 7.806818181818182e-06, + "loss": 3.7445, + "step": 2749 + }, + { + "epoch": 0.23438165856984575, + "grad_norm": 129.9034879558492, + "learning_rate": 7.809659090909092e-06, + "loss": 3.2161, + "step": 2750 + }, + { + "epoch": 0.23446688826387113, + "grad_norm": 67.30142274061042, + "learning_rate": 7.8125e-06, + "loss": 3.9229, + "step": 2751 + }, + { + "epoch": 0.23455211795789654, + "grad_norm": 47.019728128852186, + "learning_rate": 7.81534090909091e-06, + "loss": 3.7761, + "step": 2752 + }, + { + "epoch": 0.23463734765192193, + "grad_norm": 138.11865714549424, + "learning_rate": 7.81818181818182e-06, + "loss": 4.1165, + "step": 2753 + }, + { + "epoch": 0.23472257734594734, + "grad_norm": 103.9721293648649, + "learning_rate": 7.821022727272728e-06, + "loss": 4.2481, + "step": 2754 + }, + { + "epoch": 0.23480780703997273, + "grad_norm": 113.26144793528793, + "learning_rate": 7.823863636363636e-06, + "loss": 3.5213, + "step": 2755 + }, + { + "epoch": 0.2348930367339981, + "grad_norm": 84.3880330256138, + "learning_rate": 7.826704545454546e-06, + "loss": 3.4256, + "step": 2756 + }, + { + "epoch": 0.23497826642802352, + "grad_norm": 76.36680577014498, + "learning_rate": 7.829545454545455e-06, + "loss": 3.8266, + "step": 2757 + }, + { + "epoch": 0.2350634961220489, + "grad_norm": 97.96906371130987, + "learning_rate": 7.832386363636364e-06, + "loss": 3.9796, + "step": 2758 + }, + { + "epoch": 0.23514872581607432, + "grad_norm": 76.91464787813726, + "learning_rate": 7.835227272727273e-06, + "loss": 2.0586, + "step": 2759 + }, + { + "epoch": 0.2352339555100997, + "grad_norm": 100.00258089450288, + "learning_rate": 7.838068181818183e-06, + "loss": 4.2147, + "step": 2760 + }, + { + "epoch": 0.23531918520412512, + "grad_norm": 90.10462038007545, + "learning_rate": 7.840909090909091e-06, + "loss": 3.5383, + "step": 2761 + }, + { + "epoch": 0.2354044148981505, + "grad_norm": 114.18167832124234, + "learning_rate": 7.843750000000001e-06, + "loss": 2.8186, + "step": 2762 + }, + { + "epoch": 0.23548964459217592, + "grad_norm": 92.23762551173971, + "learning_rate": 7.846590909090909e-06, + "loss": 3.887, + "step": 2763 + }, + { + "epoch": 0.2355748742862013, + "grad_norm": 118.69134803416713, + "learning_rate": 7.849431818181819e-06, + "loss": 3.8729, + "step": 2764 + }, + { + "epoch": 0.23566010398022672, + "grad_norm": 64.16344179488567, + "learning_rate": 7.852272727272729e-06, + "loss": 3.0804, + "step": 2765 + }, + { + "epoch": 0.2357453336742521, + "grad_norm": 56.80871781854174, + "learning_rate": 7.855113636363637e-06, + "loss": 3.3567, + "step": 2766 + }, + { + "epoch": 0.2358305633682775, + "grad_norm": 50.914616713383225, + "learning_rate": 7.857954545454546e-06, + "loss": 3.1401, + "step": 2767 + }, + { + "epoch": 0.2359157930623029, + "grad_norm": 41.38254633410235, + "learning_rate": 7.860795454545456e-06, + "loss": 2.7171, + "step": 2768 + }, + { + "epoch": 0.2360010227563283, + "grad_norm": 80.21938916612022, + "learning_rate": 7.863636363636364e-06, + "loss": 3.6442, + "step": 2769 + }, + { + "epoch": 0.2360862524503537, + "grad_norm": 56.0287003756317, + "learning_rate": 7.866477272727272e-06, + "loss": 3.2046, + "step": 2770 + }, + { + "epoch": 0.2361714821443791, + "grad_norm": 164.097599189649, + "learning_rate": 7.869318181818182e-06, + "loss": 3.0742, + "step": 2771 + }, + { + "epoch": 0.2362567118384045, + "grad_norm": 120.43811261876678, + "learning_rate": 7.872159090909092e-06, + "loss": 3.7836, + "step": 2772 + }, + { + "epoch": 0.2363419415324299, + "grad_norm": 137.73550866939962, + "learning_rate": 7.875e-06, + "loss": 3.2476, + "step": 2773 + }, + { + "epoch": 0.2364271712264553, + "grad_norm": 56.9004676023307, + "learning_rate": 7.87784090909091e-06, + "loss": 3.0837, + "step": 2774 + }, + { + "epoch": 0.2365124009204807, + "grad_norm": 172.1097783585311, + "learning_rate": 7.88068181818182e-06, + "loss": 4.0163, + "step": 2775 + }, + { + "epoch": 0.2365976306145061, + "grad_norm": 83.92560824047199, + "learning_rate": 7.883522727272728e-06, + "loss": 3.6364, + "step": 2776 + }, + { + "epoch": 0.2366828603085315, + "grad_norm": 134.5696563262896, + "learning_rate": 7.886363636363637e-06, + "loss": 3.9728, + "step": 2777 + }, + { + "epoch": 0.23676809000255689, + "grad_norm": 156.19396111939193, + "learning_rate": 7.889204545454545e-06, + "loss": 4.7174, + "step": 2778 + }, + { + "epoch": 0.2368533196965823, + "grad_norm": 114.55594750271061, + "learning_rate": 7.892045454545455e-06, + "loss": 2.5382, + "step": 2779 + }, + { + "epoch": 0.23693854939060768, + "grad_norm": 79.8712722317394, + "learning_rate": 7.894886363636363e-06, + "loss": 3.5073, + "step": 2780 + }, + { + "epoch": 0.2370237790846331, + "grad_norm": 76.95570553789777, + "learning_rate": 7.897727272727273e-06, + "loss": 3.6022, + "step": 2781 + }, + { + "epoch": 0.23710900877865848, + "grad_norm": 128.664013181736, + "learning_rate": 7.900568181818183e-06, + "loss": 3.6155, + "step": 2782 + }, + { + "epoch": 0.2371942384726839, + "grad_norm": 59.86315059878006, + "learning_rate": 7.903409090909091e-06, + "loss": 3.3667, + "step": 2783 + }, + { + "epoch": 0.23727946816670928, + "grad_norm": 165.46000859804911, + "learning_rate": 7.90625e-06, + "loss": 4.2771, + "step": 2784 + }, + { + "epoch": 0.2373646978607347, + "grad_norm": 62.72070374000012, + "learning_rate": 7.909090909090909e-06, + "loss": 3.0389, + "step": 2785 + }, + { + "epoch": 0.23744992755476008, + "grad_norm": 134.06845262314832, + "learning_rate": 7.911931818181819e-06, + "loss": 4.5167, + "step": 2786 + }, + { + "epoch": 0.2375351572487855, + "grad_norm": 165.48270490440885, + "learning_rate": 7.914772727272728e-06, + "loss": 4.9725, + "step": 2787 + }, + { + "epoch": 0.23762038694281087, + "grad_norm": 78.46864607231082, + "learning_rate": 7.917613636363636e-06, + "loss": 3.4589, + "step": 2788 + }, + { + "epoch": 0.2377056166368363, + "grad_norm": 68.73059847249995, + "learning_rate": 7.920454545454546e-06, + "loss": 3.5814, + "step": 2789 + }, + { + "epoch": 0.23779084633086167, + "grad_norm": 56.365974932947374, + "learning_rate": 7.923295454545456e-06, + "loss": 3.0252, + "step": 2790 + }, + { + "epoch": 0.23787607602488708, + "grad_norm": 57.02035792631159, + "learning_rate": 7.926136363636364e-06, + "loss": 4.0214, + "step": 2791 + }, + { + "epoch": 0.23796130571891247, + "grad_norm": 71.9104564719414, + "learning_rate": 7.928977272727272e-06, + "loss": 4.4719, + "step": 2792 + }, + { + "epoch": 0.23804653541293785, + "grad_norm": 45.48331120755991, + "learning_rate": 7.931818181818182e-06, + "loss": 2.7819, + "step": 2793 + }, + { + "epoch": 0.23813176510696327, + "grad_norm": 131.6590752357438, + "learning_rate": 7.934659090909092e-06, + "loss": 4.1203, + "step": 2794 + }, + { + "epoch": 0.23821699480098865, + "grad_norm": 58.55927234160015, + "learning_rate": 7.9375e-06, + "loss": 3.1476, + "step": 2795 + }, + { + "epoch": 0.23830222449501406, + "grad_norm": 88.08419527093842, + "learning_rate": 7.94034090909091e-06, + "loss": 4.0437, + "step": 2796 + }, + { + "epoch": 0.23838745418903945, + "grad_norm": 62.182136728130985, + "learning_rate": 7.94318181818182e-06, + "loss": 3.6778, + "step": 2797 + }, + { + "epoch": 0.23847268388306486, + "grad_norm": 229.92385208957276, + "learning_rate": 7.946022727272727e-06, + "loss": 4.53, + "step": 2798 + }, + { + "epoch": 0.23855791357709025, + "grad_norm": 128.96660122244052, + "learning_rate": 7.948863636363637e-06, + "loss": 3.9485, + "step": 2799 + }, + { + "epoch": 0.23864314327111566, + "grad_norm": 88.53458363636034, + "learning_rate": 7.951704545454545e-06, + "loss": 3.5086, + "step": 2800 + }, + { + "epoch": 0.23872837296514104, + "grad_norm": 55.15761440907985, + "learning_rate": 7.954545454545455e-06, + "loss": 3.802, + "step": 2801 + }, + { + "epoch": 0.23881360265916646, + "grad_norm": 71.93266829359676, + "learning_rate": 7.957386363636365e-06, + "loss": 4.0723, + "step": 2802 + }, + { + "epoch": 0.23889883235319184, + "grad_norm": 147.9783081615919, + "learning_rate": 7.960227272727273e-06, + "loss": 4.1545, + "step": 2803 + }, + { + "epoch": 0.23898406204721725, + "grad_norm": 61.65790667584698, + "learning_rate": 7.963068181818183e-06, + "loss": 3.1679, + "step": 2804 + }, + { + "epoch": 0.23906929174124264, + "grad_norm": 66.98614592560777, + "learning_rate": 7.965909090909092e-06, + "loss": 2.6574, + "step": 2805 + }, + { + "epoch": 0.23915452143526805, + "grad_norm": 134.78774979513452, + "learning_rate": 7.96875e-06, + "loss": 4.0537, + "step": 2806 + }, + { + "epoch": 0.23923975112929344, + "grad_norm": 96.1119517980222, + "learning_rate": 7.971590909090909e-06, + "loss": 4.1829, + "step": 2807 + }, + { + "epoch": 0.23932498082331885, + "grad_norm": 110.89128365271876, + "learning_rate": 7.974431818181818e-06, + "loss": 3.5189, + "step": 2808 + }, + { + "epoch": 0.23941021051734424, + "grad_norm": 60.7271203536704, + "learning_rate": 7.977272727272728e-06, + "loss": 2.9661, + "step": 2809 + }, + { + "epoch": 0.23949544021136965, + "grad_norm": 74.5261262447709, + "learning_rate": 7.980113636363636e-06, + "loss": 3.5456, + "step": 2810 + }, + { + "epoch": 0.23958066990539503, + "grad_norm": 101.6732183064983, + "learning_rate": 7.982954545454546e-06, + "loss": 3.8361, + "step": 2811 + }, + { + "epoch": 0.23966589959942045, + "grad_norm": 83.76968424333587, + "learning_rate": 7.985795454545456e-06, + "loss": 3.5607, + "step": 2812 + }, + { + "epoch": 0.23975112929344583, + "grad_norm": 220.48699504731175, + "learning_rate": 7.988636363636364e-06, + "loss": 3.8044, + "step": 2813 + }, + { + "epoch": 0.23983635898747124, + "grad_norm": 58.533863856719805, + "learning_rate": 7.991477272727274e-06, + "loss": 2.7071, + "step": 2814 + }, + { + "epoch": 0.23992158868149663, + "grad_norm": 101.41796083679917, + "learning_rate": 7.994318181818182e-06, + "loss": 3.9322, + "step": 2815 + }, + { + "epoch": 0.24000681837552204, + "grad_norm": 162.33231924065845, + "learning_rate": 7.997159090909092e-06, + "loss": 5.1233, + "step": 2816 + }, + { + "epoch": 0.24009204806954743, + "grad_norm": 125.00430207609375, + "learning_rate": 8.000000000000001e-06, + "loss": 4.1448, + "step": 2817 + }, + { + "epoch": 0.24017727776357284, + "grad_norm": 116.21022107075298, + "learning_rate": 8.00284090909091e-06, + "loss": 2.96, + "step": 2818 + }, + { + "epoch": 0.24026250745759822, + "grad_norm": 91.08790501392438, + "learning_rate": 8.00568181818182e-06, + "loss": 3.8601, + "step": 2819 + }, + { + "epoch": 0.24034773715162364, + "grad_norm": 84.96713287459323, + "learning_rate": 8.008522727272729e-06, + "loss": 3.0839, + "step": 2820 + }, + { + "epoch": 0.24043296684564902, + "grad_norm": 74.87830958972052, + "learning_rate": 8.011363636363637e-06, + "loss": 3.1803, + "step": 2821 + }, + { + "epoch": 0.24051819653967443, + "grad_norm": 88.7370668285076, + "learning_rate": 8.014204545454545e-06, + "loss": 3.3006, + "step": 2822 + }, + { + "epoch": 0.24060342623369982, + "grad_norm": 146.02470362894937, + "learning_rate": 8.017045454545455e-06, + "loss": 3.8609, + "step": 2823 + }, + { + "epoch": 0.24068865592772523, + "grad_norm": 366.9009610675342, + "learning_rate": 8.019886363636365e-06, + "loss": 4.4612, + "step": 2824 + }, + { + "epoch": 0.24077388562175062, + "grad_norm": 96.46822461723113, + "learning_rate": 8.022727272727273e-06, + "loss": 3.2155, + "step": 2825 + }, + { + "epoch": 0.24085911531577603, + "grad_norm": 649.8658006683114, + "learning_rate": 8.025568181818183e-06, + "loss": 5.6051, + "step": 2826 + }, + { + "epoch": 0.24094434500980141, + "grad_norm": 119.47657471543559, + "learning_rate": 8.028409090909092e-06, + "loss": 4.6841, + "step": 2827 + }, + { + "epoch": 0.24102957470382683, + "grad_norm": 213.45920398940453, + "learning_rate": 8.03125e-06, + "loss": 5.5656, + "step": 2828 + }, + { + "epoch": 0.2411148043978522, + "grad_norm": 157.13988536487665, + "learning_rate": 8.03409090909091e-06, + "loss": 4.9244, + "step": 2829 + }, + { + "epoch": 0.2412000340918776, + "grad_norm": 117.61946570590605, + "learning_rate": 8.036931818181818e-06, + "loss": 3.3837, + "step": 2830 + }, + { + "epoch": 0.241285263785903, + "grad_norm": 276.0746123657444, + "learning_rate": 8.039772727272728e-06, + "loss": 4.264, + "step": 2831 + }, + { + "epoch": 0.2413704934799284, + "grad_norm": 67.33379072403172, + "learning_rate": 8.042613636363638e-06, + "loss": 3.507, + "step": 2832 + }, + { + "epoch": 0.2414557231739538, + "grad_norm": 83.84850556145868, + "learning_rate": 8.045454545454546e-06, + "loss": 3.3395, + "step": 2833 + }, + { + "epoch": 0.2415409528679792, + "grad_norm": 67.83835140744212, + "learning_rate": 8.048295454545456e-06, + "loss": 3.7378, + "step": 2834 + }, + { + "epoch": 0.2416261825620046, + "grad_norm": 270.18615506459207, + "learning_rate": 8.051136363636364e-06, + "loss": 3.997, + "step": 2835 + }, + { + "epoch": 0.24171141225603, + "grad_norm": 57.35308665345135, + "learning_rate": 8.053977272727274e-06, + "loss": 3.6769, + "step": 2836 + }, + { + "epoch": 0.2417966419500554, + "grad_norm": 68.00566051535664, + "learning_rate": 8.056818181818182e-06, + "loss": 3.5853, + "step": 2837 + }, + { + "epoch": 0.2418818716440808, + "grad_norm": 123.51687209540894, + "learning_rate": 8.059659090909091e-06, + "loss": 4.0761, + "step": 2838 + }, + { + "epoch": 0.2419671013381062, + "grad_norm": 73.17495371191829, + "learning_rate": 8.062500000000001e-06, + "loss": 3.8039, + "step": 2839 + }, + { + "epoch": 0.24205233103213158, + "grad_norm": 123.32570148307711, + "learning_rate": 8.06534090909091e-06, + "loss": 4.0215, + "step": 2840 + }, + { + "epoch": 0.242137560726157, + "grad_norm": 48.597840559234974, + "learning_rate": 8.068181818181819e-06, + "loss": 2.6716, + "step": 2841 + }, + { + "epoch": 0.24222279042018238, + "grad_norm": 135.54447822847172, + "learning_rate": 8.071022727272729e-06, + "loss": 2.7959, + "step": 2842 + }, + { + "epoch": 0.2423080201142078, + "grad_norm": 245.81846745951307, + "learning_rate": 8.073863636363637e-06, + "loss": 5.0952, + "step": 2843 + }, + { + "epoch": 0.24239324980823318, + "grad_norm": 110.6582470478801, + "learning_rate": 8.076704545454545e-06, + "loss": 4.2173, + "step": 2844 + }, + { + "epoch": 0.2424784795022586, + "grad_norm": 56.322670506722666, + "learning_rate": 8.079545454545455e-06, + "loss": 3.9653, + "step": 2845 + }, + { + "epoch": 0.24256370919628398, + "grad_norm": 75.02272068211025, + "learning_rate": 8.082386363636365e-06, + "loss": 3.3811, + "step": 2846 + }, + { + "epoch": 0.2426489388903094, + "grad_norm": 98.52027468290383, + "learning_rate": 8.085227272727273e-06, + "loss": 4.3677, + "step": 2847 + }, + { + "epoch": 0.24273416858433478, + "grad_norm": 109.06152688146028, + "learning_rate": 8.088068181818182e-06, + "loss": 4.1576, + "step": 2848 + }, + { + "epoch": 0.2428193982783602, + "grad_norm": 144.14032647058426, + "learning_rate": 8.090909090909092e-06, + "loss": 4.2543, + "step": 2849 + }, + { + "epoch": 0.24290462797238557, + "grad_norm": 53.29272021526653, + "learning_rate": 8.09375e-06, + "loss": 3.4612, + "step": 2850 + }, + { + "epoch": 0.24298985766641099, + "grad_norm": 50.24851664840546, + "learning_rate": 8.09659090909091e-06, + "loss": 2.8262, + "step": 2851 + }, + { + "epoch": 0.24307508736043637, + "grad_norm": 75.24009140965073, + "learning_rate": 8.099431818181818e-06, + "loss": 3.8971, + "step": 2852 + }, + { + "epoch": 0.24316031705446178, + "grad_norm": 66.24244524048898, + "learning_rate": 8.102272727272728e-06, + "loss": 3.3596, + "step": 2853 + }, + { + "epoch": 0.24324554674848717, + "grad_norm": 83.06229360469145, + "learning_rate": 8.105113636363638e-06, + "loss": 3.5367, + "step": 2854 + }, + { + "epoch": 0.24333077644251258, + "grad_norm": 65.05992722715416, + "learning_rate": 8.107954545454546e-06, + "loss": 2.6751, + "step": 2855 + }, + { + "epoch": 0.24341600613653797, + "grad_norm": 130.6427801820005, + "learning_rate": 8.110795454545455e-06, + "loss": 4.2243, + "step": 2856 + }, + { + "epoch": 0.24350123583056338, + "grad_norm": 91.68166128710732, + "learning_rate": 8.113636363636365e-06, + "loss": 4.0941, + "step": 2857 + }, + { + "epoch": 0.24358646552458876, + "grad_norm": 132.73625554324087, + "learning_rate": 8.116477272727273e-06, + "loss": 4.5954, + "step": 2858 + }, + { + "epoch": 0.24367169521861418, + "grad_norm": 89.01479452653531, + "learning_rate": 8.119318181818181e-06, + "loss": 3.8934, + "step": 2859 + }, + { + "epoch": 0.24375692491263956, + "grad_norm": 50.64629754355463, + "learning_rate": 8.122159090909091e-06, + "loss": 3.4697, + "step": 2860 + }, + { + "epoch": 0.24384215460666497, + "grad_norm": 132.9323101307325, + "learning_rate": 8.125000000000001e-06, + "loss": 2.8472, + "step": 2861 + }, + { + "epoch": 0.24392738430069036, + "grad_norm": 429.4636577728742, + "learning_rate": 8.127840909090909e-06, + "loss": 5.1912, + "step": 2862 + }, + { + "epoch": 0.24401261399471577, + "grad_norm": 46.89867048662866, + "learning_rate": 8.130681818181819e-06, + "loss": 3.5228, + "step": 2863 + }, + { + "epoch": 0.24409784368874116, + "grad_norm": 75.94378042065237, + "learning_rate": 8.133522727272729e-06, + "loss": 3.1642, + "step": 2864 + }, + { + "epoch": 0.24418307338276657, + "grad_norm": 45.934238767079094, + "learning_rate": 8.136363636363637e-06, + "loss": 3.4559, + "step": 2865 + }, + { + "epoch": 0.24426830307679195, + "grad_norm": 182.82048402003198, + "learning_rate": 8.139204545454546e-06, + "loss": 3.928, + "step": 2866 + }, + { + "epoch": 0.24435353277081737, + "grad_norm": 469.48115734851643, + "learning_rate": 8.142045454545455e-06, + "loss": 3.5812, + "step": 2867 + }, + { + "epoch": 0.24443876246484275, + "grad_norm": 65.25877548486751, + "learning_rate": 8.144886363636364e-06, + "loss": 3.7875, + "step": 2868 + }, + { + "epoch": 0.24452399215886814, + "grad_norm": 59.87633311887026, + "learning_rate": 8.147727272727274e-06, + "loss": 3.332, + "step": 2869 + }, + { + "epoch": 0.24460922185289355, + "grad_norm": 53.6920718889996, + "learning_rate": 8.150568181818182e-06, + "loss": 2.8762, + "step": 2870 + }, + { + "epoch": 0.24469445154691893, + "grad_norm": 72.45588427195051, + "learning_rate": 8.153409090909092e-06, + "loss": 3.4227, + "step": 2871 + }, + { + "epoch": 0.24477968124094435, + "grad_norm": 356.5274022987035, + "learning_rate": 8.156250000000002e-06, + "loss": 3.8341, + "step": 2872 + }, + { + "epoch": 0.24486491093496973, + "grad_norm": 77.13034658876019, + "learning_rate": 8.15909090909091e-06, + "loss": 3.8682, + "step": 2873 + }, + { + "epoch": 0.24495014062899514, + "grad_norm": 131.68429102055237, + "learning_rate": 8.161931818181818e-06, + "loss": 3.5099, + "step": 2874 + }, + { + "epoch": 0.24503537032302053, + "grad_norm": 77.21353652499715, + "learning_rate": 8.164772727272728e-06, + "loss": 4.1698, + "step": 2875 + }, + { + "epoch": 0.24512060001704594, + "grad_norm": 79.50262541530282, + "learning_rate": 8.167613636363637e-06, + "loss": 3.6625, + "step": 2876 + }, + { + "epoch": 0.24520582971107133, + "grad_norm": 79.25590895299798, + "learning_rate": 8.170454545454546e-06, + "loss": 2.7918, + "step": 2877 + }, + { + "epoch": 0.24529105940509674, + "grad_norm": 100.1752009890791, + "learning_rate": 8.173295454545455e-06, + "loss": 4.3611, + "step": 2878 + }, + { + "epoch": 0.24537628909912212, + "grad_norm": 77.6684999624379, + "learning_rate": 8.176136363636365e-06, + "loss": 3.8305, + "step": 2879 + }, + { + "epoch": 0.24546151879314754, + "grad_norm": 109.77932754248513, + "learning_rate": 8.178977272727273e-06, + "loss": 3.3336, + "step": 2880 + }, + { + "epoch": 0.24554674848717292, + "grad_norm": 89.70617731625633, + "learning_rate": 8.181818181818183e-06, + "loss": 3.74, + "step": 2881 + }, + { + "epoch": 0.24563197818119833, + "grad_norm": 108.36176171286397, + "learning_rate": 8.184659090909091e-06, + "loss": 3.8657, + "step": 2882 + }, + { + "epoch": 0.24571720787522372, + "grad_norm": 125.02087425570443, + "learning_rate": 8.1875e-06, + "loss": 2.7784, + "step": 2883 + }, + { + "epoch": 0.24580243756924913, + "grad_norm": 127.99050967323093, + "learning_rate": 8.19034090909091e-06, + "loss": 3.4637, + "step": 2884 + }, + { + "epoch": 0.24588766726327452, + "grad_norm": 73.16721096993672, + "learning_rate": 8.193181818181819e-06, + "loss": 3.3071, + "step": 2885 + }, + { + "epoch": 0.24597289695729993, + "grad_norm": 173.62634476353205, + "learning_rate": 8.196022727272728e-06, + "loss": 5.0873, + "step": 2886 + }, + { + "epoch": 0.24605812665132532, + "grad_norm": 99.19435343573727, + "learning_rate": 8.198863636363638e-06, + "loss": 4.1667, + "step": 2887 + }, + { + "epoch": 0.24614335634535073, + "grad_norm": 83.65380977610283, + "learning_rate": 8.201704545454546e-06, + "loss": 3.1386, + "step": 2888 + }, + { + "epoch": 0.2462285860393761, + "grad_norm": 101.27693606358743, + "learning_rate": 8.204545454545454e-06, + "loss": 3.8102, + "step": 2889 + }, + { + "epoch": 0.24631381573340153, + "grad_norm": 53.04625288526474, + "learning_rate": 8.207386363636364e-06, + "loss": 4.1831, + "step": 2890 + }, + { + "epoch": 0.2463990454274269, + "grad_norm": 77.88981957556459, + "learning_rate": 8.210227272727274e-06, + "loss": 4.3739, + "step": 2891 + }, + { + "epoch": 0.24648427512145232, + "grad_norm": 133.87763265467197, + "learning_rate": 8.213068181818182e-06, + "loss": 3.8425, + "step": 2892 + }, + { + "epoch": 0.2465695048154777, + "grad_norm": 84.95574432580561, + "learning_rate": 8.215909090909092e-06, + "loss": 4.0589, + "step": 2893 + }, + { + "epoch": 0.24665473450950312, + "grad_norm": 58.59325232989107, + "learning_rate": 8.218750000000002e-06, + "loss": 3.8462, + "step": 2894 + }, + { + "epoch": 0.2467399642035285, + "grad_norm": 62.16297505423158, + "learning_rate": 8.22159090909091e-06, + "loss": 3.3874, + "step": 2895 + }, + { + "epoch": 0.24682519389755392, + "grad_norm": 71.22380733622181, + "learning_rate": 8.224431818181818e-06, + "loss": 3.6032, + "step": 2896 + }, + { + "epoch": 0.2469104235915793, + "grad_norm": 90.75016716531924, + "learning_rate": 8.227272727272728e-06, + "loss": 3.9517, + "step": 2897 + }, + { + "epoch": 0.24699565328560472, + "grad_norm": 43.91590457378036, + "learning_rate": 8.230113636363637e-06, + "loss": 3.0036, + "step": 2898 + }, + { + "epoch": 0.2470808829796301, + "grad_norm": 163.41009368634903, + "learning_rate": 8.232954545454545e-06, + "loss": 4.7894, + "step": 2899 + }, + { + "epoch": 0.2471661126736555, + "grad_norm": 71.01204742359202, + "learning_rate": 8.235795454545455e-06, + "loss": 3.4842, + "step": 2900 + }, + { + "epoch": 0.2472513423676809, + "grad_norm": 117.94161302864454, + "learning_rate": 8.238636363636365e-06, + "loss": 3.8354, + "step": 2901 + }, + { + "epoch": 0.2473365720617063, + "grad_norm": 237.10763247254343, + "learning_rate": 8.241477272727273e-06, + "loss": 4.6463, + "step": 2902 + }, + { + "epoch": 0.2474218017557317, + "grad_norm": 68.666793517204, + "learning_rate": 8.244318181818183e-06, + "loss": 3.7133, + "step": 2903 + }, + { + "epoch": 0.2475070314497571, + "grad_norm": 233.9906657107201, + "learning_rate": 8.247159090909091e-06, + "loss": 5.1638, + "step": 2904 + }, + { + "epoch": 0.2475922611437825, + "grad_norm": 67.07244914027218, + "learning_rate": 8.25e-06, + "loss": 3.2812, + "step": 2905 + }, + { + "epoch": 0.24767749083780788, + "grad_norm": 100.77372065791734, + "learning_rate": 8.25284090909091e-06, + "loss": 3.0076, + "step": 2906 + }, + { + "epoch": 0.2477627205318333, + "grad_norm": 115.68453519480481, + "learning_rate": 8.255681818181818e-06, + "loss": 4.0009, + "step": 2907 + }, + { + "epoch": 0.24784795022585868, + "grad_norm": 187.20609450924744, + "learning_rate": 8.258522727272728e-06, + "loss": 4.8026, + "step": 2908 + }, + { + "epoch": 0.2479331799198841, + "grad_norm": 175.82560115140714, + "learning_rate": 8.261363636363638e-06, + "loss": 3.5451, + "step": 2909 + }, + { + "epoch": 0.24801840961390947, + "grad_norm": 118.98384531411841, + "learning_rate": 8.264204545454546e-06, + "loss": 3.9136, + "step": 2910 + }, + { + "epoch": 0.2481036393079349, + "grad_norm": 58.07421021042808, + "learning_rate": 8.267045454545454e-06, + "loss": 3.0706, + "step": 2911 + }, + { + "epoch": 0.24818886900196027, + "grad_norm": 235.8132869132823, + "learning_rate": 8.269886363636364e-06, + "loss": 4.4743, + "step": 2912 + }, + { + "epoch": 0.24827409869598568, + "grad_norm": 41.87758186925683, + "learning_rate": 8.272727272727274e-06, + "loss": 2.4448, + "step": 2913 + }, + { + "epoch": 0.24835932839001107, + "grad_norm": 145.5916290640161, + "learning_rate": 8.275568181818182e-06, + "loss": 3.6972, + "step": 2914 + }, + { + "epoch": 0.24844455808403648, + "grad_norm": 45.4355478853946, + "learning_rate": 8.278409090909092e-06, + "loss": 3.4418, + "step": 2915 + }, + { + "epoch": 0.24852978777806187, + "grad_norm": 141.55621289618907, + "learning_rate": 8.281250000000001e-06, + "loss": 2.3397, + "step": 2916 + }, + { + "epoch": 0.24861501747208728, + "grad_norm": 69.637538409624, + "learning_rate": 8.28409090909091e-06, + "loss": 3.5555, + "step": 2917 + }, + { + "epoch": 0.24870024716611266, + "grad_norm": 176.52687375914473, + "learning_rate": 8.28693181818182e-06, + "loss": 5.3073, + "step": 2918 + }, + { + "epoch": 0.24878547686013808, + "grad_norm": 77.57057886450588, + "learning_rate": 8.289772727272727e-06, + "loss": 3.6452, + "step": 2919 + }, + { + "epoch": 0.24887070655416346, + "grad_norm": 47.159048942048145, + "learning_rate": 8.292613636363637e-06, + "loss": 3.2151, + "step": 2920 + }, + { + "epoch": 0.24895593624818887, + "grad_norm": 61.738798526809994, + "learning_rate": 8.295454545454547e-06, + "loss": 4.0424, + "step": 2921 + }, + { + "epoch": 0.24904116594221426, + "grad_norm": 67.21439197383835, + "learning_rate": 8.298295454545455e-06, + "loss": 4.0609, + "step": 2922 + }, + { + "epoch": 0.24912639563623967, + "grad_norm": 61.68191036964518, + "learning_rate": 8.301136363636365e-06, + "loss": 3.9022, + "step": 2923 + }, + { + "epoch": 0.24921162533026506, + "grad_norm": 52.421766472246475, + "learning_rate": 8.303977272727275e-06, + "loss": 3.156, + "step": 2924 + }, + { + "epoch": 0.24929685502429047, + "grad_norm": 101.83319007106526, + "learning_rate": 8.306818181818183e-06, + "loss": 4.1285, + "step": 2925 + }, + { + "epoch": 0.24938208471831586, + "grad_norm": 82.87602096587555, + "learning_rate": 8.30965909090909e-06, + "loss": 3.762, + "step": 2926 + }, + { + "epoch": 0.24946731441234127, + "grad_norm": 186.5278925804658, + "learning_rate": 8.3125e-06, + "loss": 4.7375, + "step": 2927 + }, + { + "epoch": 0.24955254410636665, + "grad_norm": 69.79610965744651, + "learning_rate": 8.31534090909091e-06, + "loss": 3.6304, + "step": 2928 + }, + { + "epoch": 0.24963777380039207, + "grad_norm": 30.40615546136222, + "learning_rate": 8.318181818181818e-06, + "loss": 2.3344, + "step": 2929 + }, + { + "epoch": 0.24972300349441745, + "grad_norm": 70.94419518887183, + "learning_rate": 8.321022727272728e-06, + "loss": 3.3183, + "step": 2930 + }, + { + "epoch": 0.24980823318844286, + "grad_norm": 105.59651580227363, + "learning_rate": 8.323863636363638e-06, + "loss": 3.3861, + "step": 2931 + }, + { + "epoch": 0.24989346288246825, + "grad_norm": 79.71433025286998, + "learning_rate": 8.326704545454546e-06, + "loss": 3.9898, + "step": 2932 + }, + { + "epoch": 0.24997869257649366, + "grad_norm": 47.27510631781818, + "learning_rate": 8.329545454545456e-06, + "loss": 3.7309, + "step": 2933 + }, + { + "epoch": 0.25006392227051905, + "grad_norm": 316.0516097119254, + "learning_rate": 8.332386363636364e-06, + "loss": 2.987, + "step": 2934 + }, + { + "epoch": 0.25014915196454446, + "grad_norm": 71.47242354908721, + "learning_rate": 8.335227272727274e-06, + "loss": 3.4114, + "step": 2935 + }, + { + "epoch": 0.25023438165856987, + "grad_norm": 95.79910328008546, + "learning_rate": 8.338068181818183e-06, + "loss": 3.9309, + "step": 2936 + }, + { + "epoch": 0.25031961135259523, + "grad_norm": 175.88368406923652, + "learning_rate": 8.340909090909091e-06, + "loss": 2.4607, + "step": 2937 + }, + { + "epoch": 0.25040484104662064, + "grad_norm": 84.59165894515918, + "learning_rate": 8.343750000000001e-06, + "loss": 3.5591, + "step": 2938 + }, + { + "epoch": 0.25049007074064605, + "grad_norm": 69.17899410665166, + "learning_rate": 8.346590909090911e-06, + "loss": 3.9173, + "step": 2939 + }, + { + "epoch": 0.25057530043467147, + "grad_norm": 134.52193824531378, + "learning_rate": 8.349431818181819e-06, + "loss": 4.6775, + "step": 2940 + }, + { + "epoch": 0.2506605301286968, + "grad_norm": 207.85501890044125, + "learning_rate": 8.352272727272727e-06, + "loss": 5.7833, + "step": 2941 + }, + { + "epoch": 0.25074575982272224, + "grad_norm": 93.13978884845639, + "learning_rate": 8.355113636363637e-06, + "loss": 3.577, + "step": 2942 + }, + { + "epoch": 0.25083098951674765, + "grad_norm": 113.1488136587508, + "learning_rate": 8.357954545454547e-06, + "loss": 3.604, + "step": 2943 + }, + { + "epoch": 0.250916219210773, + "grad_norm": 60.20034609931357, + "learning_rate": 8.360795454545455e-06, + "loss": 3.6241, + "step": 2944 + }, + { + "epoch": 0.2510014489047984, + "grad_norm": 128.49434261389416, + "learning_rate": 8.363636363636365e-06, + "loss": 4.5404, + "step": 2945 + }, + { + "epoch": 0.25108667859882383, + "grad_norm": 77.32578925737907, + "learning_rate": 8.366477272727274e-06, + "loss": 3.6053, + "step": 2946 + }, + { + "epoch": 0.25117190829284924, + "grad_norm": 104.77225667605819, + "learning_rate": 8.369318181818182e-06, + "loss": 2.824, + "step": 2947 + }, + { + "epoch": 0.2512571379868746, + "grad_norm": 176.10788855145418, + "learning_rate": 8.37215909090909e-06, + "loss": 3.1119, + "step": 2948 + }, + { + "epoch": 0.2513423676809, + "grad_norm": 70.16471700221368, + "learning_rate": 8.375e-06, + "loss": 4.1771, + "step": 2949 + }, + { + "epoch": 0.2514275973749254, + "grad_norm": 91.55342368752406, + "learning_rate": 8.37784090909091e-06, + "loss": 3.8266, + "step": 2950 + }, + { + "epoch": 0.25151282706895084, + "grad_norm": 58.23615628199592, + "learning_rate": 8.380681818181818e-06, + "loss": 2.6478, + "step": 2951 + }, + { + "epoch": 0.2515980567629762, + "grad_norm": 61.28695998256878, + "learning_rate": 8.383522727272728e-06, + "loss": 3.8755, + "step": 2952 + }, + { + "epoch": 0.2516832864570016, + "grad_norm": 54.76392682693907, + "learning_rate": 8.386363636363638e-06, + "loss": 2.9441, + "step": 2953 + }, + { + "epoch": 0.251768516151027, + "grad_norm": 142.06675426132767, + "learning_rate": 8.389204545454546e-06, + "loss": 4.9038, + "step": 2954 + }, + { + "epoch": 0.25185374584505243, + "grad_norm": 48.00312237194917, + "learning_rate": 8.392045454545456e-06, + "loss": 3.153, + "step": 2955 + }, + { + "epoch": 0.2519389755390778, + "grad_norm": 142.04128035164294, + "learning_rate": 8.394886363636364e-06, + "loss": 4.1652, + "step": 2956 + }, + { + "epoch": 0.2520242052331032, + "grad_norm": 152.34668005629567, + "learning_rate": 8.397727272727273e-06, + "loss": 2.7308, + "step": 2957 + }, + { + "epoch": 0.2521094349271286, + "grad_norm": 96.57347634653145, + "learning_rate": 8.400568181818183e-06, + "loss": 3.9034, + "step": 2958 + }, + { + "epoch": 0.25219466462115403, + "grad_norm": 134.85912797851407, + "learning_rate": 8.403409090909091e-06, + "loss": 4.5704, + "step": 2959 + }, + { + "epoch": 0.2522798943151794, + "grad_norm": 62.995283898438686, + "learning_rate": 8.406250000000001e-06, + "loss": 3.8119, + "step": 2960 + }, + { + "epoch": 0.2523651240092048, + "grad_norm": 50.317531900735815, + "learning_rate": 8.40909090909091e-06, + "loss": 3.4116, + "step": 2961 + }, + { + "epoch": 0.2524503537032302, + "grad_norm": 54.310088666649186, + "learning_rate": 8.411931818181819e-06, + "loss": 3.8431, + "step": 2962 + }, + { + "epoch": 0.2525355833972556, + "grad_norm": 114.35670117586224, + "learning_rate": 8.414772727272727e-06, + "loss": 3.1621, + "step": 2963 + }, + { + "epoch": 0.252620813091281, + "grad_norm": 101.53531450763232, + "learning_rate": 8.417613636363637e-06, + "loss": 4.1257, + "step": 2964 + }, + { + "epoch": 0.2527060427853064, + "grad_norm": 126.20830573201891, + "learning_rate": 8.420454545454547e-06, + "loss": 4.4194, + "step": 2965 + }, + { + "epoch": 0.2527912724793318, + "grad_norm": 543.0707782516548, + "learning_rate": 8.423295454545455e-06, + "loss": 2.5304, + "step": 2966 + }, + { + "epoch": 0.2528765021733572, + "grad_norm": 59.68244091361169, + "learning_rate": 8.426136363636364e-06, + "loss": 3.9143, + "step": 2967 + }, + { + "epoch": 0.2529617318673826, + "grad_norm": 64.91754683038317, + "learning_rate": 8.428977272727274e-06, + "loss": 3.9239, + "step": 2968 + }, + { + "epoch": 0.253046961561408, + "grad_norm": 64.30803588992585, + "learning_rate": 8.431818181818182e-06, + "loss": 3.4834, + "step": 2969 + }, + { + "epoch": 0.2531321912554334, + "grad_norm": 57.49089193755807, + "learning_rate": 8.434659090909092e-06, + "loss": 3.6465, + "step": 2970 + }, + { + "epoch": 0.2532174209494588, + "grad_norm": 82.77316262452383, + "learning_rate": 8.4375e-06, + "loss": 3.1897, + "step": 2971 + }, + { + "epoch": 0.2533026506434842, + "grad_norm": 103.5823572506321, + "learning_rate": 8.44034090909091e-06, + "loss": 2.8937, + "step": 2972 + }, + { + "epoch": 0.2533878803375096, + "grad_norm": 79.58890558887346, + "learning_rate": 8.44318181818182e-06, + "loss": 3.0548, + "step": 2973 + }, + { + "epoch": 0.253473110031535, + "grad_norm": 60.91873347464422, + "learning_rate": 8.446022727272728e-06, + "loss": 3.7686, + "step": 2974 + }, + { + "epoch": 0.2535583397255604, + "grad_norm": 99.0708301683515, + "learning_rate": 8.448863636363638e-06, + "loss": 3.3027, + "step": 2975 + }, + { + "epoch": 0.25364356941958577, + "grad_norm": 95.47281476118296, + "learning_rate": 8.451704545454547e-06, + "loss": 3.4824, + "step": 2976 + }, + { + "epoch": 0.2537287991136112, + "grad_norm": 89.72277519250852, + "learning_rate": 8.454545454545455e-06, + "loss": 3.6383, + "step": 2977 + }, + { + "epoch": 0.2538140288076366, + "grad_norm": 147.5808858866884, + "learning_rate": 8.457386363636363e-06, + "loss": 5.0003, + "step": 2978 + }, + { + "epoch": 0.253899258501662, + "grad_norm": 100.42056576662597, + "learning_rate": 8.460227272727273e-06, + "loss": 4.9906, + "step": 2979 + }, + { + "epoch": 0.25398448819568736, + "grad_norm": 61.23340711591481, + "learning_rate": 8.463068181818183e-06, + "loss": 4.1817, + "step": 2980 + }, + { + "epoch": 0.2540697178897128, + "grad_norm": 105.86850747385667, + "learning_rate": 8.465909090909091e-06, + "loss": 4.7014, + "step": 2981 + }, + { + "epoch": 0.2541549475837382, + "grad_norm": 112.22117864833405, + "learning_rate": 8.468750000000001e-06, + "loss": 4.1988, + "step": 2982 + }, + { + "epoch": 0.25424017727776355, + "grad_norm": 56.239519707552425, + "learning_rate": 8.47159090909091e-06, + "loss": 3.1932, + "step": 2983 + }, + { + "epoch": 0.25432540697178896, + "grad_norm": 84.88491622611575, + "learning_rate": 8.474431818181819e-06, + "loss": 3.2494, + "step": 2984 + }, + { + "epoch": 0.25441063666581437, + "grad_norm": 153.3645933949183, + "learning_rate": 8.477272727272729e-06, + "loss": 4.1553, + "step": 2985 + }, + { + "epoch": 0.2544958663598398, + "grad_norm": 124.76117419558572, + "learning_rate": 8.480113636363637e-06, + "loss": 4.9047, + "step": 2986 + }, + { + "epoch": 0.25458109605386514, + "grad_norm": 74.75511884240258, + "learning_rate": 8.482954545454546e-06, + "loss": 3.7742, + "step": 2987 + }, + { + "epoch": 0.25466632574789055, + "grad_norm": 133.38213613551156, + "learning_rate": 8.485795454545456e-06, + "loss": 4.7637, + "step": 2988 + }, + { + "epoch": 0.25475155544191597, + "grad_norm": 107.46302916020689, + "learning_rate": 8.488636363636364e-06, + "loss": 2.5074, + "step": 2989 + }, + { + "epoch": 0.2548367851359414, + "grad_norm": 205.40447180157997, + "learning_rate": 8.491477272727274e-06, + "loss": 3.9554, + "step": 2990 + }, + { + "epoch": 0.25492201482996674, + "grad_norm": 50.05066051274419, + "learning_rate": 8.494318181818184e-06, + "loss": 3.4854, + "step": 2991 + }, + { + "epoch": 0.25500724452399215, + "grad_norm": 75.61172171332706, + "learning_rate": 8.497159090909092e-06, + "loss": 3.6922, + "step": 2992 + }, + { + "epoch": 0.25509247421801756, + "grad_norm": 51.04585228396244, + "learning_rate": 8.5e-06, + "loss": 3.1415, + "step": 2993 + }, + { + "epoch": 0.255177703912043, + "grad_norm": 618.8710908672037, + "learning_rate": 8.50284090909091e-06, + "loss": 4.1372, + "step": 2994 + }, + { + "epoch": 0.25526293360606833, + "grad_norm": 74.6994102131257, + "learning_rate": 8.50568181818182e-06, + "loss": 4.3548, + "step": 2995 + }, + { + "epoch": 0.25534816330009374, + "grad_norm": 2574.618369218324, + "learning_rate": 8.508522727272728e-06, + "loss": 5.2665, + "step": 2996 + }, + { + "epoch": 0.25543339299411916, + "grad_norm": 80.06798257355895, + "learning_rate": 8.511363636363637e-06, + "loss": 4.7046, + "step": 2997 + }, + { + "epoch": 0.25551862268814457, + "grad_norm": 95.31113519342802, + "learning_rate": 8.514204545454547e-06, + "loss": 4.0544, + "step": 2998 + }, + { + "epoch": 0.2556038523821699, + "grad_norm": 83.54293737238737, + "learning_rate": 8.517045454545455e-06, + "loss": 3.2043, + "step": 2999 + }, + { + "epoch": 0.25568908207619534, + "grad_norm": 46.5325522023819, + "learning_rate": 8.519886363636363e-06, + "loss": 3.3556, + "step": 3000 + }, + { + "epoch": 0.25577431177022075, + "grad_norm": 242.8061256608409, + "learning_rate": 8.522727272727273e-06, + "loss": 3.9299, + "step": 3001 + }, + { + "epoch": 0.25585954146424617, + "grad_norm": 181.04552032424078, + "learning_rate": 8.525568181818183e-06, + "loss": 3.5843, + "step": 3002 + }, + { + "epoch": 0.2559447711582715, + "grad_norm": 90.95831757034478, + "learning_rate": 8.528409090909091e-06, + "loss": 3.4405, + "step": 3003 + }, + { + "epoch": 0.25603000085229694, + "grad_norm": 95.7561602712433, + "learning_rate": 8.53125e-06, + "loss": 3.8059, + "step": 3004 + }, + { + "epoch": 0.25611523054632235, + "grad_norm": 76.63020673983488, + "learning_rate": 8.53409090909091e-06, + "loss": 2.5183, + "step": 3005 + }, + { + "epoch": 0.25620046024034776, + "grad_norm": 114.87331469602036, + "learning_rate": 8.536931818181819e-06, + "loss": 3.5734, + "step": 3006 + }, + { + "epoch": 0.2562856899343731, + "grad_norm": 138.74310035422994, + "learning_rate": 8.539772727272728e-06, + "loss": 3.8351, + "step": 3007 + }, + { + "epoch": 0.25637091962839853, + "grad_norm": 80.80530827130266, + "learning_rate": 8.542613636363636e-06, + "loss": 3.7562, + "step": 3008 + }, + { + "epoch": 0.25645614932242394, + "grad_norm": 102.04741298657055, + "learning_rate": 8.545454545454546e-06, + "loss": 3.4524, + "step": 3009 + }, + { + "epoch": 0.25654137901644936, + "grad_norm": 109.17324411121827, + "learning_rate": 8.548295454545456e-06, + "loss": 4.1682, + "step": 3010 + }, + { + "epoch": 0.2566266087104747, + "grad_norm": 67.25160609330227, + "learning_rate": 8.551136363636364e-06, + "loss": 3.6122, + "step": 3011 + }, + { + "epoch": 0.2567118384045001, + "grad_norm": 73.35218508411234, + "learning_rate": 8.553977272727272e-06, + "loss": 2.9897, + "step": 3012 + }, + { + "epoch": 0.25679706809852554, + "grad_norm": 92.22501403557847, + "learning_rate": 8.556818181818184e-06, + "loss": 4.3412, + "step": 3013 + }, + { + "epoch": 0.25688229779255095, + "grad_norm": 158.9334336528952, + "learning_rate": 8.559659090909092e-06, + "loss": 4.2778, + "step": 3014 + }, + { + "epoch": 0.2569675274865763, + "grad_norm": 78.1609219544419, + "learning_rate": 8.5625e-06, + "loss": 3.5658, + "step": 3015 + }, + { + "epoch": 0.2570527571806017, + "grad_norm": 27.261528150828926, + "learning_rate": 8.56534090909091e-06, + "loss": 2.1218, + "step": 3016 + }, + { + "epoch": 0.25713798687462713, + "grad_norm": 89.83170851588872, + "learning_rate": 8.56818181818182e-06, + "loss": 4.5459, + "step": 3017 + }, + { + "epoch": 0.25722321656865255, + "grad_norm": 100.20655513955114, + "learning_rate": 8.571022727272727e-06, + "loss": 4.302, + "step": 3018 + }, + { + "epoch": 0.2573084462626779, + "grad_norm": 51.49248260991803, + "learning_rate": 8.573863636363637e-06, + "loss": 3.7805, + "step": 3019 + }, + { + "epoch": 0.2573936759567033, + "grad_norm": 96.21747332162465, + "learning_rate": 8.576704545454547e-06, + "loss": 4.6026, + "step": 3020 + }, + { + "epoch": 0.25747890565072873, + "grad_norm": 45.826738034255406, + "learning_rate": 8.579545454545455e-06, + "loss": 3.2605, + "step": 3021 + }, + { + "epoch": 0.2575641353447541, + "grad_norm": 75.13954676155132, + "learning_rate": 8.582386363636365e-06, + "loss": 3.3842, + "step": 3022 + }, + { + "epoch": 0.2576493650387795, + "grad_norm": 85.39329393483688, + "learning_rate": 8.585227272727273e-06, + "loss": 2.8297, + "step": 3023 + }, + { + "epoch": 0.2577345947328049, + "grad_norm": 59.25174488940964, + "learning_rate": 8.588068181818183e-06, + "loss": 3.7643, + "step": 3024 + }, + { + "epoch": 0.2578198244268303, + "grad_norm": 127.73718826315634, + "learning_rate": 8.590909090909092e-06, + "loss": 4.571, + "step": 3025 + }, + { + "epoch": 0.2579050541208557, + "grad_norm": 118.17304233209826, + "learning_rate": 8.59375e-06, + "loss": 4.4971, + "step": 3026 + }, + { + "epoch": 0.2579902838148811, + "grad_norm": 67.9965297250278, + "learning_rate": 8.596590909090909e-06, + "loss": 3.2914, + "step": 3027 + }, + { + "epoch": 0.2580755135089065, + "grad_norm": 64.85369649710164, + "learning_rate": 8.59943181818182e-06, + "loss": 2.9558, + "step": 3028 + }, + { + "epoch": 0.2581607432029319, + "grad_norm": 106.70139285130674, + "learning_rate": 8.602272727272728e-06, + "loss": 3.927, + "step": 3029 + }, + { + "epoch": 0.2582459728969573, + "grad_norm": 60.61990924736378, + "learning_rate": 8.605113636363636e-06, + "loss": 3.6815, + "step": 3030 + }, + { + "epoch": 0.2583312025909827, + "grad_norm": 348.63339347201224, + "learning_rate": 8.607954545454546e-06, + "loss": 2.0902, + "step": 3031 + }, + { + "epoch": 0.2584164322850081, + "grad_norm": 148.74034631852783, + "learning_rate": 8.610795454545456e-06, + "loss": 4.6808, + "step": 3032 + }, + { + "epoch": 0.2585016619790335, + "grad_norm": 108.77538379705884, + "learning_rate": 8.613636363636364e-06, + "loss": 4.0888, + "step": 3033 + }, + { + "epoch": 0.25858689167305887, + "grad_norm": 104.28536401081618, + "learning_rate": 8.616477272727274e-06, + "loss": 4.1312, + "step": 3034 + }, + { + "epoch": 0.2586721213670843, + "grad_norm": 1121.111983214497, + "learning_rate": 8.619318181818183e-06, + "loss": 7.1906, + "step": 3035 + }, + { + "epoch": 0.2587573510611097, + "grad_norm": 355.8340399416938, + "learning_rate": 8.622159090909092e-06, + "loss": 5.4113, + "step": 3036 + }, + { + "epoch": 0.2588425807551351, + "grad_norm": 95.45715032956514, + "learning_rate": 8.625000000000001e-06, + "loss": 3.6201, + "step": 3037 + }, + { + "epoch": 0.25892781044916047, + "grad_norm": 117.77545489536108, + "learning_rate": 8.62784090909091e-06, + "loss": 3.9744, + "step": 3038 + }, + { + "epoch": 0.2590130401431859, + "grad_norm": 177.65882338841317, + "learning_rate": 8.630681818181819e-06, + "loss": 3.8692, + "step": 3039 + }, + { + "epoch": 0.2590982698372113, + "grad_norm": 47.62185043300599, + "learning_rate": 8.633522727272729e-06, + "loss": 3.5875, + "step": 3040 + }, + { + "epoch": 0.2591834995312367, + "grad_norm": 67.67054707924513, + "learning_rate": 8.636363636363637e-06, + "loss": 3.4498, + "step": 3041 + }, + { + "epoch": 0.25926872922526206, + "grad_norm": 66.68724924257847, + "learning_rate": 8.639204545454545e-06, + "loss": 4.0461, + "step": 3042 + }, + { + "epoch": 0.2593539589192875, + "grad_norm": 229.0829377673785, + "learning_rate": 8.642045454545457e-06, + "loss": 4.4738, + "step": 3043 + }, + { + "epoch": 0.2594391886133129, + "grad_norm": 138.75385328230865, + "learning_rate": 8.644886363636365e-06, + "loss": 4.5291, + "step": 3044 + }, + { + "epoch": 0.2595244183073383, + "grad_norm": 93.24356019800707, + "learning_rate": 8.647727272727273e-06, + "loss": 4.1268, + "step": 3045 + }, + { + "epoch": 0.25960964800136366, + "grad_norm": 175.98780263010323, + "learning_rate": 8.650568181818182e-06, + "loss": 4.8715, + "step": 3046 + }, + { + "epoch": 0.25969487769538907, + "grad_norm": 53.73347939530239, + "learning_rate": 8.653409090909092e-06, + "loss": 3.8074, + "step": 3047 + }, + { + "epoch": 0.2597801073894145, + "grad_norm": 254.99906101847444, + "learning_rate": 8.65625e-06, + "loss": 3.3318, + "step": 3048 + }, + { + "epoch": 0.2598653370834399, + "grad_norm": 50.17402118291145, + "learning_rate": 8.65909090909091e-06, + "loss": 3.2598, + "step": 3049 + }, + { + "epoch": 0.25995056677746525, + "grad_norm": 97.87026543636553, + "learning_rate": 8.66193181818182e-06, + "loss": 3.6259, + "step": 3050 + }, + { + "epoch": 0.26003579647149067, + "grad_norm": 105.60361553204227, + "learning_rate": 8.664772727272728e-06, + "loss": 4.5926, + "step": 3051 + }, + { + "epoch": 0.2601210261655161, + "grad_norm": 364.84336513613766, + "learning_rate": 8.667613636363638e-06, + "loss": 5.7984, + "step": 3052 + }, + { + "epoch": 0.2602062558595415, + "grad_norm": 46.170056916214115, + "learning_rate": 8.670454545454546e-06, + "loss": 3.4455, + "step": 3053 + }, + { + "epoch": 0.26029148555356685, + "grad_norm": 87.78112943069071, + "learning_rate": 8.673295454545456e-06, + "loss": 4.4594, + "step": 3054 + }, + { + "epoch": 0.26037671524759226, + "grad_norm": 51.23116354878412, + "learning_rate": 8.676136363636364e-06, + "loss": 3.0134, + "step": 3055 + }, + { + "epoch": 0.2604619449416177, + "grad_norm": 76.65053576840833, + "learning_rate": 8.678977272727273e-06, + "loss": 4.0631, + "step": 3056 + }, + { + "epoch": 0.26054717463564303, + "grad_norm": 113.04319724527429, + "learning_rate": 8.681818181818182e-06, + "loss": 3.6069, + "step": 3057 + }, + { + "epoch": 0.26063240432966844, + "grad_norm": 84.21267978631398, + "learning_rate": 8.684659090909091e-06, + "loss": 3.1718, + "step": 3058 + }, + { + "epoch": 0.26071763402369386, + "grad_norm": 82.49291510002365, + "learning_rate": 8.687500000000001e-06, + "loss": 2.9678, + "step": 3059 + }, + { + "epoch": 0.26080286371771927, + "grad_norm": 61.709521822714066, + "learning_rate": 8.69034090909091e-06, + "loss": 3.8819, + "step": 3060 + }, + { + "epoch": 0.2608880934117446, + "grad_norm": 194.8927032863408, + "learning_rate": 8.693181818181819e-06, + "loss": 4.3235, + "step": 3061 + }, + { + "epoch": 0.26097332310577004, + "grad_norm": 77.7853700137822, + "learning_rate": 8.696022727272729e-06, + "loss": 3.4069, + "step": 3062 + }, + { + "epoch": 0.26105855279979545, + "grad_norm": 80.83161238903192, + "learning_rate": 8.698863636363637e-06, + "loss": 3.4564, + "step": 3063 + }, + { + "epoch": 0.26114378249382086, + "grad_norm": 80.80000963052477, + "learning_rate": 8.701704545454545e-06, + "loss": 3.9461, + "step": 3064 + }, + { + "epoch": 0.2612290121878462, + "grad_norm": 141.8190298074923, + "learning_rate": 8.704545454545455e-06, + "loss": 4.7467, + "step": 3065 + }, + { + "epoch": 0.26131424188187163, + "grad_norm": 135.14421276425534, + "learning_rate": 8.707386363636364e-06, + "loss": 3.9889, + "step": 3066 + }, + { + "epoch": 0.26139947157589705, + "grad_norm": 105.8758739644015, + "learning_rate": 8.710227272727273e-06, + "loss": 4.2876, + "step": 3067 + }, + { + "epoch": 0.26148470126992246, + "grad_norm": 298.8820708032003, + "learning_rate": 8.713068181818182e-06, + "loss": 5.0097, + "step": 3068 + }, + { + "epoch": 0.2615699309639478, + "grad_norm": 82.02449990977395, + "learning_rate": 8.715909090909092e-06, + "loss": 2.7845, + "step": 3069 + }, + { + "epoch": 0.26165516065797323, + "grad_norm": 86.38460351634335, + "learning_rate": 8.71875e-06, + "loss": 3.2996, + "step": 3070 + }, + { + "epoch": 0.26174039035199864, + "grad_norm": 117.02146577621393, + "learning_rate": 8.72159090909091e-06, + "loss": 2.6186, + "step": 3071 + }, + { + "epoch": 0.26182562004602405, + "grad_norm": 116.06771389546, + "learning_rate": 8.724431818181818e-06, + "loss": 4.269, + "step": 3072 + }, + { + "epoch": 0.2619108497400494, + "grad_norm": 52.75758965127289, + "learning_rate": 8.727272727272728e-06, + "loss": 3.8093, + "step": 3073 + }, + { + "epoch": 0.2619960794340748, + "grad_norm": 270.733954157798, + "learning_rate": 8.730113636363638e-06, + "loss": 4.6057, + "step": 3074 + }, + { + "epoch": 0.26208130912810024, + "grad_norm": 212.99487677046858, + "learning_rate": 8.732954545454546e-06, + "loss": 5.0753, + "step": 3075 + }, + { + "epoch": 0.26216653882212565, + "grad_norm": 99.4212191412518, + "learning_rate": 8.735795454545455e-06, + "loss": 4.4689, + "step": 3076 + }, + { + "epoch": 0.262251768516151, + "grad_norm": 149.68210241680904, + "learning_rate": 8.738636363636365e-06, + "loss": 3.5491, + "step": 3077 + }, + { + "epoch": 0.2623369982101764, + "grad_norm": 139.73877463588516, + "learning_rate": 8.741477272727273e-06, + "loss": 4.7712, + "step": 3078 + }, + { + "epoch": 0.26242222790420183, + "grad_norm": 130.1530649239625, + "learning_rate": 8.744318181818181e-06, + "loss": 4.1859, + "step": 3079 + }, + { + "epoch": 0.26250745759822725, + "grad_norm": 153.55077651141855, + "learning_rate": 8.747159090909091e-06, + "loss": 4.0014, + "step": 3080 + }, + { + "epoch": 0.2625926872922526, + "grad_norm": 258.0852469748748, + "learning_rate": 8.750000000000001e-06, + "loss": 3.3359, + "step": 3081 + }, + { + "epoch": 0.262677916986278, + "grad_norm": 103.21813842869398, + "learning_rate": 8.752840909090909e-06, + "loss": 2.7825, + "step": 3082 + }, + { + "epoch": 0.2627631466803034, + "grad_norm": 98.67419476352768, + "learning_rate": 8.755681818181819e-06, + "loss": 2.7995, + "step": 3083 + }, + { + "epoch": 0.26284837637432884, + "grad_norm": 83.37560528575708, + "learning_rate": 8.758522727272729e-06, + "loss": 3.4486, + "step": 3084 + }, + { + "epoch": 0.2629336060683542, + "grad_norm": 160.97458846275725, + "learning_rate": 8.761363636363637e-06, + "loss": 5.0536, + "step": 3085 + }, + { + "epoch": 0.2630188357623796, + "grad_norm": 71.2243734946513, + "learning_rate": 8.764204545454546e-06, + "loss": 3.9467, + "step": 3086 + }, + { + "epoch": 0.263104065456405, + "grad_norm": 92.91656100264663, + "learning_rate": 8.767045454545455e-06, + "loss": 3.6038, + "step": 3087 + }, + { + "epoch": 0.26318929515043044, + "grad_norm": 74.33925002200145, + "learning_rate": 8.769886363636364e-06, + "loss": 3.4879, + "step": 3088 + }, + { + "epoch": 0.2632745248444558, + "grad_norm": 72.41276849948954, + "learning_rate": 8.772727272727274e-06, + "loss": 3.5617, + "step": 3089 + }, + { + "epoch": 0.2633597545384812, + "grad_norm": 69.91011461565166, + "learning_rate": 8.775568181818182e-06, + "loss": 3.6113, + "step": 3090 + }, + { + "epoch": 0.2634449842325066, + "grad_norm": 61.22789895178265, + "learning_rate": 8.778409090909092e-06, + "loss": 3.5781, + "step": 3091 + }, + { + "epoch": 0.26353021392653203, + "grad_norm": 526.1345559702054, + "learning_rate": 8.781250000000002e-06, + "loss": 3.6931, + "step": 3092 + }, + { + "epoch": 0.2636154436205574, + "grad_norm": 60.964286809383864, + "learning_rate": 8.78409090909091e-06, + "loss": 3.0325, + "step": 3093 + }, + { + "epoch": 0.2637006733145828, + "grad_norm": 79.47436777737146, + "learning_rate": 8.786931818181818e-06, + "loss": 4.075, + "step": 3094 + }, + { + "epoch": 0.2637859030086082, + "grad_norm": 99.98039782402884, + "learning_rate": 8.789772727272728e-06, + "loss": 3.8549, + "step": 3095 + }, + { + "epoch": 0.26387113270263357, + "grad_norm": 64.9078915881197, + "learning_rate": 8.792613636363637e-06, + "loss": 3.2822, + "step": 3096 + }, + { + "epoch": 0.263956362396659, + "grad_norm": 67.48961471883518, + "learning_rate": 8.795454545454545e-06, + "loss": 4.3448, + "step": 3097 + }, + { + "epoch": 0.2640415920906844, + "grad_norm": 54.909564207328636, + "learning_rate": 8.798295454545455e-06, + "loss": 3.7966, + "step": 3098 + }, + { + "epoch": 0.2641268217847098, + "grad_norm": 101.82514732190924, + "learning_rate": 8.801136363636365e-06, + "loss": 4.4312, + "step": 3099 + }, + { + "epoch": 0.26421205147873517, + "grad_norm": 162.24854239054613, + "learning_rate": 8.803977272727273e-06, + "loss": 4.5254, + "step": 3100 + }, + { + "epoch": 0.2642972811727606, + "grad_norm": 95.43318970431855, + "learning_rate": 8.806818181818183e-06, + "loss": 4.4948, + "step": 3101 + }, + { + "epoch": 0.264382510866786, + "grad_norm": 82.5759639607405, + "learning_rate": 8.809659090909091e-06, + "loss": 3.4806, + "step": 3102 + }, + { + "epoch": 0.2644677405608114, + "grad_norm": 241.07848021827658, + "learning_rate": 8.8125e-06, + "loss": 4.2364, + "step": 3103 + }, + { + "epoch": 0.26455297025483676, + "grad_norm": 127.99174487694407, + "learning_rate": 8.81534090909091e-06, + "loss": 3.9623, + "step": 3104 + }, + { + "epoch": 0.2646381999488622, + "grad_norm": 90.64674396024743, + "learning_rate": 8.818181818181819e-06, + "loss": 3.5022, + "step": 3105 + }, + { + "epoch": 0.2647234296428876, + "grad_norm": 104.14558648020511, + "learning_rate": 8.821022727272728e-06, + "loss": 2.9465, + "step": 3106 + }, + { + "epoch": 0.264808659336913, + "grad_norm": 118.25003629934669, + "learning_rate": 8.823863636363638e-06, + "loss": 3.6299, + "step": 3107 + }, + { + "epoch": 0.26489388903093836, + "grad_norm": 41.63253593854056, + "learning_rate": 8.826704545454546e-06, + "loss": 2.7296, + "step": 3108 + }, + { + "epoch": 0.26497911872496377, + "grad_norm": 79.65278911475667, + "learning_rate": 8.829545454545454e-06, + "loss": 3.848, + "step": 3109 + }, + { + "epoch": 0.2650643484189892, + "grad_norm": 103.06554169643587, + "learning_rate": 8.832386363636364e-06, + "loss": 3.1921, + "step": 3110 + }, + { + "epoch": 0.2651495781130146, + "grad_norm": 105.9847155263076, + "learning_rate": 8.835227272727274e-06, + "loss": 4.6796, + "step": 3111 + }, + { + "epoch": 0.26523480780703995, + "grad_norm": 232.24582991628142, + "learning_rate": 8.838068181818182e-06, + "loss": 4.7015, + "step": 3112 + }, + { + "epoch": 0.26532003750106536, + "grad_norm": 45.725118949639196, + "learning_rate": 8.840909090909092e-06, + "loss": 2.8044, + "step": 3113 + }, + { + "epoch": 0.2654052671950908, + "grad_norm": 365.94616340737224, + "learning_rate": 8.843750000000002e-06, + "loss": 4.0622, + "step": 3114 + }, + { + "epoch": 0.2654904968891162, + "grad_norm": 135.6612481088818, + "learning_rate": 8.84659090909091e-06, + "loss": 4.1122, + "step": 3115 + }, + { + "epoch": 0.26557572658314155, + "grad_norm": 114.61275865337024, + "learning_rate": 8.849431818181818e-06, + "loss": 5.6427, + "step": 3116 + }, + { + "epoch": 0.26566095627716696, + "grad_norm": 116.31642325462644, + "learning_rate": 8.852272727272727e-06, + "loss": 4.414, + "step": 3117 + }, + { + "epoch": 0.2657461859711924, + "grad_norm": 43.192872516932965, + "learning_rate": 8.855113636363637e-06, + "loss": 3.3026, + "step": 3118 + }, + { + "epoch": 0.2658314156652178, + "grad_norm": 115.44479592145242, + "learning_rate": 8.857954545454545e-06, + "loss": 4.0134, + "step": 3119 + }, + { + "epoch": 0.26591664535924314, + "grad_norm": 119.73713721840937, + "learning_rate": 8.860795454545455e-06, + "loss": 5.3227, + "step": 3120 + }, + { + "epoch": 0.26600187505326855, + "grad_norm": 63.12909433908823, + "learning_rate": 8.863636363636365e-06, + "loss": 3.2231, + "step": 3121 + }, + { + "epoch": 0.26608710474729397, + "grad_norm": 175.2097753101297, + "learning_rate": 8.866477272727273e-06, + "loss": 2.4132, + "step": 3122 + }, + { + "epoch": 0.2661723344413194, + "grad_norm": 153.30312235402485, + "learning_rate": 8.869318181818183e-06, + "loss": 4.0333, + "step": 3123 + }, + { + "epoch": 0.26625756413534474, + "grad_norm": 78.19313235349529, + "learning_rate": 8.87215909090909e-06, + "loss": 3.8101, + "step": 3124 + }, + { + "epoch": 0.26634279382937015, + "grad_norm": 108.00795984521613, + "learning_rate": 8.875e-06, + "loss": 4.467, + "step": 3125 + }, + { + "epoch": 0.26642802352339556, + "grad_norm": 43.549913586021326, + "learning_rate": 8.87784090909091e-06, + "loss": 3.1004, + "step": 3126 + }, + { + "epoch": 0.266513253217421, + "grad_norm": 94.60892516363798, + "learning_rate": 8.880681818181818e-06, + "loss": 3.8011, + "step": 3127 + }, + { + "epoch": 0.26659848291144633, + "grad_norm": 38.563144508814304, + "learning_rate": 8.883522727272728e-06, + "loss": 3.2199, + "step": 3128 + }, + { + "epoch": 0.26668371260547175, + "grad_norm": 104.05641853096772, + "learning_rate": 8.886363636363638e-06, + "loss": 3.4702, + "step": 3129 + }, + { + "epoch": 0.26676894229949716, + "grad_norm": 64.53115684324858, + "learning_rate": 8.889204545454546e-06, + "loss": 3.9064, + "step": 3130 + }, + { + "epoch": 0.26685417199352257, + "grad_norm": 58.343143961832446, + "learning_rate": 8.892045454545454e-06, + "loss": 3.1414, + "step": 3131 + }, + { + "epoch": 0.26693940168754793, + "grad_norm": 74.50966351411515, + "learning_rate": 8.894886363636364e-06, + "loss": 3.974, + "step": 3132 + }, + { + "epoch": 0.26702463138157334, + "grad_norm": 70.495020386769, + "learning_rate": 8.897727272727274e-06, + "loss": 3.7361, + "step": 3133 + }, + { + "epoch": 0.26710986107559875, + "grad_norm": 158.4101506715612, + "learning_rate": 8.900568181818182e-06, + "loss": 5.0796, + "step": 3134 + }, + { + "epoch": 0.2671950907696241, + "grad_norm": 86.84209231127078, + "learning_rate": 8.903409090909092e-06, + "loss": 3.9705, + "step": 3135 + }, + { + "epoch": 0.2672803204636495, + "grad_norm": 91.26772921124595, + "learning_rate": 8.906250000000001e-06, + "loss": 4.0967, + "step": 3136 + }, + { + "epoch": 0.26736555015767494, + "grad_norm": 89.62413928149597, + "learning_rate": 8.90909090909091e-06, + "loss": 3.6207, + "step": 3137 + }, + { + "epoch": 0.26745077985170035, + "grad_norm": 72.47574036306551, + "learning_rate": 8.91193181818182e-06, + "loss": 4.3327, + "step": 3138 + }, + { + "epoch": 0.2675360095457257, + "grad_norm": 59.79523474136235, + "learning_rate": 8.914772727272727e-06, + "loss": 3.7807, + "step": 3139 + }, + { + "epoch": 0.2676212392397511, + "grad_norm": 140.36828717462802, + "learning_rate": 8.917613636363637e-06, + "loss": 4.8938, + "step": 3140 + }, + { + "epoch": 0.26770646893377653, + "grad_norm": 89.40447473808803, + "learning_rate": 8.920454545454547e-06, + "loss": 2.8603, + "step": 3141 + }, + { + "epoch": 0.26779169862780194, + "grad_norm": 113.48341548906349, + "learning_rate": 8.923295454545455e-06, + "loss": 3.7965, + "step": 3142 + }, + { + "epoch": 0.2678769283218273, + "grad_norm": 238.67653172330122, + "learning_rate": 8.926136363636365e-06, + "loss": 3.8058, + "step": 3143 + }, + { + "epoch": 0.2679621580158527, + "grad_norm": 168.01487434049068, + "learning_rate": 8.928977272727274e-06, + "loss": 4.875, + "step": 3144 + }, + { + "epoch": 0.2680473877098781, + "grad_norm": 81.42348796421639, + "learning_rate": 8.931818181818183e-06, + "loss": 3.6882, + "step": 3145 + }, + { + "epoch": 0.26813261740390354, + "grad_norm": 312.997589780559, + "learning_rate": 8.93465909090909e-06, + "loss": 3.642, + "step": 3146 + }, + { + "epoch": 0.2682178470979289, + "grad_norm": 174.67515718529557, + "learning_rate": 8.9375e-06, + "loss": 5.1633, + "step": 3147 + }, + { + "epoch": 0.2683030767919543, + "grad_norm": 61.04540291633297, + "learning_rate": 8.94034090909091e-06, + "loss": 4.1209, + "step": 3148 + }, + { + "epoch": 0.2683883064859797, + "grad_norm": 93.02912809476184, + "learning_rate": 8.943181818181818e-06, + "loss": 4.106, + "step": 3149 + }, + { + "epoch": 0.26847353618000513, + "grad_norm": 99.55013613554705, + "learning_rate": 8.946022727272728e-06, + "loss": 3.7294, + "step": 3150 + }, + { + "epoch": 0.2685587658740305, + "grad_norm": 72.06907470166692, + "learning_rate": 8.948863636363638e-06, + "loss": 3.8481, + "step": 3151 + }, + { + "epoch": 0.2686439955680559, + "grad_norm": 79.25602473940172, + "learning_rate": 8.951704545454546e-06, + "loss": 4.6939, + "step": 3152 + }, + { + "epoch": 0.2687292252620813, + "grad_norm": 71.27002865568139, + "learning_rate": 8.954545454545456e-06, + "loss": 4.3542, + "step": 3153 + }, + { + "epoch": 0.26881445495610673, + "grad_norm": 551.9469024443019, + "learning_rate": 8.957386363636364e-06, + "loss": 3.1529, + "step": 3154 + }, + { + "epoch": 0.2688996846501321, + "grad_norm": 78.48597930742254, + "learning_rate": 8.960227272727274e-06, + "loss": 2.7927, + "step": 3155 + }, + { + "epoch": 0.2689849143441575, + "grad_norm": 85.05014843468432, + "learning_rate": 8.963068181818183e-06, + "loss": 2.9914, + "step": 3156 + }, + { + "epoch": 0.2690701440381829, + "grad_norm": 136.24438379749316, + "learning_rate": 8.965909090909091e-06, + "loss": 4.4251, + "step": 3157 + }, + { + "epoch": 0.2691553737322083, + "grad_norm": 84.72027274667822, + "learning_rate": 8.968750000000001e-06, + "loss": 3.3906, + "step": 3158 + }, + { + "epoch": 0.2692406034262337, + "grad_norm": 56.11161359636879, + "learning_rate": 8.971590909090911e-06, + "loss": 3.969, + "step": 3159 + }, + { + "epoch": 0.2693258331202591, + "grad_norm": 73.0502664489308, + "learning_rate": 8.974431818181819e-06, + "loss": 3.7346, + "step": 3160 + }, + { + "epoch": 0.2694110628142845, + "grad_norm": 96.82520340967902, + "learning_rate": 8.977272727272727e-06, + "loss": 4.557, + "step": 3161 + }, + { + "epoch": 0.2694962925083099, + "grad_norm": 62.628510705378424, + "learning_rate": 8.980113636363637e-06, + "loss": 3.4506, + "step": 3162 + }, + { + "epoch": 0.2695815222023353, + "grad_norm": 55.88537188958803, + "learning_rate": 8.982954545454547e-06, + "loss": 3.5029, + "step": 3163 + }, + { + "epoch": 0.2696667518963607, + "grad_norm": 100.63536970940919, + "learning_rate": 8.985795454545455e-06, + "loss": 3.0759, + "step": 3164 + }, + { + "epoch": 0.2697519815903861, + "grad_norm": 81.2211879541675, + "learning_rate": 8.988636363636365e-06, + "loss": 2.956, + "step": 3165 + }, + { + "epoch": 0.2698372112844115, + "grad_norm": 55.02454415071363, + "learning_rate": 8.991477272727274e-06, + "loss": 3.359, + "step": 3166 + }, + { + "epoch": 0.2699224409784369, + "grad_norm": 96.17408547459317, + "learning_rate": 8.994318181818182e-06, + "loss": 4.0261, + "step": 3167 + }, + { + "epoch": 0.2700076706724623, + "grad_norm": 92.29729561996832, + "learning_rate": 8.99715909090909e-06, + "loss": 3.721, + "step": 3168 + }, + { + "epoch": 0.2700929003664877, + "grad_norm": 89.36121809087577, + "learning_rate": 9e-06, + "loss": 4.1754, + "step": 3169 + }, + { + "epoch": 0.27017813006051306, + "grad_norm": 150.2525056624184, + "learning_rate": 9.00284090909091e-06, + "loss": 3.0775, + "step": 3170 + }, + { + "epoch": 0.27026335975453847, + "grad_norm": 111.57672930556704, + "learning_rate": 9.005681818181818e-06, + "loss": 4.7589, + "step": 3171 + }, + { + "epoch": 0.2703485894485639, + "grad_norm": 108.15231522530677, + "learning_rate": 9.008522727272728e-06, + "loss": 4.073, + "step": 3172 + }, + { + "epoch": 0.2704338191425893, + "grad_norm": 46.00225013056992, + "learning_rate": 9.011363636363638e-06, + "loss": 3.0437, + "step": 3173 + }, + { + "epoch": 0.27051904883661465, + "grad_norm": 114.66148600227642, + "learning_rate": 9.014204545454546e-06, + "loss": 4.0568, + "step": 3174 + }, + { + "epoch": 0.27060427853064006, + "grad_norm": 158.2321862123181, + "learning_rate": 9.017045454545455e-06, + "loss": 4.3, + "step": 3175 + }, + { + "epoch": 0.2706895082246655, + "grad_norm": 82.3467876700908, + "learning_rate": 9.019886363636364e-06, + "loss": 3.45, + "step": 3176 + }, + { + "epoch": 0.2707747379186909, + "grad_norm": 64.02603028093473, + "learning_rate": 9.022727272727273e-06, + "loss": 3.358, + "step": 3177 + }, + { + "epoch": 0.27085996761271625, + "grad_norm": 207.39408061436617, + "learning_rate": 9.025568181818183e-06, + "loss": 3.7997, + "step": 3178 + }, + { + "epoch": 0.27094519730674166, + "grad_norm": 82.12940266974206, + "learning_rate": 9.028409090909091e-06, + "loss": 4.0317, + "step": 3179 + }, + { + "epoch": 0.27103042700076707, + "grad_norm": 59.514528097612114, + "learning_rate": 9.031250000000001e-06, + "loss": 3.7332, + "step": 3180 + }, + { + "epoch": 0.2711156566947925, + "grad_norm": 165.66288248245596, + "learning_rate": 9.03409090909091e-06, + "loss": 4.94, + "step": 3181 + }, + { + "epoch": 0.27120088638881784, + "grad_norm": 104.72550518259843, + "learning_rate": 9.036931818181819e-06, + "loss": 3.879, + "step": 3182 + }, + { + "epoch": 0.27128611608284325, + "grad_norm": 84.17078421565257, + "learning_rate": 9.039772727272727e-06, + "loss": 4.1885, + "step": 3183 + }, + { + "epoch": 0.27137134577686867, + "grad_norm": 71.9688289118292, + "learning_rate": 9.042613636363637e-06, + "loss": 3.7811, + "step": 3184 + }, + { + "epoch": 0.2714565754708941, + "grad_norm": 85.42496129805141, + "learning_rate": 9.045454545454546e-06, + "loss": 3.3238, + "step": 3185 + }, + { + "epoch": 0.27154180516491944, + "grad_norm": 112.68341820508229, + "learning_rate": 9.048295454545455e-06, + "loss": 3.833, + "step": 3186 + }, + { + "epoch": 0.27162703485894485, + "grad_norm": 75.06985152217224, + "learning_rate": 9.051136363636364e-06, + "loss": 3.1783, + "step": 3187 + }, + { + "epoch": 0.27171226455297026, + "grad_norm": 54.18960226740934, + "learning_rate": 9.053977272727274e-06, + "loss": 3.1027, + "step": 3188 + }, + { + "epoch": 0.2717974942469957, + "grad_norm": 50.11123367466513, + "learning_rate": 9.056818181818182e-06, + "loss": 3.0616, + "step": 3189 + }, + { + "epoch": 0.27188272394102103, + "grad_norm": 67.51872397371632, + "learning_rate": 9.059659090909092e-06, + "loss": 3.9518, + "step": 3190 + }, + { + "epoch": 0.27196795363504644, + "grad_norm": 55.04918002748434, + "learning_rate": 9.0625e-06, + "loss": 3.3047, + "step": 3191 + }, + { + "epoch": 0.27205318332907186, + "grad_norm": 198.99717553576994, + "learning_rate": 9.06534090909091e-06, + "loss": 3.731, + "step": 3192 + }, + { + "epoch": 0.27213841302309727, + "grad_norm": 50.153381601743, + "learning_rate": 9.06818181818182e-06, + "loss": 2.695, + "step": 3193 + }, + { + "epoch": 0.2722236427171226, + "grad_norm": 108.5278784067608, + "learning_rate": 9.071022727272728e-06, + "loss": 4.252, + "step": 3194 + }, + { + "epoch": 0.27230887241114804, + "grad_norm": 86.18849446763812, + "learning_rate": 9.073863636363637e-06, + "loss": 3.7806, + "step": 3195 + }, + { + "epoch": 0.27239410210517345, + "grad_norm": 57.011673875628574, + "learning_rate": 9.076704545454547e-06, + "loss": 2.9853, + "step": 3196 + }, + { + "epoch": 0.27247933179919887, + "grad_norm": 121.56102207586659, + "learning_rate": 9.079545454545455e-06, + "loss": 3.1941, + "step": 3197 + }, + { + "epoch": 0.2725645614932242, + "grad_norm": 78.84461408707313, + "learning_rate": 9.082386363636363e-06, + "loss": 3.0652, + "step": 3198 + }, + { + "epoch": 0.27264979118724963, + "grad_norm": 100.57383268908941, + "learning_rate": 9.085227272727273e-06, + "loss": 3.6936, + "step": 3199 + }, + { + "epoch": 0.27273502088127505, + "grad_norm": 252.71636722434056, + "learning_rate": 9.088068181818183e-06, + "loss": 4.7238, + "step": 3200 + }, + { + "epoch": 0.27282025057530046, + "grad_norm": 60.913704921987836, + "learning_rate": 9.090909090909091e-06, + "loss": 3.7207, + "step": 3201 + }, + { + "epoch": 0.2729054802693258, + "grad_norm": 129.12907875119632, + "learning_rate": 9.09375e-06, + "loss": 4.7256, + "step": 3202 + }, + { + "epoch": 0.27299070996335123, + "grad_norm": 118.59592701617095, + "learning_rate": 9.09659090909091e-06, + "loss": 2.906, + "step": 3203 + }, + { + "epoch": 0.27307593965737664, + "grad_norm": 81.29399576042078, + "learning_rate": 9.099431818181819e-06, + "loss": 2.6979, + "step": 3204 + }, + { + "epoch": 0.27316116935140206, + "grad_norm": 90.44063779606306, + "learning_rate": 9.102272727272728e-06, + "loss": 3.726, + "step": 3205 + }, + { + "epoch": 0.2732463990454274, + "grad_norm": 109.55719623852576, + "learning_rate": 9.105113636363637e-06, + "loss": 3.8519, + "step": 3206 + }, + { + "epoch": 0.2733316287394528, + "grad_norm": 65.72578280443366, + "learning_rate": 9.107954545454546e-06, + "loss": 3.8338, + "step": 3207 + }, + { + "epoch": 0.27341685843347824, + "grad_norm": 130.31740624095883, + "learning_rate": 9.110795454545456e-06, + "loss": 3.6603, + "step": 3208 + }, + { + "epoch": 0.2735020881275036, + "grad_norm": 62.99334366642103, + "learning_rate": 9.113636363636364e-06, + "loss": 3.4024, + "step": 3209 + }, + { + "epoch": 0.273587317821529, + "grad_norm": 144.52188387480368, + "learning_rate": 9.116477272727274e-06, + "loss": 3.4859, + "step": 3210 + }, + { + "epoch": 0.2736725475155544, + "grad_norm": 168.49699164314964, + "learning_rate": 9.119318181818184e-06, + "loss": 3.5567, + "step": 3211 + }, + { + "epoch": 0.27375777720957983, + "grad_norm": 107.04517093966734, + "learning_rate": 9.122159090909092e-06, + "loss": 3.3858, + "step": 3212 + }, + { + "epoch": 0.2738430069036052, + "grad_norm": 50.84194086451342, + "learning_rate": 9.125e-06, + "loss": 3.5916, + "step": 3213 + }, + { + "epoch": 0.2739282365976306, + "grad_norm": 163.49358236581054, + "learning_rate": 9.12784090909091e-06, + "loss": 2.4935, + "step": 3214 + }, + { + "epoch": 0.274013466291656, + "grad_norm": 72.84739348735084, + "learning_rate": 9.13068181818182e-06, + "loss": 3.9613, + "step": 3215 + }, + { + "epoch": 0.27409869598568143, + "grad_norm": 171.07190075740436, + "learning_rate": 9.133522727272728e-06, + "loss": 4.2174, + "step": 3216 + }, + { + "epoch": 0.2741839256797068, + "grad_norm": 76.72387521835816, + "learning_rate": 9.136363636363637e-06, + "loss": 4.0462, + "step": 3217 + }, + { + "epoch": 0.2742691553737322, + "grad_norm": 54.35102665023357, + "learning_rate": 9.139204545454547e-06, + "loss": 2.8916, + "step": 3218 + }, + { + "epoch": 0.2743543850677576, + "grad_norm": 73.16380429435635, + "learning_rate": 9.142045454545455e-06, + "loss": 3.7426, + "step": 3219 + }, + { + "epoch": 0.274439614761783, + "grad_norm": 116.19536321899182, + "learning_rate": 9.144886363636363e-06, + "loss": 4.7648, + "step": 3220 + }, + { + "epoch": 0.2745248444558084, + "grad_norm": 144.83263145469328, + "learning_rate": 9.147727272727273e-06, + "loss": 4.1508, + "step": 3221 + }, + { + "epoch": 0.2746100741498338, + "grad_norm": 130.02440553171297, + "learning_rate": 9.150568181818183e-06, + "loss": 3.3366, + "step": 3222 + }, + { + "epoch": 0.2746953038438592, + "grad_norm": 62.950882520536, + "learning_rate": 9.153409090909091e-06, + "loss": 2.6544, + "step": 3223 + }, + { + "epoch": 0.2747805335378846, + "grad_norm": 123.54946501338169, + "learning_rate": 9.15625e-06, + "loss": 4.1352, + "step": 3224 + }, + { + "epoch": 0.27486576323191, + "grad_norm": 66.61781984388224, + "learning_rate": 9.15909090909091e-06, + "loss": 3.6237, + "step": 3225 + }, + { + "epoch": 0.2749509929259354, + "grad_norm": 75.47606227680981, + "learning_rate": 9.161931818181818e-06, + "loss": 3.1558, + "step": 3226 + }, + { + "epoch": 0.2750362226199608, + "grad_norm": 75.7728792172026, + "learning_rate": 9.164772727272728e-06, + "loss": 3.4062, + "step": 3227 + }, + { + "epoch": 0.2751214523139862, + "grad_norm": 84.1346805974619, + "learning_rate": 9.167613636363636e-06, + "loss": 3.4098, + "step": 3228 + }, + { + "epoch": 0.27520668200801157, + "grad_norm": 126.31759748837499, + "learning_rate": 9.170454545454546e-06, + "loss": 4.9725, + "step": 3229 + }, + { + "epoch": 0.275291911702037, + "grad_norm": 55.23526958628262, + "learning_rate": 9.173295454545456e-06, + "loss": 3.3429, + "step": 3230 + }, + { + "epoch": 0.2753771413960624, + "grad_norm": 66.91403636527893, + "learning_rate": 9.176136363636364e-06, + "loss": 2.902, + "step": 3231 + }, + { + "epoch": 0.2754623710900878, + "grad_norm": 118.30247605474393, + "learning_rate": 9.178977272727274e-06, + "loss": 3.9159, + "step": 3232 + }, + { + "epoch": 0.27554760078411317, + "grad_norm": 89.43010535593667, + "learning_rate": 9.181818181818184e-06, + "loss": 4.0204, + "step": 3233 + }, + { + "epoch": 0.2756328304781386, + "grad_norm": 83.04224442127699, + "learning_rate": 9.184659090909092e-06, + "loss": 3.5045, + "step": 3234 + }, + { + "epoch": 0.275718060172164, + "grad_norm": 56.15245748901696, + "learning_rate": 9.1875e-06, + "loss": 3.1743, + "step": 3235 + }, + { + "epoch": 0.2758032898661894, + "grad_norm": 219.98400563747703, + "learning_rate": 9.19034090909091e-06, + "loss": 5.062, + "step": 3236 + }, + { + "epoch": 0.27588851956021476, + "grad_norm": 89.5714337632383, + "learning_rate": 9.19318181818182e-06, + "loss": 3.6765, + "step": 3237 + }, + { + "epoch": 0.2759737492542402, + "grad_norm": 87.02792438351662, + "learning_rate": 9.196022727272727e-06, + "loss": 4.2467, + "step": 3238 + }, + { + "epoch": 0.2760589789482656, + "grad_norm": 98.56449206394943, + "learning_rate": 9.198863636363637e-06, + "loss": 4.0586, + "step": 3239 + }, + { + "epoch": 0.276144208642291, + "grad_norm": 85.06446828689745, + "learning_rate": 9.201704545454547e-06, + "loss": 3.4999, + "step": 3240 + }, + { + "epoch": 0.27622943833631636, + "grad_norm": 71.72288191606869, + "learning_rate": 9.204545454545455e-06, + "loss": 3.1273, + "step": 3241 + }, + { + "epoch": 0.27631466803034177, + "grad_norm": 126.20405917692462, + "learning_rate": 9.207386363636365e-06, + "loss": 3.3897, + "step": 3242 + }, + { + "epoch": 0.2763998977243672, + "grad_norm": 105.68755350187631, + "learning_rate": 9.210227272727273e-06, + "loss": 3.5716, + "step": 3243 + }, + { + "epoch": 0.2764851274183926, + "grad_norm": 77.34382129111718, + "learning_rate": 9.213068181818183e-06, + "loss": 3.4574, + "step": 3244 + }, + { + "epoch": 0.27657035711241795, + "grad_norm": 177.50677842801082, + "learning_rate": 9.215909090909092e-06, + "loss": 4.9836, + "step": 3245 + }, + { + "epoch": 0.27665558680644337, + "grad_norm": 84.05136344962825, + "learning_rate": 9.21875e-06, + "loss": 4.6747, + "step": 3246 + }, + { + "epoch": 0.2767408165004688, + "grad_norm": 97.07417432806706, + "learning_rate": 9.22159090909091e-06, + "loss": 4.2506, + "step": 3247 + }, + { + "epoch": 0.27682604619449414, + "grad_norm": 91.58059636339505, + "learning_rate": 9.22443181818182e-06, + "loss": 4.5451, + "step": 3248 + }, + { + "epoch": 0.27691127588851955, + "grad_norm": 108.31141984706376, + "learning_rate": 9.227272727272728e-06, + "loss": 3.5373, + "step": 3249 + }, + { + "epoch": 0.27699650558254496, + "grad_norm": 158.59723744946578, + "learning_rate": 9.230113636363636e-06, + "loss": 4.1573, + "step": 3250 + }, + { + "epoch": 0.2770817352765704, + "grad_norm": 100.48561904624032, + "learning_rate": 9.232954545454546e-06, + "loss": 4.1493, + "step": 3251 + }, + { + "epoch": 0.27716696497059573, + "grad_norm": 43.94327499143819, + "learning_rate": 9.235795454545456e-06, + "loss": 3.67, + "step": 3252 + }, + { + "epoch": 0.27725219466462114, + "grad_norm": 140.144464348549, + "learning_rate": 9.238636363636364e-06, + "loss": 3.2742, + "step": 3253 + }, + { + "epoch": 0.27733742435864656, + "grad_norm": 78.83551529395471, + "learning_rate": 9.241477272727274e-06, + "loss": 3.6242, + "step": 3254 + }, + { + "epoch": 0.27742265405267197, + "grad_norm": 67.34776944980483, + "learning_rate": 9.244318181818183e-06, + "loss": 3.3626, + "step": 3255 + }, + { + "epoch": 0.2775078837466973, + "grad_norm": 137.87430952447443, + "learning_rate": 9.247159090909091e-06, + "loss": 5.7391, + "step": 3256 + }, + { + "epoch": 0.27759311344072274, + "grad_norm": 147.85369203082936, + "learning_rate": 9.250000000000001e-06, + "loss": 4.6242, + "step": 3257 + }, + { + "epoch": 0.27767834313474815, + "grad_norm": 95.09289832193444, + "learning_rate": 9.25284090909091e-06, + "loss": 4.4245, + "step": 3258 + }, + { + "epoch": 0.27776357282877356, + "grad_norm": 83.00418400133259, + "learning_rate": 9.255681818181819e-06, + "loss": 4.9401, + "step": 3259 + }, + { + "epoch": 0.2778488025227989, + "grad_norm": 53.99377602821575, + "learning_rate": 9.258522727272729e-06, + "loss": 3.0538, + "step": 3260 + }, + { + "epoch": 0.27793403221682433, + "grad_norm": 75.75188237592151, + "learning_rate": 9.261363636363637e-06, + "loss": 2.8813, + "step": 3261 + }, + { + "epoch": 0.27801926191084975, + "grad_norm": 137.52212498797792, + "learning_rate": 9.264204545454547e-06, + "loss": 5.182, + "step": 3262 + }, + { + "epoch": 0.27810449160487516, + "grad_norm": 73.71467737514227, + "learning_rate": 9.267045454545456e-06, + "loss": 3.8698, + "step": 3263 + }, + { + "epoch": 0.2781897212989005, + "grad_norm": 132.62106948574527, + "learning_rate": 9.269886363636365e-06, + "loss": 4.3258, + "step": 3264 + }, + { + "epoch": 0.27827495099292593, + "grad_norm": 434.2253352184606, + "learning_rate": 9.272727272727273e-06, + "loss": 2.8186, + "step": 3265 + }, + { + "epoch": 0.27836018068695134, + "grad_norm": 126.58864984364493, + "learning_rate": 9.275568181818182e-06, + "loss": 3.8418, + "step": 3266 + }, + { + "epoch": 0.27844541038097675, + "grad_norm": 88.40215805646204, + "learning_rate": 9.278409090909092e-06, + "loss": 4.5447, + "step": 3267 + }, + { + "epoch": 0.2785306400750021, + "grad_norm": 217.344241215452, + "learning_rate": 9.28125e-06, + "loss": 5.265, + "step": 3268 + }, + { + "epoch": 0.2786158697690275, + "grad_norm": 64.74925057667588, + "learning_rate": 9.28409090909091e-06, + "loss": 4.0657, + "step": 3269 + }, + { + "epoch": 0.27870109946305294, + "grad_norm": 78.23278506817947, + "learning_rate": 9.28693181818182e-06, + "loss": 4.1918, + "step": 3270 + }, + { + "epoch": 0.27878632915707835, + "grad_norm": 56.29872171594904, + "learning_rate": 9.289772727272728e-06, + "loss": 3.9995, + "step": 3271 + }, + { + "epoch": 0.2788715588511037, + "grad_norm": 378.95411370275025, + "learning_rate": 9.292613636363638e-06, + "loss": 4.1799, + "step": 3272 + }, + { + "epoch": 0.2789567885451291, + "grad_norm": 39.85181599233042, + "learning_rate": 9.295454545454546e-06, + "loss": 3.2139, + "step": 3273 + }, + { + "epoch": 0.27904201823915453, + "grad_norm": 115.28786620456768, + "learning_rate": 9.298295454545456e-06, + "loss": 2.196, + "step": 3274 + }, + { + "epoch": 0.27912724793317994, + "grad_norm": 42.08442018369902, + "learning_rate": 9.301136363636364e-06, + "loss": 2.5371, + "step": 3275 + }, + { + "epoch": 0.2792124776272053, + "grad_norm": 70.5635911561391, + "learning_rate": 9.303977272727273e-06, + "loss": 3.5285, + "step": 3276 + }, + { + "epoch": 0.2792977073212307, + "grad_norm": 50.831269749267705, + "learning_rate": 9.306818181818183e-06, + "loss": 2.9571, + "step": 3277 + }, + { + "epoch": 0.2793829370152561, + "grad_norm": 84.7610600295731, + "learning_rate": 9.309659090909091e-06, + "loss": 3.4726, + "step": 3278 + }, + { + "epoch": 0.27946816670928154, + "grad_norm": 78.13755345205811, + "learning_rate": 9.312500000000001e-06, + "loss": 3.8347, + "step": 3279 + }, + { + "epoch": 0.2795533964033069, + "grad_norm": 96.66073343450019, + "learning_rate": 9.315340909090909e-06, + "loss": 4.2449, + "step": 3280 + }, + { + "epoch": 0.2796386260973323, + "grad_norm": 63.48454769750423, + "learning_rate": 9.318181818181819e-06, + "loss": 2.8915, + "step": 3281 + }, + { + "epoch": 0.2797238557913577, + "grad_norm": 59.62451844955777, + "learning_rate": 9.321022727272729e-06, + "loss": 3.1478, + "step": 3282 + }, + { + "epoch": 0.2798090854853831, + "grad_norm": 116.45475073543821, + "learning_rate": 9.323863636363637e-06, + "loss": 3.0046, + "step": 3283 + }, + { + "epoch": 0.2798943151794085, + "grad_norm": 112.49561365566346, + "learning_rate": 9.326704545454547e-06, + "loss": 3.761, + "step": 3284 + }, + { + "epoch": 0.2799795448734339, + "grad_norm": 60.84797223492781, + "learning_rate": 9.329545454545456e-06, + "loss": 3.8028, + "step": 3285 + }, + { + "epoch": 0.2800647745674593, + "grad_norm": 50.980290665676854, + "learning_rate": 9.332386363636364e-06, + "loss": 3.3288, + "step": 3286 + }, + { + "epoch": 0.2801500042614847, + "grad_norm": 122.27475101055228, + "learning_rate": 9.335227272727272e-06, + "loss": 3.8523, + "step": 3287 + }, + { + "epoch": 0.2802352339555101, + "grad_norm": 49.72697459688628, + "learning_rate": 9.338068181818182e-06, + "loss": 3.5673, + "step": 3288 + }, + { + "epoch": 0.2803204636495355, + "grad_norm": 63.41879925306822, + "learning_rate": 9.340909090909092e-06, + "loss": 3.5853, + "step": 3289 + }, + { + "epoch": 0.2804056933435609, + "grad_norm": 56.43398319625509, + "learning_rate": 9.34375e-06, + "loss": 3.7883, + "step": 3290 + }, + { + "epoch": 0.28049092303758627, + "grad_norm": 58.79270152419748, + "learning_rate": 9.34659090909091e-06, + "loss": 2.8991, + "step": 3291 + }, + { + "epoch": 0.2805761527316117, + "grad_norm": 77.02479282082558, + "learning_rate": 9.34943181818182e-06, + "loss": 4.2206, + "step": 3292 + }, + { + "epoch": 0.2806613824256371, + "grad_norm": 63.44668765004152, + "learning_rate": 9.352272727272728e-06, + "loss": 3.3609, + "step": 3293 + }, + { + "epoch": 0.2807466121196625, + "grad_norm": 89.51652651810824, + "learning_rate": 9.355113636363638e-06, + "loss": 4.1247, + "step": 3294 + }, + { + "epoch": 0.28083184181368787, + "grad_norm": 97.73059952632109, + "learning_rate": 9.357954545454546e-06, + "loss": 4.7978, + "step": 3295 + }, + { + "epoch": 0.2809170715077133, + "grad_norm": 49.13331385340771, + "learning_rate": 9.360795454545455e-06, + "loss": 3.3359, + "step": 3296 + }, + { + "epoch": 0.2810023012017387, + "grad_norm": 102.63841815787814, + "learning_rate": 9.363636363636365e-06, + "loss": 3.7931, + "step": 3297 + }, + { + "epoch": 0.2810875308957641, + "grad_norm": 110.54267008218586, + "learning_rate": 9.366477272727273e-06, + "loss": 3.4149, + "step": 3298 + }, + { + "epoch": 0.28117276058978946, + "grad_norm": 60.32341213318364, + "learning_rate": 9.369318181818183e-06, + "loss": 3.7641, + "step": 3299 + }, + { + "epoch": 0.2812579902838149, + "grad_norm": 102.80315144352825, + "learning_rate": 9.372159090909093e-06, + "loss": 3.9526, + "step": 3300 + }, + { + "epoch": 0.2813432199778403, + "grad_norm": 66.5439159118095, + "learning_rate": 9.375000000000001e-06, + "loss": 3.4904, + "step": 3301 + }, + { + "epoch": 0.2814284496718657, + "grad_norm": 197.75047723109356, + "learning_rate": 9.377840909090909e-06, + "loss": 4.8236, + "step": 3302 + }, + { + "epoch": 0.28151367936589106, + "grad_norm": 113.64878478038653, + "learning_rate": 9.380681818181819e-06, + "loss": 3.7529, + "step": 3303 + }, + { + "epoch": 0.28159890905991647, + "grad_norm": 72.04259927235199, + "learning_rate": 9.383522727272729e-06, + "loss": 4.4087, + "step": 3304 + }, + { + "epoch": 0.2816841387539419, + "grad_norm": 108.95564573947941, + "learning_rate": 9.386363636363637e-06, + "loss": 4.6603, + "step": 3305 + }, + { + "epoch": 0.2817693684479673, + "grad_norm": 96.43445244641056, + "learning_rate": 9.389204545454546e-06, + "loss": 4.0659, + "step": 3306 + }, + { + "epoch": 0.28185459814199265, + "grad_norm": 55.440028538279705, + "learning_rate": 9.392045454545456e-06, + "loss": 4.5343, + "step": 3307 + }, + { + "epoch": 0.28193982783601806, + "grad_norm": 69.52475956506083, + "learning_rate": 9.394886363636364e-06, + "loss": 3.9628, + "step": 3308 + }, + { + "epoch": 0.2820250575300435, + "grad_norm": 152.5476182916554, + "learning_rate": 9.397727272727274e-06, + "loss": 3.5453, + "step": 3309 + }, + { + "epoch": 0.2821102872240689, + "grad_norm": 56.169526993151706, + "learning_rate": 9.400568181818182e-06, + "loss": 3.2817, + "step": 3310 + }, + { + "epoch": 0.28219551691809425, + "grad_norm": 102.58570346053135, + "learning_rate": 9.403409090909092e-06, + "loss": 3.665, + "step": 3311 + }, + { + "epoch": 0.28228074661211966, + "grad_norm": 96.09821818770976, + "learning_rate": 9.406250000000002e-06, + "loss": 3.8811, + "step": 3312 + }, + { + "epoch": 0.2823659763061451, + "grad_norm": 106.87997640103731, + "learning_rate": 9.40909090909091e-06, + "loss": 3.6154, + "step": 3313 + }, + { + "epoch": 0.2824512060001705, + "grad_norm": 81.46376750776952, + "learning_rate": 9.41193181818182e-06, + "loss": 3.7407, + "step": 3314 + }, + { + "epoch": 0.28253643569419584, + "grad_norm": 181.93946208594681, + "learning_rate": 9.41477272727273e-06, + "loss": 4.1952, + "step": 3315 + }, + { + "epoch": 0.28262166538822125, + "grad_norm": 41.723721337701804, + "learning_rate": 9.417613636363637e-06, + "loss": 3.2326, + "step": 3316 + }, + { + "epoch": 0.28270689508224667, + "grad_norm": 108.2524523316317, + "learning_rate": 9.420454545454545e-06, + "loss": 4.0342, + "step": 3317 + }, + { + "epoch": 0.2827921247762721, + "grad_norm": 185.58714365762796, + "learning_rate": 9.423295454545455e-06, + "loss": 4.1989, + "step": 3318 + }, + { + "epoch": 0.28287735447029744, + "grad_norm": 53.78253025588418, + "learning_rate": 9.426136363636365e-06, + "loss": 3.5282, + "step": 3319 + }, + { + "epoch": 0.28296258416432285, + "grad_norm": 64.08118674737378, + "learning_rate": 9.428977272727273e-06, + "loss": 3.5593, + "step": 3320 + }, + { + "epoch": 0.28304781385834826, + "grad_norm": 79.74777687096578, + "learning_rate": 9.431818181818183e-06, + "loss": 3.8265, + "step": 3321 + }, + { + "epoch": 0.2831330435523736, + "grad_norm": 48.354906072072495, + "learning_rate": 9.434659090909093e-06, + "loss": 3.558, + "step": 3322 + }, + { + "epoch": 0.28321827324639903, + "grad_norm": 530.4989916191375, + "learning_rate": 9.4375e-06, + "loss": 5.077, + "step": 3323 + }, + { + "epoch": 0.28330350294042445, + "grad_norm": 93.9195404941205, + "learning_rate": 9.44034090909091e-06, + "loss": 4.2503, + "step": 3324 + }, + { + "epoch": 0.28338873263444986, + "grad_norm": 69.76176528035505, + "learning_rate": 9.443181818181819e-06, + "loss": 3.5537, + "step": 3325 + }, + { + "epoch": 0.2834739623284752, + "grad_norm": 51.566170201929566, + "learning_rate": 9.446022727272728e-06, + "loss": 3.0305, + "step": 3326 + }, + { + "epoch": 0.28355919202250063, + "grad_norm": 110.13316585580282, + "learning_rate": 9.448863636363638e-06, + "loss": 4.2824, + "step": 3327 + }, + { + "epoch": 0.28364442171652604, + "grad_norm": 88.22997949425022, + "learning_rate": 9.451704545454546e-06, + "loss": 3.9066, + "step": 3328 + }, + { + "epoch": 0.28372965141055145, + "grad_norm": 49.96996132916845, + "learning_rate": 9.454545454545456e-06, + "loss": 3.1957, + "step": 3329 + }, + { + "epoch": 0.2838148811045768, + "grad_norm": 88.22975192681477, + "learning_rate": 9.457386363636364e-06, + "loss": 3.3967, + "step": 3330 + }, + { + "epoch": 0.2839001107986022, + "grad_norm": 182.5076548274656, + "learning_rate": 9.460227272727274e-06, + "loss": 5.6919, + "step": 3331 + }, + { + "epoch": 0.28398534049262764, + "grad_norm": 72.09854515050756, + "learning_rate": 9.463068181818182e-06, + "loss": 2.7865, + "step": 3332 + }, + { + "epoch": 0.28407057018665305, + "grad_norm": 72.8512508911399, + "learning_rate": 9.465909090909092e-06, + "loss": 3.0601, + "step": 3333 + }, + { + "epoch": 0.2841557998806784, + "grad_norm": 148.88639501280895, + "learning_rate": 9.468750000000001e-06, + "loss": 4.2525, + "step": 3334 + }, + { + "epoch": 0.2842410295747038, + "grad_norm": 69.05084889830943, + "learning_rate": 9.47159090909091e-06, + "loss": 4.1377, + "step": 3335 + }, + { + "epoch": 0.28432625926872923, + "grad_norm": 76.72081690475015, + "learning_rate": 9.474431818181818e-06, + "loss": 3.9692, + "step": 3336 + }, + { + "epoch": 0.28441148896275464, + "grad_norm": 170.692949782973, + "learning_rate": 9.477272727272729e-06, + "loss": 4.3827, + "step": 3337 + }, + { + "epoch": 0.28449671865678, + "grad_norm": 150.78104473012905, + "learning_rate": 9.480113636363637e-06, + "loss": 4.1418, + "step": 3338 + }, + { + "epoch": 0.2845819483508054, + "grad_norm": 312.39233581536297, + "learning_rate": 9.482954545454545e-06, + "loss": 4.3667, + "step": 3339 + }, + { + "epoch": 0.2846671780448308, + "grad_norm": 50.13034816784062, + "learning_rate": 9.485795454545455e-06, + "loss": 3.524, + "step": 3340 + }, + { + "epoch": 0.28475240773885624, + "grad_norm": 97.71022831607739, + "learning_rate": 9.488636363636365e-06, + "loss": 4.0242, + "step": 3341 + }, + { + "epoch": 0.2848376374328816, + "grad_norm": 95.67789824481417, + "learning_rate": 9.491477272727273e-06, + "loss": 3.3062, + "step": 3342 + }, + { + "epoch": 0.284922867126907, + "grad_norm": 88.78943643880122, + "learning_rate": 9.494318181818183e-06, + "loss": 3.5302, + "step": 3343 + }, + { + "epoch": 0.2850080968209324, + "grad_norm": 215.31622187913717, + "learning_rate": 9.497159090909092e-06, + "loss": 6.0497, + "step": 3344 + }, + { + "epoch": 0.28509332651495783, + "grad_norm": 36.62920695093224, + "learning_rate": 9.5e-06, + "loss": 2.7943, + "step": 3345 + }, + { + "epoch": 0.2851785562089832, + "grad_norm": 107.44358609170888, + "learning_rate": 9.50284090909091e-06, + "loss": 4.4092, + "step": 3346 + }, + { + "epoch": 0.2852637859030086, + "grad_norm": 39.146784553481815, + "learning_rate": 9.505681818181818e-06, + "loss": 2.802, + "step": 3347 + }, + { + "epoch": 0.285349015597034, + "grad_norm": 39.53356095379945, + "learning_rate": 9.508522727272728e-06, + "loss": 3.4444, + "step": 3348 + }, + { + "epoch": 0.28543424529105943, + "grad_norm": 210.40983815192786, + "learning_rate": 9.511363636363638e-06, + "loss": 4.5452, + "step": 3349 + }, + { + "epoch": 0.2855194749850848, + "grad_norm": 101.20910210877686, + "learning_rate": 9.514204545454546e-06, + "loss": 4.1538, + "step": 3350 + }, + { + "epoch": 0.2856047046791102, + "grad_norm": 51.96875106170758, + "learning_rate": 9.517045454545454e-06, + "loss": 3.5054, + "step": 3351 + }, + { + "epoch": 0.2856899343731356, + "grad_norm": 287.6840323781935, + "learning_rate": 9.519886363636366e-06, + "loss": 4.3524, + "step": 3352 + }, + { + "epoch": 0.285775164067161, + "grad_norm": 77.7758572215801, + "learning_rate": 9.522727272727274e-06, + "loss": 2.8724, + "step": 3353 + }, + { + "epoch": 0.2858603937611864, + "grad_norm": 63.22110015100047, + "learning_rate": 9.525568181818182e-06, + "loss": 3.2419, + "step": 3354 + }, + { + "epoch": 0.2859456234552118, + "grad_norm": 157.52030188663826, + "learning_rate": 9.528409090909092e-06, + "loss": 5.1293, + "step": 3355 + }, + { + "epoch": 0.2860308531492372, + "grad_norm": 53.78901603221538, + "learning_rate": 9.531250000000001e-06, + "loss": 2.9679, + "step": 3356 + }, + { + "epoch": 0.2861160828432626, + "grad_norm": 103.28581620988365, + "learning_rate": 9.53409090909091e-06, + "loss": 3.7561, + "step": 3357 + }, + { + "epoch": 0.286201312537288, + "grad_norm": 110.78513166883805, + "learning_rate": 9.536931818181819e-06, + "loss": 3.6622, + "step": 3358 + }, + { + "epoch": 0.2862865422313134, + "grad_norm": 66.17070217273103, + "learning_rate": 9.539772727272729e-06, + "loss": 3.6164, + "step": 3359 + }, + { + "epoch": 0.2863717719253388, + "grad_norm": 105.90673008040116, + "learning_rate": 9.542613636363637e-06, + "loss": 4.9228, + "step": 3360 + }, + { + "epoch": 0.28645700161936416, + "grad_norm": 58.058587961197425, + "learning_rate": 9.545454545454547e-06, + "loss": 3.5007, + "step": 3361 + }, + { + "epoch": 0.2865422313133896, + "grad_norm": 61.197233431761894, + "learning_rate": 9.548295454545455e-06, + "loss": 3.8071, + "step": 3362 + }, + { + "epoch": 0.286627461007415, + "grad_norm": 89.44445204456267, + "learning_rate": 9.551136363636365e-06, + "loss": 3.6655, + "step": 3363 + }, + { + "epoch": 0.2867126907014404, + "grad_norm": 66.9173959955533, + "learning_rate": 9.553977272727274e-06, + "loss": 3.5853, + "step": 3364 + }, + { + "epoch": 0.28679792039546576, + "grad_norm": 120.0744933153448, + "learning_rate": 9.556818181818182e-06, + "loss": 4.7064, + "step": 3365 + }, + { + "epoch": 0.28688315008949117, + "grad_norm": 66.36276627330626, + "learning_rate": 9.55965909090909e-06, + "loss": 2.9959, + "step": 3366 + }, + { + "epoch": 0.2869683797835166, + "grad_norm": 115.15532609171215, + "learning_rate": 9.562500000000002e-06, + "loss": 4.0829, + "step": 3367 + }, + { + "epoch": 0.287053609477542, + "grad_norm": 102.64497028169725, + "learning_rate": 9.56534090909091e-06, + "loss": 4.8976, + "step": 3368 + }, + { + "epoch": 0.28713883917156735, + "grad_norm": 59.58842917663587, + "learning_rate": 9.568181818181818e-06, + "loss": 4.3173, + "step": 3369 + }, + { + "epoch": 0.28722406886559276, + "grad_norm": 38.615903122339056, + "learning_rate": 9.571022727272728e-06, + "loss": 2.7757, + "step": 3370 + }, + { + "epoch": 0.2873092985596182, + "grad_norm": 92.15619567778381, + "learning_rate": 9.573863636363638e-06, + "loss": 3.8633, + "step": 3371 + }, + { + "epoch": 0.2873945282536436, + "grad_norm": 104.51326738360133, + "learning_rate": 9.576704545454546e-06, + "loss": 3.2508, + "step": 3372 + }, + { + "epoch": 0.28747975794766895, + "grad_norm": 53.464995782878674, + "learning_rate": 9.579545454545456e-06, + "loss": 3.3412, + "step": 3373 + }, + { + "epoch": 0.28756498764169436, + "grad_norm": 78.11006625898636, + "learning_rate": 9.582386363636364e-06, + "loss": 2.259, + "step": 3374 + }, + { + "epoch": 0.28765021733571977, + "grad_norm": 65.22347662634881, + "learning_rate": 9.585227272727273e-06, + "loss": 3.4711, + "step": 3375 + }, + { + "epoch": 0.2877354470297452, + "grad_norm": 90.6393341728298, + "learning_rate": 9.588068181818183e-06, + "loss": 3.8107, + "step": 3376 + }, + { + "epoch": 0.28782067672377054, + "grad_norm": 73.96809759601365, + "learning_rate": 9.590909090909091e-06, + "loss": 3.6737, + "step": 3377 + }, + { + "epoch": 0.28790590641779595, + "grad_norm": 88.03522706932368, + "learning_rate": 9.593750000000001e-06, + "loss": 4.3463, + "step": 3378 + }, + { + "epoch": 0.28799113611182137, + "grad_norm": 129.34553184394625, + "learning_rate": 9.596590909090911e-06, + "loss": 3.9082, + "step": 3379 + }, + { + "epoch": 0.2880763658058468, + "grad_norm": 132.59570821832233, + "learning_rate": 9.599431818181819e-06, + "loss": 4.2887, + "step": 3380 + }, + { + "epoch": 0.28816159549987214, + "grad_norm": 128.47081935807864, + "learning_rate": 9.602272727272727e-06, + "loss": 3.75, + "step": 3381 + }, + { + "epoch": 0.28824682519389755, + "grad_norm": 124.23200736722096, + "learning_rate": 9.605113636363639e-06, + "loss": 4.5765, + "step": 3382 + }, + { + "epoch": 0.28833205488792296, + "grad_norm": 97.74093635294017, + "learning_rate": 9.607954545454547e-06, + "loss": 4.6795, + "step": 3383 + }, + { + "epoch": 0.2884172845819484, + "grad_norm": 78.70518569812084, + "learning_rate": 9.610795454545455e-06, + "loss": 3.7338, + "step": 3384 + }, + { + "epoch": 0.28850251427597373, + "grad_norm": 142.60125864243275, + "learning_rate": 9.613636363636364e-06, + "loss": 6.1039, + "step": 3385 + }, + { + "epoch": 0.28858774396999914, + "grad_norm": 97.02706743221785, + "learning_rate": 9.616477272727274e-06, + "loss": 4.3753, + "step": 3386 + }, + { + "epoch": 0.28867297366402456, + "grad_norm": 111.38017170662894, + "learning_rate": 9.619318181818182e-06, + "loss": 4.5387, + "step": 3387 + }, + { + "epoch": 0.28875820335804997, + "grad_norm": 695.8570157688081, + "learning_rate": 9.62215909090909e-06, + "loss": 4.2573, + "step": 3388 + }, + { + "epoch": 0.2888434330520753, + "grad_norm": 103.43519219499521, + "learning_rate": 9.625e-06, + "loss": 4.4614, + "step": 3389 + }, + { + "epoch": 0.28892866274610074, + "grad_norm": 58.9207202586063, + "learning_rate": 9.62784090909091e-06, + "loss": 4.0572, + "step": 3390 + }, + { + "epoch": 0.28901389244012615, + "grad_norm": 231.3694901572708, + "learning_rate": 9.630681818181818e-06, + "loss": 2.7017, + "step": 3391 + }, + { + "epoch": 0.28909912213415156, + "grad_norm": 155.70251377058085, + "learning_rate": 9.633522727272728e-06, + "loss": 4.1731, + "step": 3392 + }, + { + "epoch": 0.2891843518281769, + "grad_norm": 174.41808645813276, + "learning_rate": 9.636363636363638e-06, + "loss": 5.5052, + "step": 3393 + }, + { + "epoch": 0.28926958152220233, + "grad_norm": 107.740243422069, + "learning_rate": 9.639204545454546e-06, + "loss": 5.0336, + "step": 3394 + }, + { + "epoch": 0.28935481121622775, + "grad_norm": 56.007436885184816, + "learning_rate": 9.642045454545455e-06, + "loss": 3.1727, + "step": 3395 + }, + { + "epoch": 0.2894400409102531, + "grad_norm": 161.159741252078, + "learning_rate": 9.644886363636364e-06, + "loss": 3.6062, + "step": 3396 + }, + { + "epoch": 0.2895252706042785, + "grad_norm": 73.18541248299506, + "learning_rate": 9.647727272727273e-06, + "loss": 3.526, + "step": 3397 + }, + { + "epoch": 0.28961050029830393, + "grad_norm": 36.2647331300225, + "learning_rate": 9.650568181818183e-06, + "loss": 3.0332, + "step": 3398 + }, + { + "epoch": 0.28969572999232934, + "grad_norm": 76.74765554353145, + "learning_rate": 9.653409090909091e-06, + "loss": 4.4202, + "step": 3399 + }, + { + "epoch": 0.2897809596863547, + "grad_norm": 59.255814342639695, + "learning_rate": 9.656250000000001e-06, + "loss": 3.0461, + "step": 3400 + }, + { + "epoch": 0.2898661893803801, + "grad_norm": 51.50420002820249, + "learning_rate": 9.65909090909091e-06, + "loss": 3.637, + "step": 3401 + }, + { + "epoch": 0.2899514190744055, + "grad_norm": 30.887210273879965, + "learning_rate": 9.661931818181819e-06, + "loss": 3.0174, + "step": 3402 + }, + { + "epoch": 0.29003664876843094, + "grad_norm": 101.2516517565732, + "learning_rate": 9.664772727272727e-06, + "loss": 3.9915, + "step": 3403 + }, + { + "epoch": 0.2901218784624563, + "grad_norm": 44.10100132927209, + "learning_rate": 9.667613636363637e-06, + "loss": 3.1184, + "step": 3404 + }, + { + "epoch": 0.2902071081564817, + "grad_norm": 105.81728157569788, + "learning_rate": 9.670454545454546e-06, + "loss": 4.9376, + "step": 3405 + }, + { + "epoch": 0.2902923378505071, + "grad_norm": 120.05402892340143, + "learning_rate": 9.673295454545455e-06, + "loss": 3.8764, + "step": 3406 + }, + { + "epoch": 0.29037756754453253, + "grad_norm": 79.04266940374184, + "learning_rate": 9.676136363636364e-06, + "loss": 4.2925, + "step": 3407 + }, + { + "epoch": 0.2904627972385579, + "grad_norm": 74.17444440217652, + "learning_rate": 9.678977272727274e-06, + "loss": 4.0248, + "step": 3408 + }, + { + "epoch": 0.2905480269325833, + "grad_norm": 104.67192336202102, + "learning_rate": 9.681818181818182e-06, + "loss": 3.5257, + "step": 3409 + }, + { + "epoch": 0.2906332566266087, + "grad_norm": 57.84350733882128, + "learning_rate": 9.684659090909092e-06, + "loss": 4.0817, + "step": 3410 + }, + { + "epoch": 0.29071848632063413, + "grad_norm": 102.98244202845328, + "learning_rate": 9.6875e-06, + "loss": 5.0442, + "step": 3411 + }, + { + "epoch": 0.2908037160146595, + "grad_norm": 56.65882896337686, + "learning_rate": 9.69034090909091e-06, + "loss": 3.9453, + "step": 3412 + }, + { + "epoch": 0.2908889457086849, + "grad_norm": 102.36781695682424, + "learning_rate": 9.69318181818182e-06, + "loss": 4.7098, + "step": 3413 + }, + { + "epoch": 0.2909741754027103, + "grad_norm": 43.306640663167094, + "learning_rate": 9.696022727272728e-06, + "loss": 3.0407, + "step": 3414 + }, + { + "epoch": 0.2910594050967357, + "grad_norm": 47.79280195865059, + "learning_rate": 9.698863636363637e-06, + "loss": 3.7308, + "step": 3415 + }, + { + "epoch": 0.2911446347907611, + "grad_norm": 86.66297648720943, + "learning_rate": 9.701704545454547e-06, + "loss": 4.2128, + "step": 3416 + }, + { + "epoch": 0.2912298644847865, + "grad_norm": 72.9193489199687, + "learning_rate": 9.704545454545455e-06, + "loss": 3.792, + "step": 3417 + }, + { + "epoch": 0.2913150941788119, + "grad_norm": 61.124441912993134, + "learning_rate": 9.707386363636363e-06, + "loss": 3.708, + "step": 3418 + }, + { + "epoch": 0.2914003238728373, + "grad_norm": 72.54289214926321, + "learning_rate": 9.710227272727273e-06, + "loss": 3.3025, + "step": 3419 + }, + { + "epoch": 0.2914855535668627, + "grad_norm": 109.49649867721418, + "learning_rate": 9.713068181818183e-06, + "loss": 4.1419, + "step": 3420 + }, + { + "epoch": 0.2915707832608881, + "grad_norm": 102.67485127863111, + "learning_rate": 9.715909090909091e-06, + "loss": 4.678, + "step": 3421 + }, + { + "epoch": 0.2916560129549135, + "grad_norm": 93.49206972005352, + "learning_rate": 9.71875e-06, + "loss": 4.5173, + "step": 3422 + }, + { + "epoch": 0.2917412426489389, + "grad_norm": 49.275238373084484, + "learning_rate": 9.72159090909091e-06, + "loss": 4.0884, + "step": 3423 + }, + { + "epoch": 0.29182647234296427, + "grad_norm": 56.24262288124333, + "learning_rate": 9.724431818181819e-06, + "loss": 3.2285, + "step": 3424 + }, + { + "epoch": 0.2919117020369897, + "grad_norm": 66.62420366785058, + "learning_rate": 9.727272727272728e-06, + "loss": 2.9885, + "step": 3425 + }, + { + "epoch": 0.2919969317310151, + "grad_norm": 49.34313651358443, + "learning_rate": 9.730113636363636e-06, + "loss": 3.2515, + "step": 3426 + }, + { + "epoch": 0.2920821614250405, + "grad_norm": 46.41807161144253, + "learning_rate": 9.732954545454546e-06, + "loss": 3.5914, + "step": 3427 + }, + { + "epoch": 0.29216739111906587, + "grad_norm": 265.8014113892425, + "learning_rate": 9.735795454545456e-06, + "loss": 4.7498, + "step": 3428 + }, + { + "epoch": 0.2922526208130913, + "grad_norm": 42.767557472881336, + "learning_rate": 9.738636363636364e-06, + "loss": 2.7785, + "step": 3429 + }, + { + "epoch": 0.2923378505071167, + "grad_norm": 106.17577379960326, + "learning_rate": 9.741477272727274e-06, + "loss": 4.1735, + "step": 3430 + }, + { + "epoch": 0.2924230802011421, + "grad_norm": 71.66958069229581, + "learning_rate": 9.744318181818184e-06, + "loss": 3.9816, + "step": 3431 + }, + { + "epoch": 0.29250830989516746, + "grad_norm": 120.47958043453103, + "learning_rate": 9.747159090909092e-06, + "loss": 3.9479, + "step": 3432 + }, + { + "epoch": 0.2925935395891929, + "grad_norm": 66.23894666226256, + "learning_rate": 9.75e-06, + "loss": 3.3358, + "step": 3433 + }, + { + "epoch": 0.2926787692832183, + "grad_norm": 63.553777771660236, + "learning_rate": 9.75284090909091e-06, + "loss": 3.7711, + "step": 3434 + }, + { + "epoch": 0.29276399897724364, + "grad_norm": 74.8703897206818, + "learning_rate": 9.75568181818182e-06, + "loss": 3.9362, + "step": 3435 + }, + { + "epoch": 0.29284922867126906, + "grad_norm": 80.11017468883814, + "learning_rate": 9.758522727272727e-06, + "loss": 4.3253, + "step": 3436 + }, + { + "epoch": 0.29293445836529447, + "grad_norm": 39.56801054853702, + "learning_rate": 9.761363636363637e-06, + "loss": 1.7709, + "step": 3437 + }, + { + "epoch": 0.2930196880593199, + "grad_norm": 79.23093206037068, + "learning_rate": 9.764204545454547e-06, + "loss": 4.1646, + "step": 3438 + }, + { + "epoch": 0.29310491775334524, + "grad_norm": 117.2328238956991, + "learning_rate": 9.767045454545455e-06, + "loss": 4.8207, + "step": 3439 + }, + { + "epoch": 0.29319014744737065, + "grad_norm": 100.78819116261548, + "learning_rate": 9.769886363636363e-06, + "loss": 4.9809, + "step": 3440 + }, + { + "epoch": 0.29327537714139607, + "grad_norm": 95.55511584008855, + "learning_rate": 9.772727272727273e-06, + "loss": 3.8624, + "step": 3441 + }, + { + "epoch": 0.2933606068354215, + "grad_norm": 94.79447046784426, + "learning_rate": 9.775568181818183e-06, + "loss": 3.3061, + "step": 3442 + }, + { + "epoch": 0.29344583652944684, + "grad_norm": 124.50953487348977, + "learning_rate": 9.77840909090909e-06, + "loss": 5.3103, + "step": 3443 + }, + { + "epoch": 0.29353106622347225, + "grad_norm": 81.39049038771384, + "learning_rate": 9.78125e-06, + "loss": 3.6252, + "step": 3444 + }, + { + "epoch": 0.29361629591749766, + "grad_norm": 148.92477934269283, + "learning_rate": 9.78409090909091e-06, + "loss": 4.6073, + "step": 3445 + }, + { + "epoch": 0.2937015256115231, + "grad_norm": 79.03846205611667, + "learning_rate": 9.786931818181818e-06, + "loss": 4.3305, + "step": 3446 + }, + { + "epoch": 0.29378675530554843, + "grad_norm": 86.79641547417926, + "learning_rate": 9.789772727272728e-06, + "loss": 4.1703, + "step": 3447 + }, + { + "epoch": 0.29387198499957384, + "grad_norm": 50.81558513593237, + "learning_rate": 9.792613636363636e-06, + "loss": 4.3407, + "step": 3448 + }, + { + "epoch": 0.29395721469359926, + "grad_norm": 46.57457753042483, + "learning_rate": 9.795454545454546e-06, + "loss": 3.1513, + "step": 3449 + }, + { + "epoch": 0.29404244438762467, + "grad_norm": 190.12609147098655, + "learning_rate": 9.798295454545456e-06, + "loss": 4.2049, + "step": 3450 + }, + { + "epoch": 0.29412767408165, + "grad_norm": 55.86874179112734, + "learning_rate": 9.801136363636364e-06, + "loss": 4.0577, + "step": 3451 + }, + { + "epoch": 0.29421290377567544, + "grad_norm": 77.23176049299782, + "learning_rate": 9.803977272727274e-06, + "loss": 4.8997, + "step": 3452 + }, + { + "epoch": 0.29429813346970085, + "grad_norm": 69.33957742359691, + "learning_rate": 9.806818181818183e-06, + "loss": 4.0603, + "step": 3453 + }, + { + "epoch": 0.29438336316372626, + "grad_norm": 73.87601471197486, + "learning_rate": 9.809659090909092e-06, + "loss": 4.1381, + "step": 3454 + }, + { + "epoch": 0.2944685928577516, + "grad_norm": 123.37326310470485, + "learning_rate": 9.8125e-06, + "loss": 3.572, + "step": 3455 + }, + { + "epoch": 0.29455382255177703, + "grad_norm": 136.56593829258284, + "learning_rate": 9.81534090909091e-06, + "loss": 4.1658, + "step": 3456 + }, + { + "epoch": 0.29463905224580245, + "grad_norm": 68.77110955071622, + "learning_rate": 9.81818181818182e-06, + "loss": 4.238, + "step": 3457 + }, + { + "epoch": 0.29472428193982786, + "grad_norm": 149.51516176810603, + "learning_rate": 9.821022727272727e-06, + "loss": 3.7922, + "step": 3458 + }, + { + "epoch": 0.2948095116338532, + "grad_norm": 107.65482330264172, + "learning_rate": 9.823863636363637e-06, + "loss": 3.6212, + "step": 3459 + }, + { + "epoch": 0.29489474132787863, + "grad_norm": 83.35380664144542, + "learning_rate": 9.826704545454547e-06, + "loss": 3.2896, + "step": 3460 + }, + { + "epoch": 0.29497997102190404, + "grad_norm": 51.45230910696642, + "learning_rate": 9.829545454545455e-06, + "loss": 3.9487, + "step": 3461 + }, + { + "epoch": 0.29506520071592945, + "grad_norm": 198.22025945676728, + "learning_rate": 9.832386363636365e-06, + "loss": 2.8964, + "step": 3462 + }, + { + "epoch": 0.2951504304099548, + "grad_norm": 74.79480490494055, + "learning_rate": 9.835227272727273e-06, + "loss": 3.6117, + "step": 3463 + }, + { + "epoch": 0.2952356601039802, + "grad_norm": 77.65408429946682, + "learning_rate": 9.838068181818183e-06, + "loss": 3.7341, + "step": 3464 + }, + { + "epoch": 0.29532088979800564, + "grad_norm": 55.02655532204462, + "learning_rate": 9.840909090909092e-06, + "loss": 4.1321, + "step": 3465 + }, + { + "epoch": 0.29540611949203105, + "grad_norm": 88.57986648565658, + "learning_rate": 9.84375e-06, + "loss": 4.3026, + "step": 3466 + }, + { + "epoch": 0.2954913491860564, + "grad_norm": 122.20510712727359, + "learning_rate": 9.84659090909091e-06, + "loss": 5.0823, + "step": 3467 + }, + { + "epoch": 0.2955765788800818, + "grad_norm": 74.71592187965489, + "learning_rate": 9.84943181818182e-06, + "loss": 4.0924, + "step": 3468 + }, + { + "epoch": 0.29566180857410723, + "grad_norm": 63.57292180404807, + "learning_rate": 9.852272727272728e-06, + "loss": 3.7804, + "step": 3469 + }, + { + "epoch": 0.29574703826813264, + "grad_norm": 58.17433376430402, + "learning_rate": 9.855113636363636e-06, + "loss": 3.8182, + "step": 3470 + }, + { + "epoch": 0.295832267962158, + "grad_norm": 87.9948454518646, + "learning_rate": 9.857954545454546e-06, + "loss": 4.0191, + "step": 3471 + }, + { + "epoch": 0.2959174976561834, + "grad_norm": 90.04779236482264, + "learning_rate": 9.860795454545456e-06, + "loss": 4.4034, + "step": 3472 + }, + { + "epoch": 0.2960027273502088, + "grad_norm": 105.11352823862133, + "learning_rate": 9.863636363636364e-06, + "loss": 4.5871, + "step": 3473 + }, + { + "epoch": 0.2960879570442342, + "grad_norm": 123.40981988592463, + "learning_rate": 9.866477272727274e-06, + "loss": 5.2675, + "step": 3474 + }, + { + "epoch": 0.2961731867382596, + "grad_norm": 49.177906527870675, + "learning_rate": 9.869318181818183e-06, + "loss": 3.0999, + "step": 3475 + }, + { + "epoch": 0.296258416432285, + "grad_norm": 209.58039852220497, + "learning_rate": 9.872159090909091e-06, + "loss": 4.4654, + "step": 3476 + }, + { + "epoch": 0.2963436461263104, + "grad_norm": 51.69419667157767, + "learning_rate": 9.875000000000001e-06, + "loss": 2.6233, + "step": 3477 + }, + { + "epoch": 0.2964288758203358, + "grad_norm": 45.230944923732665, + "learning_rate": 9.87784090909091e-06, + "loss": 2.5766, + "step": 3478 + }, + { + "epoch": 0.2965141055143612, + "grad_norm": 81.12378164362799, + "learning_rate": 9.880681818181819e-06, + "loss": 4.9445, + "step": 3479 + }, + { + "epoch": 0.2965993352083866, + "grad_norm": 67.57784521231379, + "learning_rate": 9.883522727272729e-06, + "loss": 4.0095, + "step": 3480 + }, + { + "epoch": 0.296684564902412, + "grad_norm": 87.07411393141372, + "learning_rate": 9.886363636363637e-06, + "loss": 3.4837, + "step": 3481 + }, + { + "epoch": 0.2967697945964374, + "grad_norm": 101.57703819210623, + "learning_rate": 9.889204545454547e-06, + "loss": 4.4058, + "step": 3482 + }, + { + "epoch": 0.2968550242904628, + "grad_norm": 171.187792805753, + "learning_rate": 9.892045454545456e-06, + "loss": 5.0902, + "step": 3483 + }, + { + "epoch": 0.2969402539844882, + "grad_norm": 171.29758995764527, + "learning_rate": 9.894886363636365e-06, + "loss": 4.7568, + "step": 3484 + }, + { + "epoch": 0.2970254836785136, + "grad_norm": 106.63037205970629, + "learning_rate": 9.897727272727273e-06, + "loss": 4.1278, + "step": 3485 + }, + { + "epoch": 0.29711071337253897, + "grad_norm": 183.89155948415166, + "learning_rate": 9.900568181818182e-06, + "loss": 4.6431, + "step": 3486 + }, + { + "epoch": 0.2971959430665644, + "grad_norm": 98.69376381362173, + "learning_rate": 9.903409090909092e-06, + "loss": 3.4349, + "step": 3487 + }, + { + "epoch": 0.2972811727605898, + "grad_norm": 61.65252895656829, + "learning_rate": 9.90625e-06, + "loss": 3.7984, + "step": 3488 + }, + { + "epoch": 0.2973664024546152, + "grad_norm": 54.298097644470964, + "learning_rate": 9.90909090909091e-06, + "loss": 3.6176, + "step": 3489 + }, + { + "epoch": 0.29745163214864057, + "grad_norm": 88.16592783680984, + "learning_rate": 9.91193181818182e-06, + "loss": 4.1225, + "step": 3490 + }, + { + "epoch": 0.297536861842666, + "grad_norm": 97.8331678106395, + "learning_rate": 9.914772727272728e-06, + "loss": 4.4916, + "step": 3491 + }, + { + "epoch": 0.2976220915366914, + "grad_norm": 41.28363816919563, + "learning_rate": 9.917613636363638e-06, + "loss": 3.1969, + "step": 3492 + }, + { + "epoch": 0.2977073212307168, + "grad_norm": 75.54933936848595, + "learning_rate": 9.920454545454546e-06, + "loss": 3.5069, + "step": 3493 + }, + { + "epoch": 0.29779255092474216, + "grad_norm": 153.13883467551744, + "learning_rate": 9.923295454545456e-06, + "loss": 5.4506, + "step": 3494 + }, + { + "epoch": 0.2978777806187676, + "grad_norm": 86.16505489725739, + "learning_rate": 9.926136363636364e-06, + "loss": 4.1156, + "step": 3495 + }, + { + "epoch": 0.297963010312793, + "grad_norm": 97.55282803464515, + "learning_rate": 9.928977272727273e-06, + "loss": 4.0975, + "step": 3496 + }, + { + "epoch": 0.2980482400068184, + "grad_norm": 69.02239183544938, + "learning_rate": 9.931818181818183e-06, + "loss": 3.3548, + "step": 3497 + }, + { + "epoch": 0.29813346970084376, + "grad_norm": 69.56657106606636, + "learning_rate": 9.934659090909091e-06, + "loss": 3.4735, + "step": 3498 + }, + { + "epoch": 0.29821869939486917, + "grad_norm": 56.39479519841421, + "learning_rate": 9.937500000000001e-06, + "loss": 3.8274, + "step": 3499 + }, + { + "epoch": 0.2983039290888946, + "grad_norm": 63.370693345233164, + "learning_rate": 9.940340909090909e-06, + "loss": 3.7545, + "step": 3500 + }, + { + "epoch": 0.29838915878292, + "grad_norm": 76.85585423855373, + "learning_rate": 9.943181818181819e-06, + "loss": 4.0892, + "step": 3501 + }, + { + "epoch": 0.29847438847694535, + "grad_norm": 114.35250989720765, + "learning_rate": 9.946022727272729e-06, + "loss": 3.6984, + "step": 3502 + }, + { + "epoch": 0.29855961817097076, + "grad_norm": 147.01420298146212, + "learning_rate": 9.948863636363637e-06, + "loss": 4.2678, + "step": 3503 + }, + { + "epoch": 0.2986448478649962, + "grad_norm": 100.45257973770275, + "learning_rate": 9.951704545454546e-06, + "loss": 3.955, + "step": 3504 + }, + { + "epoch": 0.2987300775590216, + "grad_norm": 111.36029137433165, + "learning_rate": 9.954545454545456e-06, + "loss": 4.072, + "step": 3505 + }, + { + "epoch": 0.29881530725304695, + "grad_norm": 65.89619405060053, + "learning_rate": 9.957386363636364e-06, + "loss": 3.6226, + "step": 3506 + }, + { + "epoch": 0.29890053694707236, + "grad_norm": 370.82134778574715, + "learning_rate": 9.960227272727272e-06, + "loss": 4.5933, + "step": 3507 + }, + { + "epoch": 0.29898576664109777, + "grad_norm": 93.59582993647744, + "learning_rate": 9.963068181818182e-06, + "loss": 4.1233, + "step": 3508 + }, + { + "epoch": 0.29907099633512313, + "grad_norm": 69.07329838331067, + "learning_rate": 9.965909090909092e-06, + "loss": 3.6593, + "step": 3509 + }, + { + "epoch": 0.29915622602914854, + "grad_norm": 65.08722107642191, + "learning_rate": 9.96875e-06, + "loss": 4.0312, + "step": 3510 + }, + { + "epoch": 0.29924145572317395, + "grad_norm": 58.75521963635854, + "learning_rate": 9.97159090909091e-06, + "loss": 3.8159, + "step": 3511 + }, + { + "epoch": 0.29932668541719937, + "grad_norm": 52.58084391469145, + "learning_rate": 9.97443181818182e-06, + "loss": 3.2237, + "step": 3512 + }, + { + "epoch": 0.2994119151112247, + "grad_norm": 162.25142296789198, + "learning_rate": 9.977272727272728e-06, + "loss": 4.1916, + "step": 3513 + }, + { + "epoch": 0.29949714480525014, + "grad_norm": 103.78562264126613, + "learning_rate": 9.980113636363637e-06, + "loss": 4.581, + "step": 3514 + }, + { + "epoch": 0.29958237449927555, + "grad_norm": 65.74050193538896, + "learning_rate": 9.982954545454546e-06, + "loss": 3.9235, + "step": 3515 + }, + { + "epoch": 0.29966760419330096, + "grad_norm": 53.13606492749633, + "learning_rate": 9.985795454545455e-06, + "loss": 3.6221, + "step": 3516 + }, + { + "epoch": 0.2997528338873263, + "grad_norm": 104.97338088378575, + "learning_rate": 9.988636363636365e-06, + "loss": 3.5875, + "step": 3517 + }, + { + "epoch": 0.29983806358135173, + "grad_norm": 66.29565843318639, + "learning_rate": 9.991477272727273e-06, + "loss": 3.085, + "step": 3518 + }, + { + "epoch": 0.29992329327537715, + "grad_norm": 75.09338564606337, + "learning_rate": 9.994318181818183e-06, + "loss": 3.6423, + "step": 3519 + }, + { + "epoch": 0.30000852296940256, + "grad_norm": 69.06980775961557, + "learning_rate": 9.997159090909093e-06, + "loss": 3.3125, + "step": 3520 + }, + { + "epoch": 0.3000937526634279, + "grad_norm": 186.96747088252417, + "learning_rate": 1e-05, + "loss": 2.995, + "step": 3521 + }, + { + "epoch": 0.30017898235745333, + "grad_norm": 236.10961558298723, + "learning_rate": 9.999999975413494e-06, + "loss": 4.1037, + "step": 3522 + }, + { + "epoch": 0.30026421205147874, + "grad_norm": 218.8208074791077, + "learning_rate": 9.999999901653973e-06, + "loss": 3.7325, + "step": 3523 + }, + { + "epoch": 0.30034944174550415, + "grad_norm": 97.35972621054552, + "learning_rate": 9.999999778721442e-06, + "loss": 3.9667, + "step": 3524 + }, + { + "epoch": 0.3004346714395295, + "grad_norm": 139.20474511087818, + "learning_rate": 9.9999996066159e-06, + "loss": 4.3612, + "step": 3525 + }, + { + "epoch": 0.3005199011335549, + "grad_norm": 98.76044002446378, + "learning_rate": 9.999999385337348e-06, + "loss": 4.7182, + "step": 3526 + }, + { + "epoch": 0.30060513082758034, + "grad_norm": 74.90979879112577, + "learning_rate": 9.999999114885789e-06, + "loss": 4.1344, + "step": 3527 + }, + { + "epoch": 0.30069036052160575, + "grad_norm": 43.15711914258759, + "learning_rate": 9.999998795261225e-06, + "loss": 2.6652, + "step": 3528 + }, + { + "epoch": 0.3007755902156311, + "grad_norm": 80.67078965795876, + "learning_rate": 9.999998426463659e-06, + "loss": 3.8036, + "step": 3529 + }, + { + "epoch": 0.3008608199096565, + "grad_norm": 58.37580787336459, + "learning_rate": 9.999998008493096e-06, + "loss": 3.6586, + "step": 3530 + }, + { + "epoch": 0.30094604960368193, + "grad_norm": 40.25465941713098, + "learning_rate": 9.99999754134954e-06, + "loss": 3.7214, + "step": 3531 + }, + { + "epoch": 0.30103127929770734, + "grad_norm": 42.60929780070754, + "learning_rate": 9.999997025032994e-06, + "loss": 3.3553, + "step": 3532 + }, + { + "epoch": 0.3011165089917327, + "grad_norm": 70.99229617354254, + "learning_rate": 9.999996459543466e-06, + "loss": 3.783, + "step": 3533 + }, + { + "epoch": 0.3012017386857581, + "grad_norm": 74.46352125533208, + "learning_rate": 9.999995844880959e-06, + "loss": 2.96, + "step": 3534 + }, + { + "epoch": 0.3012869683797835, + "grad_norm": 76.91596241719287, + "learning_rate": 9.999995181045478e-06, + "loss": 3.8103, + "step": 3535 + }, + { + "epoch": 0.30137219807380894, + "grad_norm": 121.89747717138185, + "learning_rate": 9.999994468037031e-06, + "loss": 4.7924, + "step": 3536 + }, + { + "epoch": 0.3014574277678343, + "grad_norm": 166.20756757902765, + "learning_rate": 9.999993705855628e-06, + "loss": 4.4585, + "step": 3537 + }, + { + "epoch": 0.3015426574618597, + "grad_norm": 53.8883003285962, + "learning_rate": 9.999992894501272e-06, + "loss": 3.7197, + "step": 3538 + }, + { + "epoch": 0.3016278871558851, + "grad_norm": 106.06829763312783, + "learning_rate": 9.999992033973972e-06, + "loss": 3.4377, + "step": 3539 + }, + { + "epoch": 0.30171311684991053, + "grad_norm": 69.51621013936015, + "learning_rate": 9.999991124273738e-06, + "loss": 3.2636, + "step": 3540 + }, + { + "epoch": 0.3017983465439359, + "grad_norm": 56.0967788949807, + "learning_rate": 9.999990165400578e-06, + "loss": 3.8864, + "step": 3541 + }, + { + "epoch": 0.3018835762379613, + "grad_norm": 119.34162245903659, + "learning_rate": 9.999989157354502e-06, + "loss": 4.1031, + "step": 3542 + }, + { + "epoch": 0.3019688059319867, + "grad_norm": 48.72545701004692, + "learning_rate": 9.999988100135518e-06, + "loss": 3.9336, + "step": 3543 + }, + { + "epoch": 0.30205403562601213, + "grad_norm": 68.42688096535247, + "learning_rate": 9.999986993743639e-06, + "loss": 3.0219, + "step": 3544 + }, + { + "epoch": 0.3021392653200375, + "grad_norm": 135.18030329903283, + "learning_rate": 9.999985838178874e-06, + "loss": 4.7042, + "step": 3545 + }, + { + "epoch": 0.3022244950140629, + "grad_norm": 61.998217187474964, + "learning_rate": 9.999984633441235e-06, + "loss": 3.526, + "step": 3546 + }, + { + "epoch": 0.3023097247080883, + "grad_norm": 220.0035328353267, + "learning_rate": 9.999983379530735e-06, + "loss": 5.0702, + "step": 3547 + }, + { + "epoch": 0.30239495440211367, + "grad_norm": 100.89629900640713, + "learning_rate": 9.999982076447385e-06, + "loss": 4.1487, + "step": 3548 + }, + { + "epoch": 0.3024801840961391, + "grad_norm": 97.23849168678763, + "learning_rate": 9.999980724191199e-06, + "loss": 2.9104, + "step": 3549 + }, + { + "epoch": 0.3025654137901645, + "grad_norm": 69.67455753176726, + "learning_rate": 9.999979322762187e-06, + "loss": 2.8944, + "step": 3550 + }, + { + "epoch": 0.3026506434841899, + "grad_norm": 77.69166213957931, + "learning_rate": 9.999977872160367e-06, + "loss": 3.7342, + "step": 3551 + }, + { + "epoch": 0.30273587317821526, + "grad_norm": 409.1268234132012, + "learning_rate": 9.99997637238575e-06, + "loss": 5.0771, + "step": 3552 + }, + { + "epoch": 0.3028211028722407, + "grad_norm": 53.33911937516843, + "learning_rate": 9.999974823438353e-06, + "loss": 4.1142, + "step": 3553 + }, + { + "epoch": 0.3029063325662661, + "grad_norm": 95.35375274451422, + "learning_rate": 9.999973225318191e-06, + "loss": 3.9765, + "step": 3554 + }, + { + "epoch": 0.3029915622602915, + "grad_norm": 141.76329095422273, + "learning_rate": 9.999971578025278e-06, + "loss": 3.3659, + "step": 3555 + }, + { + "epoch": 0.30307679195431686, + "grad_norm": 339.2237390573436, + "learning_rate": 9.999969881559632e-06, + "loss": 3.2142, + "step": 3556 + }, + { + "epoch": 0.3031620216483423, + "grad_norm": 51.620798843346805, + "learning_rate": 9.999968135921268e-06, + "loss": 3.6145, + "step": 3557 + }, + { + "epoch": 0.3032472513423677, + "grad_norm": 262.09305209755684, + "learning_rate": 9.999966341110205e-06, + "loss": 5.1257, + "step": 3558 + }, + { + "epoch": 0.3033324810363931, + "grad_norm": 81.31716286530455, + "learning_rate": 9.99996449712646e-06, + "loss": 3.8216, + "step": 3559 + }, + { + "epoch": 0.30341771073041846, + "grad_norm": 114.42261667480908, + "learning_rate": 9.99996260397005e-06, + "loss": 4.5991, + "step": 3560 + }, + { + "epoch": 0.30350294042444387, + "grad_norm": 233.74958823227738, + "learning_rate": 9.999960661640996e-06, + "loss": 5.3115, + "step": 3561 + }, + { + "epoch": 0.3035881701184693, + "grad_norm": 59.9363642411002, + "learning_rate": 9.999958670139316e-06, + "loss": 3.3628, + "step": 3562 + }, + { + "epoch": 0.3036733998124947, + "grad_norm": 87.36399870033542, + "learning_rate": 9.999956629465027e-06, + "loss": 4.6783, + "step": 3563 + }, + { + "epoch": 0.30375862950652005, + "grad_norm": 115.39640636539902, + "learning_rate": 9.999954539618153e-06, + "loss": 4.3492, + "step": 3564 + }, + { + "epoch": 0.30384385920054546, + "grad_norm": 61.51512160166281, + "learning_rate": 9.999952400598713e-06, + "loss": 3.5795, + "step": 3565 + }, + { + "epoch": 0.3039290888945709, + "grad_norm": 99.25321342056755, + "learning_rate": 9.999950212406728e-06, + "loss": 3.736, + "step": 3566 + }, + { + "epoch": 0.3040143185885963, + "grad_norm": 48.80936490048318, + "learning_rate": 9.999947975042218e-06, + "loss": 3.5582, + "step": 3567 + }, + { + "epoch": 0.30409954828262165, + "grad_norm": 276.89955050050867, + "learning_rate": 9.999945688505209e-06, + "loss": 4.3211, + "step": 3568 + }, + { + "epoch": 0.30418477797664706, + "grad_norm": 99.74885478270869, + "learning_rate": 9.999943352795717e-06, + "loss": 3.9139, + "step": 3569 + }, + { + "epoch": 0.30427000767067247, + "grad_norm": 194.36351654599406, + "learning_rate": 9.999940967913773e-06, + "loss": 4.5363, + "step": 3570 + }, + { + "epoch": 0.3043552373646979, + "grad_norm": 86.75372690136841, + "learning_rate": 9.999938533859394e-06, + "loss": 5.1896, + "step": 3571 + }, + { + "epoch": 0.30444046705872324, + "grad_norm": 77.91629993033003, + "learning_rate": 9.999936050632606e-06, + "loss": 4.3465, + "step": 3572 + }, + { + "epoch": 0.30452569675274865, + "grad_norm": 217.14784026098985, + "learning_rate": 9.999933518233435e-06, + "loss": 4.6247, + "step": 3573 + }, + { + "epoch": 0.30461092644677407, + "grad_norm": 49.25047786493157, + "learning_rate": 9.999930936661904e-06, + "loss": 3.5762, + "step": 3574 + }, + { + "epoch": 0.3046961561407995, + "grad_norm": 90.7130999737786, + "learning_rate": 9.999928305918038e-06, + "loss": 4.3975, + "step": 3575 + }, + { + "epoch": 0.30478138583482484, + "grad_norm": 83.55561281498365, + "learning_rate": 9.999925626001867e-06, + "loss": 3.8535, + "step": 3576 + }, + { + "epoch": 0.30486661552885025, + "grad_norm": 67.50395798747341, + "learning_rate": 9.999922896913412e-06, + "loss": 4.1158, + "step": 3577 + }, + { + "epoch": 0.30495184522287566, + "grad_norm": 186.87904268784482, + "learning_rate": 9.999920118652703e-06, + "loss": 3.9038, + "step": 3578 + }, + { + "epoch": 0.3050370749169011, + "grad_norm": 130.5392902951015, + "learning_rate": 9.999917291219765e-06, + "loss": 4.1569, + "step": 3579 + }, + { + "epoch": 0.30512230461092643, + "grad_norm": 91.32799822942833, + "learning_rate": 9.999914414614629e-06, + "loss": 4.2763, + "step": 3580 + }, + { + "epoch": 0.30520753430495184, + "grad_norm": 120.23428825059055, + "learning_rate": 9.999911488837319e-06, + "loss": 4.2917, + "step": 3581 + }, + { + "epoch": 0.30529276399897726, + "grad_norm": 89.08324778207948, + "learning_rate": 9.999908513887868e-06, + "loss": 4.6165, + "step": 3582 + }, + { + "epoch": 0.30537799369300267, + "grad_norm": 91.08357041407749, + "learning_rate": 9.999905489766304e-06, + "loss": 5.2246, + "step": 3583 + }, + { + "epoch": 0.305463223387028, + "grad_norm": 62.88634017461629, + "learning_rate": 9.999902416472655e-06, + "loss": 3.1919, + "step": 3584 + }, + { + "epoch": 0.30554845308105344, + "grad_norm": 134.84440882694827, + "learning_rate": 9.999899294006953e-06, + "loss": 4.2344, + "step": 3585 + }, + { + "epoch": 0.30563368277507885, + "grad_norm": 52.55189392116836, + "learning_rate": 9.999896122369228e-06, + "loss": 3.5209, + "step": 3586 + }, + { + "epoch": 0.3057189124691042, + "grad_norm": 99.02265705393108, + "learning_rate": 9.999892901559513e-06, + "loss": 4.6102, + "step": 3587 + }, + { + "epoch": 0.3058041421631296, + "grad_norm": 46.1211620963377, + "learning_rate": 9.999889631577837e-06, + "loss": 3.8762, + "step": 3588 + }, + { + "epoch": 0.30588937185715503, + "grad_norm": 80.47300056585026, + "learning_rate": 9.999886312424232e-06, + "loss": 4.3471, + "step": 3589 + }, + { + "epoch": 0.30597460155118045, + "grad_norm": 71.0073372623989, + "learning_rate": 9.999882944098733e-06, + "loss": 3.3189, + "step": 3590 + }, + { + "epoch": 0.3060598312452058, + "grad_norm": 105.25505718671741, + "learning_rate": 9.999879526601372e-06, + "loss": 5.0045, + "step": 3591 + }, + { + "epoch": 0.3061450609392312, + "grad_norm": 60.05828088407608, + "learning_rate": 9.999876059932184e-06, + "loss": 2.8294, + "step": 3592 + }, + { + "epoch": 0.30623029063325663, + "grad_norm": 84.51772702367587, + "learning_rate": 9.9998725440912e-06, + "loss": 3.697, + "step": 3593 + }, + { + "epoch": 0.30631552032728204, + "grad_norm": 52.5949752090585, + "learning_rate": 9.999868979078455e-06, + "loss": 3.5279, + "step": 3594 + }, + { + "epoch": 0.3064007500213074, + "grad_norm": 53.24105927560954, + "learning_rate": 9.999865364893987e-06, + "loss": 2.2893, + "step": 3595 + }, + { + "epoch": 0.3064859797153328, + "grad_norm": 52.29577201551128, + "learning_rate": 9.999861701537829e-06, + "loss": 3.7372, + "step": 3596 + }, + { + "epoch": 0.3065712094093582, + "grad_norm": 49.86773147602773, + "learning_rate": 9.99985798901002e-06, + "loss": 3.0914, + "step": 3597 + }, + { + "epoch": 0.30665643910338364, + "grad_norm": 82.28457081357138, + "learning_rate": 9.999854227310593e-06, + "loss": 4.3409, + "step": 3598 + }, + { + "epoch": 0.306741668797409, + "grad_norm": 85.72899172970156, + "learning_rate": 9.999850416439587e-06, + "loss": 3.9432, + "step": 3599 + }, + { + "epoch": 0.3068268984914344, + "grad_norm": 95.45524172547329, + "learning_rate": 9.999846556397039e-06, + "loss": 4.3559, + "step": 3600 + }, + { + "epoch": 0.3069121281854598, + "grad_norm": 80.96288648768541, + "learning_rate": 9.999842647182984e-06, + "loss": 4.0026, + "step": 3601 + }, + { + "epoch": 0.30699735787948523, + "grad_norm": 59.16407840310599, + "learning_rate": 9.999838688797468e-06, + "loss": 3.2369, + "step": 3602 + }, + { + "epoch": 0.3070825875735106, + "grad_norm": 65.2484499477735, + "learning_rate": 9.99983468124052e-06, + "loss": 3.6837, + "step": 3603 + }, + { + "epoch": 0.307167817267536, + "grad_norm": 87.99387194410227, + "learning_rate": 9.999830624512189e-06, + "loss": 3.7904, + "step": 3604 + }, + { + "epoch": 0.3072530469615614, + "grad_norm": 55.405282967449075, + "learning_rate": 9.99982651861251e-06, + "loss": 4.2658, + "step": 3605 + }, + { + "epoch": 0.30733827665558683, + "grad_norm": 162.677130391132, + "learning_rate": 9.999822363541522e-06, + "loss": 4.6133, + "step": 3606 + }, + { + "epoch": 0.3074235063496122, + "grad_norm": 81.63153568336712, + "learning_rate": 9.999818159299268e-06, + "loss": 3.9797, + "step": 3607 + }, + { + "epoch": 0.3075087360436376, + "grad_norm": 83.09182621928429, + "learning_rate": 9.999813905885792e-06, + "loss": 3.5685, + "step": 3608 + }, + { + "epoch": 0.307593965737663, + "grad_norm": 57.58310249327277, + "learning_rate": 9.99980960330113e-06, + "loss": 3.5861, + "step": 3609 + }, + { + "epoch": 0.3076791954316884, + "grad_norm": 109.18377200550061, + "learning_rate": 9.999805251545327e-06, + "loss": 5.3147, + "step": 3610 + }, + { + "epoch": 0.3077644251257138, + "grad_norm": 150.3251679773545, + "learning_rate": 9.999800850618428e-06, + "loss": 3.6758, + "step": 3611 + }, + { + "epoch": 0.3078496548197392, + "grad_norm": 241.42788471788984, + "learning_rate": 9.999796400520474e-06, + "loss": 4.7998, + "step": 3612 + }, + { + "epoch": 0.3079348845137646, + "grad_norm": 64.37749818906028, + "learning_rate": 9.999791901251508e-06, + "loss": 3.5185, + "step": 3613 + }, + { + "epoch": 0.30802011420779, + "grad_norm": 89.41807491599576, + "learning_rate": 9.999787352811576e-06, + "loss": 4.0703, + "step": 3614 + }, + { + "epoch": 0.3081053439018154, + "grad_norm": 150.3620486743426, + "learning_rate": 9.999782755200723e-06, + "loss": 3.8051, + "step": 3615 + }, + { + "epoch": 0.3081905735958408, + "grad_norm": 84.18399006156787, + "learning_rate": 9.999778108418991e-06, + "loss": 4.2599, + "step": 3616 + }, + { + "epoch": 0.3082758032898662, + "grad_norm": 112.13312589909549, + "learning_rate": 9.99977341246643e-06, + "loss": 4.1139, + "step": 3617 + }, + { + "epoch": 0.3083610329838916, + "grad_norm": 55.58646756181961, + "learning_rate": 9.999768667343084e-06, + "loss": 3.8358, + "step": 3618 + }, + { + "epoch": 0.30844626267791697, + "grad_norm": 103.46369538492783, + "learning_rate": 9.999763873049e-06, + "loss": 4.9792, + "step": 3619 + }, + { + "epoch": 0.3085314923719424, + "grad_norm": 53.88513165318414, + "learning_rate": 9.999759029584225e-06, + "loss": 3.6597, + "step": 3620 + }, + { + "epoch": 0.3086167220659678, + "grad_norm": 91.68194906516554, + "learning_rate": 9.999754136948805e-06, + "loss": 3.4243, + "step": 3621 + }, + { + "epoch": 0.30870195175999315, + "grad_norm": 68.13927006504642, + "learning_rate": 9.999749195142792e-06, + "loss": 4.5353, + "step": 3622 + }, + { + "epoch": 0.30878718145401857, + "grad_norm": 56.586461770932104, + "learning_rate": 9.99974420416623e-06, + "loss": 3.3119, + "step": 3623 + }, + { + "epoch": 0.308872411148044, + "grad_norm": 49.674333838531695, + "learning_rate": 9.999739164019174e-06, + "loss": 3.4724, + "step": 3624 + }, + { + "epoch": 0.3089576408420694, + "grad_norm": 93.22229147737917, + "learning_rate": 9.999734074701667e-06, + "loss": 4.3037, + "step": 3625 + }, + { + "epoch": 0.30904287053609475, + "grad_norm": 93.06154983245867, + "learning_rate": 9.999728936213763e-06, + "loss": 3.8512, + "step": 3626 + }, + { + "epoch": 0.30912810023012016, + "grad_norm": 49.31911730609618, + "learning_rate": 9.999723748555512e-06, + "loss": 4.1229, + "step": 3627 + }, + { + "epoch": 0.3092133299241456, + "grad_norm": 49.184279497355654, + "learning_rate": 9.999718511726964e-06, + "loss": 3.7007, + "step": 3628 + }, + { + "epoch": 0.309298559618171, + "grad_norm": 60.563469308776455, + "learning_rate": 9.99971322572817e-06, + "loss": 3.8968, + "step": 3629 + }, + { + "epoch": 0.30938378931219634, + "grad_norm": 42.30193545019338, + "learning_rate": 9.999707890559185e-06, + "loss": 3.3567, + "step": 3630 + }, + { + "epoch": 0.30946901900622176, + "grad_norm": 96.63062621828462, + "learning_rate": 9.99970250622006e-06, + "loss": 2.9735, + "step": 3631 + }, + { + "epoch": 0.30955424870024717, + "grad_norm": 39.337229010060256, + "learning_rate": 9.999697072710845e-06, + "loss": 1.0926, + "step": 3632 + }, + { + "epoch": 0.3096394783942726, + "grad_norm": 70.64365936835274, + "learning_rate": 9.999691590031598e-06, + "loss": 3.2098, + "step": 3633 + }, + { + "epoch": 0.30972470808829794, + "grad_norm": 48.805928994138135, + "learning_rate": 9.999686058182371e-06, + "loss": 3.5956, + "step": 3634 + }, + { + "epoch": 0.30980993778232335, + "grad_norm": 46.53243826136372, + "learning_rate": 9.999680477163218e-06, + "loss": 2.9525, + "step": 3635 + }, + { + "epoch": 0.30989516747634877, + "grad_norm": 202.0001758512995, + "learning_rate": 9.999674846974193e-06, + "loss": 5.4598, + "step": 3636 + }, + { + "epoch": 0.3099803971703742, + "grad_norm": 63.17562577554065, + "learning_rate": 9.999669167615354e-06, + "loss": 3.6151, + "step": 3637 + }, + { + "epoch": 0.31006562686439954, + "grad_norm": 81.67358742849628, + "learning_rate": 9.999663439086756e-06, + "loss": 3.9987, + "step": 3638 + }, + { + "epoch": 0.31015085655842495, + "grad_norm": 141.65794703848024, + "learning_rate": 9.999657661388452e-06, + "loss": 4.6094, + "step": 3639 + }, + { + "epoch": 0.31023608625245036, + "grad_norm": 136.2212759358175, + "learning_rate": 9.999651834520503e-06, + "loss": 5.4625, + "step": 3640 + }, + { + "epoch": 0.3103213159464758, + "grad_norm": 108.04979601277552, + "learning_rate": 9.999645958482967e-06, + "loss": 4.5628, + "step": 3641 + }, + { + "epoch": 0.31040654564050113, + "grad_norm": 134.24702912136146, + "learning_rate": 9.999640033275897e-06, + "loss": 5.0895, + "step": 3642 + }, + { + "epoch": 0.31049177533452654, + "grad_norm": 68.71550868082785, + "learning_rate": 9.999634058899354e-06, + "loss": 3.4725, + "step": 3643 + }, + { + "epoch": 0.31057700502855196, + "grad_norm": 114.19727275493081, + "learning_rate": 9.999628035353397e-06, + "loss": 4.9385, + "step": 3644 + }, + { + "epoch": 0.31066223472257737, + "grad_norm": 127.4422143563623, + "learning_rate": 9.999621962638087e-06, + "loss": 4.317, + "step": 3645 + }, + { + "epoch": 0.3107474644166027, + "grad_norm": 110.27269789084406, + "learning_rate": 9.999615840753478e-06, + "loss": 4.9125, + "step": 3646 + }, + { + "epoch": 0.31083269411062814, + "grad_norm": 165.18717336601884, + "learning_rate": 9.999609669699637e-06, + "loss": 4.1395, + "step": 3647 + }, + { + "epoch": 0.31091792380465355, + "grad_norm": 95.88983491512515, + "learning_rate": 9.99960344947662e-06, + "loss": 4.0422, + "step": 3648 + }, + { + "epoch": 0.31100315349867896, + "grad_norm": 122.44396253386516, + "learning_rate": 9.99959718008449e-06, + "loss": 4.083, + "step": 3649 + }, + { + "epoch": 0.3110883831927043, + "grad_norm": 123.52380290648385, + "learning_rate": 9.999590861523309e-06, + "loss": 4.0698, + "step": 3650 + }, + { + "epoch": 0.31117361288672973, + "grad_norm": 146.00020318424149, + "learning_rate": 9.999584493793138e-06, + "loss": 3.3454, + "step": 3651 + }, + { + "epoch": 0.31125884258075515, + "grad_norm": 86.01500163099908, + "learning_rate": 9.99957807689404e-06, + "loss": 3.7394, + "step": 3652 + }, + { + "epoch": 0.31134407227478056, + "grad_norm": 152.63377782733704, + "learning_rate": 9.999571610826077e-06, + "loss": 4.6431, + "step": 3653 + }, + { + "epoch": 0.3114293019688059, + "grad_norm": 208.9763651426162, + "learning_rate": 9.999565095589316e-06, + "loss": 5.5437, + "step": 3654 + }, + { + "epoch": 0.31151453166283133, + "grad_norm": 107.99439854975206, + "learning_rate": 9.999558531183818e-06, + "loss": 5.0216, + "step": 3655 + }, + { + "epoch": 0.31159976135685674, + "grad_norm": 44.81722666451343, + "learning_rate": 9.99955191760965e-06, + "loss": 3.3235, + "step": 3656 + }, + { + "epoch": 0.31168499105088215, + "grad_norm": 78.58551150598873, + "learning_rate": 9.999545254866874e-06, + "loss": 3.0172, + "step": 3657 + }, + { + "epoch": 0.3117702207449075, + "grad_norm": 41.85621741414249, + "learning_rate": 9.999538542955556e-06, + "loss": 3.7495, + "step": 3658 + }, + { + "epoch": 0.3118554504389329, + "grad_norm": 173.3703354176067, + "learning_rate": 9.999531781875764e-06, + "loss": 4.3262, + "step": 3659 + }, + { + "epoch": 0.31194068013295834, + "grad_norm": 74.16049609781348, + "learning_rate": 9.999524971627564e-06, + "loss": 4.5047, + "step": 3660 + }, + { + "epoch": 0.3120259098269837, + "grad_norm": 100.95568567825173, + "learning_rate": 9.999518112211022e-06, + "loss": 3.8552, + "step": 3661 + }, + { + "epoch": 0.3121111395210091, + "grad_norm": 42.46805825735001, + "learning_rate": 9.999511203626207e-06, + "loss": 3.063, + "step": 3662 + }, + { + "epoch": 0.3121963692150345, + "grad_norm": 73.85405741213555, + "learning_rate": 9.999504245873186e-06, + "loss": 3.6497, + "step": 3663 + }, + { + "epoch": 0.31228159890905993, + "grad_norm": 50.79571243286435, + "learning_rate": 9.999497238952024e-06, + "loss": 3.445, + "step": 3664 + }, + { + "epoch": 0.3123668286030853, + "grad_norm": 128.35623921882348, + "learning_rate": 9.999490182862796e-06, + "loss": 4.7563, + "step": 3665 + }, + { + "epoch": 0.3124520582971107, + "grad_norm": 54.47103339279794, + "learning_rate": 9.99948307760557e-06, + "loss": 2.3809, + "step": 3666 + }, + { + "epoch": 0.3125372879911361, + "grad_norm": 420.0927650659648, + "learning_rate": 9.999475923180413e-06, + "loss": 5.5688, + "step": 3667 + }, + { + "epoch": 0.3126225176851615, + "grad_norm": 49.72564977671504, + "learning_rate": 9.999468719587396e-06, + "loss": 3.3022, + "step": 3668 + }, + { + "epoch": 0.3127077473791869, + "grad_norm": 93.39198980951814, + "learning_rate": 9.999461466826593e-06, + "loss": 3.8971, + "step": 3669 + }, + { + "epoch": 0.3127929770732123, + "grad_norm": 63.84889715542843, + "learning_rate": 9.99945416489807e-06, + "loss": 3.8363, + "step": 3670 + }, + { + "epoch": 0.3128782067672377, + "grad_norm": 112.3422479172303, + "learning_rate": 9.999446813801904e-06, + "loss": 4.6726, + "step": 3671 + }, + { + "epoch": 0.3129634364612631, + "grad_norm": 56.685723533269744, + "learning_rate": 9.999439413538166e-06, + "loss": 4.1833, + "step": 3672 + }, + { + "epoch": 0.3130486661552885, + "grad_norm": 80.31465328880579, + "learning_rate": 9.999431964106925e-06, + "loss": 3.6394, + "step": 3673 + }, + { + "epoch": 0.3131338958493139, + "grad_norm": 58.72996678116461, + "learning_rate": 9.99942446550826e-06, + "loss": 3.2982, + "step": 3674 + }, + { + "epoch": 0.3132191255433393, + "grad_norm": 762.257115643073, + "learning_rate": 9.99941691774224e-06, + "loss": 4.5351, + "step": 3675 + }, + { + "epoch": 0.3133043552373647, + "grad_norm": 82.15929584981626, + "learning_rate": 9.999409320808942e-06, + "loss": 3.364, + "step": 3676 + }, + { + "epoch": 0.3133895849313901, + "grad_norm": 57.04743128205131, + "learning_rate": 9.99940167470844e-06, + "loss": 3.7137, + "step": 3677 + }, + { + "epoch": 0.3134748146254155, + "grad_norm": 87.2181261377871, + "learning_rate": 9.99939397944081e-06, + "loss": 4.287, + "step": 3678 + }, + { + "epoch": 0.3135600443194409, + "grad_norm": 73.21734261146976, + "learning_rate": 9.999386235006126e-06, + "loss": 3.7786, + "step": 3679 + }, + { + "epoch": 0.3136452740134663, + "grad_norm": 77.41996410298567, + "learning_rate": 9.999378441404464e-06, + "loss": 3.4959, + "step": 3680 + }, + { + "epoch": 0.31373050370749167, + "grad_norm": 79.83582900120318, + "learning_rate": 9.999370598635903e-06, + "loss": 3.9362, + "step": 3681 + }, + { + "epoch": 0.3138157334015171, + "grad_norm": 54.40969094785184, + "learning_rate": 9.999362706700517e-06, + "loss": 3.3449, + "step": 3682 + }, + { + "epoch": 0.3139009630955425, + "grad_norm": 82.62887481754925, + "learning_rate": 9.999354765598385e-06, + "loss": 3.5428, + "step": 3683 + }, + { + "epoch": 0.3139861927895679, + "grad_norm": 52.185468853850416, + "learning_rate": 9.999346775329587e-06, + "loss": 3.595, + "step": 3684 + }, + { + "epoch": 0.31407142248359327, + "grad_norm": 155.045546220529, + "learning_rate": 9.999338735894201e-06, + "loss": 5.1088, + "step": 3685 + }, + { + "epoch": 0.3141566521776187, + "grad_norm": 82.2643290836065, + "learning_rate": 9.999330647292305e-06, + "loss": 2.6108, + "step": 3686 + }, + { + "epoch": 0.3142418818716441, + "grad_norm": 50.692378184607215, + "learning_rate": 9.999322509523977e-06, + "loss": 3.9526, + "step": 3687 + }, + { + "epoch": 0.3143271115656695, + "grad_norm": 93.01864623205906, + "learning_rate": 9.999314322589299e-06, + "loss": 3.9185, + "step": 3688 + }, + { + "epoch": 0.31441234125969486, + "grad_norm": 50.70333172776214, + "learning_rate": 9.999306086488351e-06, + "loss": 4.1683, + "step": 3689 + }, + { + "epoch": 0.3144975709537203, + "grad_norm": 40.63934299925455, + "learning_rate": 9.999297801221215e-06, + "loss": 3.373, + "step": 3690 + }, + { + "epoch": 0.3145828006477457, + "grad_norm": 40.072331727281615, + "learning_rate": 9.999289466787971e-06, + "loss": 3.4829, + "step": 3691 + }, + { + "epoch": 0.3146680303417711, + "grad_norm": 43.66232105372684, + "learning_rate": 9.9992810831887e-06, + "loss": 3.2203, + "step": 3692 + }, + { + "epoch": 0.31475326003579646, + "grad_norm": 94.84885889476331, + "learning_rate": 9.99927265042349e-06, + "loss": 5.6947, + "step": 3693 + }, + { + "epoch": 0.31483848972982187, + "grad_norm": 84.36784131784837, + "learning_rate": 9.999264168492418e-06, + "loss": 4.0423, + "step": 3694 + }, + { + "epoch": 0.3149237194238473, + "grad_norm": 61.80716189694687, + "learning_rate": 9.999255637395569e-06, + "loss": 3.0941, + "step": 3695 + }, + { + "epoch": 0.3150089491178727, + "grad_norm": 58.61307144485102, + "learning_rate": 9.999247057133027e-06, + "loss": 2.9437, + "step": 3696 + }, + { + "epoch": 0.31509417881189805, + "grad_norm": 98.41006647732551, + "learning_rate": 9.999238427704877e-06, + "loss": 4.0788, + "step": 3697 + }, + { + "epoch": 0.31517940850592346, + "grad_norm": 124.79332404734386, + "learning_rate": 9.999229749111204e-06, + "loss": 3.9219, + "step": 3698 + }, + { + "epoch": 0.3152646381999489, + "grad_norm": 66.64831815131396, + "learning_rate": 9.999221021352093e-06, + "loss": 4.2671, + "step": 3699 + }, + { + "epoch": 0.31534986789397423, + "grad_norm": 103.84252437405917, + "learning_rate": 9.99921224442763e-06, + "loss": 4.4472, + "step": 3700 + }, + { + "epoch": 0.31543509758799965, + "grad_norm": 107.97671065758374, + "learning_rate": 9.9992034183379e-06, + "loss": 4.2435, + "step": 3701 + }, + { + "epoch": 0.31552032728202506, + "grad_norm": 102.57482927597252, + "learning_rate": 9.99919454308299e-06, + "loss": 4.0727, + "step": 3702 + }, + { + "epoch": 0.31560555697605047, + "grad_norm": 73.57755779323536, + "learning_rate": 9.999185618662991e-06, + "loss": 4.2244, + "step": 3703 + }, + { + "epoch": 0.31569078667007583, + "grad_norm": 125.47557548724694, + "learning_rate": 9.999176645077985e-06, + "loss": 3.9043, + "step": 3704 + }, + { + "epoch": 0.31577601636410124, + "grad_norm": 108.46395637889637, + "learning_rate": 9.999167622328065e-06, + "loss": 2.9984, + "step": 3705 + }, + { + "epoch": 0.31586124605812665, + "grad_norm": 54.6755626762168, + "learning_rate": 9.99915855041332e-06, + "loss": 3.7007, + "step": 3706 + }, + { + "epoch": 0.31594647575215207, + "grad_norm": 95.9262443839936, + "learning_rate": 9.999149429333834e-06, + "loss": 4.1252, + "step": 3707 + }, + { + "epoch": 0.3160317054461774, + "grad_norm": 89.57192002793158, + "learning_rate": 9.999140259089701e-06, + "loss": 4.878, + "step": 3708 + }, + { + "epoch": 0.31611693514020284, + "grad_norm": 93.93599066336212, + "learning_rate": 9.999131039681009e-06, + "loss": 3.5093, + "step": 3709 + }, + { + "epoch": 0.31620216483422825, + "grad_norm": 147.64864792977946, + "learning_rate": 9.99912177110785e-06, + "loss": 4.6733, + "step": 3710 + }, + { + "epoch": 0.31628739452825366, + "grad_norm": 70.16822020016555, + "learning_rate": 9.999112453370314e-06, + "loss": 3.3351, + "step": 3711 + }, + { + "epoch": 0.316372624222279, + "grad_norm": 127.33984406926801, + "learning_rate": 9.999103086468495e-06, + "loss": 4.8108, + "step": 3712 + }, + { + "epoch": 0.31645785391630443, + "grad_norm": 146.96648721827566, + "learning_rate": 9.999093670402484e-06, + "loss": 3.7806, + "step": 3713 + }, + { + "epoch": 0.31654308361032985, + "grad_norm": 72.41270746757357, + "learning_rate": 9.999084205172371e-06, + "loss": 3.5978, + "step": 3714 + }, + { + "epoch": 0.31662831330435526, + "grad_norm": 98.1755105882808, + "learning_rate": 9.999074690778255e-06, + "loss": 4.6393, + "step": 3715 + }, + { + "epoch": 0.3167135429983806, + "grad_norm": 124.54893053425386, + "learning_rate": 9.999065127220222e-06, + "loss": 4.2098, + "step": 3716 + }, + { + "epoch": 0.316798772692406, + "grad_norm": 62.99730688700755, + "learning_rate": 9.999055514498372e-06, + "loss": 3.8295, + "step": 3717 + }, + { + "epoch": 0.31688400238643144, + "grad_norm": 209.93890464835476, + "learning_rate": 9.999045852612797e-06, + "loss": 5.6478, + "step": 3718 + }, + { + "epoch": 0.31696923208045685, + "grad_norm": 116.45214851083827, + "learning_rate": 9.999036141563592e-06, + "loss": 4.0141, + "step": 3719 + }, + { + "epoch": 0.3170544617744822, + "grad_norm": 36.688348798231594, + "learning_rate": 9.999026381350853e-06, + "loss": 3.1711, + "step": 3720 + }, + { + "epoch": 0.3171396914685076, + "grad_norm": 61.14409349278289, + "learning_rate": 9.999016571974675e-06, + "loss": 3.7519, + "step": 3721 + }, + { + "epoch": 0.31722492116253304, + "grad_norm": 70.46859789977778, + "learning_rate": 9.999006713435157e-06, + "loss": 4.1334, + "step": 3722 + }, + { + "epoch": 0.31731015085655845, + "grad_norm": 101.18782381204957, + "learning_rate": 9.998996805732394e-06, + "loss": 4.6269, + "step": 3723 + }, + { + "epoch": 0.3173953805505838, + "grad_norm": 85.77738480434918, + "learning_rate": 9.998986848866486e-06, + "loss": 4.0202, + "step": 3724 + }, + { + "epoch": 0.3174806102446092, + "grad_norm": 66.54608072119389, + "learning_rate": 9.998976842837525e-06, + "loss": 3.8891, + "step": 3725 + }, + { + "epoch": 0.31756583993863463, + "grad_norm": 54.53586920781954, + "learning_rate": 9.998966787645615e-06, + "loss": 3.7375, + "step": 3726 + }, + { + "epoch": 0.31765106963266004, + "grad_norm": 77.4787728492063, + "learning_rate": 9.998956683290852e-06, + "loss": 4.0831, + "step": 3727 + }, + { + "epoch": 0.3177362993266854, + "grad_norm": 203.72011565492505, + "learning_rate": 9.998946529773337e-06, + "loss": 4.9895, + "step": 3728 + }, + { + "epoch": 0.3178215290207108, + "grad_norm": 88.86146088307493, + "learning_rate": 9.99893632709317e-06, + "loss": 4.4848, + "step": 3729 + }, + { + "epoch": 0.3179067587147362, + "grad_norm": 323.84478352602173, + "learning_rate": 9.99892607525045e-06, + "loss": 3.7366, + "step": 3730 + }, + { + "epoch": 0.31799198840876164, + "grad_norm": 87.27916814346804, + "learning_rate": 9.99891577424528e-06, + "loss": 4.3406, + "step": 3731 + }, + { + "epoch": 0.318077218102787, + "grad_norm": 42.8461582619471, + "learning_rate": 9.998905424077757e-06, + "loss": 3.0715, + "step": 3732 + }, + { + "epoch": 0.3181624477968124, + "grad_norm": 71.99474577639303, + "learning_rate": 9.998895024747988e-06, + "loss": 4.5987, + "step": 3733 + }, + { + "epoch": 0.3182476774908378, + "grad_norm": 256.1349601956998, + "learning_rate": 9.998884576256072e-06, + "loss": 3.6653, + "step": 3734 + }, + { + "epoch": 0.3183329071848632, + "grad_norm": 40.114753585655414, + "learning_rate": 9.99887407860211e-06, + "loss": 2.918, + "step": 3735 + }, + { + "epoch": 0.3184181368788886, + "grad_norm": 50.13096528959433, + "learning_rate": 9.998863531786211e-06, + "loss": 3.9277, + "step": 3736 + }, + { + "epoch": 0.318503366572914, + "grad_norm": 103.88105462745614, + "learning_rate": 9.998852935808474e-06, + "loss": 4.0214, + "step": 3737 + }, + { + "epoch": 0.3185885962669394, + "grad_norm": 63.57040353520559, + "learning_rate": 9.998842290669005e-06, + "loss": 3.713, + "step": 3738 + }, + { + "epoch": 0.3186738259609648, + "grad_norm": 110.89052576631082, + "learning_rate": 9.998831596367908e-06, + "loss": 5.4004, + "step": 3739 + }, + { + "epoch": 0.3187590556549902, + "grad_norm": 80.47287980219515, + "learning_rate": 9.99882085290529e-06, + "loss": 4.8969, + "step": 3740 + }, + { + "epoch": 0.3188442853490156, + "grad_norm": 66.58651063175179, + "learning_rate": 9.998810060281255e-06, + "loss": 4.449, + "step": 3741 + }, + { + "epoch": 0.318929515043041, + "grad_norm": 77.87203353165344, + "learning_rate": 9.998799218495908e-06, + "loss": 4.896, + "step": 3742 + }, + { + "epoch": 0.31901474473706637, + "grad_norm": 48.59643790158586, + "learning_rate": 9.998788327549357e-06, + "loss": 3.1814, + "step": 3743 + }, + { + "epoch": 0.3190999744310918, + "grad_norm": 67.89072489043383, + "learning_rate": 9.998777387441712e-06, + "loss": 4.0404, + "step": 3744 + }, + { + "epoch": 0.3191852041251172, + "grad_norm": 106.68351657491263, + "learning_rate": 9.998766398173074e-06, + "loss": 4.0681, + "step": 3745 + }, + { + "epoch": 0.3192704338191426, + "grad_norm": 59.96254673216443, + "learning_rate": 9.998755359743558e-06, + "loss": 3.5359, + "step": 3746 + }, + { + "epoch": 0.31935566351316796, + "grad_norm": 48.927047433024285, + "learning_rate": 9.998744272153267e-06, + "loss": 3.797, + "step": 3747 + }, + { + "epoch": 0.3194408932071934, + "grad_norm": 56.096435857809105, + "learning_rate": 9.998733135402313e-06, + "loss": 3.4138, + "step": 3748 + }, + { + "epoch": 0.3195261229012188, + "grad_norm": 42.531010836657515, + "learning_rate": 9.998721949490805e-06, + "loss": 3.7071, + "step": 3749 + }, + { + "epoch": 0.3196113525952442, + "grad_norm": 93.26075467917028, + "learning_rate": 9.998710714418854e-06, + "loss": 3.5902, + "step": 3750 + }, + { + "epoch": 0.31969658228926956, + "grad_norm": 113.55222301098856, + "learning_rate": 9.998699430186567e-06, + "loss": 2.9327, + "step": 3751 + }, + { + "epoch": 0.319781811983295, + "grad_norm": 36.627923387511075, + "learning_rate": 9.99868809679406e-06, + "loss": 2.8412, + "step": 3752 + }, + { + "epoch": 0.3198670416773204, + "grad_norm": 118.91688962741198, + "learning_rate": 9.99867671424144e-06, + "loss": 4.1644, + "step": 3753 + }, + { + "epoch": 0.3199522713713458, + "grad_norm": 623.4549420688159, + "learning_rate": 9.998665282528821e-06, + "loss": 4.2377, + "step": 3754 + }, + { + "epoch": 0.32003750106537115, + "grad_norm": 105.66013023548554, + "learning_rate": 9.998653801656316e-06, + "loss": 4.2902, + "step": 3755 + }, + { + "epoch": 0.32012273075939657, + "grad_norm": 112.87731642750711, + "learning_rate": 9.998642271624038e-06, + "loss": 4.4942, + "step": 3756 + }, + { + "epoch": 0.320207960453422, + "grad_norm": 93.15022564858147, + "learning_rate": 9.998630692432097e-06, + "loss": 3.7625, + "step": 3757 + }, + { + "epoch": 0.3202931901474474, + "grad_norm": 82.1766843824513, + "learning_rate": 9.998619064080612e-06, + "loss": 4.2142, + "step": 3758 + }, + { + "epoch": 0.32037841984147275, + "grad_norm": 50.17875145264197, + "learning_rate": 9.998607386569691e-06, + "loss": 3.2619, + "step": 3759 + }, + { + "epoch": 0.32046364953549816, + "grad_norm": 79.47228519842716, + "learning_rate": 9.998595659899457e-06, + "loss": 3.7104, + "step": 3760 + }, + { + "epoch": 0.3205488792295236, + "grad_norm": 45.61572030503549, + "learning_rate": 9.998583884070018e-06, + "loss": 3.1953, + "step": 3761 + }, + { + "epoch": 0.320634108923549, + "grad_norm": 108.19893698384068, + "learning_rate": 9.998572059081495e-06, + "loss": 4.1589, + "step": 3762 + }, + { + "epoch": 0.32071933861757435, + "grad_norm": 50.202239460479184, + "learning_rate": 9.998560184934001e-06, + "loss": 3.9417, + "step": 3763 + }, + { + "epoch": 0.32080456831159976, + "grad_norm": 89.51408617070493, + "learning_rate": 9.998548261627653e-06, + "loss": 3.6837, + "step": 3764 + }, + { + "epoch": 0.32088979800562517, + "grad_norm": 155.45378314423664, + "learning_rate": 9.99853628916257e-06, + "loss": 4.0295, + "step": 3765 + }, + { + "epoch": 0.3209750276996506, + "grad_norm": 57.519501322970626, + "learning_rate": 9.998524267538868e-06, + "loss": 2.9619, + "step": 3766 + }, + { + "epoch": 0.32106025739367594, + "grad_norm": 79.06136028837969, + "learning_rate": 9.998512196756667e-06, + "loss": 2.9692, + "step": 3767 + }, + { + "epoch": 0.32114548708770135, + "grad_norm": 67.213652316681, + "learning_rate": 9.998500076816084e-06, + "loss": 3.3441, + "step": 3768 + }, + { + "epoch": 0.32123071678172677, + "grad_norm": 33.07577648496225, + "learning_rate": 9.99848790771724e-06, + "loss": 2.3039, + "step": 3769 + }, + { + "epoch": 0.3213159464757522, + "grad_norm": 47.69171920765413, + "learning_rate": 9.998475689460253e-06, + "loss": 4.0173, + "step": 3770 + }, + { + "epoch": 0.32140117616977754, + "grad_norm": 89.69759488831876, + "learning_rate": 9.998463422045243e-06, + "loss": 4.439, + "step": 3771 + }, + { + "epoch": 0.32148640586380295, + "grad_norm": 56.64105855994055, + "learning_rate": 9.998451105472333e-06, + "loss": 3.742, + "step": 3772 + }, + { + "epoch": 0.32157163555782836, + "grad_norm": 40.67140611751605, + "learning_rate": 9.99843873974164e-06, + "loss": 3.4131, + "step": 3773 + }, + { + "epoch": 0.3216568652518537, + "grad_norm": 139.62232255144102, + "learning_rate": 9.998426324853292e-06, + "loss": 3.7542, + "step": 3774 + }, + { + "epoch": 0.32174209494587913, + "grad_norm": 63.08160276964918, + "learning_rate": 9.998413860807405e-06, + "loss": 4.0514, + "step": 3775 + }, + { + "epoch": 0.32182732463990454, + "grad_norm": 90.43382355354017, + "learning_rate": 9.998401347604104e-06, + "loss": 4.4721, + "step": 3776 + }, + { + "epoch": 0.32191255433392996, + "grad_norm": 69.98134295031235, + "learning_rate": 9.998388785243511e-06, + "loss": 3.0495, + "step": 3777 + }, + { + "epoch": 0.3219977840279553, + "grad_norm": 91.64778267330608, + "learning_rate": 9.998376173725753e-06, + "loss": 4.0702, + "step": 3778 + }, + { + "epoch": 0.3220830137219807, + "grad_norm": 77.0752737821364, + "learning_rate": 9.998363513050949e-06, + "loss": 4.3709, + "step": 3779 + }, + { + "epoch": 0.32216824341600614, + "grad_norm": 97.9793363198934, + "learning_rate": 9.998350803219227e-06, + "loss": 5.0069, + "step": 3780 + }, + { + "epoch": 0.32225347311003155, + "grad_norm": 106.29857502917349, + "learning_rate": 9.998338044230711e-06, + "loss": 3.8176, + "step": 3781 + }, + { + "epoch": 0.3223387028040569, + "grad_norm": 66.68977401353413, + "learning_rate": 9.998325236085527e-06, + "loss": 3.8026, + "step": 3782 + }, + { + "epoch": 0.3224239324980823, + "grad_norm": 96.67391876524938, + "learning_rate": 9.998312378783798e-06, + "loss": 4.5933, + "step": 3783 + }, + { + "epoch": 0.32250916219210773, + "grad_norm": 1213.6099300064175, + "learning_rate": 9.998299472325656e-06, + "loss": 3.4051, + "step": 3784 + }, + { + "epoch": 0.32259439188613315, + "grad_norm": 44.461085901078924, + "learning_rate": 9.998286516711223e-06, + "loss": 2.9819, + "step": 3785 + }, + { + "epoch": 0.3226796215801585, + "grad_norm": 130.80903473290294, + "learning_rate": 9.998273511940628e-06, + "loss": 4.9607, + "step": 3786 + }, + { + "epoch": 0.3227648512741839, + "grad_norm": 52.74800494996651, + "learning_rate": 9.998260458014e-06, + "loss": 2.3241, + "step": 3787 + }, + { + "epoch": 0.32285008096820933, + "grad_norm": 60.077284751797464, + "learning_rate": 9.998247354931468e-06, + "loss": 3.3745, + "step": 3788 + }, + { + "epoch": 0.32293531066223474, + "grad_norm": 57.707697521150514, + "learning_rate": 9.998234202693156e-06, + "loss": 4.2716, + "step": 3789 + }, + { + "epoch": 0.3230205403562601, + "grad_norm": 86.53214295381547, + "learning_rate": 9.9982210012992e-06, + "loss": 3.9903, + "step": 3790 + }, + { + "epoch": 0.3231057700502855, + "grad_norm": 60.47795872035031, + "learning_rate": 9.998207750749724e-06, + "loss": 3.4809, + "step": 3791 + }, + { + "epoch": 0.3231909997443109, + "grad_norm": 119.73584919607072, + "learning_rate": 9.998194451044862e-06, + "loss": 3.9269, + "step": 3792 + }, + { + "epoch": 0.32327622943833634, + "grad_norm": 55.00775400142601, + "learning_rate": 9.998181102184743e-06, + "loss": 3.4019, + "step": 3793 + }, + { + "epoch": 0.3233614591323617, + "grad_norm": 67.96488829003566, + "learning_rate": 9.9981677041695e-06, + "loss": 4.6158, + "step": 3794 + }, + { + "epoch": 0.3234466888263871, + "grad_norm": 90.91666384439378, + "learning_rate": 9.998154256999264e-06, + "loss": 3.834, + "step": 3795 + }, + { + "epoch": 0.3235319185204125, + "grad_norm": 886.4484323677632, + "learning_rate": 9.998140760674167e-06, + "loss": 4.2373, + "step": 3796 + }, + { + "epoch": 0.32361714821443793, + "grad_norm": 66.3964369286158, + "learning_rate": 9.99812721519434e-06, + "loss": 4.5026, + "step": 3797 + }, + { + "epoch": 0.3237023779084633, + "grad_norm": 148.2098766978595, + "learning_rate": 9.998113620559917e-06, + "loss": 5.155, + "step": 3798 + }, + { + "epoch": 0.3237876076024887, + "grad_norm": 136.4539440495221, + "learning_rate": 9.998099976771036e-06, + "loss": 3.9606, + "step": 3799 + }, + { + "epoch": 0.3238728372965141, + "grad_norm": 92.94190916683485, + "learning_rate": 9.998086283827825e-06, + "loss": 4.1941, + "step": 3800 + }, + { + "epoch": 0.32395806699053953, + "grad_norm": 50.95168543540639, + "learning_rate": 9.998072541730423e-06, + "loss": 3.4671, + "step": 3801 + }, + { + "epoch": 0.3240432966845649, + "grad_norm": 40.51276915499095, + "learning_rate": 9.998058750478964e-06, + "loss": 2.4582, + "step": 3802 + }, + { + "epoch": 0.3241285263785903, + "grad_norm": 58.06784523547354, + "learning_rate": 9.998044910073583e-06, + "loss": 3.7664, + "step": 3803 + }, + { + "epoch": 0.3242137560726157, + "grad_norm": 77.95884569140492, + "learning_rate": 9.998031020514414e-06, + "loss": 4.5556, + "step": 3804 + }, + { + "epoch": 0.3242989857666411, + "grad_norm": 76.85564650407846, + "learning_rate": 9.998017081801598e-06, + "loss": 4.7541, + "step": 3805 + }, + { + "epoch": 0.3243842154606665, + "grad_norm": 34.2961611582182, + "learning_rate": 9.99800309393527e-06, + "loss": 2.5105, + "step": 3806 + }, + { + "epoch": 0.3244694451546919, + "grad_norm": 76.56504763617741, + "learning_rate": 9.997989056915567e-06, + "loss": 2.9773, + "step": 3807 + }, + { + "epoch": 0.3245546748487173, + "grad_norm": 75.74137438997234, + "learning_rate": 9.997974970742628e-06, + "loss": 3.7076, + "step": 3808 + }, + { + "epoch": 0.3246399045427427, + "grad_norm": 70.19751444688583, + "learning_rate": 9.997960835416592e-06, + "loss": 3.7266, + "step": 3809 + }, + { + "epoch": 0.3247251342367681, + "grad_norm": 82.07704269779089, + "learning_rate": 9.997946650937595e-06, + "loss": 4.2819, + "step": 3810 + }, + { + "epoch": 0.3248103639307935, + "grad_norm": 96.55932335598725, + "learning_rate": 9.99793241730578e-06, + "loss": 4.0907, + "step": 3811 + }, + { + "epoch": 0.3248955936248189, + "grad_norm": 67.27791257510025, + "learning_rate": 9.997918134521286e-06, + "loss": 3.0601, + "step": 3812 + }, + { + "epoch": 0.32498082331884426, + "grad_norm": 130.54660463259003, + "learning_rate": 9.997903802584252e-06, + "loss": 4.5972, + "step": 3813 + }, + { + "epoch": 0.32506605301286967, + "grad_norm": 60.045007046252714, + "learning_rate": 9.997889421494822e-06, + "loss": 3.0876, + "step": 3814 + }, + { + "epoch": 0.3251512827068951, + "grad_norm": 159.95750390860061, + "learning_rate": 9.997874991253134e-06, + "loss": 3.2727, + "step": 3815 + }, + { + "epoch": 0.3252365124009205, + "grad_norm": 328.7465513580719, + "learning_rate": 9.997860511859332e-06, + "loss": 4.6822, + "step": 3816 + }, + { + "epoch": 0.32532174209494585, + "grad_norm": 61.79676330860415, + "learning_rate": 9.997845983313556e-06, + "loss": 3.2663, + "step": 3817 + }, + { + "epoch": 0.32540697178897127, + "grad_norm": 162.24451421266414, + "learning_rate": 9.997831405615953e-06, + "loss": 4.3933, + "step": 3818 + }, + { + "epoch": 0.3254922014829967, + "grad_norm": 57.48168413224521, + "learning_rate": 9.997816778766664e-06, + "loss": 3.9802, + "step": 3819 + }, + { + "epoch": 0.3255774311770221, + "grad_norm": 101.0808529651794, + "learning_rate": 9.997802102765832e-06, + "loss": 3.9731, + "step": 3820 + }, + { + "epoch": 0.32566266087104745, + "grad_norm": 59.147078510405194, + "learning_rate": 9.997787377613602e-06, + "loss": 3.829, + "step": 3821 + }, + { + "epoch": 0.32574789056507286, + "grad_norm": 121.08280813982294, + "learning_rate": 9.997772603310119e-06, + "loss": 4.6811, + "step": 3822 + }, + { + "epoch": 0.3258331202590983, + "grad_norm": 66.07386322868656, + "learning_rate": 9.99775777985553e-06, + "loss": 4.0377, + "step": 3823 + }, + { + "epoch": 0.3259183499531237, + "grad_norm": 49.72618848548065, + "learning_rate": 9.997742907249977e-06, + "loss": 2.1503, + "step": 3824 + }, + { + "epoch": 0.32600357964714904, + "grad_norm": 87.26974682695074, + "learning_rate": 9.997727985493611e-06, + "loss": 4.0293, + "step": 3825 + }, + { + "epoch": 0.32608880934117446, + "grad_norm": 98.6220191704986, + "learning_rate": 9.997713014586575e-06, + "loss": 4.6872, + "step": 3826 + }, + { + "epoch": 0.32617403903519987, + "grad_norm": 40.40955725801297, + "learning_rate": 9.997697994529016e-06, + "loss": 3.8889, + "step": 3827 + }, + { + "epoch": 0.3262592687292253, + "grad_norm": 142.96667428092064, + "learning_rate": 9.997682925321084e-06, + "loss": 5.7422, + "step": 3828 + }, + { + "epoch": 0.32634449842325064, + "grad_norm": 47.50747148802879, + "learning_rate": 9.997667806962926e-06, + "loss": 3.537, + "step": 3829 + }, + { + "epoch": 0.32642972811727605, + "grad_norm": 64.03416510924411, + "learning_rate": 9.99765263945469e-06, + "loss": 4.2106, + "step": 3830 + }, + { + "epoch": 0.32651495781130147, + "grad_norm": 82.90818010629918, + "learning_rate": 9.99763742279653e-06, + "loss": 3.8559, + "step": 3831 + }, + { + "epoch": 0.3266001875053269, + "grad_norm": 125.4205176911906, + "learning_rate": 9.99762215698859e-06, + "loss": 5.312, + "step": 3832 + }, + { + "epoch": 0.32668541719935223, + "grad_norm": 35.56797752554696, + "learning_rate": 9.99760684203102e-06, + "loss": 2.6662, + "step": 3833 + }, + { + "epoch": 0.32677064689337765, + "grad_norm": 135.98187120821564, + "learning_rate": 9.997591477923974e-06, + "loss": 4.2184, + "step": 3834 + }, + { + "epoch": 0.32685587658740306, + "grad_norm": 164.15920144317124, + "learning_rate": 9.997576064667603e-06, + "loss": 3.9317, + "step": 3835 + }, + { + "epoch": 0.3269411062814285, + "grad_norm": 48.53661569156881, + "learning_rate": 9.997560602262055e-06, + "loss": 3.9141, + "step": 3836 + }, + { + "epoch": 0.32702633597545383, + "grad_norm": 139.25354415475013, + "learning_rate": 9.997545090707486e-06, + "loss": 4.4781, + "step": 3837 + }, + { + "epoch": 0.32711156566947924, + "grad_norm": 57.26268090302751, + "learning_rate": 9.997529530004044e-06, + "loss": 4.1112, + "step": 3838 + }, + { + "epoch": 0.32719679536350466, + "grad_norm": 53.22385491953588, + "learning_rate": 9.997513920151888e-06, + "loss": 3.5439, + "step": 3839 + }, + { + "epoch": 0.32728202505753007, + "grad_norm": 76.31214256322374, + "learning_rate": 9.997498261151167e-06, + "loss": 4.7562, + "step": 3840 + }, + { + "epoch": 0.3273672547515554, + "grad_norm": 101.61759419550147, + "learning_rate": 9.997482553002037e-06, + "loss": 3.4356, + "step": 3841 + }, + { + "epoch": 0.32745248444558084, + "grad_norm": 41.374270043404884, + "learning_rate": 9.99746679570465e-06, + "loss": 3.175, + "step": 3842 + }, + { + "epoch": 0.32753771413960625, + "grad_norm": 148.12499090495155, + "learning_rate": 9.997450989259165e-06, + "loss": 4.6921, + "step": 3843 + }, + { + "epoch": 0.32762294383363166, + "grad_norm": 75.87845134885632, + "learning_rate": 9.997435133665735e-06, + "loss": 4.0258, + "step": 3844 + }, + { + "epoch": 0.327708173527657, + "grad_norm": 161.84809495890556, + "learning_rate": 9.997419228924517e-06, + "loss": 3.3159, + "step": 3845 + }, + { + "epoch": 0.32779340322168243, + "grad_norm": 78.00942951752131, + "learning_rate": 9.997403275035667e-06, + "loss": 3.8398, + "step": 3846 + }, + { + "epoch": 0.32787863291570785, + "grad_norm": 129.67977807654597, + "learning_rate": 9.99738727199934e-06, + "loss": 5.6108, + "step": 3847 + }, + { + "epoch": 0.32796386260973326, + "grad_norm": 70.17517959081584, + "learning_rate": 9.997371219815695e-06, + "loss": 3.4956, + "step": 3848 + }, + { + "epoch": 0.3280490923037586, + "grad_norm": 71.35171202528457, + "learning_rate": 9.99735511848489e-06, + "loss": 3.4968, + "step": 3849 + }, + { + "epoch": 0.32813432199778403, + "grad_norm": 123.68488633454263, + "learning_rate": 9.997338968007083e-06, + "loss": 4.2397, + "step": 3850 + }, + { + "epoch": 0.32821955169180944, + "grad_norm": 86.75118402538587, + "learning_rate": 9.997322768382433e-06, + "loss": 4.0513, + "step": 3851 + }, + { + "epoch": 0.3283047813858348, + "grad_norm": 75.29702884650993, + "learning_rate": 9.997306519611099e-06, + "loss": 3.8489, + "step": 3852 + }, + { + "epoch": 0.3283900110798602, + "grad_norm": 69.42234287869495, + "learning_rate": 9.997290221693241e-06, + "loss": 4.0152, + "step": 3853 + }, + { + "epoch": 0.3284752407738856, + "grad_norm": 78.46729793499043, + "learning_rate": 9.997273874629021e-06, + "loss": 4.5853, + "step": 3854 + }, + { + "epoch": 0.32856047046791104, + "grad_norm": 88.17212545687872, + "learning_rate": 9.997257478418597e-06, + "loss": 3.3874, + "step": 3855 + }, + { + "epoch": 0.3286457001619364, + "grad_norm": 245.9299097292099, + "learning_rate": 9.997241033062132e-06, + "loss": 5.3298, + "step": 3856 + }, + { + "epoch": 0.3287309298559618, + "grad_norm": 105.02783242467609, + "learning_rate": 9.997224538559786e-06, + "loss": 4.214, + "step": 3857 + }, + { + "epoch": 0.3288161595499872, + "grad_norm": 72.35005931917338, + "learning_rate": 9.997207994911724e-06, + "loss": 3.5645, + "step": 3858 + }, + { + "epoch": 0.32890138924401263, + "grad_norm": 36.616927228476285, + "learning_rate": 9.997191402118106e-06, + "loss": 2.8553, + "step": 3859 + }, + { + "epoch": 0.328986618938038, + "grad_norm": 73.85212496120481, + "learning_rate": 9.997174760179095e-06, + "loss": 2.9888, + "step": 3860 + }, + { + "epoch": 0.3290718486320634, + "grad_norm": 86.45428402534735, + "learning_rate": 9.99715806909486e-06, + "loss": 3.5152, + "step": 3861 + }, + { + "epoch": 0.3291570783260888, + "grad_norm": 64.93117886338287, + "learning_rate": 9.997141328865558e-06, + "loss": 4.2319, + "step": 3862 + }, + { + "epoch": 0.3292423080201142, + "grad_norm": 134.68978913318762, + "learning_rate": 9.997124539491356e-06, + "loss": 3.6476, + "step": 3863 + }, + { + "epoch": 0.3293275377141396, + "grad_norm": 124.46746057558187, + "learning_rate": 9.997107700972421e-06, + "loss": 3.8173, + "step": 3864 + }, + { + "epoch": 0.329412767408165, + "grad_norm": 101.55257000066844, + "learning_rate": 9.997090813308917e-06, + "loss": 4.947, + "step": 3865 + }, + { + "epoch": 0.3294979971021904, + "grad_norm": 87.38176195562195, + "learning_rate": 9.99707387650101e-06, + "loss": 3.109, + "step": 3866 + }, + { + "epoch": 0.3295832267962158, + "grad_norm": 83.09212612203406, + "learning_rate": 9.997056890548868e-06, + "loss": 3.6539, + "step": 3867 + }, + { + "epoch": 0.3296684564902412, + "grad_norm": 63.64127350360597, + "learning_rate": 9.997039855452657e-06, + "loss": 3.8894, + "step": 3868 + }, + { + "epoch": 0.3297536861842666, + "grad_norm": 68.38772212628932, + "learning_rate": 9.997022771212546e-06, + "loss": 2.5527, + "step": 3869 + }, + { + "epoch": 0.329838915878292, + "grad_norm": 71.07296101981983, + "learning_rate": 9.9970056378287e-06, + "loss": 3.8874, + "step": 3870 + }, + { + "epoch": 0.3299241455723174, + "grad_norm": 107.38418464154523, + "learning_rate": 9.99698845530129e-06, + "loss": 5.1289, + "step": 3871 + }, + { + "epoch": 0.3300093752663428, + "grad_norm": 75.84543124148279, + "learning_rate": 9.996971223630483e-06, + "loss": 4.3075, + "step": 3872 + }, + { + "epoch": 0.3300946049603682, + "grad_norm": 67.78459426182333, + "learning_rate": 9.99695394281645e-06, + "loss": 3.7816, + "step": 3873 + }, + { + "epoch": 0.3301798346543936, + "grad_norm": 104.74653800247772, + "learning_rate": 9.996936612859363e-06, + "loss": 5.716, + "step": 3874 + }, + { + "epoch": 0.330265064348419, + "grad_norm": 55.646160033204545, + "learning_rate": 9.996919233759387e-06, + "loss": 3.2872, + "step": 3875 + }, + { + "epoch": 0.33035029404244437, + "grad_norm": 201.35686005193213, + "learning_rate": 9.996901805516696e-06, + "loss": 5.466, + "step": 3876 + }, + { + "epoch": 0.3304355237364698, + "grad_norm": 45.883019289148564, + "learning_rate": 9.996884328131462e-06, + "loss": 3.0947, + "step": 3877 + }, + { + "epoch": 0.3305207534304952, + "grad_norm": 80.50879714797851, + "learning_rate": 9.996866801603858e-06, + "loss": 3.5967, + "step": 3878 + }, + { + "epoch": 0.3306059831245206, + "grad_norm": 97.63452152479417, + "learning_rate": 9.996849225934051e-06, + "loss": 4.7913, + "step": 3879 + }, + { + "epoch": 0.33069121281854597, + "grad_norm": 859.6729980479332, + "learning_rate": 9.99683160112222e-06, + "loss": 4.0923, + "step": 3880 + }, + { + "epoch": 0.3307764425125714, + "grad_norm": 34.89301848893954, + "learning_rate": 9.996813927168535e-06, + "loss": 3.3972, + "step": 3881 + }, + { + "epoch": 0.3308616722065968, + "grad_norm": 98.73525497505346, + "learning_rate": 9.996796204073171e-06, + "loss": 4.3225, + "step": 3882 + }, + { + "epoch": 0.3309469019006222, + "grad_norm": 46.387881005525045, + "learning_rate": 9.996778431836301e-06, + "loss": 3.4588, + "step": 3883 + }, + { + "epoch": 0.33103213159464756, + "grad_norm": 184.6547434240277, + "learning_rate": 9.9967606104581e-06, + "loss": 5.4254, + "step": 3884 + }, + { + "epoch": 0.331117361288673, + "grad_norm": 89.79768566250364, + "learning_rate": 9.996742739938747e-06, + "loss": 4.3643, + "step": 3885 + }, + { + "epoch": 0.3312025909826984, + "grad_norm": 57.703395316453225, + "learning_rate": 9.996724820278411e-06, + "loss": 3.3679, + "step": 3886 + }, + { + "epoch": 0.33128782067672374, + "grad_norm": 80.65569392833993, + "learning_rate": 9.996706851477273e-06, + "loss": 4.6809, + "step": 3887 + }, + { + "epoch": 0.33137305037074916, + "grad_norm": 132.0937652076342, + "learning_rate": 9.99668883353551e-06, + "loss": 4.7785, + "step": 3888 + }, + { + "epoch": 0.33145828006477457, + "grad_norm": 69.54775561243522, + "learning_rate": 9.996670766453299e-06, + "loss": 4.0991, + "step": 3889 + }, + { + "epoch": 0.3315435097588, + "grad_norm": 52.041302235512624, + "learning_rate": 9.996652650230814e-06, + "loss": 3.695, + "step": 3890 + }, + { + "epoch": 0.33162873945282534, + "grad_norm": 76.38423286658987, + "learning_rate": 9.996634484868235e-06, + "loss": 4.5028, + "step": 3891 + }, + { + "epoch": 0.33171396914685075, + "grad_norm": 63.73802263715664, + "learning_rate": 9.996616270365741e-06, + "loss": 3.7383, + "step": 3892 + }, + { + "epoch": 0.33179919884087616, + "grad_norm": 75.61865584752405, + "learning_rate": 9.996598006723515e-06, + "loss": 4.3577, + "step": 3893 + }, + { + "epoch": 0.3318844285349016, + "grad_norm": 43.82493410950447, + "learning_rate": 9.99657969394173e-06, + "loss": 3.1111, + "step": 3894 + }, + { + "epoch": 0.33196965822892693, + "grad_norm": 50.18890521546778, + "learning_rate": 9.996561332020571e-06, + "loss": 2.802, + "step": 3895 + }, + { + "epoch": 0.33205488792295235, + "grad_norm": 99.64903715225805, + "learning_rate": 9.996542920960216e-06, + "loss": 5.0091, + "step": 3896 + }, + { + "epoch": 0.33214011761697776, + "grad_norm": 123.47054539989664, + "learning_rate": 9.996524460760847e-06, + "loss": 4.2394, + "step": 3897 + }, + { + "epoch": 0.33222534731100317, + "grad_norm": 58.387818481392245, + "learning_rate": 9.996505951422643e-06, + "loss": 4.171, + "step": 3898 + }, + { + "epoch": 0.33231057700502853, + "grad_norm": 72.78262065573759, + "learning_rate": 9.996487392945792e-06, + "loss": 3.1693, + "step": 3899 + }, + { + "epoch": 0.33239580669905394, + "grad_norm": 46.71031443376812, + "learning_rate": 9.99646878533047e-06, + "loss": 3.4453, + "step": 3900 + }, + { + "epoch": 0.33248103639307935, + "grad_norm": 209.0773047007764, + "learning_rate": 9.996450128576865e-06, + "loss": 4.941, + "step": 3901 + }, + { + "epoch": 0.33256626608710477, + "grad_norm": 107.54553221720201, + "learning_rate": 9.996431422685158e-06, + "loss": 4.9033, + "step": 3902 + }, + { + "epoch": 0.3326514957811301, + "grad_norm": 216.50569527449022, + "learning_rate": 9.996412667655532e-06, + "loss": 5.1082, + "step": 3903 + }, + { + "epoch": 0.33273672547515554, + "grad_norm": 48.253840636971105, + "learning_rate": 9.996393863488174e-06, + "loss": 3.4571, + "step": 3904 + }, + { + "epoch": 0.33282195516918095, + "grad_norm": 101.66556452289726, + "learning_rate": 9.996375010183269e-06, + "loss": 4.1475, + "step": 3905 + }, + { + "epoch": 0.33290718486320636, + "grad_norm": 102.09307593999102, + "learning_rate": 9.996356107741e-06, + "loss": 4.3771, + "step": 3906 + }, + { + "epoch": 0.3329924145572317, + "grad_norm": 110.67011531156517, + "learning_rate": 9.996337156161551e-06, + "loss": 4.5234, + "step": 3907 + }, + { + "epoch": 0.33307764425125713, + "grad_norm": 49.46687004267498, + "learning_rate": 9.996318155445115e-06, + "loss": 3.9766, + "step": 3908 + }, + { + "epoch": 0.33316287394528254, + "grad_norm": 71.36534046199918, + "learning_rate": 9.996299105591874e-06, + "loss": 4.3793, + "step": 3909 + }, + { + "epoch": 0.33324810363930796, + "grad_norm": 157.75484763679708, + "learning_rate": 9.996280006602017e-06, + "loss": 4.451, + "step": 3910 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 136.32711367321687, + "learning_rate": 9.996260858475732e-06, + "loss": 4.1275, + "step": 3911 + }, + { + "epoch": 0.3334185630273587, + "grad_norm": 61.8313760198257, + "learning_rate": 9.996241661213207e-06, + "loss": 3.6699, + "step": 3912 + }, + { + "epoch": 0.33350379272138414, + "grad_norm": 70.38870268556633, + "learning_rate": 9.99622241481463e-06, + "loss": 3.7621, + "step": 3913 + }, + { + "epoch": 0.33358902241540955, + "grad_norm": 84.61036703314207, + "learning_rate": 9.99620311928019e-06, + "loss": 3.4549, + "step": 3914 + }, + { + "epoch": 0.3336742521094349, + "grad_norm": 76.36190910096823, + "learning_rate": 9.996183774610078e-06, + "loss": 3.7851, + "step": 3915 + }, + { + "epoch": 0.3337594818034603, + "grad_norm": 63.23251737223489, + "learning_rate": 9.996164380804482e-06, + "loss": 3.4202, + "step": 3916 + }, + { + "epoch": 0.33384471149748574, + "grad_norm": 123.4978588494963, + "learning_rate": 9.996144937863598e-06, + "loss": 4.746, + "step": 3917 + }, + { + "epoch": 0.33392994119151115, + "grad_norm": 78.51308454252683, + "learning_rate": 9.996125445787612e-06, + "loss": 4.4258, + "step": 3918 + }, + { + "epoch": 0.3340151708855365, + "grad_norm": 70.16664807667267, + "learning_rate": 9.996105904576718e-06, + "loss": 3.9956, + "step": 3919 + }, + { + "epoch": 0.3341004005795619, + "grad_norm": 51.97252389650759, + "learning_rate": 9.996086314231108e-06, + "loss": 2.754, + "step": 3920 + }, + { + "epoch": 0.33418563027358733, + "grad_norm": 62.743739043494344, + "learning_rate": 9.996066674750975e-06, + "loss": 4.0702, + "step": 3921 + }, + { + "epoch": 0.33427085996761274, + "grad_norm": 68.4694594020035, + "learning_rate": 9.99604698613651e-06, + "loss": 4.3924, + "step": 3922 + }, + { + "epoch": 0.3343560896616381, + "grad_norm": 198.97495282580653, + "learning_rate": 9.99602724838791e-06, + "loss": 3.9105, + "step": 3923 + }, + { + "epoch": 0.3344413193556635, + "grad_norm": 60.76377411302366, + "learning_rate": 9.996007461505366e-06, + "loss": 2.3116, + "step": 3924 + }, + { + "epoch": 0.3345265490496889, + "grad_norm": 73.70260578023478, + "learning_rate": 9.995987625489073e-06, + "loss": 4.1374, + "step": 3925 + }, + { + "epoch": 0.3346117787437143, + "grad_norm": 131.44526697222287, + "learning_rate": 9.995967740339227e-06, + "loss": 3.9912, + "step": 3926 + }, + { + "epoch": 0.3346970084377397, + "grad_norm": 70.49449478822484, + "learning_rate": 9.995947806056025e-06, + "loss": 4.5669, + "step": 3927 + }, + { + "epoch": 0.3347822381317651, + "grad_norm": 55.60366881975305, + "learning_rate": 9.99592782263966e-06, + "loss": 4.7961, + "step": 3928 + }, + { + "epoch": 0.3348674678257905, + "grad_norm": 97.49334593229521, + "learning_rate": 9.995907790090332e-06, + "loss": 5.1016, + "step": 3929 + }, + { + "epoch": 0.3349526975198159, + "grad_norm": 106.43906924691582, + "learning_rate": 9.995887708408234e-06, + "loss": 4.1366, + "step": 3930 + }, + { + "epoch": 0.3350379272138413, + "grad_norm": 113.02642764364158, + "learning_rate": 9.995867577593566e-06, + "loss": 4.5986, + "step": 3931 + }, + { + "epoch": 0.3351231569078667, + "grad_norm": 116.04931111114864, + "learning_rate": 9.995847397646526e-06, + "loss": 4.2265, + "step": 3932 + }, + { + "epoch": 0.3352083866018921, + "grad_norm": 72.83845713125277, + "learning_rate": 9.995827168567311e-06, + "loss": 4.5286, + "step": 3933 + }, + { + "epoch": 0.3352936162959175, + "grad_norm": 73.00853542053939, + "learning_rate": 9.995806890356123e-06, + "loss": 2.813, + "step": 3934 + }, + { + "epoch": 0.3353788459899429, + "grad_norm": 95.77629246629141, + "learning_rate": 9.995786563013158e-06, + "loss": 4.0344, + "step": 3935 + }, + { + "epoch": 0.3354640756839683, + "grad_norm": 121.33763009195742, + "learning_rate": 9.995766186538617e-06, + "loss": 4.0329, + "step": 3936 + }, + { + "epoch": 0.3355493053779937, + "grad_norm": 74.85825253019951, + "learning_rate": 9.9957457609327e-06, + "loss": 3.4193, + "step": 3937 + }, + { + "epoch": 0.33563453507201907, + "grad_norm": 48.72389153769097, + "learning_rate": 9.99572528619561e-06, + "loss": 3.51, + "step": 3938 + }, + { + "epoch": 0.3357197647660445, + "grad_norm": 56.170528165223864, + "learning_rate": 9.995704762327548e-06, + "loss": 3.9016, + "step": 3939 + }, + { + "epoch": 0.3358049944600699, + "grad_norm": 61.40229695621098, + "learning_rate": 9.995684189328712e-06, + "loss": 4.0573, + "step": 3940 + }, + { + "epoch": 0.3358902241540953, + "grad_norm": 55.305725189194824, + "learning_rate": 9.99566356719931e-06, + "loss": 3.8478, + "step": 3941 + }, + { + "epoch": 0.33597545384812066, + "grad_norm": 49.75295101518396, + "learning_rate": 9.99564289593954e-06, + "loss": 3.7617, + "step": 3942 + }, + { + "epoch": 0.3360606835421461, + "grad_norm": 89.4035510149111, + "learning_rate": 9.995622175549606e-06, + "loss": 4.1571, + "step": 3943 + }, + { + "epoch": 0.3361459132361715, + "grad_norm": 91.62671863328124, + "learning_rate": 9.995601406029716e-06, + "loss": 4.7435, + "step": 3944 + }, + { + "epoch": 0.3362311429301969, + "grad_norm": 79.18516955012828, + "learning_rate": 9.99558058738007e-06, + "loss": 3.8796, + "step": 3945 + }, + { + "epoch": 0.33631637262422226, + "grad_norm": 70.00365349439411, + "learning_rate": 9.995559719600873e-06, + "loss": 4.3069, + "step": 3946 + }, + { + "epoch": 0.33640160231824767, + "grad_norm": 61.9492821004363, + "learning_rate": 9.995538802692333e-06, + "loss": 2.8508, + "step": 3947 + }, + { + "epoch": 0.3364868320122731, + "grad_norm": 68.11809719505095, + "learning_rate": 9.995517836654652e-06, + "loss": 3.8907, + "step": 3948 + }, + { + "epoch": 0.3365720617062985, + "grad_norm": 135.66384422230018, + "learning_rate": 9.995496821488038e-06, + "loss": 4.6799, + "step": 3949 + }, + { + "epoch": 0.33665729140032385, + "grad_norm": 163.08715201950304, + "learning_rate": 9.9954757571927e-06, + "loss": 4.9185, + "step": 3950 + }, + { + "epoch": 0.33674252109434927, + "grad_norm": 30.92854883310766, + "learning_rate": 9.99545464376884e-06, + "loss": 2.798, + "step": 3951 + }, + { + "epoch": 0.3368277507883747, + "grad_norm": 48.44899492065393, + "learning_rate": 9.995433481216673e-06, + "loss": 3.6161, + "step": 3952 + }, + { + "epoch": 0.3369129804824001, + "grad_norm": 58.005944154500284, + "learning_rate": 9.995412269536398e-06, + "loss": 3.6891, + "step": 3953 + }, + { + "epoch": 0.33699821017642545, + "grad_norm": 142.30826832409358, + "learning_rate": 9.99539100872823e-06, + "loss": 4.5465, + "step": 3954 + }, + { + "epoch": 0.33708343987045086, + "grad_norm": 77.60185637351896, + "learning_rate": 9.995369698792377e-06, + "loss": 4.3426, + "step": 3955 + }, + { + "epoch": 0.3371686695644763, + "grad_norm": 101.38034716322628, + "learning_rate": 9.995348339729047e-06, + "loss": 4.1969, + "step": 3956 + }, + { + "epoch": 0.3372538992585017, + "grad_norm": 45.37137055227733, + "learning_rate": 9.995326931538453e-06, + "loss": 3.3393, + "step": 3957 + }, + { + "epoch": 0.33733912895252705, + "grad_norm": 84.27359848070815, + "learning_rate": 9.995305474220802e-06, + "loss": 3.2957, + "step": 3958 + }, + { + "epoch": 0.33742435864655246, + "grad_norm": 103.7142264866234, + "learning_rate": 9.995283967776305e-06, + "loss": 3.6421, + "step": 3959 + }, + { + "epoch": 0.33750958834057787, + "grad_norm": 83.14367608424267, + "learning_rate": 9.995262412205177e-06, + "loss": 4.4943, + "step": 3960 + }, + { + "epoch": 0.3375948180346033, + "grad_norm": 43.110206989192214, + "learning_rate": 9.995240807507628e-06, + "loss": 3.2237, + "step": 3961 + }, + { + "epoch": 0.33768004772862864, + "grad_norm": 196.43470220144746, + "learning_rate": 9.995219153683873e-06, + "loss": 4.606, + "step": 3962 + }, + { + "epoch": 0.33776527742265405, + "grad_norm": 80.72067079616453, + "learning_rate": 9.99519745073412e-06, + "loss": 3.8616, + "step": 3963 + }, + { + "epoch": 0.33785050711667947, + "grad_norm": 118.88210606127562, + "learning_rate": 9.995175698658583e-06, + "loss": 4.5305, + "step": 3964 + }, + { + "epoch": 0.3379357368107048, + "grad_norm": 62.248066220798826, + "learning_rate": 9.995153897457481e-06, + "loss": 3.6309, + "step": 3965 + }, + { + "epoch": 0.33802096650473024, + "grad_norm": 48.39701871244946, + "learning_rate": 9.995132047131023e-06, + "loss": 3.6961, + "step": 3966 + }, + { + "epoch": 0.33810619619875565, + "grad_norm": 91.92606461308473, + "learning_rate": 9.99511014767943e-06, + "loss": 4.5351, + "step": 3967 + }, + { + "epoch": 0.33819142589278106, + "grad_norm": 126.60951514343859, + "learning_rate": 9.99508819910291e-06, + "loss": 5.0329, + "step": 3968 + }, + { + "epoch": 0.3382766555868064, + "grad_norm": 95.63264967442942, + "learning_rate": 9.995066201401684e-06, + "loss": 3.841, + "step": 3969 + }, + { + "epoch": 0.33836188528083183, + "grad_norm": 55.180217021541814, + "learning_rate": 9.995044154575966e-06, + "loss": 3.4191, + "step": 3970 + }, + { + "epoch": 0.33844711497485724, + "grad_norm": 52.41351276738725, + "learning_rate": 9.995022058625974e-06, + "loss": 3.6657, + "step": 3971 + }, + { + "epoch": 0.33853234466888266, + "grad_norm": 52.16875408317478, + "learning_rate": 9.994999913551924e-06, + "loss": 4.401, + "step": 3972 + }, + { + "epoch": 0.338617574362908, + "grad_norm": 95.02747131819379, + "learning_rate": 9.994977719354035e-06, + "loss": 3.7666, + "step": 3973 + }, + { + "epoch": 0.3387028040569334, + "grad_norm": 42.998663162616765, + "learning_rate": 9.994955476032527e-06, + "loss": 2.6674, + "step": 3974 + }, + { + "epoch": 0.33878803375095884, + "grad_norm": 55.674296850838154, + "learning_rate": 9.994933183587613e-06, + "loss": 3.9957, + "step": 3975 + }, + { + "epoch": 0.33887326344498425, + "grad_norm": 119.5474744381577, + "learning_rate": 9.994910842019519e-06, + "loss": 3.7793, + "step": 3976 + }, + { + "epoch": 0.3389584931390096, + "grad_norm": 145.7339218783816, + "learning_rate": 9.99488845132846e-06, + "loss": 4.9319, + "step": 3977 + }, + { + "epoch": 0.339043722833035, + "grad_norm": 75.85976526171923, + "learning_rate": 9.99486601151466e-06, + "loss": 4.3279, + "step": 3978 + }, + { + "epoch": 0.33912895252706043, + "grad_norm": 54.88930044796843, + "learning_rate": 9.994843522578334e-06, + "loss": 2.4612, + "step": 3979 + }, + { + "epoch": 0.33921418222108585, + "grad_norm": 83.34906340291853, + "learning_rate": 9.99482098451971e-06, + "loss": 3.9842, + "step": 3980 + }, + { + "epoch": 0.3392994119151112, + "grad_norm": 59.32500124831451, + "learning_rate": 9.994798397339004e-06, + "loss": 3.4256, + "step": 3981 + }, + { + "epoch": 0.3393846416091366, + "grad_norm": 73.7139158196312, + "learning_rate": 9.994775761036441e-06, + "loss": 3.2954, + "step": 3982 + }, + { + "epoch": 0.33946987130316203, + "grad_norm": 66.97657969497783, + "learning_rate": 9.994753075612245e-06, + "loss": 4.6931, + "step": 3983 + }, + { + "epoch": 0.33955510099718744, + "grad_norm": 45.02987305091405, + "learning_rate": 9.994730341066635e-06, + "loss": 4.3603, + "step": 3984 + }, + { + "epoch": 0.3396403306912128, + "grad_norm": 67.10088221453712, + "learning_rate": 9.994707557399838e-06, + "loss": 3.2237, + "step": 3985 + }, + { + "epoch": 0.3397255603852382, + "grad_norm": 69.00194282027864, + "learning_rate": 9.994684724612076e-06, + "loss": 3.0842, + "step": 3986 + }, + { + "epoch": 0.3398107900792636, + "grad_norm": 57.12342784187237, + "learning_rate": 9.994661842703575e-06, + "loss": 3.6129, + "step": 3987 + }, + { + "epoch": 0.33989601977328904, + "grad_norm": 56.11892786414207, + "learning_rate": 9.99463891167456e-06, + "loss": 3.7617, + "step": 3988 + }, + { + "epoch": 0.3399812494673144, + "grad_norm": 52.15562609418636, + "learning_rate": 9.994615931525255e-06, + "loss": 3.7727, + "step": 3989 + }, + { + "epoch": 0.3400664791613398, + "grad_norm": 62.43962023289409, + "learning_rate": 9.994592902255888e-06, + "loss": 3.2652, + "step": 3990 + }, + { + "epoch": 0.3401517088553652, + "grad_norm": 53.76121685040481, + "learning_rate": 9.994569823866684e-06, + "loss": 3.2187, + "step": 3991 + }, + { + "epoch": 0.34023693854939063, + "grad_norm": 57.87513317938855, + "learning_rate": 9.99454669635787e-06, + "loss": 2.5562, + "step": 3992 + }, + { + "epoch": 0.340322168243416, + "grad_norm": 39.10864801346343, + "learning_rate": 9.994523519729674e-06, + "loss": 2.183, + "step": 3993 + }, + { + "epoch": 0.3404073979374414, + "grad_norm": 61.93864286940185, + "learning_rate": 9.994500293982323e-06, + "loss": 3.8912, + "step": 3994 + }, + { + "epoch": 0.3404926276314668, + "grad_norm": 60.44362421902777, + "learning_rate": 9.994477019116049e-06, + "loss": 3.0741, + "step": 3995 + }, + { + "epoch": 0.34057785732549223, + "grad_norm": 228.1605001510804, + "learning_rate": 9.994453695131075e-06, + "loss": 4.0943, + "step": 3996 + }, + { + "epoch": 0.3406630870195176, + "grad_norm": 427.27141135137856, + "learning_rate": 9.994430322027635e-06, + "loss": 5.9451, + "step": 3997 + }, + { + "epoch": 0.340748316713543, + "grad_norm": 121.2177065989637, + "learning_rate": 9.994406899805958e-06, + "loss": 4.6488, + "step": 3998 + }, + { + "epoch": 0.3408335464075684, + "grad_norm": 60.0544774067534, + "learning_rate": 9.994383428466272e-06, + "loss": 3.6253, + "step": 3999 + }, + { + "epoch": 0.34091877610159377, + "grad_norm": 88.21388171117026, + "learning_rate": 9.994359908008811e-06, + "loss": 4.6465, + "step": 4000 + }, + { + "epoch": 0.3410040057956192, + "grad_norm": 112.32491812203078, + "learning_rate": 9.994336338433805e-06, + "loss": 3.5614, + "step": 4001 + }, + { + "epoch": 0.3410892354896446, + "grad_norm": 58.49629228688089, + "learning_rate": 9.994312719741483e-06, + "loss": 3.5368, + "step": 4002 + }, + { + "epoch": 0.34117446518367, + "grad_norm": 109.39727941952752, + "learning_rate": 9.994289051932083e-06, + "loss": 5.4184, + "step": 4003 + }, + { + "epoch": 0.34125969487769536, + "grad_norm": 92.76650368253384, + "learning_rate": 9.994265335005835e-06, + "loss": 5.1249, + "step": 4004 + }, + { + "epoch": 0.3413449245717208, + "grad_norm": 85.28703256634091, + "learning_rate": 9.99424156896297e-06, + "loss": 3.0326, + "step": 4005 + }, + { + "epoch": 0.3414301542657462, + "grad_norm": 49.4478863175298, + "learning_rate": 9.994217753803724e-06, + "loss": 2.9783, + "step": 4006 + }, + { + "epoch": 0.3415153839597716, + "grad_norm": 51.85124020295949, + "learning_rate": 9.994193889528332e-06, + "loss": 4.0393, + "step": 4007 + }, + { + "epoch": 0.34160061365379696, + "grad_norm": 106.21567711384337, + "learning_rate": 9.994169976137027e-06, + "loss": 4.1895, + "step": 4008 + }, + { + "epoch": 0.34168584334782237, + "grad_norm": 94.64064760636123, + "learning_rate": 9.994146013630043e-06, + "loss": 4.2865, + "step": 4009 + }, + { + "epoch": 0.3417710730418478, + "grad_norm": 71.71688480419786, + "learning_rate": 9.99412200200762e-06, + "loss": 4.2645, + "step": 4010 + }, + { + "epoch": 0.3418563027358732, + "grad_norm": 79.05804184074648, + "learning_rate": 9.99409794126999e-06, + "loss": 3.7686, + "step": 4011 + }, + { + "epoch": 0.34194153242989855, + "grad_norm": 48.81702143499277, + "learning_rate": 9.994073831417393e-06, + "loss": 3.7493, + "step": 4012 + }, + { + "epoch": 0.34202676212392397, + "grad_norm": 125.2078780311322, + "learning_rate": 9.994049672450062e-06, + "loss": 4.4288, + "step": 4013 + }, + { + "epoch": 0.3421119918179494, + "grad_norm": 54.07042963084547, + "learning_rate": 9.994025464368237e-06, + "loss": 3.8635, + "step": 4014 + }, + { + "epoch": 0.3421972215119748, + "grad_norm": 52.40332685109047, + "learning_rate": 9.994001207172159e-06, + "loss": 3.9502, + "step": 4015 + }, + { + "epoch": 0.34228245120600015, + "grad_norm": 69.14699904844127, + "learning_rate": 9.993976900862061e-06, + "loss": 3.5427, + "step": 4016 + }, + { + "epoch": 0.34236768090002556, + "grad_norm": 68.8227165488685, + "learning_rate": 9.993952545438186e-06, + "loss": 3.7806, + "step": 4017 + }, + { + "epoch": 0.342452910594051, + "grad_norm": 72.13754286383559, + "learning_rate": 9.99392814090077e-06, + "loss": 3.1729, + "step": 4018 + }, + { + "epoch": 0.3425381402880764, + "grad_norm": 70.83096637847967, + "learning_rate": 9.993903687250057e-06, + "loss": 5.2097, + "step": 4019 + }, + { + "epoch": 0.34262336998210174, + "grad_norm": 166.30081212828577, + "learning_rate": 9.993879184486285e-06, + "loss": 4.4276, + "step": 4020 + }, + { + "epoch": 0.34270859967612716, + "grad_norm": 100.73243196218637, + "learning_rate": 9.993854632609696e-06, + "loss": 4.0894, + "step": 4021 + }, + { + "epoch": 0.34279382937015257, + "grad_norm": 68.17885571691092, + "learning_rate": 9.99383003162053e-06, + "loss": 3.5332, + "step": 4022 + }, + { + "epoch": 0.342879059064178, + "grad_norm": 51.614296496323675, + "learning_rate": 9.993805381519031e-06, + "loss": 3.8195, + "step": 4023 + }, + { + "epoch": 0.34296428875820334, + "grad_norm": 46.09378837600473, + "learning_rate": 9.99378068230544e-06, + "loss": 3.4797, + "step": 4024 + }, + { + "epoch": 0.34304951845222875, + "grad_norm": 57.21883990075386, + "learning_rate": 9.993755933980001e-06, + "loss": 3.6062, + "step": 4025 + }, + { + "epoch": 0.34313474814625416, + "grad_norm": 65.82306927919922, + "learning_rate": 9.993731136542955e-06, + "loss": 4.1088, + "step": 4026 + }, + { + "epoch": 0.3432199778402796, + "grad_norm": 74.18227253748036, + "learning_rate": 9.993706289994549e-06, + "loss": 4.3896, + "step": 4027 + }, + { + "epoch": 0.34330520753430493, + "grad_norm": 103.07683872745571, + "learning_rate": 9.993681394335027e-06, + "loss": 3.4993, + "step": 4028 + }, + { + "epoch": 0.34339043722833035, + "grad_norm": 74.66491799871214, + "learning_rate": 9.993656449564631e-06, + "loss": 4.0806, + "step": 4029 + }, + { + "epoch": 0.34347566692235576, + "grad_norm": 43.55236463161666, + "learning_rate": 9.993631455683607e-06, + "loss": 3.4274, + "step": 4030 + }, + { + "epoch": 0.3435608966163812, + "grad_norm": 59.080841437112014, + "learning_rate": 9.993606412692204e-06, + "loss": 4.3073, + "step": 4031 + }, + { + "epoch": 0.34364612631040653, + "grad_norm": 58.87634114197267, + "learning_rate": 9.993581320590666e-06, + "loss": 3.1181, + "step": 4032 + }, + { + "epoch": 0.34373135600443194, + "grad_norm": 76.08745224232189, + "learning_rate": 9.99355617937924e-06, + "loss": 4.2, + "step": 4033 + }, + { + "epoch": 0.34381658569845736, + "grad_norm": 102.01230410591054, + "learning_rate": 9.993530989058172e-06, + "loss": 4.9129, + "step": 4034 + }, + { + "epoch": 0.34390181539248277, + "grad_norm": 38.55770262081728, + "learning_rate": 9.993505749627713e-06, + "loss": 3.2442, + "step": 4035 + }, + { + "epoch": 0.3439870450865081, + "grad_norm": 46.564537810670316, + "learning_rate": 9.993480461088108e-06, + "loss": 3.0219, + "step": 4036 + }, + { + "epoch": 0.34407227478053354, + "grad_norm": 115.90957794425422, + "learning_rate": 9.993455123439606e-06, + "loss": 3.7676, + "step": 4037 + }, + { + "epoch": 0.34415750447455895, + "grad_norm": 91.93893826273029, + "learning_rate": 9.993429736682458e-06, + "loss": 4.3076, + "step": 4038 + }, + { + "epoch": 0.3442427341685843, + "grad_norm": 87.91702038027407, + "learning_rate": 9.993404300816913e-06, + "loss": 5.4126, + "step": 4039 + }, + { + "epoch": 0.3443279638626097, + "grad_norm": 76.93422635266263, + "learning_rate": 9.99337881584322e-06, + "loss": 3.9562, + "step": 4040 + }, + { + "epoch": 0.34441319355663513, + "grad_norm": 70.43563133892303, + "learning_rate": 9.99335328176163e-06, + "loss": 2.1156, + "step": 4041 + }, + { + "epoch": 0.34449842325066055, + "grad_norm": 241.09072555782038, + "learning_rate": 9.993327698572397e-06, + "loss": 5.1298, + "step": 4042 + }, + { + "epoch": 0.3445836529446859, + "grad_norm": 83.3080779009048, + "learning_rate": 9.993302066275768e-06, + "loss": 3.3776, + "step": 4043 + }, + { + "epoch": 0.3446688826387113, + "grad_norm": 67.20937458774252, + "learning_rate": 9.993276384871999e-06, + "loss": 3.1951, + "step": 4044 + }, + { + "epoch": 0.34475411233273673, + "grad_norm": 88.11024239442239, + "learning_rate": 9.993250654361339e-06, + "loss": 4.2173, + "step": 4045 + }, + { + "epoch": 0.34483934202676214, + "grad_norm": 33.05289699947126, + "learning_rate": 9.993224874744045e-06, + "loss": 2.5299, + "step": 4046 + }, + { + "epoch": 0.3449245717207875, + "grad_norm": 66.38695126541981, + "learning_rate": 9.993199046020367e-06, + "loss": 3.8589, + "step": 4047 + }, + { + "epoch": 0.3450098014148129, + "grad_norm": 125.57219937037527, + "learning_rate": 9.993173168190562e-06, + "loss": 5.3476, + "step": 4048 + }, + { + "epoch": 0.3450950311088383, + "grad_norm": 110.6278355341349, + "learning_rate": 9.993147241254881e-06, + "loss": 4.1699, + "step": 4049 + }, + { + "epoch": 0.34518026080286374, + "grad_norm": 93.85195481219857, + "learning_rate": 9.993121265213582e-06, + "loss": 5.0978, + "step": 4050 + }, + { + "epoch": 0.3452654904968891, + "grad_norm": 40.47784577762199, + "learning_rate": 9.99309524006692e-06, + "loss": 3.7984, + "step": 4051 + }, + { + "epoch": 0.3453507201909145, + "grad_norm": 245.4448155035767, + "learning_rate": 9.99306916581515e-06, + "loss": 4.6606, + "step": 4052 + }, + { + "epoch": 0.3454359498849399, + "grad_norm": 59.692235852549295, + "learning_rate": 9.993043042458528e-06, + "loss": 4.6366, + "step": 4053 + }, + { + "epoch": 0.34552117957896533, + "grad_norm": 49.213173379032746, + "learning_rate": 9.993016869997311e-06, + "loss": 2.8574, + "step": 4054 + }, + { + "epoch": 0.3456064092729907, + "grad_norm": 65.31182703493188, + "learning_rate": 9.99299064843176e-06, + "loss": 3.6979, + "step": 4055 + }, + { + "epoch": 0.3456916389670161, + "grad_norm": 64.14490031404087, + "learning_rate": 9.99296437776213e-06, + "loss": 2.867, + "step": 4056 + }, + { + "epoch": 0.3457768686610415, + "grad_norm": 86.90643479173713, + "learning_rate": 9.992938057988676e-06, + "loss": 3.8727, + "step": 4057 + }, + { + "epoch": 0.3458620983550669, + "grad_norm": 87.58040902061683, + "learning_rate": 9.992911689111661e-06, + "loss": 3.3509, + "step": 4058 + }, + { + "epoch": 0.3459473280490923, + "grad_norm": 58.29849685289922, + "learning_rate": 9.992885271131344e-06, + "loss": 3.8146, + "step": 4059 + }, + { + "epoch": 0.3460325577431177, + "grad_norm": 82.01910958102557, + "learning_rate": 9.992858804047988e-06, + "loss": 4.6825, + "step": 4060 + }, + { + "epoch": 0.3461177874371431, + "grad_norm": 152.04270081017813, + "learning_rate": 9.992832287861844e-06, + "loss": 5.3782, + "step": 4061 + }, + { + "epoch": 0.3462030171311685, + "grad_norm": 73.19644074067618, + "learning_rate": 9.992805722573183e-06, + "loss": 3.7215, + "step": 4062 + }, + { + "epoch": 0.3462882468251939, + "grad_norm": 59.052584156445015, + "learning_rate": 9.992779108182258e-06, + "loss": 4.6452, + "step": 4063 + }, + { + "epoch": 0.3463734765192193, + "grad_norm": 204.68905042133935, + "learning_rate": 9.992752444689335e-06, + "loss": 3.9119, + "step": 4064 + }, + { + "epoch": 0.3464587062132447, + "grad_norm": 143.1764443022495, + "learning_rate": 9.992725732094678e-06, + "loss": 4.1378, + "step": 4065 + }, + { + "epoch": 0.3465439359072701, + "grad_norm": 99.62211156806964, + "learning_rate": 9.992698970398545e-06, + "loss": 3.9324, + "step": 4066 + }, + { + "epoch": 0.3466291656012955, + "grad_norm": 159.21661451501967, + "learning_rate": 9.992672159601204e-06, + "loss": 4.6152, + "step": 4067 + }, + { + "epoch": 0.3467143952953209, + "grad_norm": 63.363147803901235, + "learning_rate": 9.992645299702914e-06, + "loss": 3.6086, + "step": 4068 + }, + { + "epoch": 0.3467996249893463, + "grad_norm": 54.13032632024464, + "learning_rate": 9.992618390703941e-06, + "loss": 3.2109, + "step": 4069 + }, + { + "epoch": 0.3468848546833717, + "grad_norm": 145.37155063518554, + "learning_rate": 9.992591432604551e-06, + "loss": 4.3399, + "step": 4070 + }, + { + "epoch": 0.34697008437739707, + "grad_norm": 105.13067753755519, + "learning_rate": 9.99256442540501e-06, + "loss": 3.7867, + "step": 4071 + }, + { + "epoch": 0.3470553140714225, + "grad_norm": 106.51886704639301, + "learning_rate": 9.99253736910558e-06, + "loss": 4.4471, + "step": 4072 + }, + { + "epoch": 0.3471405437654479, + "grad_norm": 123.85269333739335, + "learning_rate": 9.992510263706527e-06, + "loss": 3.8023, + "step": 4073 + }, + { + "epoch": 0.3472257734594733, + "grad_norm": 45.71999895134011, + "learning_rate": 9.992483109208122e-06, + "loss": 3.9464, + "step": 4074 + }, + { + "epoch": 0.34731100315349867, + "grad_norm": 45.434268075544416, + "learning_rate": 9.992455905610627e-06, + "loss": 3.5152, + "step": 4075 + }, + { + "epoch": 0.3473962328475241, + "grad_norm": 132.29432160680182, + "learning_rate": 9.992428652914315e-06, + "loss": 4.8553, + "step": 4076 + }, + { + "epoch": 0.3474814625415495, + "grad_norm": 59.43379889623135, + "learning_rate": 9.992401351119448e-06, + "loss": 3.5439, + "step": 4077 + }, + { + "epoch": 0.34756669223557485, + "grad_norm": 53.75977999706322, + "learning_rate": 9.9923740002263e-06, + "loss": 3.417, + "step": 4078 + }, + { + "epoch": 0.34765192192960026, + "grad_norm": 37.81592797389982, + "learning_rate": 9.992346600235136e-06, + "loss": 2.9784, + "step": 4079 + }, + { + "epoch": 0.3477371516236257, + "grad_norm": 101.25467453411353, + "learning_rate": 9.992319151146227e-06, + "loss": 3.1492, + "step": 4080 + }, + { + "epoch": 0.3478223813176511, + "grad_norm": 205.97421629078758, + "learning_rate": 9.992291652959843e-06, + "loss": 5.0008, + "step": 4081 + }, + { + "epoch": 0.34790761101167644, + "grad_norm": 138.95367258479712, + "learning_rate": 9.992264105676255e-06, + "loss": 4.3511, + "step": 4082 + }, + { + "epoch": 0.34799284070570186, + "grad_norm": 110.14775346698593, + "learning_rate": 9.992236509295733e-06, + "loss": 5.3313, + "step": 4083 + }, + { + "epoch": 0.34807807039972727, + "grad_norm": 42.30790821857475, + "learning_rate": 9.99220886381855e-06, + "loss": 3.442, + "step": 4084 + }, + { + "epoch": 0.3481633000937527, + "grad_norm": 68.11850145837953, + "learning_rate": 9.992181169244974e-06, + "loss": 3.8481, + "step": 4085 + }, + { + "epoch": 0.34824852978777804, + "grad_norm": 167.42004551331053, + "learning_rate": 9.99215342557528e-06, + "loss": 4.729, + "step": 4086 + }, + { + "epoch": 0.34833375948180345, + "grad_norm": 109.5022189383576, + "learning_rate": 9.992125632809743e-06, + "loss": 4.2946, + "step": 4087 + }, + { + "epoch": 0.34841898917582886, + "grad_norm": 109.3329372886256, + "learning_rate": 9.992097790948631e-06, + "loss": 4.6068, + "step": 4088 + }, + { + "epoch": 0.3485042188698543, + "grad_norm": 83.76798735377612, + "learning_rate": 9.992069899992224e-06, + "loss": 2.978, + "step": 4089 + }, + { + "epoch": 0.34858944856387963, + "grad_norm": 88.20929814925132, + "learning_rate": 9.992041959940793e-06, + "loss": 4.4592, + "step": 4090 + }, + { + "epoch": 0.34867467825790505, + "grad_norm": 53.19139929352149, + "learning_rate": 9.99201397079461e-06, + "loss": 3.5295, + "step": 4091 + }, + { + "epoch": 0.34875990795193046, + "grad_norm": 65.08063633284578, + "learning_rate": 9.991985932553957e-06, + "loss": 4.0963, + "step": 4092 + }, + { + "epoch": 0.34884513764595587, + "grad_norm": 82.80167776252358, + "learning_rate": 9.991957845219102e-06, + "loss": 3.3232, + "step": 4093 + }, + { + "epoch": 0.34893036733998123, + "grad_norm": 93.44438237692931, + "learning_rate": 9.991929708790327e-06, + "loss": 4.0088, + "step": 4094 + }, + { + "epoch": 0.34901559703400664, + "grad_norm": 66.48371085544544, + "learning_rate": 9.991901523267908e-06, + "loss": 3.1782, + "step": 4095 + }, + { + "epoch": 0.34910082672803205, + "grad_norm": 122.13435293900432, + "learning_rate": 9.991873288652121e-06, + "loss": 4.4583, + "step": 4096 + }, + { + "epoch": 0.34918605642205747, + "grad_norm": 51.50487190916073, + "learning_rate": 9.991845004943243e-06, + "loss": 3.2543, + "step": 4097 + }, + { + "epoch": 0.3492712861160828, + "grad_norm": 63.80296991034899, + "learning_rate": 9.991816672141552e-06, + "loss": 3.5778, + "step": 4098 + }, + { + "epoch": 0.34935651581010824, + "grad_norm": 46.482489080157414, + "learning_rate": 9.991788290247329e-06, + "loss": 3.9448, + "step": 4099 + }, + { + "epoch": 0.34944174550413365, + "grad_norm": 113.62345208220509, + "learning_rate": 9.991759859260851e-06, + "loss": 3.8132, + "step": 4100 + }, + { + "epoch": 0.34952697519815906, + "grad_norm": 106.7643393681944, + "learning_rate": 9.9917313791824e-06, + "loss": 5.32, + "step": 4101 + }, + { + "epoch": 0.3496122048921844, + "grad_norm": 58.189151525905125, + "learning_rate": 9.991702850012252e-06, + "loss": 4.3153, + "step": 4102 + }, + { + "epoch": 0.34969743458620983, + "grad_norm": 66.42278026331626, + "learning_rate": 9.99167427175069e-06, + "loss": 4.2153, + "step": 4103 + }, + { + "epoch": 0.34978266428023524, + "grad_norm": 41.52287630427308, + "learning_rate": 9.991645644397996e-06, + "loss": 3.257, + "step": 4104 + }, + { + "epoch": 0.34986789397426066, + "grad_norm": 57.672892232312435, + "learning_rate": 9.991616967954452e-06, + "loss": 3.3478, + "step": 4105 + }, + { + "epoch": 0.349953123668286, + "grad_norm": 81.64322049042798, + "learning_rate": 9.991588242420336e-06, + "loss": 5.3074, + "step": 4106 + }, + { + "epoch": 0.3500383533623114, + "grad_norm": 61.81829125438843, + "learning_rate": 9.991559467795935e-06, + "loss": 4.105, + "step": 4107 + }, + { + "epoch": 0.35012358305633684, + "grad_norm": 83.21130912314486, + "learning_rate": 9.991530644081528e-06, + "loss": 4.4246, + "step": 4108 + }, + { + "epoch": 0.35020881275036225, + "grad_norm": 80.15685185722143, + "learning_rate": 9.991501771277404e-06, + "loss": 4.242, + "step": 4109 + }, + { + "epoch": 0.3502940424443876, + "grad_norm": 73.26790305183481, + "learning_rate": 9.99147284938384e-06, + "loss": 3.2061, + "step": 4110 + }, + { + "epoch": 0.350379272138413, + "grad_norm": 94.63800061372596, + "learning_rate": 9.991443878401125e-06, + "loss": 3.4259, + "step": 4111 + }, + { + "epoch": 0.35046450183243844, + "grad_norm": 40.49833149800954, + "learning_rate": 9.991414858329545e-06, + "loss": 3.4992, + "step": 4112 + }, + { + "epoch": 0.3505497315264638, + "grad_norm": 43.67823487264583, + "learning_rate": 9.99138578916938e-06, + "loss": 3.2516, + "step": 4113 + }, + { + "epoch": 0.3506349612204892, + "grad_norm": 71.25967397552621, + "learning_rate": 9.991356670920922e-06, + "loss": 4.1538, + "step": 4114 + }, + { + "epoch": 0.3507201909145146, + "grad_norm": 78.21570550121471, + "learning_rate": 9.991327503584451e-06, + "loss": 4.013, + "step": 4115 + }, + { + "epoch": 0.35080542060854003, + "grad_norm": 146.55099432682277, + "learning_rate": 9.991298287160261e-06, + "loss": 3.8625, + "step": 4116 + }, + { + "epoch": 0.3508906503025654, + "grad_norm": 65.67297244985025, + "learning_rate": 9.991269021648633e-06, + "loss": 2.4549, + "step": 4117 + }, + { + "epoch": 0.3509758799965908, + "grad_norm": 57.19325226539877, + "learning_rate": 9.991239707049858e-06, + "loss": 4.5858, + "step": 4118 + }, + { + "epoch": 0.3510611096906162, + "grad_norm": 472.5670539946481, + "learning_rate": 9.991210343364227e-06, + "loss": 3.8227, + "step": 4119 + }, + { + "epoch": 0.3511463393846416, + "grad_norm": 65.6781370048713, + "learning_rate": 9.991180930592021e-06, + "loss": 2.4301, + "step": 4120 + }, + { + "epoch": 0.351231569078667, + "grad_norm": 108.94090365723788, + "learning_rate": 9.991151468733536e-06, + "loss": 5.0313, + "step": 4121 + }, + { + "epoch": 0.3513167987726924, + "grad_norm": 161.7809128582191, + "learning_rate": 9.99112195778906e-06, + "loss": 3.5701, + "step": 4122 + }, + { + "epoch": 0.3514020284667178, + "grad_norm": 308.8397035382708, + "learning_rate": 9.991092397758882e-06, + "loss": 4.4749, + "step": 4123 + }, + { + "epoch": 0.3514872581607432, + "grad_norm": 90.99231177966072, + "learning_rate": 9.991062788643294e-06, + "loss": 4.3238, + "step": 4124 + }, + { + "epoch": 0.3515724878547686, + "grad_norm": 91.40047088926038, + "learning_rate": 9.991033130442586e-06, + "loss": 5.2806, + "step": 4125 + }, + { + "epoch": 0.351657717548794, + "grad_norm": 299.5419183078016, + "learning_rate": 9.991003423157052e-06, + "loss": 3.181, + "step": 4126 + }, + { + "epoch": 0.3517429472428194, + "grad_norm": 480.8273931659846, + "learning_rate": 9.990973666786981e-06, + "loss": 4.4423, + "step": 4127 + }, + { + "epoch": 0.3518281769368448, + "grad_norm": 116.67941132021504, + "learning_rate": 9.99094386133267e-06, + "loss": 4.187, + "step": 4128 + }, + { + "epoch": 0.3519134066308702, + "grad_norm": 127.74246345127132, + "learning_rate": 9.990914006794405e-06, + "loss": 4.6984, + "step": 4129 + }, + { + "epoch": 0.3519986363248956, + "grad_norm": 71.12917300278058, + "learning_rate": 9.990884103172486e-06, + "loss": 2.0042, + "step": 4130 + }, + { + "epoch": 0.352083866018921, + "grad_norm": 66.49300615648092, + "learning_rate": 9.990854150467206e-06, + "loss": 4.0068, + "step": 4131 + }, + { + "epoch": 0.3521690957129464, + "grad_norm": 275.40986656329216, + "learning_rate": 9.990824148678856e-06, + "loss": 4.2868, + "step": 4132 + }, + { + "epoch": 0.35225432540697177, + "grad_norm": 76.56144365790587, + "learning_rate": 9.990794097807736e-06, + "loss": 4.0272, + "step": 4133 + }, + { + "epoch": 0.3523395551009972, + "grad_norm": 148.5875017758896, + "learning_rate": 9.99076399785414e-06, + "loss": 4.64, + "step": 4134 + }, + { + "epoch": 0.3524247847950226, + "grad_norm": 63.75220292291029, + "learning_rate": 9.990733848818362e-06, + "loss": 4.0687, + "step": 4135 + }, + { + "epoch": 0.352510014489048, + "grad_norm": 69.86764049842658, + "learning_rate": 9.990703650700699e-06, + "loss": 4.7418, + "step": 4136 + }, + { + "epoch": 0.35259524418307336, + "grad_norm": 67.66364781342325, + "learning_rate": 9.990673403501448e-06, + "loss": 4.1823, + "step": 4137 + }, + { + "epoch": 0.3526804738770988, + "grad_norm": 61.19022751048487, + "learning_rate": 9.990643107220908e-06, + "loss": 3.0902, + "step": 4138 + }, + { + "epoch": 0.3527657035711242, + "grad_norm": 471.49857957145673, + "learning_rate": 9.990612761859378e-06, + "loss": 4.2199, + "step": 4139 + }, + { + "epoch": 0.3528509332651496, + "grad_norm": 70.71475021734327, + "learning_rate": 9.990582367417152e-06, + "loss": 3.3525, + "step": 4140 + }, + { + "epoch": 0.35293616295917496, + "grad_norm": 244.53240469545068, + "learning_rate": 9.990551923894534e-06, + "loss": 4.9772, + "step": 4141 + }, + { + "epoch": 0.35302139265320037, + "grad_norm": 120.89217127302527, + "learning_rate": 9.990521431291819e-06, + "loss": 5.0431, + "step": 4142 + }, + { + "epoch": 0.3531066223472258, + "grad_norm": 45.95947439781163, + "learning_rate": 9.990490889609307e-06, + "loss": 3.2592, + "step": 4143 + }, + { + "epoch": 0.3531918520412512, + "grad_norm": 50.24323525076254, + "learning_rate": 9.990460298847304e-06, + "loss": 3.1996, + "step": 4144 + }, + { + "epoch": 0.35327708173527655, + "grad_norm": 54.6861740712435, + "learning_rate": 9.990429659006105e-06, + "loss": 3.9399, + "step": 4145 + }, + { + "epoch": 0.35336231142930197, + "grad_norm": 123.74980372936729, + "learning_rate": 9.990398970086014e-06, + "loss": 4.6344, + "step": 4146 + }, + { + "epoch": 0.3534475411233274, + "grad_norm": 69.64140581308061, + "learning_rate": 9.99036823208733e-06, + "loss": 3.3199, + "step": 4147 + }, + { + "epoch": 0.3535327708173528, + "grad_norm": 58.47057939533626, + "learning_rate": 9.990337445010361e-06, + "loss": 3.6073, + "step": 4148 + }, + { + "epoch": 0.35361800051137815, + "grad_norm": 36.971367127150614, + "learning_rate": 9.990306608855403e-06, + "loss": 2.6916, + "step": 4149 + }, + { + "epoch": 0.35370323020540356, + "grad_norm": 94.70546000562379, + "learning_rate": 9.990275723622764e-06, + "loss": 4.6441, + "step": 4150 + }, + { + "epoch": 0.353788459899429, + "grad_norm": 234.83659596674406, + "learning_rate": 9.990244789312746e-06, + "loss": 4.391, + "step": 4151 + }, + { + "epoch": 0.35387368959345433, + "grad_norm": 117.81302847546318, + "learning_rate": 9.990213805925654e-06, + "loss": 5.0035, + "step": 4152 + }, + { + "epoch": 0.35395891928747975, + "grad_norm": 59.161934650182346, + "learning_rate": 9.990182773461792e-06, + "loss": 4.0975, + "step": 4153 + }, + { + "epoch": 0.35404414898150516, + "grad_norm": 49.53110823852236, + "learning_rate": 9.990151691921464e-06, + "loss": 3.7982, + "step": 4154 + }, + { + "epoch": 0.35412937867553057, + "grad_norm": 58.43393171112066, + "learning_rate": 9.990120561304977e-06, + "loss": 4.5216, + "step": 4155 + }, + { + "epoch": 0.35421460836955593, + "grad_norm": 79.18397558547903, + "learning_rate": 9.990089381612638e-06, + "loss": 4.7043, + "step": 4156 + }, + { + "epoch": 0.35429983806358134, + "grad_norm": 169.21079925538604, + "learning_rate": 9.990058152844754e-06, + "loss": 4.198, + "step": 4157 + }, + { + "epoch": 0.35438506775760675, + "grad_norm": 103.16066415151965, + "learning_rate": 9.99002687500163e-06, + "loss": 4.921, + "step": 4158 + }, + { + "epoch": 0.35447029745163217, + "grad_norm": 106.8985541374618, + "learning_rate": 9.989995548083574e-06, + "loss": 5.6231, + "step": 4159 + }, + { + "epoch": 0.3545555271456575, + "grad_norm": 77.79694417374404, + "learning_rate": 9.989964172090895e-06, + "loss": 4.6024, + "step": 4160 + }, + { + "epoch": 0.35464075683968294, + "grad_norm": 128.53995341957094, + "learning_rate": 9.989932747023901e-06, + "loss": 3.9987, + "step": 4161 + }, + { + "epoch": 0.35472598653370835, + "grad_norm": 60.19809187115325, + "learning_rate": 9.9899012728829e-06, + "loss": 3.6337, + "step": 4162 + }, + { + "epoch": 0.35481121622773376, + "grad_norm": 111.40404889792482, + "learning_rate": 9.989869749668205e-06, + "loss": 3.7085, + "step": 4163 + }, + { + "epoch": 0.3548964459217591, + "grad_norm": 37.33787392121492, + "learning_rate": 9.989838177380122e-06, + "loss": 3.1709, + "step": 4164 + }, + { + "epoch": 0.35498167561578453, + "grad_norm": 128.17054983719098, + "learning_rate": 9.989806556018964e-06, + "loss": 3.8985, + "step": 4165 + }, + { + "epoch": 0.35506690530980994, + "grad_norm": 39.98936505946375, + "learning_rate": 9.989774885585041e-06, + "loss": 3.5896, + "step": 4166 + }, + { + "epoch": 0.35515213500383536, + "grad_norm": 146.71438956009837, + "learning_rate": 9.989743166078668e-06, + "loss": 3.4783, + "step": 4167 + }, + { + "epoch": 0.3552373646978607, + "grad_norm": 532.7584569340521, + "learning_rate": 9.989711397500148e-06, + "loss": 4.7352, + "step": 4168 + }, + { + "epoch": 0.3553225943918861, + "grad_norm": 74.43998029136117, + "learning_rate": 9.989679579849802e-06, + "loss": 4.0389, + "step": 4169 + }, + { + "epoch": 0.35540782408591154, + "grad_norm": 165.86994528826062, + "learning_rate": 9.989647713127942e-06, + "loss": 4.3392, + "step": 4170 + }, + { + "epoch": 0.35549305377993695, + "grad_norm": 138.1994759683654, + "learning_rate": 9.989615797334877e-06, + "loss": 3.3776, + "step": 4171 + }, + { + "epoch": 0.3555782834739623, + "grad_norm": 54.18083754826206, + "learning_rate": 9.989583832470923e-06, + "loss": 3.5248, + "step": 4172 + }, + { + "epoch": 0.3556635131679877, + "grad_norm": 119.53377581647244, + "learning_rate": 9.989551818536397e-06, + "loss": 4.631, + "step": 4173 + }, + { + "epoch": 0.35574874286201313, + "grad_norm": 123.65679695143203, + "learning_rate": 9.989519755531608e-06, + "loss": 4.4527, + "step": 4174 + }, + { + "epoch": 0.35583397255603855, + "grad_norm": 70.5466116101337, + "learning_rate": 9.989487643456879e-06, + "loss": 3.0098, + "step": 4175 + }, + { + "epoch": 0.3559192022500639, + "grad_norm": 42.884214082439, + "learning_rate": 9.989455482312519e-06, + "loss": 3.7345, + "step": 4176 + }, + { + "epoch": 0.3560044319440893, + "grad_norm": 73.95804101151192, + "learning_rate": 9.989423272098846e-06, + "loss": 3.4968, + "step": 4177 + }, + { + "epoch": 0.35608966163811473, + "grad_norm": 85.61964344858279, + "learning_rate": 9.98939101281618e-06, + "loss": 3.5674, + "step": 4178 + }, + { + "epoch": 0.35617489133214014, + "grad_norm": 64.04786173726674, + "learning_rate": 9.989358704464834e-06, + "loss": 3.6819, + "step": 4179 + }, + { + "epoch": 0.3562601210261655, + "grad_norm": 40.32403868073746, + "learning_rate": 9.98932634704513e-06, + "loss": 3.1271, + "step": 4180 + }, + { + "epoch": 0.3563453507201909, + "grad_norm": 308.06367292935056, + "learning_rate": 9.989293940557382e-06, + "loss": 4.1999, + "step": 4181 + }, + { + "epoch": 0.3564305804142163, + "grad_norm": 86.9359924635085, + "learning_rate": 9.98926148500191e-06, + "loss": 4.8428, + "step": 4182 + }, + { + "epoch": 0.35651581010824174, + "grad_norm": 54.972675416692404, + "learning_rate": 9.989228980379036e-06, + "loss": 3.8774, + "step": 4183 + }, + { + "epoch": 0.3566010398022671, + "grad_norm": 48.96961591592733, + "learning_rate": 9.989196426689078e-06, + "loss": 2.603, + "step": 4184 + }, + { + "epoch": 0.3566862694962925, + "grad_norm": 152.4271858257845, + "learning_rate": 9.989163823932354e-06, + "loss": 5.5227, + "step": 4185 + }, + { + "epoch": 0.3567714991903179, + "grad_norm": 231.41153253530032, + "learning_rate": 9.989131172109187e-06, + "loss": 3.2346, + "step": 4186 + }, + { + "epoch": 0.35685672888434333, + "grad_norm": 75.87425839710419, + "learning_rate": 9.989098471219897e-06, + "loss": 4.2601, + "step": 4187 + }, + { + "epoch": 0.3569419585783687, + "grad_norm": 78.40346690140703, + "learning_rate": 9.989065721264806e-06, + "loss": 4.0519, + "step": 4188 + }, + { + "epoch": 0.3570271882723941, + "grad_norm": 47.434919899827136, + "learning_rate": 9.989032922244235e-06, + "loss": 3.2168, + "step": 4189 + }, + { + "epoch": 0.3571124179664195, + "grad_norm": 75.03993508151092, + "learning_rate": 9.98900007415851e-06, + "loss": 4.8934, + "step": 4190 + }, + { + "epoch": 0.3571976476604449, + "grad_norm": 66.53715127210023, + "learning_rate": 9.988967177007951e-06, + "loss": 2.8036, + "step": 4191 + }, + { + "epoch": 0.3572828773544703, + "grad_norm": 43.39112646477405, + "learning_rate": 9.988934230792881e-06, + "loss": 2.9596, + "step": 4192 + }, + { + "epoch": 0.3573681070484957, + "grad_norm": 60.26196420836849, + "learning_rate": 9.988901235513626e-06, + "loss": 3.9068, + "step": 4193 + }, + { + "epoch": 0.3574533367425211, + "grad_norm": 132.8547804765994, + "learning_rate": 9.988868191170509e-06, + "loss": 5.3248, + "step": 4194 + }, + { + "epoch": 0.35753856643654647, + "grad_norm": 170.00243190433056, + "learning_rate": 9.988835097763857e-06, + "loss": 4.9607, + "step": 4195 + }, + { + "epoch": 0.3576237961305719, + "grad_norm": 96.97141748825652, + "learning_rate": 9.988801955293995e-06, + "loss": 5.6156, + "step": 4196 + }, + { + "epoch": 0.3577090258245973, + "grad_norm": 536.8936147450802, + "learning_rate": 9.988768763761246e-06, + "loss": 3.2562, + "step": 4197 + }, + { + "epoch": 0.3577942555186227, + "grad_norm": 62.510227432263704, + "learning_rate": 9.98873552316594e-06, + "loss": 4.4457, + "step": 4198 + }, + { + "epoch": 0.35787948521264806, + "grad_norm": 67.76488130971899, + "learning_rate": 9.9887022335084e-06, + "loss": 4.5437, + "step": 4199 + }, + { + "epoch": 0.3579647149066735, + "grad_norm": 87.94607316897256, + "learning_rate": 9.988668894788958e-06, + "loss": 4.2999, + "step": 4200 + }, + { + "epoch": 0.3580499446006989, + "grad_norm": 45.88501610238068, + "learning_rate": 9.98863550700794e-06, + "loss": 4.0648, + "step": 4201 + }, + { + "epoch": 0.3581351742947243, + "grad_norm": 71.40178333940351, + "learning_rate": 9.988602070165673e-06, + "loss": 3.7745, + "step": 4202 + }, + { + "epoch": 0.35822040398874966, + "grad_norm": 70.23011366026479, + "learning_rate": 9.988568584262489e-06, + "loss": 3.4517, + "step": 4203 + }, + { + "epoch": 0.35830563368277507, + "grad_norm": 60.339920925588224, + "learning_rate": 9.988535049298714e-06, + "loss": 4.1097, + "step": 4204 + }, + { + "epoch": 0.3583908633768005, + "grad_norm": 149.39054386200067, + "learning_rate": 9.988501465274677e-06, + "loss": 5.5374, + "step": 4205 + }, + { + "epoch": 0.3584760930708259, + "grad_norm": 49.6614501433508, + "learning_rate": 9.988467832190712e-06, + "loss": 3.2352, + "step": 4206 + }, + { + "epoch": 0.35856132276485125, + "grad_norm": 154.7007542645624, + "learning_rate": 9.988434150047149e-06, + "loss": 4.3845, + "step": 4207 + }, + { + "epoch": 0.35864655245887667, + "grad_norm": 108.2039772824885, + "learning_rate": 9.988400418844317e-06, + "loss": 5.5035, + "step": 4208 + }, + { + "epoch": 0.3587317821529021, + "grad_norm": 48.17214586603922, + "learning_rate": 9.98836663858255e-06, + "loss": 3.25, + "step": 4209 + }, + { + "epoch": 0.3588170118469275, + "grad_norm": 173.7919823673283, + "learning_rate": 9.988332809262178e-06, + "loss": 4.5248, + "step": 4210 + }, + { + "epoch": 0.35890224154095285, + "grad_norm": 76.77677278473456, + "learning_rate": 9.988298930883536e-06, + "loss": 4.1978, + "step": 4211 + }, + { + "epoch": 0.35898747123497826, + "grad_norm": 63.70705289572243, + "learning_rate": 9.988265003446955e-06, + "loss": 4.1258, + "step": 4212 + }, + { + "epoch": 0.3590727009290037, + "grad_norm": 88.5766626041091, + "learning_rate": 9.98823102695277e-06, + "loss": 4.1416, + "step": 4213 + }, + { + "epoch": 0.3591579306230291, + "grad_norm": 158.040249233813, + "learning_rate": 9.988197001401317e-06, + "loss": 4.509, + "step": 4214 + }, + { + "epoch": 0.35924316031705444, + "grad_norm": 67.46359479494319, + "learning_rate": 9.988162926792929e-06, + "loss": 3.5349, + "step": 4215 + }, + { + "epoch": 0.35932839001107986, + "grad_norm": 108.90086573449707, + "learning_rate": 9.988128803127936e-06, + "loss": 4.0185, + "step": 4216 + }, + { + "epoch": 0.35941361970510527, + "grad_norm": 58.401967708713066, + "learning_rate": 9.988094630406682e-06, + "loss": 3.6657, + "step": 4217 + }, + { + "epoch": 0.3594988493991307, + "grad_norm": 151.40990198336843, + "learning_rate": 9.9880604086295e-06, + "loss": 4.78, + "step": 4218 + }, + { + "epoch": 0.35958407909315604, + "grad_norm": 124.21455842799725, + "learning_rate": 9.988026137796725e-06, + "loss": 4.9157, + "step": 4219 + }, + { + "epoch": 0.35966930878718145, + "grad_norm": 52.890367868018004, + "learning_rate": 9.987991817908694e-06, + "loss": 3.7024, + "step": 4220 + }, + { + "epoch": 0.35975453848120686, + "grad_norm": 68.08890738085344, + "learning_rate": 9.987957448965747e-06, + "loss": 3.5495, + "step": 4221 + }, + { + "epoch": 0.3598397681752323, + "grad_norm": 88.60369128495272, + "learning_rate": 9.987923030968218e-06, + "loss": 3.4437, + "step": 4222 + }, + { + "epoch": 0.35992499786925763, + "grad_norm": 100.85538273473553, + "learning_rate": 9.98788856391645e-06, + "loss": 3.4763, + "step": 4223 + }, + { + "epoch": 0.36001022756328305, + "grad_norm": 56.43693745649418, + "learning_rate": 9.98785404781078e-06, + "loss": 3.3502, + "step": 4224 + }, + { + "epoch": 0.36009545725730846, + "grad_norm": 100.17741384774997, + "learning_rate": 9.987819482651547e-06, + "loss": 3.3072, + "step": 4225 + }, + { + "epoch": 0.3601806869513338, + "grad_norm": 60.05357122764881, + "learning_rate": 9.987784868439092e-06, + "loss": 3.996, + "step": 4226 + }, + { + "epoch": 0.36026591664535923, + "grad_norm": 56.705268012195546, + "learning_rate": 9.987750205173754e-06, + "loss": 3.7952, + "step": 4227 + }, + { + "epoch": 0.36035114633938464, + "grad_norm": 467.31400256015604, + "learning_rate": 9.987715492855875e-06, + "loss": 4.6453, + "step": 4228 + }, + { + "epoch": 0.36043637603341006, + "grad_norm": 53.19015188887059, + "learning_rate": 9.987680731485796e-06, + "loss": 3.9501, + "step": 4229 + }, + { + "epoch": 0.3605216057274354, + "grad_norm": 65.48006979399402, + "learning_rate": 9.98764592106386e-06, + "loss": 4.4215, + "step": 4230 + }, + { + "epoch": 0.3606068354214608, + "grad_norm": 121.21936401453738, + "learning_rate": 9.987611061590406e-06, + "loss": 4.3608, + "step": 4231 + }, + { + "epoch": 0.36069206511548624, + "grad_norm": 208.71246111284142, + "learning_rate": 9.98757615306578e-06, + "loss": 3.7857, + "step": 4232 + }, + { + "epoch": 0.36077729480951165, + "grad_norm": 155.28296459802064, + "learning_rate": 9.987541195490325e-06, + "loss": 3.911, + "step": 4233 + }, + { + "epoch": 0.360862524503537, + "grad_norm": 191.72134171213958, + "learning_rate": 9.987506188864383e-06, + "loss": 4.0135, + "step": 4234 + }, + { + "epoch": 0.3609477541975624, + "grad_norm": 86.84559648052878, + "learning_rate": 9.9874711331883e-06, + "loss": 3.8968, + "step": 4235 + }, + { + "epoch": 0.36103298389158783, + "grad_norm": 57.870129264815006, + "learning_rate": 9.98743602846242e-06, + "loss": 3.4019, + "step": 4236 + }, + { + "epoch": 0.36111821358561325, + "grad_norm": 78.67315546769551, + "learning_rate": 9.98740087468709e-06, + "loss": 4.1375, + "step": 4237 + }, + { + "epoch": 0.3612034432796386, + "grad_norm": 121.34973884478715, + "learning_rate": 9.987365671862654e-06, + "loss": 4.0923, + "step": 4238 + }, + { + "epoch": 0.361288672973664, + "grad_norm": 104.0723113554396, + "learning_rate": 9.987330419989456e-06, + "loss": 4.8578, + "step": 4239 + }, + { + "epoch": 0.36137390266768943, + "grad_norm": 141.11995036468844, + "learning_rate": 9.987295119067848e-06, + "loss": 4.5035, + "step": 4240 + }, + { + "epoch": 0.36145913236171484, + "grad_norm": 57.279756460025546, + "learning_rate": 9.987259769098172e-06, + "loss": 3.859, + "step": 4241 + }, + { + "epoch": 0.3615443620557402, + "grad_norm": 83.51676333748495, + "learning_rate": 9.987224370080779e-06, + "loss": 4.5604, + "step": 4242 + }, + { + "epoch": 0.3616295917497656, + "grad_norm": 54.726107624083696, + "learning_rate": 9.987188922016016e-06, + "loss": 3.8542, + "step": 4243 + }, + { + "epoch": 0.361714821443791, + "grad_norm": 310.5420723304312, + "learning_rate": 9.987153424904232e-06, + "loss": 4.6501, + "step": 4244 + }, + { + "epoch": 0.36180005113781644, + "grad_norm": 56.50915464999966, + "learning_rate": 9.987117878745776e-06, + "loss": 4.1195, + "step": 4245 + }, + { + "epoch": 0.3618852808318418, + "grad_norm": 32.787819348536715, + "learning_rate": 9.987082283540996e-06, + "loss": 3.4792, + "step": 4246 + }, + { + "epoch": 0.3619705105258672, + "grad_norm": 112.1407860333675, + "learning_rate": 9.987046639290243e-06, + "loss": 4.3738, + "step": 4247 + }, + { + "epoch": 0.3620557402198926, + "grad_norm": 146.56261189365128, + "learning_rate": 9.987010945993868e-06, + "loss": 3.3763, + "step": 4248 + }, + { + "epoch": 0.36214096991391803, + "grad_norm": 59.773812837535466, + "learning_rate": 9.986975203652226e-06, + "loss": 3.4285, + "step": 4249 + }, + { + "epoch": 0.3622261996079434, + "grad_norm": 68.54648818671946, + "learning_rate": 9.986939412265661e-06, + "loss": 4.8911, + "step": 4250 + }, + { + "epoch": 0.3623114293019688, + "grad_norm": 54.38025688947503, + "learning_rate": 9.986903571834527e-06, + "loss": 4.1453, + "step": 4251 + }, + { + "epoch": 0.3623966589959942, + "grad_norm": 250.75323910745837, + "learning_rate": 9.986867682359182e-06, + "loss": 4.3311, + "step": 4252 + }, + { + "epoch": 0.3624818886900196, + "grad_norm": 400.98103189470754, + "learning_rate": 9.986831743839972e-06, + "loss": 4.023, + "step": 4253 + }, + { + "epoch": 0.362567118384045, + "grad_norm": 95.78525399322444, + "learning_rate": 9.986795756277255e-06, + "loss": 4.1054, + "step": 4254 + }, + { + "epoch": 0.3626523480780704, + "grad_norm": 73.69984382969307, + "learning_rate": 9.986759719671381e-06, + "loss": 3.1971, + "step": 4255 + }, + { + "epoch": 0.3627375777720958, + "grad_norm": 69.8512431331994, + "learning_rate": 9.986723634022711e-06, + "loss": 3.941, + "step": 4256 + }, + { + "epoch": 0.3628228074661212, + "grad_norm": 137.06676984602296, + "learning_rate": 9.98668749933159e-06, + "loss": 5.101, + "step": 4257 + }, + { + "epoch": 0.3629080371601466, + "grad_norm": 146.59421850651688, + "learning_rate": 9.986651315598384e-06, + "loss": 4.0976, + "step": 4258 + }, + { + "epoch": 0.362993266854172, + "grad_norm": 134.28846327952976, + "learning_rate": 9.98661508282344e-06, + "loss": 4.2975, + "step": 4259 + }, + { + "epoch": 0.3630784965481974, + "grad_norm": 48.40431248943272, + "learning_rate": 9.98657880100712e-06, + "loss": 3.6834, + "step": 4260 + }, + { + "epoch": 0.3631637262422228, + "grad_norm": 34.53934661918537, + "learning_rate": 9.98654247014978e-06, + "loss": 2.6057, + "step": 4261 + }, + { + "epoch": 0.3632489559362482, + "grad_norm": 72.11674366463507, + "learning_rate": 9.986506090251775e-06, + "loss": 4.2618, + "step": 4262 + }, + { + "epoch": 0.3633341856302736, + "grad_norm": 45.24048393581491, + "learning_rate": 9.986469661313464e-06, + "loss": 3.103, + "step": 4263 + }, + { + "epoch": 0.363419415324299, + "grad_norm": 172.9374414346958, + "learning_rate": 9.986433183335204e-06, + "loss": 5.147, + "step": 4264 + }, + { + "epoch": 0.36350464501832436, + "grad_norm": 109.83967215880146, + "learning_rate": 9.986396656317357e-06, + "loss": 3.1369, + "step": 4265 + }, + { + "epoch": 0.36358987471234977, + "grad_norm": 76.81366212672307, + "learning_rate": 9.98636008026028e-06, + "loss": 3.8097, + "step": 4266 + }, + { + "epoch": 0.3636751044063752, + "grad_norm": 114.50643887602902, + "learning_rate": 9.986323455164333e-06, + "loss": 5.3037, + "step": 4267 + }, + { + "epoch": 0.3637603341004006, + "grad_norm": 57.10488202534337, + "learning_rate": 9.986286781029876e-06, + "loss": 3.6448, + "step": 4268 + }, + { + "epoch": 0.36384556379442595, + "grad_norm": 70.95418851775126, + "learning_rate": 9.98625005785727e-06, + "loss": 3.5756, + "step": 4269 + }, + { + "epoch": 0.36393079348845137, + "grad_norm": 103.27262511874976, + "learning_rate": 9.986213285646876e-06, + "loss": 5.3201, + "step": 4270 + }, + { + "epoch": 0.3640160231824768, + "grad_norm": 53.85326312723, + "learning_rate": 9.986176464399056e-06, + "loss": 4.5356, + "step": 4271 + }, + { + "epoch": 0.3641012528765022, + "grad_norm": 49.45006364578579, + "learning_rate": 9.98613959411417e-06, + "loss": 3.9494, + "step": 4272 + }, + { + "epoch": 0.36418648257052755, + "grad_norm": 61.88891649430554, + "learning_rate": 9.986102674792583e-06, + "loss": 4.2526, + "step": 4273 + }, + { + "epoch": 0.36427171226455296, + "grad_norm": 71.64152186216597, + "learning_rate": 9.986065706434658e-06, + "loss": 4.2751, + "step": 4274 + }, + { + "epoch": 0.3643569419585784, + "grad_norm": 96.68104703977401, + "learning_rate": 9.98602868904076e-06, + "loss": 4.3793, + "step": 4275 + }, + { + "epoch": 0.3644421716526038, + "grad_norm": 100.14622762303634, + "learning_rate": 9.985991622611249e-06, + "loss": 4.4623, + "step": 4276 + }, + { + "epoch": 0.36452740134662914, + "grad_norm": 66.28033613361531, + "learning_rate": 9.985954507146491e-06, + "loss": 3.7762, + "step": 4277 + }, + { + "epoch": 0.36461263104065456, + "grad_norm": 76.27406981477996, + "learning_rate": 9.985917342646852e-06, + "loss": 4.7176, + "step": 4278 + }, + { + "epoch": 0.36469786073467997, + "grad_norm": 88.21529536165714, + "learning_rate": 9.985880129112698e-06, + "loss": 4.7139, + "step": 4279 + }, + { + "epoch": 0.3647830904287054, + "grad_norm": 1001.4909783441722, + "learning_rate": 9.985842866544394e-06, + "loss": 4.0853, + "step": 4280 + }, + { + "epoch": 0.36486832012273074, + "grad_norm": 56.54862933968669, + "learning_rate": 9.985805554942308e-06, + "loss": 3.9252, + "step": 4281 + }, + { + "epoch": 0.36495354981675615, + "grad_norm": 121.1870021435065, + "learning_rate": 9.985768194306801e-06, + "loss": 4.64, + "step": 4282 + }, + { + "epoch": 0.36503877951078156, + "grad_norm": 41.27323493778521, + "learning_rate": 9.98573078463825e-06, + "loss": 3.4028, + "step": 4283 + }, + { + "epoch": 0.365124009204807, + "grad_norm": 54.49648374908667, + "learning_rate": 9.985693325937015e-06, + "loss": 3.2574, + "step": 4284 + }, + { + "epoch": 0.36520923889883233, + "grad_norm": 72.04868768084644, + "learning_rate": 9.985655818203468e-06, + "loss": 3.8837, + "step": 4285 + }, + { + "epoch": 0.36529446859285775, + "grad_norm": 72.15853603063599, + "learning_rate": 9.985618261437977e-06, + "loss": 3.6151, + "step": 4286 + }, + { + "epoch": 0.36537969828688316, + "grad_norm": 58.59638190304475, + "learning_rate": 9.985580655640911e-06, + "loss": 3.6668, + "step": 4287 + }, + { + "epoch": 0.36546492798090857, + "grad_norm": 97.25382250176031, + "learning_rate": 9.985543000812642e-06, + "loss": 4.3094, + "step": 4288 + }, + { + "epoch": 0.36555015767493393, + "grad_norm": 44.87583961382518, + "learning_rate": 9.985505296953537e-06, + "loss": 3.7518, + "step": 4289 + }, + { + "epoch": 0.36563538736895934, + "grad_norm": 42.23080480839399, + "learning_rate": 9.985467544063969e-06, + "loss": 2.9426, + "step": 4290 + }, + { + "epoch": 0.36572061706298475, + "grad_norm": 104.27030194863185, + "learning_rate": 9.985429742144309e-06, + "loss": 5.2549, + "step": 4291 + }, + { + "epoch": 0.36580584675701017, + "grad_norm": 91.71605912096375, + "learning_rate": 9.985391891194929e-06, + "loss": 4.6769, + "step": 4292 + }, + { + "epoch": 0.3658910764510355, + "grad_norm": 54.67826615090666, + "learning_rate": 9.9853539912162e-06, + "loss": 2.428, + "step": 4293 + }, + { + "epoch": 0.36597630614506094, + "grad_norm": 1005.5093698410755, + "learning_rate": 9.985316042208495e-06, + "loss": 3.9993, + "step": 4294 + }, + { + "epoch": 0.36606153583908635, + "grad_norm": 215.04542515091654, + "learning_rate": 9.98527804417219e-06, + "loss": 4.7252, + "step": 4295 + }, + { + "epoch": 0.36614676553311176, + "grad_norm": 61.3654626870405, + "learning_rate": 9.985239997107653e-06, + "loss": 3.896, + "step": 4296 + }, + { + "epoch": 0.3662319952271371, + "grad_norm": 117.94974591947299, + "learning_rate": 9.985201901015262e-06, + "loss": 4.1333, + "step": 4297 + }, + { + "epoch": 0.36631722492116253, + "grad_norm": 85.35045316439297, + "learning_rate": 9.985163755895393e-06, + "loss": 4.4742, + "step": 4298 + }, + { + "epoch": 0.36640245461518794, + "grad_norm": 79.52536848128135, + "learning_rate": 9.985125561748419e-06, + "loss": 3.6122, + "step": 4299 + }, + { + "epoch": 0.36648768430921336, + "grad_norm": 57.43255778558425, + "learning_rate": 9.985087318574716e-06, + "loss": 3.3562, + "step": 4300 + }, + { + "epoch": 0.3665729140032387, + "grad_norm": 80.66361561923524, + "learning_rate": 9.98504902637466e-06, + "loss": 4.2663, + "step": 4301 + }, + { + "epoch": 0.3666581436972641, + "grad_norm": 58.172724575031076, + "learning_rate": 9.985010685148628e-06, + "loss": 3.5143, + "step": 4302 + }, + { + "epoch": 0.36674337339128954, + "grad_norm": 51.63133497126026, + "learning_rate": 9.984972294896996e-06, + "loss": 4.0368, + "step": 4303 + }, + { + "epoch": 0.3668286030853149, + "grad_norm": 54.152460254081085, + "learning_rate": 9.984933855620142e-06, + "loss": 3.7045, + "step": 4304 + }, + { + "epoch": 0.3669138327793403, + "grad_norm": 59.57045471980876, + "learning_rate": 9.984895367318446e-06, + "loss": 3.2957, + "step": 4305 + }, + { + "epoch": 0.3669990624733657, + "grad_norm": 82.6863550471784, + "learning_rate": 9.984856829992284e-06, + "loss": 5.4824, + "step": 4306 + }, + { + "epoch": 0.36708429216739114, + "grad_norm": 164.726432907227, + "learning_rate": 9.984818243642035e-06, + "loss": 3.9535, + "step": 4307 + }, + { + "epoch": 0.3671695218614165, + "grad_norm": 37.75833816472691, + "learning_rate": 9.98477960826808e-06, + "loss": 2.599, + "step": 4308 + }, + { + "epoch": 0.3672547515554419, + "grad_norm": 48.95917159238195, + "learning_rate": 9.984740923870798e-06, + "loss": 3.5907, + "step": 4309 + }, + { + "epoch": 0.3673399812494673, + "grad_norm": 98.24696616415292, + "learning_rate": 9.98470219045057e-06, + "loss": 5.3572, + "step": 4310 + }, + { + "epoch": 0.36742521094349273, + "grad_norm": 120.84213628076311, + "learning_rate": 9.984663408007778e-06, + "loss": 3.9926, + "step": 4311 + }, + { + "epoch": 0.3675104406375181, + "grad_norm": 48.87277229344009, + "learning_rate": 9.984624576542801e-06, + "loss": 3.4036, + "step": 4312 + }, + { + "epoch": 0.3675956703315435, + "grad_norm": 127.52937213734504, + "learning_rate": 9.984585696056021e-06, + "loss": 4.3015, + "step": 4313 + }, + { + "epoch": 0.3676809000255689, + "grad_norm": 72.96914881682176, + "learning_rate": 9.984546766547822e-06, + "loss": 4.4666, + "step": 4314 + }, + { + "epoch": 0.3677661297195943, + "grad_norm": 60.35199685740621, + "learning_rate": 9.984507788018586e-06, + "loss": 3.2724, + "step": 4315 + }, + { + "epoch": 0.3678513594136197, + "grad_norm": 139.7560301039051, + "learning_rate": 9.984468760468697e-06, + "loss": 3.1398, + "step": 4316 + }, + { + "epoch": 0.3679365891076451, + "grad_norm": 302.79392788678007, + "learning_rate": 9.98442968389854e-06, + "loss": 5.7094, + "step": 4317 + }, + { + "epoch": 0.3680218188016705, + "grad_norm": 60.666763117459205, + "learning_rate": 9.984390558308496e-06, + "loss": 3.0846, + "step": 4318 + }, + { + "epoch": 0.3681070484956959, + "grad_norm": 168.9128653204037, + "learning_rate": 9.984351383698952e-06, + "loss": 2.6763, + "step": 4319 + }, + { + "epoch": 0.3681922781897213, + "grad_norm": 94.80808459841204, + "learning_rate": 9.98431216007029e-06, + "loss": 3.0319, + "step": 4320 + }, + { + "epoch": 0.3682775078837467, + "grad_norm": 110.99105664228235, + "learning_rate": 9.9842728874229e-06, + "loss": 4.774, + "step": 4321 + }, + { + "epoch": 0.3683627375777721, + "grad_norm": 120.93528935325531, + "learning_rate": 9.984233565757168e-06, + "loss": 5.3169, + "step": 4322 + }, + { + "epoch": 0.3684479672717975, + "grad_norm": 70.38684185653051, + "learning_rate": 9.98419419507348e-06, + "loss": 4.3937, + "step": 4323 + }, + { + "epoch": 0.3685331969658229, + "grad_norm": 128.1371548885947, + "learning_rate": 9.984154775372221e-06, + "loss": 3.9091, + "step": 4324 + }, + { + "epoch": 0.3686184266598483, + "grad_norm": 32.86250233044413, + "learning_rate": 9.984115306653782e-06, + "loss": 1.7252, + "step": 4325 + }, + { + "epoch": 0.3687036563538737, + "grad_norm": 74.87366675443371, + "learning_rate": 9.984075788918548e-06, + "loss": 3.939, + "step": 4326 + }, + { + "epoch": 0.3687888860478991, + "grad_norm": 86.80323948000354, + "learning_rate": 9.984036222166908e-06, + "loss": 4.0928, + "step": 4327 + }, + { + "epoch": 0.36887411574192447, + "grad_norm": 106.55403615686852, + "learning_rate": 9.983996606399254e-06, + "loss": 3.9128, + "step": 4328 + }, + { + "epoch": 0.3689593454359499, + "grad_norm": 360.4207572276943, + "learning_rate": 9.983956941615973e-06, + "loss": 8.1743, + "step": 4329 + }, + { + "epoch": 0.3690445751299753, + "grad_norm": 149.62879039005384, + "learning_rate": 9.983917227817457e-06, + "loss": 4.4678, + "step": 4330 + }, + { + "epoch": 0.3691298048240007, + "grad_norm": 2880.5071676699267, + "learning_rate": 9.983877465004095e-06, + "loss": 5.7102, + "step": 4331 + }, + { + "epoch": 0.36921503451802606, + "grad_norm": 246.49380417358208, + "learning_rate": 9.983837653176278e-06, + "loss": 6.4075, + "step": 4332 + }, + { + "epoch": 0.3693002642120515, + "grad_norm": 54.61896566934928, + "learning_rate": 9.983797792334398e-06, + "loss": 3.1007, + "step": 4333 + }, + { + "epoch": 0.3693854939060769, + "grad_norm": 84.40459968061262, + "learning_rate": 9.983757882478848e-06, + "loss": 4.5213, + "step": 4334 + }, + { + "epoch": 0.3694707236001023, + "grad_norm": 85.128580200435, + "learning_rate": 9.983717923610018e-06, + "loss": 4.204, + "step": 4335 + }, + { + "epoch": 0.36955595329412766, + "grad_norm": 98.9694784099755, + "learning_rate": 9.983677915728305e-06, + "loss": 4.2394, + "step": 4336 + }, + { + "epoch": 0.36964118298815307, + "grad_norm": 133.4551539262184, + "learning_rate": 9.983637858834098e-06, + "loss": 4.512, + "step": 4337 + }, + { + "epoch": 0.3697264126821785, + "grad_norm": 143.4435758268646, + "learning_rate": 9.983597752927794e-06, + "loss": 3.8488, + "step": 4338 + }, + { + "epoch": 0.36981164237620384, + "grad_norm": 98.34328312198836, + "learning_rate": 9.983557598009785e-06, + "loss": 4.8863, + "step": 4339 + }, + { + "epoch": 0.36989687207022925, + "grad_norm": 58.06401421354646, + "learning_rate": 9.98351739408047e-06, + "loss": 2.868, + "step": 4340 + }, + { + "epoch": 0.36998210176425467, + "grad_norm": 43.78878156874455, + "learning_rate": 9.983477141140238e-06, + "loss": 3.8835, + "step": 4341 + }, + { + "epoch": 0.3700673314582801, + "grad_norm": 601.8030184656546, + "learning_rate": 9.983436839189492e-06, + "loss": 4.0401, + "step": 4342 + }, + { + "epoch": 0.37015256115230544, + "grad_norm": 47.91620108922097, + "learning_rate": 9.983396488228623e-06, + "loss": 3.3069, + "step": 4343 + }, + { + "epoch": 0.37023779084633085, + "grad_norm": 66.27822322470857, + "learning_rate": 9.983356088258029e-06, + "loss": 4.6558, + "step": 4344 + }, + { + "epoch": 0.37032302054035626, + "grad_norm": 95.9817270208553, + "learning_rate": 9.983315639278109e-06, + "loss": 4.2477, + "step": 4345 + }, + { + "epoch": 0.3704082502343817, + "grad_norm": 37.58475904090107, + "learning_rate": 9.983275141289261e-06, + "loss": 2.8782, + "step": 4346 + }, + { + "epoch": 0.37049347992840703, + "grad_norm": 193.90847048392837, + "learning_rate": 9.983234594291881e-06, + "loss": 2.9473, + "step": 4347 + }, + { + "epoch": 0.37057870962243245, + "grad_norm": 41.4584132177794, + "learning_rate": 9.983193998286369e-06, + "loss": 4.2817, + "step": 4348 + }, + { + "epoch": 0.37066393931645786, + "grad_norm": 55.19516801971149, + "learning_rate": 9.983153353273123e-06, + "loss": 3.6542, + "step": 4349 + }, + { + "epoch": 0.37074916901048327, + "grad_norm": 61.11023712065068, + "learning_rate": 9.983112659252547e-06, + "loss": 3.8627, + "step": 4350 + }, + { + "epoch": 0.3708343987045086, + "grad_norm": 148.97141967726537, + "learning_rate": 9.983071916225036e-06, + "loss": 4.0681, + "step": 4351 + }, + { + "epoch": 0.37091962839853404, + "grad_norm": 85.05532011333672, + "learning_rate": 9.983031124190992e-06, + "loss": 2.787, + "step": 4352 + }, + { + "epoch": 0.37100485809255945, + "grad_norm": 79.82341943133682, + "learning_rate": 9.982990283150819e-06, + "loss": 5.247, + "step": 4353 + }, + { + "epoch": 0.37109008778658487, + "grad_norm": 95.04256810980655, + "learning_rate": 9.982949393104914e-06, + "loss": 3.6103, + "step": 4354 + }, + { + "epoch": 0.3711753174806102, + "grad_norm": 35.93322052747552, + "learning_rate": 9.982908454053684e-06, + "loss": 3.1665, + "step": 4355 + }, + { + "epoch": 0.37126054717463564, + "grad_norm": 64.37588564334487, + "learning_rate": 9.98286746599753e-06, + "loss": 3.1222, + "step": 4356 + }, + { + "epoch": 0.37134577686866105, + "grad_norm": 51.20017384326847, + "learning_rate": 9.98282642893685e-06, + "loss": 3.9985, + "step": 4357 + }, + { + "epoch": 0.37143100656268646, + "grad_norm": 83.19069298575154, + "learning_rate": 9.982785342872057e-06, + "loss": 4.32, + "step": 4358 + }, + { + "epoch": 0.3715162362567118, + "grad_norm": 73.9012079394464, + "learning_rate": 9.982744207803549e-06, + "loss": 3.4509, + "step": 4359 + }, + { + "epoch": 0.37160146595073723, + "grad_norm": 45.549897146406956, + "learning_rate": 9.98270302373173e-06, + "loss": 3.3917, + "step": 4360 + }, + { + "epoch": 0.37168669564476264, + "grad_norm": 53.456611668751925, + "learning_rate": 9.98266179065701e-06, + "loss": 4.2724, + "step": 4361 + }, + { + "epoch": 0.37177192533878806, + "grad_norm": 48.27319377557304, + "learning_rate": 9.982620508579787e-06, + "loss": 2.4203, + "step": 4362 + }, + { + "epoch": 0.3718571550328134, + "grad_norm": 38.637857898267875, + "learning_rate": 9.982579177500473e-06, + "loss": 3.2136, + "step": 4363 + }, + { + "epoch": 0.3719423847268388, + "grad_norm": 33.74514152273735, + "learning_rate": 9.982537797419474e-06, + "loss": 3.1906, + "step": 4364 + }, + { + "epoch": 0.37202761442086424, + "grad_norm": 50.719655843435966, + "learning_rate": 9.982496368337194e-06, + "loss": 3.9681, + "step": 4365 + }, + { + "epoch": 0.37211284411488965, + "grad_norm": 73.45010936413435, + "learning_rate": 9.982454890254042e-06, + "loss": 4.276, + "step": 4366 + }, + { + "epoch": 0.372198073808915, + "grad_norm": 85.67044377327404, + "learning_rate": 9.982413363170426e-06, + "loss": 4.8313, + "step": 4367 + }, + { + "epoch": 0.3722833035029404, + "grad_norm": 82.87867076167055, + "learning_rate": 9.982371787086755e-06, + "loss": 4.0996, + "step": 4368 + }, + { + "epoch": 0.37236853319696583, + "grad_norm": 38.59191330640277, + "learning_rate": 9.982330162003438e-06, + "loss": 3.6848, + "step": 4369 + }, + { + "epoch": 0.37245376289099125, + "grad_norm": 55.868823484038366, + "learning_rate": 9.982288487920881e-06, + "loss": 3.0319, + "step": 4370 + }, + { + "epoch": 0.3725389925850166, + "grad_norm": 74.88534799296377, + "learning_rate": 9.982246764839497e-06, + "loss": 3.2366, + "step": 4371 + }, + { + "epoch": 0.372624222279042, + "grad_norm": 68.9633320945482, + "learning_rate": 9.982204992759699e-06, + "loss": 4.5241, + "step": 4372 + }, + { + "epoch": 0.37270945197306743, + "grad_norm": 40.388689134374076, + "learning_rate": 9.982163171681894e-06, + "loss": 3.9425, + "step": 4373 + }, + { + "epoch": 0.37279468166709284, + "grad_norm": 55.48751850447856, + "learning_rate": 9.98212130160649e-06, + "loss": 4.2788, + "step": 4374 + }, + { + "epoch": 0.3728799113611182, + "grad_norm": 99.54890392628515, + "learning_rate": 9.982079382533907e-06, + "loss": 5.0488, + "step": 4375 + }, + { + "epoch": 0.3729651410551436, + "grad_norm": 47.76470248782462, + "learning_rate": 9.982037414464551e-06, + "loss": 3.4715, + "step": 4376 + }, + { + "epoch": 0.373050370749169, + "grad_norm": 110.49310061399316, + "learning_rate": 9.981995397398838e-06, + "loss": 4.5853, + "step": 4377 + }, + { + "epoch": 0.3731356004431944, + "grad_norm": 45.47612897752786, + "learning_rate": 9.981953331337178e-06, + "loss": 3.8682, + "step": 4378 + }, + { + "epoch": 0.3732208301372198, + "grad_norm": 87.7709820772619, + "learning_rate": 9.981911216279987e-06, + "loss": 5.9091, + "step": 4379 + }, + { + "epoch": 0.3733060598312452, + "grad_norm": 72.2204032962477, + "learning_rate": 9.981869052227679e-06, + "loss": 4.3373, + "step": 4380 + }, + { + "epoch": 0.3733912895252706, + "grad_norm": 50.285147754836046, + "learning_rate": 9.98182683918067e-06, + "loss": 3.8392, + "step": 4381 + }, + { + "epoch": 0.373476519219296, + "grad_norm": 61.0463890266712, + "learning_rate": 9.981784577139371e-06, + "loss": 3.9081, + "step": 4382 + }, + { + "epoch": 0.3735617489133214, + "grad_norm": 49.12229496246685, + "learning_rate": 9.981742266104202e-06, + "loss": 4.5747, + "step": 4383 + }, + { + "epoch": 0.3736469786073468, + "grad_norm": 60.805351449533774, + "learning_rate": 9.981699906075577e-06, + "loss": 4.0433, + "step": 4384 + }, + { + "epoch": 0.3737322083013722, + "grad_norm": 106.37547978248514, + "learning_rate": 9.981657497053914e-06, + "loss": 4.3228, + "step": 4385 + }, + { + "epoch": 0.3738174379953976, + "grad_norm": 54.38898861506565, + "learning_rate": 9.981615039039627e-06, + "loss": 4.1626, + "step": 4386 + }, + { + "epoch": 0.373902667689423, + "grad_norm": 39.39494163161503, + "learning_rate": 9.981572532033137e-06, + "loss": 3.3063, + "step": 4387 + }, + { + "epoch": 0.3739878973834484, + "grad_norm": 44.32524970762021, + "learning_rate": 9.981529976034861e-06, + "loss": 4.1171, + "step": 4388 + }, + { + "epoch": 0.3740731270774738, + "grad_norm": 55.3619503529534, + "learning_rate": 9.981487371045216e-06, + "loss": 4.4994, + "step": 4389 + }, + { + "epoch": 0.37415835677149917, + "grad_norm": 216.25615978681407, + "learning_rate": 9.981444717064621e-06, + "loss": 3.905, + "step": 4390 + }, + { + "epoch": 0.3742435864655246, + "grad_norm": 81.98076669494925, + "learning_rate": 9.9814020140935e-06, + "loss": 3.7418, + "step": 4391 + }, + { + "epoch": 0.37432881615955, + "grad_norm": 95.07730994683322, + "learning_rate": 9.981359262132266e-06, + "loss": 4.2291, + "step": 4392 + }, + { + "epoch": 0.3744140458535754, + "grad_norm": 45.34082274789903, + "learning_rate": 9.981316461181345e-06, + "loss": 3.1683, + "step": 4393 + }, + { + "epoch": 0.37449927554760076, + "grad_norm": 71.94775045495973, + "learning_rate": 9.981273611241155e-06, + "loss": 4.0977, + "step": 4394 + }, + { + "epoch": 0.3745845052416262, + "grad_norm": 61.78633860169128, + "learning_rate": 9.98123071231212e-06, + "loss": 4.2047, + "step": 4395 + }, + { + "epoch": 0.3746697349356516, + "grad_norm": 108.6061680674535, + "learning_rate": 9.98118776439466e-06, + "loss": 5.3535, + "step": 4396 + }, + { + "epoch": 0.374754964629677, + "grad_norm": 101.68109239185739, + "learning_rate": 9.981144767489197e-06, + "loss": 3.8083, + "step": 4397 + }, + { + "epoch": 0.37484019432370236, + "grad_norm": 52.6955798307451, + "learning_rate": 9.981101721596156e-06, + "loss": 3.5966, + "step": 4398 + }, + { + "epoch": 0.37492542401772777, + "grad_norm": 76.51583242150251, + "learning_rate": 9.981058626715956e-06, + "loss": 2.8941, + "step": 4399 + }, + { + "epoch": 0.3750106537117532, + "grad_norm": 55.32566333275947, + "learning_rate": 9.981015482849025e-06, + "loss": 3.6682, + "step": 4400 + }, + { + "epoch": 0.3750958834057786, + "grad_norm": 109.7033962241283, + "learning_rate": 9.980972289995787e-06, + "loss": 5.56, + "step": 4401 + }, + { + "epoch": 0.37518111309980395, + "grad_norm": 105.93187683717647, + "learning_rate": 9.980929048156665e-06, + "loss": 4.1258, + "step": 4402 + }, + { + "epoch": 0.37526634279382937, + "grad_norm": 42.3681966707311, + "learning_rate": 9.980885757332086e-06, + "loss": 3.5695, + "step": 4403 + }, + { + "epoch": 0.3753515724878548, + "grad_norm": 69.01056506449109, + "learning_rate": 9.980842417522473e-06, + "loss": 4.239, + "step": 4404 + }, + { + "epoch": 0.3754368021818802, + "grad_norm": 94.50030092127534, + "learning_rate": 9.980799028728254e-06, + "loss": 4.5789, + "step": 4405 + }, + { + "epoch": 0.37552203187590555, + "grad_norm": 112.54796837872213, + "learning_rate": 9.980755590949857e-06, + "loss": 4.6265, + "step": 4406 + }, + { + "epoch": 0.37560726156993096, + "grad_norm": 104.9693087557528, + "learning_rate": 9.980712104187707e-06, + "loss": 4.0285, + "step": 4407 + }, + { + "epoch": 0.3756924912639564, + "grad_norm": 65.0852889453776, + "learning_rate": 9.980668568442234e-06, + "loss": 3.217, + "step": 4408 + }, + { + "epoch": 0.3757777209579818, + "grad_norm": 90.40973549942228, + "learning_rate": 9.980624983713863e-06, + "loss": 3.5909, + "step": 4409 + }, + { + "epoch": 0.37586295065200714, + "grad_norm": 149.48510902333854, + "learning_rate": 9.980581350003027e-06, + "loss": 3.4912, + "step": 4410 + }, + { + "epoch": 0.37594818034603256, + "grad_norm": 48.90725462981674, + "learning_rate": 9.980537667310149e-06, + "loss": 3.4843, + "step": 4411 + }, + { + "epoch": 0.37603341004005797, + "grad_norm": 131.7389818216043, + "learning_rate": 9.980493935635664e-06, + "loss": 5.1992, + "step": 4412 + }, + { + "epoch": 0.3761186397340834, + "grad_norm": 74.30191823631533, + "learning_rate": 9.98045015498e-06, + "loss": 4.2202, + "step": 4413 + }, + { + "epoch": 0.37620386942810874, + "grad_norm": 112.07951382736366, + "learning_rate": 9.980406325343588e-06, + "loss": 3.8977, + "step": 4414 + }, + { + "epoch": 0.37628909912213415, + "grad_norm": 78.82026607055509, + "learning_rate": 9.980362446726858e-06, + "loss": 4.9455, + "step": 4415 + }, + { + "epoch": 0.37637432881615956, + "grad_norm": 150.46255934065482, + "learning_rate": 9.980318519130243e-06, + "loss": 5.2419, + "step": 4416 + }, + { + "epoch": 0.3764595585101849, + "grad_norm": 54.333983896527705, + "learning_rate": 9.980274542554177e-06, + "loss": 3.2442, + "step": 4417 + }, + { + "epoch": 0.37654478820421033, + "grad_norm": 56.069942879673654, + "learning_rate": 9.980230516999085e-06, + "loss": 2.7019, + "step": 4418 + }, + { + "epoch": 0.37663001789823575, + "grad_norm": 83.99019411785063, + "learning_rate": 9.980186442465407e-06, + "loss": 4.3781, + "step": 4419 + }, + { + "epoch": 0.37671524759226116, + "grad_norm": 515.7724620536345, + "learning_rate": 9.980142318953574e-06, + "loss": 3.5703, + "step": 4420 + }, + { + "epoch": 0.3768004772862865, + "grad_norm": 53.99747666012821, + "learning_rate": 9.980098146464022e-06, + "loss": 2.925, + "step": 4421 + }, + { + "epoch": 0.37688570698031193, + "grad_norm": 90.44156606485967, + "learning_rate": 9.980053924997181e-06, + "loss": 5.2184, + "step": 4422 + }, + { + "epoch": 0.37697093667433734, + "grad_norm": 267.8510712633379, + "learning_rate": 9.98000965455349e-06, + "loss": 4.6018, + "step": 4423 + }, + { + "epoch": 0.37705616636836276, + "grad_norm": 135.5913969931085, + "learning_rate": 9.979965335133382e-06, + "loss": 4.5294, + "step": 4424 + }, + { + "epoch": 0.3771413960623881, + "grad_norm": 108.03956929485518, + "learning_rate": 9.979920966737294e-06, + "loss": 3.923, + "step": 4425 + }, + { + "epoch": 0.3772266257564135, + "grad_norm": 88.1655316137288, + "learning_rate": 9.979876549365663e-06, + "loss": 4.1011, + "step": 4426 + }, + { + "epoch": 0.37731185545043894, + "grad_norm": 121.50332777015458, + "learning_rate": 9.979832083018924e-06, + "loss": 4.1663, + "step": 4427 + }, + { + "epoch": 0.37739708514446435, + "grad_norm": 147.45203943618634, + "learning_rate": 9.979787567697517e-06, + "loss": 5.7328, + "step": 4428 + }, + { + "epoch": 0.3774823148384897, + "grad_norm": 96.67724722140915, + "learning_rate": 9.979743003401875e-06, + "loss": 4.5911, + "step": 4429 + }, + { + "epoch": 0.3775675445325151, + "grad_norm": 72.7181675201244, + "learning_rate": 9.979698390132442e-06, + "loss": 4.1386, + "step": 4430 + }, + { + "epoch": 0.37765277422654053, + "grad_norm": 58.36302446884132, + "learning_rate": 9.979653727889652e-06, + "loss": 3.0081, + "step": 4431 + }, + { + "epoch": 0.37773800392056595, + "grad_norm": 73.24660639211912, + "learning_rate": 9.979609016673948e-06, + "loss": 5.1139, + "step": 4432 + }, + { + "epoch": 0.3778232336145913, + "grad_norm": 50.94811075896834, + "learning_rate": 9.979564256485768e-06, + "loss": 3.8575, + "step": 4433 + }, + { + "epoch": 0.3779084633086167, + "grad_norm": 122.13375775481, + "learning_rate": 9.97951944732555e-06, + "loss": 3.6606, + "step": 4434 + }, + { + "epoch": 0.37799369300264213, + "grad_norm": 53.49238974144025, + "learning_rate": 9.979474589193738e-06, + "loss": 3.5833, + "step": 4435 + }, + { + "epoch": 0.37807892269666754, + "grad_norm": 48.54446970826296, + "learning_rate": 9.979429682090773e-06, + "loss": 3.6451, + "step": 4436 + }, + { + "epoch": 0.3781641523906929, + "grad_norm": 77.28933632177456, + "learning_rate": 9.979384726017097e-06, + "loss": 4.94, + "step": 4437 + }, + { + "epoch": 0.3782493820847183, + "grad_norm": 97.75343851025372, + "learning_rate": 9.979339720973148e-06, + "loss": 4.7902, + "step": 4438 + }, + { + "epoch": 0.3783346117787437, + "grad_norm": 82.65379712385838, + "learning_rate": 9.979294666959373e-06, + "loss": 4.0538, + "step": 4439 + }, + { + "epoch": 0.37841984147276914, + "grad_norm": 34.900468906925695, + "learning_rate": 9.979249563976212e-06, + "loss": 4.0377, + "step": 4440 + }, + { + "epoch": 0.3785050711667945, + "grad_norm": 63.631857199668666, + "learning_rate": 9.979204412024112e-06, + "loss": 4.3937, + "step": 4441 + }, + { + "epoch": 0.3785903008608199, + "grad_norm": 71.63346378199773, + "learning_rate": 9.979159211103515e-06, + "loss": 4.6394, + "step": 4442 + }, + { + "epoch": 0.3786755305548453, + "grad_norm": 64.9210656407757, + "learning_rate": 9.979113961214863e-06, + "loss": 4.6798, + "step": 4443 + }, + { + "epoch": 0.37876076024887073, + "grad_norm": 56.63602357351818, + "learning_rate": 9.979068662358606e-06, + "loss": 4.2377, + "step": 4444 + }, + { + "epoch": 0.3788459899428961, + "grad_norm": 123.50169870883508, + "learning_rate": 9.979023314535189e-06, + "loss": 5.3015, + "step": 4445 + }, + { + "epoch": 0.3789312196369215, + "grad_norm": 74.0784550665339, + "learning_rate": 9.978977917745053e-06, + "loss": 3.9222, + "step": 4446 + }, + { + "epoch": 0.3790164493309469, + "grad_norm": 54.81811756052882, + "learning_rate": 9.978932471988648e-06, + "loss": 2.5768, + "step": 4447 + }, + { + "epoch": 0.3791016790249723, + "grad_norm": 99.89573947184302, + "learning_rate": 9.978886977266421e-06, + "loss": 3.9335, + "step": 4448 + }, + { + "epoch": 0.3791869087189977, + "grad_norm": 98.79433972295278, + "learning_rate": 9.978841433578819e-06, + "loss": 4.5608, + "step": 4449 + }, + { + "epoch": 0.3792721384130231, + "grad_norm": 94.21026197954029, + "learning_rate": 9.97879584092629e-06, + "loss": 3.34, + "step": 4450 + }, + { + "epoch": 0.3793573681070485, + "grad_norm": 66.86842783600302, + "learning_rate": 9.978750199309283e-06, + "loss": 4.3453, + "step": 4451 + }, + { + "epoch": 0.37944259780107387, + "grad_norm": 45.45171083956204, + "learning_rate": 9.978704508728245e-06, + "loss": 3.747, + "step": 4452 + }, + { + "epoch": 0.3795278274950993, + "grad_norm": 117.84377310242446, + "learning_rate": 9.978658769183628e-06, + "loss": 4.7057, + "step": 4453 + }, + { + "epoch": 0.3796130571891247, + "grad_norm": 85.67439843210907, + "learning_rate": 9.978612980675879e-06, + "loss": 3.0181, + "step": 4454 + }, + { + "epoch": 0.3796982868831501, + "grad_norm": 78.83510228864728, + "learning_rate": 9.97856714320545e-06, + "loss": 3.967, + "step": 4455 + }, + { + "epoch": 0.37978351657717546, + "grad_norm": 94.5001209504749, + "learning_rate": 9.978521256772792e-06, + "loss": 4.2553, + "step": 4456 + }, + { + "epoch": 0.3798687462712009, + "grad_norm": 60.11442928156567, + "learning_rate": 9.978475321378355e-06, + "loss": 3.219, + "step": 4457 + }, + { + "epoch": 0.3799539759652263, + "grad_norm": 141.39373901570818, + "learning_rate": 9.978429337022592e-06, + "loss": 5.3762, + "step": 4458 + }, + { + "epoch": 0.3800392056592517, + "grad_norm": 39.80375561834037, + "learning_rate": 9.978383303705955e-06, + "loss": 3.082, + "step": 4459 + }, + { + "epoch": 0.38012443535327706, + "grad_norm": 151.87154627404513, + "learning_rate": 9.978337221428895e-06, + "loss": 3.964, + "step": 4460 + }, + { + "epoch": 0.38020966504730247, + "grad_norm": 56.1908607614599, + "learning_rate": 9.978291090191868e-06, + "loss": 4.3937, + "step": 4461 + }, + { + "epoch": 0.3802948947413279, + "grad_norm": 132.50397648980558, + "learning_rate": 9.978244909995324e-06, + "loss": 5.305, + "step": 4462 + }, + { + "epoch": 0.3803801244353533, + "grad_norm": 56.32561673078198, + "learning_rate": 9.978198680839721e-06, + "loss": 3.2948, + "step": 4463 + }, + { + "epoch": 0.38046535412937865, + "grad_norm": 83.8387432606978, + "learning_rate": 9.978152402725514e-06, + "loss": 3.6344, + "step": 4464 + }, + { + "epoch": 0.38055058382340406, + "grad_norm": 102.15791270318299, + "learning_rate": 9.978106075653153e-06, + "loss": 4.6307, + "step": 4465 + }, + { + "epoch": 0.3806358135174295, + "grad_norm": 41.32878441537846, + "learning_rate": 9.9780596996231e-06, + "loss": 3.1162, + "step": 4466 + }, + { + "epoch": 0.3807210432114549, + "grad_norm": 52.06146818094554, + "learning_rate": 9.978013274635805e-06, + "loss": 3.5391, + "step": 4467 + }, + { + "epoch": 0.38080627290548025, + "grad_norm": 95.70400369625271, + "learning_rate": 9.97796680069173e-06, + "loss": 3.772, + "step": 4468 + }, + { + "epoch": 0.38089150259950566, + "grad_norm": 150.7981061891385, + "learning_rate": 9.977920277791329e-06, + "loss": 4.8292, + "step": 4469 + }, + { + "epoch": 0.3809767322935311, + "grad_norm": 63.69519682695663, + "learning_rate": 9.977873705935059e-06, + "loss": 3.0175, + "step": 4470 + }, + { + "epoch": 0.3810619619875565, + "grad_norm": 74.23306935679233, + "learning_rate": 9.977827085123379e-06, + "loss": 3.6141, + "step": 4471 + }, + { + "epoch": 0.38114719168158184, + "grad_norm": 91.30169493924902, + "learning_rate": 9.977780415356748e-06, + "loss": 4.1174, + "step": 4472 + }, + { + "epoch": 0.38123242137560726, + "grad_norm": 366.275839007717, + "learning_rate": 9.977733696635625e-06, + "loss": 3.287, + "step": 4473 + }, + { + "epoch": 0.38131765106963267, + "grad_norm": 106.74760013080329, + "learning_rate": 9.977686928960469e-06, + "loss": 4.7286, + "step": 4474 + }, + { + "epoch": 0.3814028807636581, + "grad_norm": 49.06319031645223, + "learning_rate": 9.97764011233174e-06, + "loss": 3.5717, + "step": 4475 + }, + { + "epoch": 0.38148811045768344, + "grad_norm": 221.90846922529983, + "learning_rate": 9.977593246749898e-06, + "loss": 3.4323, + "step": 4476 + }, + { + "epoch": 0.38157334015170885, + "grad_norm": 100.10806964459637, + "learning_rate": 9.977546332215404e-06, + "loss": 4.9646, + "step": 4477 + }, + { + "epoch": 0.38165856984573426, + "grad_norm": 48.216096796352886, + "learning_rate": 9.977499368728722e-06, + "loss": 3.8689, + "step": 4478 + }, + { + "epoch": 0.3817437995397597, + "grad_norm": 60.792216789833056, + "learning_rate": 9.977452356290309e-06, + "loss": 3.2842, + "step": 4479 + }, + { + "epoch": 0.38182902923378503, + "grad_norm": 137.67428225172915, + "learning_rate": 9.97740529490063e-06, + "loss": 5.7795, + "step": 4480 + }, + { + "epoch": 0.38191425892781045, + "grad_norm": 139.21793045857328, + "learning_rate": 9.977358184560148e-06, + "loss": 4.5077, + "step": 4481 + }, + { + "epoch": 0.38199948862183586, + "grad_norm": 59.0190329644561, + "learning_rate": 9.977311025269328e-06, + "loss": 3.2786, + "step": 4482 + }, + { + "epoch": 0.38208471831586127, + "grad_norm": 178.67508468288673, + "learning_rate": 9.97726381702863e-06, + "loss": 4.7339, + "step": 4483 + }, + { + "epoch": 0.38216994800988663, + "grad_norm": 78.67898115683933, + "learning_rate": 9.97721655983852e-06, + "loss": 4.4969, + "step": 4484 + }, + { + "epoch": 0.38225517770391204, + "grad_norm": 45.36628350698702, + "learning_rate": 9.977169253699465e-06, + "loss": 3.7546, + "step": 4485 + }, + { + "epoch": 0.38234040739793745, + "grad_norm": 156.75032319755192, + "learning_rate": 9.977121898611926e-06, + "loss": 3.2478, + "step": 4486 + }, + { + "epoch": 0.38242563709196287, + "grad_norm": 35.25353423676869, + "learning_rate": 9.97707449457637e-06, + "loss": 3.3332, + "step": 4487 + }, + { + "epoch": 0.3825108667859882, + "grad_norm": 42.903319244487925, + "learning_rate": 9.977027041593268e-06, + "loss": 2.7138, + "step": 4488 + }, + { + "epoch": 0.38259609648001364, + "grad_norm": 79.34180965565247, + "learning_rate": 9.976979539663079e-06, + "loss": 4.1281, + "step": 4489 + }, + { + "epoch": 0.38268132617403905, + "grad_norm": 96.7076870974422, + "learning_rate": 9.976931988786277e-06, + "loss": 4.7437, + "step": 4490 + }, + { + "epoch": 0.3827665558680644, + "grad_norm": 62.00110427388447, + "learning_rate": 9.976884388963324e-06, + "loss": 3.5507, + "step": 4491 + }, + { + "epoch": 0.3828517855620898, + "grad_norm": 46.716995083107214, + "learning_rate": 9.97683674019469e-06, + "loss": 3.2258, + "step": 4492 + }, + { + "epoch": 0.38293701525611523, + "grad_norm": 244.82762587461235, + "learning_rate": 9.976789042480846e-06, + "loss": 5.0334, + "step": 4493 + }, + { + "epoch": 0.38302224495014064, + "grad_norm": 178.10535917706798, + "learning_rate": 9.976741295822258e-06, + "loss": 3.6835, + "step": 4494 + }, + { + "epoch": 0.383107474644166, + "grad_norm": 112.14509505764246, + "learning_rate": 9.9766935002194e-06, + "loss": 4.5667, + "step": 4495 + }, + { + "epoch": 0.3831927043381914, + "grad_norm": 92.1048036962047, + "learning_rate": 9.976645655672736e-06, + "loss": 4.0129, + "step": 4496 + }, + { + "epoch": 0.3832779340322168, + "grad_norm": 152.08779637580014, + "learning_rate": 9.97659776218274e-06, + "loss": 3.8304, + "step": 4497 + }, + { + "epoch": 0.38336316372624224, + "grad_norm": 45.47911754812088, + "learning_rate": 9.976549819749883e-06, + "loss": 3.7607, + "step": 4498 + }, + { + "epoch": 0.3834483934202676, + "grad_norm": 89.50214362227376, + "learning_rate": 9.976501828374635e-06, + "loss": 3.4965, + "step": 4499 + }, + { + "epoch": 0.383533623114293, + "grad_norm": 140.01295282322147, + "learning_rate": 9.97645378805747e-06, + "loss": 4.2561, + "step": 4500 + }, + { + "epoch": 0.3836188528083184, + "grad_norm": 52.05445238020751, + "learning_rate": 9.97640569879886e-06, + "loss": 3.7474, + "step": 4501 + }, + { + "epoch": 0.38370408250234384, + "grad_norm": 76.58123128009912, + "learning_rate": 9.976357560599277e-06, + "loss": 4.3511, + "step": 4502 + }, + { + "epoch": 0.3837893121963692, + "grad_norm": 50.18749162978012, + "learning_rate": 9.976309373459194e-06, + "loss": 3.6235, + "step": 4503 + }, + { + "epoch": 0.3838745418903946, + "grad_norm": 100.29814959011982, + "learning_rate": 9.976261137379086e-06, + "loss": 3.9993, + "step": 4504 + }, + { + "epoch": 0.38395977158442, + "grad_norm": 137.88324701823157, + "learning_rate": 9.976212852359426e-06, + "loss": 4.2568, + "step": 4505 + }, + { + "epoch": 0.38404500127844543, + "grad_norm": 121.38996855151055, + "learning_rate": 9.976164518400692e-06, + "loss": 4.6859, + "step": 4506 + }, + { + "epoch": 0.3841302309724708, + "grad_norm": 173.32400473739176, + "learning_rate": 9.976116135503357e-06, + "loss": 4.0076, + "step": 4507 + }, + { + "epoch": 0.3842154606664962, + "grad_norm": 116.06842412851648, + "learning_rate": 9.976067703667895e-06, + "loss": 3.7617, + "step": 4508 + }, + { + "epoch": 0.3843006903605216, + "grad_norm": 75.80719376941148, + "learning_rate": 9.976019222894786e-06, + "loss": 3.6521, + "step": 4509 + }, + { + "epoch": 0.384385920054547, + "grad_norm": 50.954772750176105, + "learning_rate": 9.975970693184507e-06, + "loss": 4.3497, + "step": 4510 + }, + { + "epoch": 0.3844711497485724, + "grad_norm": 50.82494945033514, + "learning_rate": 9.97592211453753e-06, + "loss": 3.3795, + "step": 4511 + }, + { + "epoch": 0.3845563794425978, + "grad_norm": 110.05020065556958, + "learning_rate": 9.975873486954339e-06, + "loss": 3.6704, + "step": 4512 + }, + { + "epoch": 0.3846416091366232, + "grad_norm": 57.199053623735864, + "learning_rate": 9.975824810435409e-06, + "loss": 4.1539, + "step": 4513 + }, + { + "epoch": 0.3847268388306486, + "grad_norm": 55.04874604457098, + "learning_rate": 9.975776084981218e-06, + "loss": 2.9578, + "step": 4514 + }, + { + "epoch": 0.384812068524674, + "grad_norm": 84.29754976003605, + "learning_rate": 9.975727310592247e-06, + "loss": 4.5565, + "step": 4515 + }, + { + "epoch": 0.3848972982186994, + "grad_norm": 49.611253954498615, + "learning_rate": 9.975678487268974e-06, + "loss": 2.5763, + "step": 4516 + }, + { + "epoch": 0.3849825279127248, + "grad_norm": 106.34569949622927, + "learning_rate": 9.975629615011882e-06, + "loss": 3.1905, + "step": 4517 + }, + { + "epoch": 0.3850677576067502, + "grad_norm": 80.56651981932863, + "learning_rate": 9.97558069382145e-06, + "loss": 3.6852, + "step": 4518 + }, + { + "epoch": 0.3851529873007756, + "grad_norm": 78.97179255165716, + "learning_rate": 9.975531723698158e-06, + "loss": 3.927, + "step": 4519 + }, + { + "epoch": 0.385238216994801, + "grad_norm": 51.281794286703686, + "learning_rate": 9.97548270464249e-06, + "loss": 3.3608, + "step": 4520 + }, + { + "epoch": 0.3853234466888264, + "grad_norm": 95.32156254665529, + "learning_rate": 9.975433636654923e-06, + "loss": 5.0388, + "step": 4521 + }, + { + "epoch": 0.3854086763828518, + "grad_norm": 145.9722465709633, + "learning_rate": 9.975384519735946e-06, + "loss": 4.8195, + "step": 4522 + }, + { + "epoch": 0.38549390607687717, + "grad_norm": 102.06742681619303, + "learning_rate": 9.97533535388604e-06, + "loss": 3.5963, + "step": 4523 + }, + { + "epoch": 0.3855791357709026, + "grad_norm": 36.812724808285786, + "learning_rate": 9.975286139105686e-06, + "loss": 3.1416, + "step": 4524 + }, + { + "epoch": 0.385664365464928, + "grad_norm": 75.10689736046065, + "learning_rate": 9.97523687539537e-06, + "loss": 4.0619, + "step": 4525 + }, + { + "epoch": 0.3857495951589534, + "grad_norm": 70.9247089822408, + "learning_rate": 9.975187562755576e-06, + "loss": 4.3188, + "step": 4526 + }, + { + "epoch": 0.38583482485297876, + "grad_norm": 78.85746934686279, + "learning_rate": 9.97513820118679e-06, + "loss": 4.0081, + "step": 4527 + }, + { + "epoch": 0.3859200545470042, + "grad_norm": 119.63355789324812, + "learning_rate": 9.975088790689497e-06, + "loss": 3.7986, + "step": 4528 + }, + { + "epoch": 0.3860052842410296, + "grad_norm": 49.4147224877708, + "learning_rate": 9.975039331264184e-06, + "loss": 2.8116, + "step": 4529 + }, + { + "epoch": 0.38609051393505495, + "grad_norm": 93.56395490901726, + "learning_rate": 9.974989822911335e-06, + "loss": 4.0291, + "step": 4530 + }, + { + "epoch": 0.38617574362908036, + "grad_norm": 69.81634208154132, + "learning_rate": 9.974940265631436e-06, + "loss": 3.889, + "step": 4531 + }, + { + "epoch": 0.38626097332310577, + "grad_norm": 56.77526413247323, + "learning_rate": 9.974890659424978e-06, + "loss": 3.9892, + "step": 4532 + }, + { + "epoch": 0.3863462030171312, + "grad_norm": 62.73810296047534, + "learning_rate": 9.974841004292446e-06, + "loss": 3.958, + "step": 4533 + }, + { + "epoch": 0.38643143271115654, + "grad_norm": 76.89344696850097, + "learning_rate": 9.974791300234332e-06, + "loss": 3.562, + "step": 4534 + }, + { + "epoch": 0.38651666240518195, + "grad_norm": 81.29331119934346, + "learning_rate": 9.97474154725112e-06, + "loss": 3.9887, + "step": 4535 + }, + { + "epoch": 0.38660189209920737, + "grad_norm": 52.126076977538524, + "learning_rate": 9.974691745343303e-06, + "loss": 2.8209, + "step": 4536 + }, + { + "epoch": 0.3866871217932328, + "grad_norm": 50.845553790591325, + "learning_rate": 9.974641894511368e-06, + "loss": 3.6548, + "step": 4537 + }, + { + "epoch": 0.38677235148725814, + "grad_norm": 58.146865131862675, + "learning_rate": 9.97459199475581e-06, + "loss": 4.713, + "step": 4538 + }, + { + "epoch": 0.38685758118128355, + "grad_norm": 52.52773123157273, + "learning_rate": 9.974542046077112e-06, + "loss": 3.932, + "step": 4539 + }, + { + "epoch": 0.38694281087530896, + "grad_norm": 77.38319418889769, + "learning_rate": 9.974492048475772e-06, + "loss": 3.31, + "step": 4540 + }, + { + "epoch": 0.3870280405693344, + "grad_norm": 69.58178210701922, + "learning_rate": 9.97444200195228e-06, + "loss": 3.354, + "step": 4541 + }, + { + "epoch": 0.38711327026335973, + "grad_norm": 140.5344062166526, + "learning_rate": 9.974391906507126e-06, + "loss": 4.0278, + "step": 4542 + }, + { + "epoch": 0.38719849995738514, + "grad_norm": 39.7938969315902, + "learning_rate": 9.974341762140804e-06, + "loss": 3.1116, + "step": 4543 + }, + { + "epoch": 0.38728372965141056, + "grad_norm": 42.789621851920536, + "learning_rate": 9.97429156885381e-06, + "loss": 3.8226, + "step": 4544 + }, + { + "epoch": 0.38736895934543597, + "grad_norm": 57.728850178515664, + "learning_rate": 9.974241326646632e-06, + "loss": 3.5887, + "step": 4545 + }, + { + "epoch": 0.3874541890394613, + "grad_norm": 55.531205698021886, + "learning_rate": 9.97419103551977e-06, + "loss": 4.3861, + "step": 4546 + }, + { + "epoch": 0.38753941873348674, + "grad_norm": 44.63205157549753, + "learning_rate": 9.974140695473712e-06, + "loss": 3.2787, + "step": 4547 + }, + { + "epoch": 0.38762464842751215, + "grad_norm": 101.41199226875162, + "learning_rate": 9.97409030650896e-06, + "loss": 4.8679, + "step": 4548 + }, + { + "epoch": 0.38770987812153757, + "grad_norm": 87.7705664827696, + "learning_rate": 9.974039868626004e-06, + "loss": 4.2234, + "step": 4549 + }, + { + "epoch": 0.3877951078155629, + "grad_norm": 55.65436481602436, + "learning_rate": 9.973989381825344e-06, + "loss": 3.8486, + "step": 4550 + }, + { + "epoch": 0.38788033750958834, + "grad_norm": 59.50088634502026, + "learning_rate": 9.973938846107474e-06, + "loss": 3.8683, + "step": 4551 + }, + { + "epoch": 0.38796556720361375, + "grad_norm": 45.58615575594585, + "learning_rate": 9.973888261472893e-06, + "loss": 3.0708, + "step": 4552 + }, + { + "epoch": 0.38805079689763916, + "grad_norm": 61.512336226680176, + "learning_rate": 9.973837627922097e-06, + "loss": 3.4765, + "step": 4553 + }, + { + "epoch": 0.3881360265916645, + "grad_norm": 154.57053875602824, + "learning_rate": 9.973786945455583e-06, + "loss": 4.8626, + "step": 4554 + }, + { + "epoch": 0.38822125628568993, + "grad_norm": 106.457165978986, + "learning_rate": 9.973736214073853e-06, + "loss": 4.1109, + "step": 4555 + }, + { + "epoch": 0.38830648597971534, + "grad_norm": 89.5387303339069, + "learning_rate": 9.973685433777403e-06, + "loss": 4.2384, + "step": 4556 + }, + { + "epoch": 0.38839171567374076, + "grad_norm": 42.77127122074847, + "learning_rate": 9.973634604566734e-06, + "loss": 3.3321, + "step": 4557 + }, + { + "epoch": 0.3884769453677661, + "grad_norm": 81.7115637985917, + "learning_rate": 9.973583726442341e-06, + "loss": 4.6499, + "step": 4558 + }, + { + "epoch": 0.3885621750617915, + "grad_norm": 54.87195919315742, + "learning_rate": 9.973532799404733e-06, + "loss": 3.4956, + "step": 4559 + }, + { + "epoch": 0.38864740475581694, + "grad_norm": 77.95524008481439, + "learning_rate": 9.973481823454403e-06, + "loss": 4.1157, + "step": 4560 + }, + { + "epoch": 0.38873263444984235, + "grad_norm": 96.30636598853569, + "learning_rate": 9.973430798591857e-06, + "loss": 4.1658, + "step": 4561 + }, + { + "epoch": 0.3888178641438677, + "grad_norm": 105.71693099038886, + "learning_rate": 9.973379724817595e-06, + "loss": 4.1935, + "step": 4562 + }, + { + "epoch": 0.3889030938378931, + "grad_norm": 37.70700681828784, + "learning_rate": 9.97332860213212e-06, + "loss": 2.5476, + "step": 4563 + }, + { + "epoch": 0.38898832353191853, + "grad_norm": 67.31248224136968, + "learning_rate": 9.973277430535934e-06, + "loss": 4.2461, + "step": 4564 + }, + { + "epoch": 0.3890735532259439, + "grad_norm": 66.3262661672975, + "learning_rate": 9.97322621002954e-06, + "loss": 3.8901, + "step": 4565 + }, + { + "epoch": 0.3891587829199693, + "grad_norm": 54.941617825796826, + "learning_rate": 9.973174940613446e-06, + "loss": 2.8548, + "step": 4566 + }, + { + "epoch": 0.3892440126139947, + "grad_norm": 37.637446020162656, + "learning_rate": 9.973123622288148e-06, + "loss": 2.8658, + "step": 4567 + }, + { + "epoch": 0.38932924230802013, + "grad_norm": 103.98390120614648, + "learning_rate": 9.973072255054158e-06, + "loss": 4.4335, + "step": 4568 + }, + { + "epoch": 0.3894144720020455, + "grad_norm": 73.52184811521906, + "learning_rate": 9.973020838911977e-06, + "loss": 2.9762, + "step": 4569 + }, + { + "epoch": 0.3894997016960709, + "grad_norm": 58.40645724442088, + "learning_rate": 9.972969373862114e-06, + "loss": 4.2062, + "step": 4570 + }, + { + "epoch": 0.3895849313900963, + "grad_norm": 76.84684151435829, + "learning_rate": 9.972917859905072e-06, + "loss": 4.6077, + "step": 4571 + }, + { + "epoch": 0.3896701610841217, + "grad_norm": 59.71707579168221, + "learning_rate": 9.972866297041359e-06, + "loss": 3.8523, + "step": 4572 + }, + { + "epoch": 0.3897553907781471, + "grad_norm": 37.59275325161427, + "learning_rate": 9.972814685271481e-06, + "loss": 2.5914, + "step": 4573 + }, + { + "epoch": 0.3898406204721725, + "grad_norm": 64.08457619776419, + "learning_rate": 9.972763024595947e-06, + "loss": 3.5862, + "step": 4574 + }, + { + "epoch": 0.3899258501661979, + "grad_norm": 40.17949644282979, + "learning_rate": 9.972711315015265e-06, + "loss": 3.8887, + "step": 4575 + }, + { + "epoch": 0.3900110798602233, + "grad_norm": 64.7230029719891, + "learning_rate": 9.972659556529945e-06, + "loss": 3.7715, + "step": 4576 + }, + { + "epoch": 0.3900963095542487, + "grad_norm": 61.45082966484686, + "learning_rate": 9.97260774914049e-06, + "loss": 3.4629, + "step": 4577 + }, + { + "epoch": 0.3901815392482741, + "grad_norm": 96.1199279917023, + "learning_rate": 9.972555892847418e-06, + "loss": 3.877, + "step": 4578 + }, + { + "epoch": 0.3902667689422995, + "grad_norm": 198.90720392201254, + "learning_rate": 9.972503987651233e-06, + "loss": 3.9835, + "step": 4579 + }, + { + "epoch": 0.3903519986363249, + "grad_norm": 80.60322013159913, + "learning_rate": 9.972452033552447e-06, + "loss": 3.1145, + "step": 4580 + }, + { + "epoch": 0.39043722833035027, + "grad_norm": 37.22301948209143, + "learning_rate": 9.97240003055157e-06, + "loss": 3.6618, + "step": 4581 + }, + { + "epoch": 0.3905224580243757, + "grad_norm": 49.46819747197378, + "learning_rate": 9.972347978649116e-06, + "loss": 3.7737, + "step": 4582 + }, + { + "epoch": 0.3906076877184011, + "grad_norm": 150.42329671032863, + "learning_rate": 9.972295877845594e-06, + "loss": 4.2408, + "step": 4583 + }, + { + "epoch": 0.3906929174124265, + "grad_norm": 78.88888700388611, + "learning_rate": 9.97224372814152e-06, + "loss": 4.1064, + "step": 4584 + }, + { + "epoch": 0.39077814710645187, + "grad_norm": 223.7612073171264, + "learning_rate": 9.972191529537405e-06, + "loss": 4.3805, + "step": 4585 + }, + { + "epoch": 0.3908633768004773, + "grad_norm": 43.98821218750787, + "learning_rate": 9.972139282033759e-06, + "loss": 4.1764, + "step": 4586 + }, + { + "epoch": 0.3909486064945027, + "grad_norm": 89.4480188408694, + "learning_rate": 9.972086985631103e-06, + "loss": 4.8793, + "step": 4587 + }, + { + "epoch": 0.3910338361885281, + "grad_norm": 36.3312895021546, + "learning_rate": 9.972034640329944e-06, + "loss": 3.7576, + "step": 4588 + }, + { + "epoch": 0.39111906588255346, + "grad_norm": 48.256781441360985, + "learning_rate": 9.9719822461308e-06, + "loss": 3.199, + "step": 4589 + }, + { + "epoch": 0.3912042955765789, + "grad_norm": 122.66812734050856, + "learning_rate": 9.971929803034188e-06, + "loss": 4.4797, + "step": 4590 + }, + { + "epoch": 0.3912895252706043, + "grad_norm": 43.30544447161124, + "learning_rate": 9.971877311040621e-06, + "loss": 3.9416, + "step": 4591 + }, + { + "epoch": 0.3913747549646297, + "grad_norm": 76.81688838015258, + "learning_rate": 9.971824770150615e-06, + "loss": 4.3379, + "step": 4592 + }, + { + "epoch": 0.39145998465865506, + "grad_norm": 149.41237143504517, + "learning_rate": 9.97177218036469e-06, + "loss": 5.3052, + "step": 4593 + }, + { + "epoch": 0.39154521435268047, + "grad_norm": 69.47832715832728, + "learning_rate": 9.971719541683362e-06, + "loss": 3.6154, + "step": 4594 + }, + { + "epoch": 0.3916304440467059, + "grad_norm": 83.56799966463933, + "learning_rate": 9.971666854107146e-06, + "loss": 4.5739, + "step": 4595 + }, + { + "epoch": 0.3917156737407313, + "grad_norm": 64.09626878015516, + "learning_rate": 9.971614117636564e-06, + "loss": 4.309, + "step": 4596 + }, + { + "epoch": 0.39180090343475665, + "grad_norm": 56.56038807922973, + "learning_rate": 9.97156133227213e-06, + "loss": 3.661, + "step": 4597 + }, + { + "epoch": 0.39188613312878207, + "grad_norm": 199.66021649951625, + "learning_rate": 9.971508498014366e-06, + "loss": 3.035, + "step": 4598 + }, + { + "epoch": 0.3919713628228075, + "grad_norm": 540.0141624971089, + "learning_rate": 9.971455614863793e-06, + "loss": 2.3726, + "step": 4599 + }, + { + "epoch": 0.3920565925168329, + "grad_norm": 147.69810977054613, + "learning_rate": 9.97140268282093e-06, + "loss": 5.6554, + "step": 4600 + }, + { + "epoch": 0.39214182221085825, + "grad_norm": 45.09369672803256, + "learning_rate": 9.971349701886294e-06, + "loss": 2.7847, + "step": 4601 + }, + { + "epoch": 0.39222705190488366, + "grad_norm": 124.56697338384933, + "learning_rate": 9.971296672060412e-06, + "loss": 4.3843, + "step": 4602 + }, + { + "epoch": 0.3923122815989091, + "grad_norm": 48.56305703690648, + "learning_rate": 9.971243593343803e-06, + "loss": 3.2498, + "step": 4603 + }, + { + "epoch": 0.39239751129293443, + "grad_norm": 77.81729463791363, + "learning_rate": 9.971190465736986e-06, + "loss": 4.3272, + "step": 4604 + }, + { + "epoch": 0.39248274098695984, + "grad_norm": 1470.499210865023, + "learning_rate": 9.971137289240487e-06, + "loss": 4.9119, + "step": 4605 + }, + { + "epoch": 0.39256797068098526, + "grad_norm": 147.37877932047348, + "learning_rate": 9.97108406385483e-06, + "loss": 4.9121, + "step": 4606 + }, + { + "epoch": 0.39265320037501067, + "grad_norm": 41.856768515969186, + "learning_rate": 9.971030789580534e-06, + "loss": 3.5338, + "step": 4607 + }, + { + "epoch": 0.392738430069036, + "grad_norm": 88.21671011218778, + "learning_rate": 9.970977466418129e-06, + "loss": 4.7369, + "step": 4608 + }, + { + "epoch": 0.39282365976306144, + "grad_norm": 59.36878942359035, + "learning_rate": 9.970924094368132e-06, + "loss": 4.4527, + "step": 4609 + }, + { + "epoch": 0.39290888945708685, + "grad_norm": 97.96122164511863, + "learning_rate": 9.970870673431074e-06, + "loss": 3.7297, + "step": 4610 + }, + { + "epoch": 0.39299411915111226, + "grad_norm": 86.82147779586133, + "learning_rate": 9.970817203607477e-06, + "loss": 4.0823, + "step": 4611 + }, + { + "epoch": 0.3930793488451376, + "grad_norm": 65.35150846846423, + "learning_rate": 9.970763684897868e-06, + "loss": 3.9115, + "step": 4612 + }, + { + "epoch": 0.39316457853916303, + "grad_norm": 112.22055211719965, + "learning_rate": 9.970710117302774e-06, + "loss": 4.372, + "step": 4613 + }, + { + "epoch": 0.39324980823318845, + "grad_norm": 39.89096276180924, + "learning_rate": 9.97065650082272e-06, + "loss": 3.3754, + "step": 4614 + }, + { + "epoch": 0.39333503792721386, + "grad_norm": 68.75917219361408, + "learning_rate": 9.970602835458236e-06, + "loss": 4.4944, + "step": 4615 + }, + { + "epoch": 0.3934202676212392, + "grad_norm": 117.18824479398252, + "learning_rate": 9.970549121209848e-06, + "loss": 4.3731, + "step": 4616 + }, + { + "epoch": 0.39350549731526463, + "grad_norm": 137.11849953691473, + "learning_rate": 9.970495358078085e-06, + "loss": 4.5762, + "step": 4617 + }, + { + "epoch": 0.39359072700929004, + "grad_norm": 192.79241363952693, + "learning_rate": 9.970441546063473e-06, + "loss": 4.1591, + "step": 4618 + }, + { + "epoch": 0.39367595670331545, + "grad_norm": 91.54097985425581, + "learning_rate": 9.970387685166545e-06, + "loss": 5.0157, + "step": 4619 + }, + { + "epoch": 0.3937611863973408, + "grad_norm": 88.31179432301056, + "learning_rate": 9.97033377538783e-06, + "loss": 5.7087, + "step": 4620 + }, + { + "epoch": 0.3938464160913662, + "grad_norm": 71.06183573155177, + "learning_rate": 9.970279816727856e-06, + "loss": 5.4726, + "step": 4621 + }, + { + "epoch": 0.39393164578539164, + "grad_norm": 38.592820683520635, + "learning_rate": 9.970225809187156e-06, + "loss": 3.1276, + "step": 4622 + }, + { + "epoch": 0.39401687547941705, + "grad_norm": 59.953998596698376, + "learning_rate": 9.970171752766258e-06, + "loss": 4.5104, + "step": 4623 + }, + { + "epoch": 0.3941021051734424, + "grad_norm": 180.41955305243397, + "learning_rate": 9.970117647465698e-06, + "loss": 1.6991, + "step": 4624 + }, + { + "epoch": 0.3941873348674678, + "grad_norm": 61.69238949249836, + "learning_rate": 9.970063493286005e-06, + "loss": 4.5958, + "step": 4625 + }, + { + "epoch": 0.39427256456149323, + "grad_norm": 38.080665108798094, + "learning_rate": 9.970009290227712e-06, + "loss": 2.5252, + "step": 4626 + }, + { + "epoch": 0.39435779425551865, + "grad_norm": 85.62307004859954, + "learning_rate": 9.969955038291354e-06, + "loss": 4.0421, + "step": 4627 + }, + { + "epoch": 0.394443023949544, + "grad_norm": 130.72594332343638, + "learning_rate": 9.969900737477463e-06, + "loss": 3.944, + "step": 4628 + }, + { + "epoch": 0.3945282536435694, + "grad_norm": 105.03656279481477, + "learning_rate": 9.96984638778657e-06, + "loss": 4.5389, + "step": 4629 + }, + { + "epoch": 0.39461348333759483, + "grad_norm": 132.77931857153686, + "learning_rate": 9.969791989219216e-06, + "loss": 3.4358, + "step": 4630 + }, + { + "epoch": 0.39469871303162024, + "grad_norm": 50.97902138964264, + "learning_rate": 9.96973754177593e-06, + "loss": 2.8766, + "step": 4631 + }, + { + "epoch": 0.3947839427256456, + "grad_norm": 56.91841635987828, + "learning_rate": 9.969683045457252e-06, + "loss": 3.6731, + "step": 4632 + }, + { + "epoch": 0.394869172419671, + "grad_norm": 69.3126217740489, + "learning_rate": 9.969628500263717e-06, + "loss": 3.668, + "step": 4633 + }, + { + "epoch": 0.3949544021136964, + "grad_norm": 266.86944836367786, + "learning_rate": 9.969573906195859e-06, + "loss": 5.0765, + "step": 4634 + }, + { + "epoch": 0.39503963180772184, + "grad_norm": 125.56298256665733, + "learning_rate": 9.969519263254215e-06, + "loss": 3.8395, + "step": 4635 + }, + { + "epoch": 0.3951248615017472, + "grad_norm": 258.16876298039915, + "learning_rate": 9.969464571439326e-06, + "loss": 4.2252, + "step": 4636 + }, + { + "epoch": 0.3952100911957726, + "grad_norm": 42.615458957764595, + "learning_rate": 9.969409830751727e-06, + "loss": 3.4934, + "step": 4637 + }, + { + "epoch": 0.395295320889798, + "grad_norm": 76.29152721493145, + "learning_rate": 9.969355041191956e-06, + "loss": 4.5303, + "step": 4638 + }, + { + "epoch": 0.39538055058382343, + "grad_norm": 112.409913263592, + "learning_rate": 9.969300202760553e-06, + "loss": 4.9536, + "step": 4639 + }, + { + "epoch": 0.3954657802778488, + "grad_norm": 65.25669011330963, + "learning_rate": 9.969245315458059e-06, + "loss": 4.5411, + "step": 4640 + }, + { + "epoch": 0.3955510099718742, + "grad_norm": 237.16401252498335, + "learning_rate": 9.96919037928501e-06, + "loss": 5.0321, + "step": 4641 + }, + { + "epoch": 0.3956362396658996, + "grad_norm": 118.33307221828362, + "learning_rate": 9.969135394241948e-06, + "loss": 4.169, + "step": 4642 + }, + { + "epoch": 0.39572146935992497, + "grad_norm": 46.39385018229466, + "learning_rate": 9.969080360329414e-06, + "loss": 3.7554, + "step": 4643 + }, + { + "epoch": 0.3958066990539504, + "grad_norm": 658.7680260161379, + "learning_rate": 9.96902527754795e-06, + "loss": 3.3399, + "step": 4644 + }, + { + "epoch": 0.3958919287479758, + "grad_norm": 136.5462942600915, + "learning_rate": 9.968970145898097e-06, + "loss": 4.552, + "step": 4645 + }, + { + "epoch": 0.3959771584420012, + "grad_norm": 49.51055814251923, + "learning_rate": 9.968914965380396e-06, + "loss": 3.6762, + "step": 4646 + }, + { + "epoch": 0.39606238813602657, + "grad_norm": 36.93380012376101, + "learning_rate": 9.968859735995391e-06, + "loss": 2.8129, + "step": 4647 + }, + { + "epoch": 0.396147617830052, + "grad_norm": 93.92195933834068, + "learning_rate": 9.968804457743626e-06, + "loss": 4.0409, + "step": 4648 + }, + { + "epoch": 0.3962328475240774, + "grad_norm": 143.08931461135242, + "learning_rate": 9.968749130625644e-06, + "loss": 5.095, + "step": 4649 + }, + { + "epoch": 0.3963180772181028, + "grad_norm": 60.059906833930675, + "learning_rate": 9.968693754641987e-06, + "loss": 3.8729, + "step": 4650 + }, + { + "epoch": 0.39640330691212816, + "grad_norm": 58.8665178501061, + "learning_rate": 9.968638329793202e-06, + "loss": 3.8971, + "step": 4651 + }, + { + "epoch": 0.3964885366061536, + "grad_norm": 97.47524993981642, + "learning_rate": 9.968582856079834e-06, + "loss": 5.9662, + "step": 4652 + }, + { + "epoch": 0.396573766300179, + "grad_norm": 287.4068232375016, + "learning_rate": 9.968527333502426e-06, + "loss": 4.6423, + "step": 4653 + }, + { + "epoch": 0.3966589959942044, + "grad_norm": 48.805909147096244, + "learning_rate": 9.968471762061526e-06, + "loss": 3.8105, + "step": 4654 + }, + { + "epoch": 0.39674422568822976, + "grad_norm": 59.37584916831378, + "learning_rate": 9.968416141757684e-06, + "loss": 2.6861, + "step": 4655 + }, + { + "epoch": 0.39682945538225517, + "grad_norm": 56.491241181418445, + "learning_rate": 9.968360472591442e-06, + "loss": 3.6195, + "step": 4656 + }, + { + "epoch": 0.3969146850762806, + "grad_norm": 59.72785052234774, + "learning_rate": 9.968304754563346e-06, + "loss": 4.3513, + "step": 4657 + }, + { + "epoch": 0.396999914770306, + "grad_norm": 71.29314696155939, + "learning_rate": 9.968248987673949e-06, + "loss": 4.2165, + "step": 4658 + }, + { + "epoch": 0.39708514446433135, + "grad_norm": 101.77296176078555, + "learning_rate": 9.968193171923798e-06, + "loss": 5.0149, + "step": 4659 + }, + { + "epoch": 0.39717037415835676, + "grad_norm": 63.90007565431731, + "learning_rate": 9.968137307313441e-06, + "loss": 3.1522, + "step": 4660 + }, + { + "epoch": 0.3972556038523822, + "grad_norm": 60.62767201789494, + "learning_rate": 9.968081393843427e-06, + "loss": 5.2485, + "step": 4661 + }, + { + "epoch": 0.3973408335464076, + "grad_norm": 65.50649544732363, + "learning_rate": 9.968025431514308e-06, + "loss": 3.9142, + "step": 4662 + }, + { + "epoch": 0.39742606324043295, + "grad_norm": 88.48232881947727, + "learning_rate": 9.967969420326633e-06, + "loss": 3.262, + "step": 4663 + }, + { + "epoch": 0.39751129293445836, + "grad_norm": 129.05567802750238, + "learning_rate": 9.967913360280952e-06, + "loss": 4.5038, + "step": 4664 + }, + { + "epoch": 0.3975965226284838, + "grad_norm": 82.09916841776739, + "learning_rate": 9.967857251377816e-06, + "loss": 5.4358, + "step": 4665 + }, + { + "epoch": 0.3976817523225092, + "grad_norm": 95.60701317287543, + "learning_rate": 9.967801093617779e-06, + "loss": 4.583, + "step": 4666 + }, + { + "epoch": 0.39776698201653454, + "grad_norm": 106.7880227364928, + "learning_rate": 9.967744887001393e-06, + "loss": 4.4903, + "step": 4667 + }, + { + "epoch": 0.39785221171055996, + "grad_norm": 45.0210173920571, + "learning_rate": 9.967688631529209e-06, + "loss": 3.0691, + "step": 4668 + }, + { + "epoch": 0.39793744140458537, + "grad_norm": 75.92591423632344, + "learning_rate": 9.967632327201782e-06, + "loss": 3.3702, + "step": 4669 + }, + { + "epoch": 0.3980226710986108, + "grad_norm": 85.40840535183848, + "learning_rate": 9.967575974019665e-06, + "loss": 5.4679, + "step": 4670 + }, + { + "epoch": 0.39810790079263614, + "grad_norm": 36.12511163601138, + "learning_rate": 9.96751957198341e-06, + "loss": 3.2352, + "step": 4671 + }, + { + "epoch": 0.39819313048666155, + "grad_norm": 102.38683905881975, + "learning_rate": 9.967463121093576e-06, + "loss": 4.1738, + "step": 4672 + }, + { + "epoch": 0.39827836018068696, + "grad_norm": 90.9234298149734, + "learning_rate": 9.967406621350717e-06, + "loss": 5.2735, + "step": 4673 + }, + { + "epoch": 0.3983635898747124, + "grad_norm": 32.99841883924217, + "learning_rate": 9.967350072755385e-06, + "loss": 2.7038, + "step": 4674 + }, + { + "epoch": 0.39844881956873773, + "grad_norm": 42.55675957266211, + "learning_rate": 9.967293475308142e-06, + "loss": 4.4009, + "step": 4675 + }, + { + "epoch": 0.39853404926276315, + "grad_norm": 81.49783403039851, + "learning_rate": 9.96723682900954e-06, + "loss": 4.2321, + "step": 4676 + }, + { + "epoch": 0.39861927895678856, + "grad_norm": 157.58002663633687, + "learning_rate": 9.967180133860137e-06, + "loss": 4.353, + "step": 4677 + }, + { + "epoch": 0.39870450865081397, + "grad_norm": 56.72016499010354, + "learning_rate": 9.967123389860493e-06, + "loss": 4.1933, + "step": 4678 + }, + { + "epoch": 0.39878973834483933, + "grad_norm": 39.383815248030835, + "learning_rate": 9.967066597011161e-06, + "loss": 2.9063, + "step": 4679 + }, + { + "epoch": 0.39887496803886474, + "grad_norm": 40.886123526803715, + "learning_rate": 9.967009755312707e-06, + "loss": 3.6002, + "step": 4680 + }, + { + "epoch": 0.39896019773289015, + "grad_norm": 34.800140462800655, + "learning_rate": 9.966952864765682e-06, + "loss": 3.1841, + "step": 4681 + }, + { + "epoch": 0.3990454274269155, + "grad_norm": 69.38396857800889, + "learning_rate": 9.966895925370651e-06, + "loss": 4.5909, + "step": 4682 + }, + { + "epoch": 0.3991306571209409, + "grad_norm": 36.18437635315109, + "learning_rate": 9.966838937128172e-06, + "loss": 2.7089, + "step": 4683 + }, + { + "epoch": 0.39921588681496634, + "grad_norm": 48.655857916569516, + "learning_rate": 9.966781900038804e-06, + "loss": 3.9883, + "step": 4684 + }, + { + "epoch": 0.39930111650899175, + "grad_norm": 101.87973275777777, + "learning_rate": 9.966724814103113e-06, + "loss": 4.6687, + "step": 4685 + }, + { + "epoch": 0.3993863462030171, + "grad_norm": 134.59988054765603, + "learning_rate": 9.966667679321654e-06, + "loss": 4.627, + "step": 4686 + }, + { + "epoch": 0.3994715758970425, + "grad_norm": 55.17730783397517, + "learning_rate": 9.966610495694993e-06, + "loss": 3.638, + "step": 4687 + }, + { + "epoch": 0.39955680559106793, + "grad_norm": 84.43924684123101, + "learning_rate": 9.966553263223693e-06, + "loss": 3.6368, + "step": 4688 + }, + { + "epoch": 0.39964203528509334, + "grad_norm": 57.70855625306666, + "learning_rate": 9.966495981908312e-06, + "loss": 3.2994, + "step": 4689 + }, + { + "epoch": 0.3997272649791187, + "grad_norm": 151.9492232501952, + "learning_rate": 9.966438651749419e-06, + "loss": 4.8496, + "step": 4690 + }, + { + "epoch": 0.3998124946731441, + "grad_norm": 167.7369092248177, + "learning_rate": 9.966381272747573e-06, + "loss": 5.491, + "step": 4691 + }, + { + "epoch": 0.3998977243671695, + "grad_norm": 40.04115307182929, + "learning_rate": 9.966323844903343e-06, + "loss": 2.9558, + "step": 4692 + }, + { + "epoch": 0.39998295406119494, + "grad_norm": 113.86901288309389, + "learning_rate": 9.96626636821729e-06, + "loss": 4.2943, + "step": 4693 + }, + { + "epoch": 0.4000681837552203, + "grad_norm": 45.905038989350835, + "learning_rate": 9.966208842689982e-06, + "loss": 2.0696, + "step": 4694 + }, + { + "epoch": 0.4001534134492457, + "grad_norm": 65.73529036967271, + "learning_rate": 9.966151268321981e-06, + "loss": 4.3564, + "step": 4695 + }, + { + "epoch": 0.4002386431432711, + "grad_norm": 124.83007863111061, + "learning_rate": 9.966093645113859e-06, + "loss": 5.0312, + "step": 4696 + }, + { + "epoch": 0.40032387283729653, + "grad_norm": 56.02139644454264, + "learning_rate": 9.966035973066178e-06, + "loss": 3.2272, + "step": 4697 + }, + { + "epoch": 0.4004091025313219, + "grad_norm": 88.09085103858655, + "learning_rate": 9.965978252179505e-06, + "loss": 3.5744, + "step": 4698 + }, + { + "epoch": 0.4004943322253473, + "grad_norm": 48.26202827545703, + "learning_rate": 9.96592048245441e-06, + "loss": 3.7936, + "step": 4699 + }, + { + "epoch": 0.4005795619193727, + "grad_norm": 45.18013866478271, + "learning_rate": 9.965862663891461e-06, + "loss": 4.1616, + "step": 4700 + }, + { + "epoch": 0.40066479161339813, + "grad_norm": 79.8929543030272, + "learning_rate": 9.965804796491224e-06, + "loss": 3.7403, + "step": 4701 + }, + { + "epoch": 0.4007500213074235, + "grad_norm": 91.21599252259935, + "learning_rate": 9.965746880254272e-06, + "loss": 3.9094, + "step": 4702 + }, + { + "epoch": 0.4008352510014489, + "grad_norm": 41.62892042007941, + "learning_rate": 9.965688915181173e-06, + "loss": 3.763, + "step": 4703 + }, + { + "epoch": 0.4009204806954743, + "grad_norm": 102.53511194453141, + "learning_rate": 9.965630901272496e-06, + "loss": 4.5955, + "step": 4704 + }, + { + "epoch": 0.4010057103894997, + "grad_norm": 111.83052956029904, + "learning_rate": 9.965572838528813e-06, + "loss": 4.0966, + "step": 4705 + }, + { + "epoch": 0.4010909400835251, + "grad_norm": 39.10690079380482, + "learning_rate": 9.965514726950692e-06, + "loss": 3.4803, + "step": 4706 + }, + { + "epoch": 0.4011761697775505, + "grad_norm": 38.91005377805991, + "learning_rate": 9.965456566538708e-06, + "loss": 3.4344, + "step": 4707 + }, + { + "epoch": 0.4012613994715759, + "grad_norm": 46.002837810865124, + "learning_rate": 9.965398357293433e-06, + "loss": 3.9988, + "step": 4708 + }, + { + "epoch": 0.4013466291656013, + "grad_norm": 86.80860280264648, + "learning_rate": 9.965340099215438e-06, + "loss": 4.58, + "step": 4709 + }, + { + "epoch": 0.4014318588596267, + "grad_norm": 53.51089966205711, + "learning_rate": 9.965281792305294e-06, + "loss": 3.7616, + "step": 4710 + }, + { + "epoch": 0.4015170885536521, + "grad_norm": 39.78944933605752, + "learning_rate": 9.965223436563577e-06, + "loss": 3.4845, + "step": 4711 + }, + { + "epoch": 0.4016023182476775, + "grad_norm": 36.92193559234193, + "learning_rate": 9.965165031990862e-06, + "loss": 3.441, + "step": 4712 + }, + { + "epoch": 0.4016875479417029, + "grad_norm": 262.5867871355821, + "learning_rate": 9.96510657858772e-06, + "loss": 5.0558, + "step": 4713 + }, + { + "epoch": 0.4017727776357283, + "grad_norm": 49.51182571097147, + "learning_rate": 9.96504807635473e-06, + "loss": 3.8176, + "step": 4714 + }, + { + "epoch": 0.4018580073297537, + "grad_norm": 31.0842714385147, + "learning_rate": 9.964989525292463e-06, + "loss": 3.6473, + "step": 4715 + }, + { + "epoch": 0.4019432370237791, + "grad_norm": 37.8905419182403, + "learning_rate": 9.964930925401498e-06, + "loss": 3.8477, + "step": 4716 + }, + { + "epoch": 0.40202846671780446, + "grad_norm": 37.6348731660638, + "learning_rate": 9.96487227668241e-06, + "loss": 3.9115, + "step": 4717 + }, + { + "epoch": 0.40211369641182987, + "grad_norm": 80.26433325020909, + "learning_rate": 9.964813579135778e-06, + "loss": 4.9196, + "step": 4718 + }, + { + "epoch": 0.4021989261058553, + "grad_norm": 40.07413653388398, + "learning_rate": 9.964754832762174e-06, + "loss": 3.4756, + "step": 4719 + }, + { + "epoch": 0.4022841557998807, + "grad_norm": 52.329249022088625, + "learning_rate": 9.96469603756218e-06, + "loss": 4.1887, + "step": 4720 + }, + { + "epoch": 0.40236938549390605, + "grad_norm": 41.407830287438784, + "learning_rate": 9.964637193536373e-06, + "loss": 3.7385, + "step": 4721 + }, + { + "epoch": 0.40245461518793146, + "grad_norm": 61.880074776927685, + "learning_rate": 9.964578300685333e-06, + "loss": 3.4737, + "step": 4722 + }, + { + "epoch": 0.4025398448819569, + "grad_norm": 82.82028724644836, + "learning_rate": 9.964519359009639e-06, + "loss": 4.9873, + "step": 4723 + }, + { + "epoch": 0.4026250745759823, + "grad_norm": 175.11437189135475, + "learning_rate": 9.964460368509868e-06, + "loss": 2.7546, + "step": 4724 + }, + { + "epoch": 0.40271030427000765, + "grad_norm": 52.38839179010208, + "learning_rate": 9.964401329186602e-06, + "loss": 3.9886, + "step": 4725 + }, + { + "epoch": 0.40279553396403306, + "grad_norm": 97.92460725075577, + "learning_rate": 9.964342241040422e-06, + "loss": 4.353, + "step": 4726 + }, + { + "epoch": 0.40288076365805847, + "grad_norm": 44.42608877925331, + "learning_rate": 9.96428310407191e-06, + "loss": 3.3739, + "step": 4727 + }, + { + "epoch": 0.4029659933520839, + "grad_norm": 52.52560097385025, + "learning_rate": 9.964223918281644e-06, + "loss": 3.9459, + "step": 4728 + }, + { + "epoch": 0.40305122304610924, + "grad_norm": 57.52911161740925, + "learning_rate": 9.96416468367021e-06, + "loss": 3.7135, + "step": 4729 + }, + { + "epoch": 0.40313645274013465, + "grad_norm": 113.22407907510848, + "learning_rate": 9.96410540023819e-06, + "loss": 4.5677, + "step": 4730 + }, + { + "epoch": 0.40322168243416007, + "grad_norm": 52.85466128474176, + "learning_rate": 9.964046067986162e-06, + "loss": 3.7871, + "step": 4731 + }, + { + "epoch": 0.4033069121281855, + "grad_norm": 48.13368365234068, + "learning_rate": 9.963986686914716e-06, + "loss": 3.2853, + "step": 4732 + }, + { + "epoch": 0.40339214182221084, + "grad_norm": 39.57232439985135, + "learning_rate": 9.963927257024433e-06, + "loss": 3.3377, + "step": 4733 + }, + { + "epoch": 0.40347737151623625, + "grad_norm": 87.38135608014551, + "learning_rate": 9.9638677783159e-06, + "loss": 4.8686, + "step": 4734 + }, + { + "epoch": 0.40356260121026166, + "grad_norm": 151.1327368437993, + "learning_rate": 9.963808250789696e-06, + "loss": 3.5579, + "step": 4735 + }, + { + "epoch": 0.4036478309042871, + "grad_norm": 97.1409179880258, + "learning_rate": 9.963748674446412e-06, + "loss": 3.726, + "step": 4736 + }, + { + "epoch": 0.40373306059831243, + "grad_norm": 108.08928191407354, + "learning_rate": 9.963689049286634e-06, + "loss": 4.4524, + "step": 4737 + }, + { + "epoch": 0.40381829029233784, + "grad_norm": 87.48577203884362, + "learning_rate": 9.963629375310944e-06, + "loss": 4.032, + "step": 4738 + }, + { + "epoch": 0.40390351998636326, + "grad_norm": 188.89694719553924, + "learning_rate": 9.963569652519933e-06, + "loss": 4.0719, + "step": 4739 + }, + { + "epoch": 0.40398874968038867, + "grad_norm": 342.22869935828226, + "learning_rate": 9.963509880914186e-06, + "loss": 3.4083, + "step": 4740 + }, + { + "epoch": 0.404073979374414, + "grad_norm": 92.90482931673377, + "learning_rate": 9.96345006049429e-06, + "loss": 4.7779, + "step": 4741 + }, + { + "epoch": 0.40415920906843944, + "grad_norm": 76.08741562099935, + "learning_rate": 9.963390191260839e-06, + "loss": 3.7157, + "step": 4742 + }, + { + "epoch": 0.40424443876246485, + "grad_norm": 36.5012392993202, + "learning_rate": 9.963330273214413e-06, + "loss": 3.3378, + "step": 4743 + }, + { + "epoch": 0.40432966845649027, + "grad_norm": 40.53389711428854, + "learning_rate": 9.96327030635561e-06, + "loss": 3.0721, + "step": 4744 + }, + { + "epoch": 0.4044148981505156, + "grad_norm": 48.81341538872049, + "learning_rate": 9.963210290685013e-06, + "loss": 3.4065, + "step": 4745 + }, + { + "epoch": 0.40450012784454104, + "grad_norm": 77.6452809944845, + "learning_rate": 9.963150226203218e-06, + "loss": 4.0317, + "step": 4746 + }, + { + "epoch": 0.40458535753856645, + "grad_norm": 38.11787669865391, + "learning_rate": 9.963090112910809e-06, + "loss": 3.4063, + "step": 4747 + }, + { + "epoch": 0.40467058723259186, + "grad_norm": 39.897392966941204, + "learning_rate": 9.963029950808383e-06, + "loss": 2.1925, + "step": 4748 + }, + { + "epoch": 0.4047558169266172, + "grad_norm": 47.933182665962164, + "learning_rate": 9.96296973989653e-06, + "loss": 3.818, + "step": 4749 + }, + { + "epoch": 0.40484104662064263, + "grad_norm": 56.60586600927238, + "learning_rate": 9.962909480175841e-06, + "loss": 2.6678, + "step": 4750 + }, + { + "epoch": 0.40492627631466804, + "grad_norm": 36.713437142515964, + "learning_rate": 9.962849171646908e-06, + "loss": 3.4232, + "step": 4751 + }, + { + "epoch": 0.40501150600869346, + "grad_norm": 151.140080365335, + "learning_rate": 9.962788814310327e-06, + "loss": 5.1994, + "step": 4752 + }, + { + "epoch": 0.4050967357027188, + "grad_norm": 63.69345133399982, + "learning_rate": 9.96272840816669e-06, + "loss": 3.8852, + "step": 4753 + }, + { + "epoch": 0.4051819653967442, + "grad_norm": 81.92027059386093, + "learning_rate": 9.962667953216591e-06, + "loss": 3.6221, + "step": 4754 + }, + { + "epoch": 0.40526719509076964, + "grad_norm": 36.00749347323659, + "learning_rate": 9.962607449460624e-06, + "loss": 3.2859, + "step": 4755 + }, + { + "epoch": 0.405352424784795, + "grad_norm": 230.58291223830633, + "learning_rate": 9.962546896899385e-06, + "loss": 2.4412, + "step": 4756 + }, + { + "epoch": 0.4054376544788204, + "grad_norm": 134.07772706978244, + "learning_rate": 9.962486295533471e-06, + "loss": 6.6816, + "step": 4757 + }, + { + "epoch": 0.4055228841728458, + "grad_norm": 49.15903274641245, + "learning_rate": 9.962425645363474e-06, + "loss": 3.908, + "step": 4758 + }, + { + "epoch": 0.40560811386687123, + "grad_norm": 70.4405747576382, + "learning_rate": 9.962364946389993e-06, + "loss": 3.5695, + "step": 4759 + }, + { + "epoch": 0.4056933435608966, + "grad_norm": 40.68029833330733, + "learning_rate": 9.962304198613626e-06, + "loss": 3.4254, + "step": 4760 + }, + { + "epoch": 0.405778573254922, + "grad_norm": 117.45544693102272, + "learning_rate": 9.962243402034968e-06, + "loss": 4.5292, + "step": 4761 + }, + { + "epoch": 0.4058638029489474, + "grad_norm": 62.7169043548773, + "learning_rate": 9.962182556654617e-06, + "loss": 4.1096, + "step": 4762 + }, + { + "epoch": 0.40594903264297283, + "grad_norm": 71.44721559177744, + "learning_rate": 9.962121662473174e-06, + "loss": 4.6726, + "step": 4763 + }, + { + "epoch": 0.4060342623369982, + "grad_norm": 36.43240847746527, + "learning_rate": 9.962060719491235e-06, + "loss": 3.0164, + "step": 4764 + }, + { + "epoch": 0.4061194920310236, + "grad_norm": 81.97553710056734, + "learning_rate": 9.961999727709401e-06, + "loss": 5.3131, + "step": 4765 + }, + { + "epoch": 0.406204721725049, + "grad_norm": 85.02133564419299, + "learning_rate": 9.961938687128271e-06, + "loss": 4.8859, + "step": 4766 + }, + { + "epoch": 0.4062899514190744, + "grad_norm": 99.5512101596682, + "learning_rate": 9.961877597748446e-06, + "loss": 3.488, + "step": 4767 + }, + { + "epoch": 0.4063751811130998, + "grad_norm": 85.58160944139911, + "learning_rate": 9.961816459570526e-06, + "loss": 4.2449, + "step": 4768 + }, + { + "epoch": 0.4064604108071252, + "grad_norm": 65.4033968766732, + "learning_rate": 9.961755272595113e-06, + "loss": 4.0985, + "step": 4769 + }, + { + "epoch": 0.4065456405011506, + "grad_norm": 46.11119050226467, + "learning_rate": 9.96169403682281e-06, + "loss": 3.7064, + "step": 4770 + }, + { + "epoch": 0.406630870195176, + "grad_norm": 45.1590578221303, + "learning_rate": 9.961632752254215e-06, + "loss": 3.7108, + "step": 4771 + }, + { + "epoch": 0.4067160998892014, + "grad_norm": 49.96935394483503, + "learning_rate": 9.961571418889934e-06, + "loss": 4.1094, + "step": 4772 + }, + { + "epoch": 0.4068013295832268, + "grad_norm": 47.72067555012153, + "learning_rate": 9.961510036730573e-06, + "loss": 2.986, + "step": 4773 + }, + { + "epoch": 0.4068865592772522, + "grad_norm": 78.10772656260976, + "learning_rate": 9.96144860577673e-06, + "loss": 3.6188, + "step": 4774 + }, + { + "epoch": 0.4069717889712776, + "grad_norm": 32.36884802508886, + "learning_rate": 9.96138712602901e-06, + "loss": 2.8385, + "step": 4775 + }, + { + "epoch": 0.40705701866530297, + "grad_norm": 98.18550506618512, + "learning_rate": 9.96132559748802e-06, + "loss": 4.7406, + "step": 4776 + }, + { + "epoch": 0.4071422483593284, + "grad_norm": 82.78031276280288, + "learning_rate": 9.961264020154365e-06, + "loss": 5.2377, + "step": 4777 + }, + { + "epoch": 0.4072274780533538, + "grad_norm": 79.89401167393129, + "learning_rate": 9.961202394028649e-06, + "loss": 4.5741, + "step": 4778 + }, + { + "epoch": 0.4073127077473792, + "grad_norm": 214.49394552113168, + "learning_rate": 9.96114071911148e-06, + "loss": 3.3166, + "step": 4779 + }, + { + "epoch": 0.40739793744140457, + "grad_norm": 48.92702767692178, + "learning_rate": 9.961078995403462e-06, + "loss": 3.9134, + "step": 4780 + }, + { + "epoch": 0.40748316713543, + "grad_norm": 74.27702678474282, + "learning_rate": 9.961017222905204e-06, + "loss": 3.841, + "step": 4781 + }, + { + "epoch": 0.4075683968294554, + "grad_norm": 167.38281872775104, + "learning_rate": 9.960955401617315e-06, + "loss": 4.1637, + "step": 4782 + }, + { + "epoch": 0.4076536265234808, + "grad_norm": 50.362164809802415, + "learning_rate": 9.960893531540397e-06, + "loss": 2.6716, + "step": 4783 + }, + { + "epoch": 0.40773885621750616, + "grad_norm": 96.93712602430583, + "learning_rate": 9.960831612675064e-06, + "loss": 5.4893, + "step": 4784 + }, + { + "epoch": 0.4078240859115316, + "grad_norm": 75.8129894052415, + "learning_rate": 9.960769645021923e-06, + "loss": 3.9515, + "step": 4785 + }, + { + "epoch": 0.407909315605557, + "grad_norm": 97.54713572933568, + "learning_rate": 9.960707628581585e-06, + "loss": 4.1683, + "step": 4786 + }, + { + "epoch": 0.4079945452995824, + "grad_norm": 31.237480261237106, + "learning_rate": 9.960645563354658e-06, + "loss": 1.8644, + "step": 4787 + }, + { + "epoch": 0.40807977499360776, + "grad_norm": 95.89444265369363, + "learning_rate": 9.960583449341752e-06, + "loss": 4.7911, + "step": 4788 + }, + { + "epoch": 0.40816500468763317, + "grad_norm": 91.9829253074527, + "learning_rate": 9.96052128654348e-06, + "loss": 4.1923, + "step": 4789 + }, + { + "epoch": 0.4082502343816586, + "grad_norm": 73.4916204611408, + "learning_rate": 9.960459074960454e-06, + "loss": 4.0689, + "step": 4790 + }, + { + "epoch": 0.408335464075684, + "grad_norm": 59.69910640288802, + "learning_rate": 9.96039681459328e-06, + "loss": 4.2192, + "step": 4791 + }, + { + "epoch": 0.40842069376970935, + "grad_norm": 45.694419742003426, + "learning_rate": 9.960334505442578e-06, + "loss": 2.4426, + "step": 4792 + }, + { + "epoch": 0.40850592346373477, + "grad_norm": 98.60312035115102, + "learning_rate": 9.960272147508957e-06, + "loss": 3.7028, + "step": 4793 + }, + { + "epoch": 0.4085911531577602, + "grad_norm": 52.37887785101728, + "learning_rate": 9.960209740793028e-06, + "loss": 3.1184, + "step": 4794 + }, + { + "epoch": 0.40867638285178554, + "grad_norm": 87.26271737318876, + "learning_rate": 9.960147285295408e-06, + "loss": 5.0106, + "step": 4795 + }, + { + "epoch": 0.40876161254581095, + "grad_norm": 73.82584053553705, + "learning_rate": 9.960084781016711e-06, + "loss": 4.0872, + "step": 4796 + }, + { + "epoch": 0.40884684223983636, + "grad_norm": 154.35856127297507, + "learning_rate": 9.96002222795755e-06, + "loss": 5.5473, + "step": 4797 + }, + { + "epoch": 0.4089320719338618, + "grad_norm": 65.567860277517, + "learning_rate": 9.959959626118544e-06, + "loss": 3.9525, + "step": 4798 + }, + { + "epoch": 0.40901730162788713, + "grad_norm": 50.69987947621856, + "learning_rate": 9.959896975500303e-06, + "loss": 3.4721, + "step": 4799 + }, + { + "epoch": 0.40910253132191254, + "grad_norm": 100.80193939670858, + "learning_rate": 9.959834276103449e-06, + "loss": 4.1513, + "step": 4800 + }, + { + "epoch": 0.40918776101593796, + "grad_norm": 88.22285316043448, + "learning_rate": 9.959771527928593e-06, + "loss": 3.7614, + "step": 4801 + }, + { + "epoch": 0.40927299070996337, + "grad_norm": 64.42149930550693, + "learning_rate": 9.959708730976356e-06, + "loss": 3.4211, + "step": 4802 + }, + { + "epoch": 0.4093582204039887, + "grad_norm": 58.00673493154396, + "learning_rate": 9.959645885247355e-06, + "loss": 3.5406, + "step": 4803 + }, + { + "epoch": 0.40944345009801414, + "grad_norm": 122.92283362633464, + "learning_rate": 9.959582990742207e-06, + "loss": 3.7344, + "step": 4804 + }, + { + "epoch": 0.40952867979203955, + "grad_norm": 30.453539211375286, + "learning_rate": 9.959520047461531e-06, + "loss": 3.484, + "step": 4805 + }, + { + "epoch": 0.40961390948606496, + "grad_norm": 124.72609240811528, + "learning_rate": 9.959457055405945e-06, + "loss": 5.2155, + "step": 4806 + }, + { + "epoch": 0.4096991391800903, + "grad_norm": 50.226542211701954, + "learning_rate": 9.959394014576072e-06, + "loss": 4.4749, + "step": 4807 + }, + { + "epoch": 0.40978436887411573, + "grad_norm": 95.61502779603146, + "learning_rate": 9.959330924972528e-06, + "loss": 5.0452, + "step": 4808 + }, + { + "epoch": 0.40986959856814115, + "grad_norm": 133.92000561270805, + "learning_rate": 9.959267786595935e-06, + "loss": 5.6352, + "step": 4809 + }, + { + "epoch": 0.40995482826216656, + "grad_norm": 57.59961055869613, + "learning_rate": 9.959204599446915e-06, + "loss": 4.1392, + "step": 4810 + }, + { + "epoch": 0.4100400579561919, + "grad_norm": 94.33639613389589, + "learning_rate": 9.959141363526087e-06, + "loss": 4.6014, + "step": 4811 + }, + { + "epoch": 0.41012528765021733, + "grad_norm": 45.46246306711948, + "learning_rate": 9.959078078834074e-06, + "loss": 2.1827, + "step": 4812 + }, + { + "epoch": 0.41021051734424274, + "grad_norm": 68.62453597442676, + "learning_rate": 9.9590147453715e-06, + "loss": 4.2822, + "step": 4813 + }, + { + "epoch": 0.41029574703826815, + "grad_norm": 47.17309347336381, + "learning_rate": 9.958951363138984e-06, + "loss": 4.1269, + "step": 4814 + }, + { + "epoch": 0.4103809767322935, + "grad_norm": 40.44933958784025, + "learning_rate": 9.958887932137154e-06, + "loss": 3.6147, + "step": 4815 + }, + { + "epoch": 0.4104662064263189, + "grad_norm": 75.24025298899018, + "learning_rate": 9.958824452366632e-06, + "loss": 4.5436, + "step": 4816 + }, + { + "epoch": 0.41055143612034434, + "grad_norm": 93.70636018820593, + "learning_rate": 9.958760923828042e-06, + "loss": 4.2422, + "step": 4817 + }, + { + "epoch": 0.41063666581436975, + "grad_norm": 94.23484486429967, + "learning_rate": 9.958697346522007e-06, + "loss": 4.5392, + "step": 4818 + }, + { + "epoch": 0.4107218955083951, + "grad_norm": 88.27770054122605, + "learning_rate": 9.958633720449155e-06, + "loss": 5.2194, + "step": 4819 + }, + { + "epoch": 0.4108071252024205, + "grad_norm": 89.75113314033507, + "learning_rate": 9.958570045610111e-06, + "loss": 4.3232, + "step": 4820 + }, + { + "epoch": 0.41089235489644593, + "grad_norm": 127.39816299366188, + "learning_rate": 9.9585063220055e-06, + "loss": 5.34, + "step": 4821 + }, + { + "epoch": 0.41097758459047135, + "grad_norm": 50.03303044164168, + "learning_rate": 9.95844254963595e-06, + "loss": 3.6417, + "step": 4822 + }, + { + "epoch": 0.4110628142844967, + "grad_norm": 93.43155844026789, + "learning_rate": 9.95837872850209e-06, + "loss": 4.2851, + "step": 4823 + }, + { + "epoch": 0.4111480439785221, + "grad_norm": 118.97218559722126, + "learning_rate": 9.958314858604543e-06, + "loss": 4.9731, + "step": 4824 + }, + { + "epoch": 0.41123327367254753, + "grad_norm": 92.48342855733321, + "learning_rate": 9.95825093994394e-06, + "loss": 4.0815, + "step": 4825 + }, + { + "epoch": 0.41131850336657294, + "grad_norm": 51.05565752395929, + "learning_rate": 9.95818697252091e-06, + "loss": 3.0457, + "step": 4826 + }, + { + "epoch": 0.4114037330605983, + "grad_norm": 83.76922535692269, + "learning_rate": 9.95812295633608e-06, + "loss": 4.4782, + "step": 4827 + }, + { + "epoch": 0.4114889627546237, + "grad_norm": 43.31562278548169, + "learning_rate": 9.958058891390081e-06, + "loss": 3.9374, + "step": 4828 + }, + { + "epoch": 0.4115741924486491, + "grad_norm": 102.3525283440354, + "learning_rate": 9.957994777683545e-06, + "loss": 5.208, + "step": 4829 + }, + { + "epoch": 0.4116594221426745, + "grad_norm": 44.21291622484485, + "learning_rate": 9.957930615217098e-06, + "loss": 4.2621, + "step": 4830 + }, + { + "epoch": 0.4117446518366999, + "grad_norm": 68.1782558366877, + "learning_rate": 9.957866403991373e-06, + "loss": 5.0016, + "step": 4831 + }, + { + "epoch": 0.4118298815307253, + "grad_norm": 59.668354587189945, + "learning_rate": 9.957802144007006e-06, + "loss": 4.1164, + "step": 4832 + }, + { + "epoch": 0.4119151112247507, + "grad_norm": 86.9533009083292, + "learning_rate": 9.95773783526462e-06, + "loss": 4.4214, + "step": 4833 + }, + { + "epoch": 0.4120003409187761, + "grad_norm": 91.59145761634677, + "learning_rate": 9.957673477764855e-06, + "loss": 4.6999, + "step": 4834 + }, + { + "epoch": 0.4120855706128015, + "grad_norm": 51.4218693880107, + "learning_rate": 9.957609071508341e-06, + "loss": 3.5564, + "step": 4835 + }, + { + "epoch": 0.4121708003068269, + "grad_norm": 44.03006764445452, + "learning_rate": 9.957544616495713e-06, + "loss": 3.8315, + "step": 4836 + }, + { + "epoch": 0.4122560300008523, + "grad_norm": 85.75192734702182, + "learning_rate": 9.957480112727603e-06, + "loss": 4.5847, + "step": 4837 + }, + { + "epoch": 0.41234125969487767, + "grad_norm": 150.93700035588233, + "learning_rate": 9.957415560204645e-06, + "loss": 3.7072, + "step": 4838 + }, + { + "epoch": 0.4124264893889031, + "grad_norm": 101.53270356817704, + "learning_rate": 9.957350958927476e-06, + "loss": 4.1282, + "step": 4839 + }, + { + "epoch": 0.4125117190829285, + "grad_norm": 67.7626440669049, + "learning_rate": 9.95728630889673e-06, + "loss": 3.7611, + "step": 4840 + }, + { + "epoch": 0.4125969487769539, + "grad_norm": 45.88567751017798, + "learning_rate": 9.957221610113042e-06, + "loss": 3.8939, + "step": 4841 + }, + { + "epoch": 0.41268217847097927, + "grad_norm": 60.51423985839139, + "learning_rate": 9.957156862577049e-06, + "loss": 4.4225, + "step": 4842 + }, + { + "epoch": 0.4127674081650047, + "grad_norm": 40.294854160510205, + "learning_rate": 9.95709206628939e-06, + "loss": 3.8633, + "step": 4843 + }, + { + "epoch": 0.4128526378590301, + "grad_norm": 53.37771342805763, + "learning_rate": 9.9570272212507e-06, + "loss": 2.2517, + "step": 4844 + }, + { + "epoch": 0.4129378675530555, + "grad_norm": 60.41956970075836, + "learning_rate": 9.956962327461615e-06, + "loss": 2.8538, + "step": 4845 + }, + { + "epoch": 0.41302309724708086, + "grad_norm": 41.41728079546044, + "learning_rate": 9.956897384922776e-06, + "loss": 3.054, + "step": 4846 + }, + { + "epoch": 0.4131083269411063, + "grad_norm": 45.49839241068324, + "learning_rate": 9.956832393634822e-06, + "loss": 4.0501, + "step": 4847 + }, + { + "epoch": 0.4131935566351317, + "grad_norm": 76.31316025379495, + "learning_rate": 9.956767353598389e-06, + "loss": 3.4105, + "step": 4848 + }, + { + "epoch": 0.4132787863291571, + "grad_norm": 99.23115229534045, + "learning_rate": 9.95670226481412e-06, + "loss": 4.2576, + "step": 4849 + }, + { + "epoch": 0.41336401602318246, + "grad_norm": 40.33476271448407, + "learning_rate": 9.956637127282655e-06, + "loss": 3.5441, + "step": 4850 + }, + { + "epoch": 0.41344924571720787, + "grad_norm": 56.904935590332, + "learning_rate": 9.956571941004632e-06, + "loss": 4.2053, + "step": 4851 + }, + { + "epoch": 0.4135344754112333, + "grad_norm": 79.04348240464948, + "learning_rate": 9.956506705980693e-06, + "loss": 3.9453, + "step": 4852 + }, + { + "epoch": 0.4136197051052587, + "grad_norm": 148.068724712612, + "learning_rate": 9.956441422211481e-06, + "loss": 4.464, + "step": 4853 + }, + { + "epoch": 0.41370493479928405, + "grad_norm": 61.388753970856136, + "learning_rate": 9.956376089697638e-06, + "loss": 3.274, + "step": 4854 + }, + { + "epoch": 0.41379016449330946, + "grad_norm": 113.0649935525184, + "learning_rate": 9.956310708439804e-06, + "loss": 3.9113, + "step": 4855 + }, + { + "epoch": 0.4138753941873349, + "grad_norm": 70.92512156331783, + "learning_rate": 9.956245278438623e-06, + "loss": 4.2888, + "step": 4856 + }, + { + "epoch": 0.4139606238813603, + "grad_norm": 44.15627513494163, + "learning_rate": 9.956179799694741e-06, + "loss": 3.4533, + "step": 4857 + }, + { + "epoch": 0.41404585357538565, + "grad_norm": 71.97773058006047, + "learning_rate": 9.956114272208799e-06, + "loss": 4.0718, + "step": 4858 + }, + { + "epoch": 0.41413108326941106, + "grad_norm": 63.93505125616071, + "learning_rate": 9.956048695981444e-06, + "loss": 4.6178, + "step": 4859 + }, + { + "epoch": 0.4142163129634365, + "grad_norm": 87.8840809709514, + "learning_rate": 9.955983071013317e-06, + "loss": 4.1423, + "step": 4860 + }, + { + "epoch": 0.4143015426574619, + "grad_norm": 68.06985733856881, + "learning_rate": 9.955917397305068e-06, + "loss": 3.4635, + "step": 4861 + }, + { + "epoch": 0.41438677235148724, + "grad_norm": 39.793927558061384, + "learning_rate": 9.95585167485734e-06, + "loss": 3.9085, + "step": 4862 + }, + { + "epoch": 0.41447200204551266, + "grad_norm": 72.67178938174152, + "learning_rate": 9.955785903670779e-06, + "loss": 4.1147, + "step": 4863 + }, + { + "epoch": 0.41455723173953807, + "grad_norm": 76.36154840347214, + "learning_rate": 9.955720083746034e-06, + "loss": 4.0714, + "step": 4864 + }, + { + "epoch": 0.4146424614335635, + "grad_norm": 127.9595704425898, + "learning_rate": 9.955654215083752e-06, + "loss": 4.9143, + "step": 4865 + }, + { + "epoch": 0.41472769112758884, + "grad_norm": 75.69331326402786, + "learning_rate": 9.955588297684578e-06, + "loss": 4.5291, + "step": 4866 + }, + { + "epoch": 0.41481292082161425, + "grad_norm": 187.60213547116112, + "learning_rate": 9.955522331549163e-06, + "loss": 4.5481, + "step": 4867 + }, + { + "epoch": 0.41489815051563966, + "grad_norm": 46.29422907706693, + "learning_rate": 9.955456316678156e-06, + "loss": 3.3886, + "step": 4868 + }, + { + "epoch": 0.414983380209665, + "grad_norm": 37.34614314077599, + "learning_rate": 9.955390253072203e-06, + "loss": 3.1812, + "step": 4869 + }, + { + "epoch": 0.41506860990369043, + "grad_norm": 57.429352871823376, + "learning_rate": 9.955324140731958e-06, + "loss": 2.6766, + "step": 4870 + }, + { + "epoch": 0.41515383959771585, + "grad_norm": 38.70722467473146, + "learning_rate": 9.955257979658068e-06, + "loss": 2.9248, + "step": 4871 + }, + { + "epoch": 0.41523906929174126, + "grad_norm": 186.46807564014944, + "learning_rate": 9.955191769851186e-06, + "loss": 5.287, + "step": 4872 + }, + { + "epoch": 0.4153242989857666, + "grad_norm": 39.159864991884156, + "learning_rate": 9.955125511311961e-06, + "loss": 3.1537, + "step": 4873 + }, + { + "epoch": 0.41540952867979203, + "grad_norm": 59.67853956674003, + "learning_rate": 9.955059204041045e-06, + "loss": 4.0974, + "step": 4874 + }, + { + "epoch": 0.41549475837381744, + "grad_norm": 116.52850570283455, + "learning_rate": 9.95499284803909e-06, + "loss": 4.2063, + "step": 4875 + }, + { + "epoch": 0.41557998806784285, + "grad_norm": 42.09943727572899, + "learning_rate": 9.954926443306753e-06, + "loss": 3.3905, + "step": 4876 + }, + { + "epoch": 0.4156652177618682, + "grad_norm": 36.59063131485327, + "learning_rate": 9.95485998984468e-06, + "loss": 3.2467, + "step": 4877 + }, + { + "epoch": 0.4157504474558936, + "grad_norm": 73.71882881489243, + "learning_rate": 9.95479348765353e-06, + "loss": 4.0461, + "step": 4878 + }, + { + "epoch": 0.41583567714991904, + "grad_norm": 99.28411038594781, + "learning_rate": 9.954726936733954e-06, + "loss": 5.7888, + "step": 4879 + }, + { + "epoch": 0.41592090684394445, + "grad_norm": 50.70351238116887, + "learning_rate": 9.954660337086606e-06, + "loss": 3.4009, + "step": 4880 + }, + { + "epoch": 0.4160061365379698, + "grad_norm": 107.33323273363071, + "learning_rate": 9.954593688712144e-06, + "loss": 5.4477, + "step": 4881 + }, + { + "epoch": 0.4160913662319952, + "grad_norm": 45.67506879849117, + "learning_rate": 9.954526991611223e-06, + "loss": 4.1728, + "step": 4882 + }, + { + "epoch": 0.41617659592602063, + "grad_norm": 75.09587673005848, + "learning_rate": 9.954460245784494e-06, + "loss": 4.2798, + "step": 4883 + }, + { + "epoch": 0.41626182562004604, + "grad_norm": 70.29798217196686, + "learning_rate": 9.95439345123262e-06, + "loss": 4.8797, + "step": 4884 + }, + { + "epoch": 0.4163470553140714, + "grad_norm": 278.6346993026776, + "learning_rate": 9.954326607956255e-06, + "loss": 4.5835, + "step": 4885 + }, + { + "epoch": 0.4164322850080968, + "grad_norm": 55.40400226340014, + "learning_rate": 9.954259715956056e-06, + "loss": 4.3798, + "step": 4886 + }, + { + "epoch": 0.4165175147021222, + "grad_norm": 114.00279461677975, + "learning_rate": 9.95419277523268e-06, + "loss": 4.9409, + "step": 4887 + }, + { + "epoch": 0.41660274439614764, + "grad_norm": 57.008550510661415, + "learning_rate": 9.954125785786789e-06, + "loss": 4.5366, + "step": 4888 + }, + { + "epoch": 0.416687974090173, + "grad_norm": 46.782410415300774, + "learning_rate": 9.954058747619037e-06, + "loss": 3.8519, + "step": 4889 + }, + { + "epoch": 0.4167732037841984, + "grad_norm": 35.17087327938136, + "learning_rate": 9.953991660730087e-06, + "loss": 3.0568, + "step": 4890 + }, + { + "epoch": 0.4168584334782238, + "grad_norm": 49.93835895440707, + "learning_rate": 9.953924525120597e-06, + "loss": 3.4682, + "step": 4891 + }, + { + "epoch": 0.41694366317224923, + "grad_norm": 45.67344696093648, + "learning_rate": 9.95385734079123e-06, + "loss": 3.9267, + "step": 4892 + }, + { + "epoch": 0.4170288928662746, + "grad_norm": 64.26820682902905, + "learning_rate": 9.953790107742642e-06, + "loss": 3.9189, + "step": 4893 + }, + { + "epoch": 0.4171141225603, + "grad_norm": 98.67769745724014, + "learning_rate": 9.953722825975498e-06, + "loss": 4.5267, + "step": 4894 + }, + { + "epoch": 0.4171993522543254, + "grad_norm": 99.10252869181426, + "learning_rate": 9.953655495490458e-06, + "loss": 4.6383, + "step": 4895 + }, + { + "epoch": 0.41728458194835083, + "grad_norm": 52.118451116749426, + "learning_rate": 9.953588116288184e-06, + "loss": 4.0438, + "step": 4896 + }, + { + "epoch": 0.4173698116423762, + "grad_norm": 88.00958405048296, + "learning_rate": 9.95352068836934e-06, + "loss": 4.9204, + "step": 4897 + }, + { + "epoch": 0.4174550413364016, + "grad_norm": 65.57635884911556, + "learning_rate": 9.95345321173459e-06, + "loss": 3.9996, + "step": 4898 + }, + { + "epoch": 0.417540271030427, + "grad_norm": 44.5005267798344, + "learning_rate": 9.953385686384593e-06, + "loss": 4.4292, + "step": 4899 + }, + { + "epoch": 0.4176255007244524, + "grad_norm": 37.4327750906673, + "learning_rate": 9.953318112320019e-06, + "loss": 3.5474, + "step": 4900 + }, + { + "epoch": 0.4177107304184778, + "grad_norm": 80.57509799892051, + "learning_rate": 9.953250489541526e-06, + "loss": 3.9659, + "step": 4901 + }, + { + "epoch": 0.4177959601125032, + "grad_norm": 68.32823583405265, + "learning_rate": 9.953182818049784e-06, + "loss": 4.0691, + "step": 4902 + }, + { + "epoch": 0.4178811898065286, + "grad_norm": 48.56201537927854, + "learning_rate": 9.953115097845459e-06, + "loss": 2.2172, + "step": 4903 + }, + { + "epoch": 0.417966419500554, + "grad_norm": 192.03933192827705, + "learning_rate": 9.953047328929215e-06, + "loss": 5.6955, + "step": 4904 + }, + { + "epoch": 0.4180516491945794, + "grad_norm": 61.582291786898004, + "learning_rate": 9.952979511301717e-06, + "loss": 3.8352, + "step": 4905 + }, + { + "epoch": 0.4181368788886048, + "grad_norm": 74.56216018372905, + "learning_rate": 9.952911644963635e-06, + "loss": 4.0714, + "step": 4906 + }, + { + "epoch": 0.4182221085826302, + "grad_norm": 48.2610364375796, + "learning_rate": 9.952843729915633e-06, + "loss": 2.6809, + "step": 4907 + }, + { + "epoch": 0.41830733827665556, + "grad_norm": 39.65930472722931, + "learning_rate": 9.952775766158383e-06, + "loss": 3.6452, + "step": 4908 + }, + { + "epoch": 0.418392567970681, + "grad_norm": 44.342112187055825, + "learning_rate": 9.95270775369255e-06, + "loss": 3.9392, + "step": 4909 + }, + { + "epoch": 0.4184777976647064, + "grad_norm": 75.39889686426788, + "learning_rate": 9.952639692518807e-06, + "loss": 4.6959, + "step": 4910 + }, + { + "epoch": 0.4185630273587318, + "grad_norm": 47.391023272945446, + "learning_rate": 9.952571582637817e-06, + "loss": 3.6478, + "step": 4911 + }, + { + "epoch": 0.41864825705275716, + "grad_norm": 52.47078446782911, + "learning_rate": 9.952503424050255e-06, + "loss": 3.9933, + "step": 4912 + }, + { + "epoch": 0.41873348674678257, + "grad_norm": 36.24369711594481, + "learning_rate": 9.95243521675679e-06, + "loss": 3.2335, + "step": 4913 + }, + { + "epoch": 0.418818716440808, + "grad_norm": 138.2530319147151, + "learning_rate": 9.952366960758093e-06, + "loss": 4.5132, + "step": 4914 + }, + { + "epoch": 0.4189039461348334, + "grad_norm": 74.7904955040744, + "learning_rate": 9.952298656054833e-06, + "loss": 3.8609, + "step": 4915 + }, + { + "epoch": 0.41898917582885875, + "grad_norm": 72.48868218187363, + "learning_rate": 9.952230302647686e-06, + "loss": 4.261, + "step": 4916 + }, + { + "epoch": 0.41907440552288416, + "grad_norm": 47.917258871443146, + "learning_rate": 9.952161900537321e-06, + "loss": 3.4228, + "step": 4917 + }, + { + "epoch": 0.4191596352169096, + "grad_norm": 52.07145370325104, + "learning_rate": 9.95209344972441e-06, + "loss": 2.9818, + "step": 4918 + }, + { + "epoch": 0.419244864910935, + "grad_norm": 59.09056056895931, + "learning_rate": 9.952024950209629e-06, + "loss": 3.934, + "step": 4919 + }, + { + "epoch": 0.41933009460496035, + "grad_norm": 101.71069252295817, + "learning_rate": 9.95195640199365e-06, + "loss": 4.0031, + "step": 4920 + }, + { + "epoch": 0.41941532429898576, + "grad_norm": 35.269407498992926, + "learning_rate": 9.951887805077147e-06, + "loss": 3.061, + "step": 4921 + }, + { + "epoch": 0.41950055399301117, + "grad_norm": 41.228747445779405, + "learning_rate": 9.951819159460797e-06, + "loss": 3.5043, + "step": 4922 + }, + { + "epoch": 0.4195857836870366, + "grad_norm": 58.75530170484985, + "learning_rate": 9.951750465145272e-06, + "loss": 2.2572, + "step": 4923 + }, + { + "epoch": 0.41967101338106194, + "grad_norm": 57.61982295190175, + "learning_rate": 9.951681722131247e-06, + "loss": 3.7273, + "step": 4924 + }, + { + "epoch": 0.41975624307508735, + "grad_norm": 70.48458346256724, + "learning_rate": 9.951612930419401e-06, + "loss": 4.6688, + "step": 4925 + }, + { + "epoch": 0.41984147276911277, + "grad_norm": 98.50492699943088, + "learning_rate": 9.951544090010412e-06, + "loss": 5.1693, + "step": 4926 + }, + { + "epoch": 0.4199267024631382, + "grad_norm": 101.95969034814625, + "learning_rate": 9.951475200904952e-06, + "loss": 2.9819, + "step": 4927 + }, + { + "epoch": 0.42001193215716354, + "grad_norm": 55.65985780760146, + "learning_rate": 9.9514062631037e-06, + "loss": 4.5579, + "step": 4928 + }, + { + "epoch": 0.42009716185118895, + "grad_norm": 36.851468815745214, + "learning_rate": 9.951337276607337e-06, + "loss": 3.4647, + "step": 4929 + }, + { + "epoch": 0.42018239154521436, + "grad_norm": 46.49685407634929, + "learning_rate": 9.951268241416538e-06, + "loss": 4.4047, + "step": 4930 + }, + { + "epoch": 0.4202676212392398, + "grad_norm": 37.178080733033816, + "learning_rate": 9.951199157531985e-06, + "loss": 3.0275, + "step": 4931 + }, + { + "epoch": 0.42035285093326513, + "grad_norm": 99.01318946498564, + "learning_rate": 9.951130024954354e-06, + "loss": 4.3093, + "step": 4932 + }, + { + "epoch": 0.42043808062729054, + "grad_norm": 65.70247558685223, + "learning_rate": 9.951060843684329e-06, + "loss": 3.8765, + "step": 4933 + }, + { + "epoch": 0.42052331032131596, + "grad_norm": 53.36715198876221, + "learning_rate": 9.950991613722584e-06, + "loss": 4.6812, + "step": 4934 + }, + { + "epoch": 0.42060854001534137, + "grad_norm": 71.34792518217945, + "learning_rate": 9.950922335069807e-06, + "loss": 3.5203, + "step": 4935 + }, + { + "epoch": 0.4206937697093667, + "grad_norm": 70.87171620165917, + "learning_rate": 9.950853007726675e-06, + "loss": 4.5851, + "step": 4936 + }, + { + "epoch": 0.42077899940339214, + "grad_norm": 40.60804553675349, + "learning_rate": 9.95078363169387e-06, + "loss": 4.299, + "step": 4937 + }, + { + "epoch": 0.42086422909741755, + "grad_norm": 44.988974498913684, + "learning_rate": 9.950714206972077e-06, + "loss": 3.2252, + "step": 4938 + }, + { + "epoch": 0.42094945879144297, + "grad_norm": 64.90206933173737, + "learning_rate": 9.950644733561975e-06, + "loss": 4.6349, + "step": 4939 + }, + { + "epoch": 0.4210346884854683, + "grad_norm": 46.3826908166006, + "learning_rate": 9.950575211464251e-06, + "loss": 3.728, + "step": 4940 + }, + { + "epoch": 0.42111991817949374, + "grad_norm": 79.93594985388368, + "learning_rate": 9.950505640679586e-06, + "loss": 4.528, + "step": 4941 + }, + { + "epoch": 0.42120514787351915, + "grad_norm": 77.99282979660815, + "learning_rate": 9.950436021208667e-06, + "loss": 3.9405, + "step": 4942 + }, + { + "epoch": 0.4212903775675445, + "grad_norm": 99.85585945219316, + "learning_rate": 9.950366353052175e-06, + "loss": 3.4129, + "step": 4943 + }, + { + "epoch": 0.4213756072615699, + "grad_norm": 70.9639834101227, + "learning_rate": 9.950296636210796e-06, + "loss": 3.5343, + "step": 4944 + }, + { + "epoch": 0.42146083695559533, + "grad_norm": 79.26959532993405, + "learning_rate": 9.95022687068522e-06, + "loss": 2.6581, + "step": 4945 + }, + { + "epoch": 0.42154606664962074, + "grad_norm": 76.45522187488585, + "learning_rate": 9.950157056476128e-06, + "loss": 3.9868, + "step": 4946 + }, + { + "epoch": 0.4216312963436461, + "grad_norm": 77.53214851359363, + "learning_rate": 9.950087193584207e-06, + "loss": 3.8612, + "step": 4947 + }, + { + "epoch": 0.4217165260376715, + "grad_norm": 152.30335282080813, + "learning_rate": 9.950017282010148e-06, + "loss": 3.6233, + "step": 4948 + }, + { + "epoch": 0.4218017557316969, + "grad_norm": 72.56979927744501, + "learning_rate": 9.949947321754635e-06, + "loss": 4.3457, + "step": 4949 + }, + { + "epoch": 0.42188698542572234, + "grad_norm": 48.57461103577709, + "learning_rate": 9.949877312818356e-06, + "loss": 3.7858, + "step": 4950 + }, + { + "epoch": 0.4219722151197477, + "grad_norm": 35.91678013913818, + "learning_rate": 9.949807255202e-06, + "loss": 3.3466, + "step": 4951 + }, + { + "epoch": 0.4220574448137731, + "grad_norm": 52.83100736467661, + "learning_rate": 9.949737148906258e-06, + "loss": 4.4178, + "step": 4952 + }, + { + "epoch": 0.4221426745077985, + "grad_norm": 70.8549205349084, + "learning_rate": 9.949666993931817e-06, + "loss": 3.0117, + "step": 4953 + }, + { + "epoch": 0.42222790420182393, + "grad_norm": 294.64233364270564, + "learning_rate": 9.94959679027937e-06, + "loss": 5.2131, + "step": 4954 + }, + { + "epoch": 0.4223131338958493, + "grad_norm": 140.61015629162097, + "learning_rate": 9.949526537949603e-06, + "loss": 3.8039, + "step": 4955 + }, + { + "epoch": 0.4223983635898747, + "grad_norm": 35.643788799109, + "learning_rate": 9.94945623694321e-06, + "loss": 2.1916, + "step": 4956 + }, + { + "epoch": 0.4224835932839001, + "grad_norm": 72.26006663385334, + "learning_rate": 9.949385887260882e-06, + "loss": 4.513, + "step": 4957 + }, + { + "epoch": 0.42256882297792553, + "grad_norm": 60.90330069416509, + "learning_rate": 9.94931548890331e-06, + "loss": 4.342, + "step": 4958 + }, + { + "epoch": 0.4226540526719509, + "grad_norm": 33.83107783120724, + "learning_rate": 9.949245041871188e-06, + "loss": 2.8317, + "step": 4959 + }, + { + "epoch": 0.4227392823659763, + "grad_norm": 55.392955969403026, + "learning_rate": 9.949174546165206e-06, + "loss": 4.4531, + "step": 4960 + }, + { + "epoch": 0.4228245120600017, + "grad_norm": 87.67633174707572, + "learning_rate": 9.949104001786059e-06, + "loss": 3.5428, + "step": 4961 + }, + { + "epoch": 0.4229097417540271, + "grad_norm": 33.826964351959425, + "learning_rate": 9.949033408734442e-06, + "loss": 2.9452, + "step": 4962 + }, + { + "epoch": 0.4229949714480525, + "grad_norm": 121.76985945920714, + "learning_rate": 9.948962767011048e-06, + "loss": 3.7348, + "step": 4963 + }, + { + "epoch": 0.4230802011420779, + "grad_norm": 52.90594410268107, + "learning_rate": 9.94889207661657e-06, + "loss": 3.6957, + "step": 4964 + }, + { + "epoch": 0.4231654308361033, + "grad_norm": 49.882073161888215, + "learning_rate": 9.948821337551706e-06, + "loss": 3.9164, + "step": 4965 + }, + { + "epoch": 0.4232506605301287, + "grad_norm": 137.15192764880908, + "learning_rate": 9.948750549817152e-06, + "loss": 3.6085, + "step": 4966 + }, + { + "epoch": 0.4233358902241541, + "grad_norm": 45.880333296728885, + "learning_rate": 9.9486797134136e-06, + "loss": 3.8638, + "step": 4967 + }, + { + "epoch": 0.4234211199181795, + "grad_norm": 102.37661717546936, + "learning_rate": 9.948608828341751e-06, + "loss": 4.6938, + "step": 4968 + }, + { + "epoch": 0.4235063496122049, + "grad_norm": 284.653094779597, + "learning_rate": 9.948537894602302e-06, + "loss": 4.3466, + "step": 4969 + }, + { + "epoch": 0.4235915793062303, + "grad_norm": 47.08792168617024, + "learning_rate": 9.948466912195946e-06, + "loss": 3.7173, + "step": 4970 + }, + { + "epoch": 0.42367680900025567, + "grad_norm": 42.703934375925144, + "learning_rate": 9.948395881123387e-06, + "loss": 2.7273, + "step": 4971 + }, + { + "epoch": 0.4237620386942811, + "grad_norm": 44.031268368380296, + "learning_rate": 9.948324801385318e-06, + "loss": 4.0255, + "step": 4972 + }, + { + "epoch": 0.4238472683883065, + "grad_norm": 51.833649997268566, + "learning_rate": 9.948253672982443e-06, + "loss": 2.4505, + "step": 4973 + }, + { + "epoch": 0.4239324980823319, + "grad_norm": 195.6014943769568, + "learning_rate": 9.94818249591546e-06, + "loss": 4.0104, + "step": 4974 + }, + { + "epoch": 0.42401772777635727, + "grad_norm": 55.251727920191165, + "learning_rate": 9.948111270185066e-06, + "loss": 3.0933, + "step": 4975 + }, + { + "epoch": 0.4241029574703827, + "grad_norm": 53.76113234593592, + "learning_rate": 9.948039995791965e-06, + "loss": 3.8908, + "step": 4976 + }, + { + "epoch": 0.4241881871644081, + "grad_norm": 67.41351955976647, + "learning_rate": 9.947968672736856e-06, + "loss": 3.113, + "step": 4977 + }, + { + "epoch": 0.4242734168584335, + "grad_norm": 70.13823886726496, + "learning_rate": 9.947897301020444e-06, + "loss": 4.5959, + "step": 4978 + }, + { + "epoch": 0.42435864655245886, + "grad_norm": 83.80380828750995, + "learning_rate": 9.947825880643425e-06, + "loss": 4.82, + "step": 4979 + }, + { + "epoch": 0.4244438762464843, + "grad_norm": 98.79859493170534, + "learning_rate": 9.947754411606507e-06, + "loss": 3.3201, + "step": 4980 + }, + { + "epoch": 0.4245291059405097, + "grad_norm": 44.19064645627977, + "learning_rate": 9.947682893910389e-06, + "loss": 3.4619, + "step": 4981 + }, + { + "epoch": 0.42461433563453505, + "grad_norm": 98.54550176839655, + "learning_rate": 9.947611327555775e-06, + "loss": 4.4926, + "step": 4982 + }, + { + "epoch": 0.42469956532856046, + "grad_norm": 80.37872311057654, + "learning_rate": 9.947539712543371e-06, + "loss": 4.2315, + "step": 4983 + }, + { + "epoch": 0.42478479502258587, + "grad_norm": 45.75065587935271, + "learning_rate": 9.947468048873879e-06, + "loss": 4.2671, + "step": 4984 + }, + { + "epoch": 0.4248700247166113, + "grad_norm": 112.81699363379815, + "learning_rate": 9.947396336548006e-06, + "loss": 4.4895, + "step": 4985 + }, + { + "epoch": 0.42495525441063664, + "grad_norm": 136.52458908985204, + "learning_rate": 9.947324575566454e-06, + "loss": 4.2926, + "step": 4986 + }, + { + "epoch": 0.42504048410466205, + "grad_norm": 158.45361956460258, + "learning_rate": 9.947252765929931e-06, + "loss": 4.787, + "step": 4987 + }, + { + "epoch": 0.42512571379868747, + "grad_norm": 138.61275938391915, + "learning_rate": 9.947180907639144e-06, + "loss": 3.9698, + "step": 4988 + }, + { + "epoch": 0.4252109434927129, + "grad_norm": 75.30608910641543, + "learning_rate": 9.947109000694799e-06, + "loss": 3.5785, + "step": 4989 + }, + { + "epoch": 0.42529617318673824, + "grad_norm": 49.76870957083571, + "learning_rate": 9.9470370450976e-06, + "loss": 3.6585, + "step": 4990 + }, + { + "epoch": 0.42538140288076365, + "grad_norm": 54.79661544815866, + "learning_rate": 9.94696504084826e-06, + "loss": 3.8868, + "step": 4991 + }, + { + "epoch": 0.42546663257478906, + "grad_norm": 62.95156065470787, + "learning_rate": 9.946892987947486e-06, + "loss": 3.4716, + "step": 4992 + }, + { + "epoch": 0.4255518622688145, + "grad_norm": 79.33799349644147, + "learning_rate": 9.946820886395982e-06, + "loss": 3.3086, + "step": 4993 + }, + { + "epoch": 0.42563709196283983, + "grad_norm": 134.17156589164296, + "learning_rate": 9.94674873619446e-06, + "loss": 5.32, + "step": 4994 + }, + { + "epoch": 0.42572232165686524, + "grad_norm": 76.99631484822453, + "learning_rate": 9.946676537343631e-06, + "loss": 4.2339, + "step": 4995 + }, + { + "epoch": 0.42580755135089066, + "grad_norm": 76.80355247777806, + "learning_rate": 9.946604289844205e-06, + "loss": 3.6697, + "step": 4996 + }, + { + "epoch": 0.42589278104491607, + "grad_norm": 64.57033162097856, + "learning_rate": 9.94653199369689e-06, + "loss": 4.4283, + "step": 4997 + }, + { + "epoch": 0.4259780107389414, + "grad_norm": 69.38235123285995, + "learning_rate": 9.9464596489024e-06, + "loss": 3.6542, + "step": 4998 + }, + { + "epoch": 0.42606324043296684, + "grad_norm": 60.580685046966096, + "learning_rate": 9.946387255461443e-06, + "loss": 3.9146, + "step": 4999 + }, + { + "epoch": 0.42614847012699225, + "grad_norm": 114.26812878904883, + "learning_rate": 9.946314813374733e-06, + "loss": 4.7648, + "step": 5000 + }, + { + "epoch": 0.42623369982101766, + "grad_norm": 171.31791803715277, + "learning_rate": 9.946242322642982e-06, + "loss": 4.9774, + "step": 5001 + }, + { + "epoch": 0.426318929515043, + "grad_norm": 117.8174082474643, + "learning_rate": 9.946169783266906e-06, + "loss": 4.9168, + "step": 5002 + }, + { + "epoch": 0.42640415920906843, + "grad_norm": 99.44901176253617, + "learning_rate": 9.946097195247213e-06, + "loss": 4.327, + "step": 5003 + }, + { + "epoch": 0.42648938890309385, + "grad_norm": 64.48925255765437, + "learning_rate": 9.94602455858462e-06, + "loss": 4.8596, + "step": 5004 + }, + { + "epoch": 0.42657461859711926, + "grad_norm": 3134.6364049652702, + "learning_rate": 9.94595187327984e-06, + "loss": 3.785, + "step": 5005 + }, + { + "epoch": 0.4266598482911446, + "grad_norm": 42.692189011033385, + "learning_rate": 9.94587913933359e-06, + "loss": 2.8868, + "step": 5006 + }, + { + "epoch": 0.42674507798517003, + "grad_norm": 81.38133348415855, + "learning_rate": 9.945806356746583e-06, + "loss": 4.1075, + "step": 5007 + }, + { + "epoch": 0.42683030767919544, + "grad_norm": 185.7402110008649, + "learning_rate": 9.945733525519536e-06, + "loss": 4.4039, + "step": 5008 + }, + { + "epoch": 0.42691553737322085, + "grad_norm": 55.54811074236617, + "learning_rate": 9.945660645653166e-06, + "loss": 3.677, + "step": 5009 + }, + { + "epoch": 0.4270007670672462, + "grad_norm": 62.55214572282104, + "learning_rate": 9.945587717148188e-06, + "loss": 4.9383, + "step": 5010 + }, + { + "epoch": 0.4270859967612716, + "grad_norm": 41.83486591873784, + "learning_rate": 9.94551474000532e-06, + "loss": 2.874, + "step": 5011 + }, + { + "epoch": 0.42717122645529704, + "grad_norm": 42.09092449040982, + "learning_rate": 9.94544171422528e-06, + "loss": 3.672, + "step": 5012 + }, + { + "epoch": 0.42725645614932245, + "grad_norm": 82.92541041050715, + "learning_rate": 9.945368639808785e-06, + "loss": 5.0243, + "step": 5013 + }, + { + "epoch": 0.4273416858433478, + "grad_norm": 29.56830769572954, + "learning_rate": 9.945295516756555e-06, + "loss": 2.9107, + "step": 5014 + }, + { + "epoch": 0.4274269155373732, + "grad_norm": 64.86239070728769, + "learning_rate": 9.94522234506931e-06, + "loss": 4.2426, + "step": 5015 + }, + { + "epoch": 0.42751214523139863, + "grad_norm": 153.6270143783555, + "learning_rate": 9.945149124747766e-06, + "loss": 3.8865, + "step": 5016 + }, + { + "epoch": 0.42759737492542405, + "grad_norm": 33.34789252486084, + "learning_rate": 9.945075855792648e-06, + "loss": 4.1579, + "step": 5017 + }, + { + "epoch": 0.4276826046194494, + "grad_norm": 51.06825996898236, + "learning_rate": 9.945002538204671e-06, + "loss": 3.3719, + "step": 5018 + }, + { + "epoch": 0.4277678343134748, + "grad_norm": 54.21733719396685, + "learning_rate": 9.944929171984561e-06, + "loss": 4.3891, + "step": 5019 + }, + { + "epoch": 0.42785306400750023, + "grad_norm": 52.47461988288656, + "learning_rate": 9.944855757133036e-06, + "loss": 2.7463, + "step": 5020 + }, + { + "epoch": 0.4279382937015256, + "grad_norm": 103.671992802159, + "learning_rate": 9.944782293650821e-06, + "loss": 4.4646, + "step": 5021 + }, + { + "epoch": 0.428023523395551, + "grad_norm": 53.53385184261066, + "learning_rate": 9.944708781538637e-06, + "loss": 3.0365, + "step": 5022 + }, + { + "epoch": 0.4281087530895764, + "grad_norm": 69.11443791932612, + "learning_rate": 9.944635220797205e-06, + "loss": 4.1303, + "step": 5023 + }, + { + "epoch": 0.4281939827836018, + "grad_norm": 102.72813383725605, + "learning_rate": 9.944561611427253e-06, + "loss": 4.135, + "step": 5024 + }, + { + "epoch": 0.4282792124776272, + "grad_norm": 38.44507447119793, + "learning_rate": 9.9444879534295e-06, + "loss": 3.4754, + "step": 5025 + }, + { + "epoch": 0.4283644421716526, + "grad_norm": 48.53520856706848, + "learning_rate": 9.944414246804675e-06, + "loss": 4.2014, + "step": 5026 + }, + { + "epoch": 0.428449671865678, + "grad_norm": 39.36018516986301, + "learning_rate": 9.944340491553499e-06, + "loss": 4.0958, + "step": 5027 + }, + { + "epoch": 0.4285349015597034, + "grad_norm": 84.39367911029251, + "learning_rate": 9.9442666876767e-06, + "loss": 3.5161, + "step": 5028 + }, + { + "epoch": 0.4286201312537288, + "grad_norm": 49.072850103614115, + "learning_rate": 9.944192835175001e-06, + "loss": 2.5201, + "step": 5029 + }, + { + "epoch": 0.4287053609477542, + "grad_norm": 76.77264570719188, + "learning_rate": 9.944118934049131e-06, + "loss": 4.6374, + "step": 5030 + }, + { + "epoch": 0.4287905906417796, + "grad_norm": 50.46134882190134, + "learning_rate": 9.944044984299817e-06, + "loss": 3.6725, + "step": 5031 + }, + { + "epoch": 0.428875820335805, + "grad_norm": 52.63965534900462, + "learning_rate": 9.943970985927785e-06, + "loss": 4.9122, + "step": 5032 + }, + { + "epoch": 0.42896105002983037, + "grad_norm": 283.39833024212186, + "learning_rate": 9.943896938933761e-06, + "loss": 4.1942, + "step": 5033 + }, + { + "epoch": 0.4290462797238558, + "grad_norm": 53.29643139363179, + "learning_rate": 9.943822843318475e-06, + "loss": 3.4092, + "step": 5034 + }, + { + "epoch": 0.4291315094178812, + "grad_norm": 43.32118663476158, + "learning_rate": 9.943748699082658e-06, + "loss": 3.5923, + "step": 5035 + }, + { + "epoch": 0.4292167391119066, + "grad_norm": 49.2400811440772, + "learning_rate": 9.943674506227037e-06, + "loss": 3.7631, + "step": 5036 + }, + { + "epoch": 0.42930196880593197, + "grad_norm": 52.88409334643902, + "learning_rate": 9.94360026475234e-06, + "loss": 3.0631, + "step": 5037 + }, + { + "epoch": 0.4293871984999574, + "grad_norm": 42.63127974170017, + "learning_rate": 9.9435259746593e-06, + "loss": 3.5245, + "step": 5038 + }, + { + "epoch": 0.4294724281939828, + "grad_norm": 90.83088954043055, + "learning_rate": 9.943451635948647e-06, + "loss": 4.67, + "step": 5039 + }, + { + "epoch": 0.4295576578880082, + "grad_norm": 113.4199934969492, + "learning_rate": 9.943377248621108e-06, + "loss": 5.2221, + "step": 5040 + }, + { + "epoch": 0.42964288758203356, + "grad_norm": 93.36294975766549, + "learning_rate": 9.943302812677422e-06, + "loss": 5.4007, + "step": 5041 + }, + { + "epoch": 0.429728117276059, + "grad_norm": 69.06446411541795, + "learning_rate": 9.943228328118315e-06, + "loss": 4.2104, + "step": 5042 + }, + { + "epoch": 0.4298133469700844, + "grad_norm": 93.1266190403879, + "learning_rate": 9.943153794944522e-06, + "loss": 4.6608, + "step": 5043 + }, + { + "epoch": 0.4298985766641098, + "grad_norm": 53.26373555039052, + "learning_rate": 9.943079213156777e-06, + "loss": 4.1373, + "step": 5044 + }, + { + "epoch": 0.42998380635813516, + "grad_norm": 56.72661128163338, + "learning_rate": 9.94300458275581e-06, + "loss": 4.3939, + "step": 5045 + }, + { + "epoch": 0.43006903605216057, + "grad_norm": 67.34979748115705, + "learning_rate": 9.942929903742359e-06, + "loss": 4.8268, + "step": 5046 + }, + { + "epoch": 0.430154265746186, + "grad_norm": 60.93279373479752, + "learning_rate": 9.942855176117155e-06, + "loss": 3.9697, + "step": 5047 + }, + { + "epoch": 0.4302394954402114, + "grad_norm": 54.48749198477807, + "learning_rate": 9.942780399880935e-06, + "loss": 4.2622, + "step": 5048 + }, + { + "epoch": 0.43032472513423675, + "grad_norm": 81.64385892465835, + "learning_rate": 9.942705575034435e-06, + "loss": 3.816, + "step": 5049 + }, + { + "epoch": 0.43040995482826216, + "grad_norm": 61.38292856093549, + "learning_rate": 9.942630701578389e-06, + "loss": 3.4336, + "step": 5050 + }, + { + "epoch": 0.4304951845222876, + "grad_norm": 156.36057994780427, + "learning_rate": 9.942555779513532e-06, + "loss": 4.8309, + "step": 5051 + }, + { + "epoch": 0.430580414216313, + "grad_norm": 113.79509657312867, + "learning_rate": 9.942480808840605e-06, + "loss": 3.0693, + "step": 5052 + }, + { + "epoch": 0.43066564391033835, + "grad_norm": 106.33250399302518, + "learning_rate": 9.942405789560342e-06, + "loss": 4.9384, + "step": 5053 + }, + { + "epoch": 0.43075087360436376, + "grad_norm": 88.59654283544553, + "learning_rate": 9.942330721673484e-06, + "loss": 4.1252, + "step": 5054 + }, + { + "epoch": 0.4308361032983892, + "grad_norm": 103.35783569957051, + "learning_rate": 9.942255605180766e-06, + "loss": 4.7307, + "step": 5055 + }, + { + "epoch": 0.43092133299241453, + "grad_norm": 65.45064446936026, + "learning_rate": 9.942180440082928e-06, + "loss": 3.9696, + "step": 5056 + }, + { + "epoch": 0.43100656268643994, + "grad_norm": 63.555199538835545, + "learning_rate": 9.94210522638071e-06, + "loss": 3.5902, + "step": 5057 + }, + { + "epoch": 0.43109179238046536, + "grad_norm": 41.359518341003074, + "learning_rate": 9.942029964074849e-06, + "loss": 3.6434, + "step": 5058 + }, + { + "epoch": 0.43117702207449077, + "grad_norm": 199.9754291927641, + "learning_rate": 9.941954653166087e-06, + "loss": 4.2075, + "step": 5059 + }, + { + "epoch": 0.4312622517685161, + "grad_norm": 112.64477998189066, + "learning_rate": 9.941879293655167e-06, + "loss": 5.2653, + "step": 5060 + }, + { + "epoch": 0.43134748146254154, + "grad_norm": 65.70324203790315, + "learning_rate": 9.941803885542826e-06, + "loss": 3.4447, + "step": 5061 + }, + { + "epoch": 0.43143271115656695, + "grad_norm": 42.59171774994245, + "learning_rate": 9.94172842882981e-06, + "loss": 3.9868, + "step": 5062 + }, + { + "epoch": 0.43151794085059236, + "grad_norm": 80.01625479590314, + "learning_rate": 9.941652923516855e-06, + "loss": 4.0739, + "step": 5063 + }, + { + "epoch": 0.4316031705446177, + "grad_norm": 45.95584595406555, + "learning_rate": 9.941577369604709e-06, + "loss": 4.0277, + "step": 5064 + }, + { + "epoch": 0.43168840023864313, + "grad_norm": 76.83373175625648, + "learning_rate": 9.941501767094114e-06, + "loss": 3.6719, + "step": 5065 + }, + { + "epoch": 0.43177362993266855, + "grad_norm": 223.65243076926797, + "learning_rate": 9.941426115985811e-06, + "loss": 4.4432, + "step": 5066 + }, + { + "epoch": 0.43185885962669396, + "grad_norm": 109.64708818509145, + "learning_rate": 9.941350416280547e-06, + "loss": 3.6782, + "step": 5067 + }, + { + "epoch": 0.4319440893207193, + "grad_norm": 71.04908965421419, + "learning_rate": 9.941274667979064e-06, + "loss": 3.6829, + "step": 5068 + }, + { + "epoch": 0.43202931901474473, + "grad_norm": 37.50365122118741, + "learning_rate": 9.94119887108211e-06, + "loss": 3.6876, + "step": 5069 + }, + { + "epoch": 0.43211454870877014, + "grad_norm": 110.31085358705234, + "learning_rate": 9.941123025590427e-06, + "loss": 4.0251, + "step": 5070 + }, + { + "epoch": 0.43219977840279555, + "grad_norm": 69.40333692698691, + "learning_rate": 9.941047131504762e-06, + "loss": 3.9706, + "step": 5071 + }, + { + "epoch": 0.4322850080968209, + "grad_norm": 81.17492369577192, + "learning_rate": 9.940971188825862e-06, + "loss": 4.7467, + "step": 5072 + }, + { + "epoch": 0.4323702377908463, + "grad_norm": 109.3863611061919, + "learning_rate": 9.940895197554475e-06, + "loss": 4.7985, + "step": 5073 + }, + { + "epoch": 0.43245546748487174, + "grad_norm": 71.10858649399377, + "learning_rate": 9.940819157691346e-06, + "loss": 4.5025, + "step": 5074 + }, + { + "epoch": 0.43254069717889715, + "grad_norm": 98.88529437231544, + "learning_rate": 9.940743069237224e-06, + "loss": 3.3779, + "step": 5075 + }, + { + "epoch": 0.4326259268729225, + "grad_norm": 220.2076452738365, + "learning_rate": 9.940666932192857e-06, + "loss": 6.0261, + "step": 5076 + }, + { + "epoch": 0.4327111565669479, + "grad_norm": 59.32147192497675, + "learning_rate": 9.940590746558995e-06, + "loss": 4.0909, + "step": 5077 + }, + { + "epoch": 0.43279638626097333, + "grad_norm": 110.03181006315737, + "learning_rate": 9.940514512336385e-06, + "loss": 4.7566, + "step": 5078 + }, + { + "epoch": 0.43288161595499874, + "grad_norm": 43.485505788045245, + "learning_rate": 9.940438229525779e-06, + "loss": 3.7467, + "step": 5079 + }, + { + "epoch": 0.4329668456490241, + "grad_norm": 69.8244032382067, + "learning_rate": 9.940361898127925e-06, + "loss": 4.1109, + "step": 5080 + }, + { + "epoch": 0.4330520753430495, + "grad_norm": 37.40088256057286, + "learning_rate": 9.940285518143575e-06, + "loss": 3.5732, + "step": 5081 + }, + { + "epoch": 0.4331373050370749, + "grad_norm": 116.42155465821536, + "learning_rate": 9.94020908957348e-06, + "loss": 4.7942, + "step": 5082 + }, + { + "epoch": 0.43322253473110034, + "grad_norm": 56.0042438093518, + "learning_rate": 9.940132612418393e-06, + "loss": 3.959, + "step": 5083 + }, + { + "epoch": 0.4333077644251257, + "grad_norm": 65.48588970491515, + "learning_rate": 9.940056086679065e-06, + "loss": 3.8003, + "step": 5084 + }, + { + "epoch": 0.4333929941191511, + "grad_norm": 51.80508114474012, + "learning_rate": 9.939979512356246e-06, + "loss": 4.2554, + "step": 5085 + }, + { + "epoch": 0.4334782238131765, + "grad_norm": 116.00997447899844, + "learning_rate": 9.939902889450692e-06, + "loss": 4.8259, + "step": 5086 + }, + { + "epoch": 0.43356345350720193, + "grad_norm": 38.31901893262666, + "learning_rate": 9.939826217963158e-06, + "loss": 3.4573, + "step": 5087 + }, + { + "epoch": 0.4336486832012273, + "grad_norm": 104.68578448482137, + "learning_rate": 9.939749497894392e-06, + "loss": 3.9225, + "step": 5088 + }, + { + "epoch": 0.4337339128952527, + "grad_norm": 86.86738343081542, + "learning_rate": 9.939672729245156e-06, + "loss": 3.8553, + "step": 5089 + }, + { + "epoch": 0.4338191425892781, + "grad_norm": 237.43936407972998, + "learning_rate": 9.939595912016202e-06, + "loss": 5.2058, + "step": 5090 + }, + { + "epoch": 0.43390437228330353, + "grad_norm": 68.27600115290436, + "learning_rate": 9.939519046208284e-06, + "loss": 3.7515, + "step": 5091 + }, + { + "epoch": 0.4339896019773289, + "grad_norm": 60.65836157764072, + "learning_rate": 9.939442131822159e-06, + "loss": 3.235, + "step": 5092 + }, + { + "epoch": 0.4340748316713543, + "grad_norm": 65.79289092754307, + "learning_rate": 9.939365168858582e-06, + "loss": 3.5446, + "step": 5093 + }, + { + "epoch": 0.4341600613653797, + "grad_norm": 84.24014972205231, + "learning_rate": 9.939288157318312e-06, + "loss": 3.6991, + "step": 5094 + }, + { + "epoch": 0.43424529105940507, + "grad_norm": 64.01826234581777, + "learning_rate": 9.939211097202106e-06, + "loss": 3.2244, + "step": 5095 + }, + { + "epoch": 0.4343305207534305, + "grad_norm": 157.76826099012123, + "learning_rate": 9.939133988510722e-06, + "loss": 5.7767, + "step": 5096 + }, + { + "epoch": 0.4344157504474559, + "grad_norm": 106.83066554835548, + "learning_rate": 9.939056831244918e-06, + "loss": 5.1419, + "step": 5097 + }, + { + "epoch": 0.4345009801414813, + "grad_norm": 267.72308810797506, + "learning_rate": 9.938979625405452e-06, + "loss": 4.7565, + "step": 5098 + }, + { + "epoch": 0.43458620983550666, + "grad_norm": 148.1963498410877, + "learning_rate": 9.938902370993085e-06, + "loss": 4.6894, + "step": 5099 + }, + { + "epoch": 0.4346714395295321, + "grad_norm": 78.353702204365, + "learning_rate": 9.938825068008572e-06, + "loss": 3.9051, + "step": 5100 + }, + { + "epoch": 0.4347566692235575, + "grad_norm": 111.5758935221369, + "learning_rate": 9.938747716452682e-06, + "loss": 4.6878, + "step": 5101 + }, + { + "epoch": 0.4348418989175829, + "grad_norm": 51.67143673968912, + "learning_rate": 9.938670316326167e-06, + "loss": 3.2035, + "step": 5102 + }, + { + "epoch": 0.43492712861160826, + "grad_norm": 50.978232149417984, + "learning_rate": 9.938592867629793e-06, + "loss": 3.1225, + "step": 5103 + }, + { + "epoch": 0.4350123583056337, + "grad_norm": 534.9495455724294, + "learning_rate": 9.938515370364322e-06, + "loss": 3.6131, + "step": 5104 + }, + { + "epoch": 0.4350975879996591, + "grad_norm": 165.0067298179344, + "learning_rate": 9.938437824530513e-06, + "loss": 4.2408, + "step": 5105 + }, + { + "epoch": 0.4351828176936845, + "grad_norm": 65.40147493086299, + "learning_rate": 9.93836023012913e-06, + "loss": 4.308, + "step": 5106 + }, + { + "epoch": 0.43526804738770986, + "grad_norm": 95.72693358541352, + "learning_rate": 9.938282587160939e-06, + "loss": 4.6195, + "step": 5107 + }, + { + "epoch": 0.43535327708173527, + "grad_norm": 63.7290706910118, + "learning_rate": 9.938204895626697e-06, + "loss": 4.0332, + "step": 5108 + }, + { + "epoch": 0.4354385067757607, + "grad_norm": 78.03139284824627, + "learning_rate": 9.938127155527175e-06, + "loss": 4.6045, + "step": 5109 + }, + { + "epoch": 0.4355237364697861, + "grad_norm": 50.03394462692871, + "learning_rate": 9.938049366863133e-06, + "loss": 3.6677, + "step": 5110 + }, + { + "epoch": 0.43560896616381145, + "grad_norm": 37.017608446573156, + "learning_rate": 9.937971529635339e-06, + "loss": 3.2677, + "step": 5111 + }, + { + "epoch": 0.43569419585783686, + "grad_norm": 124.34171184949398, + "learning_rate": 9.937893643844559e-06, + "loss": 4.6894, + "step": 5112 + }, + { + "epoch": 0.4357794255518623, + "grad_norm": 96.47490923025083, + "learning_rate": 9.937815709491554e-06, + "loss": 4.7635, + "step": 5113 + }, + { + "epoch": 0.4358646552458877, + "grad_norm": 131.51772490925958, + "learning_rate": 9.937737726577094e-06, + "loss": 4.6503, + "step": 5114 + }, + { + "epoch": 0.43594988493991305, + "grad_norm": 47.89727140119655, + "learning_rate": 9.937659695101946e-06, + "loss": 3.5191, + "step": 5115 + }, + { + "epoch": 0.43603511463393846, + "grad_norm": 43.1489986707135, + "learning_rate": 9.937581615066879e-06, + "loss": 4.59, + "step": 5116 + }, + { + "epoch": 0.43612034432796387, + "grad_norm": 70.34543604518112, + "learning_rate": 9.937503486472656e-06, + "loss": 3.0128, + "step": 5117 + }, + { + "epoch": 0.4362055740219893, + "grad_norm": 48.85549964993334, + "learning_rate": 9.937425309320048e-06, + "loss": 3.6082, + "step": 5118 + }, + { + "epoch": 0.43629080371601464, + "grad_norm": 68.96236563212062, + "learning_rate": 9.937347083609825e-06, + "loss": 3.663, + "step": 5119 + }, + { + "epoch": 0.43637603341004005, + "grad_norm": 48.72231584837469, + "learning_rate": 9.937268809342756e-06, + "loss": 3.2862, + "step": 5120 + }, + { + "epoch": 0.43646126310406547, + "grad_norm": 33.59359117105687, + "learning_rate": 9.93719048651961e-06, + "loss": 2.1892, + "step": 5121 + }, + { + "epoch": 0.4365464927980909, + "grad_norm": 50.70446977535311, + "learning_rate": 9.937112115141157e-06, + "loss": 3.6766, + "step": 5122 + }, + { + "epoch": 0.43663172249211624, + "grad_norm": 49.877302003088715, + "learning_rate": 9.937033695208167e-06, + "loss": 3.5924, + "step": 5123 + }, + { + "epoch": 0.43671695218614165, + "grad_norm": 104.27763196202515, + "learning_rate": 9.936955226721414e-06, + "loss": 3.9006, + "step": 5124 + }, + { + "epoch": 0.43680218188016706, + "grad_norm": 55.597135765464536, + "learning_rate": 9.936876709681668e-06, + "loss": 4.239, + "step": 5125 + }, + { + "epoch": 0.4368874115741925, + "grad_norm": 43.71864981625781, + "learning_rate": 9.9367981440897e-06, + "loss": 3.8974, + "step": 5126 + }, + { + "epoch": 0.43697264126821783, + "grad_norm": 86.77474012658254, + "learning_rate": 9.936719529946287e-06, + "loss": 3.8088, + "step": 5127 + }, + { + "epoch": 0.43705787096224324, + "grad_norm": 54.434534089526736, + "learning_rate": 9.936640867252196e-06, + "loss": 3.2814, + "step": 5128 + }, + { + "epoch": 0.43714310065626866, + "grad_norm": 40.30162920164126, + "learning_rate": 9.936562156008205e-06, + "loss": 2.5161, + "step": 5129 + }, + { + "epoch": 0.43722833035029407, + "grad_norm": 74.61206441093438, + "learning_rate": 9.936483396215085e-06, + "loss": 3.5951, + "step": 5130 + }, + { + "epoch": 0.4373135600443194, + "grad_norm": 152.86905613054145, + "learning_rate": 9.936404587873613e-06, + "loss": 5.7195, + "step": 5131 + }, + { + "epoch": 0.43739878973834484, + "grad_norm": 73.21693513298968, + "learning_rate": 9.936325730984567e-06, + "loss": 4.2201, + "step": 5132 + }, + { + "epoch": 0.43748401943237025, + "grad_norm": 41.3654040138766, + "learning_rate": 9.936246825548714e-06, + "loss": 3.3239, + "step": 5133 + }, + { + "epoch": 0.4375692491263956, + "grad_norm": 76.67352258149991, + "learning_rate": 9.936167871566838e-06, + "loss": 4.7552, + "step": 5134 + }, + { + "epoch": 0.437654478820421, + "grad_norm": 99.98925634035797, + "learning_rate": 9.93608886903971e-06, + "loss": 3.0408, + "step": 5135 + }, + { + "epoch": 0.43773970851444644, + "grad_norm": 65.3095348042091, + "learning_rate": 9.936009817968112e-06, + "loss": 4.3528, + "step": 5136 + }, + { + "epoch": 0.43782493820847185, + "grad_norm": 72.91718541295174, + "learning_rate": 9.935930718352818e-06, + "loss": 4.5718, + "step": 5137 + }, + { + "epoch": 0.4379101679024972, + "grad_norm": 132.2569822565727, + "learning_rate": 9.935851570194606e-06, + "loss": 3.7904, + "step": 5138 + }, + { + "epoch": 0.4379953975965226, + "grad_norm": 36.8695295291493, + "learning_rate": 9.935772373494256e-06, + "loss": 3.769, + "step": 5139 + }, + { + "epoch": 0.43808062729054803, + "grad_norm": 80.53104403111051, + "learning_rate": 9.935693128252546e-06, + "loss": 4.2184, + "step": 5140 + }, + { + "epoch": 0.43816585698457344, + "grad_norm": 64.21892882298823, + "learning_rate": 9.935613834470255e-06, + "loss": 3.7159, + "step": 5141 + }, + { + "epoch": 0.4382510866785988, + "grad_norm": 58.977343479671894, + "learning_rate": 9.935534492148161e-06, + "loss": 3.6445, + "step": 5142 + }, + { + "epoch": 0.4383363163726242, + "grad_norm": 57.45807179000271, + "learning_rate": 9.93545510128705e-06, + "loss": 4.6182, + "step": 5143 + }, + { + "epoch": 0.4384215460666496, + "grad_norm": 63.47025509435089, + "learning_rate": 9.935375661887698e-06, + "loss": 4.1563, + "step": 5144 + }, + { + "epoch": 0.43850677576067504, + "grad_norm": 90.51215893158523, + "learning_rate": 9.935296173950888e-06, + "loss": 5.0905, + "step": 5145 + }, + { + "epoch": 0.4385920054547004, + "grad_norm": 1467.888891392024, + "learning_rate": 9.935216637477399e-06, + "loss": 5.2313, + "step": 5146 + }, + { + "epoch": 0.4386772351487258, + "grad_norm": 86.9401252981018, + "learning_rate": 9.935137052468016e-06, + "loss": 4.6882, + "step": 5147 + }, + { + "epoch": 0.4387624648427512, + "grad_norm": 40.92792807560822, + "learning_rate": 9.935057418923522e-06, + "loss": 3.6253, + "step": 5148 + }, + { + "epoch": 0.43884769453677663, + "grad_norm": 80.38761478882373, + "learning_rate": 9.9349777368447e-06, + "loss": 3.8682, + "step": 5149 + }, + { + "epoch": 0.438932924230802, + "grad_norm": 57.93638186204511, + "learning_rate": 9.934898006232331e-06, + "loss": 3.7817, + "step": 5150 + }, + { + "epoch": 0.4390181539248274, + "grad_norm": 37.9891145432405, + "learning_rate": 9.934818227087203e-06, + "loss": 3.3323, + "step": 5151 + }, + { + "epoch": 0.4391033836188528, + "grad_norm": 85.08539840439019, + "learning_rate": 9.9347383994101e-06, + "loss": 4.3721, + "step": 5152 + }, + { + "epoch": 0.43918861331287823, + "grad_norm": 186.21132790185095, + "learning_rate": 9.934658523201803e-06, + "loss": 3.611, + "step": 5153 + }, + { + "epoch": 0.4392738430069036, + "grad_norm": 75.10992922522955, + "learning_rate": 9.9345785984631e-06, + "loss": 5.0223, + "step": 5154 + }, + { + "epoch": 0.439359072700929, + "grad_norm": 36.22161214458307, + "learning_rate": 9.934498625194779e-06, + "loss": 3.3156, + "step": 5155 + }, + { + "epoch": 0.4394443023949544, + "grad_norm": 69.5675689445564, + "learning_rate": 9.934418603397623e-06, + "loss": 5.3061, + "step": 5156 + }, + { + "epoch": 0.4395295320889798, + "grad_norm": 122.93144110684426, + "learning_rate": 9.934338533072421e-06, + "loss": 4.7557, + "step": 5157 + }, + { + "epoch": 0.4396147617830052, + "grad_norm": 100.38139728400456, + "learning_rate": 9.934258414219961e-06, + "loss": 3.802, + "step": 5158 + }, + { + "epoch": 0.4396999914770306, + "grad_norm": 65.21885258304802, + "learning_rate": 9.93417824684103e-06, + "loss": 3.6078, + "step": 5159 + }, + { + "epoch": 0.439785221171056, + "grad_norm": 190.85204673109226, + "learning_rate": 9.934098030936418e-06, + "loss": 4.6029, + "step": 5160 + }, + { + "epoch": 0.4398704508650814, + "grad_norm": 73.27058170495269, + "learning_rate": 9.934017766506912e-06, + "loss": 4.0945, + "step": 5161 + }, + { + "epoch": 0.4399556805591068, + "grad_norm": 41.37167523783917, + "learning_rate": 9.9339374535533e-06, + "loss": 3.1264, + "step": 5162 + }, + { + "epoch": 0.4400409102531322, + "grad_norm": 97.26734326914388, + "learning_rate": 9.933857092076375e-06, + "loss": 4.5991, + "step": 5163 + }, + { + "epoch": 0.4401261399471576, + "grad_norm": 68.32132619517435, + "learning_rate": 9.933776682076925e-06, + "loss": 4.387, + "step": 5164 + }, + { + "epoch": 0.440211369641183, + "grad_norm": 52.355415788726816, + "learning_rate": 9.933696223555744e-06, + "loss": 4.0057, + "step": 5165 + }, + { + "epoch": 0.44029659933520837, + "grad_norm": 40.36445165739935, + "learning_rate": 9.933615716513619e-06, + "loss": 3.4651, + "step": 5166 + }, + { + "epoch": 0.4403818290292338, + "grad_norm": 84.99527600612231, + "learning_rate": 9.933535160951344e-06, + "loss": 4.2356, + "step": 5167 + }, + { + "epoch": 0.4404670587232592, + "grad_norm": 82.35970736512586, + "learning_rate": 9.933454556869712e-06, + "loss": 3.9808, + "step": 5168 + }, + { + "epoch": 0.44055228841728455, + "grad_norm": 43.540168493751786, + "learning_rate": 9.933373904269515e-06, + "loss": 3.6345, + "step": 5169 + }, + { + "epoch": 0.44063751811130997, + "grad_norm": 66.1850323015313, + "learning_rate": 9.933293203151546e-06, + "loss": 3.5198, + "step": 5170 + }, + { + "epoch": 0.4407227478053354, + "grad_norm": 41.56308952424583, + "learning_rate": 9.933212453516598e-06, + "loss": 3.6761, + "step": 5171 + }, + { + "epoch": 0.4408079774993608, + "grad_norm": 85.31729283805109, + "learning_rate": 9.933131655365466e-06, + "loss": 4.8603, + "step": 5172 + }, + { + "epoch": 0.44089320719338615, + "grad_norm": 73.57969890729287, + "learning_rate": 9.933050808698943e-06, + "loss": 4.5547, + "step": 5173 + }, + { + "epoch": 0.44097843688741156, + "grad_norm": 169.87686696909964, + "learning_rate": 9.932969913517827e-06, + "loss": 3.6927, + "step": 5174 + }, + { + "epoch": 0.441063666581437, + "grad_norm": 66.90973693754472, + "learning_rate": 9.932888969822912e-06, + "loss": 3.8681, + "step": 5175 + }, + { + "epoch": 0.4411488962754624, + "grad_norm": 53.38485818077051, + "learning_rate": 9.932807977614994e-06, + "loss": 3.2294, + "step": 5176 + }, + { + "epoch": 0.44123412596948774, + "grad_norm": 38.345775965322844, + "learning_rate": 9.93272693689487e-06, + "loss": 3.4124, + "step": 5177 + }, + { + "epoch": 0.44131935566351316, + "grad_norm": 65.95255738259877, + "learning_rate": 9.932645847663336e-06, + "loss": 3.3602, + "step": 5178 + }, + { + "epoch": 0.44140458535753857, + "grad_norm": 37.07651930038126, + "learning_rate": 9.93256470992119e-06, + "loss": 3.2562, + "step": 5179 + }, + { + "epoch": 0.441489815051564, + "grad_norm": 62.78972955023585, + "learning_rate": 9.932483523669231e-06, + "loss": 3.3523, + "step": 5180 + }, + { + "epoch": 0.44157504474558934, + "grad_norm": 137.01622030533647, + "learning_rate": 9.932402288908256e-06, + "loss": 3.8454, + "step": 5181 + }, + { + "epoch": 0.44166027443961475, + "grad_norm": 70.88661161925135, + "learning_rate": 9.932321005639065e-06, + "loss": 4.5284, + "step": 5182 + }, + { + "epoch": 0.44174550413364017, + "grad_norm": 36.616453388802114, + "learning_rate": 9.932239673862456e-06, + "loss": 2.8773, + "step": 5183 + }, + { + "epoch": 0.4418307338276656, + "grad_norm": 47.824628628853375, + "learning_rate": 9.93215829357923e-06, + "loss": 4.8789, + "step": 5184 + }, + { + "epoch": 0.44191596352169094, + "grad_norm": 92.66045375012692, + "learning_rate": 9.932076864790187e-06, + "loss": 4.6474, + "step": 5185 + }, + { + "epoch": 0.44200119321571635, + "grad_norm": 104.45710813156239, + "learning_rate": 9.931995387496127e-06, + "loss": 4.8347, + "step": 5186 + }, + { + "epoch": 0.44208642290974176, + "grad_norm": 180.51315637398184, + "learning_rate": 9.931913861697853e-06, + "loss": 4.7602, + "step": 5187 + }, + { + "epoch": 0.4421716526037672, + "grad_norm": 50.65673224502055, + "learning_rate": 9.931832287396166e-06, + "loss": 4.0972, + "step": 5188 + }, + { + "epoch": 0.44225688229779253, + "grad_norm": 70.68060905517046, + "learning_rate": 9.931750664591868e-06, + "loss": 4.2436, + "step": 5189 + }, + { + "epoch": 0.44234211199181794, + "grad_norm": 47.761494904161275, + "learning_rate": 9.931668993285762e-06, + "loss": 3.7317, + "step": 5190 + }, + { + "epoch": 0.44242734168584336, + "grad_norm": 44.50613805871333, + "learning_rate": 9.93158727347865e-06, + "loss": 3.5911, + "step": 5191 + }, + { + "epoch": 0.44251257137986877, + "grad_norm": 33.07637114629151, + "learning_rate": 9.93150550517134e-06, + "loss": 2.9687, + "step": 5192 + }, + { + "epoch": 0.4425978010738941, + "grad_norm": 62.47390234636597, + "learning_rate": 9.93142368836463e-06, + "loss": 3.1371, + "step": 5193 + }, + { + "epoch": 0.44268303076791954, + "grad_norm": 59.30382135778561, + "learning_rate": 9.931341823059328e-06, + "loss": 3.1526, + "step": 5194 + }, + { + "epoch": 0.44276826046194495, + "grad_norm": 42.42968726857259, + "learning_rate": 9.931259909256239e-06, + "loss": 3.5107, + "step": 5195 + }, + { + "epoch": 0.44285349015597036, + "grad_norm": 93.2211339309086, + "learning_rate": 9.931177946956168e-06, + "loss": 4.8027, + "step": 5196 + }, + { + "epoch": 0.4429387198499957, + "grad_norm": 112.5868657235412, + "learning_rate": 9.931095936159923e-06, + "loss": 4.7564, + "step": 5197 + }, + { + "epoch": 0.44302394954402113, + "grad_norm": 48.5909440873632, + "learning_rate": 9.931013876868308e-06, + "loss": 2.1252, + "step": 5198 + }, + { + "epoch": 0.44310917923804655, + "grad_norm": 69.61254549145121, + "learning_rate": 9.930931769082131e-06, + "loss": 4.5462, + "step": 5199 + }, + { + "epoch": 0.44319440893207196, + "grad_norm": 49.78645042418338, + "learning_rate": 9.930849612802198e-06, + "loss": 3.7084, + "step": 5200 + }, + { + "epoch": 0.4432796386260973, + "grad_norm": 75.17983848371847, + "learning_rate": 9.93076740802932e-06, + "loss": 3.4031, + "step": 5201 + }, + { + "epoch": 0.44336486832012273, + "grad_norm": 44.4007913337716, + "learning_rate": 9.930685154764302e-06, + "loss": 3.78, + "step": 5202 + }, + { + "epoch": 0.44345009801414814, + "grad_norm": 36.77332240561474, + "learning_rate": 9.930602853007957e-06, + "loss": 3.4381, + "step": 5203 + }, + { + "epoch": 0.44353532770817355, + "grad_norm": 84.02565563547309, + "learning_rate": 9.930520502761089e-06, + "loss": 4.2626, + "step": 5204 + }, + { + "epoch": 0.4436205574021989, + "grad_norm": 77.38292189016153, + "learning_rate": 9.930438104024514e-06, + "loss": 3.9546, + "step": 5205 + }, + { + "epoch": 0.4437057870962243, + "grad_norm": 160.08022209246613, + "learning_rate": 9.930355656799038e-06, + "loss": 5.2614, + "step": 5206 + }, + { + "epoch": 0.44379101679024974, + "grad_norm": 121.39365693613357, + "learning_rate": 9.930273161085475e-06, + "loss": 4.9418, + "step": 5207 + }, + { + "epoch": 0.4438762464842751, + "grad_norm": 104.02669670023349, + "learning_rate": 9.930190616884634e-06, + "loss": 3.9264, + "step": 5208 + }, + { + "epoch": 0.4439614761783005, + "grad_norm": 73.76818809179983, + "learning_rate": 9.930108024197326e-06, + "loss": 4.3052, + "step": 5209 + }, + { + "epoch": 0.4440467058723259, + "grad_norm": 45.240321891050336, + "learning_rate": 9.930025383024365e-06, + "loss": 4.1158, + "step": 5210 + }, + { + "epoch": 0.44413193556635133, + "grad_norm": 72.21352808844023, + "learning_rate": 9.929942693366564e-06, + "loss": 3.3091, + "step": 5211 + }, + { + "epoch": 0.4442171652603767, + "grad_norm": 39.6492362054561, + "learning_rate": 9.929859955224734e-06, + "loss": 3.9716, + "step": 5212 + }, + { + "epoch": 0.4443023949544021, + "grad_norm": 103.28991481803045, + "learning_rate": 9.929777168599692e-06, + "loss": 5.3973, + "step": 5213 + }, + { + "epoch": 0.4443876246484275, + "grad_norm": 144.13664125616825, + "learning_rate": 9.92969433349225e-06, + "loss": 6.2551, + "step": 5214 + }, + { + "epoch": 0.4444728543424529, + "grad_norm": 47.59889130868684, + "learning_rate": 9.929611449903224e-06, + "loss": 3.8047, + "step": 5215 + }, + { + "epoch": 0.4445580840364783, + "grad_norm": 96.58037603918527, + "learning_rate": 9.929528517833427e-06, + "loss": 4.1268, + "step": 5216 + }, + { + "epoch": 0.4446433137305037, + "grad_norm": 56.40825655269367, + "learning_rate": 9.929445537283674e-06, + "loss": 3.8215, + "step": 5217 + }, + { + "epoch": 0.4447285434245291, + "grad_norm": 99.17721069948617, + "learning_rate": 9.929362508254788e-06, + "loss": 3.7973, + "step": 5218 + }, + { + "epoch": 0.4448137731185545, + "grad_norm": 47.90013575399539, + "learning_rate": 9.929279430747576e-06, + "loss": 2.834, + "step": 5219 + }, + { + "epoch": 0.4448990028125799, + "grad_norm": 98.15661513309688, + "learning_rate": 9.929196304762861e-06, + "loss": 4.3237, + "step": 5220 + }, + { + "epoch": 0.4449842325066053, + "grad_norm": 103.72156840374318, + "learning_rate": 9.92911313030146e-06, + "loss": 4.7913, + "step": 5221 + }, + { + "epoch": 0.4450694622006307, + "grad_norm": 110.68869230827377, + "learning_rate": 9.929029907364189e-06, + "loss": 5.3298, + "step": 5222 + }, + { + "epoch": 0.4451546918946561, + "grad_norm": 74.22968529259985, + "learning_rate": 9.928946635951867e-06, + "loss": 3.5674, + "step": 5223 + }, + { + "epoch": 0.4452399215886815, + "grad_norm": 125.17305818416743, + "learning_rate": 9.928863316065314e-06, + "loss": 4.2651, + "step": 5224 + }, + { + "epoch": 0.4453251512827069, + "grad_norm": 98.15187958793064, + "learning_rate": 9.928779947705349e-06, + "loss": 5.0554, + "step": 5225 + }, + { + "epoch": 0.4454103809767323, + "grad_norm": 165.98087801189234, + "learning_rate": 9.928696530872791e-06, + "loss": 5.3759, + "step": 5226 + }, + { + "epoch": 0.4454956106707577, + "grad_norm": 100.3471704756586, + "learning_rate": 9.928613065568462e-06, + "loss": 4.3186, + "step": 5227 + }, + { + "epoch": 0.44558084036478307, + "grad_norm": 180.83490872291594, + "learning_rate": 9.928529551793182e-06, + "loss": 4.6194, + "step": 5228 + }, + { + "epoch": 0.4456660700588085, + "grad_norm": 50.620213396461644, + "learning_rate": 9.92844598954777e-06, + "loss": 4.1203, + "step": 5229 + }, + { + "epoch": 0.4457512997528339, + "grad_norm": 91.27542912920366, + "learning_rate": 9.928362378833052e-06, + "loss": 3.5942, + "step": 5230 + }, + { + "epoch": 0.4458365294468593, + "grad_norm": 58.09712206078189, + "learning_rate": 9.928278719649849e-06, + "loss": 3.232, + "step": 5231 + }, + { + "epoch": 0.44592175914088467, + "grad_norm": 78.13301732029244, + "learning_rate": 9.928195011998983e-06, + "loss": 4.7116, + "step": 5232 + }, + { + "epoch": 0.4460069888349101, + "grad_norm": 48.90588384158971, + "learning_rate": 9.928111255881275e-06, + "loss": 3.6081, + "step": 5233 + }, + { + "epoch": 0.4460922185289355, + "grad_norm": 86.24096887977132, + "learning_rate": 9.928027451297553e-06, + "loss": 4.9845, + "step": 5234 + }, + { + "epoch": 0.4461774482229609, + "grad_norm": 47.1118220747722, + "learning_rate": 9.92794359824864e-06, + "loss": 3.5938, + "step": 5235 + }, + { + "epoch": 0.44626267791698626, + "grad_norm": 93.10110174387032, + "learning_rate": 9.927859696735358e-06, + "loss": 5.6844, + "step": 5236 + }, + { + "epoch": 0.4463479076110117, + "grad_norm": 92.82663953622583, + "learning_rate": 9.927775746758534e-06, + "loss": 5.0169, + "step": 5237 + }, + { + "epoch": 0.4464331373050371, + "grad_norm": 35.008852773259186, + "learning_rate": 9.927691748318994e-06, + "loss": 3.8153, + "step": 5238 + }, + { + "epoch": 0.4465183669990625, + "grad_norm": 44.10437069016024, + "learning_rate": 9.927607701417566e-06, + "loss": 3.9731, + "step": 5239 + }, + { + "epoch": 0.44660359669308786, + "grad_norm": 95.37638849104356, + "learning_rate": 9.927523606055071e-06, + "loss": 3.9474, + "step": 5240 + }, + { + "epoch": 0.44668882638711327, + "grad_norm": 74.22582100903603, + "learning_rate": 9.927439462232341e-06, + "loss": 4.3044, + "step": 5241 + }, + { + "epoch": 0.4467740560811387, + "grad_norm": 48.181241719973606, + "learning_rate": 9.927355269950201e-06, + "loss": 3.1167, + "step": 5242 + }, + { + "epoch": 0.4468592857751641, + "grad_norm": 78.08535529690542, + "learning_rate": 9.92727102920948e-06, + "loss": 4.2522, + "step": 5243 + }, + { + "epoch": 0.44694451546918945, + "grad_norm": 48.29022400930478, + "learning_rate": 9.927186740011007e-06, + "loss": 3.6707, + "step": 5244 + }, + { + "epoch": 0.44702974516321486, + "grad_norm": 55.82116371984755, + "learning_rate": 9.927102402355611e-06, + "loss": 3.9126, + "step": 5245 + }, + { + "epoch": 0.4471149748572403, + "grad_norm": 77.59819159145445, + "learning_rate": 9.92701801624412e-06, + "loss": 3.6681, + "step": 5246 + }, + { + "epoch": 0.44720020455126563, + "grad_norm": 63.896698059887036, + "learning_rate": 9.926933581677364e-06, + "loss": 4.4068, + "step": 5247 + }, + { + "epoch": 0.44728543424529105, + "grad_norm": 43.27576405106937, + "learning_rate": 9.926849098656176e-06, + "loss": 3.3708, + "step": 5248 + }, + { + "epoch": 0.44737066393931646, + "grad_norm": 55.94353706036335, + "learning_rate": 9.926764567181382e-06, + "loss": 4.117, + "step": 5249 + }, + { + "epoch": 0.4474558936333419, + "grad_norm": 35.68882936195415, + "learning_rate": 9.926679987253817e-06, + "loss": 3.0705, + "step": 5250 + }, + { + "epoch": 0.44754112332736723, + "grad_norm": 103.40197527502175, + "learning_rate": 9.926595358874313e-06, + "loss": 3.9466, + "step": 5251 + }, + { + "epoch": 0.44762635302139264, + "grad_norm": 80.63577526272252, + "learning_rate": 9.9265106820437e-06, + "loss": 4.0267, + "step": 5252 + }, + { + "epoch": 0.44771158271541805, + "grad_norm": 72.80070015828244, + "learning_rate": 9.926425956762813e-06, + "loss": 4.7616, + "step": 5253 + }, + { + "epoch": 0.44779681240944347, + "grad_norm": 43.6727796218, + "learning_rate": 9.926341183032484e-06, + "loss": 3.6407, + "step": 5254 + }, + { + "epoch": 0.4478820421034688, + "grad_norm": 72.54528441020884, + "learning_rate": 9.926256360853549e-06, + "loss": 5.2288, + "step": 5255 + }, + { + "epoch": 0.44796727179749424, + "grad_norm": 171.85419861057431, + "learning_rate": 9.926171490226838e-06, + "loss": 4.9912, + "step": 5256 + }, + { + "epoch": 0.44805250149151965, + "grad_norm": 32.08864374735553, + "learning_rate": 9.926086571153186e-06, + "loss": 3.0035, + "step": 5257 + }, + { + "epoch": 0.44813773118554506, + "grad_norm": 34.36458679576627, + "learning_rate": 9.926001603633433e-06, + "loss": 2.1515, + "step": 5258 + }, + { + "epoch": 0.4482229608795704, + "grad_norm": 87.79925001291632, + "learning_rate": 9.92591658766841e-06, + "loss": 4.9825, + "step": 5259 + }, + { + "epoch": 0.44830819057359583, + "grad_norm": 352.9260658284983, + "learning_rate": 9.925831523258956e-06, + "loss": 4.4657, + "step": 5260 + }, + { + "epoch": 0.44839342026762125, + "grad_norm": 60.57858246934863, + "learning_rate": 9.925746410405906e-06, + "loss": 3.3353, + "step": 5261 + }, + { + "epoch": 0.44847864996164666, + "grad_norm": 141.90608067852077, + "learning_rate": 9.925661249110097e-06, + "loss": 5.3476, + "step": 5262 + }, + { + "epoch": 0.448563879655672, + "grad_norm": 135.12403569353134, + "learning_rate": 9.925576039372366e-06, + "loss": 4.3516, + "step": 5263 + }, + { + "epoch": 0.44864910934969743, + "grad_norm": 58.89742038815525, + "learning_rate": 9.925490781193553e-06, + "loss": 4.4563, + "step": 5264 + }, + { + "epoch": 0.44873433904372284, + "grad_norm": 57.25923497014915, + "learning_rate": 9.925405474574495e-06, + "loss": 4.6448, + "step": 5265 + }, + { + "epoch": 0.44881956873774825, + "grad_norm": 212.30026631845487, + "learning_rate": 9.925320119516031e-06, + "loss": 4.4329, + "step": 5266 + }, + { + "epoch": 0.4489047984317736, + "grad_norm": 81.45128576401699, + "learning_rate": 9.925234716019003e-06, + "loss": 5.1862, + "step": 5267 + }, + { + "epoch": 0.448990028125799, + "grad_norm": 62.279368580981256, + "learning_rate": 9.925149264084246e-06, + "loss": 4.3211, + "step": 5268 + }, + { + "epoch": 0.44907525781982444, + "grad_norm": 104.1423343478214, + "learning_rate": 9.925063763712605e-06, + "loss": 4.5739, + "step": 5269 + }, + { + "epoch": 0.44916048751384985, + "grad_norm": 73.34761382764967, + "learning_rate": 9.924978214904917e-06, + "loss": 4.6283, + "step": 5270 + }, + { + "epoch": 0.4492457172078752, + "grad_norm": 50.55902390344336, + "learning_rate": 9.924892617662028e-06, + "loss": 4.1437, + "step": 5271 + }, + { + "epoch": 0.4493309469019006, + "grad_norm": 54.20586859305451, + "learning_rate": 9.924806971984774e-06, + "loss": 3.8577, + "step": 5272 + }, + { + "epoch": 0.44941617659592603, + "grad_norm": 49.173615694558414, + "learning_rate": 9.924721277874e-06, + "loss": 3.0779, + "step": 5273 + }, + { + "epoch": 0.44950140628995144, + "grad_norm": 68.34984714772246, + "learning_rate": 9.924635535330553e-06, + "loss": 3.9444, + "step": 5274 + }, + { + "epoch": 0.4495866359839768, + "grad_norm": 102.02395619426913, + "learning_rate": 9.92454974435527e-06, + "loss": 5.3277, + "step": 5275 + }, + { + "epoch": 0.4496718656780022, + "grad_norm": 44.830175171703736, + "learning_rate": 9.924463904948996e-06, + "loss": 4.4565, + "step": 5276 + }, + { + "epoch": 0.4497570953720276, + "grad_norm": 48.618199922734604, + "learning_rate": 9.924378017112579e-06, + "loss": 3.2622, + "step": 5277 + }, + { + "epoch": 0.44984232506605304, + "grad_norm": 51.54424839433121, + "learning_rate": 9.924292080846858e-06, + "loss": 4.3654, + "step": 5278 + }, + { + "epoch": 0.4499275547600784, + "grad_norm": 35.77723002992374, + "learning_rate": 9.924206096152683e-06, + "loss": 3.5437, + "step": 5279 + }, + { + "epoch": 0.4500127844541038, + "grad_norm": 48.16830745112146, + "learning_rate": 9.924120063030898e-06, + "loss": 3.7012, + "step": 5280 + }, + { + "epoch": 0.4500980141481292, + "grad_norm": 103.41339612763856, + "learning_rate": 9.924033981482349e-06, + "loss": 4.257, + "step": 5281 + }, + { + "epoch": 0.4501832438421546, + "grad_norm": 63.346636458545284, + "learning_rate": 9.92394785150788e-06, + "loss": 3.5163, + "step": 5282 + }, + { + "epoch": 0.45026847353618, + "grad_norm": 57.29373978540516, + "learning_rate": 9.923861673108342e-06, + "loss": 2.8255, + "step": 5283 + }, + { + "epoch": 0.4503537032302054, + "grad_norm": 69.47874878880356, + "learning_rate": 9.92377544628458e-06, + "loss": 4.8617, + "step": 5284 + }, + { + "epoch": 0.4504389329242308, + "grad_norm": 112.80966589405297, + "learning_rate": 9.923689171037446e-06, + "loss": 5.0883, + "step": 5285 + }, + { + "epoch": 0.4505241626182562, + "grad_norm": 35.88286426524662, + "learning_rate": 9.923602847367783e-06, + "loss": 3.9923, + "step": 5286 + }, + { + "epoch": 0.4506093923122816, + "grad_norm": 92.22952111138765, + "learning_rate": 9.923516475276442e-06, + "loss": 4.2168, + "step": 5287 + }, + { + "epoch": 0.450694622006307, + "grad_norm": 96.08247868993199, + "learning_rate": 9.923430054764274e-06, + "loss": 4.8141, + "step": 5288 + }, + { + "epoch": 0.4507798517003324, + "grad_norm": 76.10267078263605, + "learning_rate": 9.923343585832128e-06, + "loss": 3.9972, + "step": 5289 + }, + { + "epoch": 0.45086508139435777, + "grad_norm": 87.24797657058441, + "learning_rate": 9.923257068480853e-06, + "loss": 3.8542, + "step": 5290 + }, + { + "epoch": 0.4509503110883832, + "grad_norm": 94.93102728650841, + "learning_rate": 9.923170502711303e-06, + "loss": 4.166, + "step": 5291 + }, + { + "epoch": 0.4510355407824086, + "grad_norm": 50.73057434869673, + "learning_rate": 9.923083888524326e-06, + "loss": 4.7497, + "step": 5292 + }, + { + "epoch": 0.451120770476434, + "grad_norm": 89.16452977735723, + "learning_rate": 9.922997225920775e-06, + "loss": 5.0067, + "step": 5293 + }, + { + "epoch": 0.45120600017045936, + "grad_norm": 42.36881771526916, + "learning_rate": 9.922910514901503e-06, + "loss": 3.4668, + "step": 5294 + }, + { + "epoch": 0.4512912298644848, + "grad_norm": 51.28172022570074, + "learning_rate": 9.922823755467361e-06, + "loss": 4.2833, + "step": 5295 + }, + { + "epoch": 0.4513764595585102, + "grad_norm": 53.353957903561984, + "learning_rate": 9.922736947619205e-06, + "loss": 3.6563, + "step": 5296 + }, + { + "epoch": 0.4514616892525356, + "grad_norm": 70.641443774956, + "learning_rate": 9.922650091357888e-06, + "loss": 3.706, + "step": 5297 + }, + { + "epoch": 0.45154691894656096, + "grad_norm": 48.02428104958693, + "learning_rate": 9.922563186684262e-06, + "loss": 3.553, + "step": 5298 + }, + { + "epoch": 0.4516321486405864, + "grad_norm": 54.0317667180489, + "learning_rate": 9.922476233599183e-06, + "loss": 4.6727, + "step": 5299 + }, + { + "epoch": 0.4517173783346118, + "grad_norm": 40.426199256898286, + "learning_rate": 9.922389232103506e-06, + "loss": 3.6583, + "step": 5300 + }, + { + "epoch": 0.4518026080286372, + "grad_norm": 49.13985701710644, + "learning_rate": 9.922302182198088e-06, + "loss": 3.5117, + "step": 5301 + }, + { + "epoch": 0.45188783772266256, + "grad_norm": 34.727179872030106, + "learning_rate": 9.922215083883784e-06, + "loss": 3.5258, + "step": 5302 + }, + { + "epoch": 0.45197306741668797, + "grad_norm": 37.63775691144788, + "learning_rate": 9.92212793716145e-06, + "loss": 4.0493, + "step": 5303 + }, + { + "epoch": 0.4520582971107134, + "grad_norm": 61.916227031149155, + "learning_rate": 9.922040742031943e-06, + "loss": 3.9153, + "step": 5304 + }, + { + "epoch": 0.4521435268047388, + "grad_norm": 109.20366919836272, + "learning_rate": 9.921953498496122e-06, + "loss": 5.6476, + "step": 5305 + }, + { + "epoch": 0.45222875649876415, + "grad_norm": 41.708379026342726, + "learning_rate": 9.921866206554845e-06, + "loss": 3.2656, + "step": 5306 + }, + { + "epoch": 0.45231398619278956, + "grad_norm": 73.2665763950499, + "learning_rate": 9.921778866208967e-06, + "loss": 4.388, + "step": 5307 + }, + { + "epoch": 0.452399215886815, + "grad_norm": 45.57566187969627, + "learning_rate": 9.921691477459352e-06, + "loss": 3.273, + "step": 5308 + }, + { + "epoch": 0.4524844455808404, + "grad_norm": 40.942644884605095, + "learning_rate": 9.921604040306857e-06, + "loss": 3.9442, + "step": 5309 + }, + { + "epoch": 0.45256967527486575, + "grad_norm": 413.2132646531264, + "learning_rate": 9.92151655475234e-06, + "loss": 4.0207, + "step": 5310 + }, + { + "epoch": 0.45265490496889116, + "grad_norm": 79.02650596920557, + "learning_rate": 9.921429020796664e-06, + "loss": 4.467, + "step": 5311 + }, + { + "epoch": 0.45274013466291657, + "grad_norm": 67.06042448487213, + "learning_rate": 9.92134143844069e-06, + "loss": 3.1227, + "step": 5312 + }, + { + "epoch": 0.452825364356942, + "grad_norm": 90.2297215973217, + "learning_rate": 9.921253807685278e-06, + "loss": 4.4506, + "step": 5313 + }, + { + "epoch": 0.45291059405096734, + "grad_norm": 42.80603017182199, + "learning_rate": 9.92116612853129e-06, + "loss": 4.3851, + "step": 5314 + }, + { + "epoch": 0.45299582374499275, + "grad_norm": 97.93359851744943, + "learning_rate": 9.921078400979589e-06, + "loss": 4.6076, + "step": 5315 + }, + { + "epoch": 0.45308105343901817, + "grad_norm": 90.90110597365936, + "learning_rate": 9.920990625031038e-06, + "loss": 4.3724, + "step": 5316 + }, + { + "epoch": 0.4531662831330436, + "grad_norm": 69.18034792472417, + "learning_rate": 9.920902800686497e-06, + "loss": 2.9991, + "step": 5317 + }, + { + "epoch": 0.45325151282706894, + "grad_norm": 35.979287410518566, + "learning_rate": 9.920814927946835e-06, + "loss": 3.4852, + "step": 5318 + }, + { + "epoch": 0.45333674252109435, + "grad_norm": 48.76481282225489, + "learning_rate": 9.920727006812913e-06, + "loss": 3.6005, + "step": 5319 + }, + { + "epoch": 0.45342197221511976, + "grad_norm": 178.19213912442896, + "learning_rate": 9.920639037285595e-06, + "loss": 4.124, + "step": 5320 + }, + { + "epoch": 0.4535072019091451, + "grad_norm": 52.87602794383135, + "learning_rate": 9.920551019365749e-06, + "loss": 3.3952, + "step": 5321 + }, + { + "epoch": 0.45359243160317053, + "grad_norm": 112.6742449167064, + "learning_rate": 9.920462953054237e-06, + "loss": 4.5232, + "step": 5322 + }, + { + "epoch": 0.45367766129719594, + "grad_norm": 58.52678308365079, + "learning_rate": 9.92037483835193e-06, + "loss": 4.0868, + "step": 5323 + }, + { + "epoch": 0.45376289099122136, + "grad_norm": 87.8347810815136, + "learning_rate": 9.920286675259689e-06, + "loss": 5.0104, + "step": 5324 + }, + { + "epoch": 0.4538481206852467, + "grad_norm": 83.24528535669425, + "learning_rate": 9.920198463778384e-06, + "loss": 4.8404, + "step": 5325 + }, + { + "epoch": 0.4539333503792721, + "grad_norm": 37.746339880686314, + "learning_rate": 9.920110203908883e-06, + "loss": 3.7258, + "step": 5326 + }, + { + "epoch": 0.45401858007329754, + "grad_norm": 71.8665705876366, + "learning_rate": 9.920021895652053e-06, + "loss": 5.681, + "step": 5327 + }, + { + "epoch": 0.45410380976732295, + "grad_norm": 68.132039661922, + "learning_rate": 9.919933539008762e-06, + "loss": 2.1953, + "step": 5328 + }, + { + "epoch": 0.4541890394613483, + "grad_norm": 39.49960217396621, + "learning_rate": 9.919845133979881e-06, + "loss": 3.3343, + "step": 5329 + }, + { + "epoch": 0.4542742691553737, + "grad_norm": 35.08030548018815, + "learning_rate": 9.919756680566278e-06, + "loss": 3.7848, + "step": 5330 + }, + { + "epoch": 0.45435949884939913, + "grad_norm": 74.69225797932837, + "learning_rate": 9.919668178768823e-06, + "loss": 3.8733, + "step": 5331 + }, + { + "epoch": 0.45444472854342455, + "grad_norm": 71.75065659063868, + "learning_rate": 9.919579628588385e-06, + "loss": 3.8449, + "step": 5332 + }, + { + "epoch": 0.4545299582374499, + "grad_norm": 103.90365410201572, + "learning_rate": 9.919491030025837e-06, + "loss": 4.8877, + "step": 5333 + }, + { + "epoch": 0.4546151879314753, + "grad_norm": 242.40702524293533, + "learning_rate": 9.919402383082049e-06, + "loss": 4.9809, + "step": 5334 + }, + { + "epoch": 0.45470041762550073, + "grad_norm": 194.56760366290322, + "learning_rate": 9.919313687757894e-06, + "loss": 4.253, + "step": 5335 + }, + { + "epoch": 0.45478564731952614, + "grad_norm": 42.04400024677006, + "learning_rate": 9.919224944054243e-06, + "loss": 3.7265, + "step": 5336 + }, + { + "epoch": 0.4548708770135515, + "grad_norm": 59.192834436595334, + "learning_rate": 9.91913615197197e-06, + "loss": 2.2826, + "step": 5337 + }, + { + "epoch": 0.4549561067075769, + "grad_norm": 49.557585086623206, + "learning_rate": 9.919047311511946e-06, + "loss": 4.7149, + "step": 5338 + }, + { + "epoch": 0.4550413364016023, + "grad_norm": 43.5128635779426, + "learning_rate": 9.91895842267505e-06, + "loss": 4.2198, + "step": 5339 + }, + { + "epoch": 0.45512656609562774, + "grad_norm": 50.37735091157026, + "learning_rate": 9.918869485462147e-06, + "loss": 3.1398, + "step": 5340 + }, + { + "epoch": 0.4552117957896531, + "grad_norm": 32.854118160949255, + "learning_rate": 9.918780499874121e-06, + "loss": 3.0515, + "step": 5341 + }, + { + "epoch": 0.4552970254836785, + "grad_norm": 108.48547445832315, + "learning_rate": 9.918691465911844e-06, + "loss": 3.0046, + "step": 5342 + }, + { + "epoch": 0.4553822551777039, + "grad_norm": 121.6743563766812, + "learning_rate": 9.91860238357619e-06, + "loss": 4.3669, + "step": 5343 + }, + { + "epoch": 0.45546748487172933, + "grad_norm": 64.83713566936528, + "learning_rate": 9.918513252868034e-06, + "loss": 4.3211, + "step": 5344 + }, + { + "epoch": 0.4555527145657547, + "grad_norm": 76.80702495449357, + "learning_rate": 9.918424073788257e-06, + "loss": 4.2212, + "step": 5345 + }, + { + "epoch": 0.4556379442597801, + "grad_norm": 46.198827311943276, + "learning_rate": 9.918334846337734e-06, + "loss": 4.4046, + "step": 5346 + }, + { + "epoch": 0.4557231739538055, + "grad_norm": 38.96499021130072, + "learning_rate": 9.91824557051734e-06, + "loss": 3.4917, + "step": 5347 + }, + { + "epoch": 0.45580840364783093, + "grad_norm": 105.02787686137427, + "learning_rate": 9.918156246327956e-06, + "loss": 4.9734, + "step": 5348 + }, + { + "epoch": 0.4558936333418563, + "grad_norm": 39.31923196870676, + "learning_rate": 9.91806687377046e-06, + "loss": 3.7132, + "step": 5349 + }, + { + "epoch": 0.4559788630358817, + "grad_norm": 123.8317251203376, + "learning_rate": 9.91797745284573e-06, + "loss": 6.121, + "step": 5350 + }, + { + "epoch": 0.4560640927299071, + "grad_norm": 51.14989389597561, + "learning_rate": 9.917887983554646e-06, + "loss": 4.3565, + "step": 5351 + }, + { + "epoch": 0.4561493224239325, + "grad_norm": 71.72965649218298, + "learning_rate": 9.917798465898089e-06, + "loss": 4.0622, + "step": 5352 + }, + { + "epoch": 0.4562345521179579, + "grad_norm": 88.15391789150708, + "learning_rate": 9.917708899876936e-06, + "loss": 5.6491, + "step": 5353 + }, + { + "epoch": 0.4563197818119833, + "grad_norm": 54.71493392250119, + "learning_rate": 9.917619285492072e-06, + "loss": 3.9911, + "step": 5354 + }, + { + "epoch": 0.4564050115060087, + "grad_norm": 118.2614919582031, + "learning_rate": 9.917529622744376e-06, + "loss": 4.5987, + "step": 5355 + }, + { + "epoch": 0.4564902412000341, + "grad_norm": 73.49259838917827, + "learning_rate": 9.91743991163473e-06, + "loss": 5.1651, + "step": 5356 + }, + { + "epoch": 0.4565754708940595, + "grad_norm": 43.11026273211544, + "learning_rate": 9.917350152164017e-06, + "loss": 3.1126, + "step": 5357 + }, + { + "epoch": 0.4566607005880849, + "grad_norm": 36.78723453870108, + "learning_rate": 9.917260344333118e-06, + "loss": 3.1549, + "step": 5358 + }, + { + "epoch": 0.4567459302821103, + "grad_norm": 37.059641253172046, + "learning_rate": 9.91717048814292e-06, + "loss": 3.4164, + "step": 5359 + }, + { + "epoch": 0.45683115997613566, + "grad_norm": 88.75879814945955, + "learning_rate": 9.9170805835943e-06, + "loss": 5.1554, + "step": 5360 + }, + { + "epoch": 0.45691638967016107, + "grad_norm": 53.04892115655237, + "learning_rate": 9.916990630688148e-06, + "loss": 4.5028, + "step": 5361 + }, + { + "epoch": 0.4570016193641865, + "grad_norm": 78.9167809623455, + "learning_rate": 9.916900629425348e-06, + "loss": 4.1654, + "step": 5362 + }, + { + "epoch": 0.4570868490582119, + "grad_norm": 67.55684171721714, + "learning_rate": 9.916810579806784e-06, + "loss": 4.3817, + "step": 5363 + }, + { + "epoch": 0.45717207875223725, + "grad_norm": 53.67772814182023, + "learning_rate": 9.916720481833341e-06, + "loss": 4.9094, + "step": 5364 + }, + { + "epoch": 0.45725730844626267, + "grad_norm": 69.55790981411745, + "learning_rate": 9.916630335505906e-06, + "loss": 4.2769, + "step": 5365 + }, + { + "epoch": 0.4573425381402881, + "grad_norm": 35.7076305329105, + "learning_rate": 9.916540140825365e-06, + "loss": 3.1028, + "step": 5366 + }, + { + "epoch": 0.4574277678343135, + "grad_norm": 42.39209732687025, + "learning_rate": 9.916449897792607e-06, + "loss": 2.8503, + "step": 5367 + }, + { + "epoch": 0.45751299752833885, + "grad_norm": 268.70348594850225, + "learning_rate": 9.916359606408517e-06, + "loss": 5.2206, + "step": 5368 + }, + { + "epoch": 0.45759822722236426, + "grad_norm": 104.04656364511355, + "learning_rate": 9.916269266673984e-06, + "loss": 3.3724, + "step": 5369 + }, + { + "epoch": 0.4576834569163897, + "grad_norm": 128.5324136414362, + "learning_rate": 9.916178878589895e-06, + "loss": 5.0136, + "step": 5370 + }, + { + "epoch": 0.4577686866104151, + "grad_norm": 92.8174729117957, + "learning_rate": 9.916088442157142e-06, + "loss": 4.4635, + "step": 5371 + }, + { + "epoch": 0.45785391630444044, + "grad_norm": 68.57141868141626, + "learning_rate": 9.91599795737661e-06, + "loss": 4.1737, + "step": 5372 + }, + { + "epoch": 0.45793914599846586, + "grad_norm": 39.961189007949436, + "learning_rate": 9.915907424249193e-06, + "loss": 3.7916, + "step": 5373 + }, + { + "epoch": 0.45802437569249127, + "grad_norm": 94.48955059433224, + "learning_rate": 9.915816842775781e-06, + "loss": 4.3522, + "step": 5374 + }, + { + "epoch": 0.4581096053865167, + "grad_norm": 36.795020120266805, + "learning_rate": 9.915726212957263e-06, + "loss": 3.3615, + "step": 5375 + }, + { + "epoch": 0.45819483508054204, + "grad_norm": 49.26257301858201, + "learning_rate": 9.91563553479453e-06, + "loss": 3.7754, + "step": 5376 + }, + { + "epoch": 0.45828006477456745, + "grad_norm": 94.34210572195296, + "learning_rate": 9.915544808288475e-06, + "loss": 4.337, + "step": 5377 + }, + { + "epoch": 0.45836529446859287, + "grad_norm": 44.45053244741022, + "learning_rate": 9.915454033439991e-06, + "loss": 3.4541, + "step": 5378 + }, + { + "epoch": 0.4584505241626183, + "grad_norm": 148.80676435183403, + "learning_rate": 9.915363210249971e-06, + "loss": 3.0168, + "step": 5379 + }, + { + "epoch": 0.45853575385664364, + "grad_norm": 40.32745553838727, + "learning_rate": 9.915272338719304e-06, + "loss": 3.8177, + "step": 5380 + }, + { + "epoch": 0.45862098355066905, + "grad_norm": 45.089561991588596, + "learning_rate": 9.91518141884889e-06, + "loss": 2.9448, + "step": 5381 + }, + { + "epoch": 0.45870621324469446, + "grad_norm": 40.958454722582516, + "learning_rate": 9.915090450639617e-06, + "loss": 2.6619, + "step": 5382 + }, + { + "epoch": 0.4587914429387199, + "grad_norm": 93.2713622130834, + "learning_rate": 9.914999434092383e-06, + "loss": 5.7086, + "step": 5383 + }, + { + "epoch": 0.45887667263274523, + "grad_norm": 32.09333155381636, + "learning_rate": 9.914908369208083e-06, + "loss": 2.5993, + "step": 5384 + }, + { + "epoch": 0.45896190232677064, + "grad_norm": 126.62334141590013, + "learning_rate": 9.914817255987612e-06, + "loss": 4.754, + "step": 5385 + }, + { + "epoch": 0.45904713202079606, + "grad_norm": 64.94259144355219, + "learning_rate": 9.914726094431867e-06, + "loss": 3.767, + "step": 5386 + }, + { + "epoch": 0.45913236171482147, + "grad_norm": 811.3727311454868, + "learning_rate": 9.914634884541743e-06, + "loss": 5.3945, + "step": 5387 + }, + { + "epoch": 0.4592175914088468, + "grad_norm": 98.75146301428609, + "learning_rate": 9.914543626318139e-06, + "loss": 3.9211, + "step": 5388 + }, + { + "epoch": 0.45930282110287224, + "grad_norm": 71.51788528080512, + "learning_rate": 9.91445231976195e-06, + "loss": 3.9537, + "step": 5389 + }, + { + "epoch": 0.45938805079689765, + "grad_norm": 39.56752148763817, + "learning_rate": 9.914360964874075e-06, + "loss": 3.2824, + "step": 5390 + }, + { + "epoch": 0.45947328049092306, + "grad_norm": 87.3932030860152, + "learning_rate": 9.914269561655412e-06, + "loss": 5.3888, + "step": 5391 + }, + { + "epoch": 0.4595585101849484, + "grad_norm": 58.44566147512316, + "learning_rate": 9.914178110106862e-06, + "loss": 4.0685, + "step": 5392 + }, + { + "epoch": 0.45964373987897383, + "grad_norm": 42.055327217408845, + "learning_rate": 9.914086610229324e-06, + "loss": 3.9519, + "step": 5393 + }, + { + "epoch": 0.45972896957299925, + "grad_norm": 94.83773435919004, + "learning_rate": 9.913995062023696e-06, + "loss": 3.9278, + "step": 5394 + }, + { + "epoch": 0.4598141992670246, + "grad_norm": 126.57847522227978, + "learning_rate": 9.913903465490878e-06, + "loss": 3.7006, + "step": 5395 + }, + { + "epoch": 0.45989942896105, + "grad_norm": 39.11257742798402, + "learning_rate": 9.913811820631774e-06, + "loss": 3.7263, + "step": 5396 + }, + { + "epoch": 0.45998465865507543, + "grad_norm": 77.24331920228417, + "learning_rate": 9.913720127447282e-06, + "loss": 3.3255, + "step": 5397 + }, + { + "epoch": 0.46006988834910084, + "grad_norm": 89.80060363482693, + "learning_rate": 9.913628385938305e-06, + "loss": 4.3121, + "step": 5398 + }, + { + "epoch": 0.4601551180431262, + "grad_norm": 92.8633284339389, + "learning_rate": 9.913536596105745e-06, + "loss": 6.2414, + "step": 5399 + }, + { + "epoch": 0.4602403477371516, + "grad_norm": 41.34487883995613, + "learning_rate": 9.913444757950505e-06, + "loss": 3.8151, + "step": 5400 + }, + { + "epoch": 0.460325577431177, + "grad_norm": 42.35568126588218, + "learning_rate": 9.91335287147349e-06, + "loss": 3.2485, + "step": 5401 + }, + { + "epoch": 0.46041080712520244, + "grad_norm": 56.532984817025174, + "learning_rate": 9.913260936675599e-06, + "loss": 4.1618, + "step": 5402 + }, + { + "epoch": 0.4604960368192278, + "grad_norm": 106.2407879900519, + "learning_rate": 9.913168953557741e-06, + "loss": 6.0322, + "step": 5403 + }, + { + "epoch": 0.4605812665132532, + "grad_norm": 38.02149534073561, + "learning_rate": 9.913076922120818e-06, + "loss": 2.9445, + "step": 5404 + }, + { + "epoch": 0.4606664962072786, + "grad_norm": 269.11607382856533, + "learning_rate": 9.912984842365737e-06, + "loss": 5.4929, + "step": 5405 + }, + { + "epoch": 0.46075172590130403, + "grad_norm": 51.18647336084211, + "learning_rate": 9.912892714293402e-06, + "loss": 3.837, + "step": 5406 + }, + { + "epoch": 0.4608369555953294, + "grad_norm": 91.4201438379182, + "learning_rate": 9.912800537904718e-06, + "loss": 4.094, + "step": 5407 + }, + { + "epoch": 0.4609221852893548, + "grad_norm": 53.185339548667265, + "learning_rate": 9.912708313200594e-06, + "loss": 4.2244, + "step": 5408 + }, + { + "epoch": 0.4610074149833802, + "grad_norm": 39.970406113759985, + "learning_rate": 9.912616040181935e-06, + "loss": 2.6445, + "step": 5409 + }, + { + "epoch": 0.4610926446774056, + "grad_norm": 46.877264980672365, + "learning_rate": 9.91252371884965e-06, + "loss": 4.1873, + "step": 5410 + }, + { + "epoch": 0.461177874371431, + "grad_norm": 68.22869585126358, + "learning_rate": 9.912431349204647e-06, + "loss": 3.7717, + "step": 5411 + }, + { + "epoch": 0.4612631040654564, + "grad_norm": 115.0199685381399, + "learning_rate": 9.912338931247833e-06, + "loss": 4.7646, + "step": 5412 + }, + { + "epoch": 0.4613483337594818, + "grad_norm": 106.41498069495962, + "learning_rate": 9.912246464980116e-06, + "loss": 5.2053, + "step": 5413 + }, + { + "epoch": 0.4614335634535072, + "grad_norm": 58.6504054254884, + "learning_rate": 9.912153950402408e-06, + "loss": 3.4234, + "step": 5414 + }, + { + "epoch": 0.4615187931475326, + "grad_norm": 88.0238555574711, + "learning_rate": 9.91206138751562e-06, + "loss": 3.7635, + "step": 5415 + }, + { + "epoch": 0.461604022841558, + "grad_norm": 70.54348733493006, + "learning_rate": 9.911968776320657e-06, + "loss": 3.7903, + "step": 5416 + }, + { + "epoch": 0.4616892525355834, + "grad_norm": 69.27301405692914, + "learning_rate": 9.911876116818434e-06, + "loss": 3.3574, + "step": 5417 + }, + { + "epoch": 0.4617744822296088, + "grad_norm": 47.50688120405187, + "learning_rate": 9.911783409009861e-06, + "loss": 2.8628, + "step": 5418 + }, + { + "epoch": 0.4618597119236342, + "grad_norm": 73.3008881903712, + "learning_rate": 9.911690652895852e-06, + "loss": 4.6549, + "step": 5419 + }, + { + "epoch": 0.4619449416176596, + "grad_norm": 44.17114216662143, + "learning_rate": 9.911597848477315e-06, + "loss": 3.1868, + "step": 5420 + }, + { + "epoch": 0.462030171311685, + "grad_norm": 101.2590879934965, + "learning_rate": 9.911504995755164e-06, + "loss": 3.7798, + "step": 5421 + }, + { + "epoch": 0.4621154010057104, + "grad_norm": 49.465102386182544, + "learning_rate": 9.911412094730316e-06, + "loss": 3.6045, + "step": 5422 + }, + { + "epoch": 0.46220063069973577, + "grad_norm": 57.63868709055846, + "learning_rate": 9.911319145403678e-06, + "loss": 3.5027, + "step": 5423 + }, + { + "epoch": 0.4622858603937612, + "grad_norm": 97.2089923132107, + "learning_rate": 9.91122614777617e-06, + "loss": 5.0482, + "step": 5424 + }, + { + "epoch": 0.4623710900877866, + "grad_norm": 40.77485879519758, + "learning_rate": 9.911133101848705e-06, + "loss": 4.4118, + "step": 5425 + }, + { + "epoch": 0.462456319781812, + "grad_norm": 56.23896896808773, + "learning_rate": 9.911040007622195e-06, + "loss": 3.7006, + "step": 5426 + }, + { + "epoch": 0.46254154947583737, + "grad_norm": 61.555874146112586, + "learning_rate": 9.910946865097559e-06, + "loss": 4.7457, + "step": 5427 + }, + { + "epoch": 0.4626267791698628, + "grad_norm": 187.39766210510922, + "learning_rate": 9.910853674275714e-06, + "loss": 4.644, + "step": 5428 + }, + { + "epoch": 0.4627120088638882, + "grad_norm": 49.07084671332705, + "learning_rate": 9.910760435157572e-06, + "loss": 3.2598, + "step": 5429 + }, + { + "epoch": 0.4627972385579136, + "grad_norm": 52.45547019701292, + "learning_rate": 9.910667147744053e-06, + "loss": 3.7244, + "step": 5430 + }, + { + "epoch": 0.46288246825193896, + "grad_norm": 54.65277098548531, + "learning_rate": 9.910573812036076e-06, + "loss": 4.0588, + "step": 5431 + }, + { + "epoch": 0.4629676979459644, + "grad_norm": 112.2499046001319, + "learning_rate": 9.910480428034554e-06, + "loss": 5.0095, + "step": 5432 + }, + { + "epoch": 0.4630529276399898, + "grad_norm": 77.93091623981391, + "learning_rate": 9.910386995740409e-06, + "loss": 3.5871, + "step": 5433 + }, + { + "epoch": 0.46313815733401514, + "grad_norm": 34.206253596876664, + "learning_rate": 9.910293515154559e-06, + "loss": 3.3106, + "step": 5434 + }, + { + "epoch": 0.46322338702804056, + "grad_norm": 37.26927588833703, + "learning_rate": 9.910199986277924e-06, + "loss": 3.8907, + "step": 5435 + }, + { + "epoch": 0.46330861672206597, + "grad_norm": 51.57035290417991, + "learning_rate": 9.910106409111425e-06, + "loss": 3.3523, + "step": 5436 + }, + { + "epoch": 0.4633938464160914, + "grad_norm": 52.05222438428845, + "learning_rate": 9.910012783655978e-06, + "loss": 4.7586, + "step": 5437 + }, + { + "epoch": 0.46347907611011674, + "grad_norm": 45.48788659235725, + "learning_rate": 9.909919109912506e-06, + "loss": 3.2789, + "step": 5438 + }, + { + "epoch": 0.46356430580414215, + "grad_norm": 89.87235838985633, + "learning_rate": 9.909825387881931e-06, + "loss": 4.178, + "step": 5439 + }, + { + "epoch": 0.46364953549816756, + "grad_norm": 99.4665698138857, + "learning_rate": 9.909731617565175e-06, + "loss": 4.3676, + "step": 5440 + }, + { + "epoch": 0.463734765192193, + "grad_norm": 106.08391577924769, + "learning_rate": 9.90963779896316e-06, + "loss": 4.352, + "step": 5441 + }, + { + "epoch": 0.46381999488621833, + "grad_norm": 63.24561016438307, + "learning_rate": 9.909543932076808e-06, + "loss": 4.4645, + "step": 5442 + }, + { + "epoch": 0.46390522458024375, + "grad_norm": 100.94499904009325, + "learning_rate": 9.90945001690704e-06, + "loss": 5.067, + "step": 5443 + }, + { + "epoch": 0.46399045427426916, + "grad_norm": 35.67424939136554, + "learning_rate": 9.909356053454786e-06, + "loss": 3.4345, + "step": 5444 + }, + { + "epoch": 0.4640756839682946, + "grad_norm": 72.24830336706526, + "learning_rate": 9.909262041720963e-06, + "loss": 4.269, + "step": 5445 + }, + { + "epoch": 0.46416091366231993, + "grad_norm": 88.37369195580786, + "learning_rate": 9.9091679817065e-06, + "loss": 3.9571, + "step": 5446 + }, + { + "epoch": 0.46424614335634534, + "grad_norm": 62.11794961068512, + "learning_rate": 9.909073873412319e-06, + "loss": 3.7886, + "step": 5447 + }, + { + "epoch": 0.46433137305037075, + "grad_norm": 75.88616175228094, + "learning_rate": 9.908979716839347e-06, + "loss": 4.0806, + "step": 5448 + }, + { + "epoch": 0.46441660274439617, + "grad_norm": 61.46422680447865, + "learning_rate": 9.908885511988513e-06, + "loss": 3.826, + "step": 5449 + }, + { + "epoch": 0.4645018324384215, + "grad_norm": 103.93520163804455, + "learning_rate": 9.908791258860737e-06, + "loss": 3.5849, + "step": 5450 + }, + { + "epoch": 0.46458706213244694, + "grad_norm": 58.840227141009166, + "learning_rate": 9.908696957456951e-06, + "loss": 3.7461, + "step": 5451 + }, + { + "epoch": 0.46467229182647235, + "grad_norm": 102.42588195819579, + "learning_rate": 9.908602607778082e-06, + "loss": 4.8665, + "step": 5452 + }, + { + "epoch": 0.46475752152049776, + "grad_norm": 224.58309490044599, + "learning_rate": 9.908508209825056e-06, + "loss": 4.0006, + "step": 5453 + }, + { + "epoch": 0.4648427512145231, + "grad_norm": 78.18953390814711, + "learning_rate": 9.908413763598801e-06, + "loss": 3.8384, + "step": 5454 + }, + { + "epoch": 0.46492798090854853, + "grad_norm": 66.61486970359509, + "learning_rate": 9.90831926910025e-06, + "loss": 3.4943, + "step": 5455 + }, + { + "epoch": 0.46501321060257395, + "grad_norm": 64.27755276358351, + "learning_rate": 9.908224726330327e-06, + "loss": 3.9735, + "step": 5456 + }, + { + "epoch": 0.46509844029659936, + "grad_norm": 79.64319864974505, + "learning_rate": 9.908130135289964e-06, + "loss": 3.6719, + "step": 5457 + }, + { + "epoch": 0.4651836699906247, + "grad_norm": 63.003147949303354, + "learning_rate": 9.908035495980093e-06, + "loss": 4.3538, + "step": 5458 + }, + { + "epoch": 0.46526889968465013, + "grad_norm": 40.382290929729265, + "learning_rate": 9.907940808401641e-06, + "loss": 4.343, + "step": 5459 + }, + { + "epoch": 0.46535412937867554, + "grad_norm": 36.9105234128018, + "learning_rate": 9.907846072555544e-06, + "loss": 3.1187, + "step": 5460 + }, + { + "epoch": 0.46543935907270095, + "grad_norm": 62.02282742762238, + "learning_rate": 9.90775128844273e-06, + "loss": 3.6789, + "step": 5461 + }, + { + "epoch": 0.4655245887667263, + "grad_norm": 50.54630392083353, + "learning_rate": 9.907656456064133e-06, + "loss": 4.0724, + "step": 5462 + }, + { + "epoch": 0.4656098184607517, + "grad_norm": 94.32642980089102, + "learning_rate": 9.907561575420684e-06, + "loss": 3.3754, + "step": 5463 + }, + { + "epoch": 0.46569504815477714, + "grad_norm": 39.907103285253235, + "learning_rate": 9.907466646513315e-06, + "loss": 4.0619, + "step": 5464 + }, + { + "epoch": 0.46578027784880255, + "grad_norm": 153.10907587159582, + "learning_rate": 9.907371669342965e-06, + "loss": 4.5345, + "step": 5465 + }, + { + "epoch": 0.4658655075428279, + "grad_norm": 47.26926794041255, + "learning_rate": 9.907276643910562e-06, + "loss": 3.6628, + "step": 5466 + }, + { + "epoch": 0.4659507372368533, + "grad_norm": 139.48832114677558, + "learning_rate": 9.907181570217044e-06, + "loss": 5.321, + "step": 5467 + }, + { + "epoch": 0.46603596693087873, + "grad_norm": 49.874696357903005, + "learning_rate": 9.907086448263346e-06, + "loss": 3.2103, + "step": 5468 + }, + { + "epoch": 0.46612119662490414, + "grad_norm": 65.69827980981927, + "learning_rate": 9.906991278050402e-06, + "loss": 4.8705, + "step": 5469 + }, + { + "epoch": 0.4662064263189295, + "grad_norm": 29.423397126679255, + "learning_rate": 9.906896059579148e-06, + "loss": 3.38, + "step": 5470 + }, + { + "epoch": 0.4662916560129549, + "grad_norm": 128.99868461540402, + "learning_rate": 9.906800792850521e-06, + "loss": 5.6084, + "step": 5471 + }, + { + "epoch": 0.4663768857069803, + "grad_norm": 53.02408629829051, + "learning_rate": 9.90670547786546e-06, + "loss": 3.0875, + "step": 5472 + }, + { + "epoch": 0.4664621154010057, + "grad_norm": 36.18653439868689, + "learning_rate": 9.9066101146249e-06, + "loss": 4.0283, + "step": 5473 + }, + { + "epoch": 0.4665473450950311, + "grad_norm": 149.40644174994236, + "learning_rate": 9.906514703129777e-06, + "loss": 4.9543, + "step": 5474 + }, + { + "epoch": 0.4666325747890565, + "grad_norm": 32.84979964650631, + "learning_rate": 9.906419243381034e-06, + "loss": 2.834, + "step": 5475 + }, + { + "epoch": 0.4667178044830819, + "grad_norm": 51.813728595799624, + "learning_rate": 9.906323735379605e-06, + "loss": 3.3724, + "step": 5476 + }, + { + "epoch": 0.4668030341771073, + "grad_norm": 79.25392587699328, + "learning_rate": 9.906228179126433e-06, + "loss": 4.7742, + "step": 5477 + }, + { + "epoch": 0.4668882638711327, + "grad_norm": 73.1080234489965, + "learning_rate": 9.906132574622456e-06, + "loss": 4.2812, + "step": 5478 + }, + { + "epoch": 0.4669734935651581, + "grad_norm": 76.76810264790464, + "learning_rate": 9.906036921868616e-06, + "loss": 3.8737, + "step": 5479 + }, + { + "epoch": 0.4670587232591835, + "grad_norm": 50.5357207134529, + "learning_rate": 9.905941220865852e-06, + "loss": 4.9098, + "step": 5480 + }, + { + "epoch": 0.4671439529532089, + "grad_norm": 111.81282019058204, + "learning_rate": 9.905845471615103e-06, + "loss": 5.4615, + "step": 5481 + }, + { + "epoch": 0.4672291826472343, + "grad_norm": 200.68569754706382, + "learning_rate": 9.905749674117315e-06, + "loss": 4.716, + "step": 5482 + }, + { + "epoch": 0.4673144123412597, + "grad_norm": 49.561695403721096, + "learning_rate": 9.905653828373429e-06, + "loss": 3.174, + "step": 5483 + }, + { + "epoch": 0.4673996420352851, + "grad_norm": 63.922183940939014, + "learning_rate": 9.905557934384387e-06, + "loss": 5.3193, + "step": 5484 + }, + { + "epoch": 0.46748487172931047, + "grad_norm": 51.64533923702252, + "learning_rate": 9.90546199215113e-06, + "loss": 4.3132, + "step": 5485 + }, + { + "epoch": 0.4675701014233359, + "grad_norm": 94.06603277639769, + "learning_rate": 9.905366001674605e-06, + "loss": 5.2037, + "step": 5486 + }, + { + "epoch": 0.4676553311173613, + "grad_norm": 66.20276350673281, + "learning_rate": 9.905269962955755e-06, + "loss": 4.3689, + "step": 5487 + }, + { + "epoch": 0.4677405608113867, + "grad_norm": 43.01161262978955, + "learning_rate": 9.905173875995522e-06, + "loss": 4.1471, + "step": 5488 + }, + { + "epoch": 0.46782579050541206, + "grad_norm": 37.096424898722425, + "learning_rate": 9.905077740794854e-06, + "loss": 3.4171, + "step": 5489 + }, + { + "epoch": 0.4679110201994375, + "grad_norm": 85.20644917858533, + "learning_rate": 9.904981557354697e-06, + "loss": 4.3909, + "step": 5490 + }, + { + "epoch": 0.4679962498934629, + "grad_norm": 32.91228671985774, + "learning_rate": 9.904885325675994e-06, + "loss": 2.9114, + "step": 5491 + }, + { + "epoch": 0.4680814795874883, + "grad_norm": 96.03795709951869, + "learning_rate": 9.904789045759693e-06, + "loss": 5.0643, + "step": 5492 + }, + { + "epoch": 0.46816670928151366, + "grad_norm": 349.8442630509923, + "learning_rate": 9.904692717606742e-06, + "loss": 2.8614, + "step": 5493 + }, + { + "epoch": 0.4682519389755391, + "grad_norm": 42.392813058470814, + "learning_rate": 9.904596341218084e-06, + "loss": 3.7694, + "step": 5494 + }, + { + "epoch": 0.4683371686695645, + "grad_norm": 154.51917571606143, + "learning_rate": 9.904499916594673e-06, + "loss": 3.8852, + "step": 5495 + }, + { + "epoch": 0.4684223983635899, + "grad_norm": 102.03101804378976, + "learning_rate": 9.904403443737453e-06, + "loss": 4.038, + "step": 5496 + }, + { + "epoch": 0.46850762805761526, + "grad_norm": 74.14883678597171, + "learning_rate": 9.904306922647373e-06, + "loss": 3.9295, + "step": 5497 + }, + { + "epoch": 0.46859285775164067, + "grad_norm": 63.95594366556717, + "learning_rate": 9.904210353325386e-06, + "loss": 4.5829, + "step": 5498 + }, + { + "epoch": 0.4686780874456661, + "grad_norm": 93.2195261846026, + "learning_rate": 9.904113735772437e-06, + "loss": 4.7826, + "step": 5499 + }, + { + "epoch": 0.4687633171396915, + "grad_norm": 160.48443551312528, + "learning_rate": 9.904017069989477e-06, + "loss": 4.241, + "step": 5500 + }, + { + "epoch": 0.46884854683371685, + "grad_norm": 108.86501474453053, + "learning_rate": 9.90392035597746e-06, + "loss": 4.8401, + "step": 5501 + }, + { + "epoch": 0.46893377652774226, + "grad_norm": 40.28468081787664, + "learning_rate": 9.903823593737334e-06, + "loss": 3.4433, + "step": 5502 + }, + { + "epoch": 0.4690190062217677, + "grad_norm": 82.24517759841018, + "learning_rate": 9.903726783270051e-06, + "loss": 4.7214, + "step": 5503 + }, + { + "epoch": 0.4691042359157931, + "grad_norm": 65.86071138997303, + "learning_rate": 9.903629924576563e-06, + "loss": 4.484, + "step": 5504 + }, + { + "epoch": 0.46918946560981845, + "grad_norm": 49.78679924125653, + "learning_rate": 9.903533017657824e-06, + "loss": 5.1133, + "step": 5505 + }, + { + "epoch": 0.46927469530384386, + "grad_norm": 56.77269198044236, + "learning_rate": 9.903436062514788e-06, + "loss": 3.6343, + "step": 5506 + }, + { + "epoch": 0.46935992499786927, + "grad_norm": 94.1748847511129, + "learning_rate": 9.903339059148404e-06, + "loss": 3.6448, + "step": 5507 + }, + { + "epoch": 0.4694451546918947, + "grad_norm": 58.34422917030352, + "learning_rate": 9.90324200755963e-06, + "loss": 4.757, + "step": 5508 + }, + { + "epoch": 0.46953038438592004, + "grad_norm": 54.24974135795656, + "learning_rate": 9.90314490774942e-06, + "loss": 4.3722, + "step": 5509 + }, + { + "epoch": 0.46961561407994545, + "grad_norm": 42.38779684238181, + "learning_rate": 9.903047759718726e-06, + "loss": 4.1364, + "step": 5510 + }, + { + "epoch": 0.46970084377397087, + "grad_norm": 64.31972220411316, + "learning_rate": 9.902950563468507e-06, + "loss": 4.043, + "step": 5511 + }, + { + "epoch": 0.4697860734679962, + "grad_norm": 38.21843336726917, + "learning_rate": 9.902853318999719e-06, + "loss": 2.8185, + "step": 5512 + }, + { + "epoch": 0.46987130316202164, + "grad_norm": 129.4067535612076, + "learning_rate": 9.902756026313315e-06, + "loss": 6.3987, + "step": 5513 + }, + { + "epoch": 0.46995653285604705, + "grad_norm": 38.70590047661476, + "learning_rate": 9.902658685410253e-06, + "loss": 3.5072, + "step": 5514 + }, + { + "epoch": 0.47004176255007246, + "grad_norm": 50.94209470944253, + "learning_rate": 9.902561296291494e-06, + "loss": 3.9567, + "step": 5515 + }, + { + "epoch": 0.4701269922440978, + "grad_norm": 45.461748131064255, + "learning_rate": 9.90246385895799e-06, + "loss": 4.911, + "step": 5516 + }, + { + "epoch": 0.47021222193812323, + "grad_norm": 47.912196061121364, + "learning_rate": 9.902366373410701e-06, + "loss": 3.8823, + "step": 5517 + }, + { + "epoch": 0.47029745163214864, + "grad_norm": 108.34953571046101, + "learning_rate": 9.902268839650588e-06, + "loss": 5.0557, + "step": 5518 + }, + { + "epoch": 0.47038268132617406, + "grad_norm": 72.09540289406151, + "learning_rate": 9.90217125767861e-06, + "loss": 4.8487, + "step": 5519 + }, + { + "epoch": 0.4704679110201994, + "grad_norm": 169.02198021152296, + "learning_rate": 9.902073627495725e-06, + "loss": 5.6253, + "step": 5520 + }, + { + "epoch": 0.4705531407142248, + "grad_norm": 46.349744045104764, + "learning_rate": 9.901975949102893e-06, + "loss": 3.8058, + "step": 5521 + }, + { + "epoch": 0.47063837040825024, + "grad_norm": 143.1106917036931, + "learning_rate": 9.901878222501077e-06, + "loss": 3.2168, + "step": 5522 + }, + { + "epoch": 0.47072360010227565, + "grad_norm": 134.9156517960304, + "learning_rate": 9.901780447691235e-06, + "loss": 4.7165, + "step": 5523 + }, + { + "epoch": 0.470808829796301, + "grad_norm": 116.82574755820042, + "learning_rate": 9.90168262467433e-06, + "loss": 3.9976, + "step": 5524 + }, + { + "epoch": 0.4708940594903264, + "grad_norm": 116.49997389548109, + "learning_rate": 9.901584753451326e-06, + "loss": 5.2809, + "step": 5525 + }, + { + "epoch": 0.47097928918435183, + "grad_norm": 132.89008696745464, + "learning_rate": 9.901486834023182e-06, + "loss": 3.235, + "step": 5526 + }, + { + "epoch": 0.47106451887837725, + "grad_norm": 73.72207004022786, + "learning_rate": 9.901388866390862e-06, + "loss": 3.7006, + "step": 5527 + }, + { + "epoch": 0.4711497485724026, + "grad_norm": 229.67168100505307, + "learning_rate": 9.901290850555333e-06, + "loss": 4.4033, + "step": 5528 + }, + { + "epoch": 0.471234978266428, + "grad_norm": 40.05934861699214, + "learning_rate": 9.901192786517553e-06, + "loss": 3.379, + "step": 5529 + }, + { + "epoch": 0.47132020796045343, + "grad_norm": 39.146117263877485, + "learning_rate": 9.901094674278492e-06, + "loss": 3.4538, + "step": 5530 + }, + { + "epoch": 0.47140543765447884, + "grad_norm": 117.1929220049283, + "learning_rate": 9.90099651383911e-06, + "loss": 4.5906, + "step": 5531 + }, + { + "epoch": 0.4714906673485042, + "grad_norm": 198.72546029283347, + "learning_rate": 9.900898305200375e-06, + "loss": 2.6992, + "step": 5532 + }, + { + "epoch": 0.4715758970425296, + "grad_norm": 112.64500124673394, + "learning_rate": 9.900800048363252e-06, + "loss": 5.0751, + "step": 5533 + }, + { + "epoch": 0.471661126736555, + "grad_norm": 72.41311089353285, + "learning_rate": 9.90070174332871e-06, + "loss": 3.3801, + "step": 5534 + }, + { + "epoch": 0.47174635643058044, + "grad_norm": 47.72961462060569, + "learning_rate": 9.900603390097713e-06, + "loss": 4.8012, + "step": 5535 + }, + { + "epoch": 0.4718315861246058, + "grad_norm": 37.731542998377606, + "learning_rate": 9.900504988671229e-06, + "loss": 2.8415, + "step": 5536 + }, + { + "epoch": 0.4719168158186312, + "grad_norm": 39.670606554631796, + "learning_rate": 9.900406539050227e-06, + "loss": 3.1164, + "step": 5537 + }, + { + "epoch": 0.4720020455126566, + "grad_norm": 39.73774963583077, + "learning_rate": 9.900308041235672e-06, + "loss": 3.883, + "step": 5538 + }, + { + "epoch": 0.47208727520668203, + "grad_norm": 368.1892216190213, + "learning_rate": 9.900209495228534e-06, + "loss": 4.817, + "step": 5539 + }, + { + "epoch": 0.4721725049007074, + "grad_norm": 58.73622429423502, + "learning_rate": 9.900110901029784e-06, + "loss": 3.7479, + "step": 5540 + }, + { + "epoch": 0.4722577345947328, + "grad_norm": 89.17236378808454, + "learning_rate": 9.900012258640392e-06, + "loss": 4.3163, + "step": 5541 + }, + { + "epoch": 0.4723429642887582, + "grad_norm": 214.1686507678759, + "learning_rate": 9.899913568061325e-06, + "loss": 6.3485, + "step": 5542 + }, + { + "epoch": 0.47242819398278363, + "grad_norm": 156.03009434732877, + "learning_rate": 9.899814829293554e-06, + "loss": 5.6953, + "step": 5543 + }, + { + "epoch": 0.472513423676809, + "grad_norm": 84.52846348252871, + "learning_rate": 9.899716042338052e-06, + "loss": 4.4997, + "step": 5544 + }, + { + "epoch": 0.4725986533708344, + "grad_norm": 86.06991111457577, + "learning_rate": 9.899617207195791e-06, + "loss": 4.1874, + "step": 5545 + }, + { + "epoch": 0.4726838830648598, + "grad_norm": 72.27952794009879, + "learning_rate": 9.89951832386774e-06, + "loss": 3.6694, + "step": 5546 + }, + { + "epoch": 0.47276911275888517, + "grad_norm": 49.048597453424655, + "learning_rate": 9.899419392354873e-06, + "loss": 3.5673, + "step": 5547 + }, + { + "epoch": 0.4728543424529106, + "grad_norm": 74.49888534913066, + "learning_rate": 9.899320412658165e-06, + "loss": 3.63, + "step": 5548 + }, + { + "epoch": 0.472939572146936, + "grad_norm": 63.42386522345104, + "learning_rate": 9.899221384778586e-06, + "loss": 4.1497, + "step": 5549 + }, + { + "epoch": 0.4730248018409614, + "grad_norm": 33.94581271991376, + "learning_rate": 9.89912230871711e-06, + "loss": 3.1791, + "step": 5550 + }, + { + "epoch": 0.47311003153498676, + "grad_norm": 51.573271401547316, + "learning_rate": 9.899023184474715e-06, + "loss": 4.6591, + "step": 5551 + }, + { + "epoch": 0.4731952612290122, + "grad_norm": 105.79464093469348, + "learning_rate": 9.898924012052373e-06, + "loss": 3.4024, + "step": 5552 + }, + { + "epoch": 0.4732804909230376, + "grad_norm": 87.8258915961922, + "learning_rate": 9.89882479145106e-06, + "loss": 3.2285, + "step": 5553 + }, + { + "epoch": 0.473365720617063, + "grad_norm": 125.37821485748383, + "learning_rate": 9.898725522671752e-06, + "loss": 6.3184, + "step": 5554 + }, + { + "epoch": 0.47345095031108836, + "grad_norm": 40.059611234538586, + "learning_rate": 9.898626205715426e-06, + "loss": 3.1533, + "step": 5555 + }, + { + "epoch": 0.47353618000511377, + "grad_norm": 50.81252187287443, + "learning_rate": 9.898526840583055e-06, + "loss": 4.2789, + "step": 5556 + }, + { + "epoch": 0.4736214096991392, + "grad_norm": 35.39663980574267, + "learning_rate": 9.898427427275623e-06, + "loss": 3.2915, + "step": 5557 + }, + { + "epoch": 0.4737066393931646, + "grad_norm": 64.90672449538432, + "learning_rate": 9.8983279657941e-06, + "loss": 4.1268, + "step": 5558 + }, + { + "epoch": 0.47379186908718995, + "grad_norm": 33.80551343444474, + "learning_rate": 9.898228456139471e-06, + "loss": 2.4551, + "step": 5559 + }, + { + "epoch": 0.47387709878121537, + "grad_norm": 64.33323658865255, + "learning_rate": 9.898128898312709e-06, + "loss": 4.5215, + "step": 5560 + }, + { + "epoch": 0.4739623284752408, + "grad_norm": 44.8963658981517, + "learning_rate": 9.898029292314797e-06, + "loss": 3.4717, + "step": 5561 + }, + { + "epoch": 0.4740475581692662, + "grad_norm": 36.417870505463284, + "learning_rate": 9.897929638146713e-06, + "loss": 3.695, + "step": 5562 + }, + { + "epoch": 0.47413278786329155, + "grad_norm": 67.65445674925873, + "learning_rate": 9.897829935809435e-06, + "loss": 3.8716, + "step": 5563 + }, + { + "epoch": 0.47421801755731696, + "grad_norm": 48.34288961660753, + "learning_rate": 9.897730185303948e-06, + "loss": 3.3605, + "step": 5564 + }, + { + "epoch": 0.4743032472513424, + "grad_norm": 36.43002344574473, + "learning_rate": 9.897630386631232e-06, + "loss": 3.1536, + "step": 5565 + }, + { + "epoch": 0.4743884769453678, + "grad_norm": 125.28395085278434, + "learning_rate": 9.897530539792266e-06, + "loss": 3.6635, + "step": 5566 + }, + { + "epoch": 0.47447370663939314, + "grad_norm": 47.523152309852776, + "learning_rate": 9.897430644788034e-06, + "loss": 3.8522, + "step": 5567 + }, + { + "epoch": 0.47455893633341856, + "grad_norm": 39.04104098102972, + "learning_rate": 9.897330701619514e-06, + "loss": 3.3688, + "step": 5568 + }, + { + "epoch": 0.47464416602744397, + "grad_norm": 71.05661857222296, + "learning_rate": 9.897230710287697e-06, + "loss": 3.9641, + "step": 5569 + }, + { + "epoch": 0.4747293957214694, + "grad_norm": 77.18904858845598, + "learning_rate": 9.89713067079356e-06, + "loss": 4.9983, + "step": 5570 + }, + { + "epoch": 0.47481462541549474, + "grad_norm": 63.2065506849645, + "learning_rate": 9.897030583138088e-06, + "loss": 4.1529, + "step": 5571 + }, + { + "epoch": 0.47489985510952015, + "grad_norm": 53.99548384631834, + "learning_rate": 9.896930447322267e-06, + "loss": 3.3698, + "step": 5572 + }, + { + "epoch": 0.47498508480354557, + "grad_norm": 36.952283123116324, + "learning_rate": 9.896830263347082e-06, + "loss": 3.3684, + "step": 5573 + }, + { + "epoch": 0.475070314497571, + "grad_norm": 46.89395869281032, + "learning_rate": 9.896730031213515e-06, + "loss": 3.6047, + "step": 5574 + }, + { + "epoch": 0.47515554419159634, + "grad_norm": 44.87210047618915, + "learning_rate": 9.896629750922555e-06, + "loss": 3.9879, + "step": 5575 + }, + { + "epoch": 0.47524077388562175, + "grad_norm": 29.82897252873884, + "learning_rate": 9.896529422475187e-06, + "loss": 2.4427, + "step": 5576 + }, + { + "epoch": 0.47532600357964716, + "grad_norm": 45.79187893201952, + "learning_rate": 9.896429045872397e-06, + "loss": 2.3046, + "step": 5577 + }, + { + "epoch": 0.4754112332736726, + "grad_norm": 71.24368266157187, + "learning_rate": 9.896328621115176e-06, + "loss": 4.326, + "step": 5578 + }, + { + "epoch": 0.47549646296769793, + "grad_norm": 41.49085719685233, + "learning_rate": 9.896228148204505e-06, + "loss": 4.1761, + "step": 5579 + }, + { + "epoch": 0.47558169266172334, + "grad_norm": 43.24664144261717, + "learning_rate": 9.896127627141379e-06, + "loss": 4.3839, + "step": 5580 + }, + { + "epoch": 0.47566692235574876, + "grad_norm": 82.21191641780298, + "learning_rate": 9.896027057926781e-06, + "loss": 4.4687, + "step": 5581 + }, + { + "epoch": 0.47575215204977417, + "grad_norm": 112.54287778782488, + "learning_rate": 9.895926440561703e-06, + "loss": 5.1443, + "step": 5582 + }, + { + "epoch": 0.4758373817437995, + "grad_norm": 71.51149159574446, + "learning_rate": 9.895825775047135e-06, + "loss": 4.5365, + "step": 5583 + }, + { + "epoch": 0.47592261143782494, + "grad_norm": 40.079513396900275, + "learning_rate": 9.895725061384064e-06, + "loss": 3.6732, + "step": 5584 + }, + { + "epoch": 0.47600784113185035, + "grad_norm": 60.70342805403072, + "learning_rate": 9.895624299573483e-06, + "loss": 4.5861, + "step": 5585 + }, + { + "epoch": 0.4760930708258757, + "grad_norm": 52.16502665353056, + "learning_rate": 9.895523489616384e-06, + "loss": 3.7012, + "step": 5586 + }, + { + "epoch": 0.4761783005199011, + "grad_norm": 76.95416015414095, + "learning_rate": 9.895422631513754e-06, + "loss": 5.1732, + "step": 5587 + }, + { + "epoch": 0.47626353021392653, + "grad_norm": 80.80091271311551, + "learning_rate": 9.895321725266588e-06, + "loss": 4.6604, + "step": 5588 + }, + { + "epoch": 0.47634875990795195, + "grad_norm": 75.7132097312016, + "learning_rate": 9.895220770875879e-06, + "loss": 4.899, + "step": 5589 + }, + { + "epoch": 0.4764339896019773, + "grad_norm": 42.059427384719896, + "learning_rate": 9.89511976834262e-06, + "loss": 3.3103, + "step": 5590 + }, + { + "epoch": 0.4765192192960027, + "grad_norm": 78.05552131111847, + "learning_rate": 9.895018717667801e-06, + "loss": 3.7475, + "step": 5591 + }, + { + "epoch": 0.47660444899002813, + "grad_norm": 58.142338919762345, + "learning_rate": 9.89491761885242e-06, + "loss": 4.8608, + "step": 5592 + }, + { + "epoch": 0.47668967868405354, + "grad_norm": 75.10825745816854, + "learning_rate": 9.894816471897469e-06, + "loss": 4.412, + "step": 5593 + }, + { + "epoch": 0.4767749083780789, + "grad_norm": 110.04353093489819, + "learning_rate": 9.894715276803941e-06, + "loss": 4.1616, + "step": 5594 + }, + { + "epoch": 0.4768601380721043, + "grad_norm": 76.3097535274989, + "learning_rate": 9.894614033572835e-06, + "loss": 4.0602, + "step": 5595 + }, + { + "epoch": 0.4769453677661297, + "grad_norm": 85.76841055531281, + "learning_rate": 9.894512742205145e-06, + "loss": 3.7237, + "step": 5596 + }, + { + "epoch": 0.47703059746015514, + "grad_norm": 69.09651833807625, + "learning_rate": 9.894411402701867e-06, + "loss": 3.1304, + "step": 5597 + }, + { + "epoch": 0.4771158271541805, + "grad_norm": 44.44956166323667, + "learning_rate": 9.894310015063997e-06, + "loss": 3.5668, + "step": 5598 + }, + { + "epoch": 0.4772010568482059, + "grad_norm": 70.26992842328413, + "learning_rate": 9.894208579292532e-06, + "loss": 3.295, + "step": 5599 + }, + { + "epoch": 0.4772862865422313, + "grad_norm": 89.10974573526195, + "learning_rate": 9.894107095388473e-06, + "loss": 4.3817, + "step": 5600 + }, + { + "epoch": 0.47737151623625673, + "grad_norm": 51.95097110465282, + "learning_rate": 9.894005563352813e-06, + "loss": 2.2844, + "step": 5601 + }, + { + "epoch": 0.4774567459302821, + "grad_norm": 39.79879638436884, + "learning_rate": 9.893903983186555e-06, + "loss": 4.0642, + "step": 5602 + }, + { + "epoch": 0.4775419756243075, + "grad_norm": 62.19482690249234, + "learning_rate": 9.893802354890693e-06, + "loss": 4.2097, + "step": 5603 + }, + { + "epoch": 0.4776272053183329, + "grad_norm": 97.39416106377544, + "learning_rate": 9.893700678466233e-06, + "loss": 2.5777, + "step": 5604 + }, + { + "epoch": 0.4777124350123583, + "grad_norm": 34.94384770426911, + "learning_rate": 9.893598953914169e-06, + "loss": 2.6021, + "step": 5605 + }, + { + "epoch": 0.4777976647063837, + "grad_norm": 65.50333681686536, + "learning_rate": 9.893497181235506e-06, + "loss": 3.5985, + "step": 5606 + }, + { + "epoch": 0.4778828944004091, + "grad_norm": 198.9107559422797, + "learning_rate": 9.893395360431239e-06, + "loss": 3.816, + "step": 5607 + }, + { + "epoch": 0.4779681240944345, + "grad_norm": 131.33565186050274, + "learning_rate": 9.893293491502375e-06, + "loss": 4.1583, + "step": 5608 + }, + { + "epoch": 0.4780533537884599, + "grad_norm": 107.34575522596916, + "learning_rate": 9.893191574449916e-06, + "loss": 5.4584, + "step": 5609 + }, + { + "epoch": 0.4781385834824853, + "grad_norm": 84.43220629065593, + "learning_rate": 9.89308960927486e-06, + "loss": 4.6884, + "step": 5610 + }, + { + "epoch": 0.4782238131765107, + "grad_norm": 41.966979030101285, + "learning_rate": 9.892987595978212e-06, + "loss": 3.543, + "step": 5611 + }, + { + "epoch": 0.4783090428705361, + "grad_norm": 79.8028984359866, + "learning_rate": 9.892885534560977e-06, + "loss": 3.7314, + "step": 5612 + }, + { + "epoch": 0.4783942725645615, + "grad_norm": 61.8370942649348, + "learning_rate": 9.892783425024155e-06, + "loss": 4.3746, + "step": 5613 + }, + { + "epoch": 0.4784795022585869, + "grad_norm": 66.88089462649738, + "learning_rate": 9.892681267368754e-06, + "loss": 3.0119, + "step": 5614 + }, + { + "epoch": 0.4785647319526123, + "grad_norm": 111.56436955325218, + "learning_rate": 9.892579061595775e-06, + "loss": 4.8323, + "step": 5615 + }, + { + "epoch": 0.4786499616466377, + "grad_norm": 103.14159468850654, + "learning_rate": 9.892476807706226e-06, + "loss": 2.2136, + "step": 5616 + }, + { + "epoch": 0.4787351913406631, + "grad_norm": 47.13740812359217, + "learning_rate": 9.892374505701112e-06, + "loss": 3.2906, + "step": 5617 + }, + { + "epoch": 0.47882042103468847, + "grad_norm": 64.6434411333141, + "learning_rate": 9.892272155581438e-06, + "loss": 3.1779, + "step": 5618 + }, + { + "epoch": 0.4789056507287139, + "grad_norm": 32.36400553183114, + "learning_rate": 9.892169757348211e-06, + "loss": 2.8694, + "step": 5619 + }, + { + "epoch": 0.4789908804227393, + "grad_norm": 142.8070819778115, + "learning_rate": 9.89206731100244e-06, + "loss": 3.8573, + "step": 5620 + }, + { + "epoch": 0.4790761101167647, + "grad_norm": 46.83387676423386, + "learning_rate": 9.89196481654513e-06, + "loss": 4.1774, + "step": 5621 + }, + { + "epoch": 0.47916133981079007, + "grad_norm": 55.77821468214995, + "learning_rate": 9.891862273977291e-06, + "loss": 4.5986, + "step": 5622 + }, + { + "epoch": 0.4792465695048155, + "grad_norm": 62.67854438929018, + "learning_rate": 9.891759683299929e-06, + "loss": 3.89, + "step": 5623 + }, + { + "epoch": 0.4793317991988409, + "grad_norm": 92.06270926364334, + "learning_rate": 9.891657044514054e-06, + "loss": 3.3243, + "step": 5624 + }, + { + "epoch": 0.47941702889286625, + "grad_norm": 49.05501871237577, + "learning_rate": 9.891554357620676e-06, + "loss": 2.7406, + "step": 5625 + }, + { + "epoch": 0.47950225858689166, + "grad_norm": 41.00578688205697, + "learning_rate": 9.891451622620805e-06, + "loss": 3.2031, + "step": 5626 + }, + { + "epoch": 0.4795874882809171, + "grad_norm": 35.073581605565934, + "learning_rate": 9.891348839515451e-06, + "loss": 3.0479, + "step": 5627 + }, + { + "epoch": 0.4796727179749425, + "grad_norm": 36.63491620908259, + "learning_rate": 9.891246008305624e-06, + "loss": 4.0456, + "step": 5628 + }, + { + "epoch": 0.47975794766896784, + "grad_norm": 61.092614542332406, + "learning_rate": 9.891143128992337e-06, + "loss": 4.7778, + "step": 5629 + }, + { + "epoch": 0.47984317736299326, + "grad_norm": 47.71203771887612, + "learning_rate": 9.891040201576601e-06, + "loss": 4.0712, + "step": 5630 + }, + { + "epoch": 0.47992840705701867, + "grad_norm": 43.016724974946946, + "learning_rate": 9.890937226059428e-06, + "loss": 3.8813, + "step": 5631 + }, + { + "epoch": 0.4800136367510441, + "grad_norm": 83.01387732377046, + "learning_rate": 9.89083420244183e-06, + "loss": 4.194, + "step": 5632 + }, + { + "epoch": 0.48009886644506944, + "grad_norm": 43.05661728682337, + "learning_rate": 9.890731130724822e-06, + "loss": 3.8945, + "step": 5633 + }, + { + "epoch": 0.48018409613909485, + "grad_norm": 55.0867384077637, + "learning_rate": 9.890628010909418e-06, + "loss": 3.837, + "step": 5634 + }, + { + "epoch": 0.48026932583312026, + "grad_norm": 111.38014752186481, + "learning_rate": 9.890524842996629e-06, + "loss": 4.632, + "step": 5635 + }, + { + "epoch": 0.4803545555271457, + "grad_norm": 60.17505072128379, + "learning_rate": 9.89042162698747e-06, + "loss": 3.8629, + "step": 5636 + }, + { + "epoch": 0.48043978522117103, + "grad_norm": 63.97781962341123, + "learning_rate": 9.89031836288296e-06, + "loss": 4.224, + "step": 5637 + }, + { + "epoch": 0.48052501491519645, + "grad_norm": 59.18713309673121, + "learning_rate": 9.890215050684112e-06, + "loss": 3.5479, + "step": 5638 + }, + { + "epoch": 0.48061024460922186, + "grad_norm": 47.134506111939054, + "learning_rate": 9.890111690391941e-06, + "loss": 3.6596, + "step": 5639 + }, + { + "epoch": 0.48069547430324727, + "grad_norm": 105.51342052576697, + "learning_rate": 9.890008282007465e-06, + "loss": 4.802, + "step": 5640 + }, + { + "epoch": 0.48078070399727263, + "grad_norm": 33.93235986740884, + "learning_rate": 9.889904825531703e-06, + "loss": 3.4956, + "step": 5641 + }, + { + "epoch": 0.48086593369129804, + "grad_norm": 56.52443376509956, + "learning_rate": 9.889801320965667e-06, + "loss": 3.9747, + "step": 5642 + }, + { + "epoch": 0.48095116338532345, + "grad_norm": 66.87880240305827, + "learning_rate": 9.889697768310379e-06, + "loss": 4.2962, + "step": 5643 + }, + { + "epoch": 0.48103639307934887, + "grad_norm": 31.373131210072177, + "learning_rate": 9.889594167566855e-06, + "loss": 3.1947, + "step": 5644 + }, + { + "epoch": 0.4811216227733742, + "grad_norm": 74.60848159475225, + "learning_rate": 9.889490518736118e-06, + "loss": 3.7719, + "step": 5645 + }, + { + "epoch": 0.48120685246739964, + "grad_norm": 64.31725885584542, + "learning_rate": 9.889386821819184e-06, + "loss": 4.2554, + "step": 5646 + }, + { + "epoch": 0.48129208216142505, + "grad_norm": 58.60125563082644, + "learning_rate": 9.889283076817073e-06, + "loss": 3.6485, + "step": 5647 + }, + { + "epoch": 0.48137731185545046, + "grad_norm": 55.66800850974167, + "learning_rate": 9.889179283730805e-06, + "loss": 3.949, + "step": 5648 + }, + { + "epoch": 0.4814625415494758, + "grad_norm": 56.30844340658086, + "learning_rate": 9.889075442561403e-06, + "loss": 4.0689, + "step": 5649 + }, + { + "epoch": 0.48154777124350123, + "grad_norm": 88.03043899060127, + "learning_rate": 9.888971553309885e-06, + "loss": 4.5598, + "step": 5650 + }, + { + "epoch": 0.48163300093752665, + "grad_norm": 30.69098173822248, + "learning_rate": 9.888867615977276e-06, + "loss": 2.5966, + "step": 5651 + }, + { + "epoch": 0.48171823063155206, + "grad_norm": 77.26270819234844, + "learning_rate": 9.888763630564595e-06, + "loss": 4.0992, + "step": 5652 + }, + { + "epoch": 0.4818034603255774, + "grad_norm": 45.678236634980635, + "learning_rate": 9.888659597072867e-06, + "loss": 3.7811, + "step": 5653 + }, + { + "epoch": 0.48188869001960283, + "grad_norm": 65.58975066286916, + "learning_rate": 9.888555515503116e-06, + "loss": 4.6894, + "step": 5654 + }, + { + "epoch": 0.48197391971362824, + "grad_norm": 94.15575321000291, + "learning_rate": 9.888451385856362e-06, + "loss": 3.9925, + "step": 5655 + }, + { + "epoch": 0.48205914940765365, + "grad_norm": 38.76025501135886, + "learning_rate": 9.888347208133632e-06, + "loss": 3.0207, + "step": 5656 + }, + { + "epoch": 0.482144379101679, + "grad_norm": 65.4593887645199, + "learning_rate": 9.88824298233595e-06, + "loss": 4.1209, + "step": 5657 + }, + { + "epoch": 0.4822296087957044, + "grad_norm": 59.39999044700873, + "learning_rate": 9.888138708464339e-06, + "loss": 3.6318, + "step": 5658 + }, + { + "epoch": 0.48231483848972984, + "grad_norm": 82.222486665663, + "learning_rate": 9.888034386519827e-06, + "loss": 4.7134, + "step": 5659 + }, + { + "epoch": 0.4824000681837552, + "grad_norm": 34.66641170417938, + "learning_rate": 9.88793001650344e-06, + "loss": 2.4707, + "step": 5660 + }, + { + "epoch": 0.4824852978777806, + "grad_norm": 51.09509027235441, + "learning_rate": 9.887825598416203e-06, + "loss": 3.5862, + "step": 5661 + }, + { + "epoch": 0.482570527571806, + "grad_norm": 91.63840617128018, + "learning_rate": 9.887721132259142e-06, + "loss": 4.4284, + "step": 5662 + }, + { + "epoch": 0.48265575726583143, + "grad_norm": 85.99722200093886, + "learning_rate": 9.88761661803329e-06, + "loss": 4.0867, + "step": 5663 + }, + { + "epoch": 0.4827409869598568, + "grad_norm": 80.67686297796456, + "learning_rate": 9.887512055739666e-06, + "loss": 4.4589, + "step": 5664 + }, + { + "epoch": 0.4828262166538822, + "grad_norm": 61.431985205668965, + "learning_rate": 9.887407445379306e-06, + "loss": 4.4437, + "step": 5665 + }, + { + "epoch": 0.4829114463479076, + "grad_norm": 116.88841344881365, + "learning_rate": 9.887302786953234e-06, + "loss": 4.7058, + "step": 5666 + }, + { + "epoch": 0.482996676041933, + "grad_norm": 70.06229326143443, + "learning_rate": 9.887198080462484e-06, + "loss": 5.2284, + "step": 5667 + }, + { + "epoch": 0.4830819057359584, + "grad_norm": 37.678211654476186, + "learning_rate": 9.88709332590808e-06, + "loss": 2.8641, + "step": 5668 + }, + { + "epoch": 0.4831671354299838, + "grad_norm": 47.19171228831058, + "learning_rate": 9.886988523291055e-06, + "loss": 4.4832, + "step": 5669 + }, + { + "epoch": 0.4832523651240092, + "grad_norm": 50.76433655812878, + "learning_rate": 9.886883672612441e-06, + "loss": 3.9482, + "step": 5670 + }, + { + "epoch": 0.4833375948180346, + "grad_norm": 134.5616689277689, + "learning_rate": 9.886778773873268e-06, + "loss": 5.5881, + "step": 5671 + }, + { + "epoch": 0.48342282451206, + "grad_norm": 37.37012897513051, + "learning_rate": 9.886673827074567e-06, + "loss": 3.2383, + "step": 5672 + }, + { + "epoch": 0.4835080542060854, + "grad_norm": 65.54503131261845, + "learning_rate": 9.88656883221737e-06, + "loss": 4.7017, + "step": 5673 + }, + { + "epoch": 0.4835932839001108, + "grad_norm": 44.77388210598299, + "learning_rate": 9.886463789302712e-06, + "loss": 4.2308, + "step": 5674 + }, + { + "epoch": 0.4836785135941362, + "grad_norm": 49.47681124031168, + "learning_rate": 9.886358698331623e-06, + "loss": 3.8445, + "step": 5675 + }, + { + "epoch": 0.4837637432881616, + "grad_norm": 139.65629137744148, + "learning_rate": 9.886253559305139e-06, + "loss": 4.4643, + "step": 5676 + }, + { + "epoch": 0.483848972982187, + "grad_norm": 157.39681811249764, + "learning_rate": 9.886148372224292e-06, + "loss": 6.0068, + "step": 5677 + }, + { + "epoch": 0.4839342026762124, + "grad_norm": 52.10611445518578, + "learning_rate": 9.886043137090118e-06, + "loss": 3.9902, + "step": 5678 + }, + { + "epoch": 0.4840194323702378, + "grad_norm": 76.2456685958604, + "learning_rate": 9.88593785390365e-06, + "loss": 4.6545, + "step": 5679 + }, + { + "epoch": 0.48410466206426317, + "grad_norm": 32.37027726692327, + "learning_rate": 9.885832522665926e-06, + "loss": 2.8517, + "step": 5680 + }, + { + "epoch": 0.4841898917582886, + "grad_norm": 70.1211787295011, + "learning_rate": 9.88572714337798e-06, + "loss": 4.0346, + "step": 5681 + }, + { + "epoch": 0.484275121452314, + "grad_norm": 39.386726197943624, + "learning_rate": 9.88562171604085e-06, + "loss": 2.8643, + "step": 5682 + }, + { + "epoch": 0.4843603511463394, + "grad_norm": 43.45250598818581, + "learning_rate": 9.88551624065557e-06, + "loss": 3.0858, + "step": 5683 + }, + { + "epoch": 0.48444558084036476, + "grad_norm": 33.21802525533783, + "learning_rate": 9.885410717223179e-06, + "loss": 3.4517, + "step": 5684 + }, + { + "epoch": 0.4845308105343902, + "grad_norm": 312.59363165817047, + "learning_rate": 9.885305145744718e-06, + "loss": 4.6771, + "step": 5685 + }, + { + "epoch": 0.4846160402284156, + "grad_norm": 43.66728340407764, + "learning_rate": 9.885199526221218e-06, + "loss": 3.9607, + "step": 5686 + }, + { + "epoch": 0.484701269922441, + "grad_norm": 54.88532179537625, + "learning_rate": 9.885093858653723e-06, + "loss": 3.8251, + "step": 5687 + }, + { + "epoch": 0.48478649961646636, + "grad_norm": 53.22994891743695, + "learning_rate": 9.884988143043273e-06, + "loss": 4.3293, + "step": 5688 + }, + { + "epoch": 0.4848717293104918, + "grad_norm": 82.5495331086943, + "learning_rate": 9.884882379390904e-06, + "loss": 4.4548, + "step": 5689 + }, + { + "epoch": 0.4849569590045172, + "grad_norm": 333.51127078036484, + "learning_rate": 9.884776567697658e-06, + "loss": 3.9502, + "step": 5690 + }, + { + "epoch": 0.4850421886985426, + "grad_norm": 71.33845773167745, + "learning_rate": 9.884670707964576e-06, + "loss": 4.4048, + "step": 5691 + }, + { + "epoch": 0.48512741839256796, + "grad_norm": 125.80364830392162, + "learning_rate": 9.884564800192698e-06, + "loss": 4.5847, + "step": 5692 + }, + { + "epoch": 0.48521264808659337, + "grad_norm": 95.99056938103051, + "learning_rate": 9.884458844383067e-06, + "loss": 4.1139, + "step": 5693 + }, + { + "epoch": 0.4852978777806188, + "grad_norm": 51.08090682534571, + "learning_rate": 9.884352840536725e-06, + "loss": 3.9096, + "step": 5694 + }, + { + "epoch": 0.4853831074746442, + "grad_norm": 98.8747768615208, + "learning_rate": 9.884246788654712e-06, + "loss": 3.955, + "step": 5695 + }, + { + "epoch": 0.48546833716866955, + "grad_norm": 47.919661300338355, + "learning_rate": 9.884140688738072e-06, + "loss": 3.8472, + "step": 5696 + }, + { + "epoch": 0.48555356686269496, + "grad_norm": 43.97519908021715, + "learning_rate": 9.88403454078785e-06, + "loss": 2.7466, + "step": 5697 + }, + { + "epoch": 0.4856387965567204, + "grad_norm": 46.17466656598797, + "learning_rate": 9.883928344805089e-06, + "loss": 4.4315, + "step": 5698 + }, + { + "epoch": 0.48572402625074573, + "grad_norm": 46.78573934999607, + "learning_rate": 9.883822100790834e-06, + "loss": 3.8987, + "step": 5699 + }, + { + "epoch": 0.48580925594477115, + "grad_norm": 92.56226526870924, + "learning_rate": 9.883715808746129e-06, + "loss": 3.0737, + "step": 5700 + }, + { + "epoch": 0.48589448563879656, + "grad_norm": 80.74065138563105, + "learning_rate": 9.88360946867202e-06, + "loss": 5.6469, + "step": 5701 + }, + { + "epoch": 0.48597971533282197, + "grad_norm": 43.049487593362485, + "learning_rate": 9.883503080569551e-06, + "loss": 3.5894, + "step": 5702 + }, + { + "epoch": 0.48606494502684733, + "grad_norm": 32.54262401004992, + "learning_rate": 9.88339664443977e-06, + "loss": 3.5974, + "step": 5703 + }, + { + "epoch": 0.48615017472087274, + "grad_norm": 57.919600768299475, + "learning_rate": 9.883290160283724e-06, + "loss": 3.3501, + "step": 5704 + }, + { + "epoch": 0.48623540441489815, + "grad_norm": 37.87805134967131, + "learning_rate": 9.88318362810246e-06, + "loss": 3.6404, + "step": 5705 + }, + { + "epoch": 0.48632063410892357, + "grad_norm": 108.03067722658433, + "learning_rate": 9.883077047897025e-06, + "loss": 4.2587, + "step": 5706 + }, + { + "epoch": 0.4864058638029489, + "grad_norm": 39.63651644449076, + "learning_rate": 9.882970419668467e-06, + "loss": 3.3149, + "step": 5707 + }, + { + "epoch": 0.48649109349697434, + "grad_norm": 68.12617797968525, + "learning_rate": 9.882863743417836e-06, + "loss": 4.3321, + "step": 5708 + }, + { + "epoch": 0.48657632319099975, + "grad_norm": 38.19722358459072, + "learning_rate": 9.88275701914618e-06, + "loss": 3.4474, + "step": 5709 + }, + { + "epoch": 0.48666155288502516, + "grad_norm": 78.39508110047198, + "learning_rate": 9.88265024685455e-06, + "loss": 4.2022, + "step": 5710 + }, + { + "epoch": 0.4867467825790505, + "grad_norm": 99.11493187088003, + "learning_rate": 9.882543426543995e-06, + "loss": 4.4835, + "step": 5711 + }, + { + "epoch": 0.48683201227307593, + "grad_norm": 48.97498536891706, + "learning_rate": 9.882436558215565e-06, + "loss": 3.6754, + "step": 5712 + }, + { + "epoch": 0.48691724196710134, + "grad_norm": 44.957416276318504, + "learning_rate": 9.88232964187031e-06, + "loss": 3.5469, + "step": 5713 + }, + { + "epoch": 0.48700247166112676, + "grad_norm": 43.65593367006708, + "learning_rate": 9.882222677509284e-06, + "loss": 4.0457, + "step": 5714 + }, + { + "epoch": 0.4870877013551521, + "grad_norm": 37.31869445766091, + "learning_rate": 9.882115665133537e-06, + "loss": 3.1357, + "step": 5715 + }, + { + "epoch": 0.4871729310491775, + "grad_norm": 46.9671476504446, + "learning_rate": 9.882008604744126e-06, + "loss": 2.2998, + "step": 5716 + }, + { + "epoch": 0.48725816074320294, + "grad_norm": 52.84545023301607, + "learning_rate": 9.881901496342097e-06, + "loss": 4.5645, + "step": 5717 + }, + { + "epoch": 0.48734339043722835, + "grad_norm": 284.3202877789888, + "learning_rate": 9.881794339928507e-06, + "loss": 4.3555, + "step": 5718 + }, + { + "epoch": 0.4874286201312537, + "grad_norm": 99.48428822376282, + "learning_rate": 9.881687135504411e-06, + "loss": 5.0364, + "step": 5719 + }, + { + "epoch": 0.4875138498252791, + "grad_norm": 58.25786755279627, + "learning_rate": 9.881579883070861e-06, + "loss": 4.4729, + "step": 5720 + }, + { + "epoch": 0.48759907951930453, + "grad_norm": 144.81997530501687, + "learning_rate": 9.881472582628913e-06, + "loss": 4.2486, + "step": 5721 + }, + { + "epoch": 0.48768430921332995, + "grad_norm": 88.76371312604998, + "learning_rate": 9.88136523417962e-06, + "loss": 6.2939, + "step": 5722 + }, + { + "epoch": 0.4877695389073553, + "grad_norm": 54.30229621209748, + "learning_rate": 9.881257837724042e-06, + "loss": 4.0657, + "step": 5723 + }, + { + "epoch": 0.4878547686013807, + "grad_norm": 62.70115259288333, + "learning_rate": 9.881150393263233e-06, + "loss": 4.1656, + "step": 5724 + }, + { + "epoch": 0.48793999829540613, + "grad_norm": 42.251378161431454, + "learning_rate": 9.881042900798247e-06, + "loss": 3.7162, + "step": 5725 + }, + { + "epoch": 0.48802522798943154, + "grad_norm": 53.149212586982955, + "learning_rate": 9.880935360330145e-06, + "loss": 4.4946, + "step": 5726 + }, + { + "epoch": 0.4881104576834569, + "grad_norm": 60.85780365104979, + "learning_rate": 9.880827771859982e-06, + "loss": 2.8481, + "step": 5727 + }, + { + "epoch": 0.4881956873774823, + "grad_norm": 33.863200967415196, + "learning_rate": 9.88072013538882e-06, + "loss": 3.2329, + "step": 5728 + }, + { + "epoch": 0.4882809170715077, + "grad_norm": 78.62268882472672, + "learning_rate": 9.880612450917713e-06, + "loss": 4.7488, + "step": 5729 + }, + { + "epoch": 0.48836614676553314, + "grad_norm": 71.75711956436497, + "learning_rate": 9.88050471844772e-06, + "loss": 4.1674, + "step": 5730 + }, + { + "epoch": 0.4884513764595585, + "grad_norm": 56.19959490741268, + "learning_rate": 9.880396937979907e-06, + "loss": 3.8553, + "step": 5731 + }, + { + "epoch": 0.4885366061535839, + "grad_norm": 70.06618570954868, + "learning_rate": 9.880289109515327e-06, + "loss": 4.2541, + "step": 5732 + }, + { + "epoch": 0.4886218358476093, + "grad_norm": 43.97899062878663, + "learning_rate": 9.880181233055041e-06, + "loss": 3.4473, + "step": 5733 + }, + { + "epoch": 0.48870706554163473, + "grad_norm": 63.639569735841825, + "learning_rate": 9.880073308600115e-06, + "loss": 4.6016, + "step": 5734 + }, + { + "epoch": 0.4887922952356601, + "grad_norm": 142.4855331411222, + "learning_rate": 9.879965336151604e-06, + "loss": 3.8516, + "step": 5735 + }, + { + "epoch": 0.4888775249296855, + "grad_norm": 38.619086965710366, + "learning_rate": 9.879857315710576e-06, + "loss": 3.5585, + "step": 5736 + }, + { + "epoch": 0.4889627546237109, + "grad_norm": 55.059412854526585, + "learning_rate": 9.87974924727809e-06, + "loss": 3.9445, + "step": 5737 + }, + { + "epoch": 0.4890479843177363, + "grad_norm": 52.60788423302549, + "learning_rate": 9.879641130855206e-06, + "loss": 5.1752, + "step": 5738 + }, + { + "epoch": 0.4891332140117617, + "grad_norm": 43.51448586023342, + "learning_rate": 9.879532966442994e-06, + "loss": 3.2124, + "step": 5739 + }, + { + "epoch": 0.4892184437057871, + "grad_norm": 50.81151767040198, + "learning_rate": 9.879424754042512e-06, + "loss": 4.1373, + "step": 5740 + }, + { + "epoch": 0.4893036733998125, + "grad_norm": 72.31717160628135, + "learning_rate": 9.879316493654828e-06, + "loss": 4.5606, + "step": 5741 + }, + { + "epoch": 0.48938890309383787, + "grad_norm": 56.85046200843727, + "learning_rate": 9.879208185281004e-06, + "loss": 4.4042, + "step": 5742 + }, + { + "epoch": 0.4894741327878633, + "grad_norm": 34.793736478538264, + "learning_rate": 9.879099828922108e-06, + "loss": 2.9013, + "step": 5743 + }, + { + "epoch": 0.4895593624818887, + "grad_norm": 83.60045758189946, + "learning_rate": 9.878991424579204e-06, + "loss": 4.3816, + "step": 5744 + }, + { + "epoch": 0.4896445921759141, + "grad_norm": 68.82350313826313, + "learning_rate": 9.878882972253355e-06, + "loss": 4.371, + "step": 5745 + }, + { + "epoch": 0.48972982186993946, + "grad_norm": 83.38947741640376, + "learning_rate": 9.878774471945633e-06, + "loss": 4.885, + "step": 5746 + }, + { + "epoch": 0.4898150515639649, + "grad_norm": 35.209221035438205, + "learning_rate": 9.878665923657103e-06, + "loss": 3.1392, + "step": 5747 + }, + { + "epoch": 0.4899002812579903, + "grad_norm": 60.289654514063244, + "learning_rate": 9.87855732738883e-06, + "loss": 4.6627, + "step": 5748 + }, + { + "epoch": 0.4899855109520157, + "grad_norm": 97.97259188956733, + "learning_rate": 9.878448683141885e-06, + "loss": 4.2946, + "step": 5749 + }, + { + "epoch": 0.49007074064604106, + "grad_norm": 41.08005281027159, + "learning_rate": 9.878339990917338e-06, + "loss": 4.0252, + "step": 5750 + }, + { + "epoch": 0.49015597034006647, + "grad_norm": 46.60752056504246, + "learning_rate": 9.878231250716253e-06, + "loss": 4.4187, + "step": 5751 + }, + { + "epoch": 0.4902412000340919, + "grad_norm": 103.15461930101922, + "learning_rate": 9.878122462539703e-06, + "loss": 3.9963, + "step": 5752 + }, + { + "epoch": 0.4903264297281173, + "grad_norm": 70.98774151314481, + "learning_rate": 9.878013626388759e-06, + "loss": 4.3368, + "step": 5753 + }, + { + "epoch": 0.49041165942214265, + "grad_norm": 55.46747943224001, + "learning_rate": 9.877904742264486e-06, + "loss": 4.423, + "step": 5754 + }, + { + "epoch": 0.49049688911616807, + "grad_norm": 83.92393117991568, + "learning_rate": 9.87779581016796e-06, + "loss": 4.5022, + "step": 5755 + }, + { + "epoch": 0.4905821188101935, + "grad_norm": 136.93295584112815, + "learning_rate": 9.877686830100249e-06, + "loss": 4.8406, + "step": 5756 + }, + { + "epoch": 0.4906673485042189, + "grad_norm": 125.35647159399811, + "learning_rate": 9.877577802062429e-06, + "loss": 4.2082, + "step": 5757 + }, + { + "epoch": 0.49075257819824425, + "grad_norm": 58.17157694218456, + "learning_rate": 9.877468726055566e-06, + "loss": 4.9071, + "step": 5758 + }, + { + "epoch": 0.49083780789226966, + "grad_norm": 38.82657985001499, + "learning_rate": 9.877359602080739e-06, + "loss": 3.449, + "step": 5759 + }, + { + "epoch": 0.4909230375862951, + "grad_norm": 84.30016961552734, + "learning_rate": 9.877250430139015e-06, + "loss": 3.8089, + "step": 5760 + }, + { + "epoch": 0.4910082672803205, + "grad_norm": 42.08601853314156, + "learning_rate": 9.877141210231475e-06, + "loss": 4.1423, + "step": 5761 + }, + { + "epoch": 0.49109349697434584, + "grad_norm": 88.3318100868726, + "learning_rate": 9.877031942359187e-06, + "loss": 6.3425, + "step": 5762 + }, + { + "epoch": 0.49117872666837126, + "grad_norm": 133.61926066985598, + "learning_rate": 9.876922626523229e-06, + "loss": 5.0557, + "step": 5763 + }, + { + "epoch": 0.49126395636239667, + "grad_norm": 54.81572456852787, + "learning_rate": 9.876813262724674e-06, + "loss": 3.6622, + "step": 5764 + }, + { + "epoch": 0.4913491860564221, + "grad_norm": 78.8772619678882, + "learning_rate": 9.876703850964598e-06, + "loss": 3.9553, + "step": 5765 + }, + { + "epoch": 0.49143441575044744, + "grad_norm": 51.9530571913329, + "learning_rate": 9.87659439124408e-06, + "loss": 3.8234, + "step": 5766 + }, + { + "epoch": 0.49151964544447285, + "grad_norm": 68.39561116170917, + "learning_rate": 9.87648488356419e-06, + "loss": 4.175, + "step": 5767 + }, + { + "epoch": 0.49160487513849827, + "grad_norm": 110.54467106356174, + "learning_rate": 9.876375327926011e-06, + "loss": 5.1927, + "step": 5768 + }, + { + "epoch": 0.4916901048325237, + "grad_norm": 56.01841867666868, + "learning_rate": 9.876265724330618e-06, + "loss": 4.0739, + "step": 5769 + }, + { + "epoch": 0.49177533452654904, + "grad_norm": 82.81555311310223, + "learning_rate": 9.87615607277909e-06, + "loss": 4.3757, + "step": 5770 + }, + { + "epoch": 0.49186056422057445, + "grad_norm": 60.44880534675127, + "learning_rate": 9.876046373272503e-06, + "loss": 3.1512, + "step": 5771 + }, + { + "epoch": 0.49194579391459986, + "grad_norm": 73.31813255515087, + "learning_rate": 9.87593662581194e-06, + "loss": 4.5992, + "step": 5772 + }, + { + "epoch": 0.4920310236086252, + "grad_norm": 57.59259765074792, + "learning_rate": 9.875826830398474e-06, + "loss": 4.5988, + "step": 5773 + }, + { + "epoch": 0.49211625330265063, + "grad_norm": 98.98463173950054, + "learning_rate": 9.875716987033192e-06, + "loss": 4.0589, + "step": 5774 + }, + { + "epoch": 0.49220148299667604, + "grad_norm": 100.63046693708027, + "learning_rate": 9.875607095717171e-06, + "loss": 4.8599, + "step": 5775 + }, + { + "epoch": 0.49228671269070146, + "grad_norm": 65.07520268413414, + "learning_rate": 9.875497156451489e-06, + "loss": 3.9623, + "step": 5776 + }, + { + "epoch": 0.4923719423847268, + "grad_norm": 36.88016370008422, + "learning_rate": 9.875387169237232e-06, + "loss": 3.2261, + "step": 5777 + }, + { + "epoch": 0.4924571720787522, + "grad_norm": 51.42926795545789, + "learning_rate": 9.875277134075479e-06, + "loss": 3.7691, + "step": 5778 + }, + { + "epoch": 0.49254240177277764, + "grad_norm": 71.70008624864248, + "learning_rate": 9.875167050967312e-06, + "loss": 4.54, + "step": 5779 + }, + { + "epoch": 0.49262763146680305, + "grad_norm": 41.01109735064693, + "learning_rate": 9.875056919913813e-06, + "loss": 3.3908, + "step": 5780 + }, + { + "epoch": 0.4927128611608284, + "grad_norm": 127.1206705924369, + "learning_rate": 9.874946740916068e-06, + "loss": 5.4544, + "step": 5781 + }, + { + "epoch": 0.4927980908548538, + "grad_norm": 53.297620041891605, + "learning_rate": 9.874836513975158e-06, + "loss": 4.1883, + "step": 5782 + }, + { + "epoch": 0.49288332054887923, + "grad_norm": 159.76656813880894, + "learning_rate": 9.87472623909217e-06, + "loss": 4.226, + "step": 5783 + }, + { + "epoch": 0.49296855024290465, + "grad_norm": 59.99021439888083, + "learning_rate": 9.874615916268185e-06, + "loss": 5.3962, + "step": 5784 + }, + { + "epoch": 0.49305377993693, + "grad_norm": 60.8909396531738, + "learning_rate": 9.874505545504289e-06, + "loss": 4.1046, + "step": 5785 + }, + { + "epoch": 0.4931390096309554, + "grad_norm": 44.39248304807782, + "learning_rate": 9.874395126801569e-06, + "loss": 3.8149, + "step": 5786 + }, + { + "epoch": 0.49322423932498083, + "grad_norm": 47.95928466546405, + "learning_rate": 9.874284660161109e-06, + "loss": 3.5364, + "step": 5787 + }, + { + "epoch": 0.49330946901900624, + "grad_norm": 45.077894686440665, + "learning_rate": 9.874174145583996e-06, + "loss": 4.0567, + "step": 5788 + }, + { + "epoch": 0.4933946987130316, + "grad_norm": 48.844454417594896, + "learning_rate": 9.874063583071319e-06, + "loss": 4.5531, + "step": 5789 + }, + { + "epoch": 0.493479928407057, + "grad_norm": 145.89723808885262, + "learning_rate": 9.873952972624162e-06, + "loss": 4.3756, + "step": 5790 + }, + { + "epoch": 0.4935651581010824, + "grad_norm": 39.16219951110983, + "learning_rate": 9.873842314243613e-06, + "loss": 3.435, + "step": 5791 + }, + { + "epoch": 0.49365038779510784, + "grad_norm": 83.77106982247878, + "learning_rate": 9.873731607930764e-06, + "loss": 4.6192, + "step": 5792 + }, + { + "epoch": 0.4937356174891332, + "grad_norm": 80.90730787475829, + "learning_rate": 9.873620853686699e-06, + "loss": 5.0324, + "step": 5793 + }, + { + "epoch": 0.4938208471831586, + "grad_norm": 44.30288964523237, + "learning_rate": 9.873510051512512e-06, + "loss": 4.1087, + "step": 5794 + }, + { + "epoch": 0.493906076877184, + "grad_norm": 66.09043816576924, + "learning_rate": 9.873399201409286e-06, + "loss": 4.2154, + "step": 5795 + }, + { + "epoch": 0.49399130657120943, + "grad_norm": 31.946487306281693, + "learning_rate": 9.87328830337812e-06, + "loss": 3.0514, + "step": 5796 + }, + { + "epoch": 0.4940765362652348, + "grad_norm": 85.38005972615402, + "learning_rate": 9.873177357420097e-06, + "loss": 2.1586, + "step": 5797 + }, + { + "epoch": 0.4941617659592602, + "grad_norm": 74.89729294102055, + "learning_rate": 9.873066363536313e-06, + "loss": 4.9708, + "step": 5798 + }, + { + "epoch": 0.4942469956532856, + "grad_norm": 127.97782494917051, + "learning_rate": 9.872955321727857e-06, + "loss": 4.812, + "step": 5799 + }, + { + "epoch": 0.494332225347311, + "grad_norm": 68.16166041793112, + "learning_rate": 9.872844231995823e-06, + "loss": 2.5779, + "step": 5800 + }, + { + "epoch": 0.4944174550413364, + "grad_norm": 121.7679146462153, + "learning_rate": 9.8727330943413e-06, + "loss": 3.3147, + "step": 5801 + }, + { + "epoch": 0.4945026847353618, + "grad_norm": 113.19963120020691, + "learning_rate": 9.872621908765383e-06, + "loss": 6.2926, + "step": 5802 + }, + { + "epoch": 0.4945879144293872, + "grad_norm": 63.83514044557165, + "learning_rate": 9.872510675269167e-06, + "loss": 3.9459, + "step": 5803 + }, + { + "epoch": 0.4946731441234126, + "grad_norm": 57.58305098836138, + "learning_rate": 9.872399393853745e-06, + "loss": 4.1643, + "step": 5804 + }, + { + "epoch": 0.494758373817438, + "grad_norm": 66.27076697843042, + "learning_rate": 9.87228806452021e-06, + "loss": 4.6752, + "step": 5805 + }, + { + "epoch": 0.4948436035114634, + "grad_norm": 32.60217303076537, + "learning_rate": 9.87217668726966e-06, + "loss": 3.2605, + "step": 5806 + }, + { + "epoch": 0.4949288332054888, + "grad_norm": 37.0845973389955, + "learning_rate": 9.872065262103186e-06, + "loss": 3.2824, + "step": 5807 + }, + { + "epoch": 0.4950140628995142, + "grad_norm": 43.941942072238504, + "learning_rate": 9.871953789021886e-06, + "loss": 3.7848, + "step": 5808 + }, + { + "epoch": 0.4950992925935396, + "grad_norm": 68.15496761946301, + "learning_rate": 9.871842268026857e-06, + "loss": 5.0969, + "step": 5809 + }, + { + "epoch": 0.495184522287565, + "grad_norm": 103.31533208600084, + "learning_rate": 9.871730699119195e-06, + "loss": 4.3414, + "step": 5810 + }, + { + "epoch": 0.4952697519815904, + "grad_norm": 51.03269843966296, + "learning_rate": 9.871619082299999e-06, + "loss": 4.3515, + "step": 5811 + }, + { + "epoch": 0.49535498167561576, + "grad_norm": 40.58937230458869, + "learning_rate": 9.871507417570363e-06, + "loss": 3.7068, + "step": 5812 + }, + { + "epoch": 0.49544021136964117, + "grad_norm": 78.35444851040475, + "learning_rate": 9.87139570493139e-06, + "loss": 4.4253, + "step": 5813 + }, + { + "epoch": 0.4955254410636666, + "grad_norm": 44.24921443428407, + "learning_rate": 9.871283944384174e-06, + "loss": 4.1765, + "step": 5814 + }, + { + "epoch": 0.495610670757692, + "grad_norm": 252.05135677342608, + "learning_rate": 9.871172135929816e-06, + "loss": 5.1373, + "step": 5815 + }, + { + "epoch": 0.49569590045171735, + "grad_norm": 285.97979024269137, + "learning_rate": 9.871060279569416e-06, + "loss": 5.1616, + "step": 5816 + }, + { + "epoch": 0.49578113014574277, + "grad_norm": 176.93478939825056, + "learning_rate": 9.870948375304072e-06, + "loss": 4.7612, + "step": 5817 + }, + { + "epoch": 0.4958663598397682, + "grad_norm": 55.019741918167234, + "learning_rate": 9.87083642313489e-06, + "loss": 4.097, + "step": 5818 + }, + { + "epoch": 0.4959515895337936, + "grad_norm": 74.14769426962124, + "learning_rate": 9.870724423062965e-06, + "loss": 4.4496, + "step": 5819 + }, + { + "epoch": 0.49603681922781895, + "grad_norm": 56.501297038076615, + "learning_rate": 9.8706123750894e-06, + "loss": 3.862, + "step": 5820 + }, + { + "epoch": 0.49612204892184436, + "grad_norm": 64.75203521990801, + "learning_rate": 9.870500279215298e-06, + "loss": 3.9068, + "step": 5821 + }, + { + "epoch": 0.4962072786158698, + "grad_norm": 66.0887272755276, + "learning_rate": 9.870388135441763e-06, + "loss": 4.9072, + "step": 5822 + }, + { + "epoch": 0.4962925083098952, + "grad_norm": 175.57552973593005, + "learning_rate": 9.870275943769894e-06, + "loss": 4.2209, + "step": 5823 + }, + { + "epoch": 0.49637773800392054, + "grad_norm": 65.89647593681416, + "learning_rate": 9.870163704200796e-06, + "loss": 3.8705, + "step": 5824 + }, + { + "epoch": 0.49646296769794596, + "grad_norm": 57.44408442660013, + "learning_rate": 9.870051416735575e-06, + "loss": 4.5684, + "step": 5825 + }, + { + "epoch": 0.49654819739197137, + "grad_norm": 76.17404292850826, + "learning_rate": 9.869939081375333e-06, + "loss": 4.3547, + "step": 5826 + }, + { + "epoch": 0.4966334270859968, + "grad_norm": 57.68612573509315, + "learning_rate": 9.869826698121175e-06, + "loss": 4.7343, + "step": 5827 + }, + { + "epoch": 0.49671865678002214, + "grad_norm": 105.49825283082319, + "learning_rate": 9.869714266974207e-06, + "loss": 4.1813, + "step": 5828 + }, + { + "epoch": 0.49680388647404755, + "grad_norm": 23.559872820825124, + "learning_rate": 9.869601787935534e-06, + "loss": 3.5527, + "step": 5829 + }, + { + "epoch": 0.49688911616807296, + "grad_norm": 49.2310688486615, + "learning_rate": 9.869489261006261e-06, + "loss": 3.1143, + "step": 5830 + }, + { + "epoch": 0.4969743458620984, + "grad_norm": 149.80463010358568, + "learning_rate": 9.869376686187498e-06, + "loss": 3.192, + "step": 5831 + }, + { + "epoch": 0.49705957555612373, + "grad_norm": 49.46926473759317, + "learning_rate": 9.869264063480348e-06, + "loss": 3.5631, + "step": 5832 + }, + { + "epoch": 0.49714480525014915, + "grad_norm": 71.69976855357, + "learning_rate": 9.869151392885922e-06, + "loss": 2.7604, + "step": 5833 + }, + { + "epoch": 0.49723003494417456, + "grad_norm": 72.65031877298357, + "learning_rate": 9.869038674405327e-06, + "loss": 5.3079, + "step": 5834 + }, + { + "epoch": 0.49731526463819997, + "grad_norm": 35.73495003043377, + "learning_rate": 9.868925908039673e-06, + "loss": 2.6465, + "step": 5835 + }, + { + "epoch": 0.49740049433222533, + "grad_norm": 106.0509378593016, + "learning_rate": 9.868813093790066e-06, + "loss": 4.6478, + "step": 5836 + }, + { + "epoch": 0.49748572402625074, + "grad_norm": 31.04458175321007, + "learning_rate": 9.868700231657615e-06, + "loss": 2.5867, + "step": 5837 + }, + { + "epoch": 0.49757095372027615, + "grad_norm": 48.11015631762859, + "learning_rate": 9.868587321643435e-06, + "loss": 4.4948, + "step": 5838 + }, + { + "epoch": 0.49765618341430157, + "grad_norm": 92.81946416710547, + "learning_rate": 9.86847436374863e-06, + "loss": 4.4838, + "step": 5839 + }, + { + "epoch": 0.4977414131083269, + "grad_norm": 110.21385439133253, + "learning_rate": 9.868361357974318e-06, + "loss": 5.9483, + "step": 5840 + }, + { + "epoch": 0.49782664280235234, + "grad_norm": 64.65284414687164, + "learning_rate": 9.868248304321604e-06, + "loss": 3.3142, + "step": 5841 + }, + { + "epoch": 0.49791187249637775, + "grad_norm": 69.6395772461209, + "learning_rate": 9.868135202791604e-06, + "loss": 3.1806, + "step": 5842 + }, + { + "epoch": 0.49799710219040316, + "grad_norm": 46.893598519142365, + "learning_rate": 9.868022053385427e-06, + "loss": 3.753, + "step": 5843 + }, + { + "epoch": 0.4980823318844285, + "grad_norm": 33.61056719415625, + "learning_rate": 9.867908856104188e-06, + "loss": 3.2719, + "step": 5844 + }, + { + "epoch": 0.49816756157845393, + "grad_norm": 72.34587423044337, + "learning_rate": 9.867795610949e-06, + "loss": 3.0788, + "step": 5845 + }, + { + "epoch": 0.49825279127247935, + "grad_norm": 75.86583468122008, + "learning_rate": 9.867682317920975e-06, + "loss": 3.9394, + "step": 5846 + }, + { + "epoch": 0.49833802096650476, + "grad_norm": 47.277219696842934, + "learning_rate": 9.867568977021232e-06, + "loss": 4.045, + "step": 5847 + }, + { + "epoch": 0.4984232506605301, + "grad_norm": 106.26711413888361, + "learning_rate": 9.867455588250879e-06, + "loss": 5.5594, + "step": 5848 + }, + { + "epoch": 0.4985084803545555, + "grad_norm": 97.6974304696984, + "learning_rate": 9.867342151611037e-06, + "loss": 4.4605, + "step": 5849 + }, + { + "epoch": 0.49859371004858094, + "grad_norm": 45.05293164971688, + "learning_rate": 9.867228667102816e-06, + "loss": 3.1036, + "step": 5850 + }, + { + "epoch": 0.4986789397426063, + "grad_norm": 58.54199394228699, + "learning_rate": 9.867115134727336e-06, + "loss": 4.0332, + "step": 5851 + }, + { + "epoch": 0.4987641694366317, + "grad_norm": 97.11158690782462, + "learning_rate": 9.867001554485714e-06, + "loss": 3.4652, + "step": 5852 + }, + { + "epoch": 0.4988493991306571, + "grad_norm": 52.6763710298736, + "learning_rate": 9.866887926379065e-06, + "loss": 4.165, + "step": 5853 + }, + { + "epoch": 0.49893462882468254, + "grad_norm": 49.022884502360185, + "learning_rate": 9.866774250408508e-06, + "loss": 3.4756, + "step": 5854 + }, + { + "epoch": 0.4990198585187079, + "grad_norm": 51.313508922593996, + "learning_rate": 9.86666052657516e-06, + "loss": 4.7121, + "step": 5855 + }, + { + "epoch": 0.4991050882127333, + "grad_norm": 92.33875729650276, + "learning_rate": 9.866546754880137e-06, + "loss": 4.3935, + "step": 5856 + }, + { + "epoch": 0.4991903179067587, + "grad_norm": 43.33241378524917, + "learning_rate": 9.866432935324563e-06, + "loss": 3.9784, + "step": 5857 + }, + { + "epoch": 0.49927554760078413, + "grad_norm": 44.82534297412752, + "learning_rate": 9.866319067909555e-06, + "loss": 3.5141, + "step": 5858 + }, + { + "epoch": 0.4993607772948095, + "grad_norm": 78.20303372186864, + "learning_rate": 9.86620515263623e-06, + "loss": 3.9094, + "step": 5859 + }, + { + "epoch": 0.4994460069888349, + "grad_norm": 36.68761511706196, + "learning_rate": 9.866091189505713e-06, + "loss": 3.2351, + "step": 5860 + }, + { + "epoch": 0.4995312366828603, + "grad_norm": 143.98526619660421, + "learning_rate": 9.865977178519125e-06, + "loss": 4.7055, + "step": 5861 + }, + { + "epoch": 0.4996164663768857, + "grad_norm": 32.96764268125675, + "learning_rate": 9.865863119677582e-06, + "loss": 3.0878, + "step": 5862 + }, + { + "epoch": 0.4997016960709111, + "grad_norm": 31.778765136802875, + "learning_rate": 9.865749012982208e-06, + "loss": 3.0702, + "step": 5863 + }, + { + "epoch": 0.4997869257649365, + "grad_norm": 93.80184003438818, + "learning_rate": 9.865634858434128e-06, + "loss": 5.0628, + "step": 5864 + }, + { + "epoch": 0.4998721554589619, + "grad_norm": 50.01032143330035, + "learning_rate": 9.86552065603446e-06, + "loss": 4.1369, + "step": 5865 + }, + { + "epoch": 0.4999573851529873, + "grad_norm": 88.5306200780799, + "learning_rate": 9.865406405784332e-06, + "loss": 5.6735, + "step": 5866 + }, + { + "epoch": 0.5000426148470127, + "grad_norm": 93.00710463755833, + "learning_rate": 9.865292107684866e-06, + "loss": 4.669, + "step": 5867 + }, + { + "epoch": 0.5001278445410381, + "grad_norm": 45.96986855829979, + "learning_rate": 9.865177761737184e-06, + "loss": 4.3019, + "step": 5868 + }, + { + "epoch": 0.5002130742350634, + "grad_norm": 65.13895516869027, + "learning_rate": 9.865063367942412e-06, + "loss": 4.1363, + "step": 5869 + }, + { + "epoch": 0.5002983039290889, + "grad_norm": 46.059933931123716, + "learning_rate": 9.864948926301674e-06, + "loss": 3.0441, + "step": 5870 + }, + { + "epoch": 0.5003835336231143, + "grad_norm": 62.10128475173116, + "learning_rate": 9.864834436816098e-06, + "loss": 4.1365, + "step": 5871 + }, + { + "epoch": 0.5004687633171397, + "grad_norm": 46.388649982820404, + "learning_rate": 9.864719899486807e-06, + "loss": 3.7603, + "step": 5872 + }, + { + "epoch": 0.5005539930111651, + "grad_norm": 47.55340202010126, + "learning_rate": 9.86460531431493e-06, + "loss": 3.6942, + "step": 5873 + }, + { + "epoch": 0.5006392227051905, + "grad_norm": 90.19106450212546, + "learning_rate": 9.864490681301593e-06, + "loss": 4.3263, + "step": 5874 + }, + { + "epoch": 0.5007244523992159, + "grad_norm": 113.31136394633833, + "learning_rate": 9.864376000447922e-06, + "loss": 5.3178, + "step": 5875 + }, + { + "epoch": 0.5008096820932413, + "grad_norm": 78.64553928819656, + "learning_rate": 9.864261271755047e-06, + "loss": 4.6346, + "step": 5876 + }, + { + "epoch": 0.5008949117872666, + "grad_norm": 42.743222056314124, + "learning_rate": 9.864146495224094e-06, + "loss": 3.4404, + "step": 5877 + }, + { + "epoch": 0.5009801414812921, + "grad_norm": 44.35040959035183, + "learning_rate": 9.864031670856195e-06, + "loss": 3.223, + "step": 5878 + }, + { + "epoch": 0.5010653711753175, + "grad_norm": 61.102728777029554, + "learning_rate": 9.863916798652476e-06, + "loss": 5.5232, + "step": 5879 + }, + { + "epoch": 0.5011506008693429, + "grad_norm": 40.46489225859997, + "learning_rate": 9.863801878614067e-06, + "loss": 3.6384, + "step": 5880 + }, + { + "epoch": 0.5012358305633683, + "grad_norm": 96.31085952821978, + "learning_rate": 9.863686910742102e-06, + "loss": 4.7665, + "step": 5881 + }, + { + "epoch": 0.5013210602573936, + "grad_norm": 134.7953287268369, + "learning_rate": 9.863571895037707e-06, + "loss": 5.4896, + "step": 5882 + }, + { + "epoch": 0.5014062899514191, + "grad_norm": 35.1710208026381, + "learning_rate": 9.863456831502015e-06, + "loss": 3.7658, + "step": 5883 + }, + { + "epoch": 0.5014915196454445, + "grad_norm": 107.2061701027174, + "learning_rate": 9.863341720136158e-06, + "loss": 2.6987, + "step": 5884 + }, + { + "epoch": 0.5015767493394698, + "grad_norm": 94.89544099243112, + "learning_rate": 9.863226560941268e-06, + "loss": 4.2199, + "step": 5885 + }, + { + "epoch": 0.5016619790334953, + "grad_norm": 57.47556971876721, + "learning_rate": 9.863111353918476e-06, + "loss": 3.9663, + "step": 5886 + }, + { + "epoch": 0.5017472087275207, + "grad_norm": 60.99233971696487, + "learning_rate": 9.862996099068917e-06, + "loss": 4.3918, + "step": 5887 + }, + { + "epoch": 0.501832438421546, + "grad_norm": 65.74776107240194, + "learning_rate": 9.862880796393723e-06, + "loss": 4.0764, + "step": 5888 + }, + { + "epoch": 0.5019176681155715, + "grad_norm": 66.21140192981966, + "learning_rate": 9.862765445894031e-06, + "loss": 4.7205, + "step": 5889 + }, + { + "epoch": 0.5020028978095968, + "grad_norm": 148.59367710651628, + "learning_rate": 9.862650047570972e-06, + "loss": 4.8379, + "step": 5890 + }, + { + "epoch": 0.5020881275036223, + "grad_norm": 62.06015776452399, + "learning_rate": 9.86253460142568e-06, + "loss": 5.0534, + "step": 5891 + }, + { + "epoch": 0.5021733571976477, + "grad_norm": 44.642086925965174, + "learning_rate": 9.862419107459295e-06, + "loss": 3.3662, + "step": 5892 + }, + { + "epoch": 0.502258586891673, + "grad_norm": 49.29223884611931, + "learning_rate": 9.862303565672948e-06, + "loss": 3.1241, + "step": 5893 + }, + { + "epoch": 0.5023438165856985, + "grad_norm": 88.90434303444003, + "learning_rate": 9.86218797606778e-06, + "loss": 4.4857, + "step": 5894 + }, + { + "epoch": 0.5024290462797238, + "grad_norm": 51.60579514633322, + "learning_rate": 9.862072338644924e-06, + "loss": 3.7333, + "step": 5895 + }, + { + "epoch": 0.5025142759737492, + "grad_norm": 38.86159141494794, + "learning_rate": 9.861956653405518e-06, + "loss": 3.7608, + "step": 5896 + }, + { + "epoch": 0.5025995056677747, + "grad_norm": 69.5628157687868, + "learning_rate": 9.8618409203507e-06, + "loss": 5.5743, + "step": 5897 + }, + { + "epoch": 0.5026847353618, + "grad_norm": 50.05082545007132, + "learning_rate": 9.861725139481611e-06, + "loss": 4.1707, + "step": 5898 + }, + { + "epoch": 0.5027699650558255, + "grad_norm": 69.23844163575728, + "learning_rate": 9.861609310799385e-06, + "loss": 3.8818, + "step": 5899 + }, + { + "epoch": 0.5028551947498509, + "grad_norm": 56.03610203650179, + "learning_rate": 9.861493434305162e-06, + "loss": 3.4934, + "step": 5900 + }, + { + "epoch": 0.5029404244438762, + "grad_norm": 31.010903592300018, + "learning_rate": 9.861377510000085e-06, + "loss": 3.3296, + "step": 5901 + }, + { + "epoch": 0.5030256541379017, + "grad_norm": 38.54010294371357, + "learning_rate": 9.861261537885291e-06, + "loss": 3.4027, + "step": 5902 + }, + { + "epoch": 0.503110883831927, + "grad_norm": 342.7810691484116, + "learning_rate": 9.861145517961924e-06, + "loss": 4.1455, + "step": 5903 + }, + { + "epoch": 0.5031961135259524, + "grad_norm": 50.763060593431206, + "learning_rate": 9.861029450231119e-06, + "loss": 3.673, + "step": 5904 + }, + { + "epoch": 0.5032813432199779, + "grad_norm": 126.98436228525628, + "learning_rate": 9.860913334694022e-06, + "loss": 5.0449, + "step": 5905 + }, + { + "epoch": 0.5033665729140032, + "grad_norm": 44.335178845926876, + "learning_rate": 9.860797171351774e-06, + "loss": 3.8625, + "step": 5906 + }, + { + "epoch": 0.5034518026080287, + "grad_norm": 76.7921093171589, + "learning_rate": 9.860680960205518e-06, + "loss": 4.9916, + "step": 5907 + }, + { + "epoch": 0.503537032302054, + "grad_norm": 53.404784269938304, + "learning_rate": 9.860564701256395e-06, + "loss": 3.7105, + "step": 5908 + }, + { + "epoch": 0.5036222619960794, + "grad_norm": 76.74303578078937, + "learning_rate": 9.86044839450555e-06, + "loss": 3.4007, + "step": 5909 + }, + { + "epoch": 0.5037074916901049, + "grad_norm": 74.30076492797873, + "learning_rate": 9.860332039954128e-06, + "loss": 3.851, + "step": 5910 + }, + { + "epoch": 0.5037927213841302, + "grad_norm": 92.10412034675424, + "learning_rate": 9.86021563760327e-06, + "loss": 5.0332, + "step": 5911 + }, + { + "epoch": 0.5038779510781556, + "grad_norm": 44.240948807335165, + "learning_rate": 9.860099187454122e-06, + "loss": 3.6953, + "step": 5912 + }, + { + "epoch": 0.503963180772181, + "grad_norm": 49.04962048232123, + "learning_rate": 9.859982689507829e-06, + "loss": 4.0437, + "step": 5913 + }, + { + "epoch": 0.5040484104662064, + "grad_norm": 60.647195052076846, + "learning_rate": 9.859866143765539e-06, + "loss": 4.5255, + "step": 5914 + }, + { + "epoch": 0.5041336401602319, + "grad_norm": 60.7247864608757, + "learning_rate": 9.859749550228397e-06, + "loss": 5.111, + "step": 5915 + }, + { + "epoch": 0.5042188698542572, + "grad_norm": 82.53584423321772, + "learning_rate": 9.859632908897549e-06, + "loss": 5.9262, + "step": 5916 + }, + { + "epoch": 0.5043040995482826, + "grad_norm": 27.148587713882343, + "learning_rate": 9.859516219774143e-06, + "loss": 2.2122, + "step": 5917 + }, + { + "epoch": 0.5043893292423081, + "grad_norm": 35.86746097481712, + "learning_rate": 9.859399482859324e-06, + "loss": 3.0527, + "step": 5918 + }, + { + "epoch": 0.5044745589363334, + "grad_norm": 39.119807054886145, + "learning_rate": 9.859282698154243e-06, + "loss": 3.3314, + "step": 5919 + }, + { + "epoch": 0.5045597886303588, + "grad_norm": 72.96587322710093, + "learning_rate": 9.859165865660047e-06, + "loss": 4.2298, + "step": 5920 + }, + { + "epoch": 0.5046450183243842, + "grad_norm": 50.86612133428278, + "learning_rate": 9.859048985377888e-06, + "loss": 3.8691, + "step": 5921 + }, + { + "epoch": 0.5047302480184096, + "grad_norm": 434.90754101701043, + "learning_rate": 9.85893205730891e-06, + "loss": 3.4162, + "step": 5922 + }, + { + "epoch": 0.5048154777124351, + "grad_norm": 62.93089165216517, + "learning_rate": 9.858815081454267e-06, + "loss": 4.3948, + "step": 5923 + }, + { + "epoch": 0.5049007074064604, + "grad_norm": 91.54624894917715, + "learning_rate": 9.858698057815108e-06, + "loss": 4.625, + "step": 5924 + }, + { + "epoch": 0.5049859371004858, + "grad_norm": 50.500657493607704, + "learning_rate": 9.858580986392586e-06, + "loss": 4.204, + "step": 5925 + }, + { + "epoch": 0.5050711667945113, + "grad_norm": 54.3034652748573, + "learning_rate": 9.858463867187848e-06, + "loss": 3.5876, + "step": 5926 + }, + { + "epoch": 0.5051563964885366, + "grad_norm": 73.97865041120708, + "learning_rate": 9.85834670020205e-06, + "loss": 3.7191, + "step": 5927 + }, + { + "epoch": 0.505241626182562, + "grad_norm": 78.11793136071853, + "learning_rate": 9.858229485436343e-06, + "loss": 3.3193, + "step": 5928 + }, + { + "epoch": 0.5053268558765874, + "grad_norm": 105.81746320943107, + "learning_rate": 9.858112222891881e-06, + "loss": 4.728, + "step": 5929 + }, + { + "epoch": 0.5054120855706128, + "grad_norm": 48.32522687406326, + "learning_rate": 9.857994912569812e-06, + "loss": 4.7545, + "step": 5930 + }, + { + "epoch": 0.5054973152646381, + "grad_norm": 151.9332892254753, + "learning_rate": 9.857877554471296e-06, + "loss": 4.9751, + "step": 5931 + }, + { + "epoch": 0.5055825449586636, + "grad_norm": 60.939987509841814, + "learning_rate": 9.857760148597483e-06, + "loss": 3.7859, + "step": 5932 + }, + { + "epoch": 0.505667774652689, + "grad_norm": 33.040857906753224, + "learning_rate": 9.85764269494953e-06, + "loss": 3.5424, + "step": 5933 + }, + { + "epoch": 0.5057530043467144, + "grad_norm": 54.77351347590993, + "learning_rate": 9.857525193528592e-06, + "loss": 3.1292, + "step": 5934 + }, + { + "epoch": 0.5058382340407398, + "grad_norm": 48.98396278516292, + "learning_rate": 9.857407644335822e-06, + "loss": 4.8378, + "step": 5935 + }, + { + "epoch": 0.5059234637347652, + "grad_norm": 40.79945752594696, + "learning_rate": 9.85729004737238e-06, + "loss": 3.1979, + "step": 5936 + }, + { + "epoch": 0.5060086934287906, + "grad_norm": 33.01713790389166, + "learning_rate": 9.857172402639418e-06, + "loss": 2.6942, + "step": 5937 + }, + { + "epoch": 0.506093923122816, + "grad_norm": 116.61393491002097, + "learning_rate": 9.857054710138098e-06, + "loss": 4.5528, + "step": 5938 + }, + { + "epoch": 0.5061791528168413, + "grad_norm": 43.06794371949652, + "learning_rate": 9.856936969869574e-06, + "loss": 3.4771, + "step": 5939 + }, + { + "epoch": 0.5062643825108668, + "grad_norm": 45.54793330310433, + "learning_rate": 9.856819181835004e-06, + "loss": 4.3967, + "step": 5940 + }, + { + "epoch": 0.5063496122048922, + "grad_norm": 49.06626421535383, + "learning_rate": 9.856701346035549e-06, + "loss": 4.3514, + "step": 5941 + }, + { + "epoch": 0.5064348418989176, + "grad_norm": 24.145953170822352, + "learning_rate": 9.856583462472366e-06, + "loss": 2.89, + "step": 5942 + }, + { + "epoch": 0.506520071592943, + "grad_norm": 38.31744886473224, + "learning_rate": 9.856465531146613e-06, + "loss": 4.4991, + "step": 5943 + }, + { + "epoch": 0.5066053012869683, + "grad_norm": 52.38168916952687, + "learning_rate": 9.856347552059454e-06, + "loss": 3.3875, + "step": 5944 + }, + { + "epoch": 0.5066905309809938, + "grad_norm": 85.86621330098147, + "learning_rate": 9.856229525212044e-06, + "loss": 3.7642, + "step": 5945 + }, + { + "epoch": 0.5067757606750192, + "grad_norm": 54.0122423316999, + "learning_rate": 9.856111450605545e-06, + "loss": 5.4443, + "step": 5946 + }, + { + "epoch": 0.5068609903690445, + "grad_norm": 76.08373393719589, + "learning_rate": 9.855993328241123e-06, + "loss": 4.7643, + "step": 5947 + }, + { + "epoch": 0.50694622006307, + "grad_norm": 65.82015365424142, + "learning_rate": 9.855875158119935e-06, + "loss": 4.0084, + "step": 5948 + }, + { + "epoch": 0.5070314497570954, + "grad_norm": 36.128657541601335, + "learning_rate": 9.855756940243142e-06, + "loss": 3.1263, + "step": 5949 + }, + { + "epoch": 0.5071166794511208, + "grad_norm": 81.328349462793, + "learning_rate": 9.855638674611911e-06, + "loss": 4.4956, + "step": 5950 + }, + { + "epoch": 0.5072019091451462, + "grad_norm": 810.97894810387, + "learning_rate": 9.855520361227404e-06, + "loss": 3.582, + "step": 5951 + }, + { + "epoch": 0.5072871388391715, + "grad_norm": 70.0322441673758, + "learning_rate": 9.855402000090783e-06, + "loss": 5.1815, + "step": 5952 + }, + { + "epoch": 0.507372368533197, + "grad_norm": 50.63957601824556, + "learning_rate": 9.85528359120321e-06, + "loss": 4.3026, + "step": 5953 + }, + { + "epoch": 0.5074575982272224, + "grad_norm": 49.6150658878888, + "learning_rate": 9.855165134565855e-06, + "loss": 4.3464, + "step": 5954 + }, + { + "epoch": 0.5075428279212477, + "grad_norm": 49.706216603202805, + "learning_rate": 9.85504663017988e-06, + "loss": 3.7111, + "step": 5955 + }, + { + "epoch": 0.5076280576152732, + "grad_norm": 33.64686070442909, + "learning_rate": 9.85492807804645e-06, + "loss": 3.6451, + "step": 5956 + }, + { + "epoch": 0.5077132873092985, + "grad_norm": 92.31408537477375, + "learning_rate": 9.854809478166733e-06, + "loss": 4.9572, + "step": 5957 + }, + { + "epoch": 0.507798517003324, + "grad_norm": 37.483836259011376, + "learning_rate": 9.854690830541893e-06, + "loss": 3.6452, + "step": 5958 + }, + { + "epoch": 0.5078837466973494, + "grad_norm": 110.17308563748463, + "learning_rate": 9.854572135173098e-06, + "loss": 4.8678, + "step": 5959 + }, + { + "epoch": 0.5079689763913747, + "grad_norm": 74.76949078475779, + "learning_rate": 9.854453392061514e-06, + "loss": 4.2023, + "step": 5960 + }, + { + "epoch": 0.5080542060854002, + "grad_norm": 70.1312169809347, + "learning_rate": 9.85433460120831e-06, + "loss": 4.2178, + "step": 5961 + }, + { + "epoch": 0.5081394357794256, + "grad_norm": 137.90375096541462, + "learning_rate": 9.854215762614654e-06, + "loss": 6.2797, + "step": 5962 + }, + { + "epoch": 0.5082246654734509, + "grad_norm": 95.07414916864528, + "learning_rate": 9.854096876281715e-06, + "loss": 4.3614, + "step": 5963 + }, + { + "epoch": 0.5083098951674764, + "grad_norm": 85.89649750712708, + "learning_rate": 9.853977942210661e-06, + "loss": 4.723, + "step": 5964 + }, + { + "epoch": 0.5083951248615017, + "grad_norm": 75.26508164131042, + "learning_rate": 9.853858960402665e-06, + "loss": 4.7406, + "step": 5965 + }, + { + "epoch": 0.5084803545555271, + "grad_norm": 106.96339543861507, + "learning_rate": 9.853739930858894e-06, + "loss": 5.0629, + "step": 5966 + }, + { + "epoch": 0.5085655842495526, + "grad_norm": 304.34050625581153, + "learning_rate": 9.85362085358052e-06, + "loss": 5.6502, + "step": 5967 + }, + { + "epoch": 0.5086508139435779, + "grad_norm": 37.54526538853574, + "learning_rate": 9.853501728568712e-06, + "loss": 2.2985, + "step": 5968 + }, + { + "epoch": 0.5087360436376034, + "grad_norm": 90.1817661121711, + "learning_rate": 9.853382555824645e-06, + "loss": 5.5671, + "step": 5969 + }, + { + "epoch": 0.5088212733316287, + "grad_norm": 49.78987530071829, + "learning_rate": 9.853263335349488e-06, + "loss": 3.8419, + "step": 5970 + }, + { + "epoch": 0.5089065030256541, + "grad_norm": 56.329853083456655, + "learning_rate": 9.853144067144414e-06, + "loss": 4.3172, + "step": 5971 + }, + { + "epoch": 0.5089917327196796, + "grad_norm": 42.14804075337529, + "learning_rate": 9.853024751210598e-06, + "loss": 3.9839, + "step": 5972 + }, + { + "epoch": 0.5090769624137049, + "grad_norm": 35.03380798565042, + "learning_rate": 9.852905387549212e-06, + "loss": 3.8423, + "step": 5973 + }, + { + "epoch": 0.5091621921077303, + "grad_norm": 104.18917017100259, + "learning_rate": 9.852785976161428e-06, + "loss": 3.755, + "step": 5974 + }, + { + "epoch": 0.5092474218017558, + "grad_norm": 53.72194147322129, + "learning_rate": 9.852666517048423e-06, + "loss": 4.6996, + "step": 5975 + }, + { + "epoch": 0.5093326514957811, + "grad_norm": 62.61381079679127, + "learning_rate": 9.852547010211373e-06, + "loss": 4.603, + "step": 5976 + }, + { + "epoch": 0.5094178811898066, + "grad_norm": 100.2572755970988, + "learning_rate": 9.852427455651448e-06, + "loss": 3.3833, + "step": 5977 + }, + { + "epoch": 0.5095031108838319, + "grad_norm": 76.82329795927198, + "learning_rate": 9.852307853369829e-06, + "loss": 4.734, + "step": 5978 + }, + { + "epoch": 0.5095883405778573, + "grad_norm": 45.42243223746524, + "learning_rate": 9.852188203367692e-06, + "loss": 2.7034, + "step": 5979 + }, + { + "epoch": 0.5096735702718828, + "grad_norm": 59.1757136836874, + "learning_rate": 9.852068505646209e-06, + "loss": 3.3937, + "step": 5980 + }, + { + "epoch": 0.5097587999659081, + "grad_norm": 32.500963018751825, + "learning_rate": 9.851948760206562e-06, + "loss": 3.4479, + "step": 5981 + }, + { + "epoch": 0.5098440296599335, + "grad_norm": 52.690916402868154, + "learning_rate": 9.851828967049928e-06, + "loss": 2.0993, + "step": 5982 + }, + { + "epoch": 0.5099292593539589, + "grad_norm": 38.05444367293422, + "learning_rate": 9.851709126177483e-06, + "loss": 3.2952, + "step": 5983 + }, + { + "epoch": 0.5100144890479843, + "grad_norm": 152.43793251666096, + "learning_rate": 9.851589237590404e-06, + "loss": 4.9723, + "step": 5984 + }, + { + "epoch": 0.5100997187420098, + "grad_norm": 101.07381465910073, + "learning_rate": 9.851469301289876e-06, + "loss": 5.3223, + "step": 5985 + }, + { + "epoch": 0.5101849484360351, + "grad_norm": 47.18590460810969, + "learning_rate": 9.851349317277074e-06, + "loss": 3.7727, + "step": 5986 + }, + { + "epoch": 0.5102701781300605, + "grad_norm": 308.15062092355697, + "learning_rate": 9.85122928555318e-06, + "loss": 5.4199, + "step": 5987 + }, + { + "epoch": 0.510355407824086, + "grad_norm": 66.02984282222573, + "learning_rate": 9.851109206119372e-06, + "loss": 4.0266, + "step": 5988 + }, + { + "epoch": 0.5104406375181113, + "grad_norm": 44.859106783958715, + "learning_rate": 9.850989078976833e-06, + "loss": 3.2879, + "step": 5989 + }, + { + "epoch": 0.5105258672121367, + "grad_norm": 47.97205638739311, + "learning_rate": 9.850868904126744e-06, + "loss": 4.785, + "step": 5990 + }, + { + "epoch": 0.5106110969061621, + "grad_norm": 44.93614881688043, + "learning_rate": 9.850748681570287e-06, + "loss": 4.4717, + "step": 5991 + }, + { + "epoch": 0.5106963266001875, + "grad_norm": 37.264096181723666, + "learning_rate": 9.850628411308647e-06, + "loss": 3.1245, + "step": 5992 + }, + { + "epoch": 0.510781556294213, + "grad_norm": 56.436754904621274, + "learning_rate": 9.850508093343e-06, + "loss": 4.286, + "step": 5993 + }, + { + "epoch": 0.5108667859882383, + "grad_norm": 76.94306830170198, + "learning_rate": 9.850387727674537e-06, + "loss": 4.3032, + "step": 5994 + }, + { + "epoch": 0.5109520156822637, + "grad_norm": 45.34612579473496, + "learning_rate": 9.850267314304436e-06, + "loss": 3.6349, + "step": 5995 + }, + { + "epoch": 0.5110372453762891, + "grad_norm": 158.85150797951573, + "learning_rate": 9.850146853233882e-06, + "loss": 4.912, + "step": 5996 + }, + { + "epoch": 0.5111224750703145, + "grad_norm": 74.1549813190187, + "learning_rate": 9.850026344464063e-06, + "loss": 4.062, + "step": 5997 + }, + { + "epoch": 0.5112077047643399, + "grad_norm": 90.64119910869348, + "learning_rate": 9.849905787996161e-06, + "loss": 4.4732, + "step": 5998 + }, + { + "epoch": 0.5112929344583653, + "grad_norm": 56.75229910545568, + "learning_rate": 9.849785183831365e-06, + "loss": 3.9866, + "step": 5999 + }, + { + "epoch": 0.5113781641523907, + "grad_norm": 37.09737369496036, + "learning_rate": 9.849664531970855e-06, + "loss": 3.4701, + "step": 6000 + }, + { + "epoch": 0.511463393846416, + "grad_norm": 41.168431526709924, + "learning_rate": 9.849543832415824e-06, + "loss": 3.0888, + "step": 6001 + }, + { + "epoch": 0.5115486235404415, + "grad_norm": 42.53621645514064, + "learning_rate": 9.849423085167456e-06, + "loss": 3.7761, + "step": 6002 + }, + { + "epoch": 0.5116338532344669, + "grad_norm": 56.39321792854902, + "learning_rate": 9.849302290226938e-06, + "loss": 3.9382, + "step": 6003 + }, + { + "epoch": 0.5117190829284923, + "grad_norm": 93.26528027677467, + "learning_rate": 9.849181447595459e-06, + "loss": 5.3726, + "step": 6004 + }, + { + "epoch": 0.5118043126225177, + "grad_norm": 800.8866440007575, + "learning_rate": 9.849060557274209e-06, + "loss": 5.7619, + "step": 6005 + }, + { + "epoch": 0.511889542316543, + "grad_norm": 122.41903045759027, + "learning_rate": 9.848939619264375e-06, + "loss": 4.115, + "step": 6006 + }, + { + "epoch": 0.5119747720105685, + "grad_norm": 38.27813416592933, + "learning_rate": 9.848818633567145e-06, + "loss": 4.0945, + "step": 6007 + }, + { + "epoch": 0.5120600017045939, + "grad_norm": 56.95616320060417, + "learning_rate": 9.848697600183712e-06, + "loss": 4.224, + "step": 6008 + }, + { + "epoch": 0.5121452313986192, + "grad_norm": 44.32104710676975, + "learning_rate": 9.848576519115263e-06, + "loss": 5.1243, + "step": 6009 + }, + { + "epoch": 0.5122304610926447, + "grad_norm": 42.136427254015615, + "learning_rate": 9.848455390362992e-06, + "loss": 3.1557, + "step": 6010 + }, + { + "epoch": 0.51231569078667, + "grad_norm": 61.299924092555344, + "learning_rate": 9.848334213928088e-06, + "loss": 3.9461, + "step": 6011 + }, + { + "epoch": 0.5124009204806955, + "grad_norm": 63.628448902051076, + "learning_rate": 9.848212989811744e-06, + "loss": 4.3169, + "step": 6012 + }, + { + "epoch": 0.5124861501747209, + "grad_norm": 37.48304249619227, + "learning_rate": 9.848091718015152e-06, + "loss": 3.4769, + "step": 6013 + }, + { + "epoch": 0.5125713798687462, + "grad_norm": 42.89330800181122, + "learning_rate": 9.847970398539504e-06, + "loss": 3.9904, + "step": 6014 + }, + { + "epoch": 0.5126566095627717, + "grad_norm": 61.0298485662653, + "learning_rate": 9.847849031385994e-06, + "loss": 4.1581, + "step": 6015 + }, + { + "epoch": 0.5127418392567971, + "grad_norm": 66.45786825556863, + "learning_rate": 9.847727616555815e-06, + "loss": 3.7588, + "step": 6016 + }, + { + "epoch": 0.5128270689508224, + "grad_norm": 91.96983453311913, + "learning_rate": 9.847606154050162e-06, + "loss": 4.496, + "step": 6017 + }, + { + "epoch": 0.5129122986448479, + "grad_norm": 53.655050044412874, + "learning_rate": 9.847484643870226e-06, + "loss": 5.2731, + "step": 6018 + }, + { + "epoch": 0.5129975283388732, + "grad_norm": 36.75203835576498, + "learning_rate": 9.847363086017208e-06, + "loss": 3.7847, + "step": 6019 + }, + { + "epoch": 0.5130827580328987, + "grad_norm": 57.285087606604954, + "learning_rate": 9.847241480492297e-06, + "loss": 4.1969, + "step": 6020 + }, + { + "epoch": 0.5131679877269241, + "grad_norm": 33.24058071301571, + "learning_rate": 9.847119827296695e-06, + "loss": 2.8162, + "step": 6021 + }, + { + "epoch": 0.5132532174209494, + "grad_norm": 43.638785668405525, + "learning_rate": 9.846998126431594e-06, + "loss": 3.9708, + "step": 6022 + }, + { + "epoch": 0.5133384471149749, + "grad_norm": 32.227286842403146, + "learning_rate": 9.846876377898193e-06, + "loss": 3.4744, + "step": 6023 + }, + { + "epoch": 0.5134236768090003, + "grad_norm": 87.40135931652867, + "learning_rate": 9.846754581697689e-06, + "loss": 4.6119, + "step": 6024 + }, + { + "epoch": 0.5135089065030256, + "grad_norm": 71.20830718453911, + "learning_rate": 9.846632737831279e-06, + "loss": 3.4587, + "step": 6025 + }, + { + "epoch": 0.5135941361970511, + "grad_norm": 126.28080150103773, + "learning_rate": 9.84651084630016e-06, + "loss": 4.509, + "step": 6026 + }, + { + "epoch": 0.5136793658910764, + "grad_norm": 142.88397545052956, + "learning_rate": 9.846388907105537e-06, + "loss": 3.9946, + "step": 6027 + }, + { + "epoch": 0.5137645955851019, + "grad_norm": 32.29280521410761, + "learning_rate": 9.846266920248601e-06, + "loss": 3.7416, + "step": 6028 + }, + { + "epoch": 0.5138498252791273, + "grad_norm": 88.2956956930872, + "learning_rate": 9.846144885730558e-06, + "loss": 5.1623, + "step": 6029 + }, + { + "epoch": 0.5139350549731526, + "grad_norm": 104.61543017820907, + "learning_rate": 9.846022803552605e-06, + "loss": 3.8588, + "step": 6030 + }, + { + "epoch": 0.5140202846671781, + "grad_norm": 53.015509484104996, + "learning_rate": 9.845900673715942e-06, + "loss": 2.6317, + "step": 6031 + }, + { + "epoch": 0.5141055143612034, + "grad_norm": 49.30665941922039, + "learning_rate": 9.845778496221773e-06, + "loss": 3.8778, + "step": 6032 + }, + { + "epoch": 0.5141907440552288, + "grad_norm": 70.46540102613713, + "learning_rate": 9.845656271071298e-06, + "loss": 4.2854, + "step": 6033 + }, + { + "epoch": 0.5142759737492543, + "grad_norm": 31.86540429316905, + "learning_rate": 9.84553399826572e-06, + "loss": 3.4859, + "step": 6034 + }, + { + "epoch": 0.5143612034432796, + "grad_norm": 36.882195639647826, + "learning_rate": 9.845411677806238e-06, + "loss": 2.7601, + "step": 6035 + }, + { + "epoch": 0.5144464331373051, + "grad_norm": 97.97128352060861, + "learning_rate": 9.84528930969406e-06, + "loss": 6.0054, + "step": 6036 + }, + { + "epoch": 0.5145316628313304, + "grad_norm": 97.38799459192298, + "learning_rate": 9.845166893930384e-06, + "loss": 5.0066, + "step": 6037 + }, + { + "epoch": 0.5146168925253558, + "grad_norm": 65.43340840851461, + "learning_rate": 9.84504443051642e-06, + "loss": 4.1588, + "step": 6038 + }, + { + "epoch": 0.5147021222193813, + "grad_norm": 120.13532563046253, + "learning_rate": 9.844921919453367e-06, + "loss": 3.8421, + "step": 6039 + }, + { + "epoch": 0.5147873519134066, + "grad_norm": 116.30521653419481, + "learning_rate": 9.844799360742435e-06, + "loss": 4.3219, + "step": 6040 + }, + { + "epoch": 0.514872581607432, + "grad_norm": 48.815129149117574, + "learning_rate": 9.844676754384822e-06, + "loss": 4.2764, + "step": 6041 + }, + { + "epoch": 0.5149578113014575, + "grad_norm": 41.881080355815186, + "learning_rate": 9.84455410038174e-06, + "loss": 3.4491, + "step": 6042 + }, + { + "epoch": 0.5150430409954828, + "grad_norm": 38.69722215367094, + "learning_rate": 9.844431398734395e-06, + "loss": 3.3913, + "step": 6043 + }, + { + "epoch": 0.5151282706895082, + "grad_norm": 104.48843594032084, + "learning_rate": 9.844308649443993e-06, + "loss": 4.2282, + "step": 6044 + }, + { + "epoch": 0.5152135003835336, + "grad_norm": 76.85143850215226, + "learning_rate": 9.844185852511738e-06, + "loss": 3.5807, + "step": 6045 + }, + { + "epoch": 0.515298730077559, + "grad_norm": 39.25653083948329, + "learning_rate": 9.84406300793884e-06, + "loss": 3.6798, + "step": 6046 + }, + { + "epoch": 0.5153839597715845, + "grad_norm": 55.01551939702444, + "learning_rate": 9.843940115726508e-06, + "loss": 4.0667, + "step": 6047 + }, + { + "epoch": 0.5154691894656098, + "grad_norm": 48.83611728641843, + "learning_rate": 9.84381717587595e-06, + "loss": 2.8662, + "step": 6048 + }, + { + "epoch": 0.5155544191596352, + "grad_norm": 85.8787818678376, + "learning_rate": 9.843694188388375e-06, + "loss": 6.3431, + "step": 6049 + }, + { + "epoch": 0.5156396488536606, + "grad_norm": 124.64145753458695, + "learning_rate": 9.84357115326499e-06, + "loss": 4.7504, + "step": 6050 + }, + { + "epoch": 0.515724878547686, + "grad_norm": 62.08275854017039, + "learning_rate": 9.843448070507008e-06, + "loss": 3.074, + "step": 6051 + }, + { + "epoch": 0.5158101082417114, + "grad_norm": 54.02422378356158, + "learning_rate": 9.843324940115641e-06, + "loss": 3.4503, + "step": 6052 + }, + { + "epoch": 0.5158953379357368, + "grad_norm": 38.57453509575516, + "learning_rate": 9.843201762092097e-06, + "loss": 3.5313, + "step": 6053 + }, + { + "epoch": 0.5159805676297622, + "grad_norm": 57.57080819333929, + "learning_rate": 9.843078536437587e-06, + "loss": 4.4294, + "step": 6054 + }, + { + "epoch": 0.5160657973237877, + "grad_norm": 56.23779117425589, + "learning_rate": 9.842955263153327e-06, + "loss": 4.2478, + "step": 6055 + }, + { + "epoch": 0.516151027017813, + "grad_norm": 41.024586697850786, + "learning_rate": 9.842831942240523e-06, + "loss": 3.0999, + "step": 6056 + }, + { + "epoch": 0.5162362567118384, + "grad_norm": 39.98617137437763, + "learning_rate": 9.842708573700392e-06, + "loss": 3.5597, + "step": 6057 + }, + { + "epoch": 0.5163214864058638, + "grad_norm": 80.59406463754053, + "learning_rate": 9.842585157534147e-06, + "loss": 4.8764, + "step": 6058 + }, + { + "epoch": 0.5164067160998892, + "grad_norm": 45.89520960419101, + "learning_rate": 9.842461693743002e-06, + "loss": 3.6995, + "step": 6059 + }, + { + "epoch": 0.5164919457939146, + "grad_norm": 86.29806937051586, + "learning_rate": 9.84233818232817e-06, + "loss": 3.7044, + "step": 6060 + }, + { + "epoch": 0.51657717548794, + "grad_norm": 50.456123815435184, + "learning_rate": 9.842214623290865e-06, + "loss": 4.852, + "step": 6061 + }, + { + "epoch": 0.5166624051819654, + "grad_norm": 35.53870344452423, + "learning_rate": 9.842091016632304e-06, + "loss": 2.6022, + "step": 6062 + }, + { + "epoch": 0.5167476348759908, + "grad_norm": 37.07991999520034, + "learning_rate": 9.841967362353703e-06, + "loss": 3.195, + "step": 6063 + }, + { + "epoch": 0.5168328645700162, + "grad_norm": 41.91519018447769, + "learning_rate": 9.841843660456275e-06, + "loss": 3.9467, + "step": 6064 + }, + { + "epoch": 0.5169180942640416, + "grad_norm": 54.42890325014038, + "learning_rate": 9.84171991094124e-06, + "loss": 3.3434, + "step": 6065 + }, + { + "epoch": 0.517003323958067, + "grad_norm": 71.13042814255985, + "learning_rate": 9.841596113809813e-06, + "loss": 5.8215, + "step": 6066 + }, + { + "epoch": 0.5170885536520924, + "grad_norm": 34.418618885387005, + "learning_rate": 9.841472269063213e-06, + "loss": 3.47, + "step": 6067 + }, + { + "epoch": 0.5171737833461177, + "grad_norm": 64.53964782412609, + "learning_rate": 9.841348376702655e-06, + "loss": 4.0354, + "step": 6068 + }, + { + "epoch": 0.5172590130401432, + "grad_norm": 132.25859908640524, + "learning_rate": 9.841224436729361e-06, + "loss": 4.3482, + "step": 6069 + }, + { + "epoch": 0.5173442427341686, + "grad_norm": 46.98692660663864, + "learning_rate": 9.841100449144548e-06, + "loss": 4.4138, + "step": 6070 + }, + { + "epoch": 0.517429472428194, + "grad_norm": 45.16757998017609, + "learning_rate": 9.840976413949437e-06, + "loss": 2.6815, + "step": 6071 + }, + { + "epoch": 0.5175147021222194, + "grad_norm": 77.12437415024695, + "learning_rate": 9.840852331145245e-06, + "loss": 4.1827, + "step": 6072 + }, + { + "epoch": 0.5175999318162448, + "grad_norm": 112.07012726374586, + "learning_rate": 9.840728200733194e-06, + "loss": 4.562, + "step": 6073 + }, + { + "epoch": 0.5176851615102702, + "grad_norm": 56.65780805485877, + "learning_rate": 9.840604022714504e-06, + "loss": 4.6176, + "step": 6074 + }, + { + "epoch": 0.5177703912042956, + "grad_norm": 60.63065153581057, + "learning_rate": 9.840479797090398e-06, + "loss": 4.4401, + "step": 6075 + }, + { + "epoch": 0.5178556208983209, + "grad_norm": 82.07624133338774, + "learning_rate": 9.840355523862097e-06, + "loss": 3.3942, + "step": 6076 + }, + { + "epoch": 0.5179408505923464, + "grad_norm": 55.63857301103979, + "learning_rate": 9.840231203030824e-06, + "loss": 4.4648, + "step": 6077 + }, + { + "epoch": 0.5180260802863718, + "grad_norm": 69.32028781561121, + "learning_rate": 9.840106834597798e-06, + "loss": 3.7501, + "step": 6078 + }, + { + "epoch": 0.5181113099803971, + "grad_norm": 102.72037396802457, + "learning_rate": 9.839982418564244e-06, + "loss": 4.9025, + "step": 6079 + }, + { + "epoch": 0.5181965396744226, + "grad_norm": 53.38582032650243, + "learning_rate": 9.839857954931388e-06, + "loss": 4.7491, + "step": 6080 + }, + { + "epoch": 0.5182817693684479, + "grad_norm": 82.00909058279092, + "learning_rate": 9.839733443700452e-06, + "loss": 4.7374, + "step": 6081 + }, + { + "epoch": 0.5183669990624734, + "grad_norm": 55.883481293299276, + "learning_rate": 9.83960888487266e-06, + "loss": 4.6866, + "step": 6082 + }, + { + "epoch": 0.5184522287564988, + "grad_norm": 64.52221907977673, + "learning_rate": 9.839484278449238e-06, + "loss": 3.8759, + "step": 6083 + }, + { + "epoch": 0.5185374584505241, + "grad_norm": 59.95821630662994, + "learning_rate": 9.839359624431412e-06, + "loss": 4.3235, + "step": 6084 + }, + { + "epoch": 0.5186226881445496, + "grad_norm": 179.8083880581171, + "learning_rate": 9.839234922820407e-06, + "loss": 4.5032, + "step": 6085 + }, + { + "epoch": 0.518707917838575, + "grad_norm": 53.42989680250481, + "learning_rate": 9.839110173617447e-06, + "loss": 3.7815, + "step": 6086 + }, + { + "epoch": 0.5187931475326003, + "grad_norm": 47.38673537633964, + "learning_rate": 9.838985376823763e-06, + "loss": 3.8134, + "step": 6087 + }, + { + "epoch": 0.5188783772266258, + "grad_norm": 61.46219410863554, + "learning_rate": 9.838860532440581e-06, + "loss": 3.9796, + "step": 6088 + }, + { + "epoch": 0.5189636069206511, + "grad_norm": 90.14459196763127, + "learning_rate": 9.838735640469128e-06, + "loss": 4.5055, + "step": 6089 + }, + { + "epoch": 0.5190488366146766, + "grad_norm": 41.15135332942136, + "learning_rate": 9.838610700910633e-06, + "loss": 3.9663, + "step": 6090 + }, + { + "epoch": 0.519134066308702, + "grad_norm": 55.25998766792354, + "learning_rate": 9.838485713766324e-06, + "loss": 3.8313, + "step": 6091 + }, + { + "epoch": 0.5192192960027273, + "grad_norm": 68.00564915180821, + "learning_rate": 9.838360679037431e-06, + "loss": 2.1856, + "step": 6092 + }, + { + "epoch": 0.5193045256967528, + "grad_norm": 88.11021529536171, + "learning_rate": 9.838235596725183e-06, + "loss": 5.189, + "step": 6093 + }, + { + "epoch": 0.5193897553907781, + "grad_norm": 89.24638496027889, + "learning_rate": 9.83811046683081e-06, + "loss": 4.6498, + "step": 6094 + }, + { + "epoch": 0.5194749850848035, + "grad_norm": 30.124312794343936, + "learning_rate": 9.837985289355544e-06, + "loss": 3.2621, + "step": 6095 + }, + { + "epoch": 0.519560214778829, + "grad_norm": 41.86645245954419, + "learning_rate": 9.837860064300616e-06, + "loss": 3.6842, + "step": 6096 + }, + { + "epoch": 0.5196454444728543, + "grad_norm": 40.22733589431767, + "learning_rate": 9.837734791667254e-06, + "loss": 3.7682, + "step": 6097 + }, + { + "epoch": 0.5197306741668798, + "grad_norm": 41.82876783077576, + "learning_rate": 9.837609471456694e-06, + "loss": 3.5692, + "step": 6098 + }, + { + "epoch": 0.5198159038609051, + "grad_norm": 49.5954214861882, + "learning_rate": 9.837484103670168e-06, + "loss": 3.95, + "step": 6099 + }, + { + "epoch": 0.5199011335549305, + "grad_norm": 99.63677428829742, + "learning_rate": 9.837358688308907e-06, + "loss": 3.9558, + "step": 6100 + }, + { + "epoch": 0.519986363248956, + "grad_norm": 63.481465325257986, + "learning_rate": 9.837233225374146e-06, + "loss": 3.8182, + "step": 6101 + }, + { + "epoch": 0.5200715929429813, + "grad_norm": 149.25877848226645, + "learning_rate": 9.837107714867119e-06, + "loss": 4.3021, + "step": 6102 + }, + { + "epoch": 0.5201568226370067, + "grad_norm": 35.82648185105424, + "learning_rate": 9.836982156789056e-06, + "loss": 4.1588, + "step": 6103 + }, + { + "epoch": 0.5202420523310322, + "grad_norm": 88.07367726518652, + "learning_rate": 9.836856551141198e-06, + "loss": 4.1403, + "step": 6104 + }, + { + "epoch": 0.5203272820250575, + "grad_norm": 163.8253543763292, + "learning_rate": 9.83673089792478e-06, + "loss": 5.6103, + "step": 6105 + }, + { + "epoch": 0.520412511719083, + "grad_norm": 36.66132248679396, + "learning_rate": 9.836605197141032e-06, + "loss": 3.5357, + "step": 6106 + }, + { + "epoch": 0.5204977414131083, + "grad_norm": 41.06961940231361, + "learning_rate": 9.836479448791197e-06, + "loss": 3.5899, + "step": 6107 + }, + { + "epoch": 0.5205829711071337, + "grad_norm": 73.62188830810483, + "learning_rate": 9.836353652876505e-06, + "loss": 4.1579, + "step": 6108 + }, + { + "epoch": 0.5206682008011592, + "grad_norm": 37.13593204527578, + "learning_rate": 9.8362278093982e-06, + "loss": 2.9579, + "step": 6109 + }, + { + "epoch": 0.5207534304951845, + "grad_norm": 44.4552269473135, + "learning_rate": 9.836101918357514e-06, + "loss": 4.3582, + "step": 6110 + }, + { + "epoch": 0.5208386601892099, + "grad_norm": 54.54108735333471, + "learning_rate": 9.835975979755687e-06, + "loss": 3.0292, + "step": 6111 + }, + { + "epoch": 0.5209238898832353, + "grad_norm": 36.91546391286831, + "learning_rate": 9.83584999359396e-06, + "loss": 3.1511, + "step": 6112 + }, + { + "epoch": 0.5210091195772607, + "grad_norm": 37.49886941302548, + "learning_rate": 9.83572395987357e-06, + "loss": 4.0131, + "step": 6113 + }, + { + "epoch": 0.5210943492712861, + "grad_norm": 49.32669854079608, + "learning_rate": 9.835597878595753e-06, + "loss": 3.854, + "step": 6114 + }, + { + "epoch": 0.5211795789653115, + "grad_norm": 49.68676799165559, + "learning_rate": 9.835471749761755e-06, + "loss": 3.8317, + "step": 6115 + }, + { + "epoch": 0.5212648086593369, + "grad_norm": 54.074579580338415, + "learning_rate": 9.835345573372813e-06, + "loss": 4.0544, + "step": 6116 + }, + { + "epoch": 0.5213500383533624, + "grad_norm": 62.342589577460494, + "learning_rate": 9.83521934943017e-06, + "loss": 4.098, + "step": 6117 + }, + { + "epoch": 0.5214352680473877, + "grad_norm": 99.55042614540699, + "learning_rate": 9.835093077935064e-06, + "loss": 4.3379, + "step": 6118 + }, + { + "epoch": 0.5215204977414131, + "grad_norm": 62.95546509935315, + "learning_rate": 9.83496675888874e-06, + "loss": 4.6438, + "step": 6119 + }, + { + "epoch": 0.5216057274354385, + "grad_norm": 54.60195914823911, + "learning_rate": 9.83484039229244e-06, + "loss": 3.1087, + "step": 6120 + }, + { + "epoch": 0.5216909571294639, + "grad_norm": 104.41329057072906, + "learning_rate": 9.834713978147403e-06, + "loss": 3.9616, + "step": 6121 + }, + { + "epoch": 0.5217761868234893, + "grad_norm": 33.17133194244803, + "learning_rate": 9.834587516454877e-06, + "loss": 3.3421, + "step": 6122 + }, + { + "epoch": 0.5218614165175147, + "grad_norm": 55.296876880741834, + "learning_rate": 9.834461007216105e-06, + "loss": 4.2274, + "step": 6123 + }, + { + "epoch": 0.5219466462115401, + "grad_norm": 60.285983174094525, + "learning_rate": 9.834334450432328e-06, + "loss": 4.5408, + "step": 6124 + }, + { + "epoch": 0.5220318759055655, + "grad_norm": 78.5667038954891, + "learning_rate": 9.834207846104793e-06, + "loss": 4.1663, + "step": 6125 + }, + { + "epoch": 0.5221171055995909, + "grad_norm": 54.93599815160792, + "learning_rate": 9.834081194234746e-06, + "loss": 4.041, + "step": 6126 + }, + { + "epoch": 0.5222023352936163, + "grad_norm": 80.50065535023218, + "learning_rate": 9.83395449482343e-06, + "loss": 3.1226, + "step": 6127 + }, + { + "epoch": 0.5222875649876417, + "grad_norm": 106.92721603989028, + "learning_rate": 9.833827747872094e-06, + "loss": 4.1924, + "step": 6128 + }, + { + "epoch": 0.5223727946816671, + "grad_norm": 57.19795316502897, + "learning_rate": 9.833700953381981e-06, + "loss": 3.4535, + "step": 6129 + }, + { + "epoch": 0.5224580243756924, + "grad_norm": 104.93300125236732, + "learning_rate": 9.83357411135434e-06, + "loss": 4.5576, + "step": 6130 + }, + { + "epoch": 0.5225432540697179, + "grad_norm": 37.25633887306021, + "learning_rate": 9.83344722179042e-06, + "loss": 2.6196, + "step": 6131 + }, + { + "epoch": 0.5226284837637433, + "grad_norm": 84.05551630404982, + "learning_rate": 9.833320284691466e-06, + "loss": 4.9776, + "step": 6132 + }, + { + "epoch": 0.5227137134577687, + "grad_norm": 79.12736428967109, + "learning_rate": 9.833193300058728e-06, + "loss": 4.2935, + "step": 6133 + }, + { + "epoch": 0.5227989431517941, + "grad_norm": 349.74012872227115, + "learning_rate": 9.833066267893454e-06, + "loss": 4.6294, + "step": 6134 + }, + { + "epoch": 0.5228841728458195, + "grad_norm": 97.59695798125715, + "learning_rate": 9.832939188196893e-06, + "loss": 5.0885, + "step": 6135 + }, + { + "epoch": 0.5229694025398449, + "grad_norm": 148.92013554442454, + "learning_rate": 9.832812060970297e-06, + "loss": 4.5964, + "step": 6136 + }, + { + "epoch": 0.5230546322338703, + "grad_norm": 132.22996564112495, + "learning_rate": 9.832684886214914e-06, + "loss": 4.5017, + "step": 6137 + }, + { + "epoch": 0.5231398619278956, + "grad_norm": 346.10757706111406, + "learning_rate": 9.832557663931995e-06, + "loss": 4.1185, + "step": 6138 + }, + { + "epoch": 0.5232250916219211, + "grad_norm": 125.91256857700785, + "learning_rate": 9.832430394122794e-06, + "loss": 4.4816, + "step": 6139 + }, + { + "epoch": 0.5233103213159465, + "grad_norm": 55.31081024854022, + "learning_rate": 9.832303076788557e-06, + "loss": 3.7433, + "step": 6140 + }, + { + "epoch": 0.5233955510099719, + "grad_norm": 73.7220739566974, + "learning_rate": 9.83217571193054e-06, + "loss": 5.2935, + "step": 6141 + }, + { + "epoch": 0.5234807807039973, + "grad_norm": 44.10589161890686, + "learning_rate": 9.832048299549996e-06, + "loss": 3.8198, + "step": 6142 + }, + { + "epoch": 0.5235660103980226, + "grad_norm": 61.611484039215604, + "learning_rate": 9.831920839648178e-06, + "loss": 3.951, + "step": 6143 + }, + { + "epoch": 0.5236512400920481, + "grad_norm": 54.28925538497327, + "learning_rate": 9.831793332226338e-06, + "loss": 4.4111, + "step": 6144 + }, + { + "epoch": 0.5237364697860735, + "grad_norm": 114.98019163186925, + "learning_rate": 9.83166577728573e-06, + "loss": 4.3139, + "step": 6145 + }, + { + "epoch": 0.5238216994800988, + "grad_norm": 82.19042958273218, + "learning_rate": 9.831538174827608e-06, + "loss": 4.7851, + "step": 6146 + }, + { + "epoch": 0.5239069291741243, + "grad_norm": 133.82643813360139, + "learning_rate": 9.83141052485323e-06, + "loss": 5.4827, + "step": 6147 + }, + { + "epoch": 0.5239921588681496, + "grad_norm": 47.781459412706674, + "learning_rate": 9.831282827363849e-06, + "loss": 2.9455, + "step": 6148 + }, + { + "epoch": 0.5240773885621751, + "grad_norm": 32.66531945001395, + "learning_rate": 9.831155082360719e-06, + "loss": 4.101, + "step": 6149 + }, + { + "epoch": 0.5241626182562005, + "grad_norm": 75.41770937464483, + "learning_rate": 9.831027289845101e-06, + "loss": 4.0868, + "step": 6150 + }, + { + "epoch": 0.5242478479502258, + "grad_norm": 97.55579970826942, + "learning_rate": 9.830899449818248e-06, + "loss": 4.0546, + "step": 6151 + }, + { + "epoch": 0.5243330776442513, + "grad_norm": 47.295083439093126, + "learning_rate": 9.830771562281417e-06, + "loss": 2.8858, + "step": 6152 + }, + { + "epoch": 0.5244183073382767, + "grad_norm": 90.20052795705867, + "learning_rate": 9.83064362723587e-06, + "loss": 3.3929, + "step": 6153 + }, + { + "epoch": 0.524503537032302, + "grad_norm": 57.03687095696615, + "learning_rate": 9.830515644682862e-06, + "loss": 3.8953, + "step": 6154 + }, + { + "epoch": 0.5245887667263275, + "grad_norm": 190.29757943573168, + "learning_rate": 9.830387614623652e-06, + "loss": 5.237, + "step": 6155 + }, + { + "epoch": 0.5246739964203528, + "grad_norm": 35.352882877953924, + "learning_rate": 9.830259537059498e-06, + "loss": 3.5933, + "step": 6156 + }, + { + "epoch": 0.5247592261143782, + "grad_norm": 57.57647397971923, + "learning_rate": 9.830131411991661e-06, + "loss": 4.3016, + "step": 6157 + }, + { + "epoch": 0.5248444558084037, + "grad_norm": 48.9049016978042, + "learning_rate": 9.830003239421402e-06, + "loss": 4.5458, + "step": 6158 + }, + { + "epoch": 0.524929685502429, + "grad_norm": 72.5224733157498, + "learning_rate": 9.829875019349979e-06, + "loss": 3.1572, + "step": 6159 + }, + { + "epoch": 0.5250149151964545, + "grad_norm": 59.06176614736015, + "learning_rate": 9.829746751778655e-06, + "loss": 4.2475, + "step": 6160 + }, + { + "epoch": 0.5251001448904798, + "grad_norm": 146.00761413860897, + "learning_rate": 9.829618436708692e-06, + "loss": 4.8611, + "step": 6161 + }, + { + "epoch": 0.5251853745845052, + "grad_norm": 68.44243600719282, + "learning_rate": 9.82949007414135e-06, + "loss": 4.7101, + "step": 6162 + }, + { + "epoch": 0.5252706042785307, + "grad_norm": 51.49942510814681, + "learning_rate": 9.829361664077892e-06, + "loss": 4.1803, + "step": 6163 + }, + { + "epoch": 0.525355833972556, + "grad_norm": 53.01729085854487, + "learning_rate": 9.829233206519581e-06, + "loss": 4.3132, + "step": 6164 + }, + { + "epoch": 0.5254410636665814, + "grad_norm": 97.53862003331824, + "learning_rate": 9.82910470146768e-06, + "loss": 4.9874, + "step": 6165 + }, + { + "epoch": 0.5255262933606069, + "grad_norm": 62.29308162639816, + "learning_rate": 9.828976148923454e-06, + "loss": 4.0385, + "step": 6166 + }, + { + "epoch": 0.5256115230546322, + "grad_norm": 33.33618789923645, + "learning_rate": 9.828847548888165e-06, + "loss": 3.3071, + "step": 6167 + }, + { + "epoch": 0.5256967527486577, + "grad_norm": 58.62802787923182, + "learning_rate": 9.828718901363081e-06, + "loss": 4.1872, + "step": 6168 + }, + { + "epoch": 0.525781982442683, + "grad_norm": 58.680338539731274, + "learning_rate": 9.828590206349464e-06, + "loss": 4.0554, + "step": 6169 + }, + { + "epoch": 0.5258672121367084, + "grad_norm": 44.42709287837459, + "learning_rate": 9.828461463848582e-06, + "loss": 3.823, + "step": 6170 + }, + { + "epoch": 0.5259524418307339, + "grad_norm": 132.5352819107996, + "learning_rate": 9.828332673861699e-06, + "loss": 4.5586, + "step": 6171 + }, + { + "epoch": 0.5260376715247592, + "grad_norm": 45.21591626929955, + "learning_rate": 9.828203836390083e-06, + "loss": 3.924, + "step": 6172 + }, + { + "epoch": 0.5261229012187846, + "grad_norm": 67.44989329978903, + "learning_rate": 9.828074951435002e-06, + "loss": 3.8032, + "step": 6173 + }, + { + "epoch": 0.52620813091281, + "grad_norm": 67.08482271007648, + "learning_rate": 9.827946018997721e-06, + "loss": 4.6619, + "step": 6174 + }, + { + "epoch": 0.5262933606068354, + "grad_norm": 69.3896947594839, + "learning_rate": 9.82781703907951e-06, + "loss": 4.0627, + "step": 6175 + }, + { + "epoch": 0.5263785903008609, + "grad_norm": 66.94675871300292, + "learning_rate": 9.827688011681636e-06, + "loss": 2.7585, + "step": 6176 + }, + { + "epoch": 0.5264638199948862, + "grad_norm": 61.44535323730678, + "learning_rate": 9.82755893680537e-06, + "loss": 4.1121, + "step": 6177 + }, + { + "epoch": 0.5265490496889116, + "grad_norm": 148.93655646369572, + "learning_rate": 9.82742981445198e-06, + "loss": 4.2155, + "step": 6178 + }, + { + "epoch": 0.526634279382937, + "grad_norm": 47.26188822444608, + "learning_rate": 9.827300644622735e-06, + "loss": 4.3285, + "step": 6179 + }, + { + "epoch": 0.5267195090769624, + "grad_norm": 42.80853548166626, + "learning_rate": 9.827171427318907e-06, + "loss": 3.2296, + "step": 6180 + }, + { + "epoch": 0.5268047387709878, + "grad_norm": 56.94046488967121, + "learning_rate": 9.827042162541766e-06, + "loss": 4.4365, + "step": 6181 + }, + { + "epoch": 0.5268899684650132, + "grad_norm": 210.98813956053354, + "learning_rate": 9.826912850292583e-06, + "loss": 3.4083, + "step": 6182 + }, + { + "epoch": 0.5269751981590386, + "grad_norm": 86.6561903964425, + "learning_rate": 9.82678349057263e-06, + "loss": 5.8522, + "step": 6183 + }, + { + "epoch": 0.5270604278530641, + "grad_norm": 87.62253771299191, + "learning_rate": 9.82665408338318e-06, + "loss": 4.8776, + "step": 6184 + }, + { + "epoch": 0.5271456575470894, + "grad_norm": 120.06764975536034, + "learning_rate": 9.826524628725506e-06, + "loss": 6.0055, + "step": 6185 + }, + { + "epoch": 0.5272308872411148, + "grad_norm": 38.62272054630549, + "learning_rate": 9.82639512660088e-06, + "loss": 3.5961, + "step": 6186 + }, + { + "epoch": 0.5273161169351402, + "grad_norm": 46.940917584002584, + "learning_rate": 9.826265577010574e-06, + "loss": 4.2744, + "step": 6187 + }, + { + "epoch": 0.5274013466291656, + "grad_norm": 60.933613972952735, + "learning_rate": 9.826135979955864e-06, + "loss": 4.4893, + "step": 6188 + }, + { + "epoch": 0.527486576323191, + "grad_norm": 93.27620261882154, + "learning_rate": 9.826006335438026e-06, + "loss": 4.8703, + "step": 6189 + }, + { + "epoch": 0.5275718060172164, + "grad_norm": 79.9956901274627, + "learning_rate": 9.825876643458332e-06, + "loss": 4.5054, + "step": 6190 + }, + { + "epoch": 0.5276570357112418, + "grad_norm": 37.41611959389647, + "learning_rate": 9.825746904018058e-06, + "loss": 3.9763, + "step": 6191 + }, + { + "epoch": 0.5277422654052671, + "grad_norm": 70.68027365094888, + "learning_rate": 9.825617117118483e-06, + "loss": 4.0137, + "step": 6192 + }, + { + "epoch": 0.5278274950992926, + "grad_norm": 57.73266420044183, + "learning_rate": 9.825487282760879e-06, + "loss": 3.7723, + "step": 6193 + }, + { + "epoch": 0.527912724793318, + "grad_norm": 51.28274738798155, + "learning_rate": 9.825357400946527e-06, + "loss": 3.8551, + "step": 6194 + }, + { + "epoch": 0.5279979544873434, + "grad_norm": 56.51607489294212, + "learning_rate": 9.825227471676701e-06, + "loss": 4.2084, + "step": 6195 + }, + { + "epoch": 0.5280831841813688, + "grad_norm": 44.9109782428559, + "learning_rate": 9.825097494952681e-06, + "loss": 3.1683, + "step": 6196 + }, + { + "epoch": 0.5281684138753941, + "grad_norm": 70.77901661040171, + "learning_rate": 9.824967470775745e-06, + "loss": 4.5776, + "step": 6197 + }, + { + "epoch": 0.5282536435694196, + "grad_norm": 64.05126417650271, + "learning_rate": 9.824837399147169e-06, + "loss": 4.0905, + "step": 6198 + }, + { + "epoch": 0.528338873263445, + "grad_norm": 176.70224751252803, + "learning_rate": 9.824707280068236e-06, + "loss": 5.2529, + "step": 6199 + }, + { + "epoch": 0.5284241029574703, + "grad_norm": 74.2445553188132, + "learning_rate": 9.824577113540224e-06, + "loss": 3.8934, + "step": 6200 + }, + { + "epoch": 0.5285093326514958, + "grad_norm": 34.6165338244499, + "learning_rate": 9.824446899564412e-06, + "loss": 3.1955, + "step": 6201 + }, + { + "epoch": 0.5285945623455212, + "grad_norm": 82.36550900194362, + "learning_rate": 9.824316638142084e-06, + "loss": 5.6274, + "step": 6202 + }, + { + "epoch": 0.5286797920395466, + "grad_norm": 78.40810470692, + "learning_rate": 9.824186329274517e-06, + "loss": 3.7146, + "step": 6203 + }, + { + "epoch": 0.528765021733572, + "grad_norm": 37.1778485401865, + "learning_rate": 9.824055972962996e-06, + "loss": 3.4401, + "step": 6204 + }, + { + "epoch": 0.5288502514275973, + "grad_norm": 88.29670557687876, + "learning_rate": 9.8239255692088e-06, + "loss": 4.3061, + "step": 6205 + }, + { + "epoch": 0.5289354811216228, + "grad_norm": 68.06022466886328, + "learning_rate": 9.823795118013213e-06, + "loss": 4.3074, + "step": 6206 + }, + { + "epoch": 0.5290207108156482, + "grad_norm": 117.78756421360556, + "learning_rate": 9.823664619377517e-06, + "loss": 4.7545, + "step": 6207 + }, + { + "epoch": 0.5291059405096735, + "grad_norm": 29.339604653776412, + "learning_rate": 9.823534073302998e-06, + "loss": 3.1437, + "step": 6208 + }, + { + "epoch": 0.529191170203699, + "grad_norm": 46.71322046081011, + "learning_rate": 9.823403479790938e-06, + "loss": 3.6964, + "step": 6209 + }, + { + "epoch": 0.5292763998977243, + "grad_norm": 73.51452909638225, + "learning_rate": 9.823272838842621e-06, + "loss": 4.4709, + "step": 6210 + }, + { + "epoch": 0.5293616295917498, + "grad_norm": 115.45079678511456, + "learning_rate": 9.823142150459332e-06, + "loss": 5.24, + "step": 6211 + }, + { + "epoch": 0.5294468592857752, + "grad_norm": 115.62432683533832, + "learning_rate": 9.823011414642356e-06, + "loss": 5.8389, + "step": 6212 + }, + { + "epoch": 0.5295320889798005, + "grad_norm": 53.17275812024313, + "learning_rate": 9.82288063139298e-06, + "loss": 4.1216, + "step": 6213 + }, + { + "epoch": 0.529617318673826, + "grad_norm": 55.873985092404304, + "learning_rate": 9.822749800712489e-06, + "loss": 3.4412, + "step": 6214 + }, + { + "epoch": 0.5297025483678514, + "grad_norm": 43.02764821559522, + "learning_rate": 9.82261892260217e-06, + "loss": 2.4432, + "step": 6215 + }, + { + "epoch": 0.5297877780618767, + "grad_norm": 33.632774945870906, + "learning_rate": 9.82248799706331e-06, + "loss": 3.3354, + "step": 6216 + }, + { + "epoch": 0.5298730077559022, + "grad_norm": 49.63601948470341, + "learning_rate": 9.822357024097198e-06, + "loss": 3.5468, + "step": 6217 + }, + { + "epoch": 0.5299582374499275, + "grad_norm": 74.79210555674752, + "learning_rate": 9.82222600370512e-06, + "loss": 3.7383, + "step": 6218 + }, + { + "epoch": 0.530043467143953, + "grad_norm": 428.9448309080901, + "learning_rate": 9.822094935888364e-06, + "loss": 5.2957, + "step": 6219 + }, + { + "epoch": 0.5301286968379784, + "grad_norm": 118.4335795692575, + "learning_rate": 9.821963820648222e-06, + "loss": 5.5913, + "step": 6220 + }, + { + "epoch": 0.5302139265320037, + "grad_norm": 56.489939915847735, + "learning_rate": 9.821832657985982e-06, + "loss": 3.8019, + "step": 6221 + }, + { + "epoch": 0.5302991562260292, + "grad_norm": 80.1642647671302, + "learning_rate": 9.821701447902934e-06, + "loss": 4.9696, + "step": 6222 + }, + { + "epoch": 0.5303843859200545, + "grad_norm": 281.28962619817736, + "learning_rate": 9.821570190400368e-06, + "loss": 4.5135, + "step": 6223 + }, + { + "epoch": 0.5304696156140799, + "grad_norm": 58.344485998773536, + "learning_rate": 9.821438885479576e-06, + "loss": 3.8312, + "step": 6224 + }, + { + "epoch": 0.5305548453081054, + "grad_norm": 48.70810892335611, + "learning_rate": 9.821307533141845e-06, + "loss": 3.8214, + "step": 6225 + }, + { + "epoch": 0.5306400750021307, + "grad_norm": 72.80139406285119, + "learning_rate": 9.821176133388473e-06, + "loss": 3.586, + "step": 6226 + }, + { + "epoch": 0.5307253046961561, + "grad_norm": 105.80038550902307, + "learning_rate": 9.82104468622075e-06, + "loss": 3.9842, + "step": 6227 + }, + { + "epoch": 0.5308105343901816, + "grad_norm": 95.79796211899826, + "learning_rate": 9.820913191639968e-06, + "loss": 4.1613, + "step": 6228 + }, + { + "epoch": 0.5308957640842069, + "grad_norm": 66.72282131767703, + "learning_rate": 9.820781649647419e-06, + "loss": 4.4206, + "step": 6229 + }, + { + "epoch": 0.5309809937782324, + "grad_norm": 50.23467559422582, + "learning_rate": 9.820650060244399e-06, + "loss": 3.7236, + "step": 6230 + }, + { + "epoch": 0.5310662234722577, + "grad_norm": 37.92879365761449, + "learning_rate": 9.8205184234322e-06, + "loss": 3.4251, + "step": 6231 + }, + { + "epoch": 0.5311514531662831, + "grad_norm": 38.18509963079561, + "learning_rate": 9.82038673921212e-06, + "loss": 3.6837, + "step": 6232 + }, + { + "epoch": 0.5312366828603086, + "grad_norm": 61.234454257230695, + "learning_rate": 9.820255007585448e-06, + "loss": 3.7512, + "step": 6233 + }, + { + "epoch": 0.5313219125543339, + "grad_norm": 73.5716656717758, + "learning_rate": 9.820123228553486e-06, + "loss": 4.1521, + "step": 6234 + }, + { + "epoch": 0.5314071422483593, + "grad_norm": 97.11341184520448, + "learning_rate": 9.819991402117527e-06, + "loss": 6.0318, + "step": 6235 + }, + { + "epoch": 0.5314923719423847, + "grad_norm": 39.42204634950718, + "learning_rate": 9.81985952827887e-06, + "loss": 3.7888, + "step": 6236 + }, + { + "epoch": 0.5315776016364101, + "grad_norm": 39.95453289470443, + "learning_rate": 9.819727607038807e-06, + "loss": 3.5224, + "step": 6237 + }, + { + "epoch": 0.5316628313304356, + "grad_norm": 62.98915500982737, + "learning_rate": 9.819595638398639e-06, + "loss": 4.1769, + "step": 6238 + }, + { + "epoch": 0.5317480610244609, + "grad_norm": 100.35544730546711, + "learning_rate": 9.819463622359664e-06, + "loss": 3.4102, + "step": 6239 + }, + { + "epoch": 0.5318332907184863, + "grad_norm": 60.14354959654688, + "learning_rate": 9.819331558923177e-06, + "loss": 3.2964, + "step": 6240 + }, + { + "epoch": 0.5319185204125118, + "grad_norm": 38.87529332865605, + "learning_rate": 9.819199448090481e-06, + "loss": 3.9104, + "step": 6241 + }, + { + "epoch": 0.5320037501065371, + "grad_norm": 38.74729676182885, + "learning_rate": 9.819067289862874e-06, + "loss": 3.904, + "step": 6242 + }, + { + "epoch": 0.5320889798005625, + "grad_norm": 79.57204787446975, + "learning_rate": 9.818935084241652e-06, + "loss": 4.5955, + "step": 6243 + }, + { + "epoch": 0.5321742094945879, + "grad_norm": 106.24642330420815, + "learning_rate": 9.818802831228121e-06, + "loss": 5.4215, + "step": 6244 + }, + { + "epoch": 0.5322594391886133, + "grad_norm": 104.76192599825946, + "learning_rate": 9.818670530823579e-06, + "loss": 5.4546, + "step": 6245 + }, + { + "epoch": 0.5323446688826388, + "grad_norm": 32.04773301974468, + "learning_rate": 9.818538183029327e-06, + "loss": 2.9129, + "step": 6246 + }, + { + "epoch": 0.5324298985766641, + "grad_norm": 52.61412366767528, + "learning_rate": 9.818405787846665e-06, + "loss": 4.5382, + "step": 6247 + }, + { + "epoch": 0.5325151282706895, + "grad_norm": 67.08045805777567, + "learning_rate": 9.8182733452769e-06, + "loss": 5.2829, + "step": 6248 + }, + { + "epoch": 0.5326003579647149, + "grad_norm": 83.42336652395929, + "learning_rate": 9.818140855321327e-06, + "loss": 4.2345, + "step": 6249 + }, + { + "epoch": 0.5326855876587403, + "grad_norm": 73.13733788117833, + "learning_rate": 9.818008317981256e-06, + "loss": 2.839, + "step": 6250 + }, + { + "epoch": 0.5327708173527657, + "grad_norm": 53.34349528695025, + "learning_rate": 9.817875733257987e-06, + "loss": 4.3237, + "step": 6251 + }, + { + "epoch": 0.5328560470467911, + "grad_norm": 56.928984574540344, + "learning_rate": 9.817743101152824e-06, + "loss": 4.2792, + "step": 6252 + }, + { + "epoch": 0.5329412767408165, + "grad_norm": 61.33643703732121, + "learning_rate": 9.817610421667072e-06, + "loss": 4.7534, + "step": 6253 + }, + { + "epoch": 0.533026506434842, + "grad_norm": 87.32009065940547, + "learning_rate": 9.817477694802035e-06, + "loss": 4.9336, + "step": 6254 + }, + { + "epoch": 0.5331117361288673, + "grad_norm": 118.60134531504887, + "learning_rate": 9.817344920559023e-06, + "loss": 4.1901, + "step": 6255 + }, + { + "epoch": 0.5331969658228927, + "grad_norm": 43.90720865659277, + "learning_rate": 9.817212098939334e-06, + "loss": 3.4101, + "step": 6256 + }, + { + "epoch": 0.5332821955169181, + "grad_norm": 34.72884972512831, + "learning_rate": 9.81707922994428e-06, + "loss": 3.5498, + "step": 6257 + }, + { + "epoch": 0.5333674252109435, + "grad_norm": 35.197166788155215, + "learning_rate": 9.816946313575165e-06, + "loss": 3.8325, + "step": 6258 + }, + { + "epoch": 0.5334526549049688, + "grad_norm": 49.732907301033585, + "learning_rate": 9.816813349833299e-06, + "loss": 4.0499, + "step": 6259 + }, + { + "epoch": 0.5335378845989943, + "grad_norm": 128.65457170531477, + "learning_rate": 9.816680338719986e-06, + "loss": 3.6947, + "step": 6260 + }, + { + "epoch": 0.5336231142930197, + "grad_norm": 66.55251038722125, + "learning_rate": 9.816547280236534e-06, + "loss": 4.4068, + "step": 6261 + }, + { + "epoch": 0.5337083439870451, + "grad_norm": 52.082045601511794, + "learning_rate": 9.816414174384256e-06, + "loss": 4.1556, + "step": 6262 + }, + { + "epoch": 0.5337935736810705, + "grad_norm": 109.65352381075523, + "learning_rate": 9.816281021164459e-06, + "loss": 4.1191, + "step": 6263 + }, + { + "epoch": 0.5338788033750959, + "grad_norm": 46.49675402881843, + "learning_rate": 9.816147820578449e-06, + "loss": 2.4696, + "step": 6264 + }, + { + "epoch": 0.5339640330691213, + "grad_norm": 86.7657950739738, + "learning_rate": 9.816014572627543e-06, + "loss": 5.1803, + "step": 6265 + }, + { + "epoch": 0.5340492627631467, + "grad_norm": 97.36478174031305, + "learning_rate": 9.815881277313044e-06, + "loss": 4.1126, + "step": 6266 + }, + { + "epoch": 0.534134492457172, + "grad_norm": 76.03704070337278, + "learning_rate": 9.815747934636268e-06, + "loss": 5.6386, + "step": 6267 + }, + { + "epoch": 0.5342197221511975, + "grad_norm": 44.932287782990954, + "learning_rate": 9.815614544598524e-06, + "loss": 3.2078, + "step": 6268 + }, + { + "epoch": 0.5343049518452229, + "grad_norm": 33.95376005870744, + "learning_rate": 9.815481107201124e-06, + "loss": 3.2708, + "step": 6269 + }, + { + "epoch": 0.5343901815392482, + "grad_norm": 70.97867381469143, + "learning_rate": 9.815347622445383e-06, + "loss": 4.5318, + "step": 6270 + }, + { + "epoch": 0.5344754112332737, + "grad_norm": 63.80547222606847, + "learning_rate": 9.81521409033261e-06, + "loss": 4.5442, + "step": 6271 + }, + { + "epoch": 0.534560640927299, + "grad_norm": 181.6171801468707, + "learning_rate": 9.81508051086412e-06, + "loss": 2.727, + "step": 6272 + }, + { + "epoch": 0.5346458706213245, + "grad_norm": 62.40518637002304, + "learning_rate": 9.814946884041225e-06, + "loss": 3.5157, + "step": 6273 + }, + { + "epoch": 0.5347311003153499, + "grad_norm": 54.952569375554205, + "learning_rate": 9.814813209865242e-06, + "loss": 4.5718, + "step": 6274 + }, + { + "epoch": 0.5348163300093752, + "grad_norm": 71.80948565669364, + "learning_rate": 9.814679488337485e-06, + "loss": 3.8814, + "step": 6275 + }, + { + "epoch": 0.5349015597034007, + "grad_norm": 55.58902165078893, + "learning_rate": 9.814545719459267e-06, + "loss": 3.9401, + "step": 6276 + }, + { + "epoch": 0.534986789397426, + "grad_norm": 92.07830487479757, + "learning_rate": 9.814411903231907e-06, + "loss": 4.9438, + "step": 6277 + }, + { + "epoch": 0.5350720190914514, + "grad_norm": 139.68694539473984, + "learning_rate": 9.814278039656717e-06, + "loss": 3.2664, + "step": 6278 + }, + { + "epoch": 0.5351572487854769, + "grad_norm": 56.65119623267713, + "learning_rate": 9.814144128735015e-06, + "loss": 3.6891, + "step": 6279 + }, + { + "epoch": 0.5352424784795022, + "grad_norm": 102.77216859431275, + "learning_rate": 9.81401017046812e-06, + "loss": 4.1385, + "step": 6280 + }, + { + "epoch": 0.5353277081735277, + "grad_norm": 40.971098710808945, + "learning_rate": 9.813876164857347e-06, + "loss": 3.7936, + "step": 6281 + }, + { + "epoch": 0.5354129378675531, + "grad_norm": 75.40279153022043, + "learning_rate": 9.813742111904017e-06, + "loss": 3.346, + "step": 6282 + }, + { + "epoch": 0.5354981675615784, + "grad_norm": 58.81712895863266, + "learning_rate": 9.813608011609443e-06, + "loss": 4.7145, + "step": 6283 + }, + { + "epoch": 0.5355833972556039, + "grad_norm": 95.71885549165575, + "learning_rate": 9.81347386397495e-06, + "loss": 4.1912, + "step": 6284 + }, + { + "epoch": 0.5356686269496292, + "grad_norm": 45.57328996413426, + "learning_rate": 9.813339669001851e-06, + "loss": 3.547, + "step": 6285 + }, + { + "epoch": 0.5357538566436546, + "grad_norm": 48.909662399746814, + "learning_rate": 9.81320542669147e-06, + "loss": 4.1399, + "step": 6286 + }, + { + "epoch": 0.5358390863376801, + "grad_norm": 80.59556400788897, + "learning_rate": 9.813071137045126e-06, + "loss": 4.1881, + "step": 6287 + }, + { + "epoch": 0.5359243160317054, + "grad_norm": 71.05234812858215, + "learning_rate": 9.812936800064141e-06, + "loss": 3.9609, + "step": 6288 + }, + { + "epoch": 0.5360095457257309, + "grad_norm": 33.370556666036634, + "learning_rate": 9.812802415749836e-06, + "loss": 3.6109, + "step": 6289 + }, + { + "epoch": 0.5360947754197563, + "grad_norm": 64.94910036120238, + "learning_rate": 9.81266798410353e-06, + "loss": 4.1245, + "step": 6290 + }, + { + "epoch": 0.5361800051137816, + "grad_norm": 126.6904286493837, + "learning_rate": 9.812533505126546e-06, + "loss": 4.5081, + "step": 6291 + }, + { + "epoch": 0.5362652348078071, + "grad_norm": 82.54942200277347, + "learning_rate": 9.812398978820209e-06, + "loss": 3.8187, + "step": 6292 + }, + { + "epoch": 0.5363504645018324, + "grad_norm": 44.82126404081596, + "learning_rate": 9.812264405185841e-06, + "loss": 3.4128, + "step": 6293 + }, + { + "epoch": 0.5364356941958578, + "grad_norm": 106.31036945917756, + "learning_rate": 9.812129784224763e-06, + "loss": 4.9297, + "step": 6294 + }, + { + "epoch": 0.5365209238898833, + "grad_norm": 57.39985902051263, + "learning_rate": 9.8119951159383e-06, + "loss": 4.012, + "step": 6295 + }, + { + "epoch": 0.5366061535839086, + "grad_norm": 120.22113045073723, + "learning_rate": 9.81186040032778e-06, + "loss": 4.8731, + "step": 6296 + }, + { + "epoch": 0.5366913832779341, + "grad_norm": 47.13695379049335, + "learning_rate": 9.811725637394522e-06, + "loss": 3.7464, + "step": 6297 + }, + { + "epoch": 0.5367766129719594, + "grad_norm": 112.79416013415228, + "learning_rate": 9.811590827139857e-06, + "loss": 4.704, + "step": 6298 + }, + { + "epoch": 0.5368618426659848, + "grad_norm": 49.623552424231406, + "learning_rate": 9.811455969565104e-06, + "loss": 4.2673, + "step": 6299 + }, + { + "epoch": 0.5369470723600103, + "grad_norm": 34.74410661247958, + "learning_rate": 9.811321064671597e-06, + "loss": 3.2549, + "step": 6300 + }, + { + "epoch": 0.5370323020540356, + "grad_norm": 64.24465530684728, + "learning_rate": 9.81118611246066e-06, + "loss": 3.9453, + "step": 6301 + }, + { + "epoch": 0.537117531748061, + "grad_norm": 49.05539549887171, + "learning_rate": 9.811051112933616e-06, + "loss": 4.8385, + "step": 6302 + }, + { + "epoch": 0.5372027614420865, + "grad_norm": 71.92956601942505, + "learning_rate": 9.810916066091797e-06, + "loss": 4.6493, + "step": 6303 + }, + { + "epoch": 0.5372879911361118, + "grad_norm": 48.27903785983637, + "learning_rate": 9.810780971936532e-06, + "loss": 3.2722, + "step": 6304 + }, + { + "epoch": 0.5373732208301372, + "grad_norm": 61.13400138507994, + "learning_rate": 9.810645830469147e-06, + "loss": 4.8485, + "step": 6305 + }, + { + "epoch": 0.5374584505241626, + "grad_norm": 63.143872419604214, + "learning_rate": 9.81051064169097e-06, + "loss": 3.8225, + "step": 6306 + }, + { + "epoch": 0.537543680218188, + "grad_norm": 64.20436144667462, + "learning_rate": 9.810375405603336e-06, + "loss": 4.0306, + "step": 6307 + }, + { + "epoch": 0.5376289099122135, + "grad_norm": 120.30368756651531, + "learning_rate": 9.810240122207568e-06, + "loss": 3.7672, + "step": 6308 + }, + { + "epoch": 0.5377141396062388, + "grad_norm": 106.21805669508849, + "learning_rate": 9.810104791505001e-06, + "loss": 4.4001, + "step": 6309 + }, + { + "epoch": 0.5377993693002642, + "grad_norm": 57.24914686758172, + "learning_rate": 9.809969413496964e-06, + "loss": 3.8103, + "step": 6310 + }, + { + "epoch": 0.5378845989942896, + "grad_norm": 81.82889295908367, + "learning_rate": 9.809833988184792e-06, + "loss": 4.5867, + "step": 6311 + }, + { + "epoch": 0.537969828688315, + "grad_norm": 64.23165249496925, + "learning_rate": 9.809698515569812e-06, + "loss": 4.6952, + "step": 6312 + }, + { + "epoch": 0.5380550583823404, + "grad_norm": 292.19247491292634, + "learning_rate": 9.809562995653356e-06, + "loss": 3.814, + "step": 6313 + }, + { + "epoch": 0.5381402880763658, + "grad_norm": 42.14793828842963, + "learning_rate": 9.809427428436761e-06, + "loss": 4.1092, + "step": 6314 + }, + { + "epoch": 0.5382255177703912, + "grad_norm": 82.98865894769204, + "learning_rate": 9.80929181392136e-06, + "loss": 3.7119, + "step": 6315 + }, + { + "epoch": 0.5383107474644167, + "grad_norm": 65.02342007067406, + "learning_rate": 9.809156152108483e-06, + "loss": 4.7276, + "step": 6316 + }, + { + "epoch": 0.538395977158442, + "grad_norm": 58.18039777698409, + "learning_rate": 9.809020442999467e-06, + "loss": 3.913, + "step": 6317 + }, + { + "epoch": 0.5384812068524674, + "grad_norm": 66.56329933438772, + "learning_rate": 9.808884686595645e-06, + "loss": 4.7611, + "step": 6318 + }, + { + "epoch": 0.5385664365464928, + "grad_norm": 55.680756056282256, + "learning_rate": 9.808748882898353e-06, + "loss": 3.921, + "step": 6319 + }, + { + "epoch": 0.5386516662405182, + "grad_norm": 65.84881821184308, + "learning_rate": 9.808613031908927e-06, + "loss": 4.6597, + "step": 6320 + }, + { + "epoch": 0.5387368959345435, + "grad_norm": 43.138814951920054, + "learning_rate": 9.808477133628704e-06, + "loss": 4.0244, + "step": 6321 + }, + { + "epoch": 0.538822125628569, + "grad_norm": 118.33596833221567, + "learning_rate": 9.808341188059017e-06, + "loss": 3.8022, + "step": 6322 + }, + { + "epoch": 0.5389073553225944, + "grad_norm": 76.79581855123395, + "learning_rate": 9.808205195201207e-06, + "loss": 4.9279, + "step": 6323 + }, + { + "epoch": 0.5389925850166198, + "grad_norm": 86.15381149075192, + "learning_rate": 9.808069155056608e-06, + "loss": 3.3946, + "step": 6324 + }, + { + "epoch": 0.5390778147106452, + "grad_norm": 191.11474031387175, + "learning_rate": 9.80793306762656e-06, + "loss": 5.7629, + "step": 6325 + }, + { + "epoch": 0.5391630444046706, + "grad_norm": 45.16006689001985, + "learning_rate": 9.8077969329124e-06, + "loss": 4.3314, + "step": 6326 + }, + { + "epoch": 0.539248274098696, + "grad_norm": 137.58301995761616, + "learning_rate": 9.807660750915468e-06, + "loss": 3.9893, + "step": 6327 + }, + { + "epoch": 0.5393335037927214, + "grad_norm": 27.644375819062578, + "learning_rate": 9.807524521637103e-06, + "loss": 2.6188, + "step": 6328 + }, + { + "epoch": 0.5394187334867467, + "grad_norm": 342.71459193849216, + "learning_rate": 9.807388245078644e-06, + "loss": 3.8665, + "step": 6329 + }, + { + "epoch": 0.5395039631807722, + "grad_norm": 86.12904710963569, + "learning_rate": 9.807251921241434e-06, + "loss": 4.9036, + "step": 6330 + }, + { + "epoch": 0.5395891928747976, + "grad_norm": 47.59727875824649, + "learning_rate": 9.807115550126809e-06, + "loss": 3.7216, + "step": 6331 + }, + { + "epoch": 0.539674422568823, + "grad_norm": 47.1893827411996, + "learning_rate": 9.806979131736114e-06, + "loss": 3.4926, + "step": 6332 + }, + { + "epoch": 0.5397596522628484, + "grad_norm": 38.38380256837344, + "learning_rate": 9.80684266607069e-06, + "loss": 4.3352, + "step": 6333 + }, + { + "epoch": 0.5398448819568737, + "grad_norm": 106.36791583638069, + "learning_rate": 9.806706153131878e-06, + "loss": 4.0455, + "step": 6334 + }, + { + "epoch": 0.5399301116508992, + "grad_norm": 67.40597066383451, + "learning_rate": 9.80656959292102e-06, + "loss": 3.5948, + "step": 6335 + }, + { + "epoch": 0.5400153413449246, + "grad_norm": 53.804811132753265, + "learning_rate": 9.80643298543946e-06, + "loss": 4.3923, + "step": 6336 + }, + { + "epoch": 0.5401005710389499, + "grad_norm": 44.11124024990956, + "learning_rate": 9.806296330688543e-06, + "loss": 3.3402, + "step": 6337 + }, + { + "epoch": 0.5401858007329754, + "grad_norm": 67.1023203067789, + "learning_rate": 9.806159628669609e-06, + "loss": 4.4883, + "step": 6338 + }, + { + "epoch": 0.5402710304270008, + "grad_norm": 73.5658833495862, + "learning_rate": 9.806022879384008e-06, + "loss": 2.9079, + "step": 6339 + }, + { + "epoch": 0.5403562601210261, + "grad_norm": 76.90511609221717, + "learning_rate": 9.805886082833079e-06, + "loss": 4.3429, + "step": 6340 + }, + { + "epoch": 0.5404414898150516, + "grad_norm": 31.8766817170934, + "learning_rate": 9.80574923901817e-06, + "loss": 3.4371, + "step": 6341 + }, + { + "epoch": 0.5405267195090769, + "grad_norm": 56.9653280854829, + "learning_rate": 9.805612347940629e-06, + "loss": 4.2925, + "step": 6342 + }, + { + "epoch": 0.5406119492031024, + "grad_norm": 46.00978015415495, + "learning_rate": 9.805475409601799e-06, + "loss": 3.4644, + "step": 6343 + }, + { + "epoch": 0.5406971788971278, + "grad_norm": 89.88227430363877, + "learning_rate": 9.805338424003028e-06, + "loss": 4.0647, + "step": 6344 + }, + { + "epoch": 0.5407824085911531, + "grad_norm": 101.17870554773707, + "learning_rate": 9.805201391145663e-06, + "loss": 6.2501, + "step": 6345 + }, + { + "epoch": 0.5408676382851786, + "grad_norm": 59.002829039414735, + "learning_rate": 9.805064311031052e-06, + "loss": 3.8117, + "step": 6346 + }, + { + "epoch": 0.5409528679792039, + "grad_norm": 41.86388426606348, + "learning_rate": 9.804927183660542e-06, + "loss": 3.2565, + "step": 6347 + }, + { + "epoch": 0.5410380976732293, + "grad_norm": 85.43827548518327, + "learning_rate": 9.804790009035482e-06, + "loss": 4.4778, + "step": 6348 + }, + { + "epoch": 0.5411233273672548, + "grad_norm": 36.47659651448968, + "learning_rate": 9.804652787157223e-06, + "loss": 4.7294, + "step": 6349 + }, + { + "epoch": 0.5412085570612801, + "grad_norm": 115.50293723959837, + "learning_rate": 9.804515518027114e-06, + "loss": 4.3584, + "step": 6350 + }, + { + "epoch": 0.5412937867553056, + "grad_norm": 47.93656472281737, + "learning_rate": 9.8043782016465e-06, + "loss": 3.6657, + "step": 6351 + }, + { + "epoch": 0.541379016449331, + "grad_norm": 38.49074185923366, + "learning_rate": 9.804240838016738e-06, + "loss": 3.3136, + "step": 6352 + }, + { + "epoch": 0.5414642461433563, + "grad_norm": 34.51738593443349, + "learning_rate": 9.804103427139177e-06, + "loss": 3.6486, + "step": 6353 + }, + { + "epoch": 0.5415494758373818, + "grad_norm": 34.391684516428576, + "learning_rate": 9.803965969015167e-06, + "loss": 3.3712, + "step": 6354 + }, + { + "epoch": 0.5416347055314071, + "grad_norm": 29.097912737845313, + "learning_rate": 9.803828463646061e-06, + "loss": 2.5654, + "step": 6355 + }, + { + "epoch": 0.5417199352254325, + "grad_norm": 99.7461726792975, + "learning_rate": 9.80369091103321e-06, + "loss": 6.306, + "step": 6356 + }, + { + "epoch": 0.541805164919458, + "grad_norm": 71.68328438336219, + "learning_rate": 9.803553311177969e-06, + "loss": 4.7628, + "step": 6357 + }, + { + "epoch": 0.5418903946134833, + "grad_norm": 64.91219686578447, + "learning_rate": 9.803415664081689e-06, + "loss": 4.4316, + "step": 6358 + }, + { + "epoch": 0.5419756243075088, + "grad_norm": 30.489054384643627, + "learning_rate": 9.803277969745724e-06, + "loss": 3.0801, + "step": 6359 + }, + { + "epoch": 0.5420608540015341, + "grad_norm": 42.66328909546936, + "learning_rate": 9.80314022817143e-06, + "loss": 3.8204, + "step": 6360 + }, + { + "epoch": 0.5421460836955595, + "grad_norm": 48.81351678224321, + "learning_rate": 9.80300243936016e-06, + "loss": 3.8062, + "step": 6361 + }, + { + "epoch": 0.542231313389585, + "grad_norm": 42.51255815001212, + "learning_rate": 9.80286460331327e-06, + "loss": 3.7425, + "step": 6362 + }, + { + "epoch": 0.5423165430836103, + "grad_norm": 44.295808684271, + "learning_rate": 9.802726720032115e-06, + "loss": 4.1151, + "step": 6363 + }, + { + "epoch": 0.5424017727776357, + "grad_norm": 43.650435648260824, + "learning_rate": 9.80258878951805e-06, + "loss": 3.4587, + "step": 6364 + }, + { + "epoch": 0.5424870024716612, + "grad_norm": 91.73789497266407, + "learning_rate": 9.802450811772434e-06, + "loss": 3.8195, + "step": 6365 + }, + { + "epoch": 0.5425722321656865, + "grad_norm": 37.83817535838133, + "learning_rate": 9.802312786796622e-06, + "loss": 3.3652, + "step": 6366 + }, + { + "epoch": 0.542657461859712, + "grad_norm": 227.29062249014856, + "learning_rate": 9.802174714591971e-06, + "loss": 4.24, + "step": 6367 + }, + { + "epoch": 0.5427426915537373, + "grad_norm": 38.799424181605445, + "learning_rate": 9.802036595159842e-06, + "loss": 4.257, + "step": 6368 + }, + { + "epoch": 0.5428279212477627, + "grad_norm": 101.52334598141468, + "learning_rate": 9.80189842850159e-06, + "loss": 4.1963, + "step": 6369 + }, + { + "epoch": 0.5429131509417882, + "grad_norm": 62.63185749212388, + "learning_rate": 9.801760214618576e-06, + "loss": 3.3116, + "step": 6370 + }, + { + "epoch": 0.5429983806358135, + "grad_norm": 42.023105347501826, + "learning_rate": 9.801621953512157e-06, + "loss": 4.1138, + "step": 6371 + }, + { + "epoch": 0.5430836103298389, + "grad_norm": 134.94266324110836, + "learning_rate": 9.801483645183694e-06, + "loss": 4.5165, + "step": 6372 + }, + { + "epoch": 0.5431688400238643, + "grad_norm": 42.95463731203372, + "learning_rate": 9.801345289634548e-06, + "loss": 4.524, + "step": 6373 + }, + { + "epoch": 0.5432540697178897, + "grad_norm": 123.7494490784202, + "learning_rate": 9.80120688686608e-06, + "loss": 1.6752, + "step": 6374 + }, + { + "epoch": 0.5433392994119152, + "grad_norm": 92.74983686176164, + "learning_rate": 9.801068436879647e-06, + "loss": 4.3656, + "step": 6375 + }, + { + "epoch": 0.5434245291059405, + "grad_norm": 100.00704463100095, + "learning_rate": 9.800929939676617e-06, + "loss": 4.6347, + "step": 6376 + }, + { + "epoch": 0.5435097587999659, + "grad_norm": 51.91204299151719, + "learning_rate": 9.800791395258346e-06, + "loss": 4.329, + "step": 6377 + }, + { + "epoch": 0.5435949884939913, + "grad_norm": 66.0291844488549, + "learning_rate": 9.800652803626201e-06, + "loss": 5.2199, + "step": 6378 + }, + { + "epoch": 0.5436802181880167, + "grad_norm": 40.39595024569811, + "learning_rate": 9.800514164781543e-06, + "loss": 3.6479, + "step": 6379 + }, + { + "epoch": 0.5437654478820421, + "grad_norm": 37.30725496990477, + "learning_rate": 9.800375478725736e-06, + "loss": 2.779, + "step": 6380 + }, + { + "epoch": 0.5438506775760675, + "grad_norm": 35.42347126172939, + "learning_rate": 9.800236745460143e-06, + "loss": 2.8876, + "step": 6381 + }, + { + "epoch": 0.5439359072700929, + "grad_norm": 32.574838514287144, + "learning_rate": 9.800097964986129e-06, + "loss": 3.7338, + "step": 6382 + }, + { + "epoch": 0.5440211369641182, + "grad_norm": 161.02769787197323, + "learning_rate": 9.79995913730506e-06, + "loss": 4.9655, + "step": 6383 + }, + { + "epoch": 0.5441063666581437, + "grad_norm": 43.30434795250893, + "learning_rate": 9.799820262418299e-06, + "loss": 3.8645, + "step": 6384 + }, + { + "epoch": 0.5441915963521691, + "grad_norm": 63.99195208212581, + "learning_rate": 9.799681340327213e-06, + "loss": 4.8096, + "step": 6385 + }, + { + "epoch": 0.5442768260461945, + "grad_norm": 72.87697042638383, + "learning_rate": 9.799542371033169e-06, + "loss": 3.3469, + "step": 6386 + }, + { + "epoch": 0.5443620557402199, + "grad_norm": 65.52442886267175, + "learning_rate": 9.79940335453753e-06, + "loss": 3.5817, + "step": 6387 + }, + { + "epoch": 0.5444472854342453, + "grad_norm": 84.63025532369197, + "learning_rate": 9.79926429084167e-06, + "loss": 5.6635, + "step": 6388 + }, + { + "epoch": 0.5445325151282707, + "grad_norm": 83.31426735705506, + "learning_rate": 9.799125179946951e-06, + "loss": 4.9427, + "step": 6389 + }, + { + "epoch": 0.5446177448222961, + "grad_norm": 37.39647362964618, + "learning_rate": 9.798986021854743e-06, + "loss": 3.6559, + "step": 6390 + }, + { + "epoch": 0.5447029745163214, + "grad_norm": 80.36237391174087, + "learning_rate": 9.798846816566415e-06, + "loss": 5.0935, + "step": 6391 + }, + { + "epoch": 0.5447882042103469, + "grad_norm": 101.37445459734137, + "learning_rate": 9.798707564083335e-06, + "loss": 3.7416, + "step": 6392 + }, + { + "epoch": 0.5448734339043723, + "grad_norm": 85.75075492462845, + "learning_rate": 9.798568264406873e-06, + "loss": 4.2464, + "step": 6393 + }, + { + "epoch": 0.5449586635983977, + "grad_norm": 43.45350319070783, + "learning_rate": 9.798428917538399e-06, + "loss": 3.5921, + "step": 6394 + }, + { + "epoch": 0.5450438932924231, + "grad_norm": 69.17235456691036, + "learning_rate": 9.798289523479283e-06, + "loss": 3.8159, + "step": 6395 + }, + { + "epoch": 0.5451291229864484, + "grad_norm": 63.16973295734898, + "learning_rate": 9.798150082230896e-06, + "loss": 3.8481, + "step": 6396 + }, + { + "epoch": 0.5452143526804739, + "grad_norm": 48.61786501314894, + "learning_rate": 9.79801059379461e-06, + "loss": 2.8542, + "step": 6397 + }, + { + "epoch": 0.5452995823744993, + "grad_norm": 70.49764267295885, + "learning_rate": 9.797871058171796e-06, + "loss": 4.3253, + "step": 6398 + }, + { + "epoch": 0.5453848120685246, + "grad_norm": 44.16859104044838, + "learning_rate": 9.797731475363827e-06, + "loss": 3.4253, + "step": 6399 + }, + { + "epoch": 0.5454700417625501, + "grad_norm": 72.78785675428166, + "learning_rate": 9.797591845372076e-06, + "loss": 3.9111, + "step": 6400 + }, + { + "epoch": 0.5455552714565755, + "grad_norm": 127.52804505490418, + "learning_rate": 9.797452168197913e-06, + "loss": 5.2857, + "step": 6401 + }, + { + "epoch": 0.5456405011506009, + "grad_norm": 52.05142443666455, + "learning_rate": 9.797312443842716e-06, + "loss": 4.3314, + "step": 6402 + }, + { + "epoch": 0.5457257308446263, + "grad_norm": 84.29566597151737, + "learning_rate": 9.797172672307858e-06, + "loss": 3.9149, + "step": 6403 + }, + { + "epoch": 0.5458109605386516, + "grad_norm": 114.8787061824226, + "learning_rate": 9.797032853594712e-06, + "loss": 4.6303, + "step": 6404 + }, + { + "epoch": 0.5458961902326771, + "grad_norm": 36.868568334951426, + "learning_rate": 9.796892987704655e-06, + "loss": 3.66, + "step": 6405 + }, + { + "epoch": 0.5459814199267025, + "grad_norm": 55.30658518705101, + "learning_rate": 9.79675307463906e-06, + "loss": 4.3435, + "step": 6406 + }, + { + "epoch": 0.5460666496207278, + "grad_norm": 68.86391577229263, + "learning_rate": 9.796613114399307e-06, + "loss": 4.0844, + "step": 6407 + }, + { + "epoch": 0.5461518793147533, + "grad_norm": 160.58996283334295, + "learning_rate": 9.796473106986768e-06, + "loss": 4.44, + "step": 6408 + }, + { + "epoch": 0.5462371090087786, + "grad_norm": 132.25706816928871, + "learning_rate": 9.796333052402822e-06, + "loss": 5.3482, + "step": 6409 + }, + { + "epoch": 0.5463223387028041, + "grad_norm": 76.78958686500194, + "learning_rate": 9.796192950648845e-06, + "loss": 4.4371, + "step": 6410 + }, + { + "epoch": 0.5464075683968295, + "grad_norm": 149.57697847881477, + "learning_rate": 9.796052801726218e-06, + "loss": 4.6623, + "step": 6411 + }, + { + "epoch": 0.5464927980908548, + "grad_norm": 85.82421251689638, + "learning_rate": 9.79591260563632e-06, + "loss": 4.6112, + "step": 6412 + }, + { + "epoch": 0.5465780277848803, + "grad_norm": 64.87428162183842, + "learning_rate": 9.795772362380524e-06, + "loss": 3.8683, + "step": 6413 + }, + { + "epoch": 0.5466632574789057, + "grad_norm": 30.44882500737535, + "learning_rate": 9.795632071960213e-06, + "loss": 2.5596, + "step": 6414 + }, + { + "epoch": 0.546748487172931, + "grad_norm": 84.51742617828428, + "learning_rate": 9.795491734376766e-06, + "loss": 3.2054, + "step": 6415 + }, + { + "epoch": 0.5468337168669565, + "grad_norm": 65.77020544574879, + "learning_rate": 9.795351349631564e-06, + "loss": 3.1695, + "step": 6416 + }, + { + "epoch": 0.5469189465609818, + "grad_norm": 72.53100862861443, + "learning_rate": 9.795210917725986e-06, + "loss": 4.2253, + "step": 6417 + }, + { + "epoch": 0.5470041762550072, + "grad_norm": 86.94778693217889, + "learning_rate": 9.795070438661414e-06, + "loss": 4.3525, + "step": 6418 + }, + { + "epoch": 0.5470894059490327, + "grad_norm": 37.52741335738349, + "learning_rate": 9.79492991243923e-06, + "loss": 3.7454, + "step": 6419 + }, + { + "epoch": 0.547174635643058, + "grad_norm": 50.069580114250535, + "learning_rate": 9.794789339060817e-06, + "loss": 3.7205, + "step": 6420 + }, + { + "epoch": 0.5472598653370835, + "grad_norm": 50.795734248720066, + "learning_rate": 9.794648718527555e-06, + "loss": 4.2098, + "step": 6421 + }, + { + "epoch": 0.5473450950311088, + "grad_norm": 67.09513347876327, + "learning_rate": 9.794508050840828e-06, + "loss": 3.9084, + "step": 6422 + }, + { + "epoch": 0.5474303247251342, + "grad_norm": 77.3885782517478, + "learning_rate": 9.79436733600202e-06, + "loss": 4.7212, + "step": 6423 + }, + { + "epoch": 0.5475155544191597, + "grad_norm": 119.73760809440022, + "learning_rate": 9.794226574012513e-06, + "loss": 4.7137, + "step": 6424 + }, + { + "epoch": 0.547600784113185, + "grad_norm": 77.35512949517326, + "learning_rate": 9.794085764873693e-06, + "loss": 4.4944, + "step": 6425 + }, + { + "epoch": 0.5476860138072104, + "grad_norm": 55.39538541733907, + "learning_rate": 9.793944908586945e-06, + "loss": 4.2334, + "step": 6426 + }, + { + "epoch": 0.5477712435012358, + "grad_norm": 225.97803096005478, + "learning_rate": 9.793804005153654e-06, + "loss": 4.1989, + "step": 6427 + }, + { + "epoch": 0.5478564731952612, + "grad_norm": 77.54018320266765, + "learning_rate": 9.793663054575205e-06, + "loss": 3.5467, + "step": 6428 + }, + { + "epoch": 0.5479417028892867, + "grad_norm": 56.98775733328857, + "learning_rate": 9.793522056852986e-06, + "loss": 3.7122, + "step": 6429 + }, + { + "epoch": 0.548026932583312, + "grad_norm": 72.4879155182325, + "learning_rate": 9.79338101198838e-06, + "loss": 3.9831, + "step": 6430 + }, + { + "epoch": 0.5481121622773374, + "grad_norm": 61.39657596056265, + "learning_rate": 9.793239919982777e-06, + "loss": 3.6778, + "step": 6431 + }, + { + "epoch": 0.5481973919713629, + "grad_norm": 74.0653961726493, + "learning_rate": 9.793098780837565e-06, + "loss": 5.5371, + "step": 6432 + }, + { + "epoch": 0.5482826216653882, + "grad_norm": 32.188081950248694, + "learning_rate": 9.792957594554132e-06, + "loss": 3.1751, + "step": 6433 + }, + { + "epoch": 0.5483678513594136, + "grad_norm": 91.26905931649507, + "learning_rate": 9.792816361133863e-06, + "loss": 3.386, + "step": 6434 + }, + { + "epoch": 0.548453081053439, + "grad_norm": 53.930083846662164, + "learning_rate": 9.792675080578151e-06, + "loss": 4.4993, + "step": 6435 + }, + { + "epoch": 0.5485383107474644, + "grad_norm": 87.86099292216966, + "learning_rate": 9.792533752888385e-06, + "loss": 3.5562, + "step": 6436 + }, + { + "epoch": 0.5486235404414899, + "grad_norm": 45.10520505539933, + "learning_rate": 9.792392378065952e-06, + "loss": 3.092, + "step": 6437 + }, + { + "epoch": 0.5487087701355152, + "grad_norm": 159.0035737809341, + "learning_rate": 9.792250956112245e-06, + "loss": 4.6385, + "step": 6438 + }, + { + "epoch": 0.5487939998295406, + "grad_norm": 87.13871162463894, + "learning_rate": 9.792109487028654e-06, + "loss": 4.0484, + "step": 6439 + }, + { + "epoch": 0.548879229523566, + "grad_norm": 91.65806272688923, + "learning_rate": 9.791967970816572e-06, + "loss": 4.0043, + "step": 6440 + }, + { + "epoch": 0.5489644592175914, + "grad_norm": 52.864263504938194, + "learning_rate": 9.791826407477388e-06, + "loss": 3.6283, + "step": 6441 + }, + { + "epoch": 0.5490496889116168, + "grad_norm": 138.28966217096723, + "learning_rate": 9.791684797012493e-06, + "loss": 5.0318, + "step": 6442 + }, + { + "epoch": 0.5491349186056422, + "grad_norm": 40.02416901670796, + "learning_rate": 9.791543139423285e-06, + "loss": 3.7623, + "step": 6443 + }, + { + "epoch": 0.5492201482996676, + "grad_norm": 68.14715695964702, + "learning_rate": 9.791401434711154e-06, + "loss": 3.7799, + "step": 6444 + }, + { + "epoch": 0.5493053779936931, + "grad_norm": 123.23207514056229, + "learning_rate": 9.791259682877493e-06, + "loss": 3.7148, + "step": 6445 + }, + { + "epoch": 0.5493906076877184, + "grad_norm": 47.06423883008668, + "learning_rate": 9.791117883923697e-06, + "loss": 3.7745, + "step": 6446 + }, + { + "epoch": 0.5494758373817438, + "grad_norm": 174.59083101048455, + "learning_rate": 9.79097603785116e-06, + "loss": 6.3774, + "step": 6447 + }, + { + "epoch": 0.5495610670757692, + "grad_norm": 37.814417349841975, + "learning_rate": 9.790834144661278e-06, + "loss": 3.7297, + "step": 6448 + }, + { + "epoch": 0.5496462967697946, + "grad_norm": 51.90585397230624, + "learning_rate": 9.790692204355446e-06, + "loss": 3.649, + "step": 6449 + }, + { + "epoch": 0.54973152646382, + "grad_norm": 56.63490384137957, + "learning_rate": 9.790550216935059e-06, + "loss": 3.8761, + "step": 6450 + }, + { + "epoch": 0.5498167561578454, + "grad_norm": 50.944462870243775, + "learning_rate": 9.790408182401515e-06, + "loss": 3.5404, + "step": 6451 + }, + { + "epoch": 0.5499019858518708, + "grad_norm": 120.65073969190534, + "learning_rate": 9.79026610075621e-06, + "loss": 3.4826, + "step": 6452 + }, + { + "epoch": 0.5499872155458961, + "grad_norm": 77.5226252190231, + "learning_rate": 9.790123972000542e-06, + "loss": 4.768, + "step": 6453 + }, + { + "epoch": 0.5500724452399216, + "grad_norm": 52.40772029466509, + "learning_rate": 9.789981796135907e-06, + "loss": 3.4805, + "step": 6454 + }, + { + "epoch": 0.550157674933947, + "grad_norm": 51.599425684322114, + "learning_rate": 9.789839573163705e-06, + "loss": 3.8168, + "step": 6455 + }, + { + "epoch": 0.5502429046279724, + "grad_norm": 56.48710460966256, + "learning_rate": 9.789697303085333e-06, + "loss": 3.7604, + "step": 6456 + }, + { + "epoch": 0.5503281343219978, + "grad_norm": 47.7821735356763, + "learning_rate": 9.789554985902192e-06, + "loss": 3.7909, + "step": 6457 + }, + { + "epoch": 0.5504133640160231, + "grad_norm": 67.06462313008203, + "learning_rate": 9.789412621615682e-06, + "loss": 4.3822, + "step": 6458 + }, + { + "epoch": 0.5504985937100486, + "grad_norm": 78.45695471436649, + "learning_rate": 9.7892702102272e-06, + "loss": 3.6842, + "step": 6459 + }, + { + "epoch": 0.550583823404074, + "grad_norm": 41.278573731873266, + "learning_rate": 9.78912775173815e-06, + "loss": 3.58, + "step": 6460 + }, + { + "epoch": 0.5506690530980993, + "grad_norm": 62.697539736305316, + "learning_rate": 9.78898524614993e-06, + "loss": 4.2066, + "step": 6461 + }, + { + "epoch": 0.5507542827921248, + "grad_norm": 61.35171614893387, + "learning_rate": 9.788842693463944e-06, + "loss": 5.4171, + "step": 6462 + }, + { + "epoch": 0.5508395124861502, + "grad_norm": 45.57504480303064, + "learning_rate": 9.788700093681593e-06, + "loss": 4.4768, + "step": 6463 + }, + { + "epoch": 0.5509247421801756, + "grad_norm": 34.74316654376344, + "learning_rate": 9.788557446804281e-06, + "loss": 3.7157, + "step": 6464 + }, + { + "epoch": 0.551009971874201, + "grad_norm": 76.27896441064259, + "learning_rate": 9.788414752833408e-06, + "loss": 3.8921, + "step": 6465 + }, + { + "epoch": 0.5510952015682263, + "grad_norm": 102.76402605292714, + "learning_rate": 9.788272011770377e-06, + "loss": 4.3337, + "step": 6466 + }, + { + "epoch": 0.5511804312622518, + "grad_norm": 95.74626430346457, + "learning_rate": 9.788129223616596e-06, + "loss": 3.5769, + "step": 6467 + }, + { + "epoch": 0.5512656609562772, + "grad_norm": 48.781254871761234, + "learning_rate": 9.787986388373465e-06, + "loss": 4.1297, + "step": 6468 + }, + { + "epoch": 0.5513508906503025, + "grad_norm": 52.9885642778682, + "learning_rate": 9.787843506042391e-06, + "loss": 4.2402, + "step": 6469 + }, + { + "epoch": 0.551436120344328, + "grad_norm": 89.95390097470352, + "learning_rate": 9.78770057662478e-06, + "loss": 4.2648, + "step": 6470 + }, + { + "epoch": 0.5515213500383533, + "grad_norm": 71.50992008476649, + "learning_rate": 9.787557600122033e-06, + "loss": 4.7422, + "step": 6471 + }, + { + "epoch": 0.5516065797323788, + "grad_norm": 66.2292410124138, + "learning_rate": 9.787414576535563e-06, + "loss": 3.62, + "step": 6472 + }, + { + "epoch": 0.5516918094264042, + "grad_norm": 61.09856270678353, + "learning_rate": 9.787271505866771e-06, + "loss": 5.2922, + "step": 6473 + }, + { + "epoch": 0.5517770391204295, + "grad_norm": 66.13850640776278, + "learning_rate": 9.787128388117066e-06, + "loss": 3.0758, + "step": 6474 + }, + { + "epoch": 0.551862268814455, + "grad_norm": 32.084365007159896, + "learning_rate": 9.786985223287856e-06, + "loss": 3.1242, + "step": 6475 + }, + { + "epoch": 0.5519474985084803, + "grad_norm": 119.6212452412415, + "learning_rate": 9.786842011380549e-06, + "loss": 3.7378, + "step": 6476 + }, + { + "epoch": 0.5520327282025057, + "grad_norm": 202.75325757118327, + "learning_rate": 9.78669875239655e-06, + "loss": 6.0153, + "step": 6477 + }, + { + "epoch": 0.5521179578965312, + "grad_norm": 37.38855556321196, + "learning_rate": 9.786555446337273e-06, + "loss": 3.4822, + "step": 6478 + }, + { + "epoch": 0.5522031875905565, + "grad_norm": 44.337159179551904, + "learning_rate": 9.786412093204127e-06, + "loss": 4.2585, + "step": 6479 + }, + { + "epoch": 0.552288417284582, + "grad_norm": 172.43018837040768, + "learning_rate": 9.786268692998517e-06, + "loss": 3.8527, + "step": 6480 + }, + { + "epoch": 0.5523736469786074, + "grad_norm": 65.313337323666, + "learning_rate": 9.786125245721859e-06, + "loss": 3.7046, + "step": 6481 + }, + { + "epoch": 0.5524588766726327, + "grad_norm": 100.46877076864767, + "learning_rate": 9.785981751375559e-06, + "loss": 4.1501, + "step": 6482 + }, + { + "epoch": 0.5525441063666582, + "grad_norm": 49.38032313586483, + "learning_rate": 9.78583820996103e-06, + "loss": 3.7649, + "step": 6483 + }, + { + "epoch": 0.5526293360606835, + "grad_norm": 43.97517525472476, + "learning_rate": 9.785694621479685e-06, + "loss": 3.7225, + "step": 6484 + }, + { + "epoch": 0.5527145657547089, + "grad_norm": 165.53201688009494, + "learning_rate": 9.785550985932935e-06, + "loss": 4.2034, + "step": 6485 + }, + { + "epoch": 0.5527997954487344, + "grad_norm": 73.40397103985127, + "learning_rate": 9.785407303322192e-06, + "loss": 4.4521, + "step": 6486 + }, + { + "epoch": 0.5528850251427597, + "grad_norm": 66.42730370624759, + "learning_rate": 9.78526357364887e-06, + "loss": 4.1213, + "step": 6487 + }, + { + "epoch": 0.5529702548367852, + "grad_norm": 59.91004243691178, + "learning_rate": 9.785119796914383e-06, + "loss": 4.3674, + "step": 6488 + }, + { + "epoch": 0.5530554845308105, + "grad_norm": 37.106204980376965, + "learning_rate": 9.784975973120143e-06, + "loss": 2.7206, + "step": 6489 + }, + { + "epoch": 0.5531407142248359, + "grad_norm": 47.97181176446865, + "learning_rate": 9.784832102267567e-06, + "loss": 3.5262, + "step": 6490 + }, + { + "epoch": 0.5532259439188614, + "grad_norm": 83.75728228677558, + "learning_rate": 9.784688184358068e-06, + "loss": 5.2916, + "step": 6491 + }, + { + "epoch": 0.5533111736128867, + "grad_norm": 66.20473274367045, + "learning_rate": 9.784544219393061e-06, + "loss": 4.2429, + "step": 6492 + }, + { + "epoch": 0.5533964033069121, + "grad_norm": 89.83537311878109, + "learning_rate": 9.784400207373964e-06, + "loss": 5.9662, + "step": 6493 + }, + { + "epoch": 0.5534816330009376, + "grad_norm": 36.80764394814378, + "learning_rate": 9.784256148302191e-06, + "loss": 3.1727, + "step": 6494 + }, + { + "epoch": 0.5535668626949629, + "grad_norm": 68.06858227450861, + "learning_rate": 9.78411204217916e-06, + "loss": 4.7787, + "step": 6495 + }, + { + "epoch": 0.5536520923889883, + "grad_norm": 69.11112003118774, + "learning_rate": 9.783967889006289e-06, + "loss": 4.5995, + "step": 6496 + }, + { + "epoch": 0.5537373220830137, + "grad_norm": 85.90678555572153, + "learning_rate": 9.783823688784994e-06, + "loss": 5.9574, + "step": 6497 + }, + { + "epoch": 0.5538225517770391, + "grad_norm": 49.195008326550905, + "learning_rate": 9.783679441516696e-06, + "loss": 4.5889, + "step": 6498 + }, + { + "epoch": 0.5539077814710646, + "grad_norm": 42.13679363076643, + "learning_rate": 9.78353514720281e-06, + "loss": 3.8003, + "step": 6499 + }, + { + "epoch": 0.5539930111650899, + "grad_norm": 162.0447002229275, + "learning_rate": 9.783390805844755e-06, + "loss": 6.0023, + "step": 6500 + }, + { + "epoch": 0.5540782408591153, + "grad_norm": 48.47363321151972, + "learning_rate": 9.783246417443954e-06, + "loss": 2.9414, + "step": 6501 + }, + { + "epoch": 0.5541634705531407, + "grad_norm": 74.79704390818331, + "learning_rate": 9.783101982001826e-06, + "loss": 4.3719, + "step": 6502 + }, + { + "epoch": 0.5542487002471661, + "grad_norm": 41.278223070370416, + "learning_rate": 9.782957499519788e-06, + "loss": 4.0333, + "step": 6503 + }, + { + "epoch": 0.5543339299411915, + "grad_norm": 58.00881418352002, + "learning_rate": 9.782812969999265e-06, + "loss": 4.037, + "step": 6504 + }, + { + "epoch": 0.5544191596352169, + "grad_norm": 45.70216612698884, + "learning_rate": 9.782668393441677e-06, + "loss": 3.8384, + "step": 6505 + }, + { + "epoch": 0.5545043893292423, + "grad_norm": 60.04692959126674, + "learning_rate": 9.782523769848447e-06, + "loss": 4.6232, + "step": 6506 + }, + { + "epoch": 0.5545896190232678, + "grad_norm": 30.57190829025745, + "learning_rate": 9.782379099220993e-06, + "loss": 3.5148, + "step": 6507 + }, + { + "epoch": 0.5546748487172931, + "grad_norm": 99.89836767385971, + "learning_rate": 9.782234381560744e-06, + "loss": 5.2951, + "step": 6508 + }, + { + "epoch": 0.5547600784113185, + "grad_norm": 34.95656858436132, + "learning_rate": 9.782089616869119e-06, + "loss": 3.0218, + "step": 6509 + }, + { + "epoch": 0.5548453081053439, + "grad_norm": 72.88587192907936, + "learning_rate": 9.781944805147541e-06, + "loss": 3.8465, + "step": 6510 + }, + { + "epoch": 0.5549305377993693, + "grad_norm": 85.38059849241878, + "learning_rate": 9.78179994639744e-06, + "loss": 4.18, + "step": 6511 + }, + { + "epoch": 0.5550157674933947, + "grad_norm": 42.84245984397612, + "learning_rate": 9.781655040620233e-06, + "loss": 3.7707, + "step": 6512 + }, + { + "epoch": 0.5551009971874201, + "grad_norm": 35.05089050562371, + "learning_rate": 9.78151008781735e-06, + "loss": 3.8953, + "step": 6513 + }, + { + "epoch": 0.5551862268814455, + "grad_norm": 42.60875471623382, + "learning_rate": 9.781365087990216e-06, + "loss": 4.6776, + "step": 6514 + }, + { + "epoch": 0.555271456575471, + "grad_norm": 34.82878034608826, + "learning_rate": 9.781220041140255e-06, + "loss": 3.6702, + "step": 6515 + }, + { + "epoch": 0.5553566862694963, + "grad_norm": 95.25066614759614, + "learning_rate": 9.781074947268894e-06, + "loss": 5.5875, + "step": 6516 + }, + { + "epoch": 0.5554419159635217, + "grad_norm": 30.326635718325655, + "learning_rate": 9.780929806377564e-06, + "loss": 4.075, + "step": 6517 + }, + { + "epoch": 0.5555271456575471, + "grad_norm": 100.6723546531567, + "learning_rate": 9.780784618467685e-06, + "loss": 3.8792, + "step": 6518 + }, + { + "epoch": 0.5556123753515725, + "grad_norm": 54.49222477869009, + "learning_rate": 9.780639383540692e-06, + "loss": 4.6681, + "step": 6519 + }, + { + "epoch": 0.5556976050455978, + "grad_norm": 54.174792340962355, + "learning_rate": 9.78049410159801e-06, + "loss": 2.8669, + "step": 6520 + }, + { + "epoch": 0.5557828347396233, + "grad_norm": 44.94506360099072, + "learning_rate": 9.780348772641067e-06, + "loss": 4.194, + "step": 6521 + }, + { + "epoch": 0.5558680644336487, + "grad_norm": 34.506230758545534, + "learning_rate": 9.780203396671294e-06, + "loss": 3.5545, + "step": 6522 + }, + { + "epoch": 0.5559532941276741, + "grad_norm": 54.157383482822674, + "learning_rate": 9.78005797369012e-06, + "loss": 3.5004, + "step": 6523 + }, + { + "epoch": 0.5560385238216995, + "grad_norm": 40.82775659727376, + "learning_rate": 9.779912503698974e-06, + "loss": 3.9291, + "step": 6524 + }, + { + "epoch": 0.5561237535157249, + "grad_norm": 40.45743268361474, + "learning_rate": 9.779766986699291e-06, + "loss": 3.8975, + "step": 6525 + }, + { + "epoch": 0.5562089832097503, + "grad_norm": 60.21380341930261, + "learning_rate": 9.779621422692496e-06, + "loss": 5.2369, + "step": 6526 + }, + { + "epoch": 0.5562942129037757, + "grad_norm": 35.954179733430074, + "learning_rate": 9.779475811680023e-06, + "loss": 4.1966, + "step": 6527 + }, + { + "epoch": 0.556379442597801, + "grad_norm": 67.7531529477257, + "learning_rate": 9.779330153663308e-06, + "loss": 5.1119, + "step": 6528 + }, + { + "epoch": 0.5564646722918265, + "grad_norm": 116.46493681858126, + "learning_rate": 9.779184448643776e-06, + "loss": 5.5231, + "step": 6529 + }, + { + "epoch": 0.5565499019858519, + "grad_norm": 59.82150105136833, + "learning_rate": 9.779038696622869e-06, + "loss": 4.6754, + "step": 6530 + }, + { + "epoch": 0.5566351316798772, + "grad_norm": 45.672821570984965, + "learning_rate": 9.778892897602012e-06, + "loss": 3.7829, + "step": 6531 + }, + { + "epoch": 0.5567203613739027, + "grad_norm": 46.680831790635274, + "learning_rate": 9.778747051582642e-06, + "loss": 3.2406, + "step": 6532 + }, + { + "epoch": 0.556805591067928, + "grad_norm": 58.05415886023439, + "learning_rate": 9.778601158566195e-06, + "loss": 4.1353, + "step": 6533 + }, + { + "epoch": 0.5568908207619535, + "grad_norm": 51.9588540281932, + "learning_rate": 9.778455218554104e-06, + "loss": 4.3063, + "step": 6534 + }, + { + "epoch": 0.5569760504559789, + "grad_norm": 43.31188042842686, + "learning_rate": 9.778309231547805e-06, + "loss": 4.0484, + "step": 6535 + }, + { + "epoch": 0.5570612801500042, + "grad_norm": 39.739935175723375, + "learning_rate": 9.778163197548733e-06, + "loss": 2.8338, + "step": 6536 + }, + { + "epoch": 0.5571465098440297, + "grad_norm": 85.79114343441684, + "learning_rate": 9.778017116558325e-06, + "loss": 5.0299, + "step": 6537 + }, + { + "epoch": 0.557231739538055, + "grad_norm": 36.94806379053543, + "learning_rate": 9.777870988578017e-06, + "loss": 3.9497, + "step": 6538 + }, + { + "epoch": 0.5573169692320804, + "grad_norm": 68.1857821189027, + "learning_rate": 9.777724813609246e-06, + "loss": 4.3365, + "step": 6539 + }, + { + "epoch": 0.5574021989261059, + "grad_norm": 42.49198402356034, + "learning_rate": 9.77757859165345e-06, + "loss": 3.6062, + "step": 6540 + }, + { + "epoch": 0.5574874286201312, + "grad_norm": 44.19126021386254, + "learning_rate": 9.777432322712067e-06, + "loss": 4.2522, + "step": 6541 + }, + { + "epoch": 0.5575726583141567, + "grad_norm": 39.81521442767112, + "learning_rate": 9.777286006786537e-06, + "loss": 3.2973, + "step": 6542 + }, + { + "epoch": 0.5576578880081821, + "grad_norm": 132.78713588239853, + "learning_rate": 9.777139643878295e-06, + "loss": 5.5804, + "step": 6543 + }, + { + "epoch": 0.5577431177022074, + "grad_norm": 37.62072736810609, + "learning_rate": 9.776993233988785e-06, + "loss": 4.1249, + "step": 6544 + }, + { + "epoch": 0.5578283473962329, + "grad_norm": 43.633321017595385, + "learning_rate": 9.776846777119443e-06, + "loss": 3.8093, + "step": 6545 + }, + { + "epoch": 0.5579135770902582, + "grad_norm": 43.01561225603386, + "learning_rate": 9.776700273271712e-06, + "loss": 2.7501, + "step": 6546 + }, + { + "epoch": 0.5579988067842836, + "grad_norm": 58.627023780993916, + "learning_rate": 9.77655372244703e-06, + "loss": 4.7371, + "step": 6547 + }, + { + "epoch": 0.5580840364783091, + "grad_norm": 54.6443414579717, + "learning_rate": 9.776407124646844e-06, + "loss": 4.0505, + "step": 6548 + }, + { + "epoch": 0.5581692661723344, + "grad_norm": 93.60926356618873, + "learning_rate": 9.77626047987259e-06, + "loss": 5.0515, + "step": 6549 + }, + { + "epoch": 0.5582544958663599, + "grad_norm": 373.52926097718677, + "learning_rate": 9.77611378812571e-06, + "loss": 7.22, + "step": 6550 + }, + { + "epoch": 0.5583397255603852, + "grad_norm": 54.59048390202406, + "learning_rate": 9.775967049407651e-06, + "loss": 4.039, + "step": 6551 + }, + { + "epoch": 0.5584249552544106, + "grad_norm": 100.12553377745678, + "learning_rate": 9.775820263719853e-06, + "loss": 5.4055, + "step": 6552 + }, + { + "epoch": 0.5585101849484361, + "grad_norm": 55.53680940667921, + "learning_rate": 9.77567343106376e-06, + "loss": 3.5682, + "step": 6553 + }, + { + "epoch": 0.5585954146424614, + "grad_norm": 30.943864191121126, + "learning_rate": 9.77552655144082e-06, + "loss": 3.1932, + "step": 6554 + }, + { + "epoch": 0.5586806443364868, + "grad_norm": 38.85440743528554, + "learning_rate": 9.77537962485247e-06, + "loss": 3.4274, + "step": 6555 + }, + { + "epoch": 0.5587658740305123, + "grad_norm": 55.72480083474386, + "learning_rate": 9.77523265130016e-06, + "loss": 4.4021, + "step": 6556 + }, + { + "epoch": 0.5588511037245376, + "grad_norm": 43.42302194300998, + "learning_rate": 9.775085630785335e-06, + "loss": 4.3059, + "step": 6557 + }, + { + "epoch": 0.5589363334185631, + "grad_norm": 35.229086108369636, + "learning_rate": 9.77493856330944e-06, + "loss": 2.3678, + "step": 6558 + }, + { + "epoch": 0.5590215631125884, + "grad_norm": 47.88391049268968, + "learning_rate": 9.774791448873921e-06, + "loss": 4.0332, + "step": 6559 + }, + { + "epoch": 0.5591067928066138, + "grad_norm": 133.4451197075272, + "learning_rate": 9.774644287480226e-06, + "loss": 4.9208, + "step": 6560 + }, + { + "epoch": 0.5591920225006393, + "grad_norm": 122.91737344516972, + "learning_rate": 9.7744970791298e-06, + "loss": 3.9832, + "step": 6561 + }, + { + "epoch": 0.5592772521946646, + "grad_norm": 61.58026686365381, + "learning_rate": 9.774349823824097e-06, + "loss": 3.9517, + "step": 6562 + }, + { + "epoch": 0.55936248188869, + "grad_norm": 58.03996444716273, + "learning_rate": 9.774202521564556e-06, + "loss": 4.764, + "step": 6563 + }, + { + "epoch": 0.5594477115827154, + "grad_norm": 59.6633669673502, + "learning_rate": 9.774055172352631e-06, + "loss": 4.5508, + "step": 6564 + }, + { + "epoch": 0.5595329412767408, + "grad_norm": 57.427453521145985, + "learning_rate": 9.773907776189773e-06, + "loss": 4.8453, + "step": 6565 + }, + { + "epoch": 0.5596181709707662, + "grad_norm": 33.750399232262296, + "learning_rate": 9.773760333077426e-06, + "loss": 3.2238, + "step": 6566 + }, + { + "epoch": 0.5597034006647916, + "grad_norm": 84.54568391378383, + "learning_rate": 9.773612843017046e-06, + "loss": 4.9696, + "step": 6567 + }, + { + "epoch": 0.559788630358817, + "grad_norm": 155.1100965691436, + "learning_rate": 9.773465306010078e-06, + "loss": 5.1693, + "step": 6568 + }, + { + "epoch": 0.5598738600528425, + "grad_norm": 44.487279329080124, + "learning_rate": 9.773317722057976e-06, + "loss": 4.3027, + "step": 6569 + }, + { + "epoch": 0.5599590897468678, + "grad_norm": 36.55215517951569, + "learning_rate": 9.773170091162191e-06, + "loss": 4.1526, + "step": 6570 + }, + { + "epoch": 0.5600443194408932, + "grad_norm": 196.44997643373037, + "learning_rate": 9.773022413324177e-06, + "loss": 5.1719, + "step": 6571 + }, + { + "epoch": 0.5601295491349186, + "grad_norm": 53.1938094399945, + "learning_rate": 9.772874688545381e-06, + "loss": 3.4561, + "step": 6572 + }, + { + "epoch": 0.560214778828944, + "grad_norm": 51.65701796743242, + "learning_rate": 9.77272691682726e-06, + "loss": 3.5866, + "step": 6573 + }, + { + "epoch": 0.5603000085229694, + "grad_norm": 52.42584063500239, + "learning_rate": 9.772579098171267e-06, + "loss": 4.3324, + "step": 6574 + }, + { + "epoch": 0.5603852382169948, + "grad_norm": 38.97907535139324, + "learning_rate": 9.772431232578853e-06, + "loss": 3.6144, + "step": 6575 + }, + { + "epoch": 0.5604704679110202, + "grad_norm": 57.71078088255659, + "learning_rate": 9.772283320051477e-06, + "loss": 4.416, + "step": 6576 + }, + { + "epoch": 0.5605556976050456, + "grad_norm": 37.49792116989329, + "learning_rate": 9.772135360590589e-06, + "loss": 2.8447, + "step": 6577 + }, + { + "epoch": 0.560640927299071, + "grad_norm": 30.613229578540395, + "learning_rate": 9.771987354197647e-06, + "loss": 3.7806, + "step": 6578 + }, + { + "epoch": 0.5607261569930964, + "grad_norm": 85.33696933731429, + "learning_rate": 9.771839300874106e-06, + "loss": 4.6612, + "step": 6579 + }, + { + "epoch": 0.5608113866871218, + "grad_norm": 59.69859692160411, + "learning_rate": 9.771691200621421e-06, + "loss": 4.268, + "step": 6580 + }, + { + "epoch": 0.5608966163811472, + "grad_norm": 43.84032131755743, + "learning_rate": 9.771543053441049e-06, + "loss": 3.7419, + "step": 6581 + }, + { + "epoch": 0.5609818460751725, + "grad_norm": 49.04097572382152, + "learning_rate": 9.771394859334447e-06, + "loss": 4.106, + "step": 6582 + }, + { + "epoch": 0.561067075769198, + "grad_norm": 29.809859868771202, + "learning_rate": 9.771246618303072e-06, + "loss": 3.2276, + "step": 6583 + }, + { + "epoch": 0.5611523054632234, + "grad_norm": 58.9928152607603, + "learning_rate": 9.771098330348384e-06, + "loss": 4.6103, + "step": 6584 + }, + { + "epoch": 0.5612375351572488, + "grad_norm": 64.08259913702578, + "learning_rate": 9.770949995471836e-06, + "loss": 4.3678, + "step": 6585 + }, + { + "epoch": 0.5613227648512742, + "grad_norm": 41.13349655447463, + "learning_rate": 9.770801613674893e-06, + "loss": 2.7303, + "step": 6586 + }, + { + "epoch": 0.5614079945452995, + "grad_norm": 44.465320251845384, + "learning_rate": 9.770653184959013e-06, + "loss": 4.4594, + "step": 6587 + }, + { + "epoch": 0.561493224239325, + "grad_norm": 32.29031929548869, + "learning_rate": 9.770504709325654e-06, + "loss": 2.8913, + "step": 6588 + }, + { + "epoch": 0.5615784539333504, + "grad_norm": 107.53906103068958, + "learning_rate": 9.770356186776278e-06, + "loss": 4.5959, + "step": 6589 + }, + { + "epoch": 0.5616636836273757, + "grad_norm": 35.93860719211766, + "learning_rate": 9.770207617312341e-06, + "loss": 3.4824, + "step": 6590 + }, + { + "epoch": 0.5617489133214012, + "grad_norm": 75.01628144794512, + "learning_rate": 9.77005900093531e-06, + "loss": 3.8475, + "step": 6591 + }, + { + "epoch": 0.5618341430154266, + "grad_norm": 72.46726014797031, + "learning_rate": 9.769910337646644e-06, + "loss": 4.6288, + "step": 6592 + }, + { + "epoch": 0.561919372709452, + "grad_norm": 65.76661755654878, + "learning_rate": 9.769761627447805e-06, + "loss": 4.1935, + "step": 6593 + }, + { + "epoch": 0.5620046024034774, + "grad_norm": 115.62968297731618, + "learning_rate": 9.769612870340256e-06, + "loss": 4.2256, + "step": 6594 + }, + { + "epoch": 0.5620898320975027, + "grad_norm": 67.74635498014999, + "learning_rate": 9.76946406632546e-06, + "loss": 4.4781, + "step": 6595 + }, + { + "epoch": 0.5621750617915282, + "grad_norm": 48.16863571761387, + "learning_rate": 9.76931521540488e-06, + "loss": 3.4126, + "step": 6596 + }, + { + "epoch": 0.5622602914855536, + "grad_norm": 70.25722180528179, + "learning_rate": 9.769166317579979e-06, + "loss": 4.273, + "step": 6597 + }, + { + "epoch": 0.5623455211795789, + "grad_norm": 48.993455027569986, + "learning_rate": 9.769017372852223e-06, + "loss": 3.5675, + "step": 6598 + }, + { + "epoch": 0.5624307508736044, + "grad_norm": 90.45534953511056, + "learning_rate": 9.768868381223075e-06, + "loss": 5.3385, + "step": 6599 + }, + { + "epoch": 0.5625159805676297, + "grad_norm": 34.26189343058613, + "learning_rate": 9.768719342694001e-06, + "loss": 3.6929, + "step": 6600 + }, + { + "epoch": 0.5626012102616552, + "grad_norm": 26.54361992798626, + "learning_rate": 9.76857025726647e-06, + "loss": 2.6721, + "step": 6601 + }, + { + "epoch": 0.5626864399556806, + "grad_norm": 68.16726344498205, + "learning_rate": 9.768421124941944e-06, + "loss": 4.2784, + "step": 6602 + }, + { + "epoch": 0.5627716696497059, + "grad_norm": 34.03127497763232, + "learning_rate": 9.768271945721891e-06, + "loss": 3.4873, + "step": 6603 + }, + { + "epoch": 0.5628568993437314, + "grad_norm": 57.43487669899828, + "learning_rate": 9.768122719607778e-06, + "loss": 4.1386, + "step": 6604 + }, + { + "epoch": 0.5629421290377568, + "grad_norm": 82.93072969457857, + "learning_rate": 9.767973446601074e-06, + "loss": 3.9538, + "step": 6605 + }, + { + "epoch": 0.5630273587317821, + "grad_norm": 40.71765468960456, + "learning_rate": 9.767824126703244e-06, + "loss": 4.3211, + "step": 6606 + }, + { + "epoch": 0.5631125884258076, + "grad_norm": 52.03437502735925, + "learning_rate": 9.767674759915758e-06, + "loss": 3.2327, + "step": 6607 + }, + { + "epoch": 0.5631978181198329, + "grad_norm": 50.60788508721443, + "learning_rate": 9.767525346240087e-06, + "loss": 4.4268, + "step": 6608 + }, + { + "epoch": 0.5632830478138583, + "grad_norm": 72.37820955868393, + "learning_rate": 9.767375885677696e-06, + "loss": 4.6036, + "step": 6609 + }, + { + "epoch": 0.5633682775078838, + "grad_norm": 80.89365414800871, + "learning_rate": 9.76722637823006e-06, + "loss": 4.2401, + "step": 6610 + }, + { + "epoch": 0.5634535072019091, + "grad_norm": 54.16212097342948, + "learning_rate": 9.767076823898647e-06, + "loss": 4.4783, + "step": 6611 + }, + { + "epoch": 0.5635387368959346, + "grad_norm": 132.86214557631482, + "learning_rate": 9.766927222684926e-06, + "loss": 4.266, + "step": 6612 + }, + { + "epoch": 0.56362396658996, + "grad_norm": 39.894153309306624, + "learning_rate": 9.766777574590371e-06, + "loss": 3.1428, + "step": 6613 + }, + { + "epoch": 0.5637091962839853, + "grad_norm": 97.8137519323328, + "learning_rate": 9.766627879616453e-06, + "loss": 2.4831, + "step": 6614 + }, + { + "epoch": 0.5637944259780108, + "grad_norm": 80.2402582777211, + "learning_rate": 9.766478137764644e-06, + "loss": 4.603, + "step": 6615 + }, + { + "epoch": 0.5638796556720361, + "grad_norm": 63.65972449915047, + "learning_rate": 9.766328349036414e-06, + "loss": 4.0486, + "step": 6616 + }, + { + "epoch": 0.5639648853660615, + "grad_norm": 67.81740377413651, + "learning_rate": 9.766178513433242e-06, + "loss": 4.9612, + "step": 6617 + }, + { + "epoch": 0.564050115060087, + "grad_norm": 50.91375982203877, + "learning_rate": 9.766028630956597e-06, + "loss": 5.275, + "step": 6618 + }, + { + "epoch": 0.5641353447541123, + "grad_norm": 53.76432279125839, + "learning_rate": 9.765878701607953e-06, + "loss": 4.2205, + "step": 6619 + }, + { + "epoch": 0.5642205744481378, + "grad_norm": 120.250531367847, + "learning_rate": 9.765728725388786e-06, + "loss": 5.294, + "step": 6620 + }, + { + "epoch": 0.5643058041421631, + "grad_norm": 73.24022318265698, + "learning_rate": 9.765578702300572e-06, + "loss": 4.6156, + "step": 6621 + }, + { + "epoch": 0.5643910338361885, + "grad_norm": 48.882498880054285, + "learning_rate": 9.765428632344784e-06, + "loss": 3.9791, + "step": 6622 + }, + { + "epoch": 0.564476263530214, + "grad_norm": 56.43512148997928, + "learning_rate": 9.765278515522901e-06, + "loss": 2.0769, + "step": 6623 + }, + { + "epoch": 0.5645614932242393, + "grad_norm": 174.0669019372921, + "learning_rate": 9.765128351836394e-06, + "loss": 4.796, + "step": 6624 + }, + { + "epoch": 0.5646467229182647, + "grad_norm": 69.49018565252317, + "learning_rate": 9.764978141286746e-06, + "loss": 5.0932, + "step": 6625 + }, + { + "epoch": 0.5647319526122901, + "grad_norm": 61.980071765049246, + "learning_rate": 9.76482788387543e-06, + "loss": 4.3761, + "step": 6626 + }, + { + "epoch": 0.5648171823063155, + "grad_norm": 40.34795069224244, + "learning_rate": 9.764677579603924e-06, + "loss": 3.3978, + "step": 6627 + }, + { + "epoch": 0.564902412000341, + "grad_norm": 70.22478333098807, + "learning_rate": 9.76452722847371e-06, + "loss": 4.7509, + "step": 6628 + }, + { + "epoch": 0.5649876416943663, + "grad_norm": 46.736068493298006, + "learning_rate": 9.764376830486264e-06, + "loss": 3.1559, + "step": 6629 + }, + { + "epoch": 0.5650728713883917, + "grad_norm": 56.68313160744208, + "learning_rate": 9.764226385643062e-06, + "loss": 3.337, + "step": 6630 + }, + { + "epoch": 0.5651581010824172, + "grad_norm": 66.97557748070679, + "learning_rate": 9.76407589394559e-06, + "loss": 4.2703, + "step": 6631 + }, + { + "epoch": 0.5652433307764425, + "grad_norm": 63.891117010512325, + "learning_rate": 9.763925355395323e-06, + "loss": 4.4017, + "step": 6632 + }, + { + "epoch": 0.5653285604704679, + "grad_norm": 58.90327018857547, + "learning_rate": 9.763774769993744e-06, + "loss": 3.8446, + "step": 6633 + }, + { + "epoch": 0.5654137901644933, + "grad_norm": 33.23158184250969, + "learning_rate": 9.763624137742331e-06, + "loss": 3.2575, + "step": 6634 + }, + { + "epoch": 0.5654990198585187, + "grad_norm": 38.04340584525615, + "learning_rate": 9.76347345864257e-06, + "loss": 3.8643, + "step": 6635 + }, + { + "epoch": 0.5655842495525442, + "grad_norm": 29.305145225801645, + "learning_rate": 9.763322732695938e-06, + "loss": 2.7231, + "step": 6636 + }, + { + "epoch": 0.5656694792465695, + "grad_norm": 59.08623245948496, + "learning_rate": 9.763171959903922e-06, + "loss": 4.0458, + "step": 6637 + }, + { + "epoch": 0.5657547089405949, + "grad_norm": 59.25349263802438, + "learning_rate": 9.763021140267999e-06, + "loss": 3.1701, + "step": 6638 + }, + { + "epoch": 0.5658399386346203, + "grad_norm": 81.75091519103611, + "learning_rate": 9.76287027378966e-06, + "loss": 5.794, + "step": 6639 + }, + { + "epoch": 0.5659251683286457, + "grad_norm": 77.28232119408604, + "learning_rate": 9.76271936047038e-06, + "loss": 3.5411, + "step": 6640 + }, + { + "epoch": 0.5660103980226711, + "grad_norm": 38.33956220238498, + "learning_rate": 9.762568400311651e-06, + "loss": 3.3818, + "step": 6641 + }, + { + "epoch": 0.5660956277166965, + "grad_norm": 49.506634881459014, + "learning_rate": 9.762417393314953e-06, + "loss": 3.5119, + "step": 6642 + }, + { + "epoch": 0.5661808574107219, + "grad_norm": 78.18551075352958, + "learning_rate": 9.762266339481774e-06, + "loss": 5.6851, + "step": 6643 + }, + { + "epoch": 0.5662660871047472, + "grad_norm": 420.20147044343787, + "learning_rate": 9.762115238813596e-06, + "loss": 4.6503, + "step": 6644 + }, + { + "epoch": 0.5663513167987727, + "grad_norm": 65.59917094862982, + "learning_rate": 9.761964091311906e-06, + "loss": 4.2429, + "step": 6645 + }, + { + "epoch": 0.5664365464927981, + "grad_norm": 61.69013713324427, + "learning_rate": 9.761812896978194e-06, + "loss": 4.3076, + "step": 6646 + }, + { + "epoch": 0.5665217761868235, + "grad_norm": 96.3727994145819, + "learning_rate": 9.761661655813943e-06, + "loss": 3.7801, + "step": 6647 + }, + { + "epoch": 0.5666070058808489, + "grad_norm": 57.268606813965945, + "learning_rate": 9.76151036782064e-06, + "loss": 4.1688, + "step": 6648 + }, + { + "epoch": 0.5666922355748742, + "grad_norm": 38.08002956630187, + "learning_rate": 9.761359032999777e-06, + "loss": 4.0441, + "step": 6649 + }, + { + "epoch": 0.5667774652688997, + "grad_norm": 29.494044112418973, + "learning_rate": 9.761207651352839e-06, + "loss": 2.488, + "step": 6650 + }, + { + "epoch": 0.5668626949629251, + "grad_norm": 116.14594194637988, + "learning_rate": 9.761056222881318e-06, + "loss": 3.576, + "step": 6651 + }, + { + "epoch": 0.5669479246569504, + "grad_norm": 77.62563628587799, + "learning_rate": 9.760904747586698e-06, + "loss": 5.0385, + "step": 6652 + }, + { + "epoch": 0.5670331543509759, + "grad_norm": 58.547160077702586, + "learning_rate": 9.760753225470472e-06, + "loss": 4.6174, + "step": 6653 + }, + { + "epoch": 0.5671183840450013, + "grad_norm": 52.92450950460365, + "learning_rate": 9.760601656534132e-06, + "loss": 4.4615, + "step": 6654 + }, + { + "epoch": 0.5672036137390267, + "grad_norm": 61.109177624607575, + "learning_rate": 9.760450040779164e-06, + "loss": 3.8822, + "step": 6655 + }, + { + "epoch": 0.5672888434330521, + "grad_norm": 45.7883283192184, + "learning_rate": 9.760298378207062e-06, + "loss": 3.9279, + "step": 6656 + }, + { + "epoch": 0.5673740731270774, + "grad_norm": 75.68096754996157, + "learning_rate": 9.760146668819317e-06, + "loss": 4.6932, + "step": 6657 + }, + { + "epoch": 0.5674593028211029, + "grad_norm": 64.24784136122484, + "learning_rate": 9.759994912617421e-06, + "loss": 4.4669, + "step": 6658 + }, + { + "epoch": 0.5675445325151283, + "grad_norm": 98.0343885298707, + "learning_rate": 9.759843109602867e-06, + "loss": 3.9783, + "step": 6659 + }, + { + "epoch": 0.5676297622091536, + "grad_norm": 42.74518837511286, + "learning_rate": 9.759691259777147e-06, + "loss": 3.7406, + "step": 6660 + }, + { + "epoch": 0.5677149919031791, + "grad_norm": 41.301905576990144, + "learning_rate": 9.759539363141754e-06, + "loss": 4.1196, + "step": 6661 + }, + { + "epoch": 0.5678002215972044, + "grad_norm": 36.8771228981439, + "learning_rate": 9.759387419698182e-06, + "loss": 3.1838, + "step": 6662 + }, + { + "epoch": 0.5678854512912299, + "grad_norm": 46.041381666804234, + "learning_rate": 9.759235429447929e-06, + "loss": 3.9805, + "step": 6663 + }, + { + "epoch": 0.5679706809852553, + "grad_norm": 49.079879348321256, + "learning_rate": 9.759083392392485e-06, + "loss": 3.4118, + "step": 6664 + }, + { + "epoch": 0.5680559106792806, + "grad_norm": 82.82823236180475, + "learning_rate": 9.758931308533346e-06, + "loss": 3.7703, + "step": 6665 + }, + { + "epoch": 0.5681411403733061, + "grad_norm": 57.86937702223851, + "learning_rate": 9.75877917787201e-06, + "loss": 4.0245, + "step": 6666 + }, + { + "epoch": 0.5682263700673315, + "grad_norm": 49.938690703369716, + "learning_rate": 9.75862700040997e-06, + "loss": 3.1151, + "step": 6667 + }, + { + "epoch": 0.5683115997613568, + "grad_norm": 135.70022423762188, + "learning_rate": 9.758474776148726e-06, + "loss": 4.9301, + "step": 6668 + }, + { + "epoch": 0.5683968294553823, + "grad_norm": 75.07884142085935, + "learning_rate": 9.758322505089773e-06, + "loss": 5.1803, + "step": 6669 + }, + { + "epoch": 0.5684820591494076, + "grad_norm": 41.53879888368953, + "learning_rate": 9.758170187234609e-06, + "loss": 3.4021, + "step": 6670 + }, + { + "epoch": 0.5685672888434331, + "grad_norm": 55.17186336317466, + "learning_rate": 9.758017822584732e-06, + "loss": 3.5683, + "step": 6671 + }, + { + "epoch": 0.5686525185374585, + "grad_norm": 63.996703523208126, + "learning_rate": 9.75786541114164e-06, + "loss": 3.9541, + "step": 6672 + }, + { + "epoch": 0.5687377482314838, + "grad_norm": 60.55001532179251, + "learning_rate": 9.757712952906832e-06, + "loss": 4.7864, + "step": 6673 + }, + { + "epoch": 0.5688229779255093, + "grad_norm": 62.548441453422114, + "learning_rate": 9.757560447881808e-06, + "loss": 4.0954, + "step": 6674 + }, + { + "epoch": 0.5689082076195346, + "grad_norm": 56.703295170918864, + "learning_rate": 9.757407896068067e-06, + "loss": 4.5401, + "step": 6675 + }, + { + "epoch": 0.56899343731356, + "grad_norm": 150.1599864580711, + "learning_rate": 9.757255297467111e-06, + "loss": 6.0642, + "step": 6676 + }, + { + "epoch": 0.5690786670075855, + "grad_norm": 52.111725266505864, + "learning_rate": 9.75710265208044e-06, + "loss": 4.4998, + "step": 6677 + }, + { + "epoch": 0.5691638967016108, + "grad_norm": 33.45165907983639, + "learning_rate": 9.756949959909553e-06, + "loss": 2.6656, + "step": 6678 + }, + { + "epoch": 0.5692491263956362, + "grad_norm": 22.71454691683189, + "learning_rate": 9.756797220955954e-06, + "loss": 2.6245, + "step": 6679 + }, + { + "epoch": 0.5693343560896617, + "grad_norm": 138.73151481990394, + "learning_rate": 9.756644435221144e-06, + "loss": 4.8427, + "step": 6680 + }, + { + "epoch": 0.569419585783687, + "grad_norm": 34.423470820443164, + "learning_rate": 9.756491602706627e-06, + "loss": 3.2091, + "step": 6681 + }, + { + "epoch": 0.5695048154777125, + "grad_norm": 60.18274786053218, + "learning_rate": 9.756338723413904e-06, + "loss": 5.3614, + "step": 6682 + }, + { + "epoch": 0.5695900451717378, + "grad_norm": 84.8019445307255, + "learning_rate": 9.756185797344479e-06, + "loss": 5.2053, + "step": 6683 + }, + { + "epoch": 0.5696752748657632, + "grad_norm": 56.90493582420987, + "learning_rate": 9.756032824499859e-06, + "loss": 4.4294, + "step": 6684 + }, + { + "epoch": 0.5697605045597887, + "grad_norm": 37.271277466123934, + "learning_rate": 9.755879804881545e-06, + "loss": 2.4634, + "step": 6685 + }, + { + "epoch": 0.569845734253814, + "grad_norm": 62.02724993188206, + "learning_rate": 9.755726738491042e-06, + "loss": 3.6678, + "step": 6686 + }, + { + "epoch": 0.5699309639478394, + "grad_norm": 53.190387010097275, + "learning_rate": 9.755573625329857e-06, + "loss": 3.8917, + "step": 6687 + }, + { + "epoch": 0.5700161936418648, + "grad_norm": 86.26657698319988, + "learning_rate": 9.755420465399496e-06, + "loss": 4.4237, + "step": 6688 + }, + { + "epoch": 0.5701014233358902, + "grad_norm": 50.963863729766146, + "learning_rate": 9.755267258701463e-06, + "loss": 3.5962, + "step": 6689 + }, + { + "epoch": 0.5701866530299157, + "grad_norm": 189.8360450251818, + "learning_rate": 9.755114005237267e-06, + "loss": 3.4494, + "step": 6690 + }, + { + "epoch": 0.570271882723941, + "grad_norm": 61.7990126568831, + "learning_rate": 9.754960705008413e-06, + "loss": 3.7013, + "step": 6691 + }, + { + "epoch": 0.5703571124179664, + "grad_norm": 36.18950902681479, + "learning_rate": 9.754807358016411e-06, + "loss": 3.5572, + "step": 6692 + }, + { + "epoch": 0.5704423421119919, + "grad_norm": 62.41482220506783, + "learning_rate": 9.754653964262768e-06, + "loss": 4.6326, + "step": 6693 + }, + { + "epoch": 0.5705275718060172, + "grad_norm": 60.5127221102314, + "learning_rate": 9.754500523748992e-06, + "loss": 4.3061, + "step": 6694 + }, + { + "epoch": 0.5706128015000426, + "grad_norm": 68.21710229718012, + "learning_rate": 9.754347036476592e-06, + "loss": 4.3534, + "step": 6695 + }, + { + "epoch": 0.570698031194068, + "grad_norm": 70.00722721148112, + "learning_rate": 9.754193502447081e-06, + "loss": 4.7402, + "step": 6696 + }, + { + "epoch": 0.5707832608880934, + "grad_norm": 113.14018706878495, + "learning_rate": 9.754039921661963e-06, + "loss": 5.8429, + "step": 6697 + }, + { + "epoch": 0.5708684905821189, + "grad_norm": 52.853017645204275, + "learning_rate": 9.753886294122752e-06, + "loss": 3.9106, + "step": 6698 + }, + { + "epoch": 0.5709537202761442, + "grad_norm": 72.5523147043565, + "learning_rate": 9.753732619830959e-06, + "loss": 4.3072, + "step": 6699 + }, + { + "epoch": 0.5710389499701696, + "grad_norm": 70.13360432524925, + "learning_rate": 9.753578898788094e-06, + "loss": 4.7091, + "step": 6700 + }, + { + "epoch": 0.571124179664195, + "grad_norm": 115.80281049887792, + "learning_rate": 9.75342513099567e-06, + "loss": 4.0881, + "step": 6701 + }, + { + "epoch": 0.5712094093582204, + "grad_norm": 77.79434948808272, + "learning_rate": 9.753271316455198e-06, + "loss": 3.9123, + "step": 6702 + }, + { + "epoch": 0.5712946390522458, + "grad_norm": 36.83767106771119, + "learning_rate": 9.753117455168192e-06, + "loss": 3.1229, + "step": 6703 + }, + { + "epoch": 0.5713798687462712, + "grad_norm": 79.25224919753195, + "learning_rate": 9.752963547136165e-06, + "loss": 4.3988, + "step": 6704 + }, + { + "epoch": 0.5714650984402966, + "grad_norm": 37.52753967650383, + "learning_rate": 9.752809592360629e-06, + "loss": 3.7231, + "step": 6705 + }, + { + "epoch": 0.571550328134322, + "grad_norm": 74.30592197622155, + "learning_rate": 9.7526555908431e-06, + "loss": 3.5103, + "step": 6706 + }, + { + "epoch": 0.5716355578283474, + "grad_norm": 48.39358783192619, + "learning_rate": 9.75250154258509e-06, + "loss": 4.3355, + "step": 6707 + }, + { + "epoch": 0.5717207875223728, + "grad_norm": 128.85167415477468, + "learning_rate": 9.75234744758812e-06, + "loss": 4.5231, + "step": 6708 + }, + { + "epoch": 0.5718060172163982, + "grad_norm": 62.57921501462144, + "learning_rate": 9.752193305853696e-06, + "loss": 4.6914, + "step": 6709 + }, + { + "epoch": 0.5718912469104236, + "grad_norm": 98.79145133293126, + "learning_rate": 9.752039117383343e-06, + "loss": 4.0982, + "step": 6710 + }, + { + "epoch": 0.571976476604449, + "grad_norm": 82.27786714665632, + "learning_rate": 9.751884882178572e-06, + "loss": 4.4158, + "step": 6711 + }, + { + "epoch": 0.5720617062984744, + "grad_norm": 48.260610436065726, + "learning_rate": 9.751730600240903e-06, + "loss": 3.6672, + "step": 6712 + }, + { + "epoch": 0.5721469359924998, + "grad_norm": 44.46472974125675, + "learning_rate": 9.75157627157185e-06, + "loss": 3.7397, + "step": 6713 + }, + { + "epoch": 0.5722321656865252, + "grad_norm": 46.52975662390165, + "learning_rate": 9.751421896172932e-06, + "loss": 4.3691, + "step": 6714 + }, + { + "epoch": 0.5723173953805506, + "grad_norm": 68.72056500962155, + "learning_rate": 9.751267474045669e-06, + "loss": 5.4267, + "step": 6715 + }, + { + "epoch": 0.572402625074576, + "grad_norm": 31.506260927476283, + "learning_rate": 9.751113005191578e-06, + "loss": 3.8886, + "step": 6716 + }, + { + "epoch": 0.5724878547686014, + "grad_norm": 63.712356484564545, + "learning_rate": 9.75095848961218e-06, + "loss": 4.3616, + "step": 6717 + }, + { + "epoch": 0.5725730844626268, + "grad_norm": 64.65900285475516, + "learning_rate": 9.750803927308989e-06, + "loss": 4.5578, + "step": 6718 + }, + { + "epoch": 0.5726583141566521, + "grad_norm": 40.729599938567524, + "learning_rate": 9.750649318283533e-06, + "loss": 4.9233, + "step": 6719 + }, + { + "epoch": 0.5727435438506776, + "grad_norm": 62.45113416234719, + "learning_rate": 9.750494662537326e-06, + "loss": 4.2222, + "step": 6720 + }, + { + "epoch": 0.572828773544703, + "grad_norm": 42.99337404496386, + "learning_rate": 9.750339960071893e-06, + "loss": 3.3177, + "step": 6721 + }, + { + "epoch": 0.5729140032387283, + "grad_norm": 34.21822499575695, + "learning_rate": 9.750185210888753e-06, + "loss": 3.2183, + "step": 6722 + }, + { + "epoch": 0.5729992329327538, + "grad_norm": 107.27866834341185, + "learning_rate": 9.750030414989432e-06, + "loss": 4.8919, + "step": 6723 + }, + { + "epoch": 0.5730844626267791, + "grad_norm": 57.717950815789095, + "learning_rate": 9.749875572375446e-06, + "loss": 4.3889, + "step": 6724 + }, + { + "epoch": 0.5731696923208046, + "grad_norm": 43.49459417397251, + "learning_rate": 9.749720683048323e-06, + "loss": 3.9406, + "step": 6725 + }, + { + "epoch": 0.57325492201483, + "grad_norm": 78.29491063691925, + "learning_rate": 9.749565747009584e-06, + "loss": 3.4868, + "step": 6726 + }, + { + "epoch": 0.5733401517088553, + "grad_norm": 94.59601915143669, + "learning_rate": 9.749410764260753e-06, + "loss": 3.9684, + "step": 6727 + }, + { + "epoch": 0.5734253814028808, + "grad_norm": 80.43200101513187, + "learning_rate": 9.749255734803355e-06, + "loss": 4.4927, + "step": 6728 + }, + { + "epoch": 0.5735106110969062, + "grad_norm": 37.504835981298356, + "learning_rate": 9.749100658638914e-06, + "loss": 3.9094, + "step": 6729 + }, + { + "epoch": 0.5735958407909315, + "grad_norm": 42.2912369843689, + "learning_rate": 9.748945535768955e-06, + "loss": 4.302, + "step": 6730 + }, + { + "epoch": 0.573681070484957, + "grad_norm": 50.7814081067717, + "learning_rate": 9.748790366195003e-06, + "loss": 4.7341, + "step": 6731 + }, + { + "epoch": 0.5737663001789823, + "grad_norm": 50.76015561141913, + "learning_rate": 9.748635149918587e-06, + "loss": 4.5801, + "step": 6732 + }, + { + "epoch": 0.5738515298730078, + "grad_norm": 57.40571542729291, + "learning_rate": 9.748479886941227e-06, + "loss": 3.1554, + "step": 6733 + }, + { + "epoch": 0.5739367595670332, + "grad_norm": 75.25810180402809, + "learning_rate": 9.748324577264456e-06, + "loss": 4.4485, + "step": 6734 + }, + { + "epoch": 0.5740219892610585, + "grad_norm": 94.82843556401163, + "learning_rate": 9.7481692208898e-06, + "loss": 4.3528, + "step": 6735 + }, + { + "epoch": 0.574107218955084, + "grad_norm": 148.5267759303818, + "learning_rate": 9.748013817818787e-06, + "loss": 4.7885, + "step": 6736 + }, + { + "epoch": 0.5741924486491093, + "grad_norm": 53.73766383831068, + "learning_rate": 9.747858368052945e-06, + "loss": 3.503, + "step": 6737 + }, + { + "epoch": 0.5742776783431347, + "grad_norm": 29.48951746442083, + "learning_rate": 9.747702871593803e-06, + "loss": 2.3843, + "step": 6738 + }, + { + "epoch": 0.5743629080371602, + "grad_norm": 96.08280931660012, + "learning_rate": 9.747547328442888e-06, + "loss": 4.7948, + "step": 6739 + }, + { + "epoch": 0.5744481377311855, + "grad_norm": 72.53274883978114, + "learning_rate": 9.747391738601732e-06, + "loss": 3.4806, + "step": 6740 + }, + { + "epoch": 0.574533367425211, + "grad_norm": 54.1635674368636, + "learning_rate": 9.747236102071863e-06, + "loss": 4.4779, + "step": 6741 + }, + { + "epoch": 0.5746185971192364, + "grad_norm": 104.97980770231642, + "learning_rate": 9.747080418854818e-06, + "loss": 4.6077, + "step": 6742 + }, + { + "epoch": 0.5747038268132617, + "grad_norm": 43.51959902617295, + "learning_rate": 9.746924688952119e-06, + "loss": 2.541, + "step": 6743 + }, + { + "epoch": 0.5747890565072872, + "grad_norm": 29.134742195658376, + "learning_rate": 9.746768912365304e-06, + "loss": 4.0015, + "step": 6744 + }, + { + "epoch": 0.5748742862013125, + "grad_norm": 69.21916615779887, + "learning_rate": 9.746613089095902e-06, + "loss": 4.091, + "step": 6745 + }, + { + "epoch": 0.5749595158953379, + "grad_norm": 62.45843961148835, + "learning_rate": 9.746457219145448e-06, + "loss": 4.6806, + "step": 6746 + }, + { + "epoch": 0.5750447455893634, + "grad_norm": 30.08976279513013, + "learning_rate": 9.746301302515472e-06, + "loss": 2.5497, + "step": 6747 + }, + { + "epoch": 0.5751299752833887, + "grad_norm": 35.65023179222891, + "learning_rate": 9.74614533920751e-06, + "loss": 3.7337, + "step": 6748 + }, + { + "epoch": 0.5752152049774142, + "grad_norm": 45.06237809580878, + "learning_rate": 9.745989329223092e-06, + "loss": 3.2852, + "step": 6749 + }, + { + "epoch": 0.5753004346714395, + "grad_norm": 54.448168911710184, + "learning_rate": 9.745833272563757e-06, + "loss": 3.8049, + "step": 6750 + }, + { + "epoch": 0.5753856643654649, + "grad_norm": 36.691004457525665, + "learning_rate": 9.745677169231036e-06, + "loss": 3.8014, + "step": 6751 + }, + { + "epoch": 0.5754708940594904, + "grad_norm": 134.6917802531193, + "learning_rate": 9.745521019226466e-06, + "loss": 5.2229, + "step": 6752 + }, + { + "epoch": 0.5755561237535157, + "grad_norm": 48.98949574081355, + "learning_rate": 9.745364822551582e-06, + "loss": 2.8879, + "step": 6753 + }, + { + "epoch": 0.5756413534475411, + "grad_norm": 65.17353390578974, + "learning_rate": 9.745208579207924e-06, + "loss": 4.8793, + "step": 6754 + }, + { + "epoch": 0.5757265831415666, + "grad_norm": 41.77101581831975, + "learning_rate": 9.74505228919702e-06, + "loss": 4.2781, + "step": 6755 + }, + { + "epoch": 0.5758118128355919, + "grad_norm": 44.10884006952758, + "learning_rate": 9.744895952520416e-06, + "loss": 4.2126, + "step": 6756 + }, + { + "epoch": 0.5758970425296173, + "grad_norm": 79.91928849865018, + "learning_rate": 9.744739569179645e-06, + "loss": 4.1179, + "step": 6757 + }, + { + "epoch": 0.5759822722236427, + "grad_norm": 73.39843439124684, + "learning_rate": 9.744583139176246e-06, + "loss": 4.7421, + "step": 6758 + }, + { + "epoch": 0.5760675019176681, + "grad_norm": 43.97234844584379, + "learning_rate": 9.744426662511757e-06, + "loss": 4.3764, + "step": 6759 + }, + { + "epoch": 0.5761527316116936, + "grad_norm": 35.90160974826978, + "learning_rate": 9.744270139187715e-06, + "loss": 3.0113, + "step": 6760 + }, + { + "epoch": 0.5762379613057189, + "grad_norm": 31.865742572558062, + "learning_rate": 9.744113569205664e-06, + "loss": 2.7981, + "step": 6761 + }, + { + "epoch": 0.5763231909997443, + "grad_norm": 37.83185406838484, + "learning_rate": 9.74395695256714e-06, + "loss": 3.2093, + "step": 6762 + }, + { + "epoch": 0.5764084206937697, + "grad_norm": 38.031545949443164, + "learning_rate": 9.743800289273684e-06, + "loss": 3.7549, + "step": 6763 + }, + { + "epoch": 0.5764936503877951, + "grad_norm": 54.5532685822997, + "learning_rate": 9.743643579326838e-06, + "loss": 3.7162, + "step": 6764 + }, + { + "epoch": 0.5765788800818205, + "grad_norm": 42.97702179705242, + "learning_rate": 9.743486822728142e-06, + "loss": 4.48, + "step": 6765 + }, + { + "epoch": 0.5766641097758459, + "grad_norm": 46.32894726716102, + "learning_rate": 9.743330019479138e-06, + "loss": 4.203, + "step": 6766 + }, + { + "epoch": 0.5767493394698713, + "grad_norm": 68.19868656809527, + "learning_rate": 9.743173169581368e-06, + "loss": 4.6124, + "step": 6767 + }, + { + "epoch": 0.5768345691638967, + "grad_norm": 41.94841236546382, + "learning_rate": 9.743016273036375e-06, + "loss": 3.3995, + "step": 6768 + }, + { + "epoch": 0.5769197988579221, + "grad_norm": 48.49544625547273, + "learning_rate": 9.742859329845703e-06, + "loss": 4.3452, + "step": 6769 + }, + { + "epoch": 0.5770050285519475, + "grad_norm": 93.30462991175139, + "learning_rate": 9.742702340010892e-06, + "loss": 4.109, + "step": 6770 + }, + { + "epoch": 0.5770902582459729, + "grad_norm": 73.63355350992039, + "learning_rate": 9.742545303533488e-06, + "loss": 3.2313, + "step": 6771 + }, + { + "epoch": 0.5771754879399983, + "grad_norm": 49.3087243923684, + "learning_rate": 9.742388220415036e-06, + "loss": 4.145, + "step": 6772 + }, + { + "epoch": 0.5772607176340236, + "grad_norm": 65.68243258689522, + "learning_rate": 9.742231090657079e-06, + "loss": 3.9872, + "step": 6773 + }, + { + "epoch": 0.5773459473280491, + "grad_norm": 59.88546617754959, + "learning_rate": 9.742073914261165e-06, + "loss": 4.4874, + "step": 6774 + }, + { + "epoch": 0.5774311770220745, + "grad_norm": 43.67250626711286, + "learning_rate": 9.741916691228839e-06, + "loss": 3.3787, + "step": 6775 + }, + { + "epoch": 0.5775164067160999, + "grad_norm": 42.795197791904116, + "learning_rate": 9.741759421561647e-06, + "loss": 2.8956, + "step": 6776 + }, + { + "epoch": 0.5776016364101253, + "grad_norm": 38.300606530658, + "learning_rate": 9.741602105261134e-06, + "loss": 4.1845, + "step": 6777 + }, + { + "epoch": 0.5776868661041507, + "grad_norm": 170.0371203185057, + "learning_rate": 9.741444742328846e-06, + "loss": 3.5872, + "step": 6778 + }, + { + "epoch": 0.5777720957981761, + "grad_norm": 54.0875798408164, + "learning_rate": 9.741287332766338e-06, + "loss": 4.2337, + "step": 6779 + }, + { + "epoch": 0.5778573254922015, + "grad_norm": 63.39981003941458, + "learning_rate": 9.74112987657515e-06, + "loss": 4.4203, + "step": 6780 + }, + { + "epoch": 0.5779425551862268, + "grad_norm": 43.71634112374161, + "learning_rate": 9.740972373756834e-06, + "loss": 3.893, + "step": 6781 + }, + { + "epoch": 0.5780277848802523, + "grad_norm": 50.7450037273286, + "learning_rate": 9.740814824312938e-06, + "loss": 3.9953, + "step": 6782 + }, + { + "epoch": 0.5781130145742777, + "grad_norm": 32.4582197310228, + "learning_rate": 9.740657228245014e-06, + "loss": 3.66, + "step": 6783 + }, + { + "epoch": 0.5781982442683031, + "grad_norm": 51.463120780280384, + "learning_rate": 9.74049958555461e-06, + "loss": 4.5002, + "step": 6784 + }, + { + "epoch": 0.5782834739623285, + "grad_norm": 50.215007354682896, + "learning_rate": 9.740341896243274e-06, + "loss": 3.9336, + "step": 6785 + }, + { + "epoch": 0.5783687036563538, + "grad_norm": 33.656221335354594, + "learning_rate": 9.74018416031256e-06, + "loss": 3.1147, + "step": 6786 + }, + { + "epoch": 0.5784539333503793, + "grad_norm": 46.70118132285706, + "learning_rate": 9.740026377764019e-06, + "loss": 4.2996, + "step": 6787 + }, + { + "epoch": 0.5785391630444047, + "grad_norm": 42.897467017026216, + "learning_rate": 9.7398685485992e-06, + "loss": 4.3422, + "step": 6788 + }, + { + "epoch": 0.57862439273843, + "grad_norm": 53.96103869670818, + "learning_rate": 9.73971067281966e-06, + "loss": 4.3092, + "step": 6789 + }, + { + "epoch": 0.5787096224324555, + "grad_norm": 70.4003424292137, + "learning_rate": 9.739552750426948e-06, + "loss": 4.2505, + "step": 6790 + }, + { + "epoch": 0.5787948521264809, + "grad_norm": 105.39810786475275, + "learning_rate": 9.739394781422618e-06, + "loss": 4.4855, + "step": 6791 + }, + { + "epoch": 0.5788800818205062, + "grad_norm": 59.524466895981405, + "learning_rate": 9.739236765808224e-06, + "loss": 4.5555, + "step": 6792 + }, + { + "epoch": 0.5789653115145317, + "grad_norm": 46.41751843156265, + "learning_rate": 9.739078703585318e-06, + "loss": 4.1571, + "step": 6793 + }, + { + "epoch": 0.579050541208557, + "grad_norm": 71.06323641177455, + "learning_rate": 9.738920594755457e-06, + "loss": 4.5096, + "step": 6794 + }, + { + "epoch": 0.5791357709025825, + "grad_norm": 39.750503243612314, + "learning_rate": 9.738762439320196e-06, + "loss": 4.3756, + "step": 6795 + }, + { + "epoch": 0.5792210005966079, + "grad_norm": 58.368298077809115, + "learning_rate": 9.738604237281087e-06, + "loss": 5.1629, + "step": 6796 + }, + { + "epoch": 0.5793062302906332, + "grad_norm": 42.38973948606111, + "learning_rate": 9.73844598863969e-06, + "loss": 3.4787, + "step": 6797 + }, + { + "epoch": 0.5793914599846587, + "grad_norm": 45.82971780340642, + "learning_rate": 9.73828769339756e-06, + "loss": 3.5316, + "step": 6798 + }, + { + "epoch": 0.579476689678684, + "grad_norm": 37.78133023279401, + "learning_rate": 9.738129351556251e-06, + "loss": 3.3167, + "step": 6799 + }, + { + "epoch": 0.5795619193727094, + "grad_norm": 33.16032503691611, + "learning_rate": 9.737970963117325e-06, + "loss": 3.4038, + "step": 6800 + }, + { + "epoch": 0.5796471490667349, + "grad_norm": 77.35563335003314, + "learning_rate": 9.737812528082337e-06, + "loss": 5.0404, + "step": 6801 + }, + { + "epoch": 0.5797323787607602, + "grad_norm": 222.0540983916454, + "learning_rate": 9.737654046452844e-06, + "loss": 4.3299, + "step": 6802 + }, + { + "epoch": 0.5798176084547857, + "grad_norm": 103.56707356728417, + "learning_rate": 9.737495518230406e-06, + "loss": 4.8119, + "step": 6803 + }, + { + "epoch": 0.579902838148811, + "grad_norm": 42.957855820152226, + "learning_rate": 9.737336943416585e-06, + "loss": 3.547, + "step": 6804 + }, + { + "epoch": 0.5799880678428364, + "grad_norm": 42.871179563453985, + "learning_rate": 9.737178322012935e-06, + "loss": 3.5416, + "step": 6805 + }, + { + "epoch": 0.5800732975368619, + "grad_norm": 106.85487907493375, + "learning_rate": 9.737019654021019e-06, + "loss": 4.5644, + "step": 6806 + }, + { + "epoch": 0.5801585272308872, + "grad_norm": 42.98333899848532, + "learning_rate": 9.736860939442396e-06, + "loss": 4.5636, + "step": 6807 + }, + { + "epoch": 0.5802437569249126, + "grad_norm": 51.93168699924286, + "learning_rate": 9.73670217827863e-06, + "loss": 4.0725, + "step": 6808 + }, + { + "epoch": 0.5803289866189381, + "grad_norm": 45.78310260879225, + "learning_rate": 9.73654337053128e-06, + "loss": 4.3049, + "step": 6809 + }, + { + "epoch": 0.5804142163129634, + "grad_norm": 63.386685360623716, + "learning_rate": 9.736384516201907e-06, + "loss": 3.7818, + "step": 6810 + }, + { + "epoch": 0.5804994460069889, + "grad_norm": 40.31665112367906, + "learning_rate": 9.736225615292073e-06, + "loss": 3.7095, + "step": 6811 + }, + { + "epoch": 0.5805846757010142, + "grad_norm": 43.79990720116221, + "learning_rate": 9.736066667803346e-06, + "loss": 3.749, + "step": 6812 + }, + { + "epoch": 0.5806699053950396, + "grad_norm": 35.10250889778951, + "learning_rate": 9.73590767373728e-06, + "loss": 3.1655, + "step": 6813 + }, + { + "epoch": 0.5807551350890651, + "grad_norm": 94.34017378742607, + "learning_rate": 9.735748633095449e-06, + "loss": 4.4602, + "step": 6814 + }, + { + "epoch": 0.5808403647830904, + "grad_norm": 162.9402731832269, + "learning_rate": 9.73558954587941e-06, + "loss": 5.0461, + "step": 6815 + }, + { + "epoch": 0.5809255944771158, + "grad_norm": 54.507568869788095, + "learning_rate": 9.73543041209073e-06, + "loss": 3.3809, + "step": 6816 + }, + { + "epoch": 0.5810108241711412, + "grad_norm": 80.38199313847639, + "learning_rate": 9.735271231730971e-06, + "loss": 3.5513, + "step": 6817 + }, + { + "epoch": 0.5810960538651666, + "grad_norm": 111.26858484814225, + "learning_rate": 9.735112004801703e-06, + "loss": 5.0366, + "step": 6818 + }, + { + "epoch": 0.5811812835591921, + "grad_norm": 49.83465550557853, + "learning_rate": 9.734952731304492e-06, + "loss": 3.2262, + "step": 6819 + }, + { + "epoch": 0.5812665132532174, + "grad_norm": 48.46548979059027, + "learning_rate": 9.734793411240899e-06, + "loss": 4.6995, + "step": 6820 + }, + { + "epoch": 0.5813517429472428, + "grad_norm": 112.82815245777157, + "learning_rate": 9.734634044612496e-06, + "loss": 5.0697, + "step": 6821 + }, + { + "epoch": 0.5814369726412683, + "grad_norm": 78.36027833718555, + "learning_rate": 9.734474631420848e-06, + "loss": 4.1292, + "step": 6822 + }, + { + "epoch": 0.5815222023352936, + "grad_norm": 77.37522362524668, + "learning_rate": 9.734315171667524e-06, + "loss": 4.1764, + "step": 6823 + }, + { + "epoch": 0.581607432029319, + "grad_norm": 36.85148336191906, + "learning_rate": 9.73415566535409e-06, + "loss": 3.8514, + "step": 6824 + }, + { + "epoch": 0.5816926617233444, + "grad_norm": 59.3838288156878, + "learning_rate": 9.733996112482117e-06, + "loss": 4.3718, + "step": 6825 + }, + { + "epoch": 0.5817778914173698, + "grad_norm": 61.12859572018884, + "learning_rate": 9.733836513053174e-06, + "loss": 4.6885, + "step": 6826 + }, + { + "epoch": 0.5818631211113953, + "grad_norm": 77.79501739345119, + "learning_rate": 9.733676867068831e-06, + "loss": 4.1371, + "step": 6827 + }, + { + "epoch": 0.5819483508054206, + "grad_norm": 56.31942241859684, + "learning_rate": 9.733517174530655e-06, + "loss": 4.6778, + "step": 6828 + }, + { + "epoch": 0.582033580499446, + "grad_norm": 46.798704383513154, + "learning_rate": 9.733357435440219e-06, + "loss": 3.972, + "step": 6829 + }, + { + "epoch": 0.5821188101934714, + "grad_norm": 46.54221046174129, + "learning_rate": 9.733197649799093e-06, + "loss": 3.1164, + "step": 6830 + }, + { + "epoch": 0.5822040398874968, + "grad_norm": 138.50317631361892, + "learning_rate": 9.73303781760885e-06, + "loss": 4.5357, + "step": 6831 + }, + { + "epoch": 0.5822892695815222, + "grad_norm": 51.98873093106348, + "learning_rate": 9.73287793887106e-06, + "loss": 3.8226, + "step": 6832 + }, + { + "epoch": 0.5823744992755476, + "grad_norm": 65.60908781467218, + "learning_rate": 9.732718013587296e-06, + "loss": 4.5948, + "step": 6833 + }, + { + "epoch": 0.582459728969573, + "grad_norm": 54.360078344796754, + "learning_rate": 9.73255804175913e-06, + "loss": 3.989, + "step": 6834 + }, + { + "epoch": 0.5825449586635983, + "grad_norm": 43.23486132671201, + "learning_rate": 9.732398023388138e-06, + "loss": 4.1826, + "step": 6835 + }, + { + "epoch": 0.5826301883576238, + "grad_norm": 74.86889384599202, + "learning_rate": 9.732237958475892e-06, + "loss": 4.2736, + "step": 6836 + }, + { + "epoch": 0.5827154180516492, + "grad_norm": 44.25030441961256, + "learning_rate": 9.732077847023964e-06, + "loss": 3.9099, + "step": 6837 + }, + { + "epoch": 0.5828006477456746, + "grad_norm": 39.83561264862887, + "learning_rate": 9.731917689033932e-06, + "loss": 3.246, + "step": 6838 + }, + { + "epoch": 0.5828858774397, + "grad_norm": 79.4921049691147, + "learning_rate": 9.731757484507372e-06, + "loss": 5.6318, + "step": 6839 + }, + { + "epoch": 0.5829711071337254, + "grad_norm": 57.13768520692216, + "learning_rate": 9.731597233445855e-06, + "loss": 4.0807, + "step": 6840 + }, + { + "epoch": 0.5830563368277508, + "grad_norm": 48.29839029048162, + "learning_rate": 9.731436935850958e-06, + "loss": 4.0263, + "step": 6841 + }, + { + "epoch": 0.5831415665217762, + "grad_norm": 127.25962056089216, + "learning_rate": 9.731276591724261e-06, + "loss": 4.5799, + "step": 6842 + }, + { + "epoch": 0.5832267962158015, + "grad_norm": 26.965894393207783, + "learning_rate": 9.731116201067339e-06, + "loss": 2.9926, + "step": 6843 + }, + { + "epoch": 0.583312025909827, + "grad_norm": 95.737868695842, + "learning_rate": 9.730955763881768e-06, + "loss": 4.7879, + "step": 6844 + }, + { + "epoch": 0.5833972556038524, + "grad_norm": 88.00576009568145, + "learning_rate": 9.730795280169127e-06, + "loss": 3.2743, + "step": 6845 + }, + { + "epoch": 0.5834824852978778, + "grad_norm": 43.480509570784356, + "learning_rate": 9.730634749930994e-06, + "loss": 4.3157, + "step": 6846 + }, + { + "epoch": 0.5835677149919032, + "grad_norm": 44.24310317290496, + "learning_rate": 9.730474173168949e-06, + "loss": 3.543, + "step": 6847 + }, + { + "epoch": 0.5836529446859285, + "grad_norm": 50.17954443856686, + "learning_rate": 9.73031354988457e-06, + "loss": 4.0602, + "step": 6848 + }, + { + "epoch": 0.583738174379954, + "grad_norm": 34.465808516205946, + "learning_rate": 9.730152880079437e-06, + "loss": 3.4032, + "step": 6849 + }, + { + "epoch": 0.5838234040739794, + "grad_norm": 61.5631034898874, + "learning_rate": 9.72999216375513e-06, + "loss": 3.8142, + "step": 6850 + }, + { + "epoch": 0.5839086337680047, + "grad_norm": 38.61641114285714, + "learning_rate": 9.729831400913228e-06, + "loss": 3.6751, + "step": 6851 + }, + { + "epoch": 0.5839938634620302, + "grad_norm": 53.739853662548235, + "learning_rate": 9.729670591555316e-06, + "loss": 4.2316, + "step": 6852 + }, + { + "epoch": 0.5840790931560556, + "grad_norm": 50.97031138807037, + "learning_rate": 9.729509735682972e-06, + "loss": 3.8149, + "step": 6853 + }, + { + "epoch": 0.584164322850081, + "grad_norm": 70.98971548885787, + "learning_rate": 9.729348833297778e-06, + "loss": 4.5964, + "step": 6854 + }, + { + "epoch": 0.5842495525441064, + "grad_norm": 102.68460751976332, + "learning_rate": 9.72918788440132e-06, + "loss": 3.8835, + "step": 6855 + }, + { + "epoch": 0.5843347822381317, + "grad_norm": 69.83568127637005, + "learning_rate": 9.729026888995175e-06, + "loss": 4.6111, + "step": 6856 + }, + { + "epoch": 0.5844200119321572, + "grad_norm": 65.87337182722665, + "learning_rate": 9.728865847080932e-06, + "loss": 3.8977, + "step": 6857 + }, + { + "epoch": 0.5845052416261826, + "grad_norm": 117.15882484292224, + "learning_rate": 9.728704758660172e-06, + "loss": 5.29, + "step": 6858 + }, + { + "epoch": 0.5845904713202079, + "grad_norm": 37.34502095354472, + "learning_rate": 9.728543623734479e-06, + "loss": 3.9823, + "step": 6859 + }, + { + "epoch": 0.5846757010142334, + "grad_norm": 63.61999894831475, + "learning_rate": 9.72838244230544e-06, + "loss": 4.3075, + "step": 6860 + }, + { + "epoch": 0.5847609307082587, + "grad_norm": 82.19757313802025, + "learning_rate": 9.728221214374637e-06, + "loss": 3.9543, + "step": 6861 + }, + { + "epoch": 0.5848461604022842, + "grad_norm": 70.01190427472507, + "learning_rate": 9.728059939943658e-06, + "loss": 4.2314, + "step": 6862 + }, + { + "epoch": 0.5849313900963096, + "grad_norm": 86.59011017522081, + "learning_rate": 9.727898619014089e-06, + "loss": 4.7539, + "step": 6863 + }, + { + "epoch": 0.5850166197903349, + "grad_norm": 46.25163052117771, + "learning_rate": 9.727737251587515e-06, + "loss": 3.493, + "step": 6864 + }, + { + "epoch": 0.5851018494843604, + "grad_norm": 45.299547703467994, + "learning_rate": 9.727575837665523e-06, + "loss": 3.8751, + "step": 6865 + }, + { + "epoch": 0.5851870791783857, + "grad_norm": 35.26402476729552, + "learning_rate": 9.727414377249702e-06, + "loss": 2.7976, + "step": 6866 + }, + { + "epoch": 0.5852723088724111, + "grad_norm": 101.04355929216206, + "learning_rate": 9.727252870341638e-06, + "loss": 5.1415, + "step": 6867 + }, + { + "epoch": 0.5853575385664366, + "grad_norm": 57.328994614963165, + "learning_rate": 9.727091316942921e-06, + "loss": 4.4676, + "step": 6868 + }, + { + "epoch": 0.5854427682604619, + "grad_norm": 56.145184394443426, + "learning_rate": 9.72692971705514e-06, + "loss": 3.7087, + "step": 6869 + }, + { + "epoch": 0.5855279979544873, + "grad_norm": 52.1926751008487, + "learning_rate": 9.726768070679882e-06, + "loss": 4.1588, + "step": 6870 + }, + { + "epoch": 0.5856132276485128, + "grad_norm": 38.00429356572144, + "learning_rate": 9.72660637781874e-06, + "loss": 4.047, + "step": 6871 + }, + { + "epoch": 0.5856984573425381, + "grad_norm": 36.33784541403047, + "learning_rate": 9.726444638473303e-06, + "loss": 3.7598, + "step": 6872 + }, + { + "epoch": 0.5857836870365636, + "grad_norm": 59.25769818736107, + "learning_rate": 9.726282852645158e-06, + "loss": 5.0217, + "step": 6873 + }, + { + "epoch": 0.5858689167305889, + "grad_norm": 35.03421278882942, + "learning_rate": 9.726121020335903e-06, + "loss": 2.7313, + "step": 6874 + }, + { + "epoch": 0.5859541464246143, + "grad_norm": 35.067533398494824, + "learning_rate": 9.725959141547123e-06, + "loss": 2.9403, + "step": 6875 + }, + { + "epoch": 0.5860393761186398, + "grad_norm": 49.51160974673813, + "learning_rate": 9.725797216280413e-06, + "loss": 4.1881, + "step": 6876 + }, + { + "epoch": 0.5861246058126651, + "grad_norm": 36.022024404996415, + "learning_rate": 9.725635244537366e-06, + "loss": 3.1729, + "step": 6877 + }, + { + "epoch": 0.5862098355066905, + "grad_norm": 51.711408309689546, + "learning_rate": 9.725473226319575e-06, + "loss": 4.446, + "step": 6878 + }, + { + "epoch": 0.586295065200716, + "grad_norm": 71.823530758149, + "learning_rate": 9.72531116162863e-06, + "loss": 3.8653, + "step": 6879 + }, + { + "epoch": 0.5863802948947413, + "grad_norm": 36.763749009036346, + "learning_rate": 9.72514905046613e-06, + "loss": 4.3649, + "step": 6880 + }, + { + "epoch": 0.5864655245887668, + "grad_norm": 231.78967595163522, + "learning_rate": 9.724986892833666e-06, + "loss": 5.5042, + "step": 6881 + }, + { + "epoch": 0.5865507542827921, + "grad_norm": 37.1599219258908, + "learning_rate": 9.724824688732833e-06, + "loss": 3.6153, + "step": 6882 + }, + { + "epoch": 0.5866359839768175, + "grad_norm": 42.67776124791589, + "learning_rate": 9.724662438165225e-06, + "loss": 4.3855, + "step": 6883 + }, + { + "epoch": 0.586721213670843, + "grad_norm": 67.30697568952719, + "learning_rate": 9.724500141132441e-06, + "loss": 3.4486, + "step": 6884 + }, + { + "epoch": 0.5868064433648683, + "grad_norm": 32.32200311797355, + "learning_rate": 9.724337797636077e-06, + "loss": 4.1081, + "step": 6885 + }, + { + "epoch": 0.5868916730588937, + "grad_norm": 35.51727415274878, + "learning_rate": 9.724175407677726e-06, + "loss": 3.4537, + "step": 6886 + }, + { + "epoch": 0.5869769027529191, + "grad_norm": 75.17478372915173, + "learning_rate": 9.724012971258987e-06, + "loss": 4.7391, + "step": 6887 + }, + { + "epoch": 0.5870621324469445, + "grad_norm": 37.06038171704543, + "learning_rate": 9.723850488381457e-06, + "loss": 4.0428, + "step": 6888 + }, + { + "epoch": 0.58714736214097, + "grad_norm": 69.10542441509763, + "learning_rate": 9.723687959046737e-06, + "loss": 3.5073, + "step": 6889 + }, + { + "epoch": 0.5872325918349953, + "grad_norm": 58.65424214362811, + "learning_rate": 9.723525383256421e-06, + "loss": 4.6424, + "step": 6890 + }, + { + "epoch": 0.5873178215290207, + "grad_norm": 80.02399809258606, + "learning_rate": 9.72336276101211e-06, + "loss": 4.9278, + "step": 6891 + }, + { + "epoch": 0.5874030512230461, + "grad_norm": 110.5366162359318, + "learning_rate": 9.723200092315404e-06, + "loss": 4.343, + "step": 6892 + }, + { + "epoch": 0.5874882809170715, + "grad_norm": 72.5111920285422, + "learning_rate": 9.723037377167901e-06, + "loss": 3.548, + "step": 6893 + }, + { + "epoch": 0.5875735106110969, + "grad_norm": 47.58020234020506, + "learning_rate": 9.722874615571203e-06, + "loss": 4.2882, + "step": 6894 + }, + { + "epoch": 0.5876587403051223, + "grad_norm": 87.97882096985975, + "learning_rate": 9.722711807526908e-06, + "loss": 4.8517, + "step": 6895 + }, + { + "epoch": 0.5877439699991477, + "grad_norm": 51.25490900856505, + "learning_rate": 9.722548953036622e-06, + "loss": 4.4607, + "step": 6896 + }, + { + "epoch": 0.5878291996931732, + "grad_norm": 33.10123622784721, + "learning_rate": 9.72238605210194e-06, + "loss": 3.171, + "step": 6897 + }, + { + "epoch": 0.5879144293871985, + "grad_norm": 50.54140969992499, + "learning_rate": 9.722223104724471e-06, + "loss": 3.3733, + "step": 6898 + }, + { + "epoch": 0.5879996590812239, + "grad_norm": 82.2076808372868, + "learning_rate": 9.722060110905812e-06, + "loss": 4.2323, + "step": 6899 + }, + { + "epoch": 0.5880848887752493, + "grad_norm": 71.82267973079314, + "learning_rate": 9.721897070647568e-06, + "loss": 3.9611, + "step": 6900 + }, + { + "epoch": 0.5881701184692747, + "grad_norm": 81.0061302394703, + "learning_rate": 9.721733983951345e-06, + "loss": 5.2065, + "step": 6901 + }, + { + "epoch": 0.5882553481633, + "grad_norm": 45.092444384248786, + "learning_rate": 9.721570850818742e-06, + "loss": 3.2038, + "step": 6902 + }, + { + "epoch": 0.5883405778573255, + "grad_norm": 82.89568976994882, + "learning_rate": 9.721407671251368e-06, + "loss": 2.9518, + "step": 6903 + }, + { + "epoch": 0.5884258075513509, + "grad_norm": 79.74365416436689, + "learning_rate": 9.721244445250824e-06, + "loss": 5.7819, + "step": 6904 + }, + { + "epoch": 0.5885110372453762, + "grad_norm": 41.949062361500026, + "learning_rate": 9.721081172818718e-06, + "loss": 3.4991, + "step": 6905 + }, + { + "epoch": 0.5885962669394017, + "grad_norm": 63.99214477628285, + "learning_rate": 9.720917853956654e-06, + "loss": 4.9762, + "step": 6906 + }, + { + "epoch": 0.5886814966334271, + "grad_norm": 64.72868286397043, + "learning_rate": 9.720754488666239e-06, + "loss": 4.3467, + "step": 6907 + }, + { + "epoch": 0.5887667263274525, + "grad_norm": 64.429338668836, + "learning_rate": 9.72059107694908e-06, + "loss": 5.5539, + "step": 6908 + }, + { + "epoch": 0.5888519560214779, + "grad_norm": 30.466223363686037, + "learning_rate": 9.720427618806782e-06, + "loss": 2.895, + "step": 6909 + }, + { + "epoch": 0.5889371857155032, + "grad_norm": 50.1779793491123, + "learning_rate": 9.720264114240955e-06, + "loss": 3.5221, + "step": 6910 + }, + { + "epoch": 0.5890224154095287, + "grad_norm": 38.96664827283243, + "learning_rate": 9.720100563253207e-06, + "loss": 3.5661, + "step": 6911 + }, + { + "epoch": 0.5891076451035541, + "grad_norm": 54.348947549884144, + "learning_rate": 9.719936965845144e-06, + "loss": 3.0423, + "step": 6912 + }, + { + "epoch": 0.5891928747975794, + "grad_norm": 40.02427338846298, + "learning_rate": 9.719773322018378e-06, + "loss": 3.7997, + "step": 6913 + }, + { + "epoch": 0.5892781044916049, + "grad_norm": 62.44795569263393, + "learning_rate": 9.719609631774515e-06, + "loss": 5.1256, + "step": 6914 + }, + { + "epoch": 0.5893633341856302, + "grad_norm": 49.57581847480429, + "learning_rate": 9.719445895115167e-06, + "loss": 4.181, + "step": 6915 + }, + { + "epoch": 0.5894485638796557, + "grad_norm": 33.993865740995126, + "learning_rate": 9.719282112041945e-06, + "loss": 2.7781, + "step": 6916 + }, + { + "epoch": 0.5895337935736811, + "grad_norm": 34.69132261682094, + "learning_rate": 9.719118282556458e-06, + "loss": 2.9189, + "step": 6917 + }, + { + "epoch": 0.5896190232677064, + "grad_norm": 67.34578794633948, + "learning_rate": 9.718954406660319e-06, + "loss": 4.5899, + "step": 6918 + }, + { + "epoch": 0.5897042529617319, + "grad_norm": 88.70596915917193, + "learning_rate": 9.718790484355137e-06, + "loss": 5.5396, + "step": 6919 + }, + { + "epoch": 0.5897894826557573, + "grad_norm": 110.72624074317294, + "learning_rate": 9.718626515642526e-06, + "loss": 4.2988, + "step": 6920 + }, + { + "epoch": 0.5898747123497826, + "grad_norm": 179.34700006465349, + "learning_rate": 9.7184625005241e-06, + "loss": 5.6158, + "step": 6921 + }, + { + "epoch": 0.5899599420438081, + "grad_norm": 52.65260717676119, + "learning_rate": 9.718298439001468e-06, + "loss": 3.8407, + "step": 6922 + }, + { + "epoch": 0.5900451717378334, + "grad_norm": 60.11615805623743, + "learning_rate": 9.718134331076245e-06, + "loss": 4.0671, + "step": 6923 + }, + { + "epoch": 0.5901304014318589, + "grad_norm": 71.06882661837277, + "learning_rate": 9.717970176750047e-06, + "loss": 4.1339, + "step": 6924 + }, + { + "epoch": 0.5902156311258843, + "grad_norm": 96.69679926954777, + "learning_rate": 9.717805976024489e-06, + "loss": 3.7061, + "step": 6925 + }, + { + "epoch": 0.5903008608199096, + "grad_norm": 43.72858053713466, + "learning_rate": 9.717641728901181e-06, + "loss": 3.3606, + "step": 6926 + }, + { + "epoch": 0.5903860905139351, + "grad_norm": 99.24293836405508, + "learning_rate": 9.717477435381742e-06, + "loss": 5.345, + "step": 6927 + }, + { + "epoch": 0.5904713202079604, + "grad_norm": 37.87792862177885, + "learning_rate": 9.717313095467787e-06, + "loss": 3.2405, + "step": 6928 + }, + { + "epoch": 0.5905565499019858, + "grad_norm": 45.14804655845818, + "learning_rate": 9.717148709160932e-06, + "loss": 3.922, + "step": 6929 + }, + { + "epoch": 0.5906417795960113, + "grad_norm": 38.09406056608441, + "learning_rate": 9.716984276462796e-06, + "loss": 3.5992, + "step": 6930 + }, + { + "epoch": 0.5907270092900366, + "grad_norm": 38.70278048444596, + "learning_rate": 9.716819797374992e-06, + "loss": 3.5421, + "step": 6931 + }, + { + "epoch": 0.5908122389840621, + "grad_norm": 40.46692026448401, + "learning_rate": 9.71665527189914e-06, + "loss": 4.3925, + "step": 6932 + }, + { + "epoch": 0.5908974686780875, + "grad_norm": 57.127788112844776, + "learning_rate": 9.716490700036858e-06, + "loss": 4.9273, + "step": 6933 + }, + { + "epoch": 0.5909826983721128, + "grad_norm": 35.37232579466286, + "learning_rate": 9.716326081789762e-06, + "loss": 2.6124, + "step": 6934 + }, + { + "epoch": 0.5910679280661383, + "grad_norm": 56.639592858068944, + "learning_rate": 9.716161417159475e-06, + "loss": 3.7287, + "step": 6935 + }, + { + "epoch": 0.5911531577601636, + "grad_norm": 42.31089872163457, + "learning_rate": 9.715996706147614e-06, + "loss": 3.9313, + "step": 6936 + }, + { + "epoch": 0.591238387454189, + "grad_norm": 58.31974911871015, + "learning_rate": 9.7158319487558e-06, + "loss": 3.9981, + "step": 6937 + }, + { + "epoch": 0.5913236171482145, + "grad_norm": 42.77856039569779, + "learning_rate": 9.715667144985653e-06, + "loss": 4.0554, + "step": 6938 + }, + { + "epoch": 0.5914088468422398, + "grad_norm": 38.63998301878826, + "learning_rate": 9.715502294838792e-06, + "loss": 3.6521, + "step": 6939 + }, + { + "epoch": 0.5914940765362653, + "grad_norm": 78.16266041956668, + "learning_rate": 9.71533739831684e-06, + "loss": 3.8719, + "step": 6940 + }, + { + "epoch": 0.5915793062302906, + "grad_norm": 79.75610028347555, + "learning_rate": 9.715172455421418e-06, + "loss": 4.6138, + "step": 6941 + }, + { + "epoch": 0.591664535924316, + "grad_norm": 47.586730738104485, + "learning_rate": 9.715007466154149e-06, + "loss": 2.7464, + "step": 6942 + }, + { + "epoch": 0.5917497656183415, + "grad_norm": 48.94432008956414, + "learning_rate": 9.714842430516656e-06, + "loss": 3.7049, + "step": 6943 + }, + { + "epoch": 0.5918349953123668, + "grad_norm": 55.14639608421955, + "learning_rate": 9.71467734851056e-06, + "loss": 3.8355, + "step": 6944 + }, + { + "epoch": 0.5919202250063922, + "grad_norm": 44.60251400955862, + "learning_rate": 9.714512220137486e-06, + "loss": 3.3088, + "step": 6945 + }, + { + "epoch": 0.5920054547004177, + "grad_norm": 31.141796852271575, + "learning_rate": 9.714347045399058e-06, + "loss": 3.5023, + "step": 6946 + }, + { + "epoch": 0.592090684394443, + "grad_norm": 41.4538692011842, + "learning_rate": 9.7141818242969e-06, + "loss": 3.5525, + "step": 6947 + }, + { + "epoch": 0.5921759140884684, + "grad_norm": 69.39710887581623, + "learning_rate": 9.714016556832637e-06, + "loss": 4.2704, + "step": 6948 + }, + { + "epoch": 0.5922611437824938, + "grad_norm": 68.73225405007291, + "learning_rate": 9.713851243007894e-06, + "loss": 4.122, + "step": 6949 + }, + { + "epoch": 0.5923463734765192, + "grad_norm": 33.43424408122639, + "learning_rate": 9.713685882824298e-06, + "loss": 2.6227, + "step": 6950 + }, + { + "epoch": 0.5924316031705447, + "grad_norm": 114.64306224866894, + "learning_rate": 9.713520476283473e-06, + "loss": 4.4606, + "step": 6951 + }, + { + "epoch": 0.59251683286457, + "grad_norm": 37.990964912955405, + "learning_rate": 9.713355023387048e-06, + "loss": 4.3593, + "step": 6952 + }, + { + "epoch": 0.5926020625585954, + "grad_norm": 296.6112854800947, + "learning_rate": 9.713189524136651e-06, + "loss": 4.8186, + "step": 6953 + }, + { + "epoch": 0.5926872922526208, + "grad_norm": 39.67344553576479, + "learning_rate": 9.713023978533904e-06, + "loss": 3.6707, + "step": 6954 + }, + { + "epoch": 0.5927725219466462, + "grad_norm": 60.66258842686105, + "learning_rate": 9.712858386580441e-06, + "loss": 3.7681, + "step": 6955 + }, + { + "epoch": 0.5928577516406716, + "grad_norm": 65.32585116708945, + "learning_rate": 9.71269274827789e-06, + "loss": 4.6719, + "step": 6956 + }, + { + "epoch": 0.592942981334697, + "grad_norm": 69.38678009159946, + "learning_rate": 9.712527063627876e-06, + "loss": 4.9222, + "step": 6957 + }, + { + "epoch": 0.5930282110287224, + "grad_norm": 67.3486816333277, + "learning_rate": 9.712361332632033e-06, + "loss": 4.9423, + "step": 6958 + }, + { + "epoch": 0.5931134407227479, + "grad_norm": 79.840887295082, + "learning_rate": 9.712195555291986e-06, + "loss": 3.5147, + "step": 6959 + }, + { + "epoch": 0.5931986704167732, + "grad_norm": 61.26922101767539, + "learning_rate": 9.71202973160937e-06, + "loss": 4.12, + "step": 6960 + }, + { + "epoch": 0.5932839001107986, + "grad_norm": 34.96703382733217, + "learning_rate": 9.711863861585813e-06, + "loss": 3.3681, + "step": 6961 + }, + { + "epoch": 0.593369129804824, + "grad_norm": 33.89704373229677, + "learning_rate": 9.711697945222949e-06, + "loss": 3.829, + "step": 6962 + }, + { + "epoch": 0.5934543594988494, + "grad_norm": 113.36747235833356, + "learning_rate": 9.711531982522406e-06, + "loss": 4.7689, + "step": 6963 + }, + { + "epoch": 0.5935395891928748, + "grad_norm": 64.10612439940584, + "learning_rate": 9.71136597348582e-06, + "loss": 3.2107, + "step": 6964 + }, + { + "epoch": 0.5936248188869002, + "grad_norm": 56.20785441644328, + "learning_rate": 9.71119991811482e-06, + "loss": 4.2381, + "step": 6965 + }, + { + "epoch": 0.5937100485809256, + "grad_norm": 74.64458096640583, + "learning_rate": 9.711033816411042e-06, + "loss": 4.2002, + "step": 6966 + }, + { + "epoch": 0.593795278274951, + "grad_norm": 71.25277124326492, + "learning_rate": 9.710867668376118e-06, + "loss": 3.6284, + "step": 6967 + }, + { + "epoch": 0.5938805079689764, + "grad_norm": 113.9919605916266, + "learning_rate": 9.710701474011683e-06, + "loss": 4.2053, + "step": 6968 + }, + { + "epoch": 0.5939657376630018, + "grad_norm": 145.32075586419816, + "learning_rate": 9.710535233319371e-06, + "loss": 5.5346, + "step": 6969 + }, + { + "epoch": 0.5940509673570272, + "grad_norm": 63.9966003594494, + "learning_rate": 9.710368946300816e-06, + "loss": 4.2726, + "step": 6970 + }, + { + "epoch": 0.5941361970510526, + "grad_norm": 63.92549729709942, + "learning_rate": 9.710202612957653e-06, + "loss": 4.0621, + "step": 6971 + }, + { + "epoch": 0.5942214267450779, + "grad_norm": 59.892989352350604, + "learning_rate": 9.71003623329152e-06, + "loss": 3.6648, + "step": 6972 + }, + { + "epoch": 0.5943066564391034, + "grad_norm": 50.07843405474428, + "learning_rate": 9.709869807304053e-06, + "loss": 3.6771, + "step": 6973 + }, + { + "epoch": 0.5943918861331288, + "grad_norm": 40.719041913802506, + "learning_rate": 9.709703334996887e-06, + "loss": 3.3379, + "step": 6974 + }, + { + "epoch": 0.5944771158271542, + "grad_norm": 60.71052577929627, + "learning_rate": 9.70953681637166e-06, + "loss": 3.8392, + "step": 6975 + }, + { + "epoch": 0.5945623455211796, + "grad_norm": 35.79780579551875, + "learning_rate": 9.709370251430011e-06, + "loss": 3.7075, + "step": 6976 + }, + { + "epoch": 0.594647575215205, + "grad_norm": 35.83007005814753, + "learning_rate": 9.709203640173576e-06, + "loss": 2.9847, + "step": 6977 + }, + { + "epoch": 0.5947328049092304, + "grad_norm": 48.33308308218786, + "learning_rate": 9.709036982603994e-06, + "loss": 2.5303, + "step": 6978 + }, + { + "epoch": 0.5948180346032558, + "grad_norm": 42.89468291189227, + "learning_rate": 9.708870278722906e-06, + "loss": 3.7906, + "step": 6979 + }, + { + "epoch": 0.5949032642972811, + "grad_norm": 151.62690107220612, + "learning_rate": 9.70870352853195e-06, + "loss": 4.9569, + "step": 6980 + }, + { + "epoch": 0.5949884939913066, + "grad_norm": 81.25951822599832, + "learning_rate": 9.708536732032763e-06, + "loss": 4.1694, + "step": 6981 + }, + { + "epoch": 0.595073723685332, + "grad_norm": 166.53960370219994, + "learning_rate": 9.708369889226993e-06, + "loss": 3.6764, + "step": 6982 + }, + { + "epoch": 0.5951589533793573, + "grad_norm": 112.11940482216214, + "learning_rate": 9.708203000116272e-06, + "loss": 4.8001, + "step": 6983 + }, + { + "epoch": 0.5952441830733828, + "grad_norm": 55.11759382190206, + "learning_rate": 9.708036064702246e-06, + "loss": 5.2375, + "step": 6984 + }, + { + "epoch": 0.5953294127674081, + "grad_norm": 39.01778138514402, + "learning_rate": 9.707869082986556e-06, + "loss": 4.0099, + "step": 6985 + }, + { + "epoch": 0.5954146424614336, + "grad_norm": 89.63633428149335, + "learning_rate": 9.707702054970845e-06, + "loss": 5.0444, + "step": 6986 + }, + { + "epoch": 0.595499872155459, + "grad_norm": 100.4246159669026, + "learning_rate": 9.707534980656755e-06, + "loss": 5.1281, + "step": 6987 + }, + { + "epoch": 0.5955851018494843, + "grad_norm": 59.06161149015458, + "learning_rate": 9.707367860045929e-06, + "loss": 3.4639, + "step": 6988 + }, + { + "epoch": 0.5956703315435098, + "grad_norm": 62.11891372932454, + "learning_rate": 9.707200693140011e-06, + "loss": 3.6982, + "step": 6989 + }, + { + "epoch": 0.5957555612375351, + "grad_norm": 32.84520575917572, + "learning_rate": 9.707033479940644e-06, + "loss": 3.5711, + "step": 6990 + }, + { + "epoch": 0.5958407909315605, + "grad_norm": 65.5365811738721, + "learning_rate": 9.706866220449473e-06, + "loss": 4.8411, + "step": 6991 + }, + { + "epoch": 0.595926020625586, + "grad_norm": 31.07553579527621, + "learning_rate": 9.706698914668143e-06, + "loss": 3.0155, + "step": 6992 + }, + { + "epoch": 0.5960112503196113, + "grad_norm": 37.551366323951, + "learning_rate": 9.7065315625983e-06, + "loss": 3.6396, + "step": 6993 + }, + { + "epoch": 0.5960964800136368, + "grad_norm": 40.65854085639585, + "learning_rate": 9.706364164241588e-06, + "loss": 4.4494, + "step": 6994 + }, + { + "epoch": 0.5961817097076622, + "grad_norm": 62.777678797351825, + "learning_rate": 9.706196719599657e-06, + "loss": 4.5385, + "step": 6995 + }, + { + "epoch": 0.5962669394016875, + "grad_norm": 78.55718294983953, + "learning_rate": 9.70602922867415e-06, + "loss": 3.4138, + "step": 6996 + }, + { + "epoch": 0.596352169095713, + "grad_norm": 47.941975228927554, + "learning_rate": 9.705861691466717e-06, + "loss": 4.4747, + "step": 6997 + }, + { + "epoch": 0.5964373987897383, + "grad_norm": 51.006912053624305, + "learning_rate": 9.705694107979e-06, + "loss": 3.202, + "step": 6998 + }, + { + "epoch": 0.5965226284837637, + "grad_norm": 56.81600922297004, + "learning_rate": 9.705526478212654e-06, + "loss": 3.8198, + "step": 6999 + }, + { + "epoch": 0.5966078581777892, + "grad_norm": 49.142630783549876, + "learning_rate": 9.705358802169326e-06, + "loss": 3.7032, + "step": 7000 + }, + { + "epoch": 0.5966930878718145, + "grad_norm": 61.33117755365235, + "learning_rate": 9.705191079850662e-06, + "loss": 4.451, + "step": 7001 + }, + { + "epoch": 0.59677831756584, + "grad_norm": 36.17084039038516, + "learning_rate": 9.705023311258315e-06, + "loss": 4.0013, + "step": 7002 + }, + { + "epoch": 0.5968635472598653, + "grad_norm": 38.19783684183057, + "learning_rate": 9.704855496393933e-06, + "loss": 3.657, + "step": 7003 + }, + { + "epoch": 0.5969487769538907, + "grad_norm": 131.2588660750787, + "learning_rate": 9.704687635259164e-06, + "loss": 5.6731, + "step": 7004 + }, + { + "epoch": 0.5970340066479162, + "grad_norm": 54.27275446349738, + "learning_rate": 9.704519727855666e-06, + "loss": 3.8397, + "step": 7005 + }, + { + "epoch": 0.5971192363419415, + "grad_norm": 63.216203074348286, + "learning_rate": 9.704351774185084e-06, + "loss": 4.3762, + "step": 7006 + }, + { + "epoch": 0.5972044660359669, + "grad_norm": 32.06172752608586, + "learning_rate": 9.70418377424907e-06, + "loss": 2.4199, + "step": 7007 + }, + { + "epoch": 0.5972896957299924, + "grad_norm": 53.19281784890904, + "learning_rate": 9.704015728049279e-06, + "loss": 3.4324, + "step": 7008 + }, + { + "epoch": 0.5973749254240177, + "grad_norm": 108.44175635296705, + "learning_rate": 9.703847635587363e-06, + "loss": 4.9976, + "step": 7009 + }, + { + "epoch": 0.5974601551180432, + "grad_norm": 40.64618153083282, + "learning_rate": 9.703679496864974e-06, + "loss": 4.0732, + "step": 7010 + }, + { + "epoch": 0.5975453848120685, + "grad_norm": 127.45224384213857, + "learning_rate": 9.703511311883767e-06, + "loss": 4.1368, + "step": 7011 + }, + { + "epoch": 0.5976306145060939, + "grad_norm": 50.96481461290079, + "learning_rate": 9.703343080645394e-06, + "loss": 4.0996, + "step": 7012 + }, + { + "epoch": 0.5977158442001194, + "grad_norm": 47.50244248429764, + "learning_rate": 9.703174803151512e-06, + "loss": 4.4342, + "step": 7013 + }, + { + "epoch": 0.5978010738941447, + "grad_norm": 72.42199864643412, + "learning_rate": 9.703006479403773e-06, + "loss": 3.8155, + "step": 7014 + }, + { + "epoch": 0.5978863035881701, + "grad_norm": 39.593239969126174, + "learning_rate": 9.702838109403836e-06, + "loss": 2.6841, + "step": 7015 + }, + { + "epoch": 0.5979715332821955, + "grad_norm": 123.51776828348888, + "learning_rate": 9.702669693153354e-06, + "loss": 4.6055, + "step": 7016 + }, + { + "epoch": 0.5980567629762209, + "grad_norm": 34.934086394207156, + "learning_rate": 9.702501230653985e-06, + "loss": 3.6744, + "step": 7017 + }, + { + "epoch": 0.5981419926702463, + "grad_norm": 39.25550040678407, + "learning_rate": 9.702332721907383e-06, + "loss": 3.6968, + "step": 7018 + }, + { + "epoch": 0.5982272223642717, + "grad_norm": 51.12828958912287, + "learning_rate": 9.70216416691521e-06, + "loss": 4.8976, + "step": 7019 + }, + { + "epoch": 0.5983124520582971, + "grad_norm": 81.01379423734522, + "learning_rate": 9.701995565679118e-06, + "loss": 4.5961, + "step": 7020 + }, + { + "epoch": 0.5983976817523226, + "grad_norm": 97.20155282593495, + "learning_rate": 9.70182691820077e-06, + "loss": 4.9694, + "step": 7021 + }, + { + "epoch": 0.5984829114463479, + "grad_norm": 42.398255825187334, + "learning_rate": 9.701658224481821e-06, + "loss": 3.8248, + "step": 7022 + }, + { + "epoch": 0.5985681411403733, + "grad_norm": 52.99705168679998, + "learning_rate": 9.701489484523932e-06, + "loss": 3.4705, + "step": 7023 + }, + { + "epoch": 0.5986533708343987, + "grad_norm": 70.75445129870988, + "learning_rate": 9.701320698328764e-06, + "loss": 3.7776, + "step": 7024 + }, + { + "epoch": 0.5987386005284241, + "grad_norm": 77.93200902321624, + "learning_rate": 9.701151865897972e-06, + "loss": 5.5032, + "step": 7025 + }, + { + "epoch": 0.5988238302224494, + "grad_norm": 91.16098766028853, + "learning_rate": 9.700982987233222e-06, + "loss": 5.0511, + "step": 7026 + }, + { + "epoch": 0.5989090599164749, + "grad_norm": 142.27635707163287, + "learning_rate": 9.700814062336172e-06, + "loss": 5.9651, + "step": 7027 + }, + { + "epoch": 0.5989942896105003, + "grad_norm": 75.81101638523023, + "learning_rate": 9.700645091208482e-06, + "loss": 3.3582, + "step": 7028 + }, + { + "epoch": 0.5990795193045257, + "grad_norm": 42.743745516710796, + "learning_rate": 9.700476073851817e-06, + "loss": 2.7344, + "step": 7029 + }, + { + "epoch": 0.5991647489985511, + "grad_norm": 32.62659844604043, + "learning_rate": 9.700307010267837e-06, + "loss": 3.0609, + "step": 7030 + }, + { + "epoch": 0.5992499786925765, + "grad_norm": 32.27631568535001, + "learning_rate": 9.700137900458207e-06, + "loss": 3.6521, + "step": 7031 + }, + { + "epoch": 0.5993352083866019, + "grad_norm": 50.97199511789842, + "learning_rate": 9.699968744424587e-06, + "loss": 4.2567, + "step": 7032 + }, + { + "epoch": 0.5994204380806273, + "grad_norm": 72.20117454065321, + "learning_rate": 9.69979954216864e-06, + "loss": 4.278, + "step": 7033 + }, + { + "epoch": 0.5995056677746526, + "grad_norm": 86.91607266132387, + "learning_rate": 9.699630293692033e-06, + "loss": 4.4505, + "step": 7034 + }, + { + "epoch": 0.5995908974686781, + "grad_norm": 34.726871620409725, + "learning_rate": 9.699460998996431e-06, + "loss": 3.7319, + "step": 7035 + }, + { + "epoch": 0.5996761271627035, + "grad_norm": 75.29450460770276, + "learning_rate": 9.699291658083496e-06, + "loss": 3.5763, + "step": 7036 + }, + { + "epoch": 0.5997613568567289, + "grad_norm": 45.46904908921865, + "learning_rate": 9.699122270954896e-06, + "loss": 4.1484, + "step": 7037 + }, + { + "epoch": 0.5998465865507543, + "grad_norm": 36.089084827975114, + "learning_rate": 9.698952837612296e-06, + "loss": 3.3908, + "step": 7038 + }, + { + "epoch": 0.5999318162447796, + "grad_norm": 53.806781737931104, + "learning_rate": 9.69878335805736e-06, + "loss": 4.1137, + "step": 7039 + }, + { + "epoch": 0.6000170459388051, + "grad_norm": 37.9987395642859, + "learning_rate": 9.69861383229176e-06, + "loss": 3.8777, + "step": 7040 + }, + { + "epoch": 0.6001022756328305, + "grad_norm": 54.073464028650285, + "learning_rate": 9.698444260317157e-06, + "loss": 3.234, + "step": 7041 + }, + { + "epoch": 0.6001875053268558, + "grad_norm": 51.72689450205108, + "learning_rate": 9.698274642135224e-06, + "loss": 3.7705, + "step": 7042 + }, + { + "epoch": 0.6002727350208813, + "grad_norm": 39.27326732625189, + "learning_rate": 9.698104977747624e-06, + "loss": 4.3065, + "step": 7043 + }, + { + "epoch": 0.6003579647149067, + "grad_norm": 45.30243600139912, + "learning_rate": 9.697935267156031e-06, + "loss": 3.4204, + "step": 7044 + }, + { + "epoch": 0.6004431944089321, + "grad_norm": 40.077300424269, + "learning_rate": 9.69776551036211e-06, + "loss": 3.9492, + "step": 7045 + }, + { + "epoch": 0.6005284241029575, + "grad_norm": 34.17544204973927, + "learning_rate": 9.697595707367533e-06, + "loss": 4.5627, + "step": 7046 + }, + { + "epoch": 0.6006136537969828, + "grad_norm": 40.5545319276427, + "learning_rate": 9.697425858173968e-06, + "loss": 4.1468, + "step": 7047 + }, + { + "epoch": 0.6006988834910083, + "grad_norm": 66.15352941965777, + "learning_rate": 9.697255962783086e-06, + "loss": 4.5821, + "step": 7048 + }, + { + "epoch": 0.6007841131850337, + "grad_norm": 51.15382188689984, + "learning_rate": 9.697086021196558e-06, + "loss": 4.1524, + "step": 7049 + }, + { + "epoch": 0.600869342879059, + "grad_norm": 69.52098374368188, + "learning_rate": 9.696916033416057e-06, + "loss": 4.7376, + "step": 7050 + }, + { + "epoch": 0.6009545725730845, + "grad_norm": 35.381165643821674, + "learning_rate": 9.696745999443251e-06, + "loss": 3.7743, + "step": 7051 + }, + { + "epoch": 0.6010398022671098, + "grad_norm": 61.363570087538015, + "learning_rate": 9.696575919279816e-06, + "loss": 4.2132, + "step": 7052 + }, + { + "epoch": 0.6011250319611353, + "grad_norm": 60.316394410717365, + "learning_rate": 9.696405792927423e-06, + "loss": 4.2822, + "step": 7053 + }, + { + "epoch": 0.6012102616551607, + "grad_norm": 55.4834793032257, + "learning_rate": 9.696235620387743e-06, + "loss": 4.0518, + "step": 7054 + }, + { + "epoch": 0.601295491349186, + "grad_norm": 81.20880290378349, + "learning_rate": 9.696065401662456e-06, + "loss": 3.8428, + "step": 7055 + }, + { + "epoch": 0.6013807210432115, + "grad_norm": 39.644925436994654, + "learning_rate": 9.695895136753228e-06, + "loss": 3.7391, + "step": 7056 + }, + { + "epoch": 0.6014659507372369, + "grad_norm": 46.01834051923324, + "learning_rate": 9.695724825661738e-06, + "loss": 4.177, + "step": 7057 + }, + { + "epoch": 0.6015511804312622, + "grad_norm": 76.22799951638555, + "learning_rate": 9.695554468389664e-06, + "loss": 4.6106, + "step": 7058 + }, + { + "epoch": 0.6016364101252877, + "grad_norm": 51.77771229457778, + "learning_rate": 9.695384064938674e-06, + "loss": 5.2349, + "step": 7059 + }, + { + "epoch": 0.601721639819313, + "grad_norm": 107.39047032105739, + "learning_rate": 9.695213615310448e-06, + "loss": 4.9414, + "step": 7060 + }, + { + "epoch": 0.6018068695133384, + "grad_norm": 104.07510877172551, + "learning_rate": 9.695043119506661e-06, + "loss": 6.2566, + "step": 7061 + }, + { + "epoch": 0.6018920992073639, + "grad_norm": 56.803382028305755, + "learning_rate": 9.69487257752899e-06, + "loss": 3.8927, + "step": 7062 + }, + { + "epoch": 0.6019773289013892, + "grad_norm": 55.581171959181866, + "learning_rate": 9.694701989379114e-06, + "loss": 4.9396, + "step": 7063 + }, + { + "epoch": 0.6020625585954147, + "grad_norm": 55.34941256189842, + "learning_rate": 9.69453135505871e-06, + "loss": 5.3906, + "step": 7064 + }, + { + "epoch": 0.60214778828944, + "grad_norm": 44.673216582928, + "learning_rate": 9.694360674569457e-06, + "loss": 4.6537, + "step": 7065 + }, + { + "epoch": 0.6022330179834654, + "grad_norm": 83.1589884445959, + "learning_rate": 9.694189947913029e-06, + "loss": 2.45, + "step": 7066 + }, + { + "epoch": 0.6023182476774909, + "grad_norm": 47.00639005359675, + "learning_rate": 9.69401917509111e-06, + "loss": 3.988, + "step": 7067 + }, + { + "epoch": 0.6024034773715162, + "grad_norm": 47.20823588869726, + "learning_rate": 9.693848356105377e-06, + "loss": 4.2005, + "step": 7068 + }, + { + "epoch": 0.6024887070655416, + "grad_norm": 109.84406727640544, + "learning_rate": 9.69367749095751e-06, + "loss": 3.5082, + "step": 7069 + }, + { + "epoch": 0.602573936759567, + "grad_norm": 64.86771803943991, + "learning_rate": 9.69350657964919e-06, + "loss": 3.7705, + "step": 7070 + }, + { + "epoch": 0.6026591664535924, + "grad_norm": 32.99649692668177, + "learning_rate": 9.693335622182098e-06, + "loss": 3.0171, + "step": 7071 + }, + { + "epoch": 0.6027443961476179, + "grad_norm": 33.65387575244305, + "learning_rate": 9.693164618557916e-06, + "loss": 3.8762, + "step": 7072 + }, + { + "epoch": 0.6028296258416432, + "grad_norm": 38.854989710668065, + "learning_rate": 9.692993568778324e-06, + "loss": 3.0511, + "step": 7073 + }, + { + "epoch": 0.6029148555356686, + "grad_norm": 33.077290738986235, + "learning_rate": 9.692822472845006e-06, + "loss": 4.4952, + "step": 7074 + }, + { + "epoch": 0.6030000852296941, + "grad_norm": 68.9407027553675, + "learning_rate": 9.692651330759644e-06, + "loss": 4.9909, + "step": 7075 + }, + { + "epoch": 0.6030853149237194, + "grad_norm": 72.39977782014128, + "learning_rate": 9.69248014252392e-06, + "loss": 4.7422, + "step": 7076 + }, + { + "epoch": 0.6031705446177448, + "grad_norm": 55.00332126370176, + "learning_rate": 9.692308908139517e-06, + "loss": 4.3093, + "step": 7077 + }, + { + "epoch": 0.6032557743117702, + "grad_norm": 40.80145886847741, + "learning_rate": 9.692137627608123e-06, + "loss": 3.6594, + "step": 7078 + }, + { + "epoch": 0.6033410040057956, + "grad_norm": 100.6511139750617, + "learning_rate": 9.691966300931418e-06, + "loss": 6.2896, + "step": 7079 + }, + { + "epoch": 0.6034262336998211, + "grad_norm": 66.29314477242066, + "learning_rate": 9.69179492811109e-06, + "loss": 4.8273, + "step": 7080 + }, + { + "epoch": 0.6035114633938464, + "grad_norm": 56.62013719247796, + "learning_rate": 9.691623509148821e-06, + "loss": 4.9309, + "step": 7081 + }, + { + "epoch": 0.6035966930878718, + "grad_norm": 46.59115777091726, + "learning_rate": 9.691452044046302e-06, + "loss": 3.8307, + "step": 7082 + }, + { + "epoch": 0.6036819227818973, + "grad_norm": 52.89769994149706, + "learning_rate": 9.691280532805215e-06, + "loss": 3.7305, + "step": 7083 + }, + { + "epoch": 0.6037671524759226, + "grad_norm": 33.1630682016017, + "learning_rate": 9.691108975427248e-06, + "loss": 3.3395, + "step": 7084 + }, + { + "epoch": 0.603852382169948, + "grad_norm": 56.016746169088826, + "learning_rate": 9.690937371914088e-06, + "loss": 4.8031, + "step": 7085 + }, + { + "epoch": 0.6039376118639734, + "grad_norm": 34.00515457034019, + "learning_rate": 9.690765722267423e-06, + "loss": 3.7865, + "step": 7086 + }, + { + "epoch": 0.6040228415579988, + "grad_norm": 87.19263821971909, + "learning_rate": 9.690594026488941e-06, + "loss": 3.9235, + "step": 7087 + }, + { + "epoch": 0.6041080712520243, + "grad_norm": 52.79023338199815, + "learning_rate": 9.690422284580332e-06, + "loss": 4.0351, + "step": 7088 + }, + { + "epoch": 0.6041933009460496, + "grad_norm": 32.661130937833164, + "learning_rate": 9.690250496543283e-06, + "loss": 3.7534, + "step": 7089 + }, + { + "epoch": 0.604278530640075, + "grad_norm": 42.221630834454174, + "learning_rate": 9.690078662379486e-06, + "loss": 4.186, + "step": 7090 + }, + { + "epoch": 0.6043637603341004, + "grad_norm": 37.978938376839956, + "learning_rate": 9.689906782090626e-06, + "loss": 3.3186, + "step": 7091 + }, + { + "epoch": 0.6044489900281258, + "grad_norm": 55.078723351435926, + "learning_rate": 9.689734855678397e-06, + "loss": 4.1052, + "step": 7092 + }, + { + "epoch": 0.6045342197221512, + "grad_norm": 46.08667240092258, + "learning_rate": 9.689562883144488e-06, + "loss": 3.5414, + "step": 7093 + }, + { + "epoch": 0.6046194494161766, + "grad_norm": 91.55416370035336, + "learning_rate": 9.689390864490593e-06, + "loss": 4.5506, + "step": 7094 + }, + { + "epoch": 0.604704679110202, + "grad_norm": 50.53303289730222, + "learning_rate": 9.689218799718403e-06, + "loss": 3.3096, + "step": 7095 + }, + { + "epoch": 0.6047899088042273, + "grad_norm": 48.59314739288301, + "learning_rate": 9.68904668882961e-06, + "loss": 4.2311, + "step": 7096 + }, + { + "epoch": 0.6048751384982528, + "grad_norm": 42.12240890891932, + "learning_rate": 9.688874531825903e-06, + "loss": 4.2496, + "step": 7097 + }, + { + "epoch": 0.6049603681922782, + "grad_norm": 39.94824333498798, + "learning_rate": 9.68870232870898e-06, + "loss": 4.213, + "step": 7098 + }, + { + "epoch": 0.6050455978863036, + "grad_norm": 126.11982109392144, + "learning_rate": 9.688530079480533e-06, + "loss": 4.0925, + "step": 7099 + }, + { + "epoch": 0.605130827580329, + "grad_norm": 40.52888339912945, + "learning_rate": 9.688357784142255e-06, + "loss": 4.6233, + "step": 7100 + }, + { + "epoch": 0.6052160572743543, + "grad_norm": 53.20842762466884, + "learning_rate": 9.688185442695841e-06, + "loss": 3.5151, + "step": 7101 + }, + { + "epoch": 0.6053012869683798, + "grad_norm": 46.895185833107625, + "learning_rate": 9.688013055142987e-06, + "loss": 3.8706, + "step": 7102 + }, + { + "epoch": 0.6053865166624052, + "grad_norm": 171.30770508425408, + "learning_rate": 9.687840621485387e-06, + "loss": 5.5979, + "step": 7103 + }, + { + "epoch": 0.6054717463564305, + "grad_norm": 36.16737678799477, + "learning_rate": 9.687668141724736e-06, + "loss": 3.3643, + "step": 7104 + }, + { + "epoch": 0.605556976050456, + "grad_norm": 59.893694991322135, + "learning_rate": 9.687495615862733e-06, + "loss": 4.1409, + "step": 7105 + }, + { + "epoch": 0.6056422057444814, + "grad_norm": 84.42739047182137, + "learning_rate": 9.687323043901073e-06, + "loss": 4.5162, + "step": 7106 + }, + { + "epoch": 0.6057274354385068, + "grad_norm": 50.56021079971597, + "learning_rate": 9.687150425841455e-06, + "loss": 4.4963, + "step": 7107 + }, + { + "epoch": 0.6058126651325322, + "grad_norm": 77.29089777577549, + "learning_rate": 9.686977761685571e-06, + "loss": 3.0906, + "step": 7108 + }, + { + "epoch": 0.6058978948265575, + "grad_norm": 56.67320241579703, + "learning_rate": 9.686805051435126e-06, + "loss": 4.2092, + "step": 7109 + }, + { + "epoch": 0.605983124520583, + "grad_norm": 44.06778563877455, + "learning_rate": 9.686632295091815e-06, + "loss": 3.7857, + "step": 7110 + }, + { + "epoch": 0.6060683542146084, + "grad_norm": 43.44864033339315, + "learning_rate": 9.686459492657338e-06, + "loss": 4.5819, + "step": 7111 + }, + { + "epoch": 0.6061535839086337, + "grad_norm": 52.5815720731484, + "learning_rate": 9.686286644133393e-06, + "loss": 3.9202, + "step": 7112 + }, + { + "epoch": 0.6062388136026592, + "grad_norm": 67.82654032716837, + "learning_rate": 9.686113749521683e-06, + "loss": 4.5492, + "step": 7113 + }, + { + "epoch": 0.6063240432966845, + "grad_norm": 51.661040391642196, + "learning_rate": 9.685940808823905e-06, + "loss": 4.5728, + "step": 7114 + }, + { + "epoch": 0.60640927299071, + "grad_norm": 91.46711729622763, + "learning_rate": 9.685767822041761e-06, + "loss": 3.9745, + "step": 7115 + }, + { + "epoch": 0.6064945026847354, + "grad_norm": 64.81241670684182, + "learning_rate": 9.685594789176952e-06, + "loss": 4.9166, + "step": 7116 + }, + { + "epoch": 0.6065797323787607, + "grad_norm": 56.13359964399881, + "learning_rate": 9.685421710231181e-06, + "loss": 3.8582, + "step": 7117 + }, + { + "epoch": 0.6066649620727862, + "grad_norm": 44.441954119808905, + "learning_rate": 9.68524858520615e-06, + "loss": 4.1586, + "step": 7118 + }, + { + "epoch": 0.6067501917668116, + "grad_norm": 73.75435365402062, + "learning_rate": 9.685075414103562e-06, + "loss": 4.1895, + "step": 7119 + }, + { + "epoch": 0.6068354214608369, + "grad_norm": 68.4668815884241, + "learning_rate": 9.684902196925116e-06, + "loss": 3.7939, + "step": 7120 + }, + { + "epoch": 0.6069206511548624, + "grad_norm": 63.55846372895725, + "learning_rate": 9.684728933672521e-06, + "loss": 5.248, + "step": 7121 + }, + { + "epoch": 0.6070058808488877, + "grad_norm": 80.62594703266271, + "learning_rate": 9.684555624347476e-06, + "loss": 4.398, + "step": 7122 + }, + { + "epoch": 0.6070911105429132, + "grad_norm": 58.19585532636283, + "learning_rate": 9.68438226895169e-06, + "loss": 4.4664, + "step": 7123 + }, + { + "epoch": 0.6071763402369386, + "grad_norm": 34.629826553113055, + "learning_rate": 9.684208867486863e-06, + "loss": 4.0992, + "step": 7124 + }, + { + "epoch": 0.6072615699309639, + "grad_norm": 35.91249427800077, + "learning_rate": 9.684035419954705e-06, + "loss": 3.5631, + "step": 7125 + }, + { + "epoch": 0.6073467996249894, + "grad_norm": 37.44452692456127, + "learning_rate": 9.683861926356921e-06, + "loss": 3.7932, + "step": 7126 + }, + { + "epoch": 0.6074320293190147, + "grad_norm": 38.324598073952956, + "learning_rate": 9.683688386695213e-06, + "loss": 3.6302, + "step": 7127 + }, + { + "epoch": 0.6075172590130401, + "grad_norm": 31.693172843071718, + "learning_rate": 9.683514800971294e-06, + "loss": 2.74, + "step": 7128 + }, + { + "epoch": 0.6076024887070656, + "grad_norm": 41.598109381562566, + "learning_rate": 9.683341169186865e-06, + "loss": 3.6754, + "step": 7129 + }, + { + "epoch": 0.6076877184010909, + "grad_norm": 48.516690394410865, + "learning_rate": 9.68316749134364e-06, + "loss": 4.429, + "step": 7130 + }, + { + "epoch": 0.6077729480951163, + "grad_norm": 42.37862427318067, + "learning_rate": 9.682993767443322e-06, + "loss": 3.5111, + "step": 7131 + }, + { + "epoch": 0.6078581777891418, + "grad_norm": 52.52417670455441, + "learning_rate": 9.682819997487622e-06, + "loss": 3.5881, + "step": 7132 + }, + { + "epoch": 0.6079434074831671, + "grad_norm": 35.489481778062085, + "learning_rate": 9.682646181478246e-06, + "loss": 2.5418, + "step": 7133 + }, + { + "epoch": 0.6080286371771926, + "grad_norm": 69.06854074567835, + "learning_rate": 9.682472319416906e-06, + "loss": 4.6708, + "step": 7134 + }, + { + "epoch": 0.6081138668712179, + "grad_norm": 74.69806937434252, + "learning_rate": 9.682298411305313e-06, + "loss": 4.0383, + "step": 7135 + }, + { + "epoch": 0.6081990965652433, + "grad_norm": 98.15644420043762, + "learning_rate": 9.682124457145175e-06, + "loss": 4.8246, + "step": 7136 + }, + { + "epoch": 0.6082843262592688, + "grad_norm": 61.32856723752117, + "learning_rate": 9.681950456938203e-06, + "loss": 4.6604, + "step": 7137 + }, + { + "epoch": 0.6083695559532941, + "grad_norm": 69.90733151272973, + "learning_rate": 9.681776410686109e-06, + "loss": 4.9674, + "step": 7138 + }, + { + "epoch": 0.6084547856473195, + "grad_norm": 98.31956254382577, + "learning_rate": 9.681602318390604e-06, + "loss": 5.7824, + "step": 7139 + }, + { + "epoch": 0.6085400153413449, + "grad_norm": 47.12354725355492, + "learning_rate": 9.6814281800534e-06, + "loss": 4.4797, + "step": 7140 + }, + { + "epoch": 0.6086252450353703, + "grad_norm": 65.77066133533931, + "learning_rate": 9.681253995676213e-06, + "loss": 3.5508, + "step": 7141 + }, + { + "epoch": 0.6087104747293958, + "grad_norm": 42.882867194234244, + "learning_rate": 9.68107976526075e-06, + "loss": 3.0651, + "step": 7142 + }, + { + "epoch": 0.6087957044234211, + "grad_norm": 35.08096767627367, + "learning_rate": 9.680905488808727e-06, + "loss": 4.0509, + "step": 7143 + }, + { + "epoch": 0.6088809341174465, + "grad_norm": 65.19430470084808, + "learning_rate": 9.680731166321862e-06, + "loss": 3.8852, + "step": 7144 + }, + { + "epoch": 0.608966163811472, + "grad_norm": 75.37634430985062, + "learning_rate": 9.680556797801864e-06, + "loss": 4.7645, + "step": 7145 + }, + { + "epoch": 0.6090513935054973, + "grad_norm": 99.84741073385267, + "learning_rate": 9.680382383250449e-06, + "loss": 2.9889, + "step": 7146 + }, + { + "epoch": 0.6091366231995227, + "grad_norm": 46.26789072159519, + "learning_rate": 9.680207922669334e-06, + "loss": 3.8927, + "step": 7147 + }, + { + "epoch": 0.6092218528935481, + "grad_norm": 65.40132103392116, + "learning_rate": 9.680033416060233e-06, + "loss": 4.3003, + "step": 7148 + }, + { + "epoch": 0.6093070825875735, + "grad_norm": 58.28309342584988, + "learning_rate": 9.679858863424863e-06, + "loss": 3.331, + "step": 7149 + }, + { + "epoch": 0.609392312281599, + "grad_norm": 46.38178201530226, + "learning_rate": 9.679684264764941e-06, + "loss": 2.8201, + "step": 7150 + }, + { + "epoch": 0.6094775419756243, + "grad_norm": 110.4162128385662, + "learning_rate": 9.679509620082184e-06, + "loss": 4.9319, + "step": 7151 + }, + { + "epoch": 0.6095627716696497, + "grad_norm": 155.37591231214813, + "learning_rate": 9.67933492937831e-06, + "loss": 5.1229, + "step": 7152 + }, + { + "epoch": 0.6096480013636751, + "grad_norm": 44.91064752275703, + "learning_rate": 9.679160192655034e-06, + "loss": 3.9, + "step": 7153 + }, + { + "epoch": 0.6097332310577005, + "grad_norm": 33.17301267084483, + "learning_rate": 9.67898540991408e-06, + "loss": 2.4214, + "step": 7154 + }, + { + "epoch": 0.6098184607517259, + "grad_norm": 101.28153885456136, + "learning_rate": 9.678810581157162e-06, + "loss": 5.9336, + "step": 7155 + }, + { + "epoch": 0.6099036904457513, + "grad_norm": 37.04031141987955, + "learning_rate": 9.678635706386001e-06, + "loss": 4.0732, + "step": 7156 + }, + { + "epoch": 0.6099889201397767, + "grad_norm": 34.8349907598752, + "learning_rate": 9.678460785602318e-06, + "loss": 3.071, + "step": 7157 + }, + { + "epoch": 0.6100741498338021, + "grad_norm": 41.50690913649255, + "learning_rate": 9.678285818807832e-06, + "loss": 3.7784, + "step": 7158 + }, + { + "epoch": 0.6101593795278275, + "grad_norm": 32.41716727147717, + "learning_rate": 9.678110806004264e-06, + "loss": 4.3458, + "step": 7159 + }, + { + "epoch": 0.6102446092218529, + "grad_norm": 69.5048820780513, + "learning_rate": 9.677935747193335e-06, + "loss": 4.6573, + "step": 7160 + }, + { + "epoch": 0.6103298389158783, + "grad_norm": 51.69739617489367, + "learning_rate": 9.677760642376766e-06, + "loss": 3.8527, + "step": 7161 + }, + { + "epoch": 0.6104150686099037, + "grad_norm": 66.35177719646776, + "learning_rate": 9.67758549155628e-06, + "loss": 4.7622, + "step": 7162 + }, + { + "epoch": 0.610500298303929, + "grad_norm": 34.03452546984183, + "learning_rate": 9.677410294733598e-06, + "loss": 2.9313, + "step": 7163 + }, + { + "epoch": 0.6105855279979545, + "grad_norm": 30.615181624933214, + "learning_rate": 9.677235051910448e-06, + "loss": 3.8359, + "step": 7164 + }, + { + "epoch": 0.6106707576919799, + "grad_norm": 51.756202749122494, + "learning_rate": 9.677059763088547e-06, + "loss": 3.3016, + "step": 7165 + }, + { + "epoch": 0.6107559873860053, + "grad_norm": 110.30374052570426, + "learning_rate": 9.676884428269621e-06, + "loss": 3.0965, + "step": 7166 + }, + { + "epoch": 0.6108412170800307, + "grad_norm": 28.99603055527152, + "learning_rate": 9.676709047455397e-06, + "loss": 2.6947, + "step": 7167 + }, + { + "epoch": 0.610926446774056, + "grad_norm": 47.552217563401584, + "learning_rate": 9.676533620647598e-06, + "loss": 3.6964, + "step": 7168 + }, + { + "epoch": 0.6110116764680815, + "grad_norm": 37.73213287113133, + "learning_rate": 9.676358147847949e-06, + "loss": 4.0673, + "step": 7169 + }, + { + "epoch": 0.6110969061621069, + "grad_norm": 27.386709972967616, + "learning_rate": 9.676182629058174e-06, + "loss": 3.1031, + "step": 7170 + }, + { + "epoch": 0.6111821358561322, + "grad_norm": 54.67517182342505, + "learning_rate": 9.676007064280003e-06, + "loss": 4.66, + "step": 7171 + }, + { + "epoch": 0.6112673655501577, + "grad_norm": 48.22923551327239, + "learning_rate": 9.67583145351516e-06, + "loss": 4.7931, + "step": 7172 + }, + { + "epoch": 0.6113525952441831, + "grad_norm": 175.6389278181498, + "learning_rate": 9.675655796765374e-06, + "loss": 5.5231, + "step": 7173 + }, + { + "epoch": 0.6114378249382084, + "grad_norm": 33.36587517732967, + "learning_rate": 9.675480094032368e-06, + "loss": 3.7405, + "step": 7174 + }, + { + "epoch": 0.6115230546322339, + "grad_norm": 106.78701167469876, + "learning_rate": 9.675304345317875e-06, + "loss": 3.3853, + "step": 7175 + }, + { + "epoch": 0.6116082843262592, + "grad_norm": 34.020950169650554, + "learning_rate": 9.675128550623622e-06, + "loss": 3.8459, + "step": 7176 + }, + { + "epoch": 0.6116935140202847, + "grad_norm": 62.548472240224825, + "learning_rate": 9.674952709951336e-06, + "loss": 4.1337, + "step": 7177 + }, + { + "epoch": 0.6117787437143101, + "grad_norm": 41.51310672958709, + "learning_rate": 9.67477682330275e-06, + "loss": 4.3578, + "step": 7178 + }, + { + "epoch": 0.6118639734083354, + "grad_norm": 44.48738378655526, + "learning_rate": 9.674600890679588e-06, + "loss": 3.9805, + "step": 7179 + }, + { + "epoch": 0.6119492031023609, + "grad_norm": 38.69780132834238, + "learning_rate": 9.674424912083586e-06, + "loss": 4.7811, + "step": 7180 + }, + { + "epoch": 0.6120344327963863, + "grad_norm": 97.83516621542897, + "learning_rate": 9.674248887516473e-06, + "loss": 4.0227, + "step": 7181 + }, + { + "epoch": 0.6121196624904116, + "grad_norm": 29.305949737816313, + "learning_rate": 9.674072816979977e-06, + "loss": 3.3768, + "step": 7182 + }, + { + "epoch": 0.6122048921844371, + "grad_norm": 42.580987099148885, + "learning_rate": 9.673896700475835e-06, + "loss": 3.9217, + "step": 7183 + }, + { + "epoch": 0.6122901218784624, + "grad_norm": 34.931916344059715, + "learning_rate": 9.673720538005775e-06, + "loss": 3.8118, + "step": 7184 + }, + { + "epoch": 0.6123753515724879, + "grad_norm": 39.33274668779669, + "learning_rate": 9.673544329571531e-06, + "loss": 3.0907, + "step": 7185 + }, + { + "epoch": 0.6124605812665133, + "grad_norm": 52.75760708173821, + "learning_rate": 9.673368075174837e-06, + "loss": 3.1979, + "step": 7186 + }, + { + "epoch": 0.6125458109605386, + "grad_norm": 37.4160866399599, + "learning_rate": 9.673191774817422e-06, + "loss": 3.6577, + "step": 7187 + }, + { + "epoch": 0.6126310406545641, + "grad_norm": 81.27018442075915, + "learning_rate": 9.673015428501025e-06, + "loss": 4.6095, + "step": 7188 + }, + { + "epoch": 0.6127162703485894, + "grad_norm": 84.7115471905751, + "learning_rate": 9.672839036227377e-06, + "loss": 4.182, + "step": 7189 + }, + { + "epoch": 0.6128015000426148, + "grad_norm": 50.61237430078103, + "learning_rate": 9.672662597998214e-06, + "loss": 4.2013, + "step": 7190 + }, + { + "epoch": 0.6128867297366403, + "grad_norm": 47.00136502029159, + "learning_rate": 9.672486113815271e-06, + "loss": 3.4064, + "step": 7191 + }, + { + "epoch": 0.6129719594306656, + "grad_norm": 196.46400755570772, + "learning_rate": 9.672309583680284e-06, + "loss": 3.9098, + "step": 7192 + }, + { + "epoch": 0.6130571891246911, + "grad_norm": 41.84458578963474, + "learning_rate": 9.672133007594987e-06, + "loss": 4.0434, + "step": 7193 + }, + { + "epoch": 0.6131424188187165, + "grad_norm": 45.35036587809437, + "learning_rate": 9.671956385561122e-06, + "loss": 4.3897, + "step": 7194 + }, + { + "epoch": 0.6132276485127418, + "grad_norm": 36.80144013414823, + "learning_rate": 9.671779717580421e-06, + "loss": 3.4209, + "step": 7195 + }, + { + "epoch": 0.6133128782067673, + "grad_norm": 100.8885213996165, + "learning_rate": 9.67160300365462e-06, + "loss": 4.322, + "step": 7196 + }, + { + "epoch": 0.6133981079007926, + "grad_norm": 32.21545708867357, + "learning_rate": 9.671426243785464e-06, + "loss": 3.0684, + "step": 7197 + }, + { + "epoch": 0.613483337594818, + "grad_norm": 71.60183907220673, + "learning_rate": 9.671249437974686e-06, + "loss": 4.8282, + "step": 7198 + }, + { + "epoch": 0.6135685672888435, + "grad_norm": 30.241377211098936, + "learning_rate": 9.671072586224027e-06, + "loss": 3.1054, + "step": 7199 + }, + { + "epoch": 0.6136537969828688, + "grad_norm": 46.41289057341161, + "learning_rate": 9.670895688535223e-06, + "loss": 4.483, + "step": 7200 + }, + { + "epoch": 0.6137390266768943, + "grad_norm": 49.67040683643137, + "learning_rate": 9.670718744910015e-06, + "loss": 3.9101, + "step": 7201 + }, + { + "epoch": 0.6138242563709196, + "grad_norm": 38.39682586496809, + "learning_rate": 9.670541755350147e-06, + "loss": 3.1763, + "step": 7202 + }, + { + "epoch": 0.613909486064945, + "grad_norm": 73.08120021044162, + "learning_rate": 9.670364719857355e-06, + "loss": 5.016, + "step": 7203 + }, + { + "epoch": 0.6139947157589705, + "grad_norm": 27.007619807924634, + "learning_rate": 9.670187638433381e-06, + "loss": 3.0478, + "step": 7204 + }, + { + "epoch": 0.6140799454529958, + "grad_norm": 41.29027386803011, + "learning_rate": 9.67001051107997e-06, + "loss": 3.2829, + "step": 7205 + }, + { + "epoch": 0.6141651751470212, + "grad_norm": 66.27549526151466, + "learning_rate": 9.669833337798859e-06, + "loss": 3.6219, + "step": 7206 + }, + { + "epoch": 0.6142504048410466, + "grad_norm": 35.836339566596706, + "learning_rate": 9.669656118591793e-06, + "loss": 3.9949, + "step": 7207 + }, + { + "epoch": 0.614335634535072, + "grad_norm": 66.27661526147608, + "learning_rate": 9.669478853460515e-06, + "loss": 3.7878, + "step": 7208 + }, + { + "epoch": 0.6144208642290974, + "grad_norm": 75.05239783042762, + "learning_rate": 9.669301542406769e-06, + "loss": 4.0857, + "step": 7209 + }, + { + "epoch": 0.6145060939231228, + "grad_norm": 43.16569469308362, + "learning_rate": 9.669124185432296e-06, + "loss": 2.708, + "step": 7210 + }, + { + "epoch": 0.6145913236171482, + "grad_norm": 49.628786401024875, + "learning_rate": 9.668946782538845e-06, + "loss": 4.443, + "step": 7211 + }, + { + "epoch": 0.6146765533111737, + "grad_norm": 35.0466702392635, + "learning_rate": 9.668769333728154e-06, + "loss": 2.9339, + "step": 7212 + }, + { + "epoch": 0.614761783005199, + "grad_norm": 54.68451097226737, + "learning_rate": 9.668591839001973e-06, + "loss": 3.4253, + "step": 7213 + }, + { + "epoch": 0.6148470126992244, + "grad_norm": 63.453971355361695, + "learning_rate": 9.668414298362047e-06, + "loss": 4.2587, + "step": 7214 + }, + { + "epoch": 0.6149322423932498, + "grad_norm": 44.488694717778806, + "learning_rate": 9.66823671181012e-06, + "loss": 4.4251, + "step": 7215 + }, + { + "epoch": 0.6150174720872752, + "grad_norm": 38.52661191339547, + "learning_rate": 9.668059079347942e-06, + "loss": 3.4782, + "step": 7216 + }, + { + "epoch": 0.6151027017813006, + "grad_norm": 44.97814271002804, + "learning_rate": 9.667881400977256e-06, + "loss": 4.2682, + "step": 7217 + }, + { + "epoch": 0.615187931475326, + "grad_norm": 65.48093416602526, + "learning_rate": 9.667703676699813e-06, + "loss": 3.3524, + "step": 7218 + }, + { + "epoch": 0.6152731611693514, + "grad_norm": 27.719088544849637, + "learning_rate": 9.667525906517356e-06, + "loss": 3.3888, + "step": 7219 + }, + { + "epoch": 0.6153583908633768, + "grad_norm": 95.64653437285833, + "learning_rate": 9.667348090431639e-06, + "loss": 4.5941, + "step": 7220 + }, + { + "epoch": 0.6154436205574022, + "grad_norm": 46.47837234368994, + "learning_rate": 9.667170228444409e-06, + "loss": 3.032, + "step": 7221 + }, + { + "epoch": 0.6155288502514276, + "grad_norm": 65.81207447816993, + "learning_rate": 9.666992320557414e-06, + "loss": 4.4645, + "step": 7222 + }, + { + "epoch": 0.615614079945453, + "grad_norm": 42.49606867998147, + "learning_rate": 9.666814366772402e-06, + "loss": 3.9705, + "step": 7223 + }, + { + "epoch": 0.6156993096394784, + "grad_norm": 44.13565313954498, + "learning_rate": 9.666636367091127e-06, + "loss": 3.1898, + "step": 7224 + }, + { + "epoch": 0.6157845393335037, + "grad_norm": 192.3314674069077, + "learning_rate": 9.666458321515334e-06, + "loss": 4.4758, + "step": 7225 + }, + { + "epoch": 0.6158697690275292, + "grad_norm": 48.67072645452913, + "learning_rate": 9.666280230046782e-06, + "loss": 4.4073, + "step": 7226 + }, + { + "epoch": 0.6159549987215546, + "grad_norm": 89.7652646427792, + "learning_rate": 9.666102092687217e-06, + "loss": 5.0235, + "step": 7227 + }, + { + "epoch": 0.61604022841558, + "grad_norm": 53.303750477690535, + "learning_rate": 9.665923909438392e-06, + "loss": 3.2496, + "step": 7228 + }, + { + "epoch": 0.6161254581096054, + "grad_norm": 161.56428005008408, + "learning_rate": 9.665745680302058e-06, + "loss": 5.4142, + "step": 7229 + }, + { + "epoch": 0.6162106878036308, + "grad_norm": 73.26988460298593, + "learning_rate": 9.66556740527997e-06, + "loss": 4.8679, + "step": 7230 + }, + { + "epoch": 0.6162959174976562, + "grad_norm": 36.65882620168898, + "learning_rate": 9.665389084373882e-06, + "loss": 2.6351, + "step": 7231 + }, + { + "epoch": 0.6163811471916816, + "grad_norm": 77.55839409759773, + "learning_rate": 9.665210717585545e-06, + "loss": 4.2025, + "step": 7232 + }, + { + "epoch": 0.6164663768857069, + "grad_norm": 64.25193793416864, + "learning_rate": 9.665032304916715e-06, + "loss": 4.4055, + "step": 7233 + }, + { + "epoch": 0.6165516065797324, + "grad_norm": 80.48485343760936, + "learning_rate": 9.664853846369144e-06, + "loss": 4.3374, + "step": 7234 + }, + { + "epoch": 0.6166368362737578, + "grad_norm": 53.59629206241685, + "learning_rate": 9.664675341944591e-06, + "loss": 4.1126, + "step": 7235 + }, + { + "epoch": 0.6167220659677832, + "grad_norm": 32.173902086069546, + "learning_rate": 9.664496791644808e-06, + "loss": 3.7048, + "step": 7236 + }, + { + "epoch": 0.6168072956618086, + "grad_norm": 98.46152758272356, + "learning_rate": 9.664318195471554e-06, + "loss": 5.7931, + "step": 7237 + }, + { + "epoch": 0.6168925253558339, + "grad_norm": 54.69851785933915, + "learning_rate": 9.664139553426584e-06, + "loss": 4.7387, + "step": 7238 + }, + { + "epoch": 0.6169777550498594, + "grad_norm": 32.61871646428709, + "learning_rate": 9.663960865511655e-06, + "loss": 3.6887, + "step": 7239 + }, + { + "epoch": 0.6170629847438848, + "grad_norm": 131.03140766359684, + "learning_rate": 9.663782131728522e-06, + "loss": 4.5249, + "step": 7240 + }, + { + "epoch": 0.6171482144379101, + "grad_norm": 53.62999167734192, + "learning_rate": 9.663603352078948e-06, + "loss": 4.9497, + "step": 7241 + }, + { + "epoch": 0.6172334441319356, + "grad_norm": 81.7010955933114, + "learning_rate": 9.663424526564686e-06, + "loss": 4.0929, + "step": 7242 + }, + { + "epoch": 0.617318673825961, + "grad_norm": 164.09858463343195, + "learning_rate": 9.663245655187497e-06, + "loss": 4.3429, + "step": 7243 + }, + { + "epoch": 0.6174039035199863, + "grad_norm": 39.997882989904106, + "learning_rate": 9.66306673794914e-06, + "loss": 4.0177, + "step": 7244 + }, + { + "epoch": 0.6174891332140118, + "grad_norm": 84.61260952107689, + "learning_rate": 9.662887774851374e-06, + "loss": 5.1511, + "step": 7245 + }, + { + "epoch": 0.6175743629080371, + "grad_norm": 39.96101765649077, + "learning_rate": 9.662708765895963e-06, + "loss": 3.5537, + "step": 7246 + }, + { + "epoch": 0.6176595926020626, + "grad_norm": 54.55930043229549, + "learning_rate": 9.662529711084661e-06, + "loss": 4.1068, + "step": 7247 + }, + { + "epoch": 0.617744822296088, + "grad_norm": 53.18911027095126, + "learning_rate": 9.662350610419234e-06, + "loss": 4.1366, + "step": 7248 + }, + { + "epoch": 0.6178300519901133, + "grad_norm": 44.77831264804955, + "learning_rate": 9.662171463901439e-06, + "loss": 3.2935, + "step": 7249 + }, + { + "epoch": 0.6179152816841388, + "grad_norm": 40.09750734559976, + "learning_rate": 9.66199227153304e-06, + "loss": 4.1232, + "step": 7250 + }, + { + "epoch": 0.6180005113781641, + "grad_norm": 71.33510026680841, + "learning_rate": 9.661813033315802e-06, + "loss": 4.1356, + "step": 7251 + }, + { + "epoch": 0.6180857410721895, + "grad_norm": 44.94877539116093, + "learning_rate": 9.661633749251484e-06, + "loss": 4.1202, + "step": 7252 + }, + { + "epoch": 0.618170970766215, + "grad_norm": 61.99278129772301, + "learning_rate": 9.66145441934185e-06, + "loss": 2.8321, + "step": 7253 + }, + { + "epoch": 0.6182562004602403, + "grad_norm": 160.11718861973006, + "learning_rate": 9.661275043588665e-06, + "loss": 4.5669, + "step": 7254 + }, + { + "epoch": 0.6183414301542658, + "grad_norm": 48.36007176656276, + "learning_rate": 9.66109562199369e-06, + "loss": 3.637, + "step": 7255 + }, + { + "epoch": 0.6184266598482911, + "grad_norm": 37.795296421714816, + "learning_rate": 9.660916154558695e-06, + "loss": 3.5258, + "step": 7256 + }, + { + "epoch": 0.6185118895423165, + "grad_norm": 58.14979309332231, + "learning_rate": 9.66073664128544e-06, + "loss": 3.8184, + "step": 7257 + }, + { + "epoch": 0.618597119236342, + "grad_norm": 43.01857237955449, + "learning_rate": 9.660557082175692e-06, + "loss": 2.9549, + "step": 7258 + }, + { + "epoch": 0.6186823489303673, + "grad_norm": 154.89782106919395, + "learning_rate": 9.660377477231216e-06, + "loss": 4.9387, + "step": 7259 + }, + { + "epoch": 0.6187675786243927, + "grad_norm": 36.37093013755172, + "learning_rate": 9.660197826453782e-06, + "loss": 3.7324, + "step": 7260 + }, + { + "epoch": 0.6188528083184182, + "grad_norm": 47.57745746421499, + "learning_rate": 9.660018129845152e-06, + "loss": 3.4567, + "step": 7261 + }, + { + "epoch": 0.6189380380124435, + "grad_norm": 32.6813123447051, + "learning_rate": 9.659838387407096e-06, + "loss": 3.6707, + "step": 7262 + }, + { + "epoch": 0.619023267706469, + "grad_norm": 62.67011061963485, + "learning_rate": 9.659658599141381e-06, + "loss": 4.2578, + "step": 7263 + }, + { + "epoch": 0.6191084974004943, + "grad_norm": 52.570346809937696, + "learning_rate": 9.659478765049775e-06, + "loss": 3.4483, + "step": 7264 + }, + { + "epoch": 0.6191937270945197, + "grad_norm": 38.25593807070529, + "learning_rate": 9.659298885134048e-06, + "loss": 3.9195, + "step": 7265 + }, + { + "epoch": 0.6192789567885452, + "grad_norm": 46.63130615261444, + "learning_rate": 9.659118959395967e-06, + "loss": 4.2269, + "step": 7266 + }, + { + "epoch": 0.6193641864825705, + "grad_norm": 72.66413559888522, + "learning_rate": 9.658938987837301e-06, + "loss": 4.4485, + "step": 7267 + }, + { + "epoch": 0.6194494161765959, + "grad_norm": 53.772696675999136, + "learning_rate": 9.658758970459823e-06, + "loss": 5.1404, + "step": 7268 + }, + { + "epoch": 0.6195346458706213, + "grad_norm": 73.32182972775925, + "learning_rate": 9.658578907265302e-06, + "loss": 3.9245, + "step": 7269 + }, + { + "epoch": 0.6196198755646467, + "grad_norm": 46.19658606575703, + "learning_rate": 9.658398798255508e-06, + "loss": 3.8839, + "step": 7270 + }, + { + "epoch": 0.6197051052586722, + "grad_norm": 95.47766872414776, + "learning_rate": 9.658218643432213e-06, + "loss": 5.9228, + "step": 7271 + }, + { + "epoch": 0.6197903349526975, + "grad_norm": 61.48644725711559, + "learning_rate": 9.65803844279719e-06, + "loss": 4.8105, + "step": 7272 + }, + { + "epoch": 0.6198755646467229, + "grad_norm": 63.42864601854711, + "learning_rate": 9.657858196352208e-06, + "loss": 4.2601, + "step": 7273 + }, + { + "epoch": 0.6199607943407484, + "grad_norm": 72.81213452588088, + "learning_rate": 9.657677904099042e-06, + "loss": 4.2336, + "step": 7274 + }, + { + "epoch": 0.6200460240347737, + "grad_norm": 47.460339554169735, + "learning_rate": 9.657497566039464e-06, + "loss": 4.0831, + "step": 7275 + }, + { + "epoch": 0.6201312537287991, + "grad_norm": 76.37336163613693, + "learning_rate": 9.65731718217525e-06, + "loss": 4.443, + "step": 7276 + }, + { + "epoch": 0.6202164834228245, + "grad_norm": 37.62957221176526, + "learning_rate": 9.657136752508171e-06, + "loss": 3.7308, + "step": 7277 + }, + { + "epoch": 0.6203017131168499, + "grad_norm": 121.15123445969635, + "learning_rate": 9.656956277040002e-06, + "loss": 3.9843, + "step": 7278 + }, + { + "epoch": 0.6203869428108754, + "grad_norm": 31.881776394127733, + "learning_rate": 9.656775755772518e-06, + "loss": 3.0052, + "step": 7279 + }, + { + "epoch": 0.6204721725049007, + "grad_norm": 51.126566526450326, + "learning_rate": 9.656595188707497e-06, + "loss": 4.7225, + "step": 7280 + }, + { + "epoch": 0.6205574021989261, + "grad_norm": 48.527047824766456, + "learning_rate": 9.65641457584671e-06, + "loss": 4.6252, + "step": 7281 + }, + { + "epoch": 0.6206426318929515, + "grad_norm": 40.54281809543171, + "learning_rate": 9.65623391719194e-06, + "loss": 4.1167, + "step": 7282 + }, + { + "epoch": 0.6207278615869769, + "grad_norm": 140.2480240347197, + "learning_rate": 9.656053212744956e-06, + "loss": 2.6196, + "step": 7283 + }, + { + "epoch": 0.6208130912810023, + "grad_norm": 46.542880463338854, + "learning_rate": 9.65587246250754e-06, + "loss": 4.0175, + "step": 7284 + }, + { + "epoch": 0.6208983209750277, + "grad_norm": 32.41872917537419, + "learning_rate": 9.655691666481468e-06, + "loss": 3.7098, + "step": 7285 + }, + { + "epoch": 0.6209835506690531, + "grad_norm": 38.15840541834022, + "learning_rate": 9.65551082466852e-06, + "loss": 3.5153, + "step": 7286 + }, + { + "epoch": 0.6210687803630784, + "grad_norm": 41.94338729752454, + "learning_rate": 9.65532993707047e-06, + "loss": 3.687, + "step": 7287 + }, + { + "epoch": 0.6211540100571039, + "grad_norm": 54.55195632222559, + "learning_rate": 9.655149003689102e-06, + "loss": 3.6235, + "step": 7288 + }, + { + "epoch": 0.6212392397511293, + "grad_norm": 51.483997516972266, + "learning_rate": 9.654968024526194e-06, + "loss": 4.4191, + "step": 7289 + }, + { + "epoch": 0.6213244694451547, + "grad_norm": 34.64706250470334, + "learning_rate": 9.654786999583523e-06, + "loss": 3.9814, + "step": 7290 + }, + { + "epoch": 0.6214096991391801, + "grad_norm": 100.64384253756603, + "learning_rate": 9.654605928862872e-06, + "loss": 4.6988, + "step": 7291 + }, + { + "epoch": 0.6214949288332055, + "grad_norm": 61.09377701324651, + "learning_rate": 9.654424812366024e-06, + "loss": 3.464, + "step": 7292 + }, + { + "epoch": 0.6215801585272309, + "grad_norm": 63.34826227170092, + "learning_rate": 9.654243650094754e-06, + "loss": 4.6918, + "step": 7293 + }, + { + "epoch": 0.6216653882212563, + "grad_norm": 80.36161123714528, + "learning_rate": 9.65406244205085e-06, + "loss": 4.2427, + "step": 7294 + }, + { + "epoch": 0.6217506179152816, + "grad_norm": 43.93590449347085, + "learning_rate": 9.653881188236088e-06, + "loss": 3.7752, + "step": 7295 + }, + { + "epoch": 0.6218358476093071, + "grad_norm": 61.07407420132681, + "learning_rate": 9.653699888652256e-06, + "loss": 3.8519, + "step": 7296 + }, + { + "epoch": 0.6219210773033325, + "grad_norm": 102.41108421828837, + "learning_rate": 9.653518543301135e-06, + "loss": 5.1623, + "step": 7297 + }, + { + "epoch": 0.6220063069973579, + "grad_norm": 46.58000016660236, + "learning_rate": 9.653337152184506e-06, + "loss": 3.1285, + "step": 7298 + }, + { + "epoch": 0.6220915366913833, + "grad_norm": 43.16805911725296, + "learning_rate": 9.653155715304157e-06, + "loss": 4.2975, + "step": 7299 + }, + { + "epoch": 0.6221767663854086, + "grad_norm": 56.16782058902724, + "learning_rate": 9.652974232661871e-06, + "loss": 3.9105, + "step": 7300 + }, + { + "epoch": 0.6222619960794341, + "grad_norm": 86.22606645770708, + "learning_rate": 9.65279270425943e-06, + "loss": 4.3814, + "step": 7301 + }, + { + "epoch": 0.6223472257734595, + "grad_norm": 35.000743650545054, + "learning_rate": 9.652611130098624e-06, + "loss": 2.9469, + "step": 7302 + }, + { + "epoch": 0.6224324554674848, + "grad_norm": 47.01456284018394, + "learning_rate": 9.652429510181236e-06, + "loss": 4.3338, + "step": 7303 + }, + { + "epoch": 0.6225176851615103, + "grad_norm": 42.70766461698697, + "learning_rate": 9.65224784450905e-06, + "loss": 3.8125, + "step": 7304 + }, + { + "epoch": 0.6226029148555356, + "grad_norm": 45.82721780280522, + "learning_rate": 9.652066133083859e-06, + "loss": 4.1779, + "step": 7305 + }, + { + "epoch": 0.6226881445495611, + "grad_norm": 47.120995031793726, + "learning_rate": 9.651884375907443e-06, + "loss": 4.4184, + "step": 7306 + }, + { + "epoch": 0.6227733742435865, + "grad_norm": 45.16852778383039, + "learning_rate": 9.651702572981593e-06, + "loss": 4.0435, + "step": 7307 + }, + { + "epoch": 0.6228586039376118, + "grad_norm": 59.30937625279356, + "learning_rate": 9.651520724308099e-06, + "loss": 4.1976, + "step": 7308 + }, + { + "epoch": 0.6229438336316373, + "grad_norm": 84.35907162179163, + "learning_rate": 9.651338829888744e-06, + "loss": 4.2552, + "step": 7309 + }, + { + "epoch": 0.6230290633256627, + "grad_norm": 25.473459907642656, + "learning_rate": 9.651156889725322e-06, + "loss": 3.6275, + "step": 7310 + }, + { + "epoch": 0.623114293019688, + "grad_norm": 97.32945041386218, + "learning_rate": 9.650974903819618e-06, + "loss": 5.2589, + "step": 7311 + }, + { + "epoch": 0.6231995227137135, + "grad_norm": 28.46382997438068, + "learning_rate": 9.650792872173426e-06, + "loss": 2.8062, + "step": 7312 + }, + { + "epoch": 0.6232847524077388, + "grad_norm": 123.89248001226161, + "learning_rate": 9.650610794788534e-06, + "loss": 3.4356, + "step": 7313 + }, + { + "epoch": 0.6233699821017643, + "grad_norm": 30.908400801413737, + "learning_rate": 9.650428671666732e-06, + "loss": 3.8703, + "step": 7314 + }, + { + "epoch": 0.6234552117957897, + "grad_norm": 35.93646262941076, + "learning_rate": 9.650246502809812e-06, + "loss": 3.6214, + "step": 7315 + }, + { + "epoch": 0.623540441489815, + "grad_norm": 41.235717134625055, + "learning_rate": 9.650064288219566e-06, + "loss": 3.529, + "step": 7316 + }, + { + "epoch": 0.6236256711838405, + "grad_norm": 38.30849897912161, + "learning_rate": 9.649882027897785e-06, + "loss": 3.7496, + "step": 7317 + }, + { + "epoch": 0.6237109008778658, + "grad_norm": 34.02002097346317, + "learning_rate": 9.649699721846262e-06, + "loss": 3.3411, + "step": 7318 + }, + { + "epoch": 0.6237961305718912, + "grad_norm": 117.68543868804834, + "learning_rate": 9.64951737006679e-06, + "loss": 4.1122, + "step": 7319 + }, + { + "epoch": 0.6238813602659167, + "grad_norm": 41.35288573765617, + "learning_rate": 9.649334972561162e-06, + "loss": 4.7913, + "step": 7320 + }, + { + "epoch": 0.623966589959942, + "grad_norm": 53.93418477781512, + "learning_rate": 9.649152529331171e-06, + "loss": 2.8526, + "step": 7321 + }, + { + "epoch": 0.6240518196539674, + "grad_norm": 72.26616319194689, + "learning_rate": 9.648970040378614e-06, + "loss": 4.3231, + "step": 7322 + }, + { + "epoch": 0.6241370493479929, + "grad_norm": 52.8696257683702, + "learning_rate": 9.648787505705284e-06, + "loss": 2.4135, + "step": 7323 + }, + { + "epoch": 0.6242222790420182, + "grad_norm": 58.90473437475137, + "learning_rate": 9.648604925312975e-06, + "loss": 4.0301, + "step": 7324 + }, + { + "epoch": 0.6243075087360437, + "grad_norm": 37.17492737381906, + "learning_rate": 9.648422299203485e-06, + "loss": 2.6139, + "step": 7325 + }, + { + "epoch": 0.624392738430069, + "grad_norm": 68.69398153695283, + "learning_rate": 9.648239627378609e-06, + "loss": 3.7553, + "step": 7326 + }, + { + "epoch": 0.6244779681240944, + "grad_norm": 46.567924648880314, + "learning_rate": 9.648056909840142e-06, + "loss": 4.3093, + "step": 7327 + }, + { + "epoch": 0.6245631978181199, + "grad_norm": 74.41846595559755, + "learning_rate": 9.647874146589882e-06, + "loss": 4.0945, + "step": 7328 + }, + { + "epoch": 0.6246484275121452, + "grad_norm": 48.847199504167264, + "learning_rate": 9.647691337629626e-06, + "loss": 3.3393, + "step": 7329 + }, + { + "epoch": 0.6247336572061706, + "grad_norm": 43.24565411823729, + "learning_rate": 9.647508482961174e-06, + "loss": 3.2242, + "step": 7330 + }, + { + "epoch": 0.624818886900196, + "grad_norm": 64.76075575789827, + "learning_rate": 9.647325582586322e-06, + "loss": 4.5715, + "step": 7331 + }, + { + "epoch": 0.6249041165942214, + "grad_norm": 125.70206448430665, + "learning_rate": 9.647142636506872e-06, + "loss": 4.0383, + "step": 7332 + }, + { + "epoch": 0.6249893462882469, + "grad_norm": 38.186425570820646, + "learning_rate": 9.646959644724619e-06, + "loss": 2.3372, + "step": 7333 + }, + { + "epoch": 0.6250745759822722, + "grad_norm": 168.28818452569584, + "learning_rate": 9.646776607241364e-06, + "loss": 4.7565, + "step": 7334 + }, + { + "epoch": 0.6251598056762976, + "grad_norm": 74.44660566607158, + "learning_rate": 9.646593524058908e-06, + "loss": 6.1523, + "step": 7335 + }, + { + "epoch": 0.625245035370323, + "grad_norm": 47.514570120117014, + "learning_rate": 9.64641039517905e-06, + "loss": 2.9419, + "step": 7336 + }, + { + "epoch": 0.6253302650643484, + "grad_norm": 107.58521584919197, + "learning_rate": 9.646227220603592e-06, + "loss": 4.5564, + "step": 7337 + }, + { + "epoch": 0.6254154947583738, + "grad_norm": 45.34241110439691, + "learning_rate": 9.646044000334337e-06, + "loss": 4.4616, + "step": 7338 + }, + { + "epoch": 0.6255007244523992, + "grad_norm": 51.83932261709796, + "learning_rate": 9.645860734373085e-06, + "loss": 4.4214, + "step": 7339 + }, + { + "epoch": 0.6255859541464246, + "grad_norm": 89.71872915629464, + "learning_rate": 9.645677422721638e-06, + "loss": 4.726, + "step": 7340 + }, + { + "epoch": 0.6256711838404501, + "grad_norm": 44.42328843599023, + "learning_rate": 9.6454940653818e-06, + "loss": 3.0281, + "step": 7341 + }, + { + "epoch": 0.6257564135344754, + "grad_norm": 217.51704265319017, + "learning_rate": 9.645310662355373e-06, + "loss": 4.1767, + "step": 7342 + }, + { + "epoch": 0.6258416432285008, + "grad_norm": 34.86878782419425, + "learning_rate": 9.645127213644163e-06, + "loss": 3.682, + "step": 7343 + }, + { + "epoch": 0.6259268729225262, + "grad_norm": 37.56209228709143, + "learning_rate": 9.644943719249973e-06, + "loss": 3.3895, + "step": 7344 + }, + { + "epoch": 0.6260121026165516, + "grad_norm": 192.3257172907942, + "learning_rate": 9.644760179174606e-06, + "loss": 4.3825, + "step": 7345 + }, + { + "epoch": 0.626097332310577, + "grad_norm": 46.17830277327857, + "learning_rate": 9.644576593419868e-06, + "loss": 3.6613, + "step": 7346 + }, + { + "epoch": 0.6261825620046024, + "grad_norm": 39.72388229470485, + "learning_rate": 9.644392961987566e-06, + "loss": 3.987, + "step": 7347 + }, + { + "epoch": 0.6262677916986278, + "grad_norm": 104.51029942936178, + "learning_rate": 9.644209284879504e-06, + "loss": 5.1053, + "step": 7348 + }, + { + "epoch": 0.6263530213926533, + "grad_norm": 27.862862882827976, + "learning_rate": 9.64402556209749e-06, + "loss": 3.4616, + "step": 7349 + }, + { + "epoch": 0.6264382510866786, + "grad_norm": 32.437403634270126, + "learning_rate": 9.643841793643328e-06, + "loss": 3.8952, + "step": 7350 + }, + { + "epoch": 0.626523480780704, + "grad_norm": 48.219401425190384, + "learning_rate": 9.64365797951883e-06, + "loss": 3.7703, + "step": 7351 + }, + { + "epoch": 0.6266087104747294, + "grad_norm": 49.48553539638266, + "learning_rate": 9.643474119725799e-06, + "loss": 3.972, + "step": 7352 + }, + { + "epoch": 0.6266939401687548, + "grad_norm": 31.359381281482214, + "learning_rate": 9.643290214266046e-06, + "loss": 3.5418, + "step": 7353 + }, + { + "epoch": 0.6267791698627801, + "grad_norm": 54.93344604536314, + "learning_rate": 9.643106263141377e-06, + "loss": 2.982, + "step": 7354 + }, + { + "epoch": 0.6268643995568056, + "grad_norm": 31.549039644293217, + "learning_rate": 9.642922266353605e-06, + "loss": 3.5437, + "step": 7355 + }, + { + "epoch": 0.626949629250831, + "grad_norm": 97.15015030882819, + "learning_rate": 9.642738223904537e-06, + "loss": 4.9636, + "step": 7356 + }, + { + "epoch": 0.6270348589448563, + "grad_norm": 32.90381791173384, + "learning_rate": 9.642554135795982e-06, + "loss": 3.594, + "step": 7357 + }, + { + "epoch": 0.6271200886388818, + "grad_norm": 105.32279847788233, + "learning_rate": 9.642370002029754e-06, + "loss": 5.0885, + "step": 7358 + }, + { + "epoch": 0.6272053183329072, + "grad_norm": 40.78714061549743, + "learning_rate": 9.64218582260766e-06, + "loss": 3.9827, + "step": 7359 + }, + { + "epoch": 0.6272905480269326, + "grad_norm": 37.880393487298655, + "learning_rate": 9.642001597531515e-06, + "loss": 4.2312, + "step": 7360 + }, + { + "epoch": 0.627375777720958, + "grad_norm": 61.59865768003733, + "learning_rate": 9.641817326803127e-06, + "loss": 4.0394, + "step": 7361 + }, + { + "epoch": 0.6274610074149833, + "grad_norm": 76.59041945600761, + "learning_rate": 9.64163301042431e-06, + "loss": 5.5129, + "step": 7362 + }, + { + "epoch": 0.6275462371090088, + "grad_norm": 38.036369779609174, + "learning_rate": 9.641448648396879e-06, + "loss": 4.0894, + "step": 7363 + }, + { + "epoch": 0.6276314668030342, + "grad_norm": 68.71393969636733, + "learning_rate": 9.641264240722642e-06, + "loss": 3.9784, + "step": 7364 + }, + { + "epoch": 0.6277166964970595, + "grad_norm": 62.74043425679161, + "learning_rate": 9.641079787403418e-06, + "loss": 4.8823, + "step": 7365 + }, + { + "epoch": 0.627801926191085, + "grad_norm": 107.14232601277826, + "learning_rate": 9.640895288441016e-06, + "loss": 5.0397, + "step": 7366 + }, + { + "epoch": 0.6278871558851103, + "grad_norm": 126.32410785832626, + "learning_rate": 9.640710743837254e-06, + "loss": 3.7503, + "step": 7367 + }, + { + "epoch": 0.6279723855791358, + "grad_norm": 69.11294266121507, + "learning_rate": 9.640526153593946e-06, + "loss": 3.7513, + "step": 7368 + }, + { + "epoch": 0.6280576152731612, + "grad_norm": 60.26590620094862, + "learning_rate": 9.640341517712907e-06, + "loss": 5.2567, + "step": 7369 + }, + { + "epoch": 0.6281428449671865, + "grad_norm": 28.086347108807836, + "learning_rate": 9.640156836195953e-06, + "loss": 3.7313, + "step": 7370 + }, + { + "epoch": 0.628228074661212, + "grad_norm": 53.68818940180779, + "learning_rate": 9.639972109044902e-06, + "loss": 3.7825, + "step": 7371 + }, + { + "epoch": 0.6283133043552374, + "grad_norm": 73.33819008780607, + "learning_rate": 9.639787336261567e-06, + "loss": 3.0884, + "step": 7372 + }, + { + "epoch": 0.6283985340492627, + "grad_norm": 51.94956625300336, + "learning_rate": 9.639602517847769e-06, + "loss": 4.7323, + "step": 7373 + }, + { + "epoch": 0.6284837637432882, + "grad_norm": 198.91377194894622, + "learning_rate": 9.63941765380532e-06, + "loss": 5.24, + "step": 7374 + }, + { + "epoch": 0.6285689934373135, + "grad_norm": 70.27842947438452, + "learning_rate": 9.639232744136047e-06, + "loss": 4.1287, + "step": 7375 + }, + { + "epoch": 0.628654223131339, + "grad_norm": 30.76649526252509, + "learning_rate": 9.63904778884176e-06, + "loss": 3.2469, + "step": 7376 + }, + { + "epoch": 0.6287394528253644, + "grad_norm": 51.50595910639632, + "learning_rate": 9.638862787924284e-06, + "loss": 4.7496, + "step": 7377 + }, + { + "epoch": 0.6288246825193897, + "grad_norm": 34.983335328267195, + "learning_rate": 9.638677741385433e-06, + "loss": 3.6584, + "step": 7378 + }, + { + "epoch": 0.6289099122134152, + "grad_norm": 52.7881653596717, + "learning_rate": 9.638492649227031e-06, + "loss": 4.1965, + "step": 7379 + }, + { + "epoch": 0.6289951419074405, + "grad_norm": 356.0659052110609, + "learning_rate": 9.638307511450897e-06, + "loss": 6.1259, + "step": 7380 + }, + { + "epoch": 0.6290803716014659, + "grad_norm": 45.28548250544312, + "learning_rate": 9.63812232805885e-06, + "loss": 4.11, + "step": 7381 + }, + { + "epoch": 0.6291656012954914, + "grad_norm": 82.64540283343162, + "learning_rate": 9.637937099052715e-06, + "loss": 6.3306, + "step": 7382 + }, + { + "epoch": 0.6292508309895167, + "grad_norm": 50.9403035087247, + "learning_rate": 9.637751824434312e-06, + "loss": 4.0738, + "step": 7383 + }, + { + "epoch": 0.6293360606835422, + "grad_norm": 68.81662023511468, + "learning_rate": 9.637566504205462e-06, + "loss": 3.5799, + "step": 7384 + }, + { + "epoch": 0.6294212903775676, + "grad_norm": 32.94465975861659, + "learning_rate": 9.637381138367987e-06, + "loss": 2.9994, + "step": 7385 + }, + { + "epoch": 0.6295065200715929, + "grad_norm": 89.11573553102005, + "learning_rate": 9.637195726923712e-06, + "loss": 5.7629, + "step": 7386 + }, + { + "epoch": 0.6295917497656184, + "grad_norm": 52.34793560226151, + "learning_rate": 9.63701026987446e-06, + "loss": 4.7646, + "step": 7387 + }, + { + "epoch": 0.6296769794596437, + "grad_norm": 111.41622383280742, + "learning_rate": 9.636824767222055e-06, + "loss": 5.4289, + "step": 7388 + }, + { + "epoch": 0.6297622091536691, + "grad_norm": 44.71756988383436, + "learning_rate": 9.63663921896832e-06, + "loss": 3.5752, + "step": 7389 + }, + { + "epoch": 0.6298474388476946, + "grad_norm": 33.88307041905652, + "learning_rate": 9.63645362511508e-06, + "loss": 3.8678, + "step": 7390 + }, + { + "epoch": 0.6299326685417199, + "grad_norm": 53.382728105471855, + "learning_rate": 9.636267985664163e-06, + "loss": 4.0081, + "step": 7391 + }, + { + "epoch": 0.6300178982357454, + "grad_norm": 75.5505426412503, + "learning_rate": 9.63608230061739e-06, + "loss": 4.499, + "step": 7392 + }, + { + "epoch": 0.6301031279297707, + "grad_norm": 50.181410252614896, + "learning_rate": 9.635896569976593e-06, + "loss": 4.0124, + "step": 7393 + }, + { + "epoch": 0.6301883576237961, + "grad_norm": 63.082881538591785, + "learning_rate": 9.635710793743593e-06, + "loss": 4.3979, + "step": 7394 + }, + { + "epoch": 0.6302735873178216, + "grad_norm": 38.558993968684184, + "learning_rate": 9.63552497192022e-06, + "loss": 2.7524, + "step": 7395 + }, + { + "epoch": 0.6303588170118469, + "grad_norm": 59.952294811655655, + "learning_rate": 9.635339104508301e-06, + "loss": 4.429, + "step": 7396 + }, + { + "epoch": 0.6304440467058723, + "grad_norm": 30.571663226304242, + "learning_rate": 9.635153191509664e-06, + "loss": 2.998, + "step": 7397 + }, + { + "epoch": 0.6305292763998978, + "grad_norm": 37.00475334148108, + "learning_rate": 9.634967232926137e-06, + "loss": 3.5582, + "step": 7398 + }, + { + "epoch": 0.6306145060939231, + "grad_norm": 44.40500065408644, + "learning_rate": 9.63478122875955e-06, + "loss": 3.6448, + "step": 7399 + }, + { + "epoch": 0.6306997357879485, + "grad_norm": 30.28617620816391, + "learning_rate": 9.634595179011728e-06, + "loss": 3.1194, + "step": 7400 + }, + { + "epoch": 0.6307849654819739, + "grad_norm": 54.17531807946968, + "learning_rate": 9.634409083684507e-06, + "loss": 3.8778, + "step": 7401 + }, + { + "epoch": 0.6308701951759993, + "grad_norm": 61.77373718293438, + "learning_rate": 9.634222942779714e-06, + "loss": 3.6559, + "step": 7402 + }, + { + "epoch": 0.6309554248700248, + "grad_norm": 90.19048748037504, + "learning_rate": 9.63403675629918e-06, + "loss": 2.5816, + "step": 7403 + }, + { + "epoch": 0.6310406545640501, + "grad_norm": 78.0075771666112, + "learning_rate": 9.633850524244735e-06, + "loss": 5.7399, + "step": 7404 + }, + { + "epoch": 0.6311258842580755, + "grad_norm": 32.30544463142163, + "learning_rate": 9.63366424661821e-06, + "loss": 4.4026, + "step": 7405 + }, + { + "epoch": 0.6312111139521009, + "grad_norm": 35.75734745817726, + "learning_rate": 9.63347792342144e-06, + "loss": 3.8857, + "step": 7406 + }, + { + "epoch": 0.6312963436461263, + "grad_norm": 68.3221346190525, + "learning_rate": 9.633291554656256e-06, + "loss": 4.7325, + "step": 7407 + }, + { + "epoch": 0.6313815733401517, + "grad_norm": 42.406735863849526, + "learning_rate": 9.63310514032449e-06, + "loss": 3.7047, + "step": 7408 + }, + { + "epoch": 0.6314668030341771, + "grad_norm": 59.70448297830624, + "learning_rate": 9.632918680427977e-06, + "loss": 4.2055, + "step": 7409 + }, + { + "epoch": 0.6315520327282025, + "grad_norm": 56.824649780874, + "learning_rate": 9.63273217496855e-06, + "loss": 4.2921, + "step": 7410 + }, + { + "epoch": 0.631637262422228, + "grad_norm": 36.34285600000094, + "learning_rate": 9.632545623948041e-06, + "loss": 3.899, + "step": 7411 + }, + { + "epoch": 0.6317224921162533, + "grad_norm": 49.75272411889444, + "learning_rate": 9.632359027368287e-06, + "loss": 4.2747, + "step": 7412 + }, + { + "epoch": 0.6318077218102787, + "grad_norm": 39.52777896520794, + "learning_rate": 9.632172385231124e-06, + "loss": 3.8166, + "step": 7413 + }, + { + "epoch": 0.6318929515043041, + "grad_norm": 61.05271188083879, + "learning_rate": 9.631985697538386e-06, + "loss": 4.895, + "step": 7414 + }, + { + "epoch": 0.6319781811983295, + "grad_norm": 45.57259353039852, + "learning_rate": 9.631798964291907e-06, + "loss": 3.8626, + "step": 7415 + }, + { + "epoch": 0.6320634108923548, + "grad_norm": 42.670796876070284, + "learning_rate": 9.631612185493528e-06, + "loss": 3.9323, + "step": 7416 + }, + { + "epoch": 0.6321486405863803, + "grad_norm": 48.394205733777056, + "learning_rate": 9.631425361145083e-06, + "loss": 4.1699, + "step": 7417 + }, + { + "epoch": 0.6322338702804057, + "grad_norm": 77.80598671809129, + "learning_rate": 9.63123849124841e-06, + "loss": 5.6629, + "step": 7418 + }, + { + "epoch": 0.6323190999744311, + "grad_norm": 44.346925422655154, + "learning_rate": 9.631051575805346e-06, + "loss": 4.2787, + "step": 7419 + }, + { + "epoch": 0.6324043296684565, + "grad_norm": 53.00039394514578, + "learning_rate": 9.630864614817733e-06, + "loss": 4.1563, + "step": 7420 + }, + { + "epoch": 0.6324895593624819, + "grad_norm": 62.7380668619435, + "learning_rate": 9.630677608287402e-06, + "loss": 3.3212, + "step": 7421 + }, + { + "epoch": 0.6325747890565073, + "grad_norm": 46.86499320524458, + "learning_rate": 9.630490556216199e-06, + "loss": 3.9995, + "step": 7422 + }, + { + "epoch": 0.6326600187505327, + "grad_norm": 45.64049518148216, + "learning_rate": 9.630303458605962e-06, + "loss": 4.373, + "step": 7423 + }, + { + "epoch": 0.632745248444558, + "grad_norm": 55.54557561548787, + "learning_rate": 9.630116315458529e-06, + "loss": 4.5149, + "step": 7424 + }, + { + "epoch": 0.6328304781385835, + "grad_norm": 87.40888735082204, + "learning_rate": 9.629929126775743e-06, + "loss": 4.3196, + "step": 7425 + }, + { + "epoch": 0.6329157078326089, + "grad_norm": 113.82197476070246, + "learning_rate": 9.629741892559443e-06, + "loss": 6.1119, + "step": 7426 + }, + { + "epoch": 0.6330009375266343, + "grad_norm": 78.91932924628816, + "learning_rate": 9.629554612811472e-06, + "loss": 3.648, + "step": 7427 + }, + { + "epoch": 0.6330861672206597, + "grad_norm": 59.569344318787024, + "learning_rate": 9.629367287533672e-06, + "loss": 4.1314, + "step": 7428 + }, + { + "epoch": 0.633171396914685, + "grad_norm": 57.64602112036456, + "learning_rate": 9.629179916727882e-06, + "loss": 4.5152, + "step": 7429 + }, + { + "epoch": 0.6332566266087105, + "grad_norm": 42.746170475116635, + "learning_rate": 9.628992500395947e-06, + "loss": 4.2518, + "step": 7430 + }, + { + "epoch": 0.6333418563027359, + "grad_norm": 55.70609409598789, + "learning_rate": 9.628805038539711e-06, + "loss": 3.9091, + "step": 7431 + }, + { + "epoch": 0.6334270859967612, + "grad_norm": 70.75033167722785, + "learning_rate": 9.628617531161018e-06, + "loss": 5.8086, + "step": 7432 + }, + { + "epoch": 0.6335123156907867, + "grad_norm": 45.92699666732321, + "learning_rate": 9.628429978261709e-06, + "loss": 3.8704, + "step": 7433 + }, + { + "epoch": 0.633597545384812, + "grad_norm": 38.11323758287769, + "learning_rate": 9.62824237984363e-06, + "loss": 3.1339, + "step": 7434 + }, + { + "epoch": 0.6336827750788374, + "grad_norm": 47.449592908622996, + "learning_rate": 9.628054735908627e-06, + "loss": 4.2042, + "step": 7435 + }, + { + "epoch": 0.6337680047728629, + "grad_norm": 32.574212746990305, + "learning_rate": 9.627867046458545e-06, + "loss": 3.1053, + "step": 7436 + }, + { + "epoch": 0.6338532344668882, + "grad_norm": 53.8783426204392, + "learning_rate": 9.62767931149523e-06, + "loss": 4.6883, + "step": 7437 + }, + { + "epoch": 0.6339384641609137, + "grad_norm": 43.005933310529734, + "learning_rate": 9.627491531020526e-06, + "loss": 4.6886, + "step": 7438 + }, + { + "epoch": 0.6340236938549391, + "grad_norm": 27.789000777155, + "learning_rate": 9.627303705036283e-06, + "loss": 3.6641, + "step": 7439 + }, + { + "epoch": 0.6341089235489644, + "grad_norm": 787.1449944337822, + "learning_rate": 9.627115833544346e-06, + "loss": 4.329, + "step": 7440 + }, + { + "epoch": 0.6341941532429899, + "grad_norm": 29.184656563514135, + "learning_rate": 9.626927916546564e-06, + "loss": 3.8399, + "step": 7441 + }, + { + "epoch": 0.6342793829370152, + "grad_norm": 258.96290341105896, + "learning_rate": 9.626739954044785e-06, + "loss": 1.6148, + "step": 7442 + }, + { + "epoch": 0.6343646126310406, + "grad_norm": 59.44104966201389, + "learning_rate": 9.626551946040856e-06, + "loss": 4.6696, + "step": 7443 + }, + { + "epoch": 0.6344498423250661, + "grad_norm": 73.52081609769581, + "learning_rate": 9.626363892536628e-06, + "loss": 5.6398, + "step": 7444 + }, + { + "epoch": 0.6345350720190914, + "grad_norm": 30.33850160562456, + "learning_rate": 9.626175793533948e-06, + "loss": 4.245, + "step": 7445 + }, + { + "epoch": 0.6346203017131169, + "grad_norm": 40.16448706176597, + "learning_rate": 9.62598764903467e-06, + "loss": 2.7444, + "step": 7446 + }, + { + "epoch": 0.6347055314071423, + "grad_norm": 66.46182509123861, + "learning_rate": 9.625799459040638e-06, + "loss": 4.17, + "step": 7447 + }, + { + "epoch": 0.6347907611011676, + "grad_norm": 54.73720912583829, + "learning_rate": 9.625611223553708e-06, + "loss": 3.3905, + "step": 7448 + }, + { + "epoch": 0.6348759907951931, + "grad_norm": 234.21601271862752, + "learning_rate": 9.62542294257573e-06, + "loss": 4.1806, + "step": 7449 + }, + { + "epoch": 0.6349612204892184, + "grad_norm": 32.979570658088456, + "learning_rate": 9.625234616108556e-06, + "loss": 3.6508, + "step": 7450 + }, + { + "epoch": 0.6350464501832438, + "grad_norm": 62.83800946720259, + "learning_rate": 9.625046244154038e-06, + "loss": 3.6668, + "step": 7451 + }, + { + "epoch": 0.6351316798772693, + "grad_norm": 76.36003840318912, + "learning_rate": 9.624857826714025e-06, + "loss": 4.4428, + "step": 7452 + }, + { + "epoch": 0.6352169095712946, + "grad_norm": 62.730190336978545, + "learning_rate": 9.624669363790376e-06, + "loss": 4.0185, + "step": 7453 + }, + { + "epoch": 0.6353021392653201, + "grad_norm": 64.86552635609105, + "learning_rate": 9.624480855384939e-06, + "loss": 4.4847, + "step": 7454 + }, + { + "epoch": 0.6353873689593454, + "grad_norm": 70.09293316772674, + "learning_rate": 9.624292301499572e-06, + "loss": 4.3543, + "step": 7455 + }, + { + "epoch": 0.6354725986533708, + "grad_norm": 82.12106115299272, + "learning_rate": 9.624103702136127e-06, + "loss": 3.524, + "step": 7456 + }, + { + "epoch": 0.6355578283473963, + "grad_norm": 123.5367952035071, + "learning_rate": 9.62391505729646e-06, + "loss": 4.6286, + "step": 7457 + }, + { + "epoch": 0.6356430580414216, + "grad_norm": 52.819286871132455, + "learning_rate": 9.623726366982425e-06, + "loss": 2.5319, + "step": 7458 + }, + { + "epoch": 0.635728287735447, + "grad_norm": 58.689612489077085, + "learning_rate": 9.62353763119588e-06, + "loss": 3.994, + "step": 7459 + }, + { + "epoch": 0.6358135174294725, + "grad_norm": 32.741820013678876, + "learning_rate": 9.623348849938678e-06, + "loss": 3.719, + "step": 7460 + }, + { + "epoch": 0.6358987471234978, + "grad_norm": 34.29115590847662, + "learning_rate": 9.623160023212679e-06, + "loss": 4.6705, + "step": 7461 + }, + { + "epoch": 0.6359839768175233, + "grad_norm": 61.35410858250289, + "learning_rate": 9.622971151019737e-06, + "loss": 4.778, + "step": 7462 + }, + { + "epoch": 0.6360692065115486, + "grad_norm": 72.6486717014933, + "learning_rate": 9.62278223336171e-06, + "loss": 3.8066, + "step": 7463 + }, + { + "epoch": 0.636154436205574, + "grad_norm": 34.99346107764678, + "learning_rate": 9.622593270240459e-06, + "loss": 3.8185, + "step": 7464 + }, + { + "epoch": 0.6362396658995995, + "grad_norm": 81.01157875740144, + "learning_rate": 9.622404261657838e-06, + "loss": 3.4748, + "step": 7465 + }, + { + "epoch": 0.6363248955936248, + "grad_norm": 32.008925473454134, + "learning_rate": 9.622215207615709e-06, + "loss": 3.7249, + "step": 7466 + }, + { + "epoch": 0.6364101252876502, + "grad_norm": 30.960667670896267, + "learning_rate": 9.622026108115929e-06, + "loss": 3.2749, + "step": 7467 + }, + { + "epoch": 0.6364953549816756, + "grad_norm": 76.45356552558816, + "learning_rate": 9.621836963160358e-06, + "loss": 4.9754, + "step": 7468 + }, + { + "epoch": 0.636580584675701, + "grad_norm": 30.211407350670417, + "learning_rate": 9.62164777275086e-06, + "loss": 3.1029, + "step": 7469 + }, + { + "epoch": 0.6366658143697264, + "grad_norm": 49.56871792478299, + "learning_rate": 9.62145853688929e-06, + "loss": 3.97, + "step": 7470 + }, + { + "epoch": 0.6367510440637518, + "grad_norm": 102.21517728571123, + "learning_rate": 9.621269255577512e-06, + "loss": 5.5343, + "step": 7471 + }, + { + "epoch": 0.6368362737577772, + "grad_norm": 60.78482639254117, + "learning_rate": 9.621079928817388e-06, + "loss": 4.4577, + "step": 7472 + }, + { + "epoch": 0.6369215034518027, + "grad_norm": 33.85141658434687, + "learning_rate": 9.62089055661078e-06, + "loss": 3.5487, + "step": 7473 + }, + { + "epoch": 0.637006733145828, + "grad_norm": 33.70122785650399, + "learning_rate": 9.620701138959547e-06, + "loss": 3.7062, + "step": 7474 + }, + { + "epoch": 0.6370919628398534, + "grad_norm": 72.42155829472473, + "learning_rate": 9.620511675865556e-06, + "loss": 3.4056, + "step": 7475 + }, + { + "epoch": 0.6371771925338788, + "grad_norm": 50.88759288160079, + "learning_rate": 9.62032216733067e-06, + "loss": 4.0541, + "step": 7476 + }, + { + "epoch": 0.6372624222279042, + "grad_norm": 33.034839081976685, + "learning_rate": 9.62013261335675e-06, + "loss": 3.3544, + "step": 7477 + }, + { + "epoch": 0.6373476519219295, + "grad_norm": 51.20433996492929, + "learning_rate": 9.61994301394566e-06, + "loss": 4.2447, + "step": 7478 + }, + { + "epoch": 0.637432881615955, + "grad_norm": 82.7726824004558, + "learning_rate": 9.61975336909927e-06, + "loss": 5.7222, + "step": 7479 + }, + { + "epoch": 0.6375181113099804, + "grad_norm": 38.18964228607455, + "learning_rate": 9.619563678819438e-06, + "loss": 3.9198, + "step": 7480 + }, + { + "epoch": 0.6376033410040058, + "grad_norm": 114.00076138195924, + "learning_rate": 9.619373943108033e-06, + "loss": 4.8342, + "step": 7481 + }, + { + "epoch": 0.6376885706980312, + "grad_norm": 38.60560482067813, + "learning_rate": 9.619184161966922e-06, + "loss": 3.508, + "step": 7482 + }, + { + "epoch": 0.6377738003920566, + "grad_norm": 43.62146197040192, + "learning_rate": 9.618994335397972e-06, + "loss": 3.6956, + "step": 7483 + }, + { + "epoch": 0.637859030086082, + "grad_norm": 62.45866420716975, + "learning_rate": 9.618804463403044e-06, + "loss": 4.2227, + "step": 7484 + }, + { + "epoch": 0.6379442597801074, + "grad_norm": 51.43428116583457, + "learning_rate": 9.618614545984014e-06, + "loss": 3.8122, + "step": 7485 + }, + { + "epoch": 0.6380294894741327, + "grad_norm": 51.39639404497483, + "learning_rate": 9.618424583142741e-06, + "loss": 4.5092, + "step": 7486 + }, + { + "epoch": 0.6381147191681582, + "grad_norm": 43.80338298648904, + "learning_rate": 9.6182345748811e-06, + "loss": 3.493, + "step": 7487 + }, + { + "epoch": 0.6381999488621836, + "grad_norm": 39.59005888435336, + "learning_rate": 9.618044521200955e-06, + "loss": 3.9187, + "step": 7488 + }, + { + "epoch": 0.638285178556209, + "grad_norm": 38.637968956445, + "learning_rate": 9.617854422104179e-06, + "loss": 4.3843, + "step": 7489 + }, + { + "epoch": 0.6383704082502344, + "grad_norm": 141.45582871516368, + "learning_rate": 9.61766427759264e-06, + "loss": 4.1611, + "step": 7490 + }, + { + "epoch": 0.6384556379442597, + "grad_norm": 65.13387053566237, + "learning_rate": 9.617474087668206e-06, + "loss": 4.3459, + "step": 7491 + }, + { + "epoch": 0.6385408676382852, + "grad_norm": 56.189896444061716, + "learning_rate": 9.61728385233275e-06, + "loss": 3.9515, + "step": 7492 + }, + { + "epoch": 0.6386260973323106, + "grad_norm": 42.65579229197976, + "learning_rate": 9.617093571588144e-06, + "loss": 4.1056, + "step": 7493 + }, + { + "epoch": 0.6387113270263359, + "grad_norm": 60.07913433124987, + "learning_rate": 9.616903245436252e-06, + "loss": 3.7678, + "step": 7494 + }, + { + "epoch": 0.6387965567203614, + "grad_norm": 34.95627367837398, + "learning_rate": 9.616712873878957e-06, + "loss": 2.7054, + "step": 7495 + }, + { + "epoch": 0.6388817864143868, + "grad_norm": 39.68742432821787, + "learning_rate": 9.616522456918122e-06, + "loss": 3.5915, + "step": 7496 + }, + { + "epoch": 0.6389670161084122, + "grad_norm": 37.680971298059895, + "learning_rate": 9.616331994555625e-06, + "loss": 3.4431, + "step": 7497 + }, + { + "epoch": 0.6390522458024376, + "grad_norm": 112.00365220388144, + "learning_rate": 9.616141486793336e-06, + "loss": 4.3349, + "step": 7498 + }, + { + "epoch": 0.6391374754964629, + "grad_norm": 49.15747502449706, + "learning_rate": 9.615950933633129e-06, + "loss": 3.8193, + "step": 7499 + }, + { + "epoch": 0.6392227051904884, + "grad_norm": 72.74740676576258, + "learning_rate": 9.61576033507688e-06, + "loss": 5.5058, + "step": 7500 + }, + { + "epoch": 0.6393079348845138, + "grad_norm": 80.14248601126769, + "learning_rate": 9.615569691126462e-06, + "loss": 3.6158, + "step": 7501 + }, + { + "epoch": 0.6393931645785391, + "grad_norm": 31.012908144154572, + "learning_rate": 9.615379001783752e-06, + "loss": 2.7601, + "step": 7502 + }, + { + "epoch": 0.6394783942725646, + "grad_norm": 41.26847859308931, + "learning_rate": 9.615188267050621e-06, + "loss": 3.8836, + "step": 7503 + }, + { + "epoch": 0.63956362396659, + "grad_norm": 49.40966574110474, + "learning_rate": 9.614997486928947e-06, + "loss": 4.6721, + "step": 7504 + }, + { + "epoch": 0.6396488536606154, + "grad_norm": 61.60389186952238, + "learning_rate": 9.614806661420608e-06, + "loss": 4.9409, + "step": 7505 + }, + { + "epoch": 0.6397340833546408, + "grad_norm": 85.40247548341277, + "learning_rate": 9.614615790527478e-06, + "loss": 3.86, + "step": 7506 + }, + { + "epoch": 0.6398193130486661, + "grad_norm": 41.181545741713805, + "learning_rate": 9.614424874251438e-06, + "loss": 3.7354, + "step": 7507 + }, + { + "epoch": 0.6399045427426916, + "grad_norm": 49.792782683019986, + "learning_rate": 9.61423391259436e-06, + "loss": 4.1276, + "step": 7508 + }, + { + "epoch": 0.639989772436717, + "grad_norm": 52.723944317384195, + "learning_rate": 9.614042905558126e-06, + "loss": 3.827, + "step": 7509 + }, + { + "epoch": 0.6400750021307423, + "grad_norm": 33.45842704377171, + "learning_rate": 9.613851853144614e-06, + "loss": 3.1008, + "step": 7510 + }, + { + "epoch": 0.6401602318247678, + "grad_norm": 91.02154008709896, + "learning_rate": 9.6136607553557e-06, + "loss": 5.6238, + "step": 7511 + }, + { + "epoch": 0.6402454615187931, + "grad_norm": 36.68620609838685, + "learning_rate": 9.61346961219327e-06, + "loss": 3.3797, + "step": 7512 + }, + { + "epoch": 0.6403306912128185, + "grad_norm": 86.3064247052017, + "learning_rate": 9.613278423659195e-06, + "loss": 4.7075, + "step": 7513 + }, + { + "epoch": 0.640415920906844, + "grad_norm": 46.161933645422124, + "learning_rate": 9.613087189755362e-06, + "loss": 3.2439, + "step": 7514 + }, + { + "epoch": 0.6405011506008693, + "grad_norm": 36.067399753738016, + "learning_rate": 9.61289591048365e-06, + "loss": 3.6721, + "step": 7515 + }, + { + "epoch": 0.6405863802948948, + "grad_norm": 128.7487552608545, + "learning_rate": 9.61270458584594e-06, + "loss": 4.258, + "step": 7516 + }, + { + "epoch": 0.6406716099889201, + "grad_norm": 35.40848282865981, + "learning_rate": 9.612513215844112e-06, + "loss": 3.0755, + "step": 7517 + }, + { + "epoch": 0.6407568396829455, + "grad_norm": 40.445409913879665, + "learning_rate": 9.61232180048005e-06, + "loss": 3.3346, + "step": 7518 + }, + { + "epoch": 0.640842069376971, + "grad_norm": 100.26749806643608, + "learning_rate": 9.612130339755636e-06, + "loss": 4.8422, + "step": 7519 + }, + { + "epoch": 0.6409272990709963, + "grad_norm": 44.520269370960534, + "learning_rate": 9.611938833672753e-06, + "loss": 3.4161, + "step": 7520 + }, + { + "epoch": 0.6410125287650217, + "grad_norm": 91.98831458299291, + "learning_rate": 9.611747282233284e-06, + "loss": 3.6578, + "step": 7521 + }, + { + "epoch": 0.6410977584590472, + "grad_norm": 40.58746509469372, + "learning_rate": 9.611555685439112e-06, + "loss": 3.4904, + "step": 7522 + }, + { + "epoch": 0.6411829881530725, + "grad_norm": 66.38617074717071, + "learning_rate": 9.611364043292122e-06, + "loss": 4.9559, + "step": 7523 + }, + { + "epoch": 0.641268217847098, + "grad_norm": 38.252257979474436, + "learning_rate": 9.611172355794201e-06, + "loss": 2.8033, + "step": 7524 + }, + { + "epoch": 0.6413534475411233, + "grad_norm": 40.50641091239754, + "learning_rate": 9.610980622947232e-06, + "loss": 3.7037, + "step": 7525 + }, + { + "epoch": 0.6414386772351487, + "grad_norm": 38.78612912515025, + "learning_rate": 9.610788844753099e-06, + "loss": 3.1956, + "step": 7526 + }, + { + "epoch": 0.6415239069291742, + "grad_norm": 58.967582210011045, + "learning_rate": 9.61059702121369e-06, + "loss": 4.1843, + "step": 7527 + }, + { + "epoch": 0.6416091366231995, + "grad_norm": 110.34870386567319, + "learning_rate": 9.610405152330892e-06, + "loss": 5.1036, + "step": 7528 + }, + { + "epoch": 0.6416943663172249, + "grad_norm": 42.93736570851666, + "learning_rate": 9.610213238106593e-06, + "loss": 4.0334, + "step": 7529 + }, + { + "epoch": 0.6417795960112503, + "grad_norm": 50.97440916477286, + "learning_rate": 9.610021278542677e-06, + "loss": 3.1331, + "step": 7530 + }, + { + "epoch": 0.6418648257052757, + "grad_norm": 61.36951674588067, + "learning_rate": 9.609829273641034e-06, + "loss": 3.0905, + "step": 7531 + }, + { + "epoch": 0.6419500553993012, + "grad_norm": 56.53901875185575, + "learning_rate": 9.609637223403552e-06, + "loss": 3.3712, + "step": 7532 + }, + { + "epoch": 0.6420352850933265, + "grad_norm": 59.70319716596022, + "learning_rate": 9.60944512783212e-06, + "loss": 4.8316, + "step": 7533 + }, + { + "epoch": 0.6421205147873519, + "grad_norm": 69.76091588044605, + "learning_rate": 9.609252986928626e-06, + "loss": 4.6968, + "step": 7534 + }, + { + "epoch": 0.6422057444813773, + "grad_norm": 56.554554268682196, + "learning_rate": 9.609060800694961e-06, + "loss": 3.6597, + "step": 7535 + }, + { + "epoch": 0.6422909741754027, + "grad_norm": 67.5702104196744, + "learning_rate": 9.608868569133012e-06, + "loss": 4.345, + "step": 7536 + }, + { + "epoch": 0.6423762038694281, + "grad_norm": 41.182734957214976, + "learning_rate": 9.608676292244674e-06, + "loss": 4.2208, + "step": 7537 + }, + { + "epoch": 0.6424614335634535, + "grad_norm": 70.5942529995128, + "learning_rate": 9.608483970031836e-06, + "loss": 4.4236, + "step": 7538 + }, + { + "epoch": 0.6425466632574789, + "grad_norm": 41.31232524129738, + "learning_rate": 9.60829160249639e-06, + "loss": 3.5541, + "step": 7539 + }, + { + "epoch": 0.6426318929515044, + "grad_norm": 64.50132261728092, + "learning_rate": 9.608099189640227e-06, + "loss": 4.1599, + "step": 7540 + }, + { + "epoch": 0.6427171226455297, + "grad_norm": 40.87956932962533, + "learning_rate": 9.60790673146524e-06, + "loss": 5.0391, + "step": 7541 + }, + { + "epoch": 0.6428023523395551, + "grad_norm": 70.63124596588231, + "learning_rate": 9.60771422797332e-06, + "loss": 4.3233, + "step": 7542 + }, + { + "epoch": 0.6428875820335805, + "grad_norm": 57.39300681166749, + "learning_rate": 9.607521679166362e-06, + "loss": 4.2815, + "step": 7543 + }, + { + "epoch": 0.6429728117276059, + "grad_norm": 53.39561039097385, + "learning_rate": 9.607329085046258e-06, + "loss": 5.1001, + "step": 7544 + }, + { + "epoch": 0.6430580414216313, + "grad_norm": 45.94949911294782, + "learning_rate": 9.607136445614904e-06, + "loss": 2.5899, + "step": 7545 + }, + { + "epoch": 0.6431432711156567, + "grad_norm": 58.14997248915799, + "learning_rate": 9.606943760874195e-06, + "loss": 5.0191, + "step": 7546 + }, + { + "epoch": 0.6432285008096821, + "grad_norm": 77.95206974412018, + "learning_rate": 9.606751030826025e-06, + "loss": 3.9822, + "step": 7547 + }, + { + "epoch": 0.6433137305037074, + "grad_norm": 60.9062508278459, + "learning_rate": 9.606558255472286e-06, + "loss": 3.8884, + "step": 7548 + }, + { + "epoch": 0.6433989601977329, + "grad_norm": 30.718659014453433, + "learning_rate": 9.60636543481488e-06, + "loss": 2.5738, + "step": 7549 + }, + { + "epoch": 0.6434841898917583, + "grad_norm": 44.347717204989884, + "learning_rate": 9.606172568855699e-06, + "loss": 3.9356, + "step": 7550 + }, + { + "epoch": 0.6435694195857837, + "grad_norm": 36.76420048890189, + "learning_rate": 9.605979657596641e-06, + "loss": 3.6782, + "step": 7551 + }, + { + "epoch": 0.6436546492798091, + "grad_norm": 50.31442444933203, + "learning_rate": 9.605786701039604e-06, + "loss": 3.5263, + "step": 7552 + }, + { + "epoch": 0.6437398789738344, + "grad_norm": 41.373525978253106, + "learning_rate": 9.605593699186485e-06, + "loss": 2.3081, + "step": 7553 + }, + { + "epoch": 0.6438251086678599, + "grad_norm": 49.09276525200417, + "learning_rate": 9.605400652039181e-06, + "loss": 4.5537, + "step": 7554 + }, + { + "epoch": 0.6439103383618853, + "grad_norm": 87.91050362984944, + "learning_rate": 9.605207559599593e-06, + "loss": 3.4003, + "step": 7555 + }, + { + "epoch": 0.6439955680559106, + "grad_norm": 53.04459746235822, + "learning_rate": 9.605014421869618e-06, + "loss": 4.2959, + "step": 7556 + }, + { + "epoch": 0.6440807977499361, + "grad_norm": 66.90936511935874, + "learning_rate": 9.604821238851156e-06, + "loss": 4.4774, + "step": 7557 + }, + { + "epoch": 0.6441660274439615, + "grad_norm": 57.69964958191921, + "learning_rate": 9.604628010546107e-06, + "loss": 4.7114, + "step": 7558 + }, + { + "epoch": 0.6442512571379869, + "grad_norm": 32.86550215248329, + "learning_rate": 9.604434736956371e-06, + "loss": 3.5015, + "step": 7559 + }, + { + "epoch": 0.6443364868320123, + "grad_norm": 49.317658711441446, + "learning_rate": 9.604241418083849e-06, + "loss": 4.0264, + "step": 7560 + }, + { + "epoch": 0.6444217165260376, + "grad_norm": 89.9413490717043, + "learning_rate": 9.604048053930442e-06, + "loss": 4.717, + "step": 7561 + }, + { + "epoch": 0.6445069462200631, + "grad_norm": 58.142617762352145, + "learning_rate": 9.603854644498051e-06, + "loss": 4.1219, + "step": 7562 + }, + { + "epoch": 0.6445921759140885, + "grad_norm": 33.68931139765722, + "learning_rate": 9.60366118978858e-06, + "loss": 2.9469, + "step": 7563 + }, + { + "epoch": 0.6446774056081138, + "grad_norm": 46.095919065366935, + "learning_rate": 9.603467689803932e-06, + "loss": 3.9885, + "step": 7564 + }, + { + "epoch": 0.6447626353021393, + "grad_norm": 37.77823031544389, + "learning_rate": 9.603274144546007e-06, + "loss": 2.721, + "step": 7565 + }, + { + "epoch": 0.6448478649961646, + "grad_norm": 80.22714606073134, + "learning_rate": 9.60308055401671e-06, + "loss": 4.5002, + "step": 7566 + }, + { + "epoch": 0.6449330946901901, + "grad_norm": 86.57539215690109, + "learning_rate": 9.602886918217944e-06, + "loss": 5.721, + "step": 7567 + }, + { + "epoch": 0.6450183243842155, + "grad_norm": 65.58909999134913, + "learning_rate": 9.602693237151615e-06, + "loss": 4.8009, + "step": 7568 + }, + { + "epoch": 0.6451035540782408, + "grad_norm": 29.161943538486117, + "learning_rate": 9.602499510819626e-06, + "loss": 2.2593, + "step": 7569 + }, + { + "epoch": 0.6451887837722663, + "grad_norm": 43.9507896232835, + "learning_rate": 9.602305739223884e-06, + "loss": 3.3624, + "step": 7570 + }, + { + "epoch": 0.6452740134662917, + "grad_norm": 46.31156321584529, + "learning_rate": 9.602111922366295e-06, + "loss": 3.6525, + "step": 7571 + }, + { + "epoch": 0.645359243160317, + "grad_norm": 59.05500901862607, + "learning_rate": 9.601918060248762e-06, + "loss": 4.7035, + "step": 7572 + }, + { + "epoch": 0.6454444728543425, + "grad_norm": 41.919872817626256, + "learning_rate": 9.601724152873195e-06, + "loss": 3.5391, + "step": 7573 + }, + { + "epoch": 0.6455297025483678, + "grad_norm": 212.96450406831426, + "learning_rate": 9.6015302002415e-06, + "loss": 4.6963, + "step": 7574 + }, + { + "epoch": 0.6456149322423933, + "grad_norm": 50.94905762672716, + "learning_rate": 9.601336202355582e-06, + "loss": 3.7646, + "step": 7575 + }, + { + "epoch": 0.6457001619364187, + "grad_norm": 62.66123982055303, + "learning_rate": 9.60114215921735e-06, + "loss": 4.5809, + "step": 7576 + }, + { + "epoch": 0.645785391630444, + "grad_norm": 109.87106108517673, + "learning_rate": 9.600948070828717e-06, + "loss": 4.7642, + "step": 7577 + }, + { + "epoch": 0.6458706213244695, + "grad_norm": 65.57706181191323, + "learning_rate": 9.600753937191586e-06, + "loss": 4.381, + "step": 7578 + }, + { + "epoch": 0.6459558510184948, + "grad_norm": 126.48320394164722, + "learning_rate": 9.600559758307868e-06, + "loss": 5.1369, + "step": 7579 + }, + { + "epoch": 0.6460410807125202, + "grad_norm": 38.608225386257544, + "learning_rate": 9.600365534179473e-06, + "loss": 3.5386, + "step": 7580 + }, + { + "epoch": 0.6461263104065457, + "grad_norm": 45.11390012136824, + "learning_rate": 9.60017126480831e-06, + "loss": 4.7191, + "step": 7581 + }, + { + "epoch": 0.646211540100571, + "grad_norm": 66.09602383879529, + "learning_rate": 9.599976950196292e-06, + "loss": 5.5015, + "step": 7582 + }, + { + "epoch": 0.6462967697945965, + "grad_norm": 48.71089099095972, + "learning_rate": 9.599782590345328e-06, + "loss": 3.9778, + "step": 7583 + }, + { + "epoch": 0.6463819994886219, + "grad_norm": 96.55530896310096, + "learning_rate": 9.59958818525733e-06, + "loss": 4.8016, + "step": 7584 + }, + { + "epoch": 0.6464672291826472, + "grad_norm": 39.02065392413926, + "learning_rate": 9.59939373493421e-06, + "loss": 4.0937, + "step": 7585 + }, + { + "epoch": 0.6465524588766727, + "grad_norm": 41.34674151744692, + "learning_rate": 9.59919923937788e-06, + "loss": 3.2782, + "step": 7586 + }, + { + "epoch": 0.646637688570698, + "grad_norm": 79.61137901260591, + "learning_rate": 9.599004698590252e-06, + "loss": 2.8892, + "step": 7587 + }, + { + "epoch": 0.6467229182647234, + "grad_norm": 46.78686985886046, + "learning_rate": 9.598810112573243e-06, + "loss": 3.1322, + "step": 7588 + }, + { + "epoch": 0.6468081479587489, + "grad_norm": 38.66763538115807, + "learning_rate": 9.598615481328762e-06, + "loss": 3.7144, + "step": 7589 + }, + { + "epoch": 0.6468933776527742, + "grad_norm": 65.56056021342974, + "learning_rate": 9.598420804858725e-06, + "loss": 4.0917, + "step": 7590 + }, + { + "epoch": 0.6469786073467996, + "grad_norm": 33.05604361925965, + "learning_rate": 9.598226083165047e-06, + "loss": 3.1718, + "step": 7591 + }, + { + "epoch": 0.647063837040825, + "grad_norm": 34.591296556066915, + "learning_rate": 9.598031316249643e-06, + "loss": 3.5336, + "step": 7592 + }, + { + "epoch": 0.6471490667348504, + "grad_norm": 29.75305774439913, + "learning_rate": 9.597836504114427e-06, + "loss": 3.2376, + "step": 7593 + }, + { + "epoch": 0.6472342964288759, + "grad_norm": 61.903257868560026, + "learning_rate": 9.597641646761317e-06, + "loss": 4.2357, + "step": 7594 + }, + { + "epoch": 0.6473195261229012, + "grad_norm": 48.29896340278413, + "learning_rate": 9.597446744192227e-06, + "loss": 3.3615, + "step": 7595 + }, + { + "epoch": 0.6474047558169266, + "grad_norm": 60.600497031595715, + "learning_rate": 9.597251796409078e-06, + "loss": 4.3138, + "step": 7596 + }, + { + "epoch": 0.647489985510952, + "grad_norm": 37.93250396851614, + "learning_rate": 9.59705680341378e-06, + "loss": 3.9311, + "step": 7597 + }, + { + "epoch": 0.6475752152049774, + "grad_norm": 78.7475920191121, + "learning_rate": 9.596861765208258e-06, + "loss": 4.392, + "step": 7598 + }, + { + "epoch": 0.6476604448990028, + "grad_norm": 169.55874116123425, + "learning_rate": 9.596666681794427e-06, + "loss": 4.1867, + "step": 7599 + }, + { + "epoch": 0.6477456745930282, + "grad_norm": 105.24796920580349, + "learning_rate": 9.596471553174207e-06, + "loss": 4.0065, + "step": 7600 + }, + { + "epoch": 0.6478309042870536, + "grad_norm": 91.75246637655418, + "learning_rate": 9.596276379349514e-06, + "loss": 5.0911, + "step": 7601 + }, + { + "epoch": 0.6479161339810791, + "grad_norm": 43.42173135594345, + "learning_rate": 9.596081160322269e-06, + "loss": 3.9299, + "step": 7602 + }, + { + "epoch": 0.6480013636751044, + "grad_norm": 69.17321910918574, + "learning_rate": 9.595885896094391e-06, + "loss": 3.7451, + "step": 7603 + }, + { + "epoch": 0.6480865933691298, + "grad_norm": 38.26975578913456, + "learning_rate": 9.595690586667802e-06, + "loss": 3.6871, + "step": 7604 + }, + { + "epoch": 0.6481718230631552, + "grad_norm": 77.34627195292244, + "learning_rate": 9.595495232044425e-06, + "loss": 4.9347, + "step": 7605 + }, + { + "epoch": 0.6482570527571806, + "grad_norm": 39.717259808531864, + "learning_rate": 9.595299832226174e-06, + "loss": 3.1784, + "step": 7606 + }, + { + "epoch": 0.648342282451206, + "grad_norm": 91.10430705747662, + "learning_rate": 9.59510438721498e-06, + "loss": 4.0282, + "step": 7607 + }, + { + "epoch": 0.6484275121452314, + "grad_norm": 89.57607390327868, + "learning_rate": 9.594908897012759e-06, + "loss": 3.5721, + "step": 7608 + }, + { + "epoch": 0.6485127418392568, + "grad_norm": 74.6431383675591, + "learning_rate": 9.594713361621432e-06, + "loss": 4.775, + "step": 7609 + }, + { + "epoch": 0.6485979715332822, + "grad_norm": 59.21118922986563, + "learning_rate": 9.594517781042927e-06, + "loss": 4.2344, + "step": 7610 + }, + { + "epoch": 0.6486832012273076, + "grad_norm": 51.56571243878347, + "learning_rate": 9.594322155279165e-06, + "loss": 4.9073, + "step": 7611 + }, + { + "epoch": 0.648768430921333, + "grad_norm": 118.58764864942117, + "learning_rate": 9.594126484332068e-06, + "loss": 4.3065, + "step": 7612 + }, + { + "epoch": 0.6488536606153584, + "grad_norm": 70.99098281801803, + "learning_rate": 9.593930768203565e-06, + "loss": 4.619, + "step": 7613 + }, + { + "epoch": 0.6489388903093838, + "grad_norm": 31.434111341251313, + "learning_rate": 9.59373500689558e-06, + "loss": 3.3188, + "step": 7614 + }, + { + "epoch": 0.6490241200034091, + "grad_norm": 67.83880457537778, + "learning_rate": 9.593539200410034e-06, + "loss": 5.138, + "step": 7615 + }, + { + "epoch": 0.6491093496974346, + "grad_norm": 43.80081397640233, + "learning_rate": 9.593343348748856e-06, + "loss": 3.9986, + "step": 7616 + }, + { + "epoch": 0.64919457939146, + "grad_norm": 95.1550253263496, + "learning_rate": 9.593147451913971e-06, + "loss": 4.901, + "step": 7617 + }, + { + "epoch": 0.6492798090854854, + "grad_norm": 69.26967089138107, + "learning_rate": 9.592951509907306e-06, + "loss": 4.4065, + "step": 7618 + }, + { + "epoch": 0.6493650387795108, + "grad_norm": 25.94983269795352, + "learning_rate": 9.592755522730787e-06, + "loss": 2.6795, + "step": 7619 + }, + { + "epoch": 0.6494502684735362, + "grad_norm": 50.6243278955211, + "learning_rate": 9.592559490386343e-06, + "loss": 3.3137, + "step": 7620 + }, + { + "epoch": 0.6495354981675616, + "grad_norm": 32.987938756578856, + "learning_rate": 9.592363412875902e-06, + "loss": 3.4916, + "step": 7621 + }, + { + "epoch": 0.649620727861587, + "grad_norm": 37.15686772517669, + "learning_rate": 9.592167290201393e-06, + "loss": 4.3138, + "step": 7622 + }, + { + "epoch": 0.6497059575556123, + "grad_norm": 34.3564166081184, + "learning_rate": 9.591971122364742e-06, + "loss": 3.1451, + "step": 7623 + }, + { + "epoch": 0.6497911872496378, + "grad_norm": 69.57032993829823, + "learning_rate": 9.59177490936788e-06, + "loss": 3.8991, + "step": 7624 + }, + { + "epoch": 0.6498764169436632, + "grad_norm": 73.82455405489145, + "learning_rate": 9.591578651212736e-06, + "loss": 4.3263, + "step": 7625 + }, + { + "epoch": 0.6499616466376885, + "grad_norm": 35.83137365087284, + "learning_rate": 9.59138234790124e-06, + "loss": 2.9258, + "step": 7626 + }, + { + "epoch": 0.650046876331714, + "grad_norm": 85.35562473515868, + "learning_rate": 9.591185999435324e-06, + "loss": 4.3747, + "step": 7627 + }, + { + "epoch": 0.6501321060257393, + "grad_norm": 51.01549931154435, + "learning_rate": 9.590989605816917e-06, + "loss": 4.4516, + "step": 7628 + }, + { + "epoch": 0.6502173357197648, + "grad_norm": 45.17480003576412, + "learning_rate": 9.590793167047952e-06, + "loss": 4.3575, + "step": 7629 + }, + { + "epoch": 0.6503025654137902, + "grad_norm": 35.561075864742484, + "learning_rate": 9.59059668313036e-06, + "loss": 4.2652, + "step": 7630 + }, + { + "epoch": 0.6503877951078155, + "grad_norm": 52.90777294506273, + "learning_rate": 9.590400154066075e-06, + "loss": 3.9614, + "step": 7631 + }, + { + "epoch": 0.650473024801841, + "grad_norm": 53.893251153893715, + "learning_rate": 9.590203579857028e-06, + "loss": 4.06, + "step": 7632 + }, + { + "epoch": 0.6505582544958664, + "grad_norm": 32.170825837075085, + "learning_rate": 9.590006960505151e-06, + "loss": 2.2639, + "step": 7633 + }, + { + "epoch": 0.6506434841898917, + "grad_norm": 34.94632148319423, + "learning_rate": 9.589810296012382e-06, + "loss": 4.0454, + "step": 7634 + }, + { + "epoch": 0.6507287138839172, + "grad_norm": 95.32892410917877, + "learning_rate": 9.589613586380651e-06, + "loss": 4.0577, + "step": 7635 + }, + { + "epoch": 0.6508139435779425, + "grad_norm": 92.04208303107156, + "learning_rate": 9.589416831611895e-06, + "loss": 4.5925, + "step": 7636 + }, + { + "epoch": 0.650899173271968, + "grad_norm": 44.380928434983, + "learning_rate": 9.58922003170805e-06, + "loss": 3.567, + "step": 7637 + }, + { + "epoch": 0.6509844029659934, + "grad_norm": 64.97944632958125, + "learning_rate": 9.589023186671044e-06, + "loss": 4.1851, + "step": 7638 + }, + { + "epoch": 0.6510696326600187, + "grad_norm": 63.49639789685538, + "learning_rate": 9.588826296502823e-06, + "loss": 5.3237, + "step": 7639 + }, + { + "epoch": 0.6511548623540442, + "grad_norm": 47.454205169095005, + "learning_rate": 9.588629361205317e-06, + "loss": 3.7797, + "step": 7640 + }, + { + "epoch": 0.6512400920480695, + "grad_norm": 43.79906069409527, + "learning_rate": 9.588432380780465e-06, + "loss": 4.269, + "step": 7641 + }, + { + "epoch": 0.6513253217420949, + "grad_norm": 35.02914495770703, + "learning_rate": 9.588235355230204e-06, + "loss": 3.3299, + "step": 7642 + }, + { + "epoch": 0.6514105514361204, + "grad_norm": 74.50596940821555, + "learning_rate": 9.588038284556471e-06, + "loss": 4.1176, + "step": 7643 + }, + { + "epoch": 0.6514957811301457, + "grad_norm": 56.71792262850509, + "learning_rate": 9.587841168761207e-06, + "loss": 4.7477, + "step": 7644 + }, + { + "epoch": 0.6515810108241712, + "grad_norm": 104.49991108093627, + "learning_rate": 9.587644007846346e-06, + "loss": 4.0825, + "step": 7645 + }, + { + "epoch": 0.6516662405181965, + "grad_norm": 56.280751675640744, + "learning_rate": 9.587446801813829e-06, + "loss": 4.7997, + "step": 7646 + }, + { + "epoch": 0.6517514702122219, + "grad_norm": 186.29038996186537, + "learning_rate": 9.587249550665595e-06, + "loss": 3.3952, + "step": 7647 + }, + { + "epoch": 0.6518366999062474, + "grad_norm": 123.02174728935199, + "learning_rate": 9.587052254403587e-06, + "loss": 5.7362, + "step": 7648 + }, + { + "epoch": 0.6519219296002727, + "grad_norm": 66.520676823993, + "learning_rate": 9.58685491302974e-06, + "loss": 2.8578, + "step": 7649 + }, + { + "epoch": 0.6520071592942981, + "grad_norm": 64.81356135322203, + "learning_rate": 9.586657526545997e-06, + "loss": 4.3607, + "step": 7650 + }, + { + "epoch": 0.6520923889883236, + "grad_norm": 38.321285245910595, + "learning_rate": 9.586460094954302e-06, + "loss": 3.7714, + "step": 7651 + }, + { + "epoch": 0.6521776186823489, + "grad_norm": 30.926844872466873, + "learning_rate": 9.586262618256595e-06, + "loss": 4.1562, + "step": 7652 + }, + { + "epoch": 0.6522628483763744, + "grad_norm": 144.45967098437012, + "learning_rate": 9.586065096454815e-06, + "loss": 4.5807, + "step": 7653 + }, + { + "epoch": 0.6523480780703997, + "grad_norm": 83.44029735070482, + "learning_rate": 9.58586752955091e-06, + "loss": 3.8071, + "step": 7654 + }, + { + "epoch": 0.6524333077644251, + "grad_norm": 43.20008542317813, + "learning_rate": 9.585669917546818e-06, + "loss": 4.2123, + "step": 7655 + }, + { + "epoch": 0.6525185374584506, + "grad_norm": 86.88539003929895, + "learning_rate": 9.585472260444484e-06, + "loss": 5.6925, + "step": 7656 + }, + { + "epoch": 0.6526037671524759, + "grad_norm": 37.59210350212603, + "learning_rate": 9.585274558245854e-06, + "loss": 4.0568, + "step": 7657 + }, + { + "epoch": 0.6526889968465013, + "grad_norm": 35.26994967783049, + "learning_rate": 9.585076810952871e-06, + "loss": 3.1196, + "step": 7658 + }, + { + "epoch": 0.6527742265405267, + "grad_norm": 33.655715300987424, + "learning_rate": 9.584879018567478e-06, + "loss": 3.6811, + "step": 7659 + }, + { + "epoch": 0.6528594562345521, + "grad_norm": 63.82666467045236, + "learning_rate": 9.584681181091624e-06, + "loss": 4.5581, + "step": 7660 + }, + { + "epoch": 0.6529446859285775, + "grad_norm": 48.681807796023726, + "learning_rate": 9.58448329852725e-06, + "loss": 4.5255, + "step": 7661 + }, + { + "epoch": 0.6530299156226029, + "grad_norm": 46.689362940260324, + "learning_rate": 9.584285370876305e-06, + "loss": 3.7226, + "step": 7662 + }, + { + "epoch": 0.6531151453166283, + "grad_norm": 46.91125631075147, + "learning_rate": 9.584087398140735e-06, + "loss": 3.1294, + "step": 7663 + }, + { + "epoch": 0.6532003750106538, + "grad_norm": 38.37593782275442, + "learning_rate": 9.583889380322487e-06, + "loss": 4.9574, + "step": 7664 + }, + { + "epoch": 0.6532856047046791, + "grad_norm": 73.20249638063073, + "learning_rate": 9.58369131742351e-06, + "loss": 4.4634, + "step": 7665 + }, + { + "epoch": 0.6533708343987045, + "grad_norm": 47.07347725271552, + "learning_rate": 9.583493209445749e-06, + "loss": 4.0845, + "step": 7666 + }, + { + "epoch": 0.6534560640927299, + "grad_norm": 49.83858963758249, + "learning_rate": 9.583295056391152e-06, + "loss": 3.2751, + "step": 7667 + }, + { + "epoch": 0.6535412937867553, + "grad_norm": 57.178931633464714, + "learning_rate": 9.58309685826167e-06, + "loss": 2.9251, + "step": 7668 + }, + { + "epoch": 0.6536265234807807, + "grad_norm": 83.6181738729657, + "learning_rate": 9.582898615059253e-06, + "loss": 6.3141, + "step": 7669 + }, + { + "epoch": 0.6537117531748061, + "grad_norm": 41.01019798495236, + "learning_rate": 9.58270032678585e-06, + "loss": 3.9463, + "step": 7670 + }, + { + "epoch": 0.6537969828688315, + "grad_norm": 53.31030853043981, + "learning_rate": 9.58250199344341e-06, + "loss": 3.5435, + "step": 7671 + }, + { + "epoch": 0.653882212562857, + "grad_norm": 59.60291543497137, + "learning_rate": 9.58230361503388e-06, + "loss": 4.397, + "step": 7672 + }, + { + "epoch": 0.6539674422568823, + "grad_norm": 50.08991300719015, + "learning_rate": 9.58210519155922e-06, + "loss": 3.3046, + "step": 7673 + }, + { + "epoch": 0.6540526719509077, + "grad_norm": 31.093805462574363, + "learning_rate": 9.581906723021373e-06, + "loss": 3.7382, + "step": 7674 + }, + { + "epoch": 0.6541379016449331, + "grad_norm": 38.68711785605445, + "learning_rate": 9.581708209422293e-06, + "loss": 3.8086, + "step": 7675 + }, + { + "epoch": 0.6542231313389585, + "grad_norm": 67.05689871764667, + "learning_rate": 9.581509650763935e-06, + "loss": 3.2944, + "step": 7676 + }, + { + "epoch": 0.6543083610329838, + "grad_norm": 83.83250280330437, + "learning_rate": 9.581311047048251e-06, + "loss": 4.7755, + "step": 7677 + }, + { + "epoch": 0.6543935907270093, + "grad_norm": 32.286961692242286, + "learning_rate": 9.581112398277192e-06, + "loss": 2.8978, + "step": 7678 + }, + { + "epoch": 0.6544788204210347, + "grad_norm": 49.591517541784796, + "learning_rate": 9.580913704452715e-06, + "loss": 4.6718, + "step": 7679 + }, + { + "epoch": 0.6545640501150601, + "grad_norm": 66.91178898165472, + "learning_rate": 9.58071496557677e-06, + "loss": 3.8604, + "step": 7680 + }, + { + "epoch": 0.6546492798090855, + "grad_norm": 39.84966669310861, + "learning_rate": 9.580516181651314e-06, + "loss": 3.1551, + "step": 7681 + }, + { + "epoch": 0.6547345095031109, + "grad_norm": 71.53690862232277, + "learning_rate": 9.5803173526783e-06, + "loss": 3.9744, + "step": 7682 + }, + { + "epoch": 0.6548197391971363, + "grad_norm": 62.09626620110153, + "learning_rate": 9.580118478659687e-06, + "loss": 4.3828, + "step": 7683 + }, + { + "epoch": 0.6549049688911617, + "grad_norm": 70.20892849533723, + "learning_rate": 9.579919559597427e-06, + "loss": 4.5289, + "step": 7684 + }, + { + "epoch": 0.654990198585187, + "grad_norm": 36.74241785779362, + "learning_rate": 9.57972059549348e-06, + "loss": 3.2171, + "step": 7685 + }, + { + "epoch": 0.6550754282792125, + "grad_norm": 91.96075013331853, + "learning_rate": 9.579521586349799e-06, + "loss": 4.4176, + "step": 7686 + }, + { + "epoch": 0.6551606579732379, + "grad_norm": 39.99740355167903, + "learning_rate": 9.579322532168344e-06, + "loss": 3.5743, + "step": 7687 + }, + { + "epoch": 0.6552458876672633, + "grad_norm": 67.06072241675348, + "learning_rate": 9.579123432951071e-06, + "loss": 4.1578, + "step": 7688 + }, + { + "epoch": 0.6553311173612887, + "grad_norm": 89.12621465318158, + "learning_rate": 9.57892428869994e-06, + "loss": 2.5379, + "step": 7689 + }, + { + "epoch": 0.655416347055314, + "grad_norm": 62.398717481691676, + "learning_rate": 9.578725099416906e-06, + "loss": 3.3701, + "step": 7690 + }, + { + "epoch": 0.6555015767493395, + "grad_norm": 33.14324643982597, + "learning_rate": 9.578525865103931e-06, + "loss": 3.1011, + "step": 7691 + }, + { + "epoch": 0.6555868064433649, + "grad_norm": 38.836046549309735, + "learning_rate": 9.578326585762972e-06, + "loss": 2.282, + "step": 7692 + }, + { + "epoch": 0.6556720361373902, + "grad_norm": 88.20429068059894, + "learning_rate": 9.578127261395993e-06, + "loss": 4.4831, + "step": 7693 + }, + { + "epoch": 0.6557572658314157, + "grad_norm": 44.16149043927727, + "learning_rate": 9.57792789200495e-06, + "loss": 3.5472, + "step": 7694 + }, + { + "epoch": 0.655842495525441, + "grad_norm": 94.52595690004453, + "learning_rate": 9.577728477591805e-06, + "loss": 3.5079, + "step": 7695 + }, + { + "epoch": 0.6559277252194665, + "grad_norm": 49.7256588477277, + "learning_rate": 9.57752901815852e-06, + "loss": 3.2687, + "step": 7696 + }, + { + "epoch": 0.6560129549134919, + "grad_norm": 76.45527981145366, + "learning_rate": 9.577329513707056e-06, + "loss": 4.0986, + "step": 7697 + }, + { + "epoch": 0.6560981846075172, + "grad_norm": 57.53517105482323, + "learning_rate": 9.577129964239376e-06, + "loss": 4.9209, + "step": 7698 + }, + { + "epoch": 0.6561834143015427, + "grad_norm": 58.10173565884077, + "learning_rate": 9.57693036975744e-06, + "loss": 4.1604, + "step": 7699 + }, + { + "epoch": 0.6562686439955681, + "grad_norm": 37.91802188463773, + "learning_rate": 9.576730730263213e-06, + "loss": 3.77, + "step": 7700 + }, + { + "epoch": 0.6563538736895934, + "grad_norm": 39.43805365324698, + "learning_rate": 9.576531045758656e-06, + "loss": 4.0969, + "step": 7701 + }, + { + "epoch": 0.6564391033836189, + "grad_norm": 67.93220448483282, + "learning_rate": 9.576331316245738e-06, + "loss": 5.5964, + "step": 7702 + }, + { + "epoch": 0.6565243330776442, + "grad_norm": 91.06647640713865, + "learning_rate": 9.57613154172642e-06, + "loss": 4.5706, + "step": 7703 + }, + { + "epoch": 0.6566095627716696, + "grad_norm": 90.1695923511541, + "learning_rate": 9.575931722202665e-06, + "loss": 5.625, + "step": 7704 + }, + { + "epoch": 0.6566947924656951, + "grad_norm": 29.81837383468151, + "learning_rate": 9.575731857676438e-06, + "loss": 2.9034, + "step": 7705 + }, + { + "epoch": 0.6567800221597204, + "grad_norm": 26.776702429783047, + "learning_rate": 9.57553194814971e-06, + "loss": 2.5013, + "step": 7706 + }, + { + "epoch": 0.6568652518537459, + "grad_norm": 35.547088759703065, + "learning_rate": 9.575331993624441e-06, + "loss": 3.8524, + "step": 7707 + }, + { + "epoch": 0.6569504815477712, + "grad_norm": 45.49284522313604, + "learning_rate": 9.575131994102603e-06, + "loss": 4.3972, + "step": 7708 + }, + { + "epoch": 0.6570357112417966, + "grad_norm": 51.76507888378431, + "learning_rate": 9.574931949586155e-06, + "loss": 2.4221, + "step": 7709 + }, + { + "epoch": 0.6571209409358221, + "grad_norm": 33.405603550026804, + "learning_rate": 9.574731860077073e-06, + "loss": 3.0583, + "step": 7710 + }, + { + "epoch": 0.6572061706298474, + "grad_norm": 38.23768101673815, + "learning_rate": 9.57453172557732e-06, + "loss": 1.5778, + "step": 7711 + }, + { + "epoch": 0.6572914003238728, + "grad_norm": 57.68505334040219, + "learning_rate": 9.574331546088864e-06, + "loss": 3.97, + "step": 7712 + }, + { + "epoch": 0.6573766300178983, + "grad_norm": 47.0023902815701, + "learning_rate": 9.574131321613677e-06, + "loss": 4.4405, + "step": 7713 + }, + { + "epoch": 0.6574618597119236, + "grad_norm": 45.397220748146935, + "learning_rate": 9.573931052153724e-06, + "loss": 3.2456, + "step": 7714 + }, + { + "epoch": 0.6575470894059491, + "grad_norm": 53.55386632553502, + "learning_rate": 9.573730737710979e-06, + "loss": 4.2539, + "step": 7715 + }, + { + "epoch": 0.6576323190999744, + "grad_norm": 44.41398985572601, + "learning_rate": 9.573530378287407e-06, + "loss": 3.5544, + "step": 7716 + }, + { + "epoch": 0.6577175487939998, + "grad_norm": 53.956204593227056, + "learning_rate": 9.573329973884982e-06, + "loss": 4.7151, + "step": 7717 + }, + { + "epoch": 0.6578027784880253, + "grad_norm": 64.99843522043895, + "learning_rate": 9.573129524505675e-06, + "loss": 5.2694, + "step": 7718 + }, + { + "epoch": 0.6578880081820506, + "grad_norm": 38.91149129998304, + "learning_rate": 9.572929030151456e-06, + "loss": 3.4499, + "step": 7719 + }, + { + "epoch": 0.657973237876076, + "grad_norm": 75.82353384354045, + "learning_rate": 9.572728490824296e-06, + "loss": 5.1488, + "step": 7720 + }, + { + "epoch": 0.6580584675701014, + "grad_norm": 48.092002710313785, + "learning_rate": 9.572527906526169e-06, + "loss": 4.1309, + "step": 7721 + }, + { + "epoch": 0.6581436972641268, + "grad_norm": 50.86764416629625, + "learning_rate": 9.572327277259046e-06, + "loss": 3.896, + "step": 7722 + }, + { + "epoch": 0.6582289269581523, + "grad_norm": 40.46110247066429, + "learning_rate": 9.572126603024902e-06, + "loss": 3.7417, + "step": 7723 + }, + { + "epoch": 0.6583141566521776, + "grad_norm": 88.1421041108163, + "learning_rate": 9.57192588382571e-06, + "loss": 4.3153, + "step": 7724 + }, + { + "epoch": 0.658399386346203, + "grad_norm": 25.71671917889854, + "learning_rate": 9.571725119663445e-06, + "loss": 3.0865, + "step": 7725 + }, + { + "epoch": 0.6584846160402285, + "grad_norm": 34.55529867409842, + "learning_rate": 9.571524310540077e-06, + "loss": 3.1756, + "step": 7726 + }, + { + "epoch": 0.6585698457342538, + "grad_norm": 43.98464073959454, + "learning_rate": 9.571323456457586e-06, + "loss": 3.4339, + "step": 7727 + }, + { + "epoch": 0.6586550754282792, + "grad_norm": 33.57642728757868, + "learning_rate": 9.571122557417945e-06, + "loss": 3.5609, + "step": 7728 + }, + { + "epoch": 0.6587403051223046, + "grad_norm": 86.73936481916304, + "learning_rate": 9.570921613423128e-06, + "loss": 4.6801, + "step": 7729 + }, + { + "epoch": 0.65882553481633, + "grad_norm": 47.65696512046825, + "learning_rate": 9.570720624475116e-06, + "loss": 2.597, + "step": 7730 + }, + { + "epoch": 0.6589107645103555, + "grad_norm": 62.455408801416716, + "learning_rate": 9.570519590575882e-06, + "loss": 4.0537, + "step": 7731 + }, + { + "epoch": 0.6589959942043808, + "grad_norm": 39.80119072108354, + "learning_rate": 9.570318511727403e-06, + "loss": 3.4383, + "step": 7732 + }, + { + "epoch": 0.6590812238984062, + "grad_norm": 40.0147607444116, + "learning_rate": 9.570117387931659e-06, + "loss": 3.9599, + "step": 7733 + }, + { + "epoch": 0.6591664535924316, + "grad_norm": 72.93430967739673, + "learning_rate": 9.569916219190624e-06, + "loss": 3.7209, + "step": 7734 + }, + { + "epoch": 0.659251683286457, + "grad_norm": 62.16978867067515, + "learning_rate": 9.569715005506282e-06, + "loss": 3.7213, + "step": 7735 + }, + { + "epoch": 0.6593369129804824, + "grad_norm": 69.11037061073654, + "learning_rate": 9.569513746880606e-06, + "loss": 4.5672, + "step": 7736 + }, + { + "epoch": 0.6594221426745078, + "grad_norm": 42.036212098956824, + "learning_rate": 9.569312443315579e-06, + "loss": 3.8432, + "step": 7737 + }, + { + "epoch": 0.6595073723685332, + "grad_norm": 33.43416507149936, + "learning_rate": 9.569111094813181e-06, + "loss": 3.4691, + "step": 7738 + }, + { + "epoch": 0.6595926020625585, + "grad_norm": 38.323387161336136, + "learning_rate": 9.56890970137539e-06, + "loss": 3.5477, + "step": 7739 + }, + { + "epoch": 0.659677831756584, + "grad_norm": 41.00313710035649, + "learning_rate": 9.568708263004188e-06, + "loss": 3.6549, + "step": 7740 + }, + { + "epoch": 0.6597630614506094, + "grad_norm": 28.577326751294592, + "learning_rate": 9.568506779701556e-06, + "loss": 3.3211, + "step": 7741 + }, + { + "epoch": 0.6598482911446348, + "grad_norm": 79.30433059468737, + "learning_rate": 9.568305251469474e-06, + "loss": 4.748, + "step": 7742 + }, + { + "epoch": 0.6599335208386602, + "grad_norm": 64.63526904053873, + "learning_rate": 9.568103678309926e-06, + "loss": 5.2505, + "step": 7743 + }, + { + "epoch": 0.6600187505326855, + "grad_norm": 29.958361948091078, + "learning_rate": 9.567902060224892e-06, + "loss": 3.8013, + "step": 7744 + }, + { + "epoch": 0.660103980226711, + "grad_norm": 60.05345924167335, + "learning_rate": 9.567700397216358e-06, + "loss": 3.9425, + "step": 7745 + }, + { + "epoch": 0.6601892099207364, + "grad_norm": 29.864096523058492, + "learning_rate": 9.567498689286306e-06, + "loss": 2.9625, + "step": 7746 + }, + { + "epoch": 0.6602744396147617, + "grad_norm": 110.34855537422939, + "learning_rate": 9.567296936436719e-06, + "loss": 5.1256, + "step": 7747 + }, + { + "epoch": 0.6603596693087872, + "grad_norm": 77.22703931686097, + "learning_rate": 9.56709513866958e-06, + "loss": 4.4124, + "step": 7748 + }, + { + "epoch": 0.6604448990028126, + "grad_norm": 45.03508055429347, + "learning_rate": 9.566893295986877e-06, + "loss": 2.5953, + "step": 7749 + }, + { + "epoch": 0.660530128696838, + "grad_norm": 62.24426987676732, + "learning_rate": 9.566691408390591e-06, + "loss": 4.3193, + "step": 7750 + }, + { + "epoch": 0.6606153583908634, + "grad_norm": 56.634444269749075, + "learning_rate": 9.566489475882712e-06, + "loss": 4.1601, + "step": 7751 + }, + { + "epoch": 0.6607005880848887, + "grad_norm": 58.738653345727535, + "learning_rate": 9.566287498465221e-06, + "loss": 3.7592, + "step": 7752 + }, + { + "epoch": 0.6607858177789142, + "grad_norm": 81.27773602477927, + "learning_rate": 9.566085476140109e-06, + "loss": 4.363, + "step": 7753 + }, + { + "epoch": 0.6608710474729396, + "grad_norm": 79.97323395265148, + "learning_rate": 9.56588340890936e-06, + "loss": 4.3094, + "step": 7754 + }, + { + "epoch": 0.6609562771669649, + "grad_norm": 37.533723687301915, + "learning_rate": 9.565681296774963e-06, + "loss": 3.4132, + "step": 7755 + }, + { + "epoch": 0.6610415068609904, + "grad_norm": 71.24354812051392, + "learning_rate": 9.565479139738902e-06, + "loss": 4.1602, + "step": 7756 + }, + { + "epoch": 0.6611267365550157, + "grad_norm": 63.84644762123255, + "learning_rate": 9.56527693780317e-06, + "loss": 4.1633, + "step": 7757 + }, + { + "epoch": 0.6612119662490412, + "grad_norm": 44.078855464623096, + "learning_rate": 9.565074690969754e-06, + "loss": 3.961, + "step": 7758 + }, + { + "epoch": 0.6612971959430666, + "grad_norm": 142.39635404366456, + "learning_rate": 9.56487239924064e-06, + "loss": 4.5701, + "step": 7759 + }, + { + "epoch": 0.6613824256370919, + "grad_norm": 70.71378729227789, + "learning_rate": 9.564670062617822e-06, + "loss": 5.2807, + "step": 7760 + }, + { + "epoch": 0.6614676553311174, + "grad_norm": 35.504224042971785, + "learning_rate": 9.564467681103287e-06, + "loss": 3.527, + "step": 7761 + }, + { + "epoch": 0.6615528850251428, + "grad_norm": 121.35565867700632, + "learning_rate": 9.564265254699026e-06, + "loss": 5.2356, + "step": 7762 + }, + { + "epoch": 0.6616381147191681, + "grad_norm": 77.29253147080037, + "learning_rate": 9.564062783407029e-06, + "loss": 3.2395, + "step": 7763 + }, + { + "epoch": 0.6617233444131936, + "grad_norm": 70.56287585058786, + "learning_rate": 9.563860267229288e-06, + "loss": 4.8796, + "step": 7764 + }, + { + "epoch": 0.6618085741072189, + "grad_norm": 135.9717221953834, + "learning_rate": 9.563657706167797e-06, + "loss": 4.4219, + "step": 7765 + }, + { + "epoch": 0.6618938038012444, + "grad_norm": 58.65667696200646, + "learning_rate": 9.563455100224544e-06, + "loss": 4.631, + "step": 7766 + }, + { + "epoch": 0.6619790334952698, + "grad_norm": 39.02864630681097, + "learning_rate": 9.563252449401525e-06, + "loss": 3.0556, + "step": 7767 + }, + { + "epoch": 0.6620642631892951, + "grad_norm": 32.14096654474795, + "learning_rate": 9.56304975370073e-06, + "loss": 3.2807, + "step": 7768 + }, + { + "epoch": 0.6621494928833206, + "grad_norm": 81.85230948166316, + "learning_rate": 9.562847013124155e-06, + "loss": 5.6623, + "step": 7769 + }, + { + "epoch": 0.662234722577346, + "grad_norm": 47.14076587774965, + "learning_rate": 9.562644227673792e-06, + "loss": 3.4294, + "step": 7770 + }, + { + "epoch": 0.6623199522713713, + "grad_norm": 34.457896893905854, + "learning_rate": 9.562441397351637e-06, + "loss": 3.5956, + "step": 7771 + }, + { + "epoch": 0.6624051819653968, + "grad_norm": 45.97420990334509, + "learning_rate": 9.562238522159681e-06, + "loss": 3.9179, + "step": 7772 + }, + { + "epoch": 0.6624904116594221, + "grad_norm": 69.23793116431283, + "learning_rate": 9.562035602099923e-06, + "loss": 4.3216, + "step": 7773 + }, + { + "epoch": 0.6625756413534475, + "grad_norm": 74.16653091372015, + "learning_rate": 9.56183263717436e-06, + "loss": 4.2383, + "step": 7774 + }, + { + "epoch": 0.662660871047473, + "grad_norm": 67.21534762340964, + "learning_rate": 9.561629627384984e-06, + "loss": 3.7942, + "step": 7775 + }, + { + "epoch": 0.6627461007414983, + "grad_norm": 48.70927067530908, + "learning_rate": 9.561426572733795e-06, + "loss": 2.8873, + "step": 7776 + }, + { + "epoch": 0.6628313304355238, + "grad_norm": 38.96115727826991, + "learning_rate": 9.561223473222785e-06, + "loss": 3.6189, + "step": 7777 + }, + { + "epoch": 0.6629165601295491, + "grad_norm": 102.88885678820705, + "learning_rate": 9.561020328853955e-06, + "loss": 5.2575, + "step": 7778 + }, + { + "epoch": 0.6630017898235745, + "grad_norm": 42.5664992730407, + "learning_rate": 9.560817139629305e-06, + "loss": 3.7153, + "step": 7779 + }, + { + "epoch": 0.6630870195176, + "grad_norm": 39.393266967150936, + "learning_rate": 9.560613905550828e-06, + "loss": 4.1882, + "step": 7780 + }, + { + "epoch": 0.6631722492116253, + "grad_norm": 51.952914099718676, + "learning_rate": 9.560410626620527e-06, + "loss": 4.6682, + "step": 7781 + }, + { + "epoch": 0.6632574789056507, + "grad_norm": 34.86960874906729, + "learning_rate": 9.5602073028404e-06, + "loss": 3.0262, + "step": 7782 + }, + { + "epoch": 0.6633427085996761, + "grad_norm": 56.85077891924106, + "learning_rate": 9.560003934212446e-06, + "loss": 3.5909, + "step": 7783 + }, + { + "epoch": 0.6634279382937015, + "grad_norm": 41.6099068830164, + "learning_rate": 9.559800520738664e-06, + "loss": 3.7271, + "step": 7784 + }, + { + "epoch": 0.663513167987727, + "grad_norm": 61.156010490258325, + "learning_rate": 9.559597062421057e-06, + "loss": 4.4992, + "step": 7785 + }, + { + "epoch": 0.6635983976817523, + "grad_norm": 60.56204530304145, + "learning_rate": 9.559393559261624e-06, + "loss": 4.3166, + "step": 7786 + }, + { + "epoch": 0.6636836273757777, + "grad_norm": 58.86501695646214, + "learning_rate": 9.55919001126237e-06, + "loss": 5.463, + "step": 7787 + }, + { + "epoch": 0.6637688570698032, + "grad_norm": 53.61506850276474, + "learning_rate": 9.55898641842529e-06, + "loss": 3.9291, + "step": 7788 + }, + { + "epoch": 0.6638540867638285, + "grad_norm": 63.77839922040786, + "learning_rate": 9.558782780752392e-06, + "loss": 3.7735, + "step": 7789 + }, + { + "epoch": 0.6639393164578539, + "grad_norm": 33.59018091439746, + "learning_rate": 9.558579098245676e-06, + "loss": 3.3511, + "step": 7790 + }, + { + "epoch": 0.6640245461518793, + "grad_norm": 36.19188178757662, + "learning_rate": 9.558375370907145e-06, + "loss": 3.0742, + "step": 7791 + }, + { + "epoch": 0.6641097758459047, + "grad_norm": 41.4154377026421, + "learning_rate": 9.558171598738805e-06, + "loss": 3.786, + "step": 7792 + }, + { + "epoch": 0.6641950055399302, + "grad_norm": 35.22178323519945, + "learning_rate": 9.557967781742659e-06, + "loss": 3.9772, + "step": 7793 + }, + { + "epoch": 0.6642802352339555, + "grad_norm": 31.96969032219862, + "learning_rate": 9.55776391992071e-06, + "loss": 3.3346, + "step": 7794 + }, + { + "epoch": 0.6643654649279809, + "grad_norm": 124.23083354363025, + "learning_rate": 9.557560013274965e-06, + "loss": 4.2995, + "step": 7795 + }, + { + "epoch": 0.6644506946220063, + "grad_norm": 32.865773283097354, + "learning_rate": 9.557356061807427e-06, + "loss": 2.7988, + "step": 7796 + }, + { + "epoch": 0.6645359243160317, + "grad_norm": 86.47529053316029, + "learning_rate": 9.557152065520103e-06, + "loss": 3.8818, + "step": 7797 + }, + { + "epoch": 0.6646211540100571, + "grad_norm": 73.24201037962587, + "learning_rate": 9.556948024415001e-06, + "loss": 3.7211, + "step": 7798 + }, + { + "epoch": 0.6647063837040825, + "grad_norm": 48.984135385325025, + "learning_rate": 9.556743938494125e-06, + "loss": 4.7045, + "step": 7799 + }, + { + "epoch": 0.6647916133981079, + "grad_norm": 60.57811923759839, + "learning_rate": 9.556539807759483e-06, + "loss": 4.4311, + "step": 7800 + }, + { + "epoch": 0.6648768430921334, + "grad_norm": 46.062499770855396, + "learning_rate": 9.556335632213082e-06, + "loss": 4.4156, + "step": 7801 + }, + { + "epoch": 0.6649620727861587, + "grad_norm": 65.28322669052496, + "learning_rate": 9.556131411856931e-06, + "loss": 4.0312, + "step": 7802 + }, + { + "epoch": 0.6650473024801841, + "grad_norm": 41.1659951038776, + "learning_rate": 9.555927146693038e-06, + "loss": 2.9822, + "step": 7803 + }, + { + "epoch": 0.6651325321742095, + "grad_norm": 28.017797660310983, + "learning_rate": 9.555722836723412e-06, + "loss": 2.403, + "step": 7804 + }, + { + "epoch": 0.6652177618682349, + "grad_norm": 75.48708971851978, + "learning_rate": 9.555518481950062e-06, + "loss": 4.0126, + "step": 7805 + }, + { + "epoch": 0.6653029915622602, + "grad_norm": 49.58948088223626, + "learning_rate": 9.555314082374997e-06, + "loss": 4.3574, + "step": 7806 + }, + { + "epoch": 0.6653882212562857, + "grad_norm": 25.610411863367354, + "learning_rate": 9.555109638000231e-06, + "loss": 3.508, + "step": 7807 + }, + { + "epoch": 0.6654734509503111, + "grad_norm": 51.66016419371234, + "learning_rate": 9.554905148827769e-06, + "loss": 4.0176, + "step": 7808 + }, + { + "epoch": 0.6655586806443365, + "grad_norm": 47.85536092610096, + "learning_rate": 9.554700614859626e-06, + "loss": 3.8504, + "step": 7809 + }, + { + "epoch": 0.6656439103383619, + "grad_norm": 115.89461563137655, + "learning_rate": 9.55449603609781e-06, + "loss": 4.5871, + "step": 7810 + }, + { + "epoch": 0.6657291400323873, + "grad_norm": 49.57515835627192, + "learning_rate": 9.554291412544339e-06, + "loss": 3.1267, + "step": 7811 + }, + { + "epoch": 0.6658143697264127, + "grad_norm": 54.73466834600576, + "learning_rate": 9.55408674420122e-06, + "loss": 4.622, + "step": 7812 + }, + { + "epoch": 0.6658995994204381, + "grad_norm": 92.16174631885114, + "learning_rate": 9.553882031070467e-06, + "loss": 5.7841, + "step": 7813 + }, + { + "epoch": 0.6659848291144634, + "grad_norm": 54.43471319129703, + "learning_rate": 9.553677273154094e-06, + "loss": 4.0424, + "step": 7814 + }, + { + "epoch": 0.6660700588084889, + "grad_norm": 48.40543348173017, + "learning_rate": 9.553472470454114e-06, + "loss": 3.9437, + "step": 7815 + }, + { + "epoch": 0.6661552885025143, + "grad_norm": 59.26865575971614, + "learning_rate": 9.553267622972545e-06, + "loss": 4.0376, + "step": 7816 + }, + { + "epoch": 0.6662405181965396, + "grad_norm": 42.209676632580496, + "learning_rate": 9.553062730711393e-06, + "loss": 3.6524, + "step": 7817 + }, + { + "epoch": 0.6663257478905651, + "grad_norm": 59.17652345348131, + "learning_rate": 9.55285779367268e-06, + "loss": 4.6911, + "step": 7818 + }, + { + "epoch": 0.6664109775845904, + "grad_norm": 115.64823879699856, + "learning_rate": 9.552652811858421e-06, + "loss": 5.1613, + "step": 7819 + }, + { + "epoch": 0.6664962072786159, + "grad_norm": 80.60186380863587, + "learning_rate": 9.55244778527063e-06, + "loss": 4.158, + "step": 7820 + }, + { + "epoch": 0.6665814369726413, + "grad_norm": 65.25491227098601, + "learning_rate": 9.552242713911324e-06, + "loss": 5.0077, + "step": 7821 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 35.27303829368458, + "learning_rate": 9.55203759778252e-06, + "loss": 3.3631, + "step": 7822 + }, + { + "epoch": 0.6667518963606921, + "grad_norm": 37.10161052893934, + "learning_rate": 9.551832436886236e-06, + "loss": 3.6825, + "step": 7823 + }, + { + "epoch": 0.6668371260547175, + "grad_norm": 44.677714973661345, + "learning_rate": 9.551627231224488e-06, + "loss": 3.223, + "step": 7824 + }, + { + "epoch": 0.6669223557487428, + "grad_norm": 62.84435923278618, + "learning_rate": 9.551421980799294e-06, + "loss": 3.8102, + "step": 7825 + }, + { + "epoch": 0.6670075854427683, + "grad_norm": 155.99270882899012, + "learning_rate": 9.551216685612674e-06, + "loss": 5.0715, + "step": 7826 + }, + { + "epoch": 0.6670928151367936, + "grad_norm": 470.51010571623453, + "learning_rate": 9.551011345666645e-06, + "loss": 4.9871, + "step": 7827 + }, + { + "epoch": 0.6671780448308191, + "grad_norm": 57.01092399764411, + "learning_rate": 9.55080596096323e-06, + "loss": 3.319, + "step": 7828 + }, + { + "epoch": 0.6672632745248445, + "grad_norm": 34.70252154959769, + "learning_rate": 9.550600531504446e-06, + "loss": 3.4124, + "step": 7829 + }, + { + "epoch": 0.6673485042188698, + "grad_norm": 78.76707050335808, + "learning_rate": 9.550395057292311e-06, + "loss": 4.7086, + "step": 7830 + }, + { + "epoch": 0.6674337339128953, + "grad_norm": 52.660487054633975, + "learning_rate": 9.550189538328854e-06, + "loss": 3.9852, + "step": 7831 + }, + { + "epoch": 0.6675189636069206, + "grad_norm": 62.16053781305122, + "learning_rate": 9.549983974616087e-06, + "loss": 5.1263, + "step": 7832 + }, + { + "epoch": 0.667604193300946, + "grad_norm": 91.22207287059094, + "learning_rate": 9.549778366156037e-06, + "loss": 5.541, + "step": 7833 + }, + { + "epoch": 0.6676894229949715, + "grad_norm": 33.35619917386741, + "learning_rate": 9.549572712950726e-06, + "loss": 2.9165, + "step": 7834 + }, + { + "epoch": 0.6677746526889968, + "grad_norm": 47.997838116610104, + "learning_rate": 9.549367015002173e-06, + "loss": 4.5782, + "step": 7835 + }, + { + "epoch": 0.6678598823830223, + "grad_norm": 57.99326751883074, + "learning_rate": 9.549161272312404e-06, + "loss": 4.2499, + "step": 7836 + }, + { + "epoch": 0.6679451120770477, + "grad_norm": 53.80407918374351, + "learning_rate": 9.548955484883441e-06, + "loss": 4.232, + "step": 7837 + }, + { + "epoch": 0.668030341771073, + "grad_norm": 40.72874077378597, + "learning_rate": 9.548749652717309e-06, + "loss": 4.403, + "step": 7838 + }, + { + "epoch": 0.6681155714650985, + "grad_norm": 47.76050229692261, + "learning_rate": 9.548543775816033e-06, + "loss": 3.72, + "step": 7839 + }, + { + "epoch": 0.6682008011591238, + "grad_norm": 34.655764357286685, + "learning_rate": 9.548337854181634e-06, + "loss": 3.7908, + "step": 7840 + }, + { + "epoch": 0.6682860308531492, + "grad_norm": 49.94365492303414, + "learning_rate": 9.548131887816141e-06, + "loss": 3.3573, + "step": 7841 + }, + { + "epoch": 0.6683712605471747, + "grad_norm": 56.19620528123167, + "learning_rate": 9.547925876721578e-06, + "loss": 4.6343, + "step": 7842 + }, + { + "epoch": 0.6684564902412, + "grad_norm": 44.54802479423041, + "learning_rate": 9.54771982089997e-06, + "loss": 4.1002, + "step": 7843 + }, + { + "epoch": 0.6685417199352255, + "grad_norm": 96.32736382954882, + "learning_rate": 9.547513720353346e-06, + "loss": 5.666, + "step": 7844 + }, + { + "epoch": 0.6686269496292508, + "grad_norm": 35.93734103389517, + "learning_rate": 9.547307575083731e-06, + "loss": 3.8989, + "step": 7845 + }, + { + "epoch": 0.6687121793232762, + "grad_norm": 87.4760827221815, + "learning_rate": 9.547101385093151e-06, + "loss": 4.2263, + "step": 7846 + }, + { + "epoch": 0.6687974090173017, + "grad_norm": 82.48164187332303, + "learning_rate": 9.546895150383639e-06, + "loss": 5.7348, + "step": 7847 + }, + { + "epoch": 0.668882638711327, + "grad_norm": 123.88909243802308, + "learning_rate": 9.546688870957218e-06, + "loss": 3.9409, + "step": 7848 + }, + { + "epoch": 0.6689678684053524, + "grad_norm": 41.67211192519025, + "learning_rate": 9.546482546815919e-06, + "loss": 3.4979, + "step": 7849 + }, + { + "epoch": 0.6690530980993779, + "grad_norm": 70.8814638801749, + "learning_rate": 9.546276177961771e-06, + "loss": 3.2551, + "step": 7850 + }, + { + "epoch": 0.6691383277934032, + "grad_norm": 35.87136495794269, + "learning_rate": 9.546069764396804e-06, + "loss": 4.037, + "step": 7851 + }, + { + "epoch": 0.6692235574874286, + "grad_norm": 53.44492767742284, + "learning_rate": 9.545863306123047e-06, + "loss": 4.5096, + "step": 7852 + }, + { + "epoch": 0.669308787181454, + "grad_norm": 109.34701553065905, + "learning_rate": 9.54565680314253e-06, + "loss": 4.1701, + "step": 7853 + }, + { + "epoch": 0.6693940168754794, + "grad_norm": 68.35623569780287, + "learning_rate": 9.545450255457285e-06, + "loss": 4.9585, + "step": 7854 + }, + { + "epoch": 0.6694792465695049, + "grad_norm": 106.63330917853568, + "learning_rate": 9.545243663069342e-06, + "loss": 4.3592, + "step": 7855 + }, + { + "epoch": 0.6695644762635302, + "grad_norm": 34.530485638569814, + "learning_rate": 9.545037025980734e-06, + "loss": 3.5311, + "step": 7856 + }, + { + "epoch": 0.6696497059575556, + "grad_norm": 82.9572160787848, + "learning_rate": 9.544830344193494e-06, + "loss": 5.0262, + "step": 7857 + }, + { + "epoch": 0.669734935651581, + "grad_norm": 137.65307161044603, + "learning_rate": 9.544623617709652e-06, + "loss": 4.7137, + "step": 7858 + }, + { + "epoch": 0.6698201653456064, + "grad_norm": 47.50101620699166, + "learning_rate": 9.544416846531245e-06, + "loss": 3.8005, + "step": 7859 + }, + { + "epoch": 0.6699053950396318, + "grad_norm": 28.3716452595639, + "learning_rate": 9.544210030660302e-06, + "loss": 2.2592, + "step": 7860 + }, + { + "epoch": 0.6699906247336572, + "grad_norm": 68.79005756116473, + "learning_rate": 9.54400317009886e-06, + "loss": 5.9795, + "step": 7861 + }, + { + "epoch": 0.6700758544276826, + "grad_norm": 61.28004615018663, + "learning_rate": 9.543796264848953e-06, + "loss": 5.0595, + "step": 7862 + }, + { + "epoch": 0.670161084121708, + "grad_norm": 31.50568075221389, + "learning_rate": 9.543589314912614e-06, + "loss": 3.8053, + "step": 7863 + }, + { + "epoch": 0.6702463138157334, + "grad_norm": 69.30272295480894, + "learning_rate": 9.543382320291878e-06, + "loss": 4.2378, + "step": 7864 + }, + { + "epoch": 0.6703315435097588, + "grad_norm": 48.58356009958245, + "learning_rate": 9.543175280988786e-06, + "loss": 2.6241, + "step": 7865 + }, + { + "epoch": 0.6704167732037842, + "grad_norm": 106.90948588266413, + "learning_rate": 9.542968197005366e-06, + "loss": 3.0861, + "step": 7866 + }, + { + "epoch": 0.6705020028978096, + "grad_norm": 48.84550045878579, + "learning_rate": 9.542761068343663e-06, + "loss": 4.8381, + "step": 7867 + }, + { + "epoch": 0.670587232591835, + "grad_norm": 73.8010594947492, + "learning_rate": 9.542553895005709e-06, + "loss": 4.8661, + "step": 7868 + }, + { + "epoch": 0.6706724622858604, + "grad_norm": 37.03552551849622, + "learning_rate": 9.542346676993542e-06, + "loss": 4.1576, + "step": 7869 + }, + { + "epoch": 0.6707576919798858, + "grad_norm": 100.45051381927112, + "learning_rate": 9.5421394143092e-06, + "loss": 3.0949, + "step": 7870 + }, + { + "epoch": 0.6708429216739112, + "grad_norm": 47.36361185214298, + "learning_rate": 9.541932106954722e-06, + "loss": 4.3793, + "step": 7871 + }, + { + "epoch": 0.6709281513679366, + "grad_norm": 30.680493474114563, + "learning_rate": 9.541724754932147e-06, + "loss": 2.7013, + "step": 7872 + }, + { + "epoch": 0.671013381061962, + "grad_norm": 59.87391964353782, + "learning_rate": 9.541517358243513e-06, + "loss": 4.4581, + "step": 7873 + }, + { + "epoch": 0.6710986107559874, + "grad_norm": 32.84915424103605, + "learning_rate": 9.541309916890859e-06, + "loss": 3.7169, + "step": 7874 + }, + { + "epoch": 0.6711838404500128, + "grad_norm": 51.865735952237266, + "learning_rate": 9.541102430876229e-06, + "loss": 3.8113, + "step": 7875 + }, + { + "epoch": 0.6712690701440381, + "grad_norm": 40.61544605697701, + "learning_rate": 9.54089490020166e-06, + "loss": 4.2478, + "step": 7876 + }, + { + "epoch": 0.6713542998380636, + "grad_norm": 32.004542907137505, + "learning_rate": 9.540687324869195e-06, + "loss": 2.1633, + "step": 7877 + }, + { + "epoch": 0.671439529532089, + "grad_norm": 39.48392617310955, + "learning_rate": 9.540479704880873e-06, + "loss": 4.5008, + "step": 7878 + }, + { + "epoch": 0.6715247592261144, + "grad_norm": 49.521025474375506, + "learning_rate": 9.540272040238738e-06, + "loss": 4.8589, + "step": 7879 + }, + { + "epoch": 0.6716099889201398, + "grad_norm": 53.381999835725495, + "learning_rate": 9.540064330944831e-06, + "loss": 3.4656, + "step": 7880 + }, + { + "epoch": 0.6716952186141651, + "grad_norm": 43.92060443959247, + "learning_rate": 9.539856577001196e-06, + "loss": 4.4858, + "step": 7881 + }, + { + "epoch": 0.6717804483081906, + "grad_norm": 117.13722359522771, + "learning_rate": 9.539648778409877e-06, + "loss": 5.1622, + "step": 7882 + }, + { + "epoch": 0.671865678002216, + "grad_norm": 44.97884473400457, + "learning_rate": 9.539440935172914e-06, + "loss": 4.3683, + "step": 7883 + }, + { + "epoch": 0.6719509076962413, + "grad_norm": 33.16314125282699, + "learning_rate": 9.539233047292354e-06, + "loss": 4.0037, + "step": 7884 + }, + { + "epoch": 0.6720361373902668, + "grad_norm": 65.76455542736389, + "learning_rate": 9.53902511477024e-06, + "loss": 4.7385, + "step": 7885 + }, + { + "epoch": 0.6721213670842922, + "grad_norm": 33.83816712162378, + "learning_rate": 9.53881713760862e-06, + "loss": 3.665, + "step": 7886 + }, + { + "epoch": 0.6722065967783175, + "grad_norm": 31.343533538627543, + "learning_rate": 9.538609115809535e-06, + "loss": 3.4201, + "step": 7887 + }, + { + "epoch": 0.672291826472343, + "grad_norm": 30.755044588084576, + "learning_rate": 9.538401049375032e-06, + "loss": 1.4578, + "step": 7888 + }, + { + "epoch": 0.6723770561663683, + "grad_norm": 31.53306392534077, + "learning_rate": 9.538192938307159e-06, + "loss": 3.2243, + "step": 7889 + }, + { + "epoch": 0.6724622858603938, + "grad_norm": 47.92337000397289, + "learning_rate": 9.537984782607962e-06, + "loss": 4.1434, + "step": 7890 + }, + { + "epoch": 0.6725475155544192, + "grad_norm": 81.21086551727025, + "learning_rate": 9.537776582279487e-06, + "loss": 5.2443, + "step": 7891 + }, + { + "epoch": 0.6726327452484445, + "grad_norm": 39.3560399082948, + "learning_rate": 9.537568337323784e-06, + "loss": 2.9984, + "step": 7892 + }, + { + "epoch": 0.67271797494247, + "grad_norm": 35.73513279809078, + "learning_rate": 9.537360047742899e-06, + "loss": 4.0905, + "step": 7893 + }, + { + "epoch": 0.6728032046364953, + "grad_norm": 29.693149415947655, + "learning_rate": 9.53715171353888e-06, + "loss": 2.2891, + "step": 7894 + }, + { + "epoch": 0.6728884343305207, + "grad_norm": 153.12891915540683, + "learning_rate": 9.536943334713778e-06, + "loss": 5.2495, + "step": 7895 + }, + { + "epoch": 0.6729736640245462, + "grad_norm": 84.46920742034867, + "learning_rate": 9.53673491126964e-06, + "loss": 4.0256, + "step": 7896 + }, + { + "epoch": 0.6730588937185715, + "grad_norm": 47.41116793248533, + "learning_rate": 9.536526443208516e-06, + "loss": 3.5563, + "step": 7897 + }, + { + "epoch": 0.673144123412597, + "grad_norm": 63.752385899831445, + "learning_rate": 9.53631793053246e-06, + "loss": 4.7459, + "step": 7898 + }, + { + "epoch": 0.6732293531066224, + "grad_norm": 48.26421347045388, + "learning_rate": 9.536109373243518e-06, + "loss": 4.0786, + "step": 7899 + }, + { + "epoch": 0.6733145828006477, + "grad_norm": 69.98162307719672, + "learning_rate": 9.535900771343743e-06, + "loss": 3.8437, + "step": 7900 + }, + { + "epoch": 0.6733998124946732, + "grad_norm": 44.4133936294714, + "learning_rate": 9.535692124835184e-06, + "loss": 3.9564, + "step": 7901 + }, + { + "epoch": 0.6734850421886985, + "grad_norm": 36.80613453268677, + "learning_rate": 9.5354834337199e-06, + "loss": 3.5261, + "step": 7902 + }, + { + "epoch": 0.6735702718827239, + "grad_norm": 35.30838013433798, + "learning_rate": 9.535274697999935e-06, + "loss": 3.4412, + "step": 7903 + }, + { + "epoch": 0.6736555015767494, + "grad_norm": 60.97728205356374, + "learning_rate": 9.535065917677347e-06, + "loss": 3.4018, + "step": 7904 + }, + { + "epoch": 0.6737407312707747, + "grad_norm": 77.75331641298473, + "learning_rate": 9.534857092754187e-06, + "loss": 4.4134, + "step": 7905 + }, + { + "epoch": 0.6738259609648002, + "grad_norm": 122.98137570366295, + "learning_rate": 9.534648223232509e-06, + "loss": 5.6992, + "step": 7906 + }, + { + "epoch": 0.6739111906588255, + "grad_norm": 46.230459036552695, + "learning_rate": 9.534439309114369e-06, + "loss": 2.879, + "step": 7907 + }, + { + "epoch": 0.6739964203528509, + "grad_norm": 38.961382413976686, + "learning_rate": 9.53423035040182e-06, + "loss": 3.7901, + "step": 7908 + }, + { + "epoch": 0.6740816500468764, + "grad_norm": 69.3297253385392, + "learning_rate": 9.534021347096915e-06, + "loss": 3.9931, + "step": 7909 + }, + { + "epoch": 0.6741668797409017, + "grad_norm": 33.92049953745993, + "learning_rate": 9.533812299201715e-06, + "loss": 2.5261, + "step": 7910 + }, + { + "epoch": 0.6742521094349271, + "grad_norm": 41.73856843246681, + "learning_rate": 9.53360320671827e-06, + "loss": 4.6164, + "step": 7911 + }, + { + "epoch": 0.6743373391289526, + "grad_norm": 107.95341717005178, + "learning_rate": 9.53339406964864e-06, + "loss": 4.5928, + "step": 7912 + }, + { + "epoch": 0.6744225688229779, + "grad_norm": 38.76604961838977, + "learning_rate": 9.533184887994881e-06, + "loss": 4.4262, + "step": 7913 + }, + { + "epoch": 0.6745077985170034, + "grad_norm": 42.42986560783847, + "learning_rate": 9.532975661759049e-06, + "loss": 3.6029, + "step": 7914 + }, + { + "epoch": 0.6745930282110287, + "grad_norm": 42.90972950803116, + "learning_rate": 9.532766390943205e-06, + "loss": 3.2887, + "step": 7915 + }, + { + "epoch": 0.6746782579050541, + "grad_norm": 80.7846265892805, + "learning_rate": 9.532557075549403e-06, + "loss": 2.6657, + "step": 7916 + }, + { + "epoch": 0.6747634875990796, + "grad_norm": 47.98216736719257, + "learning_rate": 9.532347715579702e-06, + "loss": 4.0717, + "step": 7917 + }, + { + "epoch": 0.6748487172931049, + "grad_norm": 92.72515457595634, + "learning_rate": 9.532138311036164e-06, + "loss": 4.8909, + "step": 7918 + }, + { + "epoch": 0.6749339469871303, + "grad_norm": 33.68084001638652, + "learning_rate": 9.531928861920847e-06, + "loss": 2.7737, + "step": 7919 + }, + { + "epoch": 0.6750191766811557, + "grad_norm": 77.96672572074971, + "learning_rate": 9.53171936823581e-06, + "loss": 5.5873, + "step": 7920 + }, + { + "epoch": 0.6751044063751811, + "grad_norm": 38.87806256473002, + "learning_rate": 9.531509829983112e-06, + "loss": 3.7542, + "step": 7921 + }, + { + "epoch": 0.6751896360692066, + "grad_norm": 152.4942164502101, + "learning_rate": 9.531300247164817e-06, + "loss": 4.5985, + "step": 7922 + }, + { + "epoch": 0.6752748657632319, + "grad_norm": 59.356629543643926, + "learning_rate": 9.531090619782983e-06, + "loss": 4.4753, + "step": 7923 + }, + { + "epoch": 0.6753600954572573, + "grad_norm": 33.443035342602926, + "learning_rate": 9.530880947839676e-06, + "loss": 2.9863, + "step": 7924 + }, + { + "epoch": 0.6754453251512827, + "grad_norm": 59.58955868276113, + "learning_rate": 9.530671231336954e-06, + "loss": 5.4282, + "step": 7925 + }, + { + "epoch": 0.6755305548453081, + "grad_norm": 38.596062531254574, + "learning_rate": 9.53046147027688e-06, + "loss": 3.1477, + "step": 7926 + }, + { + "epoch": 0.6756157845393335, + "grad_norm": 21.76539335409951, + "learning_rate": 9.53025166466152e-06, + "loss": 1.9017, + "step": 7927 + }, + { + "epoch": 0.6757010142333589, + "grad_norm": 43.07705581220899, + "learning_rate": 9.530041814492932e-06, + "loss": 3.518, + "step": 7928 + }, + { + "epoch": 0.6757862439273843, + "grad_norm": 77.60002008536141, + "learning_rate": 9.529831919773182e-06, + "loss": 4.0415, + "step": 7929 + }, + { + "epoch": 0.6758714736214096, + "grad_norm": 39.7389035928205, + "learning_rate": 9.529621980504339e-06, + "loss": 3.7411, + "step": 7930 + }, + { + "epoch": 0.6759567033154351, + "grad_norm": 62.98252332557031, + "learning_rate": 9.529411996688462e-06, + "loss": 2.3863, + "step": 7931 + }, + { + "epoch": 0.6760419330094605, + "grad_norm": 69.23644201268264, + "learning_rate": 9.529201968327618e-06, + "loss": 5.109, + "step": 7932 + }, + { + "epoch": 0.6761271627034859, + "grad_norm": 48.36563787679989, + "learning_rate": 9.528991895423872e-06, + "loss": 3.8641, + "step": 7933 + }, + { + "epoch": 0.6762123923975113, + "grad_norm": 33.941312583896064, + "learning_rate": 9.528781777979288e-06, + "loss": 2.9572, + "step": 7934 + }, + { + "epoch": 0.6762976220915367, + "grad_norm": 35.59018153002368, + "learning_rate": 9.528571615995939e-06, + "loss": 3.8092, + "step": 7935 + }, + { + "epoch": 0.6763828517855621, + "grad_norm": 46.80133409602208, + "learning_rate": 9.528361409475886e-06, + "loss": 3.8595, + "step": 7936 + }, + { + "epoch": 0.6764680814795875, + "grad_norm": 51.45094949395174, + "learning_rate": 9.528151158421198e-06, + "loss": 3.6859, + "step": 7937 + }, + { + "epoch": 0.6765533111736128, + "grad_norm": 33.903972746875425, + "learning_rate": 9.527940862833942e-06, + "loss": 3.4486, + "step": 7938 + }, + { + "epoch": 0.6766385408676383, + "grad_norm": 53.16179325242175, + "learning_rate": 9.527730522716186e-06, + "loss": 2.7317, + "step": 7939 + }, + { + "epoch": 0.6767237705616637, + "grad_norm": 51.345345094755146, + "learning_rate": 9.52752013807e-06, + "loss": 4.6204, + "step": 7940 + }, + { + "epoch": 0.6768090002556891, + "grad_norm": 54.66187654763423, + "learning_rate": 9.527309708897455e-06, + "loss": 4.8967, + "step": 7941 + }, + { + "epoch": 0.6768942299497145, + "grad_norm": 80.49620958901292, + "learning_rate": 9.527099235200615e-06, + "loss": 4.1044, + "step": 7942 + }, + { + "epoch": 0.6769794596437398, + "grad_norm": 127.80922636389461, + "learning_rate": 9.526888716981556e-06, + "loss": 5.6883, + "step": 7943 + }, + { + "epoch": 0.6770646893377653, + "grad_norm": 43.72613494637724, + "learning_rate": 9.526678154242341e-06, + "loss": 3.815, + "step": 7944 + }, + { + "epoch": 0.6771499190317907, + "grad_norm": 71.46266529827062, + "learning_rate": 9.526467546985048e-06, + "loss": 6.0439, + "step": 7945 + }, + { + "epoch": 0.677235148725816, + "grad_norm": 35.789285698448495, + "learning_rate": 9.526256895211744e-06, + "loss": 4.0504, + "step": 7946 + }, + { + "epoch": 0.6773203784198415, + "grad_norm": 43.12853650293486, + "learning_rate": 9.526046198924503e-06, + "loss": 3.8155, + "step": 7947 + }, + { + "epoch": 0.6774056081138669, + "grad_norm": 63.45383938476634, + "learning_rate": 9.525835458125396e-06, + "loss": 4.4114, + "step": 7948 + }, + { + "epoch": 0.6774908378078923, + "grad_norm": 61.266334541798436, + "learning_rate": 9.525624672816496e-06, + "loss": 5.2561, + "step": 7949 + }, + { + "epoch": 0.6775760675019177, + "grad_norm": 44.16004366484192, + "learning_rate": 9.525413842999875e-06, + "loss": 3.7881, + "step": 7950 + }, + { + "epoch": 0.677661297195943, + "grad_norm": 35.832957970747785, + "learning_rate": 9.525202968677607e-06, + "loss": 3.462, + "step": 7951 + }, + { + "epoch": 0.6777465268899685, + "grad_norm": 31.787520338090296, + "learning_rate": 9.524992049851768e-06, + "loss": 2.872, + "step": 7952 + }, + { + "epoch": 0.6778317565839939, + "grad_norm": 30.640539669296963, + "learning_rate": 9.524781086524428e-06, + "loss": 2.1317, + "step": 7953 + }, + { + "epoch": 0.6779169862780192, + "grad_norm": 32.78941728091738, + "learning_rate": 9.524570078697663e-06, + "loss": 3.6124, + "step": 7954 + }, + { + "epoch": 0.6780022159720447, + "grad_norm": 74.0519869960754, + "learning_rate": 9.524359026373551e-06, + "loss": 2.9059, + "step": 7955 + }, + { + "epoch": 0.67808744566607, + "grad_norm": 60.05896197271119, + "learning_rate": 9.524147929554166e-06, + "loss": 3.9099, + "step": 7956 + }, + { + "epoch": 0.6781726753600955, + "grad_norm": 89.52579921825766, + "learning_rate": 9.523936788241584e-06, + "loss": 3.5606, + "step": 7957 + }, + { + "epoch": 0.6782579050541209, + "grad_norm": 99.72188187100599, + "learning_rate": 9.523725602437879e-06, + "loss": 5.5421, + "step": 7958 + }, + { + "epoch": 0.6783431347481462, + "grad_norm": 44.81425174993578, + "learning_rate": 9.523514372145133e-06, + "loss": 4.6684, + "step": 7959 + }, + { + "epoch": 0.6784283644421717, + "grad_norm": 35.93136876457581, + "learning_rate": 9.52330309736542e-06, + "loss": 3.656, + "step": 7960 + }, + { + "epoch": 0.678513594136197, + "grad_norm": 63.390494171130555, + "learning_rate": 9.523091778100817e-06, + "loss": 4.2834, + "step": 7961 + }, + { + "epoch": 0.6785988238302224, + "grad_norm": 35.32741114919132, + "learning_rate": 9.522880414353404e-06, + "loss": 3.4128, + "step": 7962 + }, + { + "epoch": 0.6786840535242479, + "grad_norm": 36.929764993601076, + "learning_rate": 9.52266900612526e-06, + "loss": 3.6936, + "step": 7963 + }, + { + "epoch": 0.6787692832182732, + "grad_norm": 35.06815704774465, + "learning_rate": 9.522457553418463e-06, + "loss": 4.4341, + "step": 7964 + }, + { + "epoch": 0.6788545129122986, + "grad_norm": 36.72924843509507, + "learning_rate": 9.522246056235094e-06, + "loss": 3.5049, + "step": 7965 + }, + { + "epoch": 0.6789397426063241, + "grad_norm": 74.39030979159607, + "learning_rate": 9.522034514577231e-06, + "loss": 3.7184, + "step": 7966 + }, + { + "epoch": 0.6790249723003494, + "grad_norm": 70.20989186684213, + "learning_rate": 9.521822928446955e-06, + "loss": 5.2431, + "step": 7967 + }, + { + "epoch": 0.6791102019943749, + "grad_norm": 36.15902170112064, + "learning_rate": 9.52161129784635e-06, + "loss": 3.4769, + "step": 7968 + }, + { + "epoch": 0.6791954316884002, + "grad_norm": 60.78100393191295, + "learning_rate": 9.521399622777492e-06, + "loss": 3.5345, + "step": 7969 + }, + { + "epoch": 0.6792806613824256, + "grad_norm": 86.05925746269278, + "learning_rate": 9.521187903242466e-06, + "loss": 4.8661, + "step": 7970 + }, + { + "epoch": 0.6793658910764511, + "grad_norm": 45.74384084072793, + "learning_rate": 9.520976139243352e-06, + "loss": 4.9586, + "step": 7971 + }, + { + "epoch": 0.6794511207704764, + "grad_norm": 47.519507924310425, + "learning_rate": 9.520764330782236e-06, + "loss": 4.2765, + "step": 7972 + }, + { + "epoch": 0.6795363504645018, + "grad_norm": 83.3527380087607, + "learning_rate": 9.5205524778612e-06, + "loss": 4.5351, + "step": 7973 + }, + { + "epoch": 0.6796215801585272, + "grad_norm": 43.755409191419155, + "learning_rate": 9.520340580482325e-06, + "loss": 3.6015, + "step": 7974 + }, + { + "epoch": 0.6797068098525526, + "grad_norm": 52.80094471491085, + "learning_rate": 9.520128638647696e-06, + "loss": 4.7649, + "step": 7975 + }, + { + "epoch": 0.6797920395465781, + "grad_norm": 128.8847544207158, + "learning_rate": 9.5199166523594e-06, + "loss": 3.5213, + "step": 7976 + }, + { + "epoch": 0.6798772692406034, + "grad_norm": 56.41987399528185, + "learning_rate": 9.519704621619518e-06, + "loss": 3.8702, + "step": 7977 + }, + { + "epoch": 0.6799624989346288, + "grad_norm": 51.68536690639922, + "learning_rate": 9.519492546430139e-06, + "loss": 2.504, + "step": 7978 + }, + { + "epoch": 0.6800477286286543, + "grad_norm": 34.747554272871184, + "learning_rate": 9.519280426793345e-06, + "loss": 3.1402, + "step": 7979 + }, + { + "epoch": 0.6801329583226796, + "grad_norm": 49.940056001317906, + "learning_rate": 9.519068262711223e-06, + "loss": 4.03, + "step": 7980 + }, + { + "epoch": 0.680218188016705, + "grad_norm": 73.04615052605341, + "learning_rate": 9.518856054185862e-06, + "loss": 5.1355, + "step": 7981 + }, + { + "epoch": 0.6803034177107304, + "grad_norm": 38.571118079447736, + "learning_rate": 9.518643801219347e-06, + "loss": 4.36, + "step": 7982 + }, + { + "epoch": 0.6803886474047558, + "grad_norm": 97.0323819827978, + "learning_rate": 9.518431503813766e-06, + "loss": 4.2251, + "step": 7983 + }, + { + "epoch": 0.6804738770987813, + "grad_norm": 35.78780423086227, + "learning_rate": 9.518219161971206e-06, + "loss": 4.5592, + "step": 7984 + }, + { + "epoch": 0.6805591067928066, + "grad_norm": 55.99577507297835, + "learning_rate": 9.518006775693756e-06, + "loss": 2.7831, + "step": 7985 + }, + { + "epoch": 0.680644336486832, + "grad_norm": 51.43861183755728, + "learning_rate": 9.517794344983505e-06, + "loss": 3.8447, + "step": 7986 + }, + { + "epoch": 0.6807295661808574, + "grad_norm": 37.91859636541553, + "learning_rate": 9.517581869842541e-06, + "loss": 3.1742, + "step": 7987 + }, + { + "epoch": 0.6808147958748828, + "grad_norm": 35.384621052577245, + "learning_rate": 9.517369350272955e-06, + "loss": 3.573, + "step": 7988 + }, + { + "epoch": 0.6809000255689082, + "grad_norm": 45.17056396331386, + "learning_rate": 9.517156786276838e-06, + "loss": 2.5739, + "step": 7989 + }, + { + "epoch": 0.6809852552629336, + "grad_norm": 42.99952904915872, + "learning_rate": 9.516944177856276e-06, + "loss": 2.0932, + "step": 7990 + }, + { + "epoch": 0.681070484956959, + "grad_norm": 31.95877900142604, + "learning_rate": 9.516731525013365e-06, + "loss": 4.061, + "step": 7991 + }, + { + "epoch": 0.6811557146509845, + "grad_norm": 73.81122118699334, + "learning_rate": 9.516518827750193e-06, + "loss": 4.1193, + "step": 7992 + }, + { + "epoch": 0.6812409443450098, + "grad_norm": 100.73975258022617, + "learning_rate": 9.516306086068855e-06, + "loss": 2.2325, + "step": 7993 + }, + { + "epoch": 0.6813261740390352, + "grad_norm": 27.950713310276317, + "learning_rate": 9.516093299971439e-06, + "loss": 3.1392, + "step": 7994 + }, + { + "epoch": 0.6814114037330606, + "grad_norm": 41.263218509088496, + "learning_rate": 9.515880469460042e-06, + "loss": 4.4666, + "step": 7995 + }, + { + "epoch": 0.681496633427086, + "grad_norm": 34.658291574007976, + "learning_rate": 9.515667594536753e-06, + "loss": 2.5607, + "step": 7996 + }, + { + "epoch": 0.6815818631211114, + "grad_norm": 107.50458108355431, + "learning_rate": 9.515454675203669e-06, + "loss": 4.7864, + "step": 7997 + }, + { + "epoch": 0.6816670928151368, + "grad_norm": 31.29699960978538, + "learning_rate": 9.51524171146288e-06, + "loss": 3.9145, + "step": 7998 + }, + { + "epoch": 0.6817523225091622, + "grad_norm": 36.49845715362484, + "learning_rate": 9.515028703316484e-06, + "loss": 3.1588, + "step": 7999 + }, + { + "epoch": 0.6818375522031875, + "grad_norm": 182.88990402315332, + "learning_rate": 9.514815650766577e-06, + "loss": 4.391, + "step": 8000 + }, + { + "epoch": 0.681922781897213, + "grad_norm": 74.33189527086124, + "learning_rate": 9.514602553815251e-06, + "loss": 5.2853, + "step": 8001 + }, + { + "epoch": 0.6820080115912384, + "grad_norm": 94.95467463349975, + "learning_rate": 9.514389412464603e-06, + "loss": 1.7366, + "step": 8002 + }, + { + "epoch": 0.6820932412852638, + "grad_norm": 75.77844056143134, + "learning_rate": 9.514176226716727e-06, + "loss": 4.6652, + "step": 8003 + }, + { + "epoch": 0.6821784709792892, + "grad_norm": 39.30433824011933, + "learning_rate": 9.513962996573724e-06, + "loss": 3.7189, + "step": 8004 + }, + { + "epoch": 0.6822637006733145, + "grad_norm": 37.78048994796801, + "learning_rate": 9.513749722037687e-06, + "loss": 4.1048, + "step": 8005 + }, + { + "epoch": 0.68234893036734, + "grad_norm": 67.18585549638183, + "learning_rate": 9.513536403110716e-06, + "loss": 5.222, + "step": 8006 + }, + { + "epoch": 0.6824341600613654, + "grad_norm": 30.5564094194024, + "learning_rate": 9.513323039794908e-06, + "loss": 4.2553, + "step": 8007 + }, + { + "epoch": 0.6825193897553907, + "grad_norm": 81.6311830770425, + "learning_rate": 9.51310963209236e-06, + "loss": 5.0676, + "step": 8008 + }, + { + "epoch": 0.6826046194494162, + "grad_norm": 83.66406924411957, + "learning_rate": 9.512896180005173e-06, + "loss": 4.3523, + "step": 8009 + }, + { + "epoch": 0.6826898491434416, + "grad_norm": 54.64575858978978, + "learning_rate": 9.512682683535446e-06, + "loss": 4.6623, + "step": 8010 + }, + { + "epoch": 0.682775078837467, + "grad_norm": 32.297960039717296, + "learning_rate": 9.512469142685276e-06, + "loss": 3.6061, + "step": 8011 + }, + { + "epoch": 0.6828603085314924, + "grad_norm": 36.341289934408586, + "learning_rate": 9.512255557456768e-06, + "loss": 2.8635, + "step": 8012 + }, + { + "epoch": 0.6829455382255177, + "grad_norm": 56.64849647202185, + "learning_rate": 9.512041927852018e-06, + "loss": 4.1521, + "step": 8013 + }, + { + "epoch": 0.6830307679195432, + "grad_norm": 66.04177713053572, + "learning_rate": 9.511828253873128e-06, + "loss": 4.5457, + "step": 8014 + }, + { + "epoch": 0.6831159976135686, + "grad_norm": 45.88945054348178, + "learning_rate": 9.5116145355222e-06, + "loss": 4.5875, + "step": 8015 + }, + { + "epoch": 0.6832012273075939, + "grad_norm": 76.9685045274052, + "learning_rate": 9.511400772801338e-06, + "loss": 4.2416, + "step": 8016 + }, + { + "epoch": 0.6832864570016194, + "grad_norm": 55.49207199262145, + "learning_rate": 9.51118696571264e-06, + "loss": 4.7419, + "step": 8017 + }, + { + "epoch": 0.6833716866956447, + "grad_norm": 42.67376972000825, + "learning_rate": 9.510973114258211e-06, + "loss": 4.3213, + "step": 8018 + }, + { + "epoch": 0.6834569163896702, + "grad_norm": 169.82987704859224, + "learning_rate": 9.510759218440153e-06, + "loss": 4.3268, + "step": 8019 + }, + { + "epoch": 0.6835421460836956, + "grad_norm": 49.927543619758865, + "learning_rate": 9.510545278260572e-06, + "loss": 4.3768, + "step": 8020 + }, + { + "epoch": 0.6836273757777209, + "grad_norm": 40.2188606648637, + "learning_rate": 9.51033129372157e-06, + "loss": 3.3877, + "step": 8021 + }, + { + "epoch": 0.6837126054717464, + "grad_norm": 47.526017622191524, + "learning_rate": 9.510117264825253e-06, + "loss": 4.3494, + "step": 8022 + }, + { + "epoch": 0.6837978351657718, + "grad_norm": 47.666995897430965, + "learning_rate": 9.509903191573725e-06, + "loss": 4.1365, + "step": 8023 + }, + { + "epoch": 0.6838830648597971, + "grad_norm": 72.72448120400446, + "learning_rate": 9.509689073969088e-06, + "loss": 2.7062, + "step": 8024 + }, + { + "epoch": 0.6839682945538226, + "grad_norm": 40.5656456169838, + "learning_rate": 9.509474912013454e-06, + "loss": 4.0091, + "step": 8025 + }, + { + "epoch": 0.6840535242478479, + "grad_norm": 107.20200496930826, + "learning_rate": 9.509260705708925e-06, + "loss": 4.3081, + "step": 8026 + }, + { + "epoch": 0.6841387539418734, + "grad_norm": 41.712336164488484, + "learning_rate": 9.509046455057608e-06, + "loss": 4.2666, + "step": 8027 + }, + { + "epoch": 0.6842239836358988, + "grad_norm": 64.06635125265949, + "learning_rate": 9.508832160061612e-06, + "loss": 4.6419, + "step": 8028 + }, + { + "epoch": 0.6843092133299241, + "grad_norm": 32.815099570665005, + "learning_rate": 9.508617820723042e-06, + "loss": 3.7215, + "step": 8029 + }, + { + "epoch": 0.6843944430239496, + "grad_norm": 36.50661605984217, + "learning_rate": 9.50840343704401e-06, + "loss": 3.084, + "step": 8030 + }, + { + "epoch": 0.6844796727179749, + "grad_norm": 62.03888710222273, + "learning_rate": 9.508189009026618e-06, + "loss": 4.4997, + "step": 8031 + }, + { + "epoch": 0.6845649024120003, + "grad_norm": 42.562460761848186, + "learning_rate": 9.50797453667298e-06, + "loss": 3.4076, + "step": 8032 + }, + { + "epoch": 0.6846501321060258, + "grad_norm": 71.20418774948544, + "learning_rate": 9.507760019985207e-06, + "loss": 4.0468, + "step": 8033 + }, + { + "epoch": 0.6847353618000511, + "grad_norm": 31.8395302978539, + "learning_rate": 9.507545458965402e-06, + "loss": 2.7939, + "step": 8034 + }, + { + "epoch": 0.6848205914940766, + "grad_norm": 109.54982756907509, + "learning_rate": 9.507330853615679e-06, + "loss": 6.4037, + "step": 8035 + }, + { + "epoch": 0.684905821188102, + "grad_norm": 38.603646362557285, + "learning_rate": 9.507116203938147e-06, + "loss": 2.7984, + "step": 8036 + }, + { + "epoch": 0.6849910508821273, + "grad_norm": 171.07441145084832, + "learning_rate": 9.50690150993492e-06, + "loss": 5.4533, + "step": 8037 + }, + { + "epoch": 0.6850762805761528, + "grad_norm": 40.96773589389296, + "learning_rate": 9.506686771608107e-06, + "loss": 3.9646, + "step": 8038 + }, + { + "epoch": 0.6851615102701781, + "grad_norm": 33.551320550931706, + "learning_rate": 9.50647198895982e-06, + "loss": 3.1659, + "step": 8039 + }, + { + "epoch": 0.6852467399642035, + "grad_norm": 47.859922724649024, + "learning_rate": 9.506257161992171e-06, + "loss": 4.0439, + "step": 8040 + }, + { + "epoch": 0.685331969658229, + "grad_norm": 40.50722071821673, + "learning_rate": 9.506042290707274e-06, + "loss": 3.4814, + "step": 8041 + }, + { + "epoch": 0.6854171993522543, + "grad_norm": 41.415542929771355, + "learning_rate": 9.505827375107242e-06, + "loss": 3.9541, + "step": 8042 + }, + { + "epoch": 0.6855024290462797, + "grad_norm": 67.6177446608495, + "learning_rate": 9.505612415194189e-06, + "loss": 4.6539, + "step": 8043 + }, + { + "epoch": 0.6855876587403051, + "grad_norm": 34.99354441611127, + "learning_rate": 9.505397410970227e-06, + "loss": 2.9735, + "step": 8044 + }, + { + "epoch": 0.6856728884343305, + "grad_norm": 35.925011106441225, + "learning_rate": 9.505182362437472e-06, + "loss": 4.2265, + "step": 8045 + }, + { + "epoch": 0.685758118128356, + "grad_norm": 68.67572522300259, + "learning_rate": 9.504967269598039e-06, + "loss": 4.0091, + "step": 8046 + }, + { + "epoch": 0.6858433478223813, + "grad_norm": 74.47597205443205, + "learning_rate": 9.504752132454043e-06, + "loss": 4.0398, + "step": 8047 + }, + { + "epoch": 0.6859285775164067, + "grad_norm": 33.38880103969242, + "learning_rate": 9.504536951007599e-06, + "loss": 2.9381, + "step": 8048 + }, + { + "epoch": 0.6860138072104321, + "grad_norm": 122.95440135761099, + "learning_rate": 9.504321725260825e-06, + "loss": 5.6658, + "step": 8049 + }, + { + "epoch": 0.6860990369044575, + "grad_norm": 72.56342318477405, + "learning_rate": 9.504106455215835e-06, + "loss": 3.9533, + "step": 8050 + }, + { + "epoch": 0.6861842665984829, + "grad_norm": 61.75960832088197, + "learning_rate": 9.50389114087475e-06, + "loss": 5.3909, + "step": 8051 + }, + { + "epoch": 0.6862694962925083, + "grad_norm": 48.04814180744228, + "learning_rate": 9.503675782239684e-06, + "loss": 4.6741, + "step": 8052 + }, + { + "epoch": 0.6863547259865337, + "grad_norm": 188.75972368397296, + "learning_rate": 9.503460379312757e-06, + "loss": 3.4677, + "step": 8053 + }, + { + "epoch": 0.6864399556805592, + "grad_norm": 36.54270573375048, + "learning_rate": 9.503244932096085e-06, + "loss": 3.6207, + "step": 8054 + }, + { + "epoch": 0.6865251853745845, + "grad_norm": 36.31227796311069, + "learning_rate": 9.50302944059179e-06, + "loss": 3.3475, + "step": 8055 + }, + { + "epoch": 0.6866104150686099, + "grad_norm": 53.00467112549898, + "learning_rate": 9.502813904801989e-06, + "loss": 3.3268, + "step": 8056 + }, + { + "epoch": 0.6866956447626353, + "grad_norm": 35.9870957413362, + "learning_rate": 9.502598324728804e-06, + "loss": 3.2781, + "step": 8057 + }, + { + "epoch": 0.6867808744566607, + "grad_norm": 41.81652881253951, + "learning_rate": 9.502382700374352e-06, + "loss": 3.5114, + "step": 8058 + }, + { + "epoch": 0.686866104150686, + "grad_norm": 87.58958243977625, + "learning_rate": 9.502167031740757e-06, + "loss": 4.6552, + "step": 8059 + }, + { + "epoch": 0.6869513338447115, + "grad_norm": 63.59994381446729, + "learning_rate": 9.501951318830136e-06, + "loss": 3.7471, + "step": 8060 + }, + { + "epoch": 0.6870365635387369, + "grad_norm": 52.22410511883605, + "learning_rate": 9.501735561644615e-06, + "loss": 3.1009, + "step": 8061 + }, + { + "epoch": 0.6871217932327623, + "grad_norm": 40.61491350867806, + "learning_rate": 9.501519760186311e-06, + "loss": 3.6993, + "step": 8062 + }, + { + "epoch": 0.6872070229267877, + "grad_norm": 111.65482662890265, + "learning_rate": 9.50130391445735e-06, + "loss": 5.1095, + "step": 8063 + }, + { + "epoch": 0.6872922526208131, + "grad_norm": 35.775727690257916, + "learning_rate": 9.501088024459855e-06, + "loss": 3.543, + "step": 8064 + }, + { + "epoch": 0.6873774823148385, + "grad_norm": 55.38670594922676, + "learning_rate": 9.500872090195945e-06, + "loss": 3.8318, + "step": 8065 + }, + { + "epoch": 0.6874627120088639, + "grad_norm": 65.15665211629067, + "learning_rate": 9.500656111667748e-06, + "loss": 4.2094, + "step": 8066 + }, + { + "epoch": 0.6875479417028892, + "grad_norm": 77.60672412336221, + "learning_rate": 9.500440088877388e-06, + "loss": 3.7488, + "step": 8067 + }, + { + "epoch": 0.6876331713969147, + "grad_norm": 76.59562447669242, + "learning_rate": 9.500224021826985e-06, + "loss": 5.7778, + "step": 8068 + }, + { + "epoch": 0.6877184010909401, + "grad_norm": 45.43539955346785, + "learning_rate": 9.50000791051867e-06, + "loss": 3.3592, + "step": 8069 + }, + { + "epoch": 0.6878036307849655, + "grad_norm": 42.552342767082536, + "learning_rate": 9.499791754954564e-06, + "loss": 3.2389, + "step": 8070 + }, + { + "epoch": 0.6878888604789909, + "grad_norm": 45.432323743664426, + "learning_rate": 9.499575555136792e-06, + "loss": 3.9434, + "step": 8071 + }, + { + "epoch": 0.6879740901730163, + "grad_norm": 37.027571862504495, + "learning_rate": 9.499359311067487e-06, + "loss": 3.496, + "step": 8072 + }, + { + "epoch": 0.6880593198670417, + "grad_norm": 36.57381774767836, + "learning_rate": 9.499143022748768e-06, + "loss": 3.6118, + "step": 8073 + }, + { + "epoch": 0.6881445495610671, + "grad_norm": 34.08381148365091, + "learning_rate": 9.498926690182767e-06, + "loss": 4.2885, + "step": 8074 + }, + { + "epoch": 0.6882297792550924, + "grad_norm": 27.527831524532335, + "learning_rate": 9.498710313371606e-06, + "loss": 3.7583, + "step": 8075 + }, + { + "epoch": 0.6883150089491179, + "grad_norm": 34.174427382172084, + "learning_rate": 9.498493892317419e-06, + "loss": 3.2604, + "step": 8076 + }, + { + "epoch": 0.6884002386431433, + "grad_norm": 22.999496500857404, + "learning_rate": 9.498277427022333e-06, + "loss": 1.9467, + "step": 8077 + }, + { + "epoch": 0.6884854683371686, + "grad_norm": 45.99190502865396, + "learning_rate": 9.498060917488476e-06, + "loss": 4.6808, + "step": 8078 + }, + { + "epoch": 0.6885706980311941, + "grad_norm": 50.58919007871622, + "learning_rate": 9.497844363717976e-06, + "loss": 4.4066, + "step": 8079 + }, + { + "epoch": 0.6886559277252194, + "grad_norm": 57.694916029162286, + "learning_rate": 9.497627765712964e-06, + "loss": 4.0448, + "step": 8080 + }, + { + "epoch": 0.6887411574192449, + "grad_norm": 66.75769131116964, + "learning_rate": 9.49741112347557e-06, + "loss": 4.2688, + "step": 8081 + }, + { + "epoch": 0.6888263871132703, + "grad_norm": 83.98960598560548, + "learning_rate": 9.497194437007924e-06, + "loss": 4.5399, + "step": 8082 + }, + { + "epoch": 0.6889116168072956, + "grad_norm": 50.869144494860514, + "learning_rate": 9.49697770631216e-06, + "loss": 2.182, + "step": 8083 + }, + { + "epoch": 0.6889968465013211, + "grad_norm": 40.88238331321252, + "learning_rate": 9.496760931390404e-06, + "loss": 4.0856, + "step": 8084 + }, + { + "epoch": 0.6890820761953464, + "grad_norm": 74.07880688403901, + "learning_rate": 9.496544112244795e-06, + "loss": 5.227, + "step": 8085 + }, + { + "epoch": 0.6891673058893718, + "grad_norm": 59.488670398944336, + "learning_rate": 9.49632724887746e-06, + "loss": 4.3349, + "step": 8086 + }, + { + "epoch": 0.6892525355833973, + "grad_norm": 171.25248559895556, + "learning_rate": 9.496110341290531e-06, + "loss": 3.4281, + "step": 8087 + }, + { + "epoch": 0.6893377652774226, + "grad_norm": 67.07690057779358, + "learning_rate": 9.495893389486147e-06, + "loss": 4.042, + "step": 8088 + }, + { + "epoch": 0.6894229949714481, + "grad_norm": 76.74929385461773, + "learning_rate": 9.495676393466435e-06, + "loss": 4.2155, + "step": 8089 + }, + { + "epoch": 0.6895082246654735, + "grad_norm": 50.63250663571482, + "learning_rate": 9.495459353233534e-06, + "loss": 4.6342, + "step": 8090 + }, + { + "epoch": 0.6895934543594988, + "grad_norm": 72.13467542659392, + "learning_rate": 9.495242268789575e-06, + "loss": 3.7602, + "step": 8091 + }, + { + "epoch": 0.6896786840535243, + "grad_norm": 48.34397938751454, + "learning_rate": 9.495025140136695e-06, + "loss": 3.0027, + "step": 8092 + }, + { + "epoch": 0.6897639137475496, + "grad_norm": 39.55896111330358, + "learning_rate": 9.494807967277028e-06, + "loss": 3.2207, + "step": 8093 + }, + { + "epoch": 0.689849143441575, + "grad_norm": 51.954659039725655, + "learning_rate": 9.494590750212714e-06, + "loss": 4.9083, + "step": 8094 + }, + { + "epoch": 0.6899343731356005, + "grad_norm": 33.18327127052087, + "learning_rate": 9.494373488945882e-06, + "loss": 3.5735, + "step": 8095 + }, + { + "epoch": 0.6900196028296258, + "grad_norm": 34.08755061209599, + "learning_rate": 9.494156183478676e-06, + "loss": 3.6732, + "step": 8096 + }, + { + "epoch": 0.6901048325236513, + "grad_norm": 36.681249460968274, + "learning_rate": 9.493938833813227e-06, + "loss": 4.339, + "step": 8097 + }, + { + "epoch": 0.6901900622176766, + "grad_norm": 41.152504067755594, + "learning_rate": 9.493721439951677e-06, + "loss": 3.7917, + "step": 8098 + }, + { + "epoch": 0.690275291911702, + "grad_norm": 63.63868093877939, + "learning_rate": 9.49350400189616e-06, + "loss": 4.9989, + "step": 8099 + }, + { + "epoch": 0.6903605216057275, + "grad_norm": 33.79992055138596, + "learning_rate": 9.493286519648819e-06, + "loss": 3.2142, + "step": 8100 + }, + { + "epoch": 0.6904457512997528, + "grad_norm": 46.107186611302645, + "learning_rate": 9.49306899321179e-06, + "loss": 3.4413, + "step": 8101 + }, + { + "epoch": 0.6905309809937782, + "grad_norm": 186.6186468697471, + "learning_rate": 9.492851422587212e-06, + "loss": 3.3881, + "step": 8102 + }, + { + "epoch": 0.6906162106878037, + "grad_norm": 202.8562556921126, + "learning_rate": 9.492633807777227e-06, + "loss": 5.5919, + "step": 8103 + }, + { + "epoch": 0.690701440381829, + "grad_norm": 40.73895694907775, + "learning_rate": 9.492416148783973e-06, + "loss": 3.7349, + "step": 8104 + }, + { + "epoch": 0.6907866700758545, + "grad_norm": 65.97846554518587, + "learning_rate": 9.49219844560959e-06, + "loss": 4.2595, + "step": 8105 + }, + { + "epoch": 0.6908718997698798, + "grad_norm": 34.64583538252963, + "learning_rate": 9.491980698256222e-06, + "loss": 3.9076, + "step": 8106 + }, + { + "epoch": 0.6909571294639052, + "grad_norm": 76.13112903043762, + "learning_rate": 9.491762906726009e-06, + "loss": 4.2707, + "step": 8107 + }, + { + "epoch": 0.6910423591579307, + "grad_norm": 75.65777161045753, + "learning_rate": 9.491545071021092e-06, + "loss": 4.2734, + "step": 8108 + }, + { + "epoch": 0.691127588851956, + "grad_norm": 48.39182234818627, + "learning_rate": 9.491327191143614e-06, + "loss": 4.3637, + "step": 8109 + }, + { + "epoch": 0.6912128185459814, + "grad_norm": 33.361420332614784, + "learning_rate": 9.491109267095717e-06, + "loss": 3.5936, + "step": 8110 + }, + { + "epoch": 0.6912980482400068, + "grad_norm": 95.26456513336417, + "learning_rate": 9.490891298879546e-06, + "loss": 3.5515, + "step": 8111 + }, + { + "epoch": 0.6913832779340322, + "grad_norm": 46.4989559723224, + "learning_rate": 9.490673286497243e-06, + "loss": 3.8651, + "step": 8112 + }, + { + "epoch": 0.6914685076280576, + "grad_norm": 30.905555657952274, + "learning_rate": 9.490455229950953e-06, + "loss": 3.4865, + "step": 8113 + }, + { + "epoch": 0.691553737322083, + "grad_norm": 100.51680683022275, + "learning_rate": 9.49023712924282e-06, + "loss": 4.9089, + "step": 8114 + }, + { + "epoch": 0.6916389670161084, + "grad_norm": 87.08670362195025, + "learning_rate": 9.490018984374989e-06, + "loss": 5.2858, + "step": 8115 + }, + { + "epoch": 0.6917241967101339, + "grad_norm": 46.79669691026494, + "learning_rate": 9.489800795349606e-06, + "loss": 3.3583, + "step": 8116 + }, + { + "epoch": 0.6918094264041592, + "grad_norm": 87.24496981978524, + "learning_rate": 9.489582562168815e-06, + "loss": 5.333, + "step": 8117 + }, + { + "epoch": 0.6918946560981846, + "grad_norm": 38.21488405048477, + "learning_rate": 9.489364284834765e-06, + "loss": 3.3867, + "step": 8118 + }, + { + "epoch": 0.69197988579221, + "grad_norm": 39.2004845042154, + "learning_rate": 9.4891459633496e-06, + "loss": 3.1787, + "step": 8119 + }, + { + "epoch": 0.6920651154862354, + "grad_norm": 77.15507479616758, + "learning_rate": 9.48892759771547e-06, + "loss": 3.773, + "step": 8120 + }, + { + "epoch": 0.6921503451802608, + "grad_norm": 91.3414043569405, + "learning_rate": 9.48870918793452e-06, + "loss": 4.029, + "step": 8121 + }, + { + "epoch": 0.6922355748742862, + "grad_norm": 49.53654782261805, + "learning_rate": 9.488490734008898e-06, + "loss": 4.3878, + "step": 8122 + }, + { + "epoch": 0.6923208045683116, + "grad_norm": 26.895771186589204, + "learning_rate": 9.488272235940752e-06, + "loss": 3.6495, + "step": 8123 + }, + { + "epoch": 0.692406034262337, + "grad_norm": 48.3289321833287, + "learning_rate": 9.488053693732235e-06, + "loss": 4.3988, + "step": 8124 + }, + { + "epoch": 0.6924912639563624, + "grad_norm": 59.416152616042964, + "learning_rate": 9.48783510738549e-06, + "loss": 3.337, + "step": 8125 + }, + { + "epoch": 0.6925764936503878, + "grad_norm": 152.22038813898283, + "learning_rate": 9.487616476902672e-06, + "loss": 4.7594, + "step": 8126 + }, + { + "epoch": 0.6926617233444132, + "grad_norm": 34.97845887651458, + "learning_rate": 9.48739780228593e-06, + "loss": 2.3901, + "step": 8127 + }, + { + "epoch": 0.6927469530384386, + "grad_norm": 44.19203804174917, + "learning_rate": 9.487179083537412e-06, + "loss": 4.3399, + "step": 8128 + }, + { + "epoch": 0.6928321827324639, + "grad_norm": 29.107944490996402, + "learning_rate": 9.486960320659272e-06, + "loss": 2.503, + "step": 8129 + }, + { + "epoch": 0.6929174124264894, + "grad_norm": 68.67596567097613, + "learning_rate": 9.486741513653658e-06, + "loss": 3.8878, + "step": 8130 + }, + { + "epoch": 0.6930026421205148, + "grad_norm": 33.903522903795526, + "learning_rate": 9.486522662522727e-06, + "loss": 4.2474, + "step": 8131 + }, + { + "epoch": 0.6930878718145402, + "grad_norm": 47.63787456976611, + "learning_rate": 9.486303767268628e-06, + "loss": 3.6018, + "step": 8132 + }, + { + "epoch": 0.6931731015085656, + "grad_norm": 46.91266402344938, + "learning_rate": 9.486084827893513e-06, + "loss": 3.6416, + "step": 8133 + }, + { + "epoch": 0.693258331202591, + "grad_norm": 27.796382894464053, + "learning_rate": 9.485865844399535e-06, + "loss": 3.9006, + "step": 8134 + }, + { + "epoch": 0.6933435608966164, + "grad_norm": 38.70283301372435, + "learning_rate": 9.485646816788852e-06, + "loss": 3.9888, + "step": 8135 + }, + { + "epoch": 0.6934287905906418, + "grad_norm": 50.98521707104101, + "learning_rate": 9.485427745063614e-06, + "loss": 4.3211, + "step": 8136 + }, + { + "epoch": 0.6935140202846671, + "grad_norm": 35.39945842709776, + "learning_rate": 9.485208629225974e-06, + "loss": 3.1549, + "step": 8137 + }, + { + "epoch": 0.6935992499786926, + "grad_norm": 105.77460630636817, + "learning_rate": 9.484989469278092e-06, + "loss": 4.0231, + "step": 8138 + }, + { + "epoch": 0.693684479672718, + "grad_norm": 43.19533128259435, + "learning_rate": 9.48477026522212e-06, + "loss": 4.0699, + "step": 8139 + }, + { + "epoch": 0.6937697093667434, + "grad_norm": 67.36576846599448, + "learning_rate": 9.484551017060216e-06, + "loss": 4.269, + "step": 8140 + }, + { + "epoch": 0.6938549390607688, + "grad_norm": 89.50677598140759, + "learning_rate": 9.48433172479453e-06, + "loss": 3.8762, + "step": 8141 + }, + { + "epoch": 0.6939401687547941, + "grad_norm": 60.581943510450586, + "learning_rate": 9.48411238842723e-06, + "loss": 3.8746, + "step": 8142 + }, + { + "epoch": 0.6940253984488196, + "grad_norm": 67.93548573647037, + "learning_rate": 9.483893007960461e-06, + "loss": 3.796, + "step": 8143 + }, + { + "epoch": 0.694110628142845, + "grad_norm": 35.88685500531902, + "learning_rate": 9.483673583396388e-06, + "loss": 3.6953, + "step": 8144 + }, + { + "epoch": 0.6941958578368703, + "grad_norm": 115.70749201891645, + "learning_rate": 9.483454114737168e-06, + "loss": 6.0368, + "step": 8145 + }, + { + "epoch": 0.6942810875308958, + "grad_norm": 40.83843758754605, + "learning_rate": 9.483234601984956e-06, + "loss": 4.2064, + "step": 8146 + }, + { + "epoch": 0.6943663172249211, + "grad_norm": 151.54367646410776, + "learning_rate": 9.483015045141914e-06, + "loss": 5.465, + "step": 8147 + }, + { + "epoch": 0.6944515469189466, + "grad_norm": 38.38647768039055, + "learning_rate": 9.482795444210201e-06, + "loss": 3.7771, + "step": 8148 + }, + { + "epoch": 0.694536776612972, + "grad_norm": 56.25208729905117, + "learning_rate": 9.482575799191977e-06, + "loss": 4.4363, + "step": 8149 + }, + { + "epoch": 0.6946220063069973, + "grad_norm": 45.59384131594192, + "learning_rate": 9.482356110089401e-06, + "loss": 3.7072, + "step": 8150 + }, + { + "epoch": 0.6947072360010228, + "grad_norm": 74.46361629284962, + "learning_rate": 9.482136376904632e-06, + "loss": 4.3788, + "step": 8151 + }, + { + "epoch": 0.6947924656950482, + "grad_norm": 91.2073254950069, + "learning_rate": 9.481916599639835e-06, + "loss": 4.5502, + "step": 8152 + }, + { + "epoch": 0.6948776953890735, + "grad_norm": 65.72559108731858, + "learning_rate": 9.481696778297167e-06, + "loss": 5.2371, + "step": 8153 + }, + { + "epoch": 0.694962925083099, + "grad_norm": 142.2940663685687, + "learning_rate": 9.481476912878793e-06, + "loss": 4.9835, + "step": 8154 + }, + { + "epoch": 0.6950481547771243, + "grad_norm": 59.90604597873836, + "learning_rate": 9.481257003386874e-06, + "loss": 3.2862, + "step": 8155 + }, + { + "epoch": 0.6951333844711497, + "grad_norm": 115.32336507126792, + "learning_rate": 9.481037049823573e-06, + "loss": 4.822, + "step": 8156 + }, + { + "epoch": 0.6952186141651752, + "grad_norm": 59.49172021058181, + "learning_rate": 9.480817052191054e-06, + "loss": 5.7161, + "step": 8157 + }, + { + "epoch": 0.6953038438592005, + "grad_norm": 54.1775185962298, + "learning_rate": 9.48059701049148e-06, + "loss": 3.9635, + "step": 8158 + }, + { + "epoch": 0.695389073553226, + "grad_norm": 102.67669882108594, + "learning_rate": 9.480376924727013e-06, + "loss": 5.743, + "step": 8159 + }, + { + "epoch": 0.6954743032472513, + "grad_norm": 60.69173182949884, + "learning_rate": 9.480156794899822e-06, + "loss": 4.295, + "step": 8160 + }, + { + "epoch": 0.6955595329412767, + "grad_norm": 34.15780132174235, + "learning_rate": 9.479936621012067e-06, + "loss": 3.1924, + "step": 8161 + }, + { + "epoch": 0.6956447626353022, + "grad_norm": 66.995122265016, + "learning_rate": 9.479716403065916e-06, + "loss": 4.2115, + "step": 8162 + }, + { + "epoch": 0.6957299923293275, + "grad_norm": 42.0414281627406, + "learning_rate": 9.479496141063534e-06, + "loss": 3.63, + "step": 8163 + }, + { + "epoch": 0.6958152220233529, + "grad_norm": 73.8227389508093, + "learning_rate": 9.47927583500709e-06, + "loss": 2.5121, + "step": 8164 + }, + { + "epoch": 0.6959004517173784, + "grad_norm": 21.529646249600912, + "learning_rate": 9.479055484898745e-06, + "loss": 2.1279, + "step": 8165 + }, + { + "epoch": 0.6959856814114037, + "grad_norm": 72.89195270003239, + "learning_rate": 9.478835090740668e-06, + "loss": 4.1784, + "step": 8166 + }, + { + "epoch": 0.6960709111054292, + "grad_norm": 45.45503609273904, + "learning_rate": 9.47861465253503e-06, + "loss": 3.1823, + "step": 8167 + }, + { + "epoch": 0.6961561407994545, + "grad_norm": 59.57648806949558, + "learning_rate": 9.478394170283996e-06, + "loss": 4.2555, + "step": 8168 + }, + { + "epoch": 0.6962413704934799, + "grad_norm": 35.74480457039426, + "learning_rate": 9.478173643989735e-06, + "loss": 3.8714, + "step": 8169 + }, + { + "epoch": 0.6963266001875054, + "grad_norm": 26.788345310451902, + "learning_rate": 9.477953073654415e-06, + "loss": 2.5133, + "step": 8170 + }, + { + "epoch": 0.6964118298815307, + "grad_norm": 47.30823930080028, + "learning_rate": 9.477732459280207e-06, + "loss": 3.3816, + "step": 8171 + }, + { + "epoch": 0.6964970595755561, + "grad_norm": 42.35998052791184, + "learning_rate": 9.47751180086928e-06, + "loss": 4.1744, + "step": 8172 + }, + { + "epoch": 0.6965822892695815, + "grad_norm": 44.27445272313343, + "learning_rate": 9.477291098423802e-06, + "loss": 3.423, + "step": 8173 + }, + { + "epoch": 0.6966675189636069, + "grad_norm": 63.761966594803255, + "learning_rate": 9.477070351945944e-06, + "loss": 3.4869, + "step": 8174 + }, + { + "epoch": 0.6967527486576324, + "grad_norm": 46.140614549774064, + "learning_rate": 9.476849561437882e-06, + "loss": 2.7926, + "step": 8175 + }, + { + "epoch": 0.6968379783516577, + "grad_norm": 46.652345264419836, + "learning_rate": 9.476628726901781e-06, + "loss": 4.2386, + "step": 8176 + }, + { + "epoch": 0.6969232080456831, + "grad_norm": 69.58712182476715, + "learning_rate": 9.476407848339816e-06, + "loss": 3.5608, + "step": 8177 + }, + { + "epoch": 0.6970084377397086, + "grad_norm": 77.9302802731494, + "learning_rate": 9.476186925754159e-06, + "loss": 5.8129, + "step": 8178 + }, + { + "epoch": 0.6970936674337339, + "grad_norm": 49.32373352937912, + "learning_rate": 9.47596595914698e-06, + "loss": 3.0348, + "step": 8179 + }, + { + "epoch": 0.6971788971277593, + "grad_norm": 37.70863105436921, + "learning_rate": 9.475744948520456e-06, + "loss": 4.0736, + "step": 8180 + }, + { + "epoch": 0.6972641268217847, + "grad_norm": 41.05125374426806, + "learning_rate": 9.47552389387676e-06, + "loss": 4.1295, + "step": 8181 + }, + { + "epoch": 0.6973493565158101, + "grad_norm": 36.120170951184015, + "learning_rate": 9.475302795218063e-06, + "loss": 3.7152, + "step": 8182 + }, + { + "epoch": 0.6974345862098356, + "grad_norm": 68.11293108559009, + "learning_rate": 9.475081652546542e-06, + "loss": 3.6359, + "step": 8183 + }, + { + "epoch": 0.6975198159038609, + "grad_norm": 29.00992490096277, + "learning_rate": 9.47486046586437e-06, + "loss": 3.0589, + "step": 8184 + }, + { + "epoch": 0.6976050455978863, + "grad_norm": 43.89733644214256, + "learning_rate": 9.474639235173726e-06, + "loss": 4.0164, + "step": 8185 + }, + { + "epoch": 0.6976902752919117, + "grad_norm": 58.332678329128264, + "learning_rate": 9.47441796047678e-06, + "loss": 4.4635, + "step": 8186 + }, + { + "epoch": 0.6977755049859371, + "grad_norm": 66.6995183110696, + "learning_rate": 9.474196641775714e-06, + "loss": 4.4658, + "step": 8187 + }, + { + "epoch": 0.6978607346799625, + "grad_norm": 37.52235703113336, + "learning_rate": 9.473975279072702e-06, + "loss": 3.9185, + "step": 8188 + }, + { + "epoch": 0.6979459643739879, + "grad_norm": 68.6187734791151, + "learning_rate": 9.473753872369919e-06, + "loss": 5.4145, + "step": 8189 + }, + { + "epoch": 0.6980311940680133, + "grad_norm": 46.992927346897496, + "learning_rate": 9.473532421669545e-06, + "loss": 3.3395, + "step": 8190 + }, + { + "epoch": 0.6981164237620386, + "grad_norm": 47.17114908525513, + "learning_rate": 9.473310926973757e-06, + "loss": 4.0661, + "step": 8191 + }, + { + "epoch": 0.6982016534560641, + "grad_norm": 32.773286170597096, + "learning_rate": 9.473089388284735e-06, + "loss": 3.5423, + "step": 8192 + }, + { + "epoch": 0.6982868831500895, + "grad_norm": 33.973574382825014, + "learning_rate": 9.472867805604656e-06, + "loss": 2.9226, + "step": 8193 + }, + { + "epoch": 0.6983721128441149, + "grad_norm": 54.200300998549196, + "learning_rate": 9.472646178935699e-06, + "loss": 5.3642, + "step": 8194 + }, + { + "epoch": 0.6984573425381403, + "grad_norm": 43.920619716244445, + "learning_rate": 9.472424508280043e-06, + "loss": 3.9594, + "step": 8195 + }, + { + "epoch": 0.6985425722321656, + "grad_norm": 42.452841602651866, + "learning_rate": 9.47220279363987e-06, + "loss": 3.9602, + "step": 8196 + }, + { + "epoch": 0.6986278019261911, + "grad_norm": 55.86304947878374, + "learning_rate": 9.471981035017358e-06, + "loss": 4.7928, + "step": 8197 + }, + { + "epoch": 0.6987130316202165, + "grad_norm": 41.57223605371405, + "learning_rate": 9.471759232414693e-06, + "loss": 4.273, + "step": 8198 + }, + { + "epoch": 0.6987982613142418, + "grad_norm": 53.78373182722978, + "learning_rate": 9.471537385834052e-06, + "loss": 3.2944, + "step": 8199 + }, + { + "epoch": 0.6988834910082673, + "grad_norm": 59.094756518904006, + "learning_rate": 9.471315495277616e-06, + "loss": 3.715, + "step": 8200 + }, + { + "epoch": 0.6989687207022927, + "grad_norm": 65.06728121684272, + "learning_rate": 9.471093560747569e-06, + "loss": 4.6106, + "step": 8201 + }, + { + "epoch": 0.6990539503963181, + "grad_norm": 53.903625118941584, + "learning_rate": 9.470871582246094e-06, + "loss": 4.2911, + "step": 8202 + }, + { + "epoch": 0.6991391800903435, + "grad_norm": 37.186109860014895, + "learning_rate": 9.470649559775373e-06, + "loss": 4.0136, + "step": 8203 + }, + { + "epoch": 0.6992244097843688, + "grad_norm": 36.050384032129074, + "learning_rate": 9.470427493337591e-06, + "loss": 1.5621, + "step": 8204 + }, + { + "epoch": 0.6993096394783943, + "grad_norm": 176.39235941013206, + "learning_rate": 9.47020538293493e-06, + "loss": 4.106, + "step": 8205 + }, + { + "epoch": 0.6993948691724197, + "grad_norm": 59.18305360981478, + "learning_rate": 9.469983228569577e-06, + "loss": 3.7762, + "step": 8206 + }, + { + "epoch": 0.699480098866445, + "grad_norm": 36.94129424717682, + "learning_rate": 9.469761030243713e-06, + "loss": 3.5986, + "step": 8207 + }, + { + "epoch": 0.6995653285604705, + "grad_norm": 68.74003828378072, + "learning_rate": 9.469538787959526e-06, + "loss": 4.4524, + "step": 8208 + }, + { + "epoch": 0.6996505582544958, + "grad_norm": 36.22846809733315, + "learning_rate": 9.469316501719202e-06, + "loss": 3.2601, + "step": 8209 + }, + { + "epoch": 0.6997357879485213, + "grad_norm": 35.11019596278838, + "learning_rate": 9.469094171524926e-06, + "loss": 3.1948, + "step": 8210 + }, + { + "epoch": 0.6998210176425467, + "grad_norm": 64.88745097540426, + "learning_rate": 9.468871797378884e-06, + "loss": 4.8953, + "step": 8211 + }, + { + "epoch": 0.699906247336572, + "grad_norm": 100.60684594926252, + "learning_rate": 9.468649379283263e-06, + "loss": 5.0344, + "step": 8212 + }, + { + "epoch": 0.6999914770305975, + "grad_norm": 45.32931857263806, + "learning_rate": 9.468426917240252e-06, + "loss": 3.8128, + "step": 8213 + }, + { + "epoch": 0.7000767067246229, + "grad_norm": 55.593422887127694, + "learning_rate": 9.468204411252037e-06, + "loss": 4.8258, + "step": 8214 + }, + { + "epoch": 0.7001619364186482, + "grad_norm": 44.26215797362197, + "learning_rate": 9.46798186132081e-06, + "loss": 4.0282, + "step": 8215 + }, + { + "epoch": 0.7002471661126737, + "grad_norm": 80.51773492687006, + "learning_rate": 9.467759267448755e-06, + "loss": 4.5939, + "step": 8216 + }, + { + "epoch": 0.700332395806699, + "grad_norm": 89.78485237941317, + "learning_rate": 9.46753662963806e-06, + "loss": 5.2001, + "step": 8217 + }, + { + "epoch": 0.7004176255007245, + "grad_norm": 36.84149625068405, + "learning_rate": 9.467313947890921e-06, + "loss": 3.6112, + "step": 8218 + }, + { + "epoch": 0.7005028551947499, + "grad_norm": 57.2138842895918, + "learning_rate": 9.467091222209524e-06, + "loss": 4.2399, + "step": 8219 + }, + { + "epoch": 0.7005880848887752, + "grad_norm": 40.09772984500303, + "learning_rate": 9.466868452596058e-06, + "loss": 3.1827, + "step": 8220 + }, + { + "epoch": 0.7006733145828007, + "grad_norm": 155.22791934511562, + "learning_rate": 9.466645639052718e-06, + "loss": 4.2328, + "step": 8221 + }, + { + "epoch": 0.700758544276826, + "grad_norm": 28.345854214356812, + "learning_rate": 9.466422781581691e-06, + "loss": 2.5289, + "step": 8222 + }, + { + "epoch": 0.7008437739708514, + "grad_norm": 28.48423691065477, + "learning_rate": 9.466199880185173e-06, + "loss": 3.3496, + "step": 8223 + }, + { + "epoch": 0.7009290036648769, + "grad_norm": 70.06683820155054, + "learning_rate": 9.465976934865353e-06, + "loss": 3.6929, + "step": 8224 + }, + { + "epoch": 0.7010142333589022, + "grad_norm": 80.88876958482022, + "learning_rate": 9.465753945624423e-06, + "loss": 4.698, + "step": 8225 + }, + { + "epoch": 0.7010994630529276, + "grad_norm": 71.54473766972735, + "learning_rate": 9.46553091246458e-06, + "loss": 5.1165, + "step": 8226 + }, + { + "epoch": 0.701184692746953, + "grad_norm": 44.331499829423244, + "learning_rate": 9.465307835388013e-06, + "loss": 4.3616, + "step": 8227 + }, + { + "epoch": 0.7012699224409784, + "grad_norm": 40.109320141209096, + "learning_rate": 9.465084714396918e-06, + "loss": 2.9391, + "step": 8228 + }, + { + "epoch": 0.7013551521350039, + "grad_norm": 65.5481609064851, + "learning_rate": 9.46486154949349e-06, + "loss": 4.5015, + "step": 8229 + }, + { + "epoch": 0.7014403818290292, + "grad_norm": 28.574804784716555, + "learning_rate": 9.46463834067992e-06, + "loss": 3.3482, + "step": 8230 + }, + { + "epoch": 0.7015256115230546, + "grad_norm": 35.5191294763933, + "learning_rate": 9.46441508795841e-06, + "loss": 4.1459, + "step": 8231 + }, + { + "epoch": 0.7016108412170801, + "grad_norm": 49.731655640609205, + "learning_rate": 9.46419179133115e-06, + "loss": 3.6844, + "step": 8232 + }, + { + "epoch": 0.7016960709111054, + "grad_norm": 44.449791017480926, + "learning_rate": 9.463968450800335e-06, + "loss": 2.0671, + "step": 8233 + }, + { + "epoch": 0.7017813006051308, + "grad_norm": 33.76102865221321, + "learning_rate": 9.463745066368169e-06, + "loss": 3.3091, + "step": 8234 + }, + { + "epoch": 0.7018665302991562, + "grad_norm": 52.160531989961704, + "learning_rate": 9.463521638036842e-06, + "loss": 4.2612, + "step": 8235 + }, + { + "epoch": 0.7019517599931816, + "grad_norm": 42.97408905665028, + "learning_rate": 9.463298165808552e-06, + "loss": 3.3869, + "step": 8236 + }, + { + "epoch": 0.7020369896872071, + "grad_norm": 66.17351686141534, + "learning_rate": 9.463074649685499e-06, + "loss": 4.485, + "step": 8237 + }, + { + "epoch": 0.7021222193812324, + "grad_norm": 35.961195807721936, + "learning_rate": 9.46285108966988e-06, + "loss": 2.9676, + "step": 8238 + }, + { + "epoch": 0.7022074490752578, + "grad_norm": 45.59873178700352, + "learning_rate": 9.462627485763894e-06, + "loss": 2.1735, + "step": 8239 + }, + { + "epoch": 0.7022926787692833, + "grad_norm": 58.698366535863414, + "learning_rate": 9.46240383796974e-06, + "loss": 4.3138, + "step": 8240 + }, + { + "epoch": 0.7023779084633086, + "grad_norm": 99.72583031634979, + "learning_rate": 9.462180146289616e-06, + "loss": 3.6776, + "step": 8241 + }, + { + "epoch": 0.702463138157334, + "grad_norm": 51.29021213033599, + "learning_rate": 9.461956410725725e-06, + "loss": 4.2559, + "step": 8242 + }, + { + "epoch": 0.7025483678513594, + "grad_norm": 26.24324935525881, + "learning_rate": 9.461732631280268e-06, + "loss": 2.8652, + "step": 8243 + }, + { + "epoch": 0.7026335975453848, + "grad_norm": 53.72987310926999, + "learning_rate": 9.461508807955439e-06, + "loss": 2.9858, + "step": 8244 + }, + { + "epoch": 0.7027188272394103, + "grad_norm": 77.1959952104857, + "learning_rate": 9.461284940753446e-06, + "loss": 5.437, + "step": 8245 + }, + { + "epoch": 0.7028040569334356, + "grad_norm": 30.934212797646268, + "learning_rate": 9.461061029676489e-06, + "loss": 3.8786, + "step": 8246 + }, + { + "epoch": 0.702889286627461, + "grad_norm": 57.938706036284195, + "learning_rate": 9.460837074726767e-06, + "loss": 3.9223, + "step": 8247 + }, + { + "epoch": 0.7029745163214864, + "grad_norm": 55.511864128290476, + "learning_rate": 9.460613075906486e-06, + "loss": 4.1224, + "step": 8248 + }, + { + "epoch": 0.7030597460155118, + "grad_norm": 66.53812475718331, + "learning_rate": 9.460389033217849e-06, + "loss": 3.9833, + "step": 8249 + }, + { + "epoch": 0.7031449757095372, + "grad_norm": 52.80314305327341, + "learning_rate": 9.460164946663058e-06, + "loss": 4.0081, + "step": 8250 + }, + { + "epoch": 0.7032302054035626, + "grad_norm": 68.41181758893897, + "learning_rate": 9.459940816244316e-06, + "loss": 4.5321, + "step": 8251 + }, + { + "epoch": 0.703315435097588, + "grad_norm": 113.09025943422999, + "learning_rate": 9.45971664196383e-06, + "loss": 5.6666, + "step": 8252 + }, + { + "epoch": 0.7034006647916135, + "grad_norm": 31.324999748201414, + "learning_rate": 9.4594924238238e-06, + "loss": 2.8607, + "step": 8253 + }, + { + "epoch": 0.7034858944856388, + "grad_norm": 49.42110620902935, + "learning_rate": 9.459268161826436e-06, + "loss": 3.9798, + "step": 8254 + }, + { + "epoch": 0.7035711241796642, + "grad_norm": 79.33706659446018, + "learning_rate": 9.459043855973942e-06, + "loss": 3.5979, + "step": 8255 + }, + { + "epoch": 0.7036563538736896, + "grad_norm": 49.54466337422926, + "learning_rate": 9.458819506268523e-06, + "loss": 3.2373, + "step": 8256 + }, + { + "epoch": 0.703741583567715, + "grad_norm": 50.648763207885594, + "learning_rate": 9.458595112712385e-06, + "loss": 4.4015, + "step": 8257 + }, + { + "epoch": 0.7038268132617403, + "grad_norm": 66.50549878825159, + "learning_rate": 9.458370675307738e-06, + "loss": 3.2329, + "step": 8258 + }, + { + "epoch": 0.7039120429557658, + "grad_norm": 32.370491031265345, + "learning_rate": 9.458146194056786e-06, + "loss": 3.6156, + "step": 8259 + }, + { + "epoch": 0.7039972726497912, + "grad_norm": 40.32339748543539, + "learning_rate": 9.457921668961736e-06, + "loss": 3.7631, + "step": 8260 + }, + { + "epoch": 0.7040825023438166, + "grad_norm": 71.45597272190243, + "learning_rate": 9.457697100024799e-06, + "loss": 4.7107, + "step": 8261 + }, + { + "epoch": 0.704167732037842, + "grad_norm": 32.35962076514482, + "learning_rate": 9.457472487248182e-06, + "loss": 3.0316, + "step": 8262 + }, + { + "epoch": 0.7042529617318674, + "grad_norm": 160.96081823759914, + "learning_rate": 9.457247830634095e-06, + "loss": 4.7896, + "step": 8263 + }, + { + "epoch": 0.7043381914258928, + "grad_norm": 78.10901510915623, + "learning_rate": 9.457023130184746e-06, + "loss": 4.2461, + "step": 8264 + }, + { + "epoch": 0.7044234211199182, + "grad_norm": 62.13379815582674, + "learning_rate": 9.456798385902346e-06, + "loss": 4.4357, + "step": 8265 + }, + { + "epoch": 0.7045086508139435, + "grad_norm": 39.24747440342643, + "learning_rate": 9.456573597789104e-06, + "loss": 4.2501, + "step": 8266 + }, + { + "epoch": 0.704593880507969, + "grad_norm": 60.58778677571839, + "learning_rate": 9.456348765847234e-06, + "loss": 4.4505, + "step": 8267 + }, + { + "epoch": 0.7046791102019944, + "grad_norm": 30.385461457509365, + "learning_rate": 9.456123890078944e-06, + "loss": 2.806, + "step": 8268 + }, + { + "epoch": 0.7047643398960197, + "grad_norm": 48.12349978481205, + "learning_rate": 9.455898970486444e-06, + "loss": 4.3073, + "step": 8269 + }, + { + "epoch": 0.7048495695900452, + "grad_norm": 63.60261677430847, + "learning_rate": 9.455674007071948e-06, + "loss": 3.535, + "step": 8270 + }, + { + "epoch": 0.7049347992840705, + "grad_norm": 31.143128494465074, + "learning_rate": 9.455448999837671e-06, + "loss": 3.7241, + "step": 8271 + }, + { + "epoch": 0.705020028978096, + "grad_norm": 36.89198533519583, + "learning_rate": 9.455223948785823e-06, + "loss": 3.7107, + "step": 8272 + }, + { + "epoch": 0.7051052586721214, + "grad_norm": 83.80063132073828, + "learning_rate": 9.454998853918618e-06, + "loss": 4.2241, + "step": 8273 + }, + { + "epoch": 0.7051904883661467, + "grad_norm": 57.813955851220875, + "learning_rate": 9.454773715238267e-06, + "loss": 3.9499, + "step": 8274 + }, + { + "epoch": 0.7052757180601722, + "grad_norm": 53.42973417546646, + "learning_rate": 9.45454853274699e-06, + "loss": 4.9153, + "step": 8275 + }, + { + "epoch": 0.7053609477541976, + "grad_norm": 47.93685205849801, + "learning_rate": 9.454323306446996e-06, + "loss": 3.528, + "step": 8276 + }, + { + "epoch": 0.7054461774482229, + "grad_norm": 56.96725190065794, + "learning_rate": 9.454098036340505e-06, + "loss": 4.3976, + "step": 8277 + }, + { + "epoch": 0.7055314071422484, + "grad_norm": 52.70509438257659, + "learning_rate": 9.453872722429726e-06, + "loss": 3.8877, + "step": 8278 + }, + { + "epoch": 0.7056166368362737, + "grad_norm": 41.41016527554959, + "learning_rate": 9.45364736471688e-06, + "loss": 3.1401, + "step": 8279 + }, + { + "epoch": 0.7057018665302992, + "grad_norm": 74.67911446933034, + "learning_rate": 9.453421963204184e-06, + "loss": 3.9226, + "step": 8280 + }, + { + "epoch": 0.7057870962243246, + "grad_norm": 76.89994143101431, + "learning_rate": 9.45319651789385e-06, + "loss": 4.9684, + "step": 8281 + }, + { + "epoch": 0.7058723259183499, + "grad_norm": 52.26511711446675, + "learning_rate": 9.452971028788098e-06, + "loss": 3.0533, + "step": 8282 + }, + { + "epoch": 0.7059575556123754, + "grad_norm": 54.46457421778572, + "learning_rate": 9.452745495889145e-06, + "loss": 4.6106, + "step": 8283 + }, + { + "epoch": 0.7060427853064007, + "grad_norm": 54.55918795515165, + "learning_rate": 9.452519919199212e-06, + "loss": 4.1594, + "step": 8284 + }, + { + "epoch": 0.7061280150004261, + "grad_norm": 48.684813555743574, + "learning_rate": 9.452294298720512e-06, + "loss": 3.8018, + "step": 8285 + }, + { + "epoch": 0.7062132446944516, + "grad_norm": 113.01714877758528, + "learning_rate": 9.452068634455268e-06, + "loss": 4.1117, + "step": 8286 + }, + { + "epoch": 0.7062984743884769, + "grad_norm": 125.76744210867062, + "learning_rate": 9.451842926405699e-06, + "loss": 5.8351, + "step": 8287 + }, + { + "epoch": 0.7063837040825024, + "grad_norm": 28.02113693603441, + "learning_rate": 9.451617174574023e-06, + "loss": 1.818, + "step": 8288 + }, + { + "epoch": 0.7064689337765278, + "grad_norm": 120.64488004229094, + "learning_rate": 9.45139137896246e-06, + "loss": 4.3581, + "step": 8289 + }, + { + "epoch": 0.7065541634705531, + "grad_norm": 59.88031358749107, + "learning_rate": 9.451165539573233e-06, + "loss": 5.5212, + "step": 8290 + }, + { + "epoch": 0.7066393931645786, + "grad_norm": 57.43947156187411, + "learning_rate": 9.450939656408562e-06, + "loss": 3.7612, + "step": 8291 + }, + { + "epoch": 0.7067246228586039, + "grad_norm": 47.53333655392397, + "learning_rate": 9.450713729470666e-06, + "loss": 3.1996, + "step": 8292 + }, + { + "epoch": 0.7068098525526293, + "grad_norm": 135.8680039148981, + "learning_rate": 9.45048775876177e-06, + "loss": 5.8687, + "step": 8293 + }, + { + "epoch": 0.7068950822466548, + "grad_norm": 68.6995396561909, + "learning_rate": 9.450261744284097e-06, + "loss": 3.2416, + "step": 8294 + }, + { + "epoch": 0.7069803119406801, + "grad_norm": 68.91310418612233, + "learning_rate": 9.450035686039865e-06, + "loss": 4.3443, + "step": 8295 + }, + { + "epoch": 0.7070655416347056, + "grad_norm": 59.10294022960295, + "learning_rate": 9.449809584031303e-06, + "loss": 5.7906, + "step": 8296 + }, + { + "epoch": 0.7071507713287309, + "grad_norm": 57.662493315589046, + "learning_rate": 9.44958343826063e-06, + "loss": 4.5658, + "step": 8297 + }, + { + "epoch": 0.7072360010227563, + "grad_norm": 48.86056958431051, + "learning_rate": 9.449357248730073e-06, + "loss": 4.7471, + "step": 8298 + }, + { + "epoch": 0.7073212307167818, + "grad_norm": 81.71763505980178, + "learning_rate": 9.449131015441855e-06, + "loss": 5.3243, + "step": 8299 + }, + { + "epoch": 0.7074064604108071, + "grad_norm": 35.125061036109216, + "learning_rate": 9.448904738398202e-06, + "loss": 3.2264, + "step": 8300 + }, + { + "epoch": 0.7074916901048325, + "grad_norm": 78.62982404681215, + "learning_rate": 9.448678417601337e-06, + "loss": 5.0169, + "step": 8301 + }, + { + "epoch": 0.707576919798858, + "grad_norm": 98.90219578530974, + "learning_rate": 9.44845205305349e-06, + "loss": 4.9828, + "step": 8302 + }, + { + "epoch": 0.7076621494928833, + "grad_norm": 34.60529574750214, + "learning_rate": 9.448225644756884e-06, + "loss": 4.0308, + "step": 8303 + }, + { + "epoch": 0.7077473791869087, + "grad_norm": 55.897016781957724, + "learning_rate": 9.447999192713745e-06, + "loss": 4.2293, + "step": 8304 + }, + { + "epoch": 0.7078326088809341, + "grad_norm": 31.48252147306407, + "learning_rate": 9.447772696926303e-06, + "loss": 3.9403, + "step": 8305 + }, + { + "epoch": 0.7079178385749595, + "grad_norm": 63.52611569512536, + "learning_rate": 9.447546157396783e-06, + "loss": 4.0572, + "step": 8306 + }, + { + "epoch": 0.708003068268985, + "grad_norm": 56.270452606949924, + "learning_rate": 9.447319574127414e-06, + "loss": 4.3549, + "step": 8307 + }, + { + "epoch": 0.7080882979630103, + "grad_norm": 57.20433217059208, + "learning_rate": 9.447092947120425e-06, + "loss": 2.8514, + "step": 8308 + }, + { + "epoch": 0.7081735276570357, + "grad_norm": 108.59010244293061, + "learning_rate": 9.446866276378043e-06, + "loss": 4.6232, + "step": 8309 + }, + { + "epoch": 0.7082587573510611, + "grad_norm": 153.50945799819434, + "learning_rate": 9.446639561902498e-06, + "loss": 6.2595, + "step": 8310 + }, + { + "epoch": 0.7083439870450865, + "grad_norm": 61.80936133913222, + "learning_rate": 9.44641280369602e-06, + "loss": 4.0038, + "step": 8311 + }, + { + "epoch": 0.7084292167391119, + "grad_norm": 89.64370666827153, + "learning_rate": 9.44618600176084e-06, + "loss": 5.1299, + "step": 8312 + }, + { + "epoch": 0.7085144464331373, + "grad_norm": 93.32150760156833, + "learning_rate": 9.445959156099185e-06, + "loss": 3.9487, + "step": 8313 + }, + { + "epoch": 0.7085996761271627, + "grad_norm": 36.90193837488493, + "learning_rate": 9.44573226671329e-06, + "loss": 3.4025, + "step": 8314 + }, + { + "epoch": 0.7086849058211881, + "grad_norm": 44.042881569268864, + "learning_rate": 9.445505333605385e-06, + "loss": 3.8294, + "step": 8315 + }, + { + "epoch": 0.7087701355152135, + "grad_norm": 46.37166393030098, + "learning_rate": 9.445278356777701e-06, + "loss": 3.711, + "step": 8316 + }, + { + "epoch": 0.7088553652092389, + "grad_norm": 70.87678024831364, + "learning_rate": 9.44505133623247e-06, + "loss": 4.4946, + "step": 8317 + }, + { + "epoch": 0.7089405949032643, + "grad_norm": 154.81670439906392, + "learning_rate": 9.444824271971926e-06, + "loss": 4.5553, + "step": 8318 + }, + { + "epoch": 0.7090258245972897, + "grad_norm": 76.15228990330215, + "learning_rate": 9.444597163998302e-06, + "loss": 5.1845, + "step": 8319 + }, + { + "epoch": 0.709111054291315, + "grad_norm": 111.19177754727687, + "learning_rate": 9.444370012313831e-06, + "loss": 4.8172, + "step": 8320 + }, + { + "epoch": 0.7091962839853405, + "grad_norm": 85.45036275140936, + "learning_rate": 9.444142816920748e-06, + "loss": 4.4235, + "step": 8321 + }, + { + "epoch": 0.7092815136793659, + "grad_norm": 36.44498231845549, + "learning_rate": 9.443915577821284e-06, + "loss": 3.5112, + "step": 8322 + }, + { + "epoch": 0.7093667433733913, + "grad_norm": 81.83435584167384, + "learning_rate": 9.443688295017677e-06, + "loss": 3.8127, + "step": 8323 + }, + { + "epoch": 0.7094519730674167, + "grad_norm": 58.64640730971923, + "learning_rate": 9.443460968512162e-06, + "loss": 4.1267, + "step": 8324 + }, + { + "epoch": 0.709537202761442, + "grad_norm": 74.31827664574314, + "learning_rate": 9.443233598306974e-06, + "loss": 4.0997, + "step": 8325 + }, + { + "epoch": 0.7096224324554675, + "grad_norm": 126.73895604743925, + "learning_rate": 9.443006184404349e-06, + "loss": 5.1908, + "step": 8326 + }, + { + "epoch": 0.7097076621494929, + "grad_norm": 37.359430595426986, + "learning_rate": 9.442778726806522e-06, + "loss": 2.0331, + "step": 8327 + }, + { + "epoch": 0.7097928918435182, + "grad_norm": 59.318933052085704, + "learning_rate": 9.442551225515733e-06, + "loss": 4.0255, + "step": 8328 + }, + { + "epoch": 0.7098781215375437, + "grad_norm": 47.156656803989414, + "learning_rate": 9.442323680534217e-06, + "loss": 3.851, + "step": 8329 + }, + { + "epoch": 0.7099633512315691, + "grad_norm": 39.1190482427581, + "learning_rate": 9.442096091864214e-06, + "loss": 3.7199, + "step": 8330 + }, + { + "epoch": 0.7100485809255945, + "grad_norm": 61.35336049575636, + "learning_rate": 9.441868459507962e-06, + "loss": 4.1073, + "step": 8331 + }, + { + "epoch": 0.7101338106196199, + "grad_norm": 79.41569140365142, + "learning_rate": 9.441640783467697e-06, + "loss": 2.3669, + "step": 8332 + }, + { + "epoch": 0.7102190403136452, + "grad_norm": 69.52444363188319, + "learning_rate": 9.44141306374566e-06, + "loss": 3.5807, + "step": 8333 + }, + { + "epoch": 0.7103042700076707, + "grad_norm": 29.3808286044011, + "learning_rate": 9.441185300344091e-06, + "loss": 3.548, + "step": 8334 + }, + { + "epoch": 0.7103894997016961, + "grad_norm": 26.416157037982725, + "learning_rate": 9.440957493265228e-06, + "loss": 2.6725, + "step": 8335 + }, + { + "epoch": 0.7104747293957214, + "grad_norm": 43.59515995820499, + "learning_rate": 9.440729642511315e-06, + "loss": 4.4901, + "step": 8336 + }, + { + "epoch": 0.7105599590897469, + "grad_norm": 57.932406770944795, + "learning_rate": 9.44050174808459e-06, + "loss": 4.3022, + "step": 8337 + }, + { + "epoch": 0.7106451887837723, + "grad_norm": 54.94551002044287, + "learning_rate": 9.440273809987293e-06, + "loss": 4.4352, + "step": 8338 + }, + { + "epoch": 0.7107304184777976, + "grad_norm": 51.599638273268816, + "learning_rate": 9.44004582822167e-06, + "loss": 4.1673, + "step": 8339 + }, + { + "epoch": 0.7108156481718231, + "grad_norm": 62.972809856436506, + "learning_rate": 9.439817802789957e-06, + "loss": 4.823, + "step": 8340 + }, + { + "epoch": 0.7109008778658484, + "grad_norm": 32.79978319451905, + "learning_rate": 9.439589733694403e-06, + "loss": 3.3678, + "step": 8341 + }, + { + "epoch": 0.7109861075598739, + "grad_norm": 92.95532032162326, + "learning_rate": 9.439361620937246e-06, + "loss": 5.2038, + "step": 8342 + }, + { + "epoch": 0.7110713372538993, + "grad_norm": 47.003889582581614, + "learning_rate": 9.439133464520733e-06, + "loss": 3.4804, + "step": 8343 + }, + { + "epoch": 0.7111565669479246, + "grad_norm": 53.764813390084186, + "learning_rate": 9.438905264447107e-06, + "loss": 4.0468, + "step": 8344 + }, + { + "epoch": 0.7112417966419501, + "grad_norm": 55.56865861709256, + "learning_rate": 9.438677020718611e-06, + "loss": 4.2933, + "step": 8345 + }, + { + "epoch": 0.7113270263359754, + "grad_norm": 73.49987135454268, + "learning_rate": 9.438448733337489e-06, + "loss": 5.0936, + "step": 8346 + }, + { + "epoch": 0.7114122560300008, + "grad_norm": 32.74288445594092, + "learning_rate": 9.438220402305988e-06, + "loss": 2.7926, + "step": 8347 + }, + { + "epoch": 0.7114974857240263, + "grad_norm": 78.82162937551615, + "learning_rate": 9.437992027626356e-06, + "loss": 4.4596, + "step": 8348 + }, + { + "epoch": 0.7115827154180516, + "grad_norm": 27.333762718709647, + "learning_rate": 9.437763609300831e-06, + "loss": 2.5099, + "step": 8349 + }, + { + "epoch": 0.7116679451120771, + "grad_norm": 99.78980461020498, + "learning_rate": 9.437535147331667e-06, + "loss": 5.2455, + "step": 8350 + }, + { + "epoch": 0.7117531748061025, + "grad_norm": 85.73939851056149, + "learning_rate": 9.437306641721107e-06, + "loss": 5.8486, + "step": 8351 + }, + { + "epoch": 0.7118384045001278, + "grad_norm": 56.74840523491031, + "learning_rate": 9.4370780924714e-06, + "loss": 3.9051, + "step": 8352 + }, + { + "epoch": 0.7119236341941533, + "grad_norm": 38.145734648799575, + "learning_rate": 9.436849499584792e-06, + "loss": 3.7801, + "step": 8353 + }, + { + "epoch": 0.7120088638881786, + "grad_norm": 35.766720267211745, + "learning_rate": 9.436620863063535e-06, + "loss": 3.6232, + "step": 8354 + }, + { + "epoch": 0.712094093582204, + "grad_norm": 56.65887857447407, + "learning_rate": 9.436392182909873e-06, + "loss": 4.1343, + "step": 8355 + }, + { + "epoch": 0.7121793232762295, + "grad_norm": 27.14926481256435, + "learning_rate": 9.436163459126056e-06, + "loss": 3.6369, + "step": 8356 + }, + { + "epoch": 0.7122645529702548, + "grad_norm": 34.85833836113928, + "learning_rate": 9.435934691714335e-06, + "loss": 3.1969, + "step": 8357 + }, + { + "epoch": 0.7123497826642803, + "grad_norm": 70.2320484705981, + "learning_rate": 9.43570588067696e-06, + "loss": 4.1343, + "step": 8358 + }, + { + "epoch": 0.7124350123583056, + "grad_norm": 41.49182563202142, + "learning_rate": 9.435477026016178e-06, + "loss": 3.7177, + "step": 8359 + }, + { + "epoch": 0.712520242052331, + "grad_norm": 47.7029865817194, + "learning_rate": 9.435248127734243e-06, + "loss": 3.8257, + "step": 8360 + }, + { + "epoch": 0.7126054717463565, + "grad_norm": 60.46383968850079, + "learning_rate": 9.435019185833405e-06, + "loss": 4.4567, + "step": 8361 + }, + { + "epoch": 0.7126907014403818, + "grad_norm": 38.067232434211576, + "learning_rate": 9.434790200315915e-06, + "loss": 3.1511, + "step": 8362 + }, + { + "epoch": 0.7127759311344072, + "grad_norm": 55.787559823549906, + "learning_rate": 9.434561171184027e-06, + "loss": 5.1021, + "step": 8363 + }, + { + "epoch": 0.7128611608284326, + "grad_norm": 56.33874937766096, + "learning_rate": 9.434332098439992e-06, + "loss": 4.1976, + "step": 8364 + }, + { + "epoch": 0.712946390522458, + "grad_norm": 49.202441511868024, + "learning_rate": 9.434102982086061e-06, + "loss": 3.3968, + "step": 8365 + }, + { + "epoch": 0.7130316202164835, + "grad_norm": 46.73562730347138, + "learning_rate": 9.43387382212449e-06, + "loss": 4.1502, + "step": 8366 + }, + { + "epoch": 0.7131168499105088, + "grad_norm": 39.911232946782256, + "learning_rate": 9.433644618557532e-06, + "loss": 4.6259, + "step": 8367 + }, + { + "epoch": 0.7132020796045342, + "grad_norm": 54.39340833983436, + "learning_rate": 9.433415371387441e-06, + "loss": 4.2906, + "step": 8368 + }, + { + "epoch": 0.7132873092985597, + "grad_norm": 42.446114640541744, + "learning_rate": 9.433186080616472e-06, + "loss": 3.5467, + "step": 8369 + }, + { + "epoch": 0.713372538992585, + "grad_norm": 76.56447929577921, + "learning_rate": 9.43295674624688e-06, + "loss": 4.5579, + "step": 8370 + }, + { + "epoch": 0.7134577686866104, + "grad_norm": 80.51674781702127, + "learning_rate": 9.432727368280917e-06, + "loss": 6.0655, + "step": 8371 + }, + { + "epoch": 0.7135429983806358, + "grad_norm": 108.27650026301704, + "learning_rate": 9.432497946720844e-06, + "loss": 4.4475, + "step": 8372 + }, + { + "epoch": 0.7136282280746612, + "grad_norm": 32.87859617736367, + "learning_rate": 9.432268481568914e-06, + "loss": 2.736, + "step": 8373 + }, + { + "epoch": 0.7137134577686867, + "grad_norm": 52.1296195055107, + "learning_rate": 9.432038972827387e-06, + "loss": 4.1502, + "step": 8374 + }, + { + "epoch": 0.713798687462712, + "grad_norm": 45.08541986362914, + "learning_rate": 9.431809420498515e-06, + "loss": 3.3068, + "step": 8375 + }, + { + "epoch": 0.7138839171567374, + "grad_norm": 50.75340974299308, + "learning_rate": 9.431579824584559e-06, + "loss": 3.5582, + "step": 8376 + }, + { + "epoch": 0.7139691468507628, + "grad_norm": 60.05284244157667, + "learning_rate": 9.431350185087777e-06, + "loss": 4.7324, + "step": 8377 + }, + { + "epoch": 0.7140543765447882, + "grad_norm": 64.69530492250405, + "learning_rate": 9.431120502010427e-06, + "loss": 5.4587, + "step": 8378 + }, + { + "epoch": 0.7141396062388136, + "grad_norm": 125.55832581164813, + "learning_rate": 9.430890775354767e-06, + "loss": 2.7265, + "step": 8379 + }, + { + "epoch": 0.714224835932839, + "grad_norm": 71.87264950584516, + "learning_rate": 9.430661005123056e-06, + "loss": 4.2251, + "step": 8380 + }, + { + "epoch": 0.7143100656268644, + "grad_norm": 39.23009114804798, + "learning_rate": 9.430431191317557e-06, + "loss": 4.1983, + "step": 8381 + }, + { + "epoch": 0.7143952953208897, + "grad_norm": 38.472056393408295, + "learning_rate": 9.430201333940525e-06, + "loss": 3.6542, + "step": 8382 + }, + { + "epoch": 0.7144805250149152, + "grad_norm": 33.54707016371564, + "learning_rate": 9.429971432994225e-06, + "loss": 3.3659, + "step": 8383 + }, + { + "epoch": 0.7145657547089406, + "grad_norm": 82.71520155267903, + "learning_rate": 9.429741488480917e-06, + "loss": 4.5705, + "step": 8384 + }, + { + "epoch": 0.714650984402966, + "grad_norm": 78.06310520782733, + "learning_rate": 9.429511500402859e-06, + "loss": 3.8791, + "step": 8385 + }, + { + "epoch": 0.7147362140969914, + "grad_norm": 35.1788543304777, + "learning_rate": 9.429281468762318e-06, + "loss": 3.1079, + "step": 8386 + }, + { + "epoch": 0.7148214437910168, + "grad_norm": 65.10100890803322, + "learning_rate": 9.429051393561552e-06, + "loss": 3.3088, + "step": 8387 + }, + { + "epoch": 0.7149066734850422, + "grad_norm": 54.927577275013924, + "learning_rate": 9.428821274802826e-06, + "loss": 3.2795, + "step": 8388 + }, + { + "epoch": 0.7149919031790676, + "grad_norm": 52.85395560759211, + "learning_rate": 9.428591112488403e-06, + "loss": 3.1003, + "step": 8389 + }, + { + "epoch": 0.7150771328730929, + "grad_norm": 100.26197988594414, + "learning_rate": 9.428360906620546e-06, + "loss": 4.8364, + "step": 8390 + }, + { + "epoch": 0.7151623625671184, + "grad_norm": 64.94161681345756, + "learning_rate": 9.42813065720152e-06, + "loss": 4.7715, + "step": 8391 + }, + { + "epoch": 0.7152475922611438, + "grad_norm": 48.754589790545154, + "learning_rate": 9.427900364233588e-06, + "loss": 5.0333, + "step": 8392 + }, + { + "epoch": 0.7153328219551692, + "grad_norm": 64.46907311073035, + "learning_rate": 9.427670027719015e-06, + "loss": 2.8959, + "step": 8393 + }, + { + "epoch": 0.7154180516491946, + "grad_norm": 44.65137747263058, + "learning_rate": 9.427439647660066e-06, + "loss": 4.2, + "step": 8394 + }, + { + "epoch": 0.7155032813432199, + "grad_norm": 60.44794070519572, + "learning_rate": 9.427209224059008e-06, + "loss": 5.3104, + "step": 8395 + }, + { + "epoch": 0.7155885110372454, + "grad_norm": 41.02567623366854, + "learning_rate": 9.426978756918108e-06, + "loss": 4.017, + "step": 8396 + }, + { + "epoch": 0.7156737407312708, + "grad_norm": 67.64775907767454, + "learning_rate": 9.42674824623963e-06, + "loss": 4.8543, + "step": 8397 + }, + { + "epoch": 0.7157589704252961, + "grad_norm": 79.47425402981155, + "learning_rate": 9.42651769202584e-06, + "loss": 4.8227, + "step": 8398 + }, + { + "epoch": 0.7158442001193216, + "grad_norm": 32.826000524246126, + "learning_rate": 9.42628709427901e-06, + "loss": 3.8596, + "step": 8399 + }, + { + "epoch": 0.715929429813347, + "grad_norm": 33.439934211392256, + "learning_rate": 9.426056453001405e-06, + "loss": 3.8883, + "step": 8400 + }, + { + "epoch": 0.7160146595073724, + "grad_norm": 76.08996245032482, + "learning_rate": 9.425825768195292e-06, + "loss": 4.2198, + "step": 8401 + }, + { + "epoch": 0.7160998892013978, + "grad_norm": 50.37126655479239, + "learning_rate": 9.425595039862942e-06, + "loss": 4.1565, + "step": 8402 + }, + { + "epoch": 0.7161851188954231, + "grad_norm": 63.56081777064459, + "learning_rate": 9.425364268006625e-06, + "loss": 4.168, + "step": 8403 + }, + { + "epoch": 0.7162703485894486, + "grad_norm": 32.08952278844993, + "learning_rate": 9.425133452628607e-06, + "loss": 4.2916, + "step": 8404 + }, + { + "epoch": 0.716355578283474, + "grad_norm": 61.856874691651406, + "learning_rate": 9.42490259373116e-06, + "loss": 4.3422, + "step": 8405 + }, + { + "epoch": 0.7164408079774993, + "grad_norm": 35.889644174396935, + "learning_rate": 9.424671691316554e-06, + "loss": 3.3106, + "step": 8406 + }, + { + "epoch": 0.7165260376715248, + "grad_norm": 29.055129564773107, + "learning_rate": 9.42444074538706e-06, + "loss": 2.9327, + "step": 8407 + }, + { + "epoch": 0.7166112673655501, + "grad_norm": 119.42867368146045, + "learning_rate": 9.42420975594495e-06, + "loss": 4.1254, + "step": 8408 + }, + { + "epoch": 0.7166964970595756, + "grad_norm": 78.12327493034776, + "learning_rate": 9.423978722992497e-06, + "loss": 4.983, + "step": 8409 + }, + { + "epoch": 0.716781726753601, + "grad_norm": 24.254212025730418, + "learning_rate": 9.42374764653197e-06, + "loss": 2.8877, + "step": 8410 + }, + { + "epoch": 0.7168669564476263, + "grad_norm": 32.439832277956135, + "learning_rate": 9.42351652656564e-06, + "loss": 3.9206, + "step": 8411 + }, + { + "epoch": 0.7169521861416518, + "grad_norm": 35.11096345641912, + "learning_rate": 9.423285363095785e-06, + "loss": 3.7683, + "step": 8412 + }, + { + "epoch": 0.7170374158356771, + "grad_norm": 66.31227276311294, + "learning_rate": 9.423054156124676e-06, + "loss": 3.3026, + "step": 8413 + }, + { + "epoch": 0.7171226455297025, + "grad_norm": 118.51270854685106, + "learning_rate": 9.422822905654586e-06, + "loss": 4.7196, + "step": 8414 + }, + { + "epoch": 0.717207875223728, + "grad_norm": 39.692441721323725, + "learning_rate": 9.422591611687791e-06, + "loss": 2.9771, + "step": 8415 + }, + { + "epoch": 0.7172931049177533, + "grad_norm": 65.90399108208139, + "learning_rate": 9.422360274226565e-06, + "loss": 5.364, + "step": 8416 + }, + { + "epoch": 0.7173783346117787, + "grad_norm": 37.61885309148106, + "learning_rate": 9.422128893273183e-06, + "loss": 3.1, + "step": 8417 + }, + { + "epoch": 0.7174635643058042, + "grad_norm": 50.96909451104292, + "learning_rate": 9.42189746882992e-06, + "loss": 4.7055, + "step": 8418 + }, + { + "epoch": 0.7175487939998295, + "grad_norm": 36.33841339992384, + "learning_rate": 9.421666000899053e-06, + "loss": 4.1989, + "step": 8419 + }, + { + "epoch": 0.717634023693855, + "grad_norm": 49.34142677653665, + "learning_rate": 9.421434489482856e-06, + "loss": 3.3229, + "step": 8420 + }, + { + "epoch": 0.7177192533878803, + "grad_norm": 30.506607529791854, + "learning_rate": 9.42120293458361e-06, + "loss": 3.4912, + "step": 8421 + }, + { + "epoch": 0.7178044830819057, + "grad_norm": 42.004759075284184, + "learning_rate": 9.42097133620359e-06, + "loss": 4.0807, + "step": 8422 + }, + { + "epoch": 0.7178897127759312, + "grad_norm": 48.36305445465518, + "learning_rate": 9.420739694345073e-06, + "loss": 4.1678, + "step": 8423 + }, + { + "epoch": 0.7179749424699565, + "grad_norm": 42.44173094961864, + "learning_rate": 9.420508009010336e-06, + "loss": 4.2353, + "step": 8424 + }, + { + "epoch": 0.7180601721639819, + "grad_norm": 52.45840512520905, + "learning_rate": 9.420276280201661e-06, + "loss": 4.3657, + "step": 8425 + }, + { + "epoch": 0.7181454018580073, + "grad_norm": 67.51994751501236, + "learning_rate": 9.420044507921326e-06, + "loss": 3.9975, + "step": 8426 + }, + { + "epoch": 0.7182306315520327, + "grad_norm": 37.19305585636605, + "learning_rate": 9.419812692171608e-06, + "loss": 3.6013, + "step": 8427 + }, + { + "epoch": 0.7183158612460582, + "grad_norm": 47.321827537472544, + "learning_rate": 9.41958083295479e-06, + "loss": 4.8527, + "step": 8428 + }, + { + "epoch": 0.7184010909400835, + "grad_norm": 69.741018710217, + "learning_rate": 9.41934893027315e-06, + "loss": 4.4286, + "step": 8429 + }, + { + "epoch": 0.7184863206341089, + "grad_norm": 55.28198573746217, + "learning_rate": 9.419116984128968e-06, + "loss": 4.5657, + "step": 8430 + }, + { + "epoch": 0.7185715503281344, + "grad_norm": 39.09109314396146, + "learning_rate": 9.418884994524528e-06, + "loss": 3.474, + "step": 8431 + }, + { + "epoch": 0.7186567800221597, + "grad_norm": 103.30029233437938, + "learning_rate": 9.41865296146211e-06, + "loss": 4.4245, + "step": 8432 + }, + { + "epoch": 0.7187420097161851, + "grad_norm": 33.645795022662696, + "learning_rate": 9.418420884943995e-06, + "loss": 3.4516, + "step": 8433 + }, + { + "epoch": 0.7188272394102105, + "grad_norm": 34.57477760890329, + "learning_rate": 9.418188764972466e-06, + "loss": 3.2412, + "step": 8434 + }, + { + "epoch": 0.7189124691042359, + "grad_norm": 81.41428477375142, + "learning_rate": 9.417956601549807e-06, + "loss": 4.942, + "step": 8435 + }, + { + "epoch": 0.7189976987982614, + "grad_norm": 39.25343101103503, + "learning_rate": 9.417724394678302e-06, + "loss": 4.0198, + "step": 8436 + }, + { + "epoch": 0.7190829284922867, + "grad_norm": 51.36244733305959, + "learning_rate": 9.41749214436023e-06, + "loss": 4.2033, + "step": 8437 + }, + { + "epoch": 0.7191681581863121, + "grad_norm": 51.814732269121144, + "learning_rate": 9.41725985059788e-06, + "loss": 3.4786, + "step": 8438 + }, + { + "epoch": 0.7192533878803375, + "grad_norm": 54.96032032473823, + "learning_rate": 9.417027513393534e-06, + "loss": 3.101, + "step": 8439 + }, + { + "epoch": 0.7193386175743629, + "grad_norm": 34.16855032287686, + "learning_rate": 9.416795132749478e-06, + "loss": 3.4183, + "step": 8440 + }, + { + "epoch": 0.7194238472683883, + "grad_norm": 47.66516215856015, + "learning_rate": 9.416562708667996e-06, + "loss": 4.2061, + "step": 8441 + }, + { + "epoch": 0.7195090769624137, + "grad_norm": 35.01580346522016, + "learning_rate": 9.416330241151375e-06, + "loss": 3.6398, + "step": 8442 + }, + { + "epoch": 0.7195943066564391, + "grad_norm": 48.03210927265235, + "learning_rate": 9.416097730201901e-06, + "loss": 3.786, + "step": 8443 + }, + { + "epoch": 0.7196795363504646, + "grad_norm": 63.31724200128013, + "learning_rate": 9.415865175821861e-06, + "loss": 4.8206, + "step": 8444 + }, + { + "epoch": 0.7197647660444899, + "grad_norm": 51.43962536274468, + "learning_rate": 9.415632578013541e-06, + "loss": 4.8237, + "step": 8445 + }, + { + "epoch": 0.7198499957385153, + "grad_norm": 49.57014707873711, + "learning_rate": 9.41539993677923e-06, + "loss": 3.5967, + "step": 8446 + }, + { + "epoch": 0.7199352254325407, + "grad_norm": 137.51777212226787, + "learning_rate": 9.415167252121216e-06, + "loss": 5.0249, + "step": 8447 + }, + { + "epoch": 0.7200204551265661, + "grad_norm": 29.2590740026413, + "learning_rate": 9.414934524041785e-06, + "loss": 2.5361, + "step": 8448 + }, + { + "epoch": 0.7201056848205915, + "grad_norm": 51.27271373036123, + "learning_rate": 9.414701752543228e-06, + "loss": 4.1155, + "step": 8449 + }, + { + "epoch": 0.7201909145146169, + "grad_norm": 64.49081976849274, + "learning_rate": 9.414468937627832e-06, + "loss": 4.0365, + "step": 8450 + }, + { + "epoch": 0.7202761442086423, + "grad_norm": 35.26167676225941, + "learning_rate": 9.414236079297888e-06, + "loss": 3.7447, + "step": 8451 + }, + { + "epoch": 0.7203613739026676, + "grad_norm": 37.45277688154743, + "learning_rate": 9.414003177555688e-06, + "loss": 3.4734, + "step": 8452 + }, + { + "epoch": 0.7204466035966931, + "grad_norm": 32.87396570746419, + "learning_rate": 9.41377023240352e-06, + "loss": 3.4263, + "step": 8453 + }, + { + "epoch": 0.7205318332907185, + "grad_norm": 78.93050118419951, + "learning_rate": 9.413537243843676e-06, + "loss": 4.3247, + "step": 8454 + }, + { + "epoch": 0.7206170629847439, + "grad_norm": 26.298548457560834, + "learning_rate": 9.413304211878447e-06, + "loss": 2.7887, + "step": 8455 + }, + { + "epoch": 0.7207022926787693, + "grad_norm": 46.796144463112476, + "learning_rate": 9.413071136510124e-06, + "loss": 3.962, + "step": 8456 + }, + { + "epoch": 0.7207875223727946, + "grad_norm": 43.18530225694095, + "learning_rate": 9.412838017741001e-06, + "loss": 3.9073, + "step": 8457 + }, + { + "epoch": 0.7208727520668201, + "grad_norm": 35.13832410504685, + "learning_rate": 9.412604855573368e-06, + "loss": 2.8401, + "step": 8458 + }, + { + "epoch": 0.7209579817608455, + "grad_norm": 63.039197316012576, + "learning_rate": 9.412371650009522e-06, + "loss": 4.1712, + "step": 8459 + }, + { + "epoch": 0.7210432114548708, + "grad_norm": 43.5267975724106, + "learning_rate": 9.412138401051752e-06, + "loss": 2.7633, + "step": 8460 + }, + { + "epoch": 0.7211284411488963, + "grad_norm": 45.858336159125294, + "learning_rate": 9.411905108702355e-06, + "loss": 4.2143, + "step": 8461 + }, + { + "epoch": 0.7212136708429217, + "grad_norm": 35.77452700339567, + "learning_rate": 9.411671772963625e-06, + "loss": 3.6389, + "step": 8462 + }, + { + "epoch": 0.7212989005369471, + "grad_norm": 45.52940879426556, + "learning_rate": 9.411438393837855e-06, + "loss": 4.1669, + "step": 8463 + }, + { + "epoch": 0.7213841302309725, + "grad_norm": 43.91305587195607, + "learning_rate": 9.411204971327341e-06, + "loss": 4.2071, + "step": 8464 + }, + { + "epoch": 0.7214693599249978, + "grad_norm": 64.8909394601408, + "learning_rate": 9.41097150543438e-06, + "loss": 4.0426, + "step": 8465 + }, + { + "epoch": 0.7215545896190233, + "grad_norm": 89.66582738002559, + "learning_rate": 9.410737996161267e-06, + "loss": 5.5577, + "step": 8466 + }, + { + "epoch": 0.7216398193130487, + "grad_norm": 227.1237191270644, + "learning_rate": 9.410504443510298e-06, + "loss": 2.1643, + "step": 8467 + }, + { + "epoch": 0.721725049007074, + "grad_norm": 44.43339163291802, + "learning_rate": 9.41027084748377e-06, + "loss": 3.5094, + "step": 8468 + }, + { + "epoch": 0.7218102787010995, + "grad_norm": 35.11706586523106, + "learning_rate": 9.410037208083982e-06, + "loss": 3.2764, + "step": 8469 + }, + { + "epoch": 0.7218955083951248, + "grad_norm": 48.95974515376673, + "learning_rate": 9.409803525313228e-06, + "loss": 4.0624, + "step": 8470 + }, + { + "epoch": 0.7219807380891503, + "grad_norm": 36.921545304558215, + "learning_rate": 9.40956979917381e-06, + "loss": 2.9699, + "step": 8471 + }, + { + "epoch": 0.7220659677831757, + "grad_norm": 70.04100531845235, + "learning_rate": 9.409336029668025e-06, + "loss": 4.3331, + "step": 8472 + }, + { + "epoch": 0.722151197477201, + "grad_norm": 704.8502797218471, + "learning_rate": 9.409102216798174e-06, + "loss": 8.9461, + "step": 8473 + }, + { + "epoch": 0.7222364271712265, + "grad_norm": 98.27948036438647, + "learning_rate": 9.408868360566553e-06, + "loss": 4.5259, + "step": 8474 + }, + { + "epoch": 0.7223216568652518, + "grad_norm": 36.55989008630952, + "learning_rate": 9.408634460975464e-06, + "loss": 3.7914, + "step": 8475 + }, + { + "epoch": 0.7224068865592772, + "grad_norm": 58.17629932938226, + "learning_rate": 9.408400518027207e-06, + "loss": 4.1808, + "step": 8476 + }, + { + "epoch": 0.7224921162533027, + "grad_norm": 55.707524680893485, + "learning_rate": 9.408166531724082e-06, + "loss": 3.1686, + "step": 8477 + }, + { + "epoch": 0.722577345947328, + "grad_norm": 35.9874892019952, + "learning_rate": 9.40793250206839e-06, + "loss": 3.1434, + "step": 8478 + }, + { + "epoch": 0.7226625756413535, + "grad_norm": 39.794354880841354, + "learning_rate": 9.407698429062435e-06, + "loss": 3.819, + "step": 8479 + }, + { + "epoch": 0.7227478053353789, + "grad_norm": 46.39170249586668, + "learning_rate": 9.407464312708519e-06, + "loss": 4.5048, + "step": 8480 + }, + { + "epoch": 0.7228330350294042, + "grad_norm": 169.35727942114067, + "learning_rate": 9.407230153008939e-06, + "loss": 5.5028, + "step": 8481 + }, + { + "epoch": 0.7229182647234297, + "grad_norm": 942.3122568010288, + "learning_rate": 9.406995949966005e-06, + "loss": 5.5939, + "step": 8482 + }, + { + "epoch": 0.723003494417455, + "grad_norm": 53.452366941520594, + "learning_rate": 9.406761703582015e-06, + "loss": 4.7348, + "step": 8483 + }, + { + "epoch": 0.7230887241114804, + "grad_norm": 46.132222716749396, + "learning_rate": 9.406527413859274e-06, + "loss": 3.8471, + "step": 8484 + }, + { + "epoch": 0.7231739538055059, + "grad_norm": 52.28288704895239, + "learning_rate": 9.406293080800088e-06, + "loss": 4.0229, + "step": 8485 + }, + { + "epoch": 0.7232591834995312, + "grad_norm": 72.21310003520351, + "learning_rate": 9.406058704406763e-06, + "loss": 4.4887, + "step": 8486 + }, + { + "epoch": 0.7233444131935567, + "grad_norm": 40.185827344512106, + "learning_rate": 9.405824284681597e-06, + "loss": 3.5293, + "step": 8487 + }, + { + "epoch": 0.723429642887582, + "grad_norm": 130.29824037620878, + "learning_rate": 9.405589821626902e-06, + "loss": 6.4635, + "step": 8488 + }, + { + "epoch": 0.7235148725816074, + "grad_norm": 64.69037156296433, + "learning_rate": 9.405355315244982e-06, + "loss": 4.3446, + "step": 8489 + }, + { + "epoch": 0.7236001022756329, + "grad_norm": 99.912055998974, + "learning_rate": 9.405120765538143e-06, + "loss": 4.15, + "step": 8490 + }, + { + "epoch": 0.7236853319696582, + "grad_norm": 55.514482629570075, + "learning_rate": 9.404886172508692e-06, + "loss": 4.2878, + "step": 8491 + }, + { + "epoch": 0.7237705616636836, + "grad_norm": 148.7200365081542, + "learning_rate": 9.404651536158934e-06, + "loss": 6.1317, + "step": 8492 + }, + { + "epoch": 0.723855791357709, + "grad_norm": 124.5191325813226, + "learning_rate": 9.40441685649118e-06, + "loss": 3.6303, + "step": 8493 + }, + { + "epoch": 0.7239410210517344, + "grad_norm": 66.98309781239341, + "learning_rate": 9.404182133507735e-06, + "loss": 4.2614, + "step": 8494 + }, + { + "epoch": 0.7240262507457598, + "grad_norm": 43.39161737939548, + "learning_rate": 9.40394736721091e-06, + "loss": 2.659, + "step": 8495 + }, + { + "epoch": 0.7241114804397852, + "grad_norm": 54.75787840366236, + "learning_rate": 9.403712557603011e-06, + "loss": 4.1992, + "step": 8496 + }, + { + "epoch": 0.7241967101338106, + "grad_norm": 32.5846436329591, + "learning_rate": 9.403477704686351e-06, + "loss": 2.8826, + "step": 8497 + }, + { + "epoch": 0.7242819398278361, + "grad_norm": 39.754704778003685, + "learning_rate": 9.403242808463236e-06, + "loss": 3.8974, + "step": 8498 + }, + { + "epoch": 0.7243671695218614, + "grad_norm": 106.26512233553736, + "learning_rate": 9.40300786893598e-06, + "loss": 3.7057, + "step": 8499 + }, + { + "epoch": 0.7244523992158868, + "grad_norm": 51.74354358276156, + "learning_rate": 9.402772886106888e-06, + "loss": 3.9995, + "step": 8500 + }, + { + "epoch": 0.7245376289099122, + "grad_norm": 281.31636443041407, + "learning_rate": 9.402537859978275e-06, + "loss": 7.3883, + "step": 8501 + }, + { + "epoch": 0.7246228586039376, + "grad_norm": 73.36376196046903, + "learning_rate": 9.402302790552453e-06, + "loss": 4.4616, + "step": 8502 + }, + { + "epoch": 0.724708088297963, + "grad_norm": 53.704952114047806, + "learning_rate": 9.402067677831732e-06, + "loss": 3.7966, + "step": 8503 + }, + { + "epoch": 0.7247933179919884, + "grad_norm": 35.53950902474622, + "learning_rate": 9.401832521818424e-06, + "loss": 2.035, + "step": 8504 + }, + { + "epoch": 0.7248785476860138, + "grad_norm": 27.648492280563666, + "learning_rate": 9.401597322514842e-06, + "loss": 3.7348, + "step": 8505 + }, + { + "epoch": 0.7249637773800393, + "grad_norm": 102.22264111611332, + "learning_rate": 9.4013620799233e-06, + "loss": 4.6026, + "step": 8506 + }, + { + "epoch": 0.7250490070740646, + "grad_norm": 71.55456320070896, + "learning_rate": 9.40112679404611e-06, + "loss": 3.4096, + "step": 8507 + }, + { + "epoch": 0.72513423676809, + "grad_norm": 67.89445242430152, + "learning_rate": 9.400891464885589e-06, + "loss": 4.2355, + "step": 8508 + }, + { + "epoch": 0.7252194664621154, + "grad_norm": 67.48948412173624, + "learning_rate": 9.400656092444047e-06, + "loss": 4.1077, + "step": 8509 + }, + { + "epoch": 0.7253046961561408, + "grad_norm": 46.199279992027854, + "learning_rate": 9.4004206767238e-06, + "loss": 2.3987, + "step": 8510 + }, + { + "epoch": 0.7253899258501662, + "grad_norm": 66.1066992053935, + "learning_rate": 9.400185217727167e-06, + "loss": 4.4728, + "step": 8511 + }, + { + "epoch": 0.7254751555441916, + "grad_norm": 79.18281007743127, + "learning_rate": 9.39994971545646e-06, + "loss": 4.6648, + "step": 8512 + }, + { + "epoch": 0.725560385238217, + "grad_norm": 26.498584422439013, + "learning_rate": 9.399714169913995e-06, + "loss": 2.1771, + "step": 8513 + }, + { + "epoch": 0.7256456149322424, + "grad_norm": 46.254730803428096, + "learning_rate": 9.39947858110209e-06, + "loss": 3.5798, + "step": 8514 + }, + { + "epoch": 0.7257308446262678, + "grad_norm": 59.70658021821467, + "learning_rate": 9.399242949023063e-06, + "loss": 3.6548, + "step": 8515 + }, + { + "epoch": 0.7258160743202932, + "grad_norm": 36.03449206937789, + "learning_rate": 9.399007273679225e-06, + "loss": 4.0168, + "step": 8516 + }, + { + "epoch": 0.7259013040143186, + "grad_norm": 423.79765275608185, + "learning_rate": 9.398771555072902e-06, + "loss": 4.8261, + "step": 8517 + }, + { + "epoch": 0.725986533708344, + "grad_norm": 62.767462828321044, + "learning_rate": 9.39853579320641e-06, + "loss": 3.8839, + "step": 8518 + }, + { + "epoch": 0.7260717634023693, + "grad_norm": 31.283124351555866, + "learning_rate": 9.398299988082063e-06, + "loss": 4.0755, + "step": 8519 + }, + { + "epoch": 0.7261569930963948, + "grad_norm": 61.67158445411225, + "learning_rate": 9.398064139702185e-06, + "loss": 3.7723, + "step": 8520 + }, + { + "epoch": 0.7262422227904202, + "grad_norm": 82.43284952839774, + "learning_rate": 9.397828248069094e-06, + "loss": 4.3207, + "step": 8521 + }, + { + "epoch": 0.7263274524844456, + "grad_norm": 48.94003228179743, + "learning_rate": 9.397592313185109e-06, + "loss": 3.4113, + "step": 8522 + }, + { + "epoch": 0.726412682178471, + "grad_norm": 85.47534828523428, + "learning_rate": 9.397356335052551e-06, + "loss": 4.5244, + "step": 8523 + }, + { + "epoch": 0.7264979118724963, + "grad_norm": 41.63478627047177, + "learning_rate": 9.39712031367374e-06, + "loss": 4.2995, + "step": 8524 + }, + { + "epoch": 0.7265831415665218, + "grad_norm": 39.3969009955037, + "learning_rate": 9.396884249051e-06, + "loss": 3.6596, + "step": 8525 + }, + { + "epoch": 0.7266683712605472, + "grad_norm": 65.76501751583302, + "learning_rate": 9.396648141186649e-06, + "loss": 4.0413, + "step": 8526 + }, + { + "epoch": 0.7267536009545725, + "grad_norm": 28.932444634079648, + "learning_rate": 9.396411990083013e-06, + "loss": 2.7576, + "step": 8527 + }, + { + "epoch": 0.726838830648598, + "grad_norm": 73.79294390194262, + "learning_rate": 9.396175795742409e-06, + "loss": 4.6909, + "step": 8528 + }, + { + "epoch": 0.7269240603426234, + "grad_norm": 48.90581007951003, + "learning_rate": 9.395939558167164e-06, + "loss": 3.7428, + "step": 8529 + }, + { + "epoch": 0.7270092900366487, + "grad_norm": 35.516733335914346, + "learning_rate": 9.3957032773596e-06, + "loss": 3.3706, + "step": 8530 + }, + { + "epoch": 0.7270945197306742, + "grad_norm": 84.61238770318127, + "learning_rate": 9.395466953322042e-06, + "loss": 4.2333, + "step": 8531 + }, + { + "epoch": 0.7271797494246995, + "grad_norm": 132.40397708313037, + "learning_rate": 9.395230586056812e-06, + "loss": 3.6076, + "step": 8532 + }, + { + "epoch": 0.727264979118725, + "grad_norm": 43.525699933244866, + "learning_rate": 9.394994175566236e-06, + "loss": 3.5747, + "step": 8533 + }, + { + "epoch": 0.7273502088127504, + "grad_norm": 86.84636197757362, + "learning_rate": 9.394757721852639e-06, + "loss": 4.5854, + "step": 8534 + }, + { + "epoch": 0.7274354385067757, + "grad_norm": 43.100456958551156, + "learning_rate": 9.394521224918344e-06, + "loss": 3.2991, + "step": 8535 + }, + { + "epoch": 0.7275206682008012, + "grad_norm": 73.19889109448043, + "learning_rate": 9.394284684765682e-06, + "loss": 4.6276, + "step": 8536 + }, + { + "epoch": 0.7276058978948265, + "grad_norm": 37.23879830257934, + "learning_rate": 9.394048101396976e-06, + "loss": 3.3455, + "step": 8537 + }, + { + "epoch": 0.7276911275888519, + "grad_norm": 46.127526826963276, + "learning_rate": 9.393811474814552e-06, + "loss": 3.7128, + "step": 8538 + }, + { + "epoch": 0.7277763572828774, + "grad_norm": 82.48812748934984, + "learning_rate": 9.393574805020738e-06, + "loss": 4.9946, + "step": 8539 + }, + { + "epoch": 0.7278615869769027, + "grad_norm": 62.14892288453557, + "learning_rate": 9.39333809201786e-06, + "loss": 4.9942, + "step": 8540 + }, + { + "epoch": 0.7279468166709282, + "grad_norm": 50.15207174961249, + "learning_rate": 9.393101335808251e-06, + "loss": 4.0543, + "step": 8541 + }, + { + "epoch": 0.7280320463649536, + "grad_norm": 73.52675779970792, + "learning_rate": 9.392864536394233e-06, + "loss": 3.446, + "step": 8542 + }, + { + "epoch": 0.7281172760589789, + "grad_norm": 35.07229880260438, + "learning_rate": 9.39262769377814e-06, + "loss": 3.7252, + "step": 8543 + }, + { + "epoch": 0.7282025057530044, + "grad_norm": 42.8719218285736, + "learning_rate": 9.392390807962298e-06, + "loss": 4.2707, + "step": 8544 + }, + { + "epoch": 0.7282877354470297, + "grad_norm": 30.783718619998705, + "learning_rate": 9.392153878949038e-06, + "loss": 3.0296, + "step": 8545 + }, + { + "epoch": 0.7283729651410551, + "grad_norm": 37.99266616326244, + "learning_rate": 9.39191690674069e-06, + "loss": 4.124, + "step": 8546 + }, + { + "epoch": 0.7284581948350806, + "grad_norm": 51.858251865394244, + "learning_rate": 9.391679891339585e-06, + "loss": 4.4069, + "step": 8547 + }, + { + "epoch": 0.7285434245291059, + "grad_norm": 31.935237686751293, + "learning_rate": 9.391442832748053e-06, + "loss": 4.0455, + "step": 8548 + }, + { + "epoch": 0.7286286542231314, + "grad_norm": 38.20761502103394, + "learning_rate": 9.391205730968425e-06, + "loss": 3.8017, + "step": 8549 + }, + { + "epoch": 0.7287138839171567, + "grad_norm": 29.63226908012032, + "learning_rate": 9.390968586003034e-06, + "loss": 3.3685, + "step": 8550 + }, + { + "epoch": 0.7287991136111821, + "grad_norm": 37.94081682098085, + "learning_rate": 9.39073139785421e-06, + "loss": 3.4367, + "step": 8551 + }, + { + "epoch": 0.7288843433052076, + "grad_norm": 42.05633347788429, + "learning_rate": 9.39049416652429e-06, + "loss": 4.4126, + "step": 8552 + }, + { + "epoch": 0.7289695729992329, + "grad_norm": 81.56014789111339, + "learning_rate": 9.390256892015603e-06, + "loss": 4.4703, + "step": 8553 + }, + { + "epoch": 0.7290548026932583, + "grad_norm": 45.02994950339509, + "learning_rate": 9.390019574330485e-06, + "loss": 2.962, + "step": 8554 + }, + { + "epoch": 0.7291400323872838, + "grad_norm": 33.308651068657916, + "learning_rate": 9.389782213471268e-06, + "loss": 2.8898, + "step": 8555 + }, + { + "epoch": 0.7292252620813091, + "grad_norm": 45.32142137046848, + "learning_rate": 9.389544809440288e-06, + "loss": 4.6974, + "step": 8556 + }, + { + "epoch": 0.7293104917753346, + "grad_norm": 28.94334061593899, + "learning_rate": 9.389307362239878e-06, + "loss": 2.3222, + "step": 8557 + }, + { + "epoch": 0.7293957214693599, + "grad_norm": 55.17738911396123, + "learning_rate": 9.389069871872376e-06, + "loss": 4.2246, + "step": 8558 + }, + { + "epoch": 0.7294809511633853, + "grad_norm": 54.34218614035645, + "learning_rate": 9.388832338340114e-06, + "loss": 3.0276, + "step": 8559 + }, + { + "epoch": 0.7295661808574108, + "grad_norm": 99.6811752878024, + "learning_rate": 9.388594761645432e-06, + "loss": 4.6281, + "step": 8560 + }, + { + "epoch": 0.7296514105514361, + "grad_norm": 34.843554574968095, + "learning_rate": 9.388357141790663e-06, + "loss": 2.9882, + "step": 8561 + }, + { + "epoch": 0.7297366402454615, + "grad_norm": 60.45958778057615, + "learning_rate": 9.388119478778144e-06, + "loss": 4.3306, + "step": 8562 + }, + { + "epoch": 0.729821869939487, + "grad_norm": 76.47802298506554, + "learning_rate": 9.387881772610215e-06, + "loss": 4.0804, + "step": 8563 + }, + { + "epoch": 0.7299070996335123, + "grad_norm": 45.7590802368028, + "learning_rate": 9.387644023289213e-06, + "loss": 3.7956, + "step": 8564 + }, + { + "epoch": 0.7299923293275377, + "grad_norm": 74.52762045746772, + "learning_rate": 9.387406230817475e-06, + "loss": 4.852, + "step": 8565 + }, + { + "epoch": 0.7300775590215631, + "grad_norm": 55.68137851529273, + "learning_rate": 9.38716839519734e-06, + "loss": 3.5707, + "step": 8566 + }, + { + "epoch": 0.7301627887155885, + "grad_norm": 35.23968380659519, + "learning_rate": 9.386930516431147e-06, + "loss": 3.004, + "step": 8567 + }, + { + "epoch": 0.730248018409614, + "grad_norm": 58.93452355808924, + "learning_rate": 9.386692594521236e-06, + "loss": 5.8296, + "step": 8568 + }, + { + "epoch": 0.7303332481036393, + "grad_norm": 53.96064659537829, + "learning_rate": 9.386454629469946e-06, + "loss": 1.5303, + "step": 8569 + }, + { + "epoch": 0.7304184777976647, + "grad_norm": 68.40597446776069, + "learning_rate": 9.386216621279616e-06, + "loss": 3.9805, + "step": 8570 + }, + { + "epoch": 0.7305037074916901, + "grad_norm": 103.57232862321692, + "learning_rate": 9.385978569952591e-06, + "loss": 5.3182, + "step": 8571 + }, + { + "epoch": 0.7305889371857155, + "grad_norm": 45.532916721222726, + "learning_rate": 9.38574047549121e-06, + "loss": 4.084, + "step": 8572 + }, + { + "epoch": 0.7306741668797408, + "grad_norm": 61.6988201537078, + "learning_rate": 9.385502337897811e-06, + "loss": 4.1441, + "step": 8573 + }, + { + "epoch": 0.7307593965737663, + "grad_norm": 98.72268224330455, + "learning_rate": 9.385264157174742e-06, + "loss": 5.4706, + "step": 8574 + }, + { + "epoch": 0.7308446262677917, + "grad_norm": 34.766633661744315, + "learning_rate": 9.385025933324341e-06, + "loss": 2.9657, + "step": 8575 + }, + { + "epoch": 0.7309298559618171, + "grad_norm": 57.17188514425038, + "learning_rate": 9.384787666348954e-06, + "loss": 4.2603, + "step": 8576 + }, + { + "epoch": 0.7310150856558425, + "grad_norm": 35.58056836814135, + "learning_rate": 9.384549356250921e-06, + "loss": 3.8434, + "step": 8577 + }, + { + "epoch": 0.7311003153498679, + "grad_norm": 54.84894045652658, + "learning_rate": 9.384311003032587e-06, + "loss": 3.6672, + "step": 8578 + }, + { + "epoch": 0.7311855450438933, + "grad_norm": 50.90722374571438, + "learning_rate": 9.384072606696299e-06, + "loss": 3.8217, + "step": 8579 + }, + { + "epoch": 0.7312707747379187, + "grad_norm": 34.124605567094406, + "learning_rate": 9.383834167244397e-06, + "loss": 2.5582, + "step": 8580 + }, + { + "epoch": 0.731356004431944, + "grad_norm": 35.91446003182575, + "learning_rate": 9.383595684679228e-06, + "loss": 3.361, + "step": 8581 + }, + { + "epoch": 0.7314412341259695, + "grad_norm": 51.44546523370818, + "learning_rate": 9.383357159003137e-06, + "loss": 3.7287, + "step": 8582 + }, + { + "epoch": 0.7315264638199949, + "grad_norm": 38.80652227888235, + "learning_rate": 9.383118590218473e-06, + "loss": 3.9542, + "step": 8583 + }, + { + "epoch": 0.7316116935140203, + "grad_norm": 64.94103142882037, + "learning_rate": 9.382879978327574e-06, + "loss": 5.7009, + "step": 8584 + }, + { + "epoch": 0.7316969232080457, + "grad_norm": 72.72561664514943, + "learning_rate": 9.382641323332796e-06, + "loss": 4.0469, + "step": 8585 + }, + { + "epoch": 0.731782152902071, + "grad_norm": 32.22455309141986, + "learning_rate": 9.382402625236481e-06, + "loss": 2.3856, + "step": 8586 + }, + { + "epoch": 0.7318673825960965, + "grad_norm": 49.04567874690224, + "learning_rate": 9.382163884040977e-06, + "loss": 3.5761, + "step": 8587 + }, + { + "epoch": 0.7319526122901219, + "grad_norm": 79.73231779674175, + "learning_rate": 9.381925099748635e-06, + "loss": 4.5056, + "step": 8588 + }, + { + "epoch": 0.7320378419841472, + "grad_norm": 38.090853911876486, + "learning_rate": 9.381686272361798e-06, + "loss": 4.0394, + "step": 8589 + }, + { + "epoch": 0.7321230716781727, + "grad_norm": 54.67707647703505, + "learning_rate": 9.38144740188282e-06, + "loss": 4.3195, + "step": 8590 + }, + { + "epoch": 0.7322083013721981, + "grad_norm": 47.007149807978266, + "learning_rate": 9.381208488314046e-06, + "loss": 4.5101, + "step": 8591 + }, + { + "epoch": 0.7322935310662235, + "grad_norm": 53.056350807310146, + "learning_rate": 9.380969531657828e-06, + "loss": 4.1612, + "step": 8592 + }, + { + "epoch": 0.7323787607602489, + "grad_norm": 68.08266835359159, + "learning_rate": 9.380730531916515e-06, + "loss": 3.2854, + "step": 8593 + }, + { + "epoch": 0.7324639904542742, + "grad_norm": 50.54245722039397, + "learning_rate": 9.380491489092458e-06, + "loss": 5.2598, + "step": 8594 + }, + { + "epoch": 0.7325492201482997, + "grad_norm": 39.828744545425366, + "learning_rate": 9.380252403188007e-06, + "loss": 4.4075, + "step": 8595 + }, + { + "epoch": 0.7326344498423251, + "grad_norm": 34.56658336426164, + "learning_rate": 9.380013274205518e-06, + "loss": 2.8895, + "step": 8596 + }, + { + "epoch": 0.7327196795363504, + "grad_norm": 92.96898026096953, + "learning_rate": 9.379774102147335e-06, + "loss": 4.9067, + "step": 8597 + }, + { + "epoch": 0.7328049092303759, + "grad_norm": 31.38055177342042, + "learning_rate": 9.379534887015815e-06, + "loss": 3.5984, + "step": 8598 + }, + { + "epoch": 0.7328901389244012, + "grad_norm": 76.88048926631627, + "learning_rate": 9.37929562881331e-06, + "loss": 3.8722, + "step": 8599 + }, + { + "epoch": 0.7329753686184267, + "grad_norm": 32.619198519214976, + "learning_rate": 9.379056327542173e-06, + "loss": 2.6878, + "step": 8600 + }, + { + "epoch": 0.7330605983124521, + "grad_norm": 53.82612618316068, + "learning_rate": 9.378816983204757e-06, + "loss": 3.4749, + "step": 8601 + }, + { + "epoch": 0.7331458280064774, + "grad_norm": 47.775886778709605, + "learning_rate": 9.378577595803415e-06, + "loss": 4.5475, + "step": 8602 + }, + { + "epoch": 0.7332310577005029, + "grad_norm": 139.38248527717238, + "learning_rate": 9.378338165340503e-06, + "loss": 4.7211, + "step": 8603 + }, + { + "epoch": 0.7333162873945283, + "grad_norm": 34.595863776088514, + "learning_rate": 9.378098691818377e-06, + "loss": 3.3288, + "step": 8604 + }, + { + "epoch": 0.7334015170885536, + "grad_norm": 29.485399310469578, + "learning_rate": 9.377859175239385e-06, + "loss": 3.3505, + "step": 8605 + }, + { + "epoch": 0.7334867467825791, + "grad_norm": 37.13766174778927, + "learning_rate": 9.377619615605892e-06, + "loss": 3.5389, + "step": 8606 + }, + { + "epoch": 0.7335719764766044, + "grad_norm": 75.35562297037585, + "learning_rate": 9.377380012920247e-06, + "loss": 4.7464, + "step": 8607 + }, + { + "epoch": 0.7336572061706298, + "grad_norm": 95.0028358850285, + "learning_rate": 9.377140367184812e-06, + "loss": 3.5126, + "step": 8608 + }, + { + "epoch": 0.7337424358646553, + "grad_norm": 88.98788981455786, + "learning_rate": 9.376900678401937e-06, + "loss": 4.9149, + "step": 8609 + }, + { + "epoch": 0.7338276655586806, + "grad_norm": 41.545621553898044, + "learning_rate": 9.376660946573986e-06, + "loss": 3.3664, + "step": 8610 + }, + { + "epoch": 0.7339128952527061, + "grad_norm": 54.69624344916224, + "learning_rate": 9.376421171703312e-06, + "loss": 4.0558, + "step": 8611 + }, + { + "epoch": 0.7339981249467314, + "grad_norm": 26.18657702830262, + "learning_rate": 9.376181353792275e-06, + "loss": 3.4354, + "step": 8612 + }, + { + "epoch": 0.7340833546407568, + "grad_norm": 42.38145789720156, + "learning_rate": 9.375941492843235e-06, + "loss": 3.9853, + "step": 8613 + }, + { + "epoch": 0.7341685843347823, + "grad_norm": 42.222165398869556, + "learning_rate": 9.375701588858547e-06, + "loss": 3.9168, + "step": 8614 + }, + { + "epoch": 0.7342538140288076, + "grad_norm": 64.28750227788176, + "learning_rate": 9.375461641840573e-06, + "loss": 3.8352, + "step": 8615 + }, + { + "epoch": 0.734339043722833, + "grad_norm": 44.21274882920113, + "learning_rate": 9.375221651791672e-06, + "loss": 4.2076, + "step": 8616 + }, + { + "epoch": 0.7344242734168585, + "grad_norm": 51.449307712161264, + "learning_rate": 9.374981618714207e-06, + "loss": 4.1908, + "step": 8617 + }, + { + "epoch": 0.7345095031108838, + "grad_norm": 53.84761783520148, + "learning_rate": 9.374741542610535e-06, + "loss": 4.7104, + "step": 8618 + }, + { + "epoch": 0.7345947328049093, + "grad_norm": 40.5838238407919, + "learning_rate": 9.374501423483018e-06, + "loss": 3.9138, + "step": 8619 + }, + { + "epoch": 0.7346799624989346, + "grad_norm": 40.839464897706144, + "learning_rate": 9.374261261334017e-06, + "loss": 3.5471, + "step": 8620 + }, + { + "epoch": 0.73476519219296, + "grad_norm": 62.77460597225458, + "learning_rate": 9.374021056165897e-06, + "loss": 3.2314, + "step": 8621 + }, + { + "epoch": 0.7348504218869855, + "grad_norm": 35.407521443035705, + "learning_rate": 9.373780807981015e-06, + "loss": 3.3296, + "step": 8622 + }, + { + "epoch": 0.7349356515810108, + "grad_norm": 35.68559553499109, + "learning_rate": 9.373540516781739e-06, + "loss": 4.0371, + "step": 8623 + }, + { + "epoch": 0.7350208812750362, + "grad_norm": 67.05859553880073, + "learning_rate": 9.373300182570429e-06, + "loss": 4.1298, + "step": 8624 + }, + { + "epoch": 0.7351061109690616, + "grad_norm": 50.546611700863465, + "learning_rate": 9.37305980534945e-06, + "loss": 3.4836, + "step": 8625 + }, + { + "epoch": 0.735191340663087, + "grad_norm": 30.40974106172463, + "learning_rate": 9.372819385121163e-06, + "loss": 2.1917, + "step": 8626 + }, + { + "epoch": 0.7352765703571125, + "grad_norm": 68.40974049808942, + "learning_rate": 9.372578921887938e-06, + "loss": 4.2421, + "step": 8627 + }, + { + "epoch": 0.7353618000511378, + "grad_norm": 46.49120654754322, + "learning_rate": 9.372338415652136e-06, + "loss": 4.1725, + "step": 8628 + }, + { + "epoch": 0.7354470297451632, + "grad_norm": 42.20203990732686, + "learning_rate": 9.37209786641612e-06, + "loss": 4.4082, + "step": 8629 + }, + { + "epoch": 0.7355322594391887, + "grad_norm": 62.47263927078172, + "learning_rate": 9.371857274182263e-06, + "loss": 3.9095, + "step": 8630 + }, + { + "epoch": 0.735617489133214, + "grad_norm": 81.56748445986102, + "learning_rate": 9.371616638952926e-06, + "loss": 4.31, + "step": 8631 + }, + { + "epoch": 0.7357027188272394, + "grad_norm": 47.69289792145419, + "learning_rate": 9.371375960730476e-06, + "loss": 4.1343, + "step": 8632 + }, + { + "epoch": 0.7357879485212648, + "grad_norm": 45.45416182774038, + "learning_rate": 9.37113523951728e-06, + "loss": 3.79, + "step": 8633 + }, + { + "epoch": 0.7358731782152902, + "grad_norm": 49.94578251285773, + "learning_rate": 9.370894475315707e-06, + "loss": 3.2857, + "step": 8634 + }, + { + "epoch": 0.7359584079093157, + "grad_norm": 35.16857330353269, + "learning_rate": 9.370653668128123e-06, + "loss": 4.1634, + "step": 8635 + }, + { + "epoch": 0.736043637603341, + "grad_norm": 86.30343486397005, + "learning_rate": 9.370412817956897e-06, + "loss": 4.9296, + "step": 8636 + }, + { + "epoch": 0.7361288672973664, + "grad_norm": 84.63766145697988, + "learning_rate": 9.370171924804397e-06, + "loss": 3.6404, + "step": 8637 + }, + { + "epoch": 0.7362140969913918, + "grad_norm": 50.96303121247924, + "learning_rate": 9.369930988672992e-06, + "loss": 3.6919, + "step": 8638 + }, + { + "epoch": 0.7362993266854172, + "grad_norm": 63.57028960308399, + "learning_rate": 9.369690009565054e-06, + "loss": 4.4054, + "step": 8639 + }, + { + "epoch": 0.7363845563794426, + "grad_norm": 36.62164511164311, + "learning_rate": 9.36944898748295e-06, + "loss": 2.9282, + "step": 8640 + }, + { + "epoch": 0.736469786073468, + "grad_norm": 54.377634636336424, + "learning_rate": 9.369207922429052e-06, + "loss": 4.1764, + "step": 8641 + }, + { + "epoch": 0.7365550157674934, + "grad_norm": 46.22196675381468, + "learning_rate": 9.36896681440573e-06, + "loss": 4.0473, + "step": 8642 + }, + { + "epoch": 0.7366402454615187, + "grad_norm": 61.60222359005956, + "learning_rate": 9.368725663415355e-06, + "loss": 3.9947, + "step": 8643 + }, + { + "epoch": 0.7367254751555442, + "grad_norm": 34.19623643759125, + "learning_rate": 9.3684844694603e-06, + "loss": 4.2851, + "step": 8644 + }, + { + "epoch": 0.7368107048495696, + "grad_norm": 44.7846581145335, + "learning_rate": 9.368243232542935e-06, + "loss": 3.2935, + "step": 8645 + }, + { + "epoch": 0.736895934543595, + "grad_norm": 42.5707374496411, + "learning_rate": 9.368001952665635e-06, + "loss": 3.7239, + "step": 8646 + }, + { + "epoch": 0.7369811642376204, + "grad_norm": 37.78809153209083, + "learning_rate": 9.367760629830771e-06, + "loss": 2.3169, + "step": 8647 + }, + { + "epoch": 0.7370663939316457, + "grad_norm": 45.348526932916656, + "learning_rate": 9.367519264040717e-06, + "loss": 4.4395, + "step": 8648 + }, + { + "epoch": 0.7371516236256712, + "grad_norm": 33.98016979180548, + "learning_rate": 9.367277855297845e-06, + "loss": 3.5401, + "step": 8649 + }, + { + "epoch": 0.7372368533196966, + "grad_norm": 80.99982827966028, + "learning_rate": 9.367036403604532e-06, + "loss": 4.5223, + "step": 8650 + }, + { + "epoch": 0.7373220830137219, + "grad_norm": 78.70195870700691, + "learning_rate": 9.36679490896315e-06, + "loss": 4.6032, + "step": 8651 + }, + { + "epoch": 0.7374073127077474, + "grad_norm": 37.329872341783016, + "learning_rate": 9.366553371376076e-06, + "loss": 3.3313, + "step": 8652 + }, + { + "epoch": 0.7374925424017728, + "grad_norm": 34.13683312058305, + "learning_rate": 9.366311790845686e-06, + "loss": 3.5598, + "step": 8653 + }, + { + "epoch": 0.7375777720957982, + "grad_norm": 33.993313533094096, + "learning_rate": 9.366070167374352e-06, + "loss": 2.9386, + "step": 8654 + }, + { + "epoch": 0.7376630017898236, + "grad_norm": 24.37599667983417, + "learning_rate": 9.365828500964454e-06, + "loss": 2.4967, + "step": 8655 + }, + { + "epoch": 0.7377482314838489, + "grad_norm": 42.094180136788346, + "learning_rate": 9.365586791618369e-06, + "loss": 4.2845, + "step": 8656 + }, + { + "epoch": 0.7378334611778744, + "grad_norm": 74.39238245402892, + "learning_rate": 9.36534503933847e-06, + "loss": 3.7391, + "step": 8657 + }, + { + "epoch": 0.7379186908718998, + "grad_norm": 44.70101397739688, + "learning_rate": 9.365103244127138e-06, + "loss": 4.7928, + "step": 8658 + }, + { + "epoch": 0.7380039205659251, + "grad_norm": 37.741133687897445, + "learning_rate": 9.364861405986751e-06, + "loss": 4.2569, + "step": 8659 + }, + { + "epoch": 0.7380891502599506, + "grad_norm": 36.83760973238411, + "learning_rate": 9.364619524919687e-06, + "loss": 3.622, + "step": 8660 + }, + { + "epoch": 0.738174379953976, + "grad_norm": 44.507268862473104, + "learning_rate": 9.364377600928322e-06, + "loss": 4.7057, + "step": 8661 + }, + { + "epoch": 0.7382596096480014, + "grad_norm": 42.61252827850336, + "learning_rate": 9.36413563401504e-06, + "loss": 4.1029, + "step": 8662 + }, + { + "epoch": 0.7383448393420268, + "grad_norm": 35.75761214907165, + "learning_rate": 9.363893624182217e-06, + "loss": 3.686, + "step": 8663 + }, + { + "epoch": 0.7384300690360521, + "grad_norm": 54.48836322513299, + "learning_rate": 9.363651571432235e-06, + "loss": 3.7951, + "step": 8664 + }, + { + "epoch": 0.7385152987300776, + "grad_norm": 40.394243653535554, + "learning_rate": 9.363409475767472e-06, + "loss": 3.341, + "step": 8665 + }, + { + "epoch": 0.738600528424103, + "grad_norm": 53.60125943042724, + "learning_rate": 9.363167337190311e-06, + "loss": 4.3409, + "step": 8666 + }, + { + "epoch": 0.7386857581181283, + "grad_norm": 159.51898696605772, + "learning_rate": 9.362925155703136e-06, + "loss": 5.3783, + "step": 8667 + }, + { + "epoch": 0.7387709878121538, + "grad_norm": 26.302542202993926, + "learning_rate": 9.362682931308323e-06, + "loss": 3.5127, + "step": 8668 + }, + { + "epoch": 0.7388562175061791, + "grad_norm": 48.393117308744436, + "learning_rate": 9.362440664008257e-06, + "loss": 3.8128, + "step": 8669 + }, + { + "epoch": 0.7389414472002046, + "grad_norm": 42.109022503747696, + "learning_rate": 9.362198353805321e-06, + "loss": 3.9082, + "step": 8670 + }, + { + "epoch": 0.73902667689423, + "grad_norm": 44.45250397360502, + "learning_rate": 9.361956000701898e-06, + "loss": 4.3823, + "step": 8671 + }, + { + "epoch": 0.7391119065882553, + "grad_norm": 48.14301462802718, + "learning_rate": 9.36171360470037e-06, + "loss": 3.783, + "step": 8672 + }, + { + "epoch": 0.7391971362822808, + "grad_norm": 29.932418813293385, + "learning_rate": 9.361471165803122e-06, + "loss": 3.3587, + "step": 8673 + }, + { + "epoch": 0.7392823659763061, + "grad_norm": 42.79131755641806, + "learning_rate": 9.361228684012538e-06, + "loss": 3.3421, + "step": 8674 + }, + { + "epoch": 0.7393675956703315, + "grad_norm": 43.33668951446632, + "learning_rate": 9.360986159331003e-06, + "loss": 3.77, + "step": 8675 + }, + { + "epoch": 0.739452825364357, + "grad_norm": 34.335699619150994, + "learning_rate": 9.360743591760904e-06, + "loss": 3.2159, + "step": 8676 + }, + { + "epoch": 0.7395380550583823, + "grad_norm": 41.36674007621225, + "learning_rate": 9.360500981304621e-06, + "loss": 3.8673, + "step": 8677 + }, + { + "epoch": 0.7396232847524077, + "grad_norm": 49.49030846287214, + "learning_rate": 9.360258327964546e-06, + "loss": 4.0272, + "step": 8678 + }, + { + "epoch": 0.7397085144464332, + "grad_norm": 66.58638958936723, + "learning_rate": 9.360015631743062e-06, + "loss": 4.508, + "step": 8679 + }, + { + "epoch": 0.7397937441404585, + "grad_norm": 92.63325774580154, + "learning_rate": 9.359772892642558e-06, + "loss": 4.4018, + "step": 8680 + }, + { + "epoch": 0.739878973834484, + "grad_norm": 158.04802894758487, + "learning_rate": 9.35953011066542e-06, + "loss": 4.1927, + "step": 8681 + }, + { + "epoch": 0.7399642035285093, + "grad_norm": 55.67332739539912, + "learning_rate": 9.359287285814034e-06, + "loss": 5.5457, + "step": 8682 + }, + { + "epoch": 0.7400494332225347, + "grad_norm": 30.241783709194234, + "learning_rate": 9.35904441809079e-06, + "loss": 4.0191, + "step": 8683 + }, + { + "epoch": 0.7401346629165602, + "grad_norm": 91.99657183152853, + "learning_rate": 9.358801507498078e-06, + "loss": 4.5883, + "step": 8684 + }, + { + "epoch": 0.7402198926105855, + "grad_norm": 65.0339775542888, + "learning_rate": 9.358558554038283e-06, + "loss": 4.8105, + "step": 8685 + }, + { + "epoch": 0.7403051223046109, + "grad_norm": 73.83214746380067, + "learning_rate": 9.358315557713798e-06, + "loss": 5.182, + "step": 8686 + }, + { + "epoch": 0.7403903519986363, + "grad_norm": 34.88978884434051, + "learning_rate": 9.358072518527012e-06, + "loss": 3.5923, + "step": 8687 + }, + { + "epoch": 0.7404755816926617, + "grad_norm": 49.682473075430806, + "learning_rate": 9.357829436480313e-06, + "loss": 3.5477, + "step": 8688 + }, + { + "epoch": 0.7405608113866872, + "grad_norm": 34.19502031389076, + "learning_rate": 9.357586311576094e-06, + "loss": 2.5393, + "step": 8689 + }, + { + "epoch": 0.7406460410807125, + "grad_norm": 54.716629672551406, + "learning_rate": 9.357343143816744e-06, + "loss": 3.865, + "step": 8690 + }, + { + "epoch": 0.7407312707747379, + "grad_norm": 53.78464047705469, + "learning_rate": 9.357099933204656e-06, + "loss": 5.0677, + "step": 8691 + }, + { + "epoch": 0.7408165004687634, + "grad_norm": 70.95139078943082, + "learning_rate": 9.356856679742224e-06, + "loss": 3.0508, + "step": 8692 + }, + { + "epoch": 0.7409017301627887, + "grad_norm": 45.448969646488955, + "learning_rate": 9.356613383431834e-06, + "loss": 3.606, + "step": 8693 + }, + { + "epoch": 0.7409869598568141, + "grad_norm": 60.859193839860815, + "learning_rate": 9.356370044275885e-06, + "loss": 3.6141, + "step": 8694 + }, + { + "epoch": 0.7410721895508395, + "grad_norm": 40.0311453813292, + "learning_rate": 9.356126662276767e-06, + "loss": 2.9447, + "step": 8695 + }, + { + "epoch": 0.7411574192448649, + "grad_norm": 32.371249409040296, + "learning_rate": 9.355883237436874e-06, + "loss": 3.7677, + "step": 8696 + }, + { + "epoch": 0.7412426489388904, + "grad_norm": 34.13354055590746, + "learning_rate": 9.3556397697586e-06, + "loss": 3.6705, + "step": 8697 + }, + { + "epoch": 0.7413278786329157, + "grad_norm": 56.73566476422123, + "learning_rate": 9.35539625924434e-06, + "loss": 4.3746, + "step": 8698 + }, + { + "epoch": 0.7414131083269411, + "grad_norm": 44.08150111999473, + "learning_rate": 9.35515270589649e-06, + "loss": 3.8002, + "step": 8699 + }, + { + "epoch": 0.7414983380209665, + "grad_norm": 48.709840490602815, + "learning_rate": 9.35490910971744e-06, + "loss": 3.4241, + "step": 8700 + }, + { + "epoch": 0.7415835677149919, + "grad_norm": 54.79380045645059, + "learning_rate": 9.354665470709592e-06, + "loss": 4.2982, + "step": 8701 + }, + { + "epoch": 0.7416687974090173, + "grad_norm": 54.722596490047465, + "learning_rate": 9.354421788875339e-06, + "loss": 4.8158, + "step": 8702 + }, + { + "epoch": 0.7417540271030427, + "grad_norm": 31.63923762492922, + "learning_rate": 9.354178064217077e-06, + "loss": 2.7818, + "step": 8703 + }, + { + "epoch": 0.7418392567970681, + "grad_norm": 57.96363481141889, + "learning_rate": 9.353934296737205e-06, + "loss": 4.2717, + "step": 8704 + }, + { + "epoch": 0.7419244864910935, + "grad_norm": 82.60988751953542, + "learning_rate": 9.353690486438119e-06, + "loss": 4.4176, + "step": 8705 + }, + { + "epoch": 0.7420097161851189, + "grad_norm": 45.7288745758361, + "learning_rate": 9.353446633322217e-06, + "loss": 3.9692, + "step": 8706 + }, + { + "epoch": 0.7420949458791443, + "grad_norm": 45.526496863351184, + "learning_rate": 9.353202737391897e-06, + "loss": 4.2113, + "step": 8707 + }, + { + "epoch": 0.7421801755731697, + "grad_norm": 53.20531724716192, + "learning_rate": 9.352958798649558e-06, + "loss": 4.3274, + "step": 8708 + }, + { + "epoch": 0.7422654052671951, + "grad_norm": 41.02296820455439, + "learning_rate": 9.352714817097601e-06, + "loss": 4.185, + "step": 8709 + }, + { + "epoch": 0.7423506349612204, + "grad_norm": 63.90392561642143, + "learning_rate": 9.352470792738422e-06, + "loss": 4.4122, + "step": 8710 + }, + { + "epoch": 0.7424358646552459, + "grad_norm": 65.43569248504639, + "learning_rate": 9.35222672557442e-06, + "loss": 5.0721, + "step": 8711 + }, + { + "epoch": 0.7425210943492713, + "grad_norm": 65.30121871035537, + "learning_rate": 9.351982615608e-06, + "loss": 4.3536, + "step": 8712 + }, + { + "epoch": 0.7426063240432967, + "grad_norm": 62.36799982120927, + "learning_rate": 9.35173846284156e-06, + "loss": 2.7389, + "step": 8713 + }, + { + "epoch": 0.7426915537373221, + "grad_norm": 50.94363713363731, + "learning_rate": 9.351494267277501e-06, + "loss": 3.9574, + "step": 8714 + }, + { + "epoch": 0.7427767834313475, + "grad_norm": 34.61325923126371, + "learning_rate": 9.351250028918225e-06, + "loss": 2.6091, + "step": 8715 + }, + { + "epoch": 0.7428620131253729, + "grad_norm": 32.77866557455935, + "learning_rate": 9.351005747766135e-06, + "loss": 2.8892, + "step": 8716 + }, + { + "epoch": 0.7429472428193983, + "grad_norm": 101.97887416034418, + "learning_rate": 9.350761423823632e-06, + "loss": 5.4518, + "step": 8717 + }, + { + "epoch": 0.7430324725134236, + "grad_norm": 76.82666196345458, + "learning_rate": 9.350517057093119e-06, + "loss": 4.3431, + "step": 8718 + }, + { + "epoch": 0.7431177022074491, + "grad_norm": 67.51097235866568, + "learning_rate": 9.350272647576998e-06, + "loss": 4.0713, + "step": 8719 + }, + { + "epoch": 0.7432029319014745, + "grad_norm": 34.688220210020404, + "learning_rate": 9.350028195277674e-06, + "loss": 3.2477, + "step": 8720 + }, + { + "epoch": 0.7432881615954998, + "grad_norm": 58.14690462978841, + "learning_rate": 9.349783700197553e-06, + "loss": 4.2532, + "step": 8721 + }, + { + "epoch": 0.7433733912895253, + "grad_norm": 38.41888814720109, + "learning_rate": 9.349539162339037e-06, + "loss": 3.7158, + "step": 8722 + }, + { + "epoch": 0.7434586209835506, + "grad_norm": 93.60278081896843, + "learning_rate": 9.349294581704533e-06, + "loss": 5.4642, + "step": 8723 + }, + { + "epoch": 0.7435438506775761, + "grad_norm": 35.734844468393675, + "learning_rate": 9.349049958296443e-06, + "loss": 4.0096, + "step": 8724 + }, + { + "epoch": 0.7436290803716015, + "grad_norm": 23.454281071941807, + "learning_rate": 9.348805292117175e-06, + "loss": 2.3097, + "step": 8725 + }, + { + "epoch": 0.7437143100656268, + "grad_norm": 124.88331070636058, + "learning_rate": 9.348560583169138e-06, + "loss": 4.3836, + "step": 8726 + }, + { + "epoch": 0.7437995397596523, + "grad_norm": 35.50289764071286, + "learning_rate": 9.348315831454733e-06, + "loss": 3.8888, + "step": 8727 + }, + { + "epoch": 0.7438847694536777, + "grad_norm": 57.586309607196874, + "learning_rate": 9.34807103697637e-06, + "loss": 4.6944, + "step": 8728 + }, + { + "epoch": 0.743969999147703, + "grad_norm": 46.69897489921731, + "learning_rate": 9.347826199736456e-06, + "loss": 4.6577, + "step": 8729 + }, + { + "epoch": 0.7440552288417285, + "grad_norm": 53.742176346934656, + "learning_rate": 9.347581319737399e-06, + "loss": 4.5813, + "step": 8730 + }, + { + "epoch": 0.7441404585357538, + "grad_norm": 50.343385210408364, + "learning_rate": 9.347336396981608e-06, + "loss": 3.5471, + "step": 8731 + }, + { + "epoch": 0.7442256882297793, + "grad_norm": 49.5316528442199, + "learning_rate": 9.34709143147149e-06, + "loss": 4.3502, + "step": 8732 + }, + { + "epoch": 0.7443109179238047, + "grad_norm": 56.9724134244167, + "learning_rate": 9.346846423209454e-06, + "loss": 3.3897, + "step": 8733 + }, + { + "epoch": 0.74439614761783, + "grad_norm": 84.45319634891472, + "learning_rate": 9.346601372197914e-06, + "loss": 3.993, + "step": 8734 + }, + { + "epoch": 0.7444813773118555, + "grad_norm": 45.36356363567651, + "learning_rate": 9.346356278439275e-06, + "loss": 3.0852, + "step": 8735 + }, + { + "epoch": 0.7445666070058808, + "grad_norm": 67.22341890728333, + "learning_rate": 9.34611114193595e-06, + "loss": 4.0729, + "step": 8736 + }, + { + "epoch": 0.7446518366999062, + "grad_norm": 47.38640286348397, + "learning_rate": 9.345865962690345e-06, + "loss": 4.8522, + "step": 8737 + }, + { + "epoch": 0.7447370663939317, + "grad_norm": 48.25370310562048, + "learning_rate": 9.34562074070488e-06, + "loss": 4.6973, + "step": 8738 + }, + { + "epoch": 0.744822296087957, + "grad_norm": 36.59987897286057, + "learning_rate": 9.345375475981959e-06, + "loss": 4.1144, + "step": 8739 + }, + { + "epoch": 0.7449075257819825, + "grad_norm": 40.65097406704269, + "learning_rate": 9.345130168523996e-06, + "loss": 3.168, + "step": 8740 + }, + { + "epoch": 0.7449927554760079, + "grad_norm": 55.42048200524296, + "learning_rate": 9.344884818333408e-06, + "loss": 4.916, + "step": 8741 + }, + { + "epoch": 0.7450779851700332, + "grad_norm": 44.01403332855395, + "learning_rate": 9.344639425412603e-06, + "loss": 3.3679, + "step": 8742 + }, + { + "epoch": 0.7451632148640587, + "grad_norm": 71.56827323695461, + "learning_rate": 9.344393989763993e-06, + "loss": 3.4437, + "step": 8743 + }, + { + "epoch": 0.745248444558084, + "grad_norm": 34.21428887075692, + "learning_rate": 9.344148511389998e-06, + "loss": 3.7903, + "step": 8744 + }, + { + "epoch": 0.7453336742521094, + "grad_norm": 39.653029704718975, + "learning_rate": 9.343902990293026e-06, + "loss": 3.7674, + "step": 8745 + }, + { + "epoch": 0.7454189039461349, + "grad_norm": 43.2972567951316, + "learning_rate": 9.343657426475496e-06, + "loss": 3.8421, + "step": 8746 + }, + { + "epoch": 0.7455041336401602, + "grad_norm": 78.81088580424543, + "learning_rate": 9.34341181993982e-06, + "loss": 3.6631, + "step": 8747 + }, + { + "epoch": 0.7455893633341857, + "grad_norm": 32.768402695197146, + "learning_rate": 9.343166170688416e-06, + "loss": 3.4623, + "step": 8748 + }, + { + "epoch": 0.745674593028211, + "grad_norm": 88.8947770645014, + "learning_rate": 9.342920478723696e-06, + "loss": 5.2092, + "step": 8749 + }, + { + "epoch": 0.7457598227222364, + "grad_norm": 71.9090992970903, + "learning_rate": 9.342674744048081e-06, + "loss": 3.831, + "step": 8750 + }, + { + "epoch": 0.7458450524162619, + "grad_norm": 33.65320542879425, + "learning_rate": 9.342428966663985e-06, + "loss": 3.3411, + "step": 8751 + }, + { + "epoch": 0.7459302821102872, + "grad_norm": 21.113873834520863, + "learning_rate": 9.342183146573826e-06, + "loss": 1.9583, + "step": 8752 + }, + { + "epoch": 0.7460155118043126, + "grad_norm": 67.15230320869526, + "learning_rate": 9.341937283780021e-06, + "loss": 3.7518, + "step": 8753 + }, + { + "epoch": 0.746100741498338, + "grad_norm": 36.25694197262829, + "learning_rate": 9.341691378284986e-06, + "loss": 3.6761, + "step": 8754 + }, + { + "epoch": 0.7461859711923634, + "grad_norm": 81.86614799001003, + "learning_rate": 9.341445430091143e-06, + "loss": 3.5957, + "step": 8755 + }, + { + "epoch": 0.7462712008863888, + "grad_norm": 29.979523185196864, + "learning_rate": 9.34119943920091e-06, + "loss": 3.5629, + "step": 8756 + }, + { + "epoch": 0.7463564305804142, + "grad_norm": 82.35636367127046, + "learning_rate": 9.340953405616706e-06, + "loss": 5.1913, + "step": 8757 + }, + { + "epoch": 0.7464416602744396, + "grad_norm": 22.09674716519457, + "learning_rate": 9.34070732934095e-06, + "loss": 3.4598, + "step": 8758 + }, + { + "epoch": 0.7465268899684651, + "grad_norm": 38.00533269194648, + "learning_rate": 9.34046121037606e-06, + "loss": 3.9755, + "step": 8759 + }, + { + "epoch": 0.7466121196624904, + "grad_norm": 32.92710559696053, + "learning_rate": 9.34021504872446e-06, + "loss": 3.1968, + "step": 8760 + }, + { + "epoch": 0.7466973493565158, + "grad_norm": 60.929394586535864, + "learning_rate": 9.339968844388571e-06, + "loss": 3.361, + "step": 8761 + }, + { + "epoch": 0.7467825790505412, + "grad_norm": 37.096979973634696, + "learning_rate": 9.339722597370813e-06, + "loss": 3.9689, + "step": 8762 + }, + { + "epoch": 0.7468678087445666, + "grad_norm": 35.31305542953916, + "learning_rate": 9.339476307673605e-06, + "loss": 3.1238, + "step": 8763 + }, + { + "epoch": 0.746953038438592, + "grad_norm": 48.218384348143246, + "learning_rate": 9.339229975299375e-06, + "loss": 5.2343, + "step": 8764 + }, + { + "epoch": 0.7470382681326174, + "grad_norm": 29.025724486680886, + "learning_rate": 9.33898360025054e-06, + "loss": 2.821, + "step": 8765 + }, + { + "epoch": 0.7471234978266428, + "grad_norm": 29.097914018541168, + "learning_rate": 9.338737182529527e-06, + "loss": 2.6681, + "step": 8766 + }, + { + "epoch": 0.7472087275206682, + "grad_norm": 34.68756783735072, + "learning_rate": 9.338490722138755e-06, + "loss": 3.2409, + "step": 8767 + }, + { + "epoch": 0.7472939572146936, + "grad_norm": 46.862210862731835, + "learning_rate": 9.338244219080654e-06, + "loss": 4.7176, + "step": 8768 + }, + { + "epoch": 0.747379186908719, + "grad_norm": 112.71079483375928, + "learning_rate": 9.337997673357643e-06, + "loss": 3.9635, + "step": 8769 + }, + { + "epoch": 0.7474644166027444, + "grad_norm": 36.554854796122434, + "learning_rate": 9.337751084972151e-06, + "loss": 3.621, + "step": 8770 + }, + { + "epoch": 0.7475496462967698, + "grad_norm": 33.217125319295675, + "learning_rate": 9.337504453926597e-06, + "loss": 3.9869, + "step": 8771 + }, + { + "epoch": 0.7476348759907951, + "grad_norm": 106.09237602390269, + "learning_rate": 9.337257780223414e-06, + "loss": 5.6208, + "step": 8772 + }, + { + "epoch": 0.7477201056848206, + "grad_norm": 86.86570784403179, + "learning_rate": 9.337011063865023e-06, + "loss": 4.8485, + "step": 8773 + }, + { + "epoch": 0.747805335378846, + "grad_norm": 36.312479063759405, + "learning_rate": 9.336764304853851e-06, + "loss": 3.1823, + "step": 8774 + }, + { + "epoch": 0.7478905650728714, + "grad_norm": 83.8592509225843, + "learning_rate": 9.336517503192326e-06, + "loss": 4.1996, + "step": 8775 + }, + { + "epoch": 0.7479757947668968, + "grad_norm": 71.90192697318626, + "learning_rate": 9.336270658882873e-06, + "loss": 5.7712, + "step": 8776 + }, + { + "epoch": 0.7480610244609222, + "grad_norm": 37.726514725397855, + "learning_rate": 9.336023771927923e-06, + "loss": 3.3645, + "step": 8777 + }, + { + "epoch": 0.7481462541549476, + "grad_norm": 81.3993026853516, + "learning_rate": 9.3357768423299e-06, + "loss": 5.4232, + "step": 8778 + }, + { + "epoch": 0.748231483848973, + "grad_norm": 57.166463372196255, + "learning_rate": 9.335529870091237e-06, + "loss": 6.1912, + "step": 8779 + }, + { + "epoch": 0.7483167135429983, + "grad_norm": 58.92255875803314, + "learning_rate": 9.33528285521436e-06, + "loss": 4.6619, + "step": 8780 + }, + { + "epoch": 0.7484019432370238, + "grad_norm": 56.042189849095095, + "learning_rate": 9.335035797701696e-06, + "loss": 3.7075, + "step": 8781 + }, + { + "epoch": 0.7484871729310492, + "grad_norm": 55.302391901779, + "learning_rate": 9.33478869755568e-06, + "loss": 4.0584, + "step": 8782 + }, + { + "epoch": 0.7485724026250746, + "grad_norm": 38.64122944370655, + "learning_rate": 9.334541554778739e-06, + "loss": 3.0443, + "step": 8783 + }, + { + "epoch": 0.7486576323191, + "grad_norm": 34.19940383992949, + "learning_rate": 9.334294369373306e-06, + "loss": 3.4586, + "step": 8784 + }, + { + "epoch": 0.7487428620131253, + "grad_norm": 68.61516870830891, + "learning_rate": 9.334047141341809e-06, + "loss": 5.4723, + "step": 8785 + }, + { + "epoch": 0.7488280917071508, + "grad_norm": 71.73659109919049, + "learning_rate": 9.333799870686679e-06, + "loss": 4.4347, + "step": 8786 + }, + { + "epoch": 0.7489133214011762, + "grad_norm": 33.54085631988023, + "learning_rate": 9.333552557410352e-06, + "loss": 3.4039, + "step": 8787 + }, + { + "epoch": 0.7489985510952015, + "grad_norm": 45.70611504317977, + "learning_rate": 9.333305201515255e-06, + "loss": 4.9746, + "step": 8788 + }, + { + "epoch": 0.749083780789227, + "grad_norm": 54.658431963288116, + "learning_rate": 9.333057803003825e-06, + "loss": 4.9938, + "step": 8789 + }, + { + "epoch": 0.7491690104832524, + "grad_norm": 58.11638621297097, + "learning_rate": 9.332810361878493e-06, + "loss": 5.2839, + "step": 8790 + }, + { + "epoch": 0.7492542401772777, + "grad_norm": 64.09557064290851, + "learning_rate": 9.332562878141693e-06, + "loss": 4.5672, + "step": 8791 + }, + { + "epoch": 0.7493394698713032, + "grad_norm": 56.403315224813426, + "learning_rate": 9.332315351795858e-06, + "loss": 4.3109, + "step": 8792 + }, + { + "epoch": 0.7494246995653285, + "grad_norm": 46.42751919885947, + "learning_rate": 9.332067782843423e-06, + "loss": 3.9819, + "step": 8793 + }, + { + "epoch": 0.749509929259354, + "grad_norm": 35.79594967982176, + "learning_rate": 9.331820171286822e-06, + "loss": 3.3571, + "step": 8794 + }, + { + "epoch": 0.7495951589533794, + "grad_norm": 61.099375705387644, + "learning_rate": 9.331572517128492e-06, + "loss": 4.7075, + "step": 8795 + }, + { + "epoch": 0.7496803886474047, + "grad_norm": 1570.3516162862759, + "learning_rate": 9.331324820370868e-06, + "loss": 4.8846, + "step": 8796 + }, + { + "epoch": 0.7497656183414302, + "grad_norm": 57.007883559823355, + "learning_rate": 9.331077081016384e-06, + "loss": 4.2196, + "step": 8797 + }, + { + "epoch": 0.7498508480354555, + "grad_norm": 43.23746804685828, + "learning_rate": 9.330829299067479e-06, + "loss": 3.235, + "step": 8798 + }, + { + "epoch": 0.7499360777294809, + "grad_norm": 59.02616394823699, + "learning_rate": 9.330581474526588e-06, + "loss": 3.258, + "step": 8799 + }, + { + "epoch": 0.7500213074235064, + "grad_norm": 41.172639308623204, + "learning_rate": 9.330333607396149e-06, + "loss": 2.8305, + "step": 8800 + }, + { + "epoch": 0.7501065371175317, + "grad_norm": 37.065146561940296, + "learning_rate": 9.330085697678602e-06, + "loss": 2.6022, + "step": 8801 + }, + { + "epoch": 0.7501917668115572, + "grad_norm": 45.19973762307627, + "learning_rate": 9.32983774537638e-06, + "loss": 3.9694, + "step": 8802 + }, + { + "epoch": 0.7502769965055825, + "grad_norm": 70.2524329615925, + "learning_rate": 9.329589750491926e-06, + "loss": 5.068, + "step": 8803 + }, + { + "epoch": 0.7503622261996079, + "grad_norm": 51.51083870712368, + "learning_rate": 9.329341713027676e-06, + "loss": 4.3632, + "step": 8804 + }, + { + "epoch": 0.7504474558936334, + "grad_norm": 35.744624345340334, + "learning_rate": 9.32909363298607e-06, + "loss": 4.7454, + "step": 8805 + }, + { + "epoch": 0.7505326855876587, + "grad_norm": 39.21508892826853, + "learning_rate": 9.32884551036955e-06, + "loss": 3.2191, + "step": 8806 + }, + { + "epoch": 0.7506179152816841, + "grad_norm": 68.3227884816053, + "learning_rate": 9.328597345180555e-06, + "loss": 4.7751, + "step": 8807 + }, + { + "epoch": 0.7507031449757096, + "grad_norm": 43.37850806471166, + "learning_rate": 9.328349137421523e-06, + "loss": 4.0932, + "step": 8808 + }, + { + "epoch": 0.7507883746697349, + "grad_norm": 91.09299073833837, + "learning_rate": 9.328100887094898e-06, + "loss": 4.9593, + "step": 8809 + }, + { + "epoch": 0.7508736043637604, + "grad_norm": 53.863573577601024, + "learning_rate": 9.327852594203121e-06, + "loss": 4.5904, + "step": 8810 + }, + { + "epoch": 0.7509588340577857, + "grad_norm": 36.0768237839682, + "learning_rate": 9.327604258748633e-06, + "loss": 3.8545, + "step": 8811 + }, + { + "epoch": 0.7510440637518111, + "grad_norm": 50.56589872886345, + "learning_rate": 9.327355880733878e-06, + "loss": 4.3109, + "step": 8812 + }, + { + "epoch": 0.7511292934458366, + "grad_norm": 32.27281279728043, + "learning_rate": 9.327107460161296e-06, + "loss": 2.5334, + "step": 8813 + }, + { + "epoch": 0.7512145231398619, + "grad_norm": 66.26867200898575, + "learning_rate": 9.32685899703333e-06, + "loss": 4.6478, + "step": 8814 + }, + { + "epoch": 0.7512997528338873, + "grad_norm": 39.11091561184434, + "learning_rate": 9.326610491352427e-06, + "loss": 3.858, + "step": 8815 + }, + { + "epoch": 0.7513849825279127, + "grad_norm": 34.455989880108056, + "learning_rate": 9.32636194312103e-06, + "loss": 2.3588, + "step": 8816 + }, + { + "epoch": 0.7514702122219381, + "grad_norm": 52.514473043248564, + "learning_rate": 9.326113352341582e-06, + "loss": 5.2087, + "step": 8817 + }, + { + "epoch": 0.7515554419159636, + "grad_norm": 66.08729169286647, + "learning_rate": 9.325864719016526e-06, + "loss": 4.9413, + "step": 8818 + }, + { + "epoch": 0.7516406716099889, + "grad_norm": 45.32663021952343, + "learning_rate": 9.325616043148312e-06, + "loss": 3.2747, + "step": 8819 + }, + { + "epoch": 0.7517259013040143, + "grad_norm": 98.83528384201976, + "learning_rate": 9.325367324739383e-06, + "loss": 4.4241, + "step": 8820 + }, + { + "epoch": 0.7518111309980398, + "grad_norm": 24.703500987004688, + "learning_rate": 9.325118563792183e-06, + "loss": 2.0329, + "step": 8821 + }, + { + "epoch": 0.7518963606920651, + "grad_norm": 66.11881657422667, + "learning_rate": 9.324869760309162e-06, + "loss": 5.1905, + "step": 8822 + }, + { + "epoch": 0.7519815903860905, + "grad_norm": 33.92345445142524, + "learning_rate": 9.324620914292766e-06, + "loss": 3.399, + "step": 8823 + }, + { + "epoch": 0.7520668200801159, + "grad_norm": 68.15867963738951, + "learning_rate": 9.324372025745441e-06, + "loss": 4.8691, + "step": 8824 + }, + { + "epoch": 0.7521520497741413, + "grad_norm": 45.36615633146491, + "learning_rate": 9.324123094669634e-06, + "loss": 3.9475, + "step": 8825 + }, + { + "epoch": 0.7522372794681668, + "grad_norm": 50.131493246234704, + "learning_rate": 9.323874121067798e-06, + "loss": 4.1912, + "step": 8826 + }, + { + "epoch": 0.7523225091621921, + "grad_norm": 110.53830800286777, + "learning_rate": 9.323625104942375e-06, + "loss": 4.7044, + "step": 8827 + }, + { + "epoch": 0.7524077388562175, + "grad_norm": 39.095967035361426, + "learning_rate": 9.323376046295818e-06, + "loss": 3.7222, + "step": 8828 + }, + { + "epoch": 0.752492968550243, + "grad_norm": 38.685619560063635, + "learning_rate": 9.323126945130576e-06, + "loss": 4.3429, + "step": 8829 + }, + { + "epoch": 0.7525781982442683, + "grad_norm": 39.94560659488613, + "learning_rate": 9.322877801449099e-06, + "loss": 3.4263, + "step": 8830 + }, + { + "epoch": 0.7526634279382937, + "grad_norm": 141.1028691909985, + "learning_rate": 9.322628615253834e-06, + "loss": 6.2027, + "step": 8831 + }, + { + "epoch": 0.7527486576323191, + "grad_norm": 47.438454836871706, + "learning_rate": 9.322379386547236e-06, + "loss": 4.4231, + "step": 8832 + }, + { + "epoch": 0.7528338873263445, + "grad_norm": 40.476577849060696, + "learning_rate": 9.322130115331756e-06, + "loss": 3.6164, + "step": 8833 + }, + { + "epoch": 0.7529191170203698, + "grad_norm": 39.91292590851267, + "learning_rate": 9.321880801609841e-06, + "loss": 4.7949, + "step": 8834 + }, + { + "epoch": 0.7530043467143953, + "grad_norm": 52.467583096976824, + "learning_rate": 9.321631445383948e-06, + "loss": 4.1393, + "step": 8835 + }, + { + "epoch": 0.7530895764084207, + "grad_norm": 30.33520894152308, + "learning_rate": 9.321382046656524e-06, + "loss": 3.3562, + "step": 8836 + }, + { + "epoch": 0.7531748061024461, + "grad_norm": 37.98335799243818, + "learning_rate": 9.321132605430027e-06, + "loss": 3.5022, + "step": 8837 + }, + { + "epoch": 0.7532600357964715, + "grad_norm": 37.31886848844817, + "learning_rate": 9.320883121706907e-06, + "loss": 3.8453, + "step": 8838 + }, + { + "epoch": 0.7533452654904969, + "grad_norm": 36.91789889298272, + "learning_rate": 9.320633595489619e-06, + "loss": 3.1195, + "step": 8839 + }, + { + "epoch": 0.7534304951845223, + "grad_norm": 93.89453455154735, + "learning_rate": 9.320384026780616e-06, + "loss": 3.8689, + "step": 8840 + }, + { + "epoch": 0.7535157248785477, + "grad_norm": 70.10506838052956, + "learning_rate": 9.320134415582352e-06, + "loss": 4.8204, + "step": 8841 + }, + { + "epoch": 0.753600954572573, + "grad_norm": 78.32305984117268, + "learning_rate": 9.319884761897284e-06, + "loss": 3.6414, + "step": 8842 + }, + { + "epoch": 0.7536861842665985, + "grad_norm": 34.46079398602058, + "learning_rate": 9.319635065727865e-06, + "loss": 2.7649, + "step": 8843 + }, + { + "epoch": 0.7537714139606239, + "grad_norm": 61.30766828110029, + "learning_rate": 9.319385327076553e-06, + "loss": 4.625, + "step": 8844 + }, + { + "epoch": 0.7538566436546493, + "grad_norm": 28.846028297090125, + "learning_rate": 9.319135545945802e-06, + "loss": 3.3926, + "step": 8845 + }, + { + "epoch": 0.7539418733486747, + "grad_norm": 54.67539075309497, + "learning_rate": 9.31888572233807e-06, + "loss": 4.4987, + "step": 8846 + }, + { + "epoch": 0.7540271030427, + "grad_norm": 36.71447622837422, + "learning_rate": 9.318635856255811e-06, + "loss": 3.6796, + "step": 8847 + }, + { + "epoch": 0.7541123327367255, + "grad_norm": 46.572061569147166, + "learning_rate": 9.318385947701484e-06, + "loss": 4.4131, + "step": 8848 + }, + { + "epoch": 0.7541975624307509, + "grad_norm": 52.62703696685985, + "learning_rate": 9.31813599667755e-06, + "loss": 3.8508, + "step": 8849 + }, + { + "epoch": 0.7542827921247762, + "grad_norm": 60.07166785757904, + "learning_rate": 9.317886003186463e-06, + "loss": 4.2543, + "step": 8850 + }, + { + "epoch": 0.7543680218188017, + "grad_norm": 104.95593842417398, + "learning_rate": 9.317635967230683e-06, + "loss": 4.6694, + "step": 8851 + }, + { + "epoch": 0.754453251512827, + "grad_norm": 61.53794374074041, + "learning_rate": 9.317385888812668e-06, + "loss": 3.7493, + "step": 8852 + }, + { + "epoch": 0.7545384812068525, + "grad_norm": 32.6640862324898, + "learning_rate": 9.31713576793488e-06, + "loss": 3.3389, + "step": 8853 + }, + { + "epoch": 0.7546237109008779, + "grad_norm": 142.5449670921446, + "learning_rate": 9.316885604599778e-06, + "loss": 5.4328, + "step": 8854 + }, + { + "epoch": 0.7547089405949032, + "grad_norm": 48.769504548431776, + "learning_rate": 9.316635398809817e-06, + "loss": 3.5777, + "step": 8855 + }, + { + "epoch": 0.7547941702889287, + "grad_norm": 73.15977198039852, + "learning_rate": 9.316385150567465e-06, + "loss": 4.0788, + "step": 8856 + }, + { + "epoch": 0.7548793999829541, + "grad_norm": 52.2333688831959, + "learning_rate": 9.316134859875182e-06, + "loss": 3.8662, + "step": 8857 + }, + { + "epoch": 0.7549646296769794, + "grad_norm": 36.36426604501878, + "learning_rate": 9.315884526735425e-06, + "loss": 4.8474, + "step": 8858 + }, + { + "epoch": 0.7550498593710049, + "grad_norm": 36.058271197169624, + "learning_rate": 9.315634151150658e-06, + "loss": 4.1503, + "step": 8859 + }, + { + "epoch": 0.7551350890650302, + "grad_norm": 46.19128437839205, + "learning_rate": 9.315383733123346e-06, + "loss": 4.4747, + "step": 8860 + }, + { + "epoch": 0.7552203187590557, + "grad_norm": 82.99318998334162, + "learning_rate": 9.31513327265595e-06, + "loss": 4.9813, + "step": 8861 + }, + { + "epoch": 0.7553055484530811, + "grad_norm": 121.14201116408219, + "learning_rate": 9.314882769750932e-06, + "loss": 6.4794, + "step": 8862 + }, + { + "epoch": 0.7553907781471064, + "grad_norm": 59.29390495014795, + "learning_rate": 9.314632224410757e-06, + "loss": 4.0565, + "step": 8863 + }, + { + "epoch": 0.7554760078411319, + "grad_norm": 32.52075998014036, + "learning_rate": 9.314381636637888e-06, + "loss": 4.3278, + "step": 8864 + }, + { + "epoch": 0.7555612375351572, + "grad_norm": 53.18442337193749, + "learning_rate": 9.31413100643479e-06, + "loss": 5.1056, + "step": 8865 + }, + { + "epoch": 0.7556464672291826, + "grad_norm": 49.74412101344931, + "learning_rate": 9.313880333803929e-06, + "loss": 3.8803, + "step": 8866 + }, + { + "epoch": 0.7557316969232081, + "grad_norm": 39.151254694001594, + "learning_rate": 9.313629618747767e-06, + "loss": 4.0952, + "step": 8867 + }, + { + "epoch": 0.7558169266172334, + "grad_norm": 168.21719692186926, + "learning_rate": 9.313378861268774e-06, + "loss": 4.5182, + "step": 8868 + }, + { + "epoch": 0.7559021563112588, + "grad_norm": 41.937641798741645, + "learning_rate": 9.313128061369411e-06, + "loss": 2.8245, + "step": 8869 + }, + { + "epoch": 0.7559873860052843, + "grad_norm": 49.06888065909885, + "learning_rate": 9.31287721905215e-06, + "loss": 3.8161, + "step": 8870 + }, + { + "epoch": 0.7560726156993096, + "grad_norm": 49.44648484704931, + "learning_rate": 9.312626334319455e-06, + "loss": 4.7248, + "step": 8871 + }, + { + "epoch": 0.7561578453933351, + "grad_norm": 31.560017852549787, + "learning_rate": 9.312375407173792e-06, + "loss": 2.0917, + "step": 8872 + }, + { + "epoch": 0.7562430750873604, + "grad_norm": 46.52860073624489, + "learning_rate": 9.312124437617632e-06, + "loss": 4.0543, + "step": 8873 + }, + { + "epoch": 0.7563283047813858, + "grad_norm": 55.30907454714142, + "learning_rate": 9.311873425653442e-06, + "loss": 3.448, + "step": 8874 + }, + { + "epoch": 0.7564135344754113, + "grad_norm": 49.503800776730664, + "learning_rate": 9.311622371283688e-06, + "loss": 3.5533, + "step": 8875 + }, + { + "epoch": 0.7564987641694366, + "grad_norm": 76.0725870995064, + "learning_rate": 9.311371274510843e-06, + "loss": 3.6729, + "step": 8876 + }, + { + "epoch": 0.756583993863462, + "grad_norm": 38.84692164288412, + "learning_rate": 9.311120135337374e-06, + "loss": 2.6096, + "step": 8877 + }, + { + "epoch": 0.7566692235574874, + "grad_norm": 34.661497881279246, + "learning_rate": 9.310868953765753e-06, + "loss": 2.8469, + "step": 8878 + }, + { + "epoch": 0.7567544532515128, + "grad_norm": 71.80564562481082, + "learning_rate": 9.310617729798446e-06, + "loss": 4.9569, + "step": 8879 + }, + { + "epoch": 0.7568396829455383, + "grad_norm": 84.77473977286444, + "learning_rate": 9.31036646343793e-06, + "loss": 4.5121, + "step": 8880 + }, + { + "epoch": 0.7569249126395636, + "grad_norm": 27.39346059626975, + "learning_rate": 9.31011515468667e-06, + "loss": 3.2557, + "step": 8881 + }, + { + "epoch": 0.757010142333589, + "grad_norm": 49.276899164068425, + "learning_rate": 9.30986380354714e-06, + "loss": 3.437, + "step": 8882 + }, + { + "epoch": 0.7570953720276145, + "grad_norm": 43.7086730402091, + "learning_rate": 9.309612410021813e-06, + "loss": 3.9829, + "step": 8883 + }, + { + "epoch": 0.7571806017216398, + "grad_norm": 53.9569516018267, + "learning_rate": 9.309360974113161e-06, + "loss": 3.1591, + "step": 8884 + }, + { + "epoch": 0.7572658314156652, + "grad_norm": 157.21537609986416, + "learning_rate": 9.309109495823655e-06, + "loss": 5.4334, + "step": 8885 + }, + { + "epoch": 0.7573510611096906, + "grad_norm": 37.4610641109968, + "learning_rate": 9.30885797515577e-06, + "loss": 2.3335, + "step": 8886 + }, + { + "epoch": 0.757436290803716, + "grad_norm": 80.2331502526473, + "learning_rate": 9.308606412111979e-06, + "loss": 5.1173, + "step": 8887 + }, + { + "epoch": 0.7575215204977415, + "grad_norm": 36.83413526712184, + "learning_rate": 9.308354806694756e-06, + "loss": 3.571, + "step": 8888 + }, + { + "epoch": 0.7576067501917668, + "grad_norm": 49.383365451558426, + "learning_rate": 9.308103158906576e-06, + "loss": 4.5135, + "step": 8889 + }, + { + "epoch": 0.7576919798857922, + "grad_norm": 111.39314879837245, + "learning_rate": 9.30785146874991e-06, + "loss": 5.0248, + "step": 8890 + }, + { + "epoch": 0.7577772095798176, + "grad_norm": 61.98430008089398, + "learning_rate": 9.307599736227242e-06, + "loss": 5.3735, + "step": 8891 + }, + { + "epoch": 0.757862439273843, + "grad_norm": 45.35983238428603, + "learning_rate": 9.307347961341038e-06, + "loss": 4.0559, + "step": 8892 + }, + { + "epoch": 0.7579476689678684, + "grad_norm": 76.7474609142508, + "learning_rate": 9.30709614409378e-06, + "loss": 5.2028, + "step": 8893 + }, + { + "epoch": 0.7580328986618938, + "grad_norm": 44.607649323346095, + "learning_rate": 9.306844284487944e-06, + "loss": 4.0462, + "step": 8894 + }, + { + "epoch": 0.7581181283559192, + "grad_norm": 44.48204136360979, + "learning_rate": 9.306592382526005e-06, + "loss": 3.4748, + "step": 8895 + }, + { + "epoch": 0.7582033580499447, + "grad_norm": 82.63879921992945, + "learning_rate": 9.306340438210439e-06, + "loss": 4.3356, + "step": 8896 + }, + { + "epoch": 0.75828858774397, + "grad_norm": 41.49177577753257, + "learning_rate": 9.30608845154373e-06, + "loss": 4.4007, + "step": 8897 + }, + { + "epoch": 0.7583738174379954, + "grad_norm": 55.90122040323806, + "learning_rate": 9.305836422528349e-06, + "loss": 3.1882, + "step": 8898 + }, + { + "epoch": 0.7584590471320208, + "grad_norm": 39.1784448709948, + "learning_rate": 9.305584351166779e-06, + "loss": 2.7093, + "step": 8899 + }, + { + "epoch": 0.7585442768260462, + "grad_norm": 54.169841406404274, + "learning_rate": 9.305332237461497e-06, + "loss": 4.5339, + "step": 8900 + }, + { + "epoch": 0.7586295065200716, + "grad_norm": 55.03193680828255, + "learning_rate": 9.305080081414983e-06, + "loss": 4.2236, + "step": 8901 + }, + { + "epoch": 0.758714736214097, + "grad_norm": 42.87300013003739, + "learning_rate": 9.30482788302972e-06, + "loss": 4.6713, + "step": 8902 + }, + { + "epoch": 0.7587999659081224, + "grad_norm": 81.34827800041067, + "learning_rate": 9.304575642308181e-06, + "loss": 5.1024, + "step": 8903 + }, + { + "epoch": 0.7588851956021477, + "grad_norm": 35.70043790741008, + "learning_rate": 9.304323359252854e-06, + "loss": 4.1178, + "step": 8904 + }, + { + "epoch": 0.7589704252961732, + "grad_norm": 48.86327875257204, + "learning_rate": 9.304071033866216e-06, + "loss": 4.9164, + "step": 8905 + }, + { + "epoch": 0.7590556549901986, + "grad_norm": 80.2622933981368, + "learning_rate": 9.30381866615075e-06, + "loss": 3.2166, + "step": 8906 + }, + { + "epoch": 0.759140884684224, + "grad_norm": 48.39187149329811, + "learning_rate": 9.303566256108937e-06, + "loss": 3.9796, + "step": 8907 + }, + { + "epoch": 0.7592261143782494, + "grad_norm": 175.91646564701824, + "learning_rate": 9.30331380374326e-06, + "loss": 3.2842, + "step": 8908 + }, + { + "epoch": 0.7593113440722747, + "grad_norm": 37.053203298843464, + "learning_rate": 9.303061309056202e-06, + "loss": 4.38, + "step": 8909 + }, + { + "epoch": 0.7593965737663002, + "grad_norm": 42.93710095851039, + "learning_rate": 9.302808772050247e-06, + "loss": 3.9758, + "step": 8910 + }, + { + "epoch": 0.7594818034603256, + "grad_norm": 34.50521438535424, + "learning_rate": 9.302556192727876e-06, + "loss": 4.1764, + "step": 8911 + }, + { + "epoch": 0.7595670331543509, + "grad_norm": 67.85629944556652, + "learning_rate": 9.302303571091575e-06, + "loss": 4.5449, + "step": 8912 + }, + { + "epoch": 0.7596522628483764, + "grad_norm": 26.64110649470672, + "learning_rate": 9.30205090714383e-06, + "loss": 2.9287, + "step": 8913 + }, + { + "epoch": 0.7597374925424017, + "grad_norm": 62.734510382599815, + "learning_rate": 9.30179820088712e-06, + "loss": 2.8574, + "step": 8914 + }, + { + "epoch": 0.7598227222364272, + "grad_norm": 54.791812424625114, + "learning_rate": 9.301545452323938e-06, + "loss": 3.6582, + "step": 8915 + }, + { + "epoch": 0.7599079519304526, + "grad_norm": 41.03752199721257, + "learning_rate": 9.301292661456761e-06, + "loss": 2.9372, + "step": 8916 + }, + { + "epoch": 0.7599931816244779, + "grad_norm": 52.21635641574842, + "learning_rate": 9.301039828288084e-06, + "loss": 4.4918, + "step": 8917 + }, + { + "epoch": 0.7600784113185034, + "grad_norm": 36.2101864871482, + "learning_rate": 9.300786952820387e-06, + "loss": 4.2591, + "step": 8918 + }, + { + "epoch": 0.7601636410125288, + "grad_norm": 93.2291985425405, + "learning_rate": 9.30053403505616e-06, + "loss": 4.6564, + "step": 8919 + }, + { + "epoch": 0.7602488707065541, + "grad_norm": 50.605148317196864, + "learning_rate": 9.30028107499789e-06, + "loss": 4.0117, + "step": 8920 + }, + { + "epoch": 0.7603341004005796, + "grad_norm": 43.68182495582931, + "learning_rate": 9.300028072648063e-06, + "loss": 3.7115, + "step": 8921 + }, + { + "epoch": 0.7604193300946049, + "grad_norm": 95.44772512377521, + "learning_rate": 9.29977502800917e-06, + "loss": 5.1352, + "step": 8922 + }, + { + "epoch": 0.7605045597886304, + "grad_norm": 29.61482143529959, + "learning_rate": 9.299521941083698e-06, + "loss": 2.9619, + "step": 8923 + }, + { + "epoch": 0.7605897894826558, + "grad_norm": 45.82781334363319, + "learning_rate": 9.299268811874136e-06, + "loss": 4.2565, + "step": 8924 + }, + { + "epoch": 0.7606750191766811, + "grad_norm": 57.381459385904186, + "learning_rate": 9.299015640382973e-06, + "loss": 2.9085, + "step": 8925 + }, + { + "epoch": 0.7607602488707066, + "grad_norm": 39.90949169653859, + "learning_rate": 9.2987624266127e-06, + "loss": 2.7889, + "step": 8926 + }, + { + "epoch": 0.760845478564732, + "grad_norm": 41.47818909983261, + "learning_rate": 9.298509170565809e-06, + "loss": 3.6162, + "step": 8927 + }, + { + "epoch": 0.7609307082587573, + "grad_norm": 33.20676444080916, + "learning_rate": 9.298255872244785e-06, + "loss": 3.787, + "step": 8928 + }, + { + "epoch": 0.7610159379527828, + "grad_norm": 48.09959538621222, + "learning_rate": 9.298002531652122e-06, + "loss": 4.2931, + "step": 8929 + }, + { + "epoch": 0.7611011676468081, + "grad_norm": 49.62263578901242, + "learning_rate": 9.297749148790314e-06, + "loss": 3.8963, + "step": 8930 + }, + { + "epoch": 0.7611863973408336, + "grad_norm": 37.5816160781211, + "learning_rate": 9.297495723661851e-06, + "loss": 4.7949, + "step": 8931 + }, + { + "epoch": 0.761271627034859, + "grad_norm": 62.488342841645014, + "learning_rate": 9.297242256269225e-06, + "loss": 4.3682, + "step": 8932 + }, + { + "epoch": 0.7613568567288843, + "grad_norm": 32.90514991992983, + "learning_rate": 9.296988746614928e-06, + "loss": 3.9056, + "step": 8933 + }, + { + "epoch": 0.7614420864229098, + "grad_norm": 43.39068244631889, + "learning_rate": 9.296735194701456e-06, + "loss": 3.9425, + "step": 8934 + }, + { + "epoch": 0.7615273161169351, + "grad_norm": 64.5669182876647, + "learning_rate": 9.296481600531297e-06, + "loss": 5.0429, + "step": 8935 + }, + { + "epoch": 0.7616125458109605, + "grad_norm": 29.765153231793363, + "learning_rate": 9.296227964106953e-06, + "loss": 3.7862, + "step": 8936 + }, + { + "epoch": 0.761697775504986, + "grad_norm": 64.4753310838772, + "learning_rate": 9.29597428543091e-06, + "loss": 4.788, + "step": 8937 + }, + { + "epoch": 0.7617830051990113, + "grad_norm": 39.3192745015446, + "learning_rate": 9.29572056450567e-06, + "loss": 4.6919, + "step": 8938 + }, + { + "epoch": 0.7618682348930368, + "grad_norm": 35.75453119164952, + "learning_rate": 9.295466801333725e-06, + "loss": 3.5245, + "step": 8939 + }, + { + "epoch": 0.7619534645870621, + "grad_norm": 32.62777214032239, + "learning_rate": 9.295212995917572e-06, + "loss": 4.7253, + "step": 8940 + }, + { + "epoch": 0.7620386942810875, + "grad_norm": 31.869830390113393, + "learning_rate": 9.294959148259703e-06, + "loss": 3.3419, + "step": 8941 + }, + { + "epoch": 0.762123923975113, + "grad_norm": 113.29125958199478, + "learning_rate": 9.294705258362618e-06, + "loss": 4.1264, + "step": 8942 + }, + { + "epoch": 0.7622091536691383, + "grad_norm": 41.797865220208074, + "learning_rate": 9.294451326228815e-06, + "loss": 3.7491, + "step": 8943 + }, + { + "epoch": 0.7622943833631637, + "grad_norm": 52.06252154167989, + "learning_rate": 9.294197351860788e-06, + "loss": 4.2846, + "step": 8944 + }, + { + "epoch": 0.7623796130571892, + "grad_norm": 47.01446229991357, + "learning_rate": 9.293943335261038e-06, + "loss": 2.6546, + "step": 8945 + }, + { + "epoch": 0.7624648427512145, + "grad_norm": 53.5071870429445, + "learning_rate": 9.293689276432059e-06, + "loss": 3.7037, + "step": 8946 + }, + { + "epoch": 0.7625500724452399, + "grad_norm": 31.18605307343776, + "learning_rate": 9.293435175376355e-06, + "loss": 3.2304, + "step": 8947 + }, + { + "epoch": 0.7626353021392653, + "grad_norm": 35.922061991267284, + "learning_rate": 9.29318103209642e-06, + "loss": 3.1713, + "step": 8948 + }, + { + "epoch": 0.7627205318332907, + "grad_norm": 86.29943706990487, + "learning_rate": 9.292926846594757e-06, + "loss": 2.9646, + "step": 8949 + }, + { + "epoch": 0.7628057615273162, + "grad_norm": 92.18005613642896, + "learning_rate": 9.292672618873862e-06, + "loss": 4.565, + "step": 8950 + }, + { + "epoch": 0.7628909912213415, + "grad_norm": 49.236640636443404, + "learning_rate": 9.29241834893624e-06, + "loss": 4.2626, + "step": 8951 + }, + { + "epoch": 0.7629762209153669, + "grad_norm": 44.54201993853322, + "learning_rate": 9.292164036784388e-06, + "loss": 4.549, + "step": 8952 + }, + { + "epoch": 0.7630614506093923, + "grad_norm": 31.713401708176942, + "learning_rate": 9.29190968242081e-06, + "loss": 3.7257, + "step": 8953 + }, + { + "epoch": 0.7631466803034177, + "grad_norm": 32.084215642387015, + "learning_rate": 9.291655285848002e-06, + "loss": 2.5735, + "step": 8954 + }, + { + "epoch": 0.7632319099974431, + "grad_norm": 47.99309546105791, + "learning_rate": 9.291400847068472e-06, + "loss": 4.3342, + "step": 8955 + }, + { + "epoch": 0.7633171396914685, + "grad_norm": 50.183972835624125, + "learning_rate": 9.29114636608472e-06, + "loss": 4.0664, + "step": 8956 + }, + { + "epoch": 0.7634023693854939, + "grad_norm": 45.703390628490695, + "learning_rate": 9.290891842899248e-06, + "loss": 3.9034, + "step": 8957 + }, + { + "epoch": 0.7634875990795194, + "grad_norm": 47.24082427108974, + "learning_rate": 9.29063727751456e-06, + "loss": 4.7662, + "step": 8958 + }, + { + "epoch": 0.7635728287735447, + "grad_norm": 42.90976125263094, + "learning_rate": 9.29038266993316e-06, + "loss": 4.5412, + "step": 8959 + }, + { + "epoch": 0.7636580584675701, + "grad_norm": 48.513174097781175, + "learning_rate": 9.290128020157552e-06, + "loss": 5.0925, + "step": 8960 + }, + { + "epoch": 0.7637432881615955, + "grad_norm": 54.31509126145118, + "learning_rate": 9.289873328190237e-06, + "loss": 4.5553, + "step": 8961 + }, + { + "epoch": 0.7638285178556209, + "grad_norm": 203.7461426004996, + "learning_rate": 9.289618594033724e-06, + "loss": 4.7022, + "step": 8962 + }, + { + "epoch": 0.7639137475496462, + "grad_norm": 35.51015334828807, + "learning_rate": 9.289363817690516e-06, + "loss": 4.3262, + "step": 8963 + }, + { + "epoch": 0.7639989772436717, + "grad_norm": 33.98166918134331, + "learning_rate": 9.28910899916312e-06, + "loss": 3.7568, + "step": 8964 + }, + { + "epoch": 0.7640842069376971, + "grad_norm": 35.62589433973817, + "learning_rate": 9.288854138454041e-06, + "loss": 3.0409, + "step": 8965 + }, + { + "epoch": 0.7641694366317225, + "grad_norm": 80.61910742487962, + "learning_rate": 9.288599235565787e-06, + "loss": 5.0743, + "step": 8966 + }, + { + "epoch": 0.7642546663257479, + "grad_norm": 53.81265756851437, + "learning_rate": 9.288344290500862e-06, + "loss": 4.9104, + "step": 8967 + }, + { + "epoch": 0.7643398960197733, + "grad_norm": 51.371190761802694, + "learning_rate": 9.288089303261775e-06, + "loss": 4.6345, + "step": 8968 + }, + { + "epoch": 0.7644251257137987, + "grad_norm": 112.31040583564494, + "learning_rate": 9.287834273851035e-06, + "loss": 4.5967, + "step": 8969 + }, + { + "epoch": 0.7645103554078241, + "grad_norm": 33.79985518857962, + "learning_rate": 9.287579202271146e-06, + "loss": 3.4719, + "step": 8970 + }, + { + "epoch": 0.7645955851018494, + "grad_norm": 79.4126724268548, + "learning_rate": 9.28732408852462e-06, + "loss": 4.9571, + "step": 8971 + }, + { + "epoch": 0.7646808147958749, + "grad_norm": 64.89406742354169, + "learning_rate": 9.287068932613967e-06, + "loss": 4.2382, + "step": 8972 + }, + { + "epoch": 0.7647660444899003, + "grad_norm": 60.44807981427213, + "learning_rate": 9.286813734541694e-06, + "loss": 3.4168, + "step": 8973 + }, + { + "epoch": 0.7648512741839257, + "grad_norm": 68.58633488420998, + "learning_rate": 9.286558494310312e-06, + "loss": 4.2886, + "step": 8974 + }, + { + "epoch": 0.7649365038779511, + "grad_norm": 54.68473936471173, + "learning_rate": 9.28630321192233e-06, + "loss": 4.4864, + "step": 8975 + }, + { + "epoch": 0.7650217335719764, + "grad_norm": 43.20674315511884, + "learning_rate": 9.286047887380258e-06, + "loss": 3.8222, + "step": 8976 + }, + { + "epoch": 0.7651069632660019, + "grad_norm": 67.31152139948638, + "learning_rate": 9.28579252068661e-06, + "loss": 4.0687, + "step": 8977 + }, + { + "epoch": 0.7651921929600273, + "grad_norm": 34.61202205542362, + "learning_rate": 9.285537111843893e-06, + "loss": 3.8158, + "step": 8978 + }, + { + "epoch": 0.7652774226540526, + "grad_norm": 40.374220313137585, + "learning_rate": 9.285281660854623e-06, + "loss": 4.7792, + "step": 8979 + }, + { + "epoch": 0.7653626523480781, + "grad_norm": 65.88701468301085, + "learning_rate": 9.28502616772131e-06, + "loss": 3.9954, + "step": 8980 + }, + { + "epoch": 0.7654478820421035, + "grad_norm": 55.4821701395806, + "learning_rate": 9.284770632446468e-06, + "loss": 3.9155, + "step": 8981 + }, + { + "epoch": 0.7655331117361288, + "grad_norm": 31.38224553126836, + "learning_rate": 9.28451505503261e-06, + "loss": 3.5993, + "step": 8982 + }, + { + "epoch": 0.7656183414301543, + "grad_norm": 68.79186319795829, + "learning_rate": 9.284259435482249e-06, + "loss": 4.9571, + "step": 8983 + }, + { + "epoch": 0.7657035711241796, + "grad_norm": 33.278009175994725, + "learning_rate": 9.284003773797895e-06, + "loss": 3.3714, + "step": 8984 + }, + { + "epoch": 0.7657888008182051, + "grad_norm": 43.358858398395974, + "learning_rate": 9.28374806998207e-06, + "loss": 3.8256, + "step": 8985 + }, + { + "epoch": 0.7658740305122305, + "grad_norm": 36.112887161724785, + "learning_rate": 9.283492324037285e-06, + "loss": 3.9807, + "step": 8986 + }, + { + "epoch": 0.7659592602062558, + "grad_norm": 57.55548228319689, + "learning_rate": 9.283236535966055e-06, + "loss": 4.1633, + "step": 8987 + }, + { + "epoch": 0.7660444899002813, + "grad_norm": 84.12080918577392, + "learning_rate": 9.282980705770894e-06, + "loss": 3.8664, + "step": 8988 + }, + { + "epoch": 0.7661297195943066, + "grad_norm": 26.824203607100113, + "learning_rate": 9.28272483345432e-06, + "loss": 3.6684, + "step": 8989 + }, + { + "epoch": 0.766214949288332, + "grad_norm": 44.34098380555111, + "learning_rate": 9.282468919018849e-06, + "loss": 4.4501, + "step": 8990 + }, + { + "epoch": 0.7663001789823575, + "grad_norm": 179.432088007489, + "learning_rate": 9.282212962467e-06, + "loss": 3.9818, + "step": 8991 + }, + { + "epoch": 0.7663854086763828, + "grad_norm": 90.03882365859202, + "learning_rate": 9.281956963801286e-06, + "loss": 6.577, + "step": 8992 + }, + { + "epoch": 0.7664706383704083, + "grad_norm": 88.75550062588803, + "learning_rate": 9.281700923024228e-06, + "loss": 4.5764, + "step": 8993 + }, + { + "epoch": 0.7665558680644337, + "grad_norm": 54.637644413291646, + "learning_rate": 9.281444840138342e-06, + "loss": 3.2543, + "step": 8994 + }, + { + "epoch": 0.766641097758459, + "grad_norm": 41.859419762217705, + "learning_rate": 9.281188715146147e-06, + "loss": 3.595, + "step": 8995 + }, + { + "epoch": 0.7667263274524845, + "grad_norm": 42.99898737456987, + "learning_rate": 9.280932548050163e-06, + "loss": 3.8173, + "step": 8996 + }, + { + "epoch": 0.7668115571465098, + "grad_norm": 32.64337567971583, + "learning_rate": 9.280676338852908e-06, + "loss": 4.3817, + "step": 8997 + }, + { + "epoch": 0.7668967868405352, + "grad_norm": 42.80614527885558, + "learning_rate": 9.280420087556903e-06, + "loss": 4.4498, + "step": 8998 + }, + { + "epoch": 0.7669820165345607, + "grad_norm": 135.30187212492496, + "learning_rate": 9.280163794164666e-06, + "loss": 4.2192, + "step": 8999 + }, + { + "epoch": 0.767067246228586, + "grad_norm": 96.89626218669918, + "learning_rate": 9.279907458678719e-06, + "loss": 3.7031, + "step": 9000 + }, + { + "epoch": 0.7671524759226115, + "grad_norm": 46.98789985296226, + "learning_rate": 9.279651081101583e-06, + "loss": 4.547, + "step": 9001 + }, + { + "epoch": 0.7672377056166368, + "grad_norm": 55.45919564540598, + "learning_rate": 9.27939466143578e-06, + "loss": 4.9778, + "step": 9002 + }, + { + "epoch": 0.7673229353106622, + "grad_norm": 37.38986655410804, + "learning_rate": 9.279138199683831e-06, + "loss": 3.5472, + "step": 9003 + }, + { + "epoch": 0.7674081650046877, + "grad_norm": 39.8603376910377, + "learning_rate": 9.278881695848256e-06, + "loss": 3.1035, + "step": 9004 + }, + { + "epoch": 0.767493394698713, + "grad_norm": 39.98718420266864, + "learning_rate": 9.278625149931582e-06, + "loss": 3.6648, + "step": 9005 + }, + { + "epoch": 0.7675786243927384, + "grad_norm": 40.23535183065743, + "learning_rate": 9.278368561936328e-06, + "loss": 3.5154, + "step": 9006 + }, + { + "epoch": 0.7676638540867639, + "grad_norm": 58.22017281948064, + "learning_rate": 9.278111931865022e-06, + "loss": 4.0685, + "step": 9007 + }, + { + "epoch": 0.7677490837807892, + "grad_norm": 53.15548868178774, + "learning_rate": 9.277855259720183e-06, + "loss": 4.3107, + "step": 9008 + }, + { + "epoch": 0.7678343134748147, + "grad_norm": 67.2594346564451, + "learning_rate": 9.277598545504338e-06, + "loss": 2.2541, + "step": 9009 + }, + { + "epoch": 0.76791954316884, + "grad_norm": 36.92529732059073, + "learning_rate": 9.27734178922001e-06, + "loss": 3.7105, + "step": 9010 + }, + { + "epoch": 0.7680047728628654, + "grad_norm": 48.69275088477501, + "learning_rate": 9.277084990869727e-06, + "loss": 3.8163, + "step": 9011 + }, + { + "epoch": 0.7680900025568909, + "grad_norm": 50.47895697723206, + "learning_rate": 9.276828150456012e-06, + "loss": 3.8058, + "step": 9012 + }, + { + "epoch": 0.7681752322509162, + "grad_norm": 78.85855155517527, + "learning_rate": 9.276571267981391e-06, + "loss": 5.0252, + "step": 9013 + }, + { + "epoch": 0.7682604619449416, + "grad_norm": 34.44184482539085, + "learning_rate": 9.276314343448392e-06, + "loss": 3.1251, + "step": 9014 + }, + { + "epoch": 0.768345691638967, + "grad_norm": 49.87198844329357, + "learning_rate": 9.276057376859539e-06, + "loss": 3.9852, + "step": 9015 + }, + { + "epoch": 0.7684309213329924, + "grad_norm": 54.45844616381535, + "learning_rate": 9.275800368217362e-06, + "loss": 4.2159, + "step": 9016 + }, + { + "epoch": 0.7685161510270178, + "grad_norm": 29.358648094021277, + "learning_rate": 9.275543317524387e-06, + "loss": 2.7782, + "step": 9017 + }, + { + "epoch": 0.7686013807210432, + "grad_norm": 30.518927487686824, + "learning_rate": 9.275286224783143e-06, + "loss": 3.2663, + "step": 9018 + }, + { + "epoch": 0.7686866104150686, + "grad_norm": 64.76285529650279, + "learning_rate": 9.275029089996157e-06, + "loss": 3.847, + "step": 9019 + }, + { + "epoch": 0.768771840109094, + "grad_norm": 48.51564638979584, + "learning_rate": 9.274771913165958e-06, + "loss": 4.7178, + "step": 9020 + }, + { + "epoch": 0.7688570698031194, + "grad_norm": 32.776943562951004, + "learning_rate": 9.274514694295078e-06, + "loss": 2.3013, + "step": 9021 + }, + { + "epoch": 0.7689422994971448, + "grad_norm": 96.89319714059452, + "learning_rate": 9.274257433386043e-06, + "loss": 3.7714, + "step": 9022 + }, + { + "epoch": 0.7690275291911702, + "grad_norm": 41.02212879825083, + "learning_rate": 9.274000130441385e-06, + "loss": 3.7167, + "step": 9023 + }, + { + "epoch": 0.7691127588851956, + "grad_norm": 43.78388954986739, + "learning_rate": 9.273742785463632e-06, + "loss": 4.2512, + "step": 9024 + }, + { + "epoch": 0.769197988579221, + "grad_norm": 136.89028405803631, + "learning_rate": 9.27348539845532e-06, + "loss": 5.5977, + "step": 9025 + }, + { + "epoch": 0.7692832182732464, + "grad_norm": 35.1553778875537, + "learning_rate": 9.273227969418976e-06, + "loss": 3.4056, + "step": 9026 + }, + { + "epoch": 0.7693684479672718, + "grad_norm": 64.19645819686416, + "learning_rate": 9.272970498357134e-06, + "loss": 4.1556, + "step": 9027 + }, + { + "epoch": 0.7694536776612972, + "grad_norm": 68.1160990416229, + "learning_rate": 9.272712985272324e-06, + "loss": 3.6541, + "step": 9028 + }, + { + "epoch": 0.7695389073553226, + "grad_norm": 28.400195727404075, + "learning_rate": 9.27245543016708e-06, + "loss": 2.8924, + "step": 9029 + }, + { + "epoch": 0.769624137049348, + "grad_norm": 33.085003088748216, + "learning_rate": 9.272197833043935e-06, + "loss": 2.7687, + "step": 9030 + }, + { + "epoch": 0.7697093667433734, + "grad_norm": 42.3701924543343, + "learning_rate": 9.27194019390542e-06, + "loss": 3.7917, + "step": 9031 + }, + { + "epoch": 0.7697945964373988, + "grad_norm": 50.69031792085473, + "learning_rate": 9.271682512754072e-06, + "loss": 4.5011, + "step": 9032 + }, + { + "epoch": 0.7698798261314241, + "grad_norm": 37.35853163486422, + "learning_rate": 9.271424789592422e-06, + "loss": 3.0548, + "step": 9033 + }, + { + "epoch": 0.7699650558254496, + "grad_norm": 144.2058081027964, + "learning_rate": 9.27116702442301e-06, + "loss": 3.1388, + "step": 9034 + }, + { + "epoch": 0.770050285519475, + "grad_norm": 39.99122147467269, + "learning_rate": 9.270909217248365e-06, + "loss": 3.9752, + "step": 9035 + }, + { + "epoch": 0.7701355152135004, + "grad_norm": 198.0224475900102, + "learning_rate": 9.270651368071027e-06, + "loss": 3.9321, + "step": 9036 + }, + { + "epoch": 0.7702207449075258, + "grad_norm": 35.94723944243729, + "learning_rate": 9.270393476893526e-06, + "loss": 3.7974, + "step": 9037 + }, + { + "epoch": 0.7703059746015511, + "grad_norm": 72.7061467298701, + "learning_rate": 9.270135543718406e-06, + "loss": 4.3166, + "step": 9038 + }, + { + "epoch": 0.7703912042955766, + "grad_norm": 78.63166017451158, + "learning_rate": 9.269877568548197e-06, + "loss": 4.4683, + "step": 9039 + }, + { + "epoch": 0.770476433989602, + "grad_norm": 41.306403122784594, + "learning_rate": 9.269619551385442e-06, + "loss": 2.2121, + "step": 9040 + }, + { + "epoch": 0.7705616636836273, + "grad_norm": 41.685659682626884, + "learning_rate": 9.269361492232672e-06, + "loss": 3.5213, + "step": 9041 + }, + { + "epoch": 0.7706468933776528, + "grad_norm": 35.15092203380865, + "learning_rate": 9.26910339109243e-06, + "loss": 3.4846, + "step": 9042 + }, + { + "epoch": 0.7707321230716782, + "grad_norm": 44.77900275755505, + "learning_rate": 9.268845247967254e-06, + "loss": 3.5536, + "step": 9043 + }, + { + "epoch": 0.7708173527657036, + "grad_norm": 79.32812192293764, + "learning_rate": 9.268587062859678e-06, + "loss": 4.0862, + "step": 9044 + }, + { + "epoch": 0.770902582459729, + "grad_norm": 43.134993660437985, + "learning_rate": 9.268328835772245e-06, + "loss": 4.0598, + "step": 9045 + }, + { + "epoch": 0.7709878121537543, + "grad_norm": 50.64768743648764, + "learning_rate": 9.268070566707495e-06, + "loss": 3.9764, + "step": 9046 + }, + { + "epoch": 0.7710730418477798, + "grad_norm": 71.79912504732062, + "learning_rate": 9.267812255667966e-06, + "loss": 5.0802, + "step": 9047 + }, + { + "epoch": 0.7711582715418052, + "grad_norm": 102.6411534523435, + "learning_rate": 9.2675539026562e-06, + "loss": 5.1534, + "step": 9048 + }, + { + "epoch": 0.7712435012358305, + "grad_norm": 52.33440025363952, + "learning_rate": 9.267295507674738e-06, + "loss": 4.7062, + "step": 9049 + }, + { + "epoch": 0.771328730929856, + "grad_norm": 59.05736391064799, + "learning_rate": 9.267037070726119e-06, + "loss": 3.7479, + "step": 9050 + }, + { + "epoch": 0.7714139606238813, + "grad_norm": 39.975188302397555, + "learning_rate": 9.266778591812886e-06, + "loss": 4.7829, + "step": 9051 + }, + { + "epoch": 0.7714991903179068, + "grad_norm": 35.211662949433524, + "learning_rate": 9.266520070937581e-06, + "loss": 3.8652, + "step": 9052 + }, + { + "epoch": 0.7715844200119322, + "grad_norm": 33.647122667991034, + "learning_rate": 9.266261508102747e-06, + "loss": 3.9468, + "step": 9053 + }, + { + "epoch": 0.7716696497059575, + "grad_norm": 58.707282039333386, + "learning_rate": 9.266002903310927e-06, + "loss": 4.3101, + "step": 9054 + }, + { + "epoch": 0.771754879399983, + "grad_norm": 99.65156460256667, + "learning_rate": 9.265744256564662e-06, + "loss": 4.3808, + "step": 9055 + }, + { + "epoch": 0.7718401090940084, + "grad_norm": 40.38352979507176, + "learning_rate": 9.265485567866499e-06, + "loss": 2.4082, + "step": 9056 + }, + { + "epoch": 0.7719253387880337, + "grad_norm": 44.007622791519545, + "learning_rate": 9.26522683721898e-06, + "loss": 3.528, + "step": 9057 + }, + { + "epoch": 0.7720105684820592, + "grad_norm": 37.318405609289584, + "learning_rate": 9.264968064624649e-06, + "loss": 3.4532, + "step": 9058 + }, + { + "epoch": 0.7720957981760845, + "grad_norm": 53.97111597827668, + "learning_rate": 9.264709250086052e-06, + "loss": 3.505, + "step": 9059 + }, + { + "epoch": 0.7721810278701099, + "grad_norm": 74.75537093890534, + "learning_rate": 9.264450393605734e-06, + "loss": 5.2535, + "step": 9060 + }, + { + "epoch": 0.7722662575641354, + "grad_norm": 49.98527709027214, + "learning_rate": 9.264191495186241e-06, + "loss": 3.465, + "step": 9061 + }, + { + "epoch": 0.7723514872581607, + "grad_norm": 55.182423795932536, + "learning_rate": 9.263932554830118e-06, + "loss": 3.8419, + "step": 9062 + }, + { + "epoch": 0.7724367169521862, + "grad_norm": 39.967663225604184, + "learning_rate": 9.263673572539915e-06, + "loss": 3.9993, + "step": 9063 + }, + { + "epoch": 0.7725219466462115, + "grad_norm": 32.15857486421225, + "learning_rate": 9.263414548318174e-06, + "loss": 2.5257, + "step": 9064 + }, + { + "epoch": 0.7726071763402369, + "grad_norm": 78.38482069730887, + "learning_rate": 9.263155482167447e-06, + "loss": 4.5467, + "step": 9065 + }, + { + "epoch": 0.7726924060342624, + "grad_norm": 64.50616210013946, + "learning_rate": 9.26289637409028e-06, + "loss": 5.1068, + "step": 9066 + }, + { + "epoch": 0.7727776357282877, + "grad_norm": 38.80419010721257, + "learning_rate": 9.26263722408922e-06, + "loss": 3.9543, + "step": 9067 + }, + { + "epoch": 0.7728628654223131, + "grad_norm": 57.57886659661577, + "learning_rate": 9.262378032166816e-06, + "loss": 3.2856, + "step": 9068 + }, + { + "epoch": 0.7729480951163386, + "grad_norm": 47.96045200774339, + "learning_rate": 9.262118798325622e-06, + "loss": 3.8319, + "step": 9069 + }, + { + "epoch": 0.7730333248103639, + "grad_norm": 51.27590770747775, + "learning_rate": 9.26185952256818e-06, + "loss": 4.2581, + "step": 9070 + }, + { + "epoch": 0.7731185545043894, + "grad_norm": 78.34154895078959, + "learning_rate": 9.261600204897042e-06, + "loss": 5.1345, + "step": 9071 + }, + { + "epoch": 0.7732037841984147, + "grad_norm": 109.3040286509304, + "learning_rate": 9.261340845314761e-06, + "loss": 4.5551, + "step": 9072 + }, + { + "epoch": 0.7732890138924401, + "grad_norm": 74.10414044269491, + "learning_rate": 9.261081443823885e-06, + "loss": 2.9063, + "step": 9073 + }, + { + "epoch": 0.7733742435864656, + "grad_norm": 39.09560128025329, + "learning_rate": 9.260822000426967e-06, + "loss": 4.3776, + "step": 9074 + }, + { + "epoch": 0.7734594732804909, + "grad_norm": 42.03630200786847, + "learning_rate": 9.260562515126558e-06, + "loss": 4.0507, + "step": 9075 + }, + { + "epoch": 0.7735447029745163, + "grad_norm": 26.200192808149275, + "learning_rate": 9.26030298792521e-06, + "loss": 2.1745, + "step": 9076 + }, + { + "epoch": 0.7736299326685417, + "grad_norm": 105.68714113371225, + "learning_rate": 9.260043418825473e-06, + "loss": 4.1552, + "step": 9077 + }, + { + "epoch": 0.7737151623625671, + "grad_norm": 63.8003634712199, + "learning_rate": 9.259783807829903e-06, + "loss": 2.859, + "step": 9078 + }, + { + "epoch": 0.7738003920565926, + "grad_norm": 22.24765704529975, + "learning_rate": 9.259524154941051e-06, + "loss": 2.2981, + "step": 9079 + }, + { + "epoch": 0.7738856217506179, + "grad_norm": 45.768887784449326, + "learning_rate": 9.259264460161473e-06, + "loss": 3.478, + "step": 9080 + }, + { + "epoch": 0.7739708514446433, + "grad_norm": 91.88545150532501, + "learning_rate": 9.25900472349372e-06, + "loss": 3.0048, + "step": 9081 + }, + { + "epoch": 0.7740560811386688, + "grad_norm": 32.26424240005175, + "learning_rate": 9.25874494494035e-06, + "loss": 3.0784, + "step": 9082 + }, + { + "epoch": 0.7741413108326941, + "grad_norm": 28.049852886574364, + "learning_rate": 9.258485124503915e-06, + "loss": 3.9527, + "step": 9083 + }, + { + "epoch": 0.7742265405267195, + "grad_norm": 45.17109411857077, + "learning_rate": 9.25822526218697e-06, + "loss": 4.6682, + "step": 9084 + }, + { + "epoch": 0.7743117702207449, + "grad_norm": 58.238232065588974, + "learning_rate": 9.257965357992071e-06, + "loss": 3.9485, + "step": 9085 + }, + { + "epoch": 0.7743969999147703, + "grad_norm": 47.137501910135484, + "learning_rate": 9.257705411921777e-06, + "loss": 4.1408, + "step": 9086 + }, + { + "epoch": 0.7744822296087958, + "grad_norm": 45.25501650627569, + "learning_rate": 9.25744542397864e-06, + "loss": 3.3063, + "step": 9087 + }, + { + "epoch": 0.7745674593028211, + "grad_norm": 50.086740641777126, + "learning_rate": 9.25718539416522e-06, + "loss": 3.4445, + "step": 9088 + }, + { + "epoch": 0.7746526889968465, + "grad_norm": 39.79859963077792, + "learning_rate": 9.256925322484074e-06, + "loss": 3.3368, + "step": 9089 + }, + { + "epoch": 0.7747379186908719, + "grad_norm": 26.493965597584026, + "learning_rate": 9.256665208937758e-06, + "loss": 3.1994, + "step": 9090 + }, + { + "epoch": 0.7748231483848973, + "grad_norm": 53.08616288387615, + "learning_rate": 9.256405053528832e-06, + "loss": 4.352, + "step": 9091 + }, + { + "epoch": 0.7749083780789227, + "grad_norm": 29.95785371641103, + "learning_rate": 9.256144856259851e-06, + "loss": 2.5205, + "step": 9092 + }, + { + "epoch": 0.7749936077729481, + "grad_norm": 51.115697583266716, + "learning_rate": 9.255884617133379e-06, + "loss": 4.1911, + "step": 9093 + }, + { + "epoch": 0.7750788374669735, + "grad_norm": 53.12283124620508, + "learning_rate": 9.255624336151973e-06, + "loss": 5.1215, + "step": 9094 + }, + { + "epoch": 0.7751640671609988, + "grad_norm": 35.58512372446275, + "learning_rate": 9.255364013318191e-06, + "loss": 3.4213, + "step": 9095 + }, + { + "epoch": 0.7752492968550243, + "grad_norm": 42.484463596722186, + "learning_rate": 9.255103648634596e-06, + "loss": 4.4662, + "step": 9096 + }, + { + "epoch": 0.7753345265490497, + "grad_norm": 76.8573488988868, + "learning_rate": 9.254843242103749e-06, + "loss": 3.9416, + "step": 9097 + }, + { + "epoch": 0.7754197562430751, + "grad_norm": 32.458430273703236, + "learning_rate": 9.254582793728207e-06, + "loss": 4.1813, + "step": 9098 + }, + { + "epoch": 0.7755049859371005, + "grad_norm": 58.11609821364328, + "learning_rate": 9.254322303510535e-06, + "loss": 2.5203, + "step": 9099 + }, + { + "epoch": 0.7755902156311258, + "grad_norm": 50.696171849205854, + "learning_rate": 9.254061771453292e-06, + "loss": 3.8102, + "step": 9100 + }, + { + "epoch": 0.7756754453251513, + "grad_norm": 51.693360121885, + "learning_rate": 9.253801197559043e-06, + "loss": 4.3685, + "step": 9101 + }, + { + "epoch": 0.7757606750191767, + "grad_norm": 49.6855618831147, + "learning_rate": 9.25354058183035e-06, + "loss": 4.9158, + "step": 9102 + }, + { + "epoch": 0.775845904713202, + "grad_norm": 35.021748846729956, + "learning_rate": 9.253279924269774e-06, + "loss": 3.5924, + "step": 9103 + }, + { + "epoch": 0.7759311344072275, + "grad_norm": 75.56153811373568, + "learning_rate": 9.253019224879883e-06, + "loss": 3.7594, + "step": 9104 + }, + { + "epoch": 0.7760163641012529, + "grad_norm": 45.62888367905464, + "learning_rate": 9.252758483663235e-06, + "loss": 3.6615, + "step": 9105 + }, + { + "epoch": 0.7761015937952783, + "grad_norm": 75.64856764804819, + "learning_rate": 9.252497700622398e-06, + "loss": 4.7532, + "step": 9106 + }, + { + "epoch": 0.7761868234893037, + "grad_norm": 61.625447974286196, + "learning_rate": 9.252236875759937e-06, + "loss": 4.235, + "step": 9107 + }, + { + "epoch": 0.776272053183329, + "grad_norm": 73.4211499902414, + "learning_rate": 9.251976009078414e-06, + "loss": 3.7613, + "step": 9108 + }, + { + "epoch": 0.7763572828773545, + "grad_norm": 40.49328308815921, + "learning_rate": 9.251715100580398e-06, + "loss": 3.6328, + "step": 9109 + }, + { + "epoch": 0.7764425125713799, + "grad_norm": 28.99374164568728, + "learning_rate": 9.251454150268453e-06, + "loss": 3.5075, + "step": 9110 + }, + { + "epoch": 0.7765277422654052, + "grad_norm": 39.16386972548432, + "learning_rate": 9.251193158145147e-06, + "loss": 3.5689, + "step": 9111 + }, + { + "epoch": 0.7766129719594307, + "grad_norm": 41.824011711610446, + "learning_rate": 9.250932124213044e-06, + "loss": 4.1216, + "step": 9112 + }, + { + "epoch": 0.776698201653456, + "grad_norm": 57.973589724479446, + "learning_rate": 9.250671048474713e-06, + "loss": 4.4921, + "step": 9113 + }, + { + "epoch": 0.7767834313474815, + "grad_norm": 71.36467757844264, + "learning_rate": 9.250409930932723e-06, + "loss": 3.5913, + "step": 9114 + }, + { + "epoch": 0.7768686610415069, + "grad_norm": 28.941061966202803, + "learning_rate": 9.250148771589638e-06, + "loss": 2.4902, + "step": 9115 + }, + { + "epoch": 0.7769538907355322, + "grad_norm": 46.621970428361294, + "learning_rate": 9.249887570448029e-06, + "loss": 4.3478, + "step": 9116 + }, + { + "epoch": 0.7770391204295577, + "grad_norm": 246.23878934369375, + "learning_rate": 9.249626327510464e-06, + "loss": 5.851, + "step": 9117 + }, + { + "epoch": 0.777124350123583, + "grad_norm": 100.47528574025293, + "learning_rate": 9.249365042779515e-06, + "loss": 5.4223, + "step": 9118 + }, + { + "epoch": 0.7772095798176084, + "grad_norm": 44.46504300059719, + "learning_rate": 9.249103716257747e-06, + "loss": 4.5775, + "step": 9119 + }, + { + "epoch": 0.7772948095116339, + "grad_norm": 48.79616254240767, + "learning_rate": 9.248842347947733e-06, + "loss": 2.8955, + "step": 9120 + }, + { + "epoch": 0.7773800392056592, + "grad_norm": 54.80820384522362, + "learning_rate": 9.248580937852044e-06, + "loss": 3.3643, + "step": 9121 + }, + { + "epoch": 0.7774652688996847, + "grad_norm": 35.31498209577405, + "learning_rate": 9.248319485973249e-06, + "loss": 4.0823, + "step": 9122 + }, + { + "epoch": 0.7775504985937101, + "grad_norm": 41.42400067707317, + "learning_rate": 9.248057992313919e-06, + "loss": 3.4729, + "step": 9123 + }, + { + "epoch": 0.7776357282877354, + "grad_norm": 54.30548068236119, + "learning_rate": 9.247796456876628e-06, + "loss": 4.3675, + "step": 9124 + }, + { + "epoch": 0.7777209579817609, + "grad_norm": 32.82501472605532, + "learning_rate": 9.247534879663947e-06, + "loss": 4.2083, + "step": 9125 + }, + { + "epoch": 0.7778061876757862, + "grad_norm": 50.40808054763213, + "learning_rate": 9.247273260678447e-06, + "loss": 4.2937, + "step": 9126 + }, + { + "epoch": 0.7778914173698116, + "grad_norm": 71.82645200864863, + "learning_rate": 9.247011599922701e-06, + "loss": 4.86, + "step": 9127 + }, + { + "epoch": 0.7779766470638371, + "grad_norm": 32.83551328762592, + "learning_rate": 9.246749897399286e-06, + "loss": 3.4647, + "step": 9128 + }, + { + "epoch": 0.7780618767578624, + "grad_norm": 38.41027858379765, + "learning_rate": 9.246488153110772e-06, + "loss": 3.5336, + "step": 9129 + }, + { + "epoch": 0.7781471064518878, + "grad_norm": 49.661762090812964, + "learning_rate": 9.246226367059736e-06, + "loss": 4.395, + "step": 9130 + }, + { + "epoch": 0.7782323361459133, + "grad_norm": 59.84905779216049, + "learning_rate": 9.24596453924875e-06, + "loss": 4.7513, + "step": 9131 + }, + { + "epoch": 0.7783175658399386, + "grad_norm": 31.431304396320616, + "learning_rate": 9.24570266968039e-06, + "loss": 3.5107, + "step": 9132 + }, + { + "epoch": 0.7784027955339641, + "grad_norm": 51.35885225182045, + "learning_rate": 9.245440758357231e-06, + "loss": 2.0769, + "step": 9133 + }, + { + "epoch": 0.7784880252279894, + "grad_norm": 35.014654013626505, + "learning_rate": 9.245178805281849e-06, + "loss": 3.5837, + "step": 9134 + }, + { + "epoch": 0.7785732549220148, + "grad_norm": 40.83043786255731, + "learning_rate": 9.244916810456822e-06, + "loss": 3.2055, + "step": 9135 + }, + { + "epoch": 0.7786584846160403, + "grad_norm": 59.03895465838537, + "learning_rate": 9.24465477388472e-06, + "loss": 3.933, + "step": 9136 + }, + { + "epoch": 0.7787437143100656, + "grad_norm": 52.542738291369346, + "learning_rate": 9.244392695568129e-06, + "loss": 4.2295, + "step": 9137 + }, + { + "epoch": 0.778828944004091, + "grad_norm": 72.17512190740337, + "learning_rate": 9.244130575509623e-06, + "loss": 4.4208, + "step": 9138 + }, + { + "epoch": 0.7789141736981164, + "grad_norm": 28.9863802830038, + "learning_rate": 9.243868413711778e-06, + "loss": 3.8989, + "step": 9139 + }, + { + "epoch": 0.7789994033921418, + "grad_norm": 39.43351893785897, + "learning_rate": 9.243606210177174e-06, + "loss": 3.8004, + "step": 9140 + }, + { + "epoch": 0.7790846330861673, + "grad_norm": 38.95008317902867, + "learning_rate": 9.24334396490839e-06, + "loss": 4.0461, + "step": 9141 + }, + { + "epoch": 0.7791698627801926, + "grad_norm": 35.685640756734635, + "learning_rate": 9.243081677908e-06, + "loss": 3.7748, + "step": 9142 + }, + { + "epoch": 0.779255092474218, + "grad_norm": 31.18581839058328, + "learning_rate": 9.242819349178591e-06, + "loss": 3.7521, + "step": 9143 + }, + { + "epoch": 0.7793403221682434, + "grad_norm": 31.530012051082906, + "learning_rate": 9.242556978722742e-06, + "loss": 3.1395, + "step": 9144 + }, + { + "epoch": 0.7794255518622688, + "grad_norm": 52.982734326779145, + "learning_rate": 9.242294566543026e-06, + "loss": 4.0002, + "step": 9145 + }, + { + "epoch": 0.7795107815562942, + "grad_norm": 36.58789422288471, + "learning_rate": 9.242032112642033e-06, + "loss": 3.0744, + "step": 9146 + }, + { + "epoch": 0.7795960112503196, + "grad_norm": 45.034073718851346, + "learning_rate": 9.241769617022337e-06, + "loss": 4.2165, + "step": 9147 + }, + { + "epoch": 0.779681240944345, + "grad_norm": 44.34163103964711, + "learning_rate": 9.241507079686522e-06, + "loss": 3.3672, + "step": 9148 + }, + { + "epoch": 0.7797664706383705, + "grad_norm": 31.670231110602426, + "learning_rate": 9.241244500637173e-06, + "loss": 3.1626, + "step": 9149 + }, + { + "epoch": 0.7798517003323958, + "grad_norm": 55.58718130672622, + "learning_rate": 9.240981879876866e-06, + "loss": 4.5903, + "step": 9150 + }, + { + "epoch": 0.7799369300264212, + "grad_norm": 29.94245593080166, + "learning_rate": 9.240719217408189e-06, + "loss": 3.1378, + "step": 9151 + }, + { + "epoch": 0.7800221597204466, + "grad_norm": 30.74842122570756, + "learning_rate": 9.240456513233724e-06, + "loss": 4.1997, + "step": 9152 + }, + { + "epoch": 0.780107389414472, + "grad_norm": 37.3851223996, + "learning_rate": 9.240193767356055e-06, + "loss": 3.5222, + "step": 9153 + }, + { + "epoch": 0.7801926191084974, + "grad_norm": 31.512191898556164, + "learning_rate": 9.239930979777765e-06, + "loss": 2.4324, + "step": 9154 + }, + { + "epoch": 0.7802778488025228, + "grad_norm": 35.937561358518096, + "learning_rate": 9.239668150501437e-06, + "loss": 3.5138, + "step": 9155 + }, + { + "epoch": 0.7803630784965482, + "grad_norm": 53.09474427953359, + "learning_rate": 9.239405279529658e-06, + "loss": 3.6764, + "step": 9156 + }, + { + "epoch": 0.7804483081905736, + "grad_norm": 93.51842426158025, + "learning_rate": 9.239142366865014e-06, + "loss": 3.9849, + "step": 9157 + }, + { + "epoch": 0.780533537884599, + "grad_norm": 57.22786711755302, + "learning_rate": 9.238879412510088e-06, + "loss": 4.318, + "step": 9158 + }, + { + "epoch": 0.7806187675786244, + "grad_norm": 43.17107370730009, + "learning_rate": 9.238616416467468e-06, + "loss": 4.4368, + "step": 9159 + }, + { + "epoch": 0.7807039972726498, + "grad_norm": 67.29552867299297, + "learning_rate": 9.23835337873974e-06, + "loss": 4.0595, + "step": 9160 + }, + { + "epoch": 0.7807892269666752, + "grad_norm": 82.17992536987303, + "learning_rate": 9.23809029932949e-06, + "loss": 3.498, + "step": 9161 + }, + { + "epoch": 0.7808744566607005, + "grad_norm": 57.13755874596212, + "learning_rate": 9.237827178239305e-06, + "loss": 4.0505, + "step": 9162 + }, + { + "epoch": 0.780959686354726, + "grad_norm": 54.058916107166915, + "learning_rate": 9.237564015471774e-06, + "loss": 4.3975, + "step": 9163 + }, + { + "epoch": 0.7810449160487514, + "grad_norm": 29.991805351666567, + "learning_rate": 9.237300811029485e-06, + "loss": 2.9115, + "step": 9164 + }, + { + "epoch": 0.7811301457427768, + "grad_norm": 87.9096601238206, + "learning_rate": 9.237037564915029e-06, + "loss": 4.2289, + "step": 9165 + }, + { + "epoch": 0.7812153754368022, + "grad_norm": 64.71679595766074, + "learning_rate": 9.23677427713099e-06, + "loss": 3.3163, + "step": 9166 + }, + { + "epoch": 0.7813006051308276, + "grad_norm": 102.39012238180962, + "learning_rate": 9.236510947679958e-06, + "loss": 4.5625, + "step": 9167 + }, + { + "epoch": 0.781385834824853, + "grad_norm": 40.532567044536165, + "learning_rate": 9.236247576564526e-06, + "loss": 3.6656, + "step": 9168 + }, + { + "epoch": 0.7814710645188784, + "grad_norm": 42.142634318647076, + "learning_rate": 9.235984163787282e-06, + "loss": 4.1017, + "step": 9169 + }, + { + "epoch": 0.7815562942129037, + "grad_norm": 34.68390864179784, + "learning_rate": 9.235720709350816e-06, + "loss": 4.5224, + "step": 9170 + }, + { + "epoch": 0.7816415239069292, + "grad_norm": 73.30958603251271, + "learning_rate": 9.235457213257723e-06, + "loss": 4.9805, + "step": 9171 + }, + { + "epoch": 0.7817267536009546, + "grad_norm": 33.01674892072296, + "learning_rate": 9.235193675510588e-06, + "loss": 3.828, + "step": 9172 + }, + { + "epoch": 0.7818119832949799, + "grad_norm": 41.47304273556683, + "learning_rate": 9.234930096112008e-06, + "loss": 2.7938, + "step": 9173 + }, + { + "epoch": 0.7818972129890054, + "grad_norm": 36.14247057190058, + "learning_rate": 9.234666475064573e-06, + "loss": 4.3544, + "step": 9174 + }, + { + "epoch": 0.7819824426830307, + "grad_norm": 29.84316791647113, + "learning_rate": 9.234402812370875e-06, + "loss": 3.7277, + "step": 9175 + }, + { + "epoch": 0.7820676723770562, + "grad_norm": 48.22444709836909, + "learning_rate": 9.234139108033508e-06, + "loss": 3.9865, + "step": 9176 + }, + { + "epoch": 0.7821529020710816, + "grad_norm": 43.85466072004805, + "learning_rate": 9.233875362055066e-06, + "loss": 4.3188, + "step": 9177 + }, + { + "epoch": 0.7822381317651069, + "grad_norm": 76.21917237245782, + "learning_rate": 9.233611574438144e-06, + "loss": 5.5414, + "step": 9178 + }, + { + "epoch": 0.7823233614591324, + "grad_norm": 33.198105177898775, + "learning_rate": 9.233347745185332e-06, + "loss": 3.4367, + "step": 9179 + }, + { + "epoch": 0.7824085911531578, + "grad_norm": 117.80737434417148, + "learning_rate": 9.233083874299227e-06, + "loss": 3.9725, + "step": 9180 + }, + { + "epoch": 0.7824938208471831, + "grad_norm": 33.66884398932506, + "learning_rate": 9.232819961782424e-06, + "loss": 2.9398, + "step": 9181 + }, + { + "epoch": 0.7825790505412086, + "grad_norm": 40.50864676446132, + "learning_rate": 9.232556007637522e-06, + "loss": 4.3728, + "step": 9182 + }, + { + "epoch": 0.7826642802352339, + "grad_norm": 22.800658197467964, + "learning_rate": 9.232292011867109e-06, + "loss": 3.7152, + "step": 9183 + }, + { + "epoch": 0.7827495099292594, + "grad_norm": 56.17173512056723, + "learning_rate": 9.232027974473787e-06, + "loss": 5.0238, + "step": 9184 + }, + { + "epoch": 0.7828347396232848, + "grad_norm": 51.90864564858806, + "learning_rate": 9.231763895460153e-06, + "loss": 4.4551, + "step": 9185 + }, + { + "epoch": 0.7829199693173101, + "grad_norm": 37.80233981632751, + "learning_rate": 9.231499774828803e-06, + "loss": 2.7209, + "step": 9186 + }, + { + "epoch": 0.7830051990113356, + "grad_norm": 36.906276147674625, + "learning_rate": 9.231235612582332e-06, + "loss": 4.0774, + "step": 9187 + }, + { + "epoch": 0.7830904287053609, + "grad_norm": 75.67443983860359, + "learning_rate": 9.230971408723342e-06, + "loss": 4.9166, + "step": 9188 + }, + { + "epoch": 0.7831756583993863, + "grad_norm": 28.703004788254884, + "learning_rate": 9.230707163254427e-06, + "loss": 2.5628, + "step": 9189 + }, + { + "epoch": 0.7832608880934118, + "grad_norm": 51.30086616676055, + "learning_rate": 9.230442876178191e-06, + "loss": 3.6634, + "step": 9190 + }, + { + "epoch": 0.7833461177874371, + "grad_norm": 40.45419507092266, + "learning_rate": 9.230178547497228e-06, + "loss": 4.2223, + "step": 9191 + }, + { + "epoch": 0.7834313474814626, + "grad_norm": 65.43475865179408, + "learning_rate": 9.22991417721414e-06, + "loss": 3.994, + "step": 9192 + }, + { + "epoch": 0.783516577175488, + "grad_norm": 96.61180028677597, + "learning_rate": 9.22964976533153e-06, + "loss": 3.6458, + "step": 9193 + }, + { + "epoch": 0.7836018068695133, + "grad_norm": 35.01965507625666, + "learning_rate": 9.229385311851992e-06, + "loss": 3.5901, + "step": 9194 + }, + { + "epoch": 0.7836870365635388, + "grad_norm": 43.03986943723772, + "learning_rate": 9.229120816778132e-06, + "loss": 4.4857, + "step": 9195 + }, + { + "epoch": 0.7837722662575641, + "grad_norm": 54.197600069304094, + "learning_rate": 9.228856280112547e-06, + "loss": 4.2166, + "step": 9196 + }, + { + "epoch": 0.7838574959515895, + "grad_norm": 38.126407675738214, + "learning_rate": 9.228591701857843e-06, + "loss": 3.8217, + "step": 9197 + }, + { + "epoch": 0.783942725645615, + "grad_norm": 136.68232676494523, + "learning_rate": 9.22832708201662e-06, + "loss": 4.2493, + "step": 9198 + }, + { + "epoch": 0.7840279553396403, + "grad_norm": 51.610362923667694, + "learning_rate": 9.228062420591477e-06, + "loss": 3.6428, + "step": 9199 + }, + { + "epoch": 0.7841131850336658, + "grad_norm": 65.41361135972548, + "learning_rate": 9.227797717585024e-06, + "loss": 4.6203, + "step": 9200 + }, + { + "epoch": 0.7841984147276911, + "grad_norm": 37.80850736693203, + "learning_rate": 9.22753297299986e-06, + "loss": 2.879, + "step": 9201 + }, + { + "epoch": 0.7842836444217165, + "grad_norm": 32.70860890716564, + "learning_rate": 9.227268186838586e-06, + "loss": 3.8922, + "step": 9202 + }, + { + "epoch": 0.784368874115742, + "grad_norm": 42.97867873985488, + "learning_rate": 9.227003359103812e-06, + "loss": 4.4917, + "step": 9203 + }, + { + "epoch": 0.7844541038097673, + "grad_norm": 31.92659520846014, + "learning_rate": 9.22673848979814e-06, + "loss": 3.8152, + "step": 9204 + }, + { + "epoch": 0.7845393335037927, + "grad_norm": 61.88158999317326, + "learning_rate": 9.226473578924173e-06, + "loss": 3.7182, + "step": 9205 + }, + { + "epoch": 0.7846245631978181, + "grad_norm": 53.16371570944678, + "learning_rate": 9.226208626484518e-06, + "loss": 4.3552, + "step": 9206 + }, + { + "epoch": 0.7847097928918435, + "grad_norm": 57.34990720955574, + "learning_rate": 9.22594363248178e-06, + "loss": 1.3184, + "step": 9207 + }, + { + "epoch": 0.7847950225858689, + "grad_norm": 32.358755350241324, + "learning_rate": 9.225678596918565e-06, + "loss": 3.3933, + "step": 9208 + }, + { + "epoch": 0.7848802522798943, + "grad_norm": 35.41317510716157, + "learning_rate": 9.225413519797483e-06, + "loss": 3.09, + "step": 9209 + }, + { + "epoch": 0.7849654819739197, + "grad_norm": 69.71106190485523, + "learning_rate": 9.225148401121136e-06, + "loss": 3.259, + "step": 9210 + }, + { + "epoch": 0.7850507116679452, + "grad_norm": 45.537290444809884, + "learning_rate": 9.224883240892132e-06, + "loss": 4.2406, + "step": 9211 + }, + { + "epoch": 0.7851359413619705, + "grad_norm": 41.15788520047659, + "learning_rate": 9.224618039113084e-06, + "loss": 4.0347, + "step": 9212 + }, + { + "epoch": 0.7852211710559959, + "grad_norm": 139.76670407957067, + "learning_rate": 9.224352795786593e-06, + "loss": 7.0577, + "step": 9213 + }, + { + "epoch": 0.7853064007500213, + "grad_norm": 32.52876243407626, + "learning_rate": 9.224087510915273e-06, + "loss": 3.4461, + "step": 9214 + }, + { + "epoch": 0.7853916304440467, + "grad_norm": 34.088963499399455, + "learning_rate": 9.22382218450173e-06, + "loss": 3.6562, + "step": 9215 + }, + { + "epoch": 0.785476860138072, + "grad_norm": 37.70427422541753, + "learning_rate": 9.223556816548573e-06, + "loss": 3.3417, + "step": 9216 + }, + { + "epoch": 0.7855620898320975, + "grad_norm": 63.22030633681136, + "learning_rate": 9.223291407058414e-06, + "loss": 5.4796, + "step": 9217 + }, + { + "epoch": 0.7856473195261229, + "grad_norm": 74.1739256463046, + "learning_rate": 9.223025956033861e-06, + "loss": 5.3278, + "step": 9218 + }, + { + "epoch": 0.7857325492201483, + "grad_norm": 42.084435667811924, + "learning_rate": 9.22276046347753e-06, + "loss": 3.1585, + "step": 9219 + }, + { + "epoch": 0.7858177789141737, + "grad_norm": 34.17151660036768, + "learning_rate": 9.222494929392023e-06, + "loss": 3.8634, + "step": 9220 + }, + { + "epoch": 0.7859030086081991, + "grad_norm": 36.495102792398015, + "learning_rate": 9.222229353779959e-06, + "loss": 3.6739, + "step": 9221 + }, + { + "epoch": 0.7859882383022245, + "grad_norm": 38.245871833512545, + "learning_rate": 9.221963736643945e-06, + "loss": 4.501, + "step": 9222 + }, + { + "epoch": 0.7860734679962499, + "grad_norm": 67.01499761137742, + "learning_rate": 9.221698077986596e-06, + "loss": 5.0244, + "step": 9223 + }, + { + "epoch": 0.7861586976902752, + "grad_norm": 45.056326824937145, + "learning_rate": 9.221432377810524e-06, + "loss": 3.8308, + "step": 9224 + }, + { + "epoch": 0.7862439273843007, + "grad_norm": 52.96544965305681, + "learning_rate": 9.221166636118344e-06, + "loss": 4.5782, + "step": 9225 + }, + { + "epoch": 0.7863291570783261, + "grad_norm": 49.15359862984479, + "learning_rate": 9.220900852912664e-06, + "loss": 4.4338, + "step": 9226 + }, + { + "epoch": 0.7864143867723515, + "grad_norm": 35.54277657616985, + "learning_rate": 9.220635028196102e-06, + "loss": 3.9068, + "step": 9227 + }, + { + "epoch": 0.7864996164663769, + "grad_norm": 50.05931250422692, + "learning_rate": 9.220369161971274e-06, + "loss": 3.953, + "step": 9228 + }, + { + "epoch": 0.7865848461604023, + "grad_norm": 35.844223037846604, + "learning_rate": 9.220103254240791e-06, + "loss": 3.4996, + "step": 9229 + }, + { + "epoch": 0.7866700758544277, + "grad_norm": 129.8796866418701, + "learning_rate": 9.219837305007271e-06, + "loss": 5.5319, + "step": 9230 + }, + { + "epoch": 0.7867553055484531, + "grad_norm": 66.18834060519119, + "learning_rate": 9.219571314273326e-06, + "loss": 2.1276, + "step": 9231 + }, + { + "epoch": 0.7868405352424784, + "grad_norm": 36.72268471972625, + "learning_rate": 9.219305282041574e-06, + "loss": 3.3835, + "step": 9232 + }, + { + "epoch": 0.7869257649365039, + "grad_norm": 40.25643696584689, + "learning_rate": 9.21903920831463e-06, + "loss": 4.2808, + "step": 9233 + }, + { + "epoch": 0.7870109946305293, + "grad_norm": 34.867657326505075, + "learning_rate": 9.218773093095113e-06, + "loss": 4.4042, + "step": 9234 + }, + { + "epoch": 0.7870962243245547, + "grad_norm": 34.82784608703253, + "learning_rate": 9.218506936385641e-06, + "loss": 3.5901, + "step": 9235 + }, + { + "epoch": 0.7871814540185801, + "grad_norm": 68.12216819346783, + "learning_rate": 9.218240738188826e-06, + "loss": 4.9618, + "step": 9236 + }, + { + "epoch": 0.7872666837126054, + "grad_norm": 38.517022507020584, + "learning_rate": 9.217974498507292e-06, + "loss": 3.1527, + "step": 9237 + }, + { + "epoch": 0.7873519134066309, + "grad_norm": 31.134979737017648, + "learning_rate": 9.217708217343653e-06, + "loss": 4.0886, + "step": 9238 + }, + { + "epoch": 0.7874371431006563, + "grad_norm": 35.15073526356332, + "learning_rate": 9.217441894700532e-06, + "loss": 3.3593, + "step": 9239 + }, + { + "epoch": 0.7875223727946816, + "grad_norm": 52.8421856928715, + "learning_rate": 9.217175530580544e-06, + "loss": 4.0328, + "step": 9240 + }, + { + "epoch": 0.7876076024887071, + "grad_norm": 30.10943792605235, + "learning_rate": 9.216909124986311e-06, + "loss": 4.0186, + "step": 9241 + }, + { + "epoch": 0.7876928321827324, + "grad_norm": 25.300268127882216, + "learning_rate": 9.216642677920452e-06, + "loss": 2.9035, + "step": 9242 + }, + { + "epoch": 0.7877780618767579, + "grad_norm": 50.16168493741851, + "learning_rate": 9.216376189385587e-06, + "loss": 4.4512, + "step": 9243 + }, + { + "epoch": 0.7878632915707833, + "grad_norm": 78.75525662272847, + "learning_rate": 9.21610965938434e-06, + "loss": 4.3641, + "step": 9244 + }, + { + "epoch": 0.7879485212648086, + "grad_norm": 36.67034772971514, + "learning_rate": 9.215843087919328e-06, + "loss": 3.1785, + "step": 9245 + }, + { + "epoch": 0.7880337509588341, + "grad_norm": 35.89896380149836, + "learning_rate": 9.215576474993176e-06, + "loss": 3.834, + "step": 9246 + }, + { + "epoch": 0.7881189806528595, + "grad_norm": 60.63973392880807, + "learning_rate": 9.215309820608502e-06, + "loss": 4.5119, + "step": 9247 + }, + { + "epoch": 0.7882042103468848, + "grad_norm": 54.6799421776389, + "learning_rate": 9.215043124767933e-06, + "loss": 3.1596, + "step": 9248 + }, + { + "epoch": 0.7882894400409103, + "grad_norm": 49.43964827272844, + "learning_rate": 9.21477638747409e-06, + "loss": 4.1075, + "step": 9249 + }, + { + "epoch": 0.7883746697349356, + "grad_norm": 43.69643766725361, + "learning_rate": 9.214509608729593e-06, + "loss": 3.6398, + "step": 9250 + }, + { + "epoch": 0.788459899428961, + "grad_norm": 60.811352356689085, + "learning_rate": 9.214242788537073e-06, + "loss": 3.725, + "step": 9251 + }, + { + "epoch": 0.7885451291229865, + "grad_norm": 26.21702487578397, + "learning_rate": 9.213975926899145e-06, + "loss": 2.8011, + "step": 9252 + }, + { + "epoch": 0.7886303588170118, + "grad_norm": 70.22667763757595, + "learning_rate": 9.213709023818442e-06, + "loss": 5.1297, + "step": 9253 + }, + { + "epoch": 0.7887155885110373, + "grad_norm": 40.19615755920601, + "learning_rate": 9.213442079297583e-06, + "loss": 3.4417, + "step": 9254 + }, + { + "epoch": 0.7888008182050626, + "grad_norm": 57.84693890960173, + "learning_rate": 9.213175093339197e-06, + "loss": 4.7232, + "step": 9255 + }, + { + "epoch": 0.788886047899088, + "grad_norm": 40.327128782086724, + "learning_rate": 9.212908065945907e-06, + "loss": 4.6735, + "step": 9256 + }, + { + "epoch": 0.7889712775931135, + "grad_norm": 147.38794384564335, + "learning_rate": 9.212640997120341e-06, + "loss": 4.6731, + "step": 9257 + }, + { + "epoch": 0.7890565072871388, + "grad_norm": 38.12940227791322, + "learning_rate": 9.212373886865125e-06, + "loss": 4.5674, + "step": 9258 + }, + { + "epoch": 0.7891417369811642, + "grad_norm": 69.80490735845632, + "learning_rate": 9.212106735182884e-06, + "loss": 4.6302, + "step": 9259 + }, + { + "epoch": 0.7892269666751897, + "grad_norm": 37.84681021795972, + "learning_rate": 9.211839542076248e-06, + "loss": 2.6836, + "step": 9260 + }, + { + "epoch": 0.789312196369215, + "grad_norm": 59.530117863107364, + "learning_rate": 9.211572307547844e-06, + "loss": 3.989, + "step": 9261 + }, + { + "epoch": 0.7893974260632405, + "grad_norm": 94.74805035358354, + "learning_rate": 9.2113050316003e-06, + "loss": 4.8417, + "step": 9262 + }, + { + "epoch": 0.7894826557572658, + "grad_norm": 33.2719926440314, + "learning_rate": 9.211037714236243e-06, + "loss": 3.7325, + "step": 9263 + }, + { + "epoch": 0.7895678854512912, + "grad_norm": 30.635633129803725, + "learning_rate": 9.210770355458304e-06, + "loss": 4.2504, + "step": 9264 + }, + { + "epoch": 0.7896531151453167, + "grad_norm": 32.9964112340751, + "learning_rate": 9.210502955269114e-06, + "loss": 3.8617, + "step": 9265 + }, + { + "epoch": 0.789738344839342, + "grad_norm": 68.09750418024524, + "learning_rate": 9.210235513671297e-06, + "loss": 4.4032, + "step": 9266 + }, + { + "epoch": 0.7898235745333674, + "grad_norm": 33.644041926075495, + "learning_rate": 9.20996803066749e-06, + "loss": 3.8625, + "step": 9267 + }, + { + "epoch": 0.7899088042273928, + "grad_norm": 30.792137484611747, + "learning_rate": 9.209700506260318e-06, + "loss": 3.9194, + "step": 9268 + }, + { + "epoch": 0.7899940339214182, + "grad_norm": 82.88420221792423, + "learning_rate": 9.209432940452414e-06, + "loss": 4.1062, + "step": 9269 + }, + { + "epoch": 0.7900792636154437, + "grad_norm": 44.832943459297184, + "learning_rate": 9.209165333246411e-06, + "loss": 2.7923, + "step": 9270 + }, + { + "epoch": 0.790164493309469, + "grad_norm": 41.01288495850687, + "learning_rate": 9.20889768464494e-06, + "loss": 3.9621, + "step": 9271 + }, + { + "epoch": 0.7902497230034944, + "grad_norm": 71.58289168530645, + "learning_rate": 9.20862999465063e-06, + "loss": 3.2134, + "step": 9272 + }, + { + "epoch": 0.7903349526975199, + "grad_norm": 74.23455879881996, + "learning_rate": 9.20836226326612e-06, + "loss": 5.2698, + "step": 9273 + }, + { + "epoch": 0.7904201823915452, + "grad_norm": 55.802631716681326, + "learning_rate": 9.208094490494038e-06, + "loss": 4.4168, + "step": 9274 + }, + { + "epoch": 0.7905054120855706, + "grad_norm": 43.66211620852429, + "learning_rate": 9.207826676337017e-06, + "loss": 4.6714, + "step": 9275 + }, + { + "epoch": 0.790590641779596, + "grad_norm": 52.90914979571117, + "learning_rate": 9.207558820797696e-06, + "loss": 3.6798, + "step": 9276 + }, + { + "epoch": 0.7906758714736214, + "grad_norm": 32.314615912714864, + "learning_rate": 9.207290923878702e-06, + "loss": 3.8292, + "step": 9277 + }, + { + "epoch": 0.7907611011676469, + "grad_norm": 67.14714864865832, + "learning_rate": 9.207022985582677e-06, + "loss": 3.8472, + "step": 9278 + }, + { + "epoch": 0.7908463308616722, + "grad_norm": 46.16382820477323, + "learning_rate": 9.206755005912251e-06, + "loss": 4.3126, + "step": 9279 + }, + { + "epoch": 0.7909315605556976, + "grad_norm": 37.8418371843462, + "learning_rate": 9.20648698487006e-06, + "loss": 4.1137, + "step": 9280 + }, + { + "epoch": 0.791016790249723, + "grad_norm": 50.39339540183027, + "learning_rate": 9.206218922458744e-06, + "loss": 4.2096, + "step": 9281 + }, + { + "epoch": 0.7911020199437484, + "grad_norm": 101.99451885138562, + "learning_rate": 9.205950818680933e-06, + "loss": 5.5443, + "step": 9282 + }, + { + "epoch": 0.7911872496377738, + "grad_norm": 167.35623625364548, + "learning_rate": 9.20568267353927e-06, + "loss": 6.4968, + "step": 9283 + }, + { + "epoch": 0.7912724793317992, + "grad_norm": 40.5294670680945, + "learning_rate": 9.205414487036387e-06, + "loss": 2.5028, + "step": 9284 + }, + { + "epoch": 0.7913577090258246, + "grad_norm": 68.0105352859282, + "learning_rate": 9.205146259174925e-06, + "loss": 4.4437, + "step": 9285 + }, + { + "epoch": 0.7914429387198499, + "grad_norm": 32.59278895287791, + "learning_rate": 9.204877989957518e-06, + "loss": 3.3798, + "step": 9286 + }, + { + "epoch": 0.7915281684138754, + "grad_norm": 42.017405434699356, + "learning_rate": 9.204609679386809e-06, + "loss": 2.4821, + "step": 9287 + }, + { + "epoch": 0.7916133981079008, + "grad_norm": 49.02442470068858, + "learning_rate": 9.204341327465434e-06, + "loss": 3.4045, + "step": 9288 + }, + { + "epoch": 0.7916986278019262, + "grad_norm": 47.79117405327515, + "learning_rate": 9.204072934196033e-06, + "loss": 3.402, + "step": 9289 + }, + { + "epoch": 0.7917838574959516, + "grad_norm": 52.65749401390089, + "learning_rate": 9.203804499581246e-06, + "loss": 3.8163, + "step": 9290 + }, + { + "epoch": 0.791869087189977, + "grad_norm": 36.86130659944685, + "learning_rate": 9.203536023623711e-06, + "loss": 4.6962, + "step": 9291 + }, + { + "epoch": 0.7919543168840024, + "grad_norm": 36.26277780675657, + "learning_rate": 9.20326750632607e-06, + "loss": 4.185, + "step": 9292 + }, + { + "epoch": 0.7920395465780278, + "grad_norm": 62.07696010391902, + "learning_rate": 9.202998947690962e-06, + "loss": 3.6754, + "step": 9293 + }, + { + "epoch": 0.7921247762720531, + "grad_norm": 146.41011882802647, + "learning_rate": 9.202730347721031e-06, + "loss": 3.0191, + "step": 9294 + }, + { + "epoch": 0.7922100059660786, + "grad_norm": 89.2297030034098, + "learning_rate": 9.202461706418917e-06, + "loss": 4.5532, + "step": 9295 + }, + { + "epoch": 0.792295235660104, + "grad_norm": 28.895963333135573, + "learning_rate": 9.20219302378726e-06, + "loss": 3.1387, + "step": 9296 + }, + { + "epoch": 0.7923804653541294, + "grad_norm": 48.9501465791504, + "learning_rate": 9.201924299828707e-06, + "loss": 3.7101, + "step": 9297 + }, + { + "epoch": 0.7924656950481548, + "grad_norm": 71.22290043455865, + "learning_rate": 9.201655534545897e-06, + "loss": 3.8276, + "step": 9298 + }, + { + "epoch": 0.7925509247421801, + "grad_norm": 86.98578851654622, + "learning_rate": 9.201386727941475e-06, + "loss": 4.8232, + "step": 9299 + }, + { + "epoch": 0.7926361544362056, + "grad_norm": 49.219951246978006, + "learning_rate": 9.201117880018083e-06, + "loss": 4.4323, + "step": 9300 + }, + { + "epoch": 0.792721384130231, + "grad_norm": 57.51223267291046, + "learning_rate": 9.200848990778367e-06, + "loss": 4.5618, + "step": 9301 + }, + { + "epoch": 0.7928066138242563, + "grad_norm": 57.83820061609448, + "learning_rate": 9.20058006022497e-06, + "loss": 4.3368, + "step": 9302 + }, + { + "epoch": 0.7928918435182818, + "grad_norm": 115.81676090583771, + "learning_rate": 9.200311088360538e-06, + "loss": 5.4355, + "step": 9303 + }, + { + "epoch": 0.7929770732123071, + "grad_norm": 80.23668250394529, + "learning_rate": 9.200042075187715e-06, + "loss": 4.7811, + "step": 9304 + }, + { + "epoch": 0.7930623029063326, + "grad_norm": 29.60186519294695, + "learning_rate": 9.199773020709147e-06, + "loss": 3.7109, + "step": 9305 + }, + { + "epoch": 0.793147532600358, + "grad_norm": 72.08116902871652, + "learning_rate": 9.199503924927478e-06, + "loss": 4.4161, + "step": 9306 + }, + { + "epoch": 0.7932327622943833, + "grad_norm": 46.737261018382775, + "learning_rate": 9.19923478784536e-06, + "loss": 3.5563, + "step": 9307 + }, + { + "epoch": 0.7933179919884088, + "grad_norm": 40.615130811643446, + "learning_rate": 9.198965609465434e-06, + "loss": 3.9769, + "step": 9308 + }, + { + "epoch": 0.7934032216824342, + "grad_norm": 33.778638365935215, + "learning_rate": 9.198696389790352e-06, + "loss": 3.5735, + "step": 9309 + }, + { + "epoch": 0.7934884513764595, + "grad_norm": 37.452828882970664, + "learning_rate": 9.198427128822758e-06, + "loss": 3.0156, + "step": 9310 + }, + { + "epoch": 0.793573681070485, + "grad_norm": 33.66159402454799, + "learning_rate": 9.198157826565301e-06, + "loss": 3.737, + "step": 9311 + }, + { + "epoch": 0.7936589107645103, + "grad_norm": 35.59585611103501, + "learning_rate": 9.19788848302063e-06, + "loss": 4.5381, + "step": 9312 + }, + { + "epoch": 0.7937441404585358, + "grad_norm": 42.37281640486715, + "learning_rate": 9.197619098191395e-06, + "loss": 3.6931, + "step": 9313 + }, + { + "epoch": 0.7938293701525612, + "grad_norm": 38.93581474910599, + "learning_rate": 9.197349672080245e-06, + "loss": 4.2659, + "step": 9314 + }, + { + "epoch": 0.7939145998465865, + "grad_norm": 48.16257702540365, + "learning_rate": 9.197080204689827e-06, + "loss": 4.0896, + "step": 9315 + }, + { + "epoch": 0.793999829540612, + "grad_norm": 58.51191198680842, + "learning_rate": 9.196810696022792e-06, + "loss": 3.9174, + "step": 9316 + }, + { + "epoch": 0.7940850592346373, + "grad_norm": 79.06384757058953, + "learning_rate": 9.196541146081794e-06, + "loss": 3.301, + "step": 9317 + }, + { + "epoch": 0.7941702889286627, + "grad_norm": 41.12472381709627, + "learning_rate": 9.19627155486948e-06, + "loss": 3.8536, + "step": 9318 + }, + { + "epoch": 0.7942555186226882, + "grad_norm": 37.87001802194174, + "learning_rate": 9.196001922388504e-06, + "loss": 3.9908, + "step": 9319 + }, + { + "epoch": 0.7943407483167135, + "grad_norm": 43.638018458289835, + "learning_rate": 9.195732248641514e-06, + "loss": 4.4437, + "step": 9320 + }, + { + "epoch": 0.7944259780107389, + "grad_norm": 38.50948991695033, + "learning_rate": 9.195462533631166e-06, + "loss": 2.7563, + "step": 9321 + }, + { + "epoch": 0.7945112077047644, + "grad_norm": 33.6387882539349, + "learning_rate": 9.195192777360111e-06, + "loss": 3.8497, + "step": 9322 + }, + { + "epoch": 0.7945964373987897, + "grad_norm": 42.63202804416761, + "learning_rate": 9.194922979831002e-06, + "loss": 3.5694, + "step": 9323 + }, + { + "epoch": 0.7946816670928152, + "grad_norm": 41.57189157984406, + "learning_rate": 9.194653141046492e-06, + "loss": 4.4257, + "step": 9324 + }, + { + "epoch": 0.7947668967868405, + "grad_norm": 38.57264890269854, + "learning_rate": 9.194383261009237e-06, + "loss": 3.5481, + "step": 9325 + }, + { + "epoch": 0.7948521264808659, + "grad_norm": 96.13667273882942, + "learning_rate": 9.194113339721887e-06, + "loss": 3.8863, + "step": 9326 + }, + { + "epoch": 0.7949373561748914, + "grad_norm": 38.62182383295695, + "learning_rate": 9.193843377187101e-06, + "loss": 3.9171, + "step": 9327 + }, + { + "epoch": 0.7950225858689167, + "grad_norm": 110.17438737847452, + "learning_rate": 9.19357337340753e-06, + "loss": 3.3776, + "step": 9328 + }, + { + "epoch": 0.7951078155629421, + "grad_norm": 36.55107897554351, + "learning_rate": 9.193303328385833e-06, + "loss": 4.0952, + "step": 9329 + }, + { + "epoch": 0.7951930452569675, + "grad_norm": 27.11307309063442, + "learning_rate": 9.193033242124661e-06, + "loss": 2.7469, + "step": 9330 + }, + { + "epoch": 0.7952782749509929, + "grad_norm": 32.97886418601086, + "learning_rate": 9.192763114626677e-06, + "loss": 4.0607, + "step": 9331 + }, + { + "epoch": 0.7953635046450184, + "grad_norm": 53.399803259010476, + "learning_rate": 9.192492945894532e-06, + "loss": 4.3463, + "step": 9332 + }, + { + "epoch": 0.7954487343390437, + "grad_norm": 46.919985910563454, + "learning_rate": 9.192222735930883e-06, + "loss": 4.429, + "step": 9333 + }, + { + "epoch": 0.7955339640330691, + "grad_norm": 38.14688314018628, + "learning_rate": 9.191952484738392e-06, + "loss": 3.4072, + "step": 9334 + }, + { + "epoch": 0.7956191937270946, + "grad_norm": 64.51899518276636, + "learning_rate": 9.191682192319713e-06, + "loss": 4.355, + "step": 9335 + }, + { + "epoch": 0.7957044234211199, + "grad_norm": 50.43051036040367, + "learning_rate": 9.191411858677505e-06, + "loss": 4.3856, + "step": 9336 + }, + { + "epoch": 0.7957896531151453, + "grad_norm": 44.058552036586335, + "learning_rate": 9.191141483814428e-06, + "loss": 3.8434, + "step": 9337 + }, + { + "epoch": 0.7958748828091707, + "grad_norm": 49.523358195811284, + "learning_rate": 9.190871067733138e-06, + "loss": 4.5289, + "step": 9338 + }, + { + "epoch": 0.7959601125031961, + "grad_norm": 42.41972174791392, + "learning_rate": 9.190600610436297e-06, + "loss": 3.9865, + "step": 9339 + }, + { + "epoch": 0.7960453421972216, + "grad_norm": 60.12228003336725, + "learning_rate": 9.190330111926564e-06, + "loss": 5.2638, + "step": 9340 + }, + { + "epoch": 0.7961305718912469, + "grad_norm": 45.402894847804006, + "learning_rate": 9.1900595722066e-06, + "loss": 4.076, + "step": 9341 + }, + { + "epoch": 0.7962158015852723, + "grad_norm": 60.515896472835436, + "learning_rate": 9.189788991279065e-06, + "loss": 3.1713, + "step": 9342 + }, + { + "epoch": 0.7963010312792977, + "grad_norm": 49.17595716467837, + "learning_rate": 9.189518369146621e-06, + "loss": 4.1375, + "step": 9343 + }, + { + "epoch": 0.7963862609733231, + "grad_norm": 54.2633188342917, + "learning_rate": 9.189247705811928e-06, + "loss": 4.1638, + "step": 9344 + }, + { + "epoch": 0.7964714906673485, + "grad_norm": 28.5725050406406, + "learning_rate": 9.188977001277648e-06, + "loss": 3.6099, + "step": 9345 + }, + { + "epoch": 0.7965567203613739, + "grad_norm": 67.93673206107272, + "learning_rate": 9.188706255546444e-06, + "loss": 4.1647, + "step": 9346 + }, + { + "epoch": 0.7966419500553993, + "grad_norm": 52.20819964974687, + "learning_rate": 9.188435468620977e-06, + "loss": 4.3989, + "step": 9347 + }, + { + "epoch": 0.7967271797494248, + "grad_norm": 80.28817053114643, + "learning_rate": 9.188164640503913e-06, + "loss": 3.85, + "step": 9348 + }, + { + "epoch": 0.7968124094434501, + "grad_norm": 45.95505620951203, + "learning_rate": 9.187893771197916e-06, + "loss": 4.4139, + "step": 9349 + }, + { + "epoch": 0.7968976391374755, + "grad_norm": 61.197743181898936, + "learning_rate": 9.187622860705645e-06, + "loss": 3.5442, + "step": 9350 + }, + { + "epoch": 0.7969828688315009, + "grad_norm": 56.374512789402075, + "learning_rate": 9.187351909029768e-06, + "loss": 4.4401, + "step": 9351 + }, + { + "epoch": 0.7970680985255263, + "grad_norm": 56.25008622612456, + "learning_rate": 9.187080916172948e-06, + "loss": 3.81, + "step": 9352 + }, + { + "epoch": 0.7971533282195516, + "grad_norm": 35.55554561056044, + "learning_rate": 9.186809882137851e-06, + "loss": 3.9299, + "step": 9353 + }, + { + "epoch": 0.7972385579135771, + "grad_norm": 41.18622575261113, + "learning_rate": 9.186538806927143e-06, + "loss": 4.3075, + "step": 9354 + }, + { + "epoch": 0.7973237876076025, + "grad_norm": 33.91522457455249, + "learning_rate": 9.18626769054349e-06, + "loss": 2.5706, + "step": 9355 + }, + { + "epoch": 0.7974090173016279, + "grad_norm": 67.8338691158056, + "learning_rate": 9.185996532989557e-06, + "loss": 4.0253, + "step": 9356 + }, + { + "epoch": 0.7974942469956533, + "grad_norm": 34.58032689657935, + "learning_rate": 9.185725334268011e-06, + "loss": 3.5003, + "step": 9357 + }, + { + "epoch": 0.7975794766896787, + "grad_norm": 55.12276172442596, + "learning_rate": 9.185454094381522e-06, + "loss": 4.4407, + "step": 9358 + }, + { + "epoch": 0.7976647063837041, + "grad_norm": 36.98537956061708, + "learning_rate": 9.185182813332753e-06, + "loss": 3.8811, + "step": 9359 + }, + { + "epoch": 0.7977499360777295, + "grad_norm": 60.50765142823166, + "learning_rate": 9.184911491124374e-06, + "loss": 4.4322, + "step": 9360 + }, + { + "epoch": 0.7978351657717548, + "grad_norm": 35.80642078158607, + "learning_rate": 9.184640127759055e-06, + "loss": 3.2486, + "step": 9361 + }, + { + "epoch": 0.7979203954657803, + "grad_norm": 35.43989686721344, + "learning_rate": 9.184368723239462e-06, + "loss": 3.9213, + "step": 9362 + }, + { + "epoch": 0.7980056251598057, + "grad_norm": 32.742560638001386, + "learning_rate": 9.184097277568264e-06, + "loss": 3.5949, + "step": 9363 + }, + { + "epoch": 0.798090854853831, + "grad_norm": 41.13380042808851, + "learning_rate": 9.183825790748135e-06, + "loss": 4.7416, + "step": 9364 + }, + { + "epoch": 0.7981760845478565, + "grad_norm": 72.9561745523713, + "learning_rate": 9.18355426278174e-06, + "loss": 4.8229, + "step": 9365 + }, + { + "epoch": 0.7982613142418818, + "grad_norm": 99.79665186515676, + "learning_rate": 9.183282693671752e-06, + "loss": 3.3776, + "step": 9366 + }, + { + "epoch": 0.7983465439359073, + "grad_norm": 47.93771412104897, + "learning_rate": 9.183011083420838e-06, + "loss": 3.4784, + "step": 9367 + }, + { + "epoch": 0.7984317736299327, + "grad_norm": 39.63211708620573, + "learning_rate": 9.182739432031675e-06, + "loss": 4.0705, + "step": 9368 + }, + { + "epoch": 0.798517003323958, + "grad_norm": 69.62153576439984, + "learning_rate": 9.182467739506931e-06, + "loss": 4.8067, + "step": 9369 + }, + { + "epoch": 0.7986022330179835, + "grad_norm": 24.60910542843322, + "learning_rate": 9.18219600584928e-06, + "loss": 3.0626, + "step": 9370 + }, + { + "epoch": 0.7986874627120089, + "grad_norm": 38.472028490803275, + "learning_rate": 9.181924231061391e-06, + "loss": 4.5136, + "step": 9371 + }, + { + "epoch": 0.7987726924060342, + "grad_norm": 144.04046482704834, + "learning_rate": 9.18165241514594e-06, + "loss": 5.388, + "step": 9372 + }, + { + "epoch": 0.7988579221000597, + "grad_norm": 54.98669599859775, + "learning_rate": 9.1813805581056e-06, + "loss": 4.5676, + "step": 9373 + }, + { + "epoch": 0.798943151794085, + "grad_norm": 53.29749594704657, + "learning_rate": 9.181108659943042e-06, + "loss": 4.3659, + "step": 9374 + }, + { + "epoch": 0.7990283814881105, + "grad_norm": 32.12350915439098, + "learning_rate": 9.180836720660941e-06, + "loss": 3.7401, + "step": 9375 + }, + { + "epoch": 0.7991136111821359, + "grad_norm": 51.7976012705069, + "learning_rate": 9.180564740261974e-06, + "loss": 4.2602, + "step": 9376 + }, + { + "epoch": 0.7991988408761612, + "grad_norm": 56.826958979501335, + "learning_rate": 9.180292718748814e-06, + "loss": 2.7992, + "step": 9377 + }, + { + "epoch": 0.7992840705701867, + "grad_norm": 33.60859153823067, + "learning_rate": 9.180020656124137e-06, + "loss": 3.0592, + "step": 9378 + }, + { + "epoch": 0.799369300264212, + "grad_norm": 41.453575034273726, + "learning_rate": 9.179748552390616e-06, + "loss": 4.1253, + "step": 9379 + }, + { + "epoch": 0.7994545299582374, + "grad_norm": 38.00237562358695, + "learning_rate": 9.17947640755093e-06, + "loss": 3.4394, + "step": 9380 + }, + { + "epoch": 0.7995397596522629, + "grad_norm": 47.80703637397388, + "learning_rate": 9.179204221607754e-06, + "loss": 3.5585, + "step": 9381 + }, + { + "epoch": 0.7996249893462882, + "grad_norm": 66.8487926216865, + "learning_rate": 9.178931994563765e-06, + "loss": 4.9463, + "step": 9382 + }, + { + "epoch": 0.7997102190403137, + "grad_norm": 62.31087422335572, + "learning_rate": 9.178659726421639e-06, + "loss": 5.06, + "step": 9383 + }, + { + "epoch": 0.799795448734339, + "grad_norm": 41.68818484995314, + "learning_rate": 9.178387417184057e-06, + "loss": 3.9096, + "step": 9384 + }, + { + "epoch": 0.7998806784283644, + "grad_norm": 45.781024717201404, + "learning_rate": 9.178115066853694e-06, + "loss": 5.0962, + "step": 9385 + }, + { + "epoch": 0.7999659081223899, + "grad_norm": 37.97643640747391, + "learning_rate": 9.17784267543323e-06, + "loss": 4.2409, + "step": 9386 + }, + { + "epoch": 0.8000511378164152, + "grad_norm": 56.86881979457853, + "learning_rate": 9.177570242925344e-06, + "loss": 4.9031, + "step": 9387 + }, + { + "epoch": 0.8001363675104406, + "grad_norm": 54.175142957518894, + "learning_rate": 9.177297769332714e-06, + "loss": 4.2834, + "step": 9388 + }, + { + "epoch": 0.8002215972044661, + "grad_norm": 94.19798549272585, + "learning_rate": 9.17702525465802e-06, + "loss": 4.135, + "step": 9389 + }, + { + "epoch": 0.8003068268984914, + "grad_norm": 63.83450931930138, + "learning_rate": 9.176752698903943e-06, + "loss": 5.1586, + "step": 9390 + }, + { + "epoch": 0.8003920565925169, + "grad_norm": 80.83299538211023, + "learning_rate": 9.176480102073163e-06, + "loss": 4.391, + "step": 9391 + }, + { + "epoch": 0.8004772862865422, + "grad_norm": 33.24973076953315, + "learning_rate": 9.17620746416836e-06, + "loss": 3.9854, + "step": 9392 + }, + { + "epoch": 0.8005625159805676, + "grad_norm": 31.425385322638505, + "learning_rate": 9.175934785192218e-06, + "loss": 2.8028, + "step": 9393 + }, + { + "epoch": 0.8006477456745931, + "grad_norm": 56.41268857925922, + "learning_rate": 9.175662065147413e-06, + "loss": 4.3696, + "step": 9394 + }, + { + "epoch": 0.8007329753686184, + "grad_norm": 42.91322855288378, + "learning_rate": 9.175389304036635e-06, + "loss": 4.269, + "step": 9395 + }, + { + "epoch": 0.8008182050626438, + "grad_norm": 45.11325922401882, + "learning_rate": 9.175116501862559e-06, + "loss": 3.1785, + "step": 9396 + }, + { + "epoch": 0.8009034347566693, + "grad_norm": 51.4155217993694, + "learning_rate": 9.174843658627873e-06, + "loss": 3.3892, + "step": 9397 + }, + { + "epoch": 0.8009886644506946, + "grad_norm": 62.15176304415181, + "learning_rate": 9.174570774335258e-06, + "loss": 5.321, + "step": 9398 + }, + { + "epoch": 0.80107389414472, + "grad_norm": 45.976583188803716, + "learning_rate": 9.1742978489874e-06, + "loss": 4.2082, + "step": 9399 + }, + { + "epoch": 0.8011591238387454, + "grad_norm": 67.25516606889012, + "learning_rate": 9.174024882586979e-06, + "loss": 4.6238, + "step": 9400 + }, + { + "epoch": 0.8012443535327708, + "grad_norm": 56.5690517300726, + "learning_rate": 9.173751875136683e-06, + "loss": 4.3944, + "step": 9401 + }, + { + "epoch": 0.8013295832267963, + "grad_norm": 37.792908598896716, + "learning_rate": 9.173478826639195e-06, + "loss": 3.9722, + "step": 9402 + }, + { + "epoch": 0.8014148129208216, + "grad_norm": 46.38970395658825, + "learning_rate": 9.173205737097201e-06, + "loss": 4.1826, + "step": 9403 + }, + { + "epoch": 0.801500042614847, + "grad_norm": 47.71523620345479, + "learning_rate": 9.172932606513387e-06, + "loss": 4.4551, + "step": 9404 + }, + { + "epoch": 0.8015852723088724, + "grad_norm": 96.90997073618676, + "learning_rate": 9.17265943489044e-06, + "loss": 4.7091, + "step": 9405 + }, + { + "epoch": 0.8016705020028978, + "grad_norm": 77.36485451677237, + "learning_rate": 9.172386222231044e-06, + "loss": 4.7044, + "step": 9406 + }, + { + "epoch": 0.8017557316969232, + "grad_norm": 82.79175122977854, + "learning_rate": 9.17211296853789e-06, + "loss": 4.9733, + "step": 9407 + }, + { + "epoch": 0.8018409613909486, + "grad_norm": 45.37065631748411, + "learning_rate": 9.17183967381366e-06, + "loss": 4.9904, + "step": 9408 + }, + { + "epoch": 0.801926191084974, + "grad_norm": 49.9642110878681, + "learning_rate": 9.171566338061047e-06, + "loss": 4.2476, + "step": 9409 + }, + { + "epoch": 0.8020114207789995, + "grad_norm": 28.136158492735078, + "learning_rate": 9.171292961282734e-06, + "loss": 2.3919, + "step": 9410 + }, + { + "epoch": 0.8020966504730248, + "grad_norm": 51.04260533009355, + "learning_rate": 9.171019543481414e-06, + "loss": 3.9599, + "step": 9411 + }, + { + "epoch": 0.8021818801670502, + "grad_norm": 37.004364545298984, + "learning_rate": 9.170746084659773e-06, + "loss": 3.889, + "step": 9412 + }, + { + "epoch": 0.8022671098610756, + "grad_norm": 48.99273561635117, + "learning_rate": 9.170472584820502e-06, + "loss": 3.7713, + "step": 9413 + }, + { + "epoch": 0.802352339555101, + "grad_norm": 39.73629772186979, + "learning_rate": 9.170199043966292e-06, + "loss": 4.0992, + "step": 9414 + }, + { + "epoch": 0.8024375692491263, + "grad_norm": 35.4901650785581, + "learning_rate": 9.169925462099829e-06, + "loss": 3.4957, + "step": 9415 + }, + { + "epoch": 0.8025227989431518, + "grad_norm": 60.18856542178067, + "learning_rate": 9.169651839223807e-06, + "loss": 4.9215, + "step": 9416 + }, + { + "epoch": 0.8026080286371772, + "grad_norm": 90.34515581009231, + "learning_rate": 9.169378175340918e-06, + "loss": 4.4234, + "step": 9417 + }, + { + "epoch": 0.8026932583312026, + "grad_norm": 39.08951524877765, + "learning_rate": 9.169104470453848e-06, + "loss": 4.1939, + "step": 9418 + }, + { + "epoch": 0.802778488025228, + "grad_norm": 53.926850326167596, + "learning_rate": 9.168830724565295e-06, + "loss": 5.048, + "step": 9419 + }, + { + "epoch": 0.8028637177192534, + "grad_norm": 57.534144439148605, + "learning_rate": 9.168556937677946e-06, + "loss": 4.2715, + "step": 9420 + }, + { + "epoch": 0.8029489474132788, + "grad_norm": 111.5498447596735, + "learning_rate": 9.168283109794498e-06, + "loss": 5.1611, + "step": 9421 + }, + { + "epoch": 0.8030341771073042, + "grad_norm": 45.6655738125588, + "learning_rate": 9.16800924091764e-06, + "loss": 3.711, + "step": 9422 + }, + { + "epoch": 0.8031194068013295, + "grad_norm": 28.63806161460342, + "learning_rate": 9.167735331050069e-06, + "loss": 3.3463, + "step": 9423 + }, + { + "epoch": 0.803204636495355, + "grad_norm": 49.616906557741, + "learning_rate": 9.167461380194475e-06, + "loss": 4.0875, + "step": 9424 + }, + { + "epoch": 0.8032898661893804, + "grad_norm": 56.268381703948336, + "learning_rate": 9.167187388353556e-06, + "loss": 4.3433, + "step": 9425 + }, + { + "epoch": 0.8033750958834058, + "grad_norm": 71.84348694012078, + "learning_rate": 9.166913355530005e-06, + "loss": 4.3786, + "step": 9426 + }, + { + "epoch": 0.8034603255774312, + "grad_norm": 75.56352642681266, + "learning_rate": 9.166639281726517e-06, + "loss": 4.6016, + "step": 9427 + }, + { + "epoch": 0.8035455552714565, + "grad_norm": 44.08606729486417, + "learning_rate": 9.166365166945786e-06, + "loss": 3.8192, + "step": 9428 + }, + { + "epoch": 0.803630784965482, + "grad_norm": 53.28774543689749, + "learning_rate": 9.16609101119051e-06, + "loss": 3.7479, + "step": 9429 + }, + { + "epoch": 0.8037160146595074, + "grad_norm": 45.19328732629132, + "learning_rate": 9.165816814463384e-06, + "loss": 3.2897, + "step": 9430 + }, + { + "epoch": 0.8038012443535327, + "grad_norm": 40.84337105114004, + "learning_rate": 9.165542576767106e-06, + "loss": 3.537, + "step": 9431 + }, + { + "epoch": 0.8038864740475582, + "grad_norm": 115.27723841274205, + "learning_rate": 9.16526829810437e-06, + "loss": 3.7557, + "step": 9432 + }, + { + "epoch": 0.8039717037415836, + "grad_norm": 52.571789069125764, + "learning_rate": 9.164993978477879e-06, + "loss": 3.6355, + "step": 9433 + }, + { + "epoch": 0.8040569334356089, + "grad_norm": 66.02604956549615, + "learning_rate": 9.164719617890322e-06, + "loss": 4.7235, + "step": 9434 + }, + { + "epoch": 0.8041421631296344, + "grad_norm": 69.75333086809368, + "learning_rate": 9.164445216344408e-06, + "loss": 4.5411, + "step": 9435 + }, + { + "epoch": 0.8042273928236597, + "grad_norm": 32.05560363959672, + "learning_rate": 9.164170773842827e-06, + "loss": 4.0622, + "step": 9436 + }, + { + "epoch": 0.8043126225176852, + "grad_norm": 136.51136653837645, + "learning_rate": 9.163896290388281e-06, + "loss": 4.8843, + "step": 9437 + }, + { + "epoch": 0.8043978522117106, + "grad_norm": 36.417461811533364, + "learning_rate": 9.16362176598347e-06, + "loss": 3.7289, + "step": 9438 + }, + { + "epoch": 0.8044830819057359, + "grad_norm": 33.15474209213084, + "learning_rate": 9.163347200631095e-06, + "loss": 2.9989, + "step": 9439 + }, + { + "epoch": 0.8045683115997614, + "grad_norm": 54.33516167358976, + "learning_rate": 9.163072594333853e-06, + "loss": 3.6519, + "step": 9440 + }, + { + "epoch": 0.8046535412937867, + "grad_norm": 78.07636447017767, + "learning_rate": 9.162797947094447e-06, + "loss": 4.4013, + "step": 9441 + }, + { + "epoch": 0.8047387709878121, + "grad_norm": 37.95266381672845, + "learning_rate": 9.162523258915575e-06, + "loss": 3.1613, + "step": 9442 + }, + { + "epoch": 0.8048240006818376, + "grad_norm": 63.449672919446364, + "learning_rate": 9.162248529799942e-06, + "loss": 4.6787, + "step": 9443 + }, + { + "epoch": 0.8049092303758629, + "grad_norm": 78.1260042687955, + "learning_rate": 9.16197375975025e-06, + "loss": 4.7199, + "step": 9444 + }, + { + "epoch": 0.8049944600698884, + "grad_norm": 57.73232252066487, + "learning_rate": 9.161698948769199e-06, + "loss": 4.2086, + "step": 9445 + }, + { + "epoch": 0.8050796897639138, + "grad_norm": 64.99991921122611, + "learning_rate": 9.161424096859492e-06, + "loss": 4.4741, + "step": 9446 + }, + { + "epoch": 0.8051649194579391, + "grad_norm": 81.08180785199184, + "learning_rate": 9.161149204023833e-06, + "loss": 5.711, + "step": 9447 + }, + { + "epoch": 0.8052501491519646, + "grad_norm": 43.15949779250836, + "learning_rate": 9.160874270264926e-06, + "loss": 3.1987, + "step": 9448 + }, + { + "epoch": 0.8053353788459899, + "grad_norm": 52.11049735547966, + "learning_rate": 9.160599295585473e-06, + "loss": 3.5482, + "step": 9449 + }, + { + "epoch": 0.8054206085400153, + "grad_norm": 72.41997418465144, + "learning_rate": 9.160324279988179e-06, + "loss": 4.488, + "step": 9450 + }, + { + "epoch": 0.8055058382340408, + "grad_norm": 34.704811289439796, + "learning_rate": 9.16004922347575e-06, + "loss": 5.0943, + "step": 9451 + }, + { + "epoch": 0.8055910679280661, + "grad_norm": 52.6644298326982, + "learning_rate": 9.159774126050888e-06, + "loss": 4.6434, + "step": 9452 + }, + { + "epoch": 0.8056762976220916, + "grad_norm": 87.72575373870762, + "learning_rate": 9.159498987716303e-06, + "loss": 4.8443, + "step": 9453 + }, + { + "epoch": 0.8057615273161169, + "grad_norm": 65.58940105018829, + "learning_rate": 9.159223808474696e-06, + "loss": 4.5922, + "step": 9454 + }, + { + "epoch": 0.8058467570101423, + "grad_norm": 37.909682256351836, + "learning_rate": 9.158948588328776e-06, + "loss": 3.8889, + "step": 9455 + }, + { + "epoch": 0.8059319867041678, + "grad_norm": 59.90112377582646, + "learning_rate": 9.15867332728125e-06, + "loss": 4.6537, + "step": 9456 + }, + { + "epoch": 0.8060172163981931, + "grad_norm": 39.52221038055008, + "learning_rate": 9.158398025334823e-06, + "loss": 4.0226, + "step": 9457 + }, + { + "epoch": 0.8061024460922185, + "grad_norm": 67.7254582207089, + "learning_rate": 9.158122682492206e-06, + "loss": 3.7716, + "step": 9458 + }, + { + "epoch": 0.806187675786244, + "grad_norm": 34.4073640465141, + "learning_rate": 9.157847298756102e-06, + "loss": 3.3191, + "step": 9459 + }, + { + "epoch": 0.8062729054802693, + "grad_norm": 47.110922933703264, + "learning_rate": 9.157571874129226e-06, + "loss": 4.5971, + "step": 9460 + }, + { + "epoch": 0.8063581351742948, + "grad_norm": 24.51479634139947, + "learning_rate": 9.157296408614278e-06, + "loss": 3.1522, + "step": 9461 + }, + { + "epoch": 0.8064433648683201, + "grad_norm": 153.9726358598846, + "learning_rate": 9.157020902213974e-06, + "loss": 5.4548, + "step": 9462 + }, + { + "epoch": 0.8065285945623455, + "grad_norm": 57.527202690782055, + "learning_rate": 9.156745354931021e-06, + "loss": 4.5499, + "step": 9463 + }, + { + "epoch": 0.806613824256371, + "grad_norm": 55.33039525246987, + "learning_rate": 9.15646976676813e-06, + "loss": 3.6242, + "step": 9464 + }, + { + "epoch": 0.8066990539503963, + "grad_norm": 88.58213694014289, + "learning_rate": 9.156194137728009e-06, + "loss": 4.8989, + "step": 9465 + }, + { + "epoch": 0.8067842836444217, + "grad_norm": 41.099085754936006, + "learning_rate": 9.155918467813372e-06, + "loss": 3.079, + "step": 9466 + }, + { + "epoch": 0.8068695133384471, + "grad_norm": 82.56636350210441, + "learning_rate": 9.155642757026927e-06, + "loss": 5.0888, + "step": 9467 + }, + { + "epoch": 0.8069547430324725, + "grad_norm": 85.24856911377508, + "learning_rate": 9.155367005371387e-06, + "loss": 2.7932, + "step": 9468 + }, + { + "epoch": 0.807039972726498, + "grad_norm": 38.050720413295096, + "learning_rate": 9.155091212849463e-06, + "loss": 3.0171, + "step": 9469 + }, + { + "epoch": 0.8071252024205233, + "grad_norm": 38.482578514058844, + "learning_rate": 9.154815379463869e-06, + "loss": 3.5194, + "step": 9470 + }, + { + "epoch": 0.8072104321145487, + "grad_norm": 46.912501628718836, + "learning_rate": 9.154539505217316e-06, + "loss": 3.8926, + "step": 9471 + }, + { + "epoch": 0.8072956618085741, + "grad_norm": 43.58606744379009, + "learning_rate": 9.154263590112518e-06, + "loss": 4.0999, + "step": 9472 + }, + { + "epoch": 0.8073808915025995, + "grad_norm": 80.55357569493195, + "learning_rate": 9.153987634152189e-06, + "loss": 3.7224, + "step": 9473 + }, + { + "epoch": 0.8074661211966249, + "grad_norm": 50.892130138493094, + "learning_rate": 9.153711637339042e-06, + "loss": 4.4755, + "step": 9474 + }, + { + "epoch": 0.8075513508906503, + "grad_norm": 50.028650112139495, + "learning_rate": 9.153435599675791e-06, + "loss": 4.5199, + "step": 9475 + }, + { + "epoch": 0.8076365805846757, + "grad_norm": 63.13914503656268, + "learning_rate": 9.153159521165151e-06, + "loss": 4.6874, + "step": 9476 + }, + { + "epoch": 0.807721810278701, + "grad_norm": 44.946490460215415, + "learning_rate": 9.152883401809838e-06, + "loss": 2.8577, + "step": 9477 + }, + { + "epoch": 0.8078070399727265, + "grad_norm": 35.2621339406568, + "learning_rate": 9.152607241612567e-06, + "loss": 3.5116, + "step": 9478 + }, + { + "epoch": 0.8078922696667519, + "grad_norm": 51.649373732760054, + "learning_rate": 9.152331040576055e-06, + "loss": 4.2454, + "step": 9479 + }, + { + "epoch": 0.8079774993607773, + "grad_norm": 1319.4379366078508, + "learning_rate": 9.152054798703015e-06, + "loss": 5.133, + "step": 9480 + }, + { + "epoch": 0.8080627290548027, + "grad_norm": 57.32216174903024, + "learning_rate": 9.151778515996167e-06, + "loss": 4.0867, + "step": 9481 + }, + { + "epoch": 0.808147958748828, + "grad_norm": 41.642548008221105, + "learning_rate": 9.151502192458226e-06, + "loss": 4.3833, + "step": 9482 + }, + { + "epoch": 0.8082331884428535, + "grad_norm": 75.95164329722277, + "learning_rate": 9.151225828091911e-06, + "loss": 4.2785, + "step": 9483 + }, + { + "epoch": 0.8083184181368789, + "grad_norm": 38.025568379106225, + "learning_rate": 9.150949422899942e-06, + "loss": 3.5441, + "step": 9484 + }, + { + "epoch": 0.8084036478309042, + "grad_norm": 39.892955165114564, + "learning_rate": 9.150672976885031e-06, + "loss": 3.6036, + "step": 9485 + }, + { + "epoch": 0.8084888775249297, + "grad_norm": 61.51263454163478, + "learning_rate": 9.150396490049904e-06, + "loss": 5.6591, + "step": 9486 + }, + { + "epoch": 0.8085741072189551, + "grad_norm": 63.88023441694558, + "learning_rate": 9.150119962397275e-06, + "loss": 4.1377, + "step": 9487 + }, + { + "epoch": 0.8086593369129805, + "grad_norm": 29.608744112184215, + "learning_rate": 9.149843393929865e-06, + "loss": 3.9761, + "step": 9488 + }, + { + "epoch": 0.8087445666070059, + "grad_norm": 29.40196829980848, + "learning_rate": 9.149566784650394e-06, + "loss": 2.3507, + "step": 9489 + }, + { + "epoch": 0.8088297963010312, + "grad_norm": 69.383864367176, + "learning_rate": 9.149290134561583e-06, + "loss": 4.0801, + "step": 9490 + }, + { + "epoch": 0.8089150259950567, + "grad_norm": 49.57622511827431, + "learning_rate": 9.149013443666153e-06, + "loss": 4.6554, + "step": 9491 + }, + { + "epoch": 0.8090002556890821, + "grad_norm": 66.27869219122039, + "learning_rate": 9.148736711966825e-06, + "loss": 3.8903, + "step": 9492 + }, + { + "epoch": 0.8090854853831074, + "grad_norm": 52.90507201561388, + "learning_rate": 9.148459939466319e-06, + "loss": 4.601, + "step": 9493 + }, + { + "epoch": 0.8091707150771329, + "grad_norm": 43.921502164290246, + "learning_rate": 9.148183126167358e-06, + "loss": 4.1328, + "step": 9494 + }, + { + "epoch": 0.8092559447711583, + "grad_norm": 30.7167733007947, + "learning_rate": 9.147906272072664e-06, + "loss": 3.8778, + "step": 9495 + }, + { + "epoch": 0.8093411744651837, + "grad_norm": 29.721463722838212, + "learning_rate": 9.14762937718496e-06, + "loss": 4.0169, + "step": 9496 + }, + { + "epoch": 0.8094264041592091, + "grad_norm": 40.394419094643624, + "learning_rate": 9.14735244150697e-06, + "loss": 4.8969, + "step": 9497 + }, + { + "epoch": 0.8095116338532344, + "grad_norm": 42.825549316556774, + "learning_rate": 9.147075465041414e-06, + "loss": 4.3882, + "step": 9498 + }, + { + "epoch": 0.8095968635472599, + "grad_norm": 27.629234291464638, + "learning_rate": 9.146798447791023e-06, + "loss": 4.0252, + "step": 9499 + }, + { + "epoch": 0.8096820932412853, + "grad_norm": 32.477592071717176, + "learning_rate": 9.146521389758513e-06, + "loss": 3.6242, + "step": 9500 + }, + { + "epoch": 0.8097673229353106, + "grad_norm": 39.26698999796972, + "learning_rate": 9.146244290946616e-06, + "loss": 3.6767, + "step": 9501 + }, + { + "epoch": 0.8098525526293361, + "grad_norm": 42.07729804961686, + "learning_rate": 9.145967151358052e-06, + "loss": 2.3815, + "step": 9502 + }, + { + "epoch": 0.8099377823233614, + "grad_norm": 46.17114732053244, + "learning_rate": 9.14568997099555e-06, + "loss": 3.0289, + "step": 9503 + }, + { + "epoch": 0.8100230120173869, + "grad_norm": 62.68852605655238, + "learning_rate": 9.145412749861832e-06, + "loss": 4.16, + "step": 9504 + }, + { + "epoch": 0.8101082417114123, + "grad_norm": 54.772965702460105, + "learning_rate": 9.145135487959629e-06, + "loss": 4.4454, + "step": 9505 + }, + { + "epoch": 0.8101934714054376, + "grad_norm": 47.236291046379414, + "learning_rate": 9.144858185291665e-06, + "loss": 4.2391, + "step": 9506 + }, + { + "epoch": 0.8102787010994631, + "grad_norm": 33.54422217158129, + "learning_rate": 9.144580841860668e-06, + "loss": 3.4101, + "step": 9507 + }, + { + "epoch": 0.8103639307934885, + "grad_norm": 132.33158615523607, + "learning_rate": 9.144303457669365e-06, + "loss": 5.1329, + "step": 9508 + }, + { + "epoch": 0.8104491604875138, + "grad_norm": 80.00268441261473, + "learning_rate": 9.144026032720485e-06, + "loss": 4.193, + "step": 9509 + }, + { + "epoch": 0.8105343901815393, + "grad_norm": 56.584546439953364, + "learning_rate": 9.143748567016755e-06, + "loss": 3.5298, + "step": 9510 + }, + { + "epoch": 0.8106196198755646, + "grad_norm": 40.90033990651645, + "learning_rate": 9.143471060560905e-06, + "loss": 4.0509, + "step": 9511 + }, + { + "epoch": 0.81070484956959, + "grad_norm": 31.63258790758312, + "learning_rate": 9.143193513355661e-06, + "loss": 3.775, + "step": 9512 + }, + { + "epoch": 0.8107900792636155, + "grad_norm": 31.676412683988804, + "learning_rate": 9.142915925403759e-06, + "loss": 3.1361, + "step": 9513 + }, + { + "epoch": 0.8108753089576408, + "grad_norm": 34.76614974580918, + "learning_rate": 9.142638296707922e-06, + "loss": 3.2986, + "step": 9514 + }, + { + "epoch": 0.8109605386516663, + "grad_norm": 118.056731812829, + "learning_rate": 9.142360627270886e-06, + "loss": 4.582, + "step": 9515 + }, + { + "epoch": 0.8110457683456916, + "grad_norm": 32.0275800006343, + "learning_rate": 9.142082917095377e-06, + "loss": 4.2696, + "step": 9516 + }, + { + "epoch": 0.811130998039717, + "grad_norm": 96.4135208934191, + "learning_rate": 9.14180516618413e-06, + "loss": 4.1549, + "step": 9517 + }, + { + "epoch": 0.8112162277337425, + "grad_norm": 34.92304008754877, + "learning_rate": 9.141527374539874e-06, + "loss": 2.977, + "step": 9518 + }, + { + "epoch": 0.8113014574277678, + "grad_norm": 62.06766861567168, + "learning_rate": 9.141249542165343e-06, + "loss": 3.9464, + "step": 9519 + }, + { + "epoch": 0.8113866871217932, + "grad_norm": 35.692845479255844, + "learning_rate": 9.140971669063266e-06, + "loss": 4.3757, + "step": 9520 + }, + { + "epoch": 0.8114719168158187, + "grad_norm": 38.02738813408138, + "learning_rate": 9.140693755236381e-06, + "loss": 3.0529, + "step": 9521 + }, + { + "epoch": 0.811557146509844, + "grad_norm": 56.03924889643491, + "learning_rate": 9.140415800687417e-06, + "loss": 4.6839, + "step": 9522 + }, + { + "epoch": 0.8116423762038695, + "grad_norm": 30.068151534976028, + "learning_rate": 9.140137805419108e-06, + "loss": 2.9551, + "step": 9523 + }, + { + "epoch": 0.8117276058978948, + "grad_norm": 41.72098075764856, + "learning_rate": 9.139859769434189e-06, + "loss": 3.8092, + "step": 9524 + }, + { + "epoch": 0.8118128355919202, + "grad_norm": 80.45823726976748, + "learning_rate": 9.139581692735395e-06, + "loss": 4.8039, + "step": 9525 + }, + { + "epoch": 0.8118980652859457, + "grad_norm": 47.58349069779127, + "learning_rate": 9.13930357532546e-06, + "loss": 3.5112, + "step": 9526 + }, + { + "epoch": 0.811983294979971, + "grad_norm": 50.127714883889595, + "learning_rate": 9.139025417207118e-06, + "loss": 3.5501, + "step": 9527 + }, + { + "epoch": 0.8120685246739964, + "grad_norm": 50.25943300125653, + "learning_rate": 9.138747218383108e-06, + "loss": 2.6629, + "step": 9528 + }, + { + "epoch": 0.8121537543680218, + "grad_norm": 78.55891332544284, + "learning_rate": 9.138468978856163e-06, + "loss": 1.8647, + "step": 9529 + }, + { + "epoch": 0.8122389840620472, + "grad_norm": 58.66317891979633, + "learning_rate": 9.138190698629018e-06, + "loss": 3.8244, + "step": 9530 + }, + { + "epoch": 0.8123242137560727, + "grad_norm": 39.34559291999717, + "learning_rate": 9.137912377704413e-06, + "loss": 4.4951, + "step": 9531 + }, + { + "epoch": 0.812409443450098, + "grad_norm": 45.04416625791852, + "learning_rate": 9.137634016085085e-06, + "loss": 4.2787, + "step": 9532 + }, + { + "epoch": 0.8124946731441234, + "grad_norm": 40.30692349587229, + "learning_rate": 9.137355613773769e-06, + "loss": 4.4072, + "step": 9533 + }, + { + "epoch": 0.8125799028381488, + "grad_norm": 32.490704569101055, + "learning_rate": 9.137077170773206e-06, + "loss": 3.9191, + "step": 9534 + }, + { + "epoch": 0.8126651325321742, + "grad_norm": 31.29631024015935, + "learning_rate": 9.136798687086132e-06, + "loss": 4.1945, + "step": 9535 + }, + { + "epoch": 0.8127503622261996, + "grad_norm": 40.78174943663238, + "learning_rate": 9.136520162715288e-06, + "loss": 3.379, + "step": 9536 + }, + { + "epoch": 0.812835591920225, + "grad_norm": 50.60866193534898, + "learning_rate": 9.13624159766341e-06, + "loss": 4.3577, + "step": 9537 + }, + { + "epoch": 0.8129208216142504, + "grad_norm": 72.9629224948433, + "learning_rate": 9.13596299193324e-06, + "loss": 4.5949, + "step": 9538 + }, + { + "epoch": 0.8130060513082759, + "grad_norm": 69.49045416528095, + "learning_rate": 9.135684345527518e-06, + "loss": 4.2669, + "step": 9539 + }, + { + "epoch": 0.8130912810023012, + "grad_norm": 62.16127064531188, + "learning_rate": 9.135405658448984e-06, + "loss": 4.1572, + "step": 9540 + }, + { + "epoch": 0.8131765106963266, + "grad_norm": 37.923480380011746, + "learning_rate": 9.135126930700378e-06, + "loss": 3.5698, + "step": 9541 + }, + { + "epoch": 0.813261740390352, + "grad_norm": 38.04875016605263, + "learning_rate": 9.134848162284442e-06, + "loss": 3.8753, + "step": 9542 + }, + { + "epoch": 0.8133469700843774, + "grad_norm": 41.0062336097038, + "learning_rate": 9.134569353203917e-06, + "loss": 3.1725, + "step": 9543 + }, + { + "epoch": 0.8134321997784028, + "grad_norm": 48.30946119448709, + "learning_rate": 9.134290503461546e-06, + "loss": 4.1013, + "step": 9544 + }, + { + "epoch": 0.8135174294724282, + "grad_norm": 26.993542874761474, + "learning_rate": 9.13401161306007e-06, + "loss": 3.538, + "step": 9545 + }, + { + "epoch": 0.8136026591664536, + "grad_norm": 29.604505509863245, + "learning_rate": 9.133732682002233e-06, + "loss": 3.6699, + "step": 9546 + }, + { + "epoch": 0.8136878888604789, + "grad_norm": 76.13846556533986, + "learning_rate": 9.133453710290779e-06, + "loss": 4.7846, + "step": 9547 + }, + { + "epoch": 0.8137731185545044, + "grad_norm": 38.30667966676297, + "learning_rate": 9.133174697928448e-06, + "loss": 4.3362, + "step": 9548 + }, + { + "epoch": 0.8138583482485298, + "grad_norm": 73.59579838847401, + "learning_rate": 9.132895644917986e-06, + "loss": 4.0081, + "step": 9549 + }, + { + "epoch": 0.8139435779425552, + "grad_norm": 92.05237470879352, + "learning_rate": 9.132616551262139e-06, + "loss": 3.9879, + "step": 9550 + }, + { + "epoch": 0.8140288076365806, + "grad_norm": 37.98591161477088, + "learning_rate": 9.13233741696365e-06, + "loss": 2.8938, + "step": 9551 + }, + { + "epoch": 0.8141140373306059, + "grad_norm": 29.611655245400677, + "learning_rate": 9.132058242025264e-06, + "loss": 2.9292, + "step": 9552 + }, + { + "epoch": 0.8141992670246314, + "grad_norm": 57.430321596073654, + "learning_rate": 9.131779026449727e-06, + "loss": 4.5006, + "step": 9553 + }, + { + "epoch": 0.8142844967186568, + "grad_norm": 127.02447270363302, + "learning_rate": 9.131499770239786e-06, + "loss": 4.79, + "step": 9554 + }, + { + "epoch": 0.8143697264126821, + "grad_norm": 34.38533295193657, + "learning_rate": 9.131220473398188e-06, + "loss": 2.4528, + "step": 9555 + }, + { + "epoch": 0.8144549561067076, + "grad_norm": 29.9011436736845, + "learning_rate": 9.130941135927675e-06, + "loss": 4.0563, + "step": 9556 + }, + { + "epoch": 0.814540185800733, + "grad_norm": 82.60647011853162, + "learning_rate": 9.130661757830998e-06, + "loss": 4.3607, + "step": 9557 + }, + { + "epoch": 0.8146254154947584, + "grad_norm": 43.34134395157846, + "learning_rate": 9.130382339110907e-06, + "loss": 4.5212, + "step": 9558 + }, + { + "epoch": 0.8147106451887838, + "grad_norm": 40.9370190853493, + "learning_rate": 9.130102879770144e-06, + "loss": 3.17, + "step": 9559 + }, + { + "epoch": 0.8147958748828091, + "grad_norm": 46.143257667801166, + "learning_rate": 9.12982337981146e-06, + "loss": 4.3404, + "step": 9560 + }, + { + "epoch": 0.8148811045768346, + "grad_norm": 47.215784006760465, + "learning_rate": 9.129543839237605e-06, + "loss": 3.7498, + "step": 9561 + }, + { + "epoch": 0.81496633427086, + "grad_norm": 39.25057366791992, + "learning_rate": 9.129264258051327e-06, + "loss": 4.2325, + "step": 9562 + }, + { + "epoch": 0.8150515639648853, + "grad_norm": 67.88536842945763, + "learning_rate": 9.128984636255376e-06, + "loss": 5.0531, + "step": 9563 + }, + { + "epoch": 0.8151367936589108, + "grad_norm": 38.13807560112149, + "learning_rate": 9.128704973852503e-06, + "loss": 3.291, + "step": 9564 + }, + { + "epoch": 0.8152220233529361, + "grad_norm": 45.392487820337934, + "learning_rate": 9.128425270845455e-06, + "loss": 2.9809, + "step": 9565 + }, + { + "epoch": 0.8153072530469616, + "grad_norm": 102.13274992049725, + "learning_rate": 9.128145527236988e-06, + "loss": 5.1763, + "step": 9566 + }, + { + "epoch": 0.815392482740987, + "grad_norm": 85.4702833865061, + "learning_rate": 9.127865743029846e-06, + "loss": 4.8187, + "step": 9567 + }, + { + "epoch": 0.8154777124350123, + "grad_norm": 61.57257078855469, + "learning_rate": 9.127585918226788e-06, + "loss": 3.9753, + "step": 9568 + }, + { + "epoch": 0.8155629421290378, + "grad_norm": 78.56080500499706, + "learning_rate": 9.127306052830561e-06, + "loss": 4.173, + "step": 9569 + }, + { + "epoch": 0.8156481718230632, + "grad_norm": 30.39535678657124, + "learning_rate": 9.127026146843919e-06, + "loss": 2.8059, + "step": 9570 + }, + { + "epoch": 0.8157334015170885, + "grad_norm": 64.70182386377358, + "learning_rate": 9.126746200269617e-06, + "loss": 3.5101, + "step": 9571 + }, + { + "epoch": 0.815818631211114, + "grad_norm": 54.85768744758145, + "learning_rate": 9.126466213110403e-06, + "loss": 3.9702, + "step": 9572 + }, + { + "epoch": 0.8159038609051393, + "grad_norm": 61.35952854205986, + "learning_rate": 9.126186185369035e-06, + "loss": 4.7937, + "step": 9573 + }, + { + "epoch": 0.8159890905991648, + "grad_norm": 69.51577754455728, + "learning_rate": 9.125906117048264e-06, + "loss": 3.8622, + "step": 9574 + }, + { + "epoch": 0.8160743202931902, + "grad_norm": 65.82524214775758, + "learning_rate": 9.125626008150848e-06, + "loss": 5.0731, + "step": 9575 + }, + { + "epoch": 0.8161595499872155, + "grad_norm": 75.26712738708592, + "learning_rate": 9.125345858679537e-06, + "loss": 4.3183, + "step": 9576 + }, + { + "epoch": 0.816244779681241, + "grad_norm": 44.094535924191206, + "learning_rate": 9.125065668637091e-06, + "loss": 3.4911, + "step": 9577 + }, + { + "epoch": 0.8163300093752663, + "grad_norm": 104.35158278551822, + "learning_rate": 9.124785438026262e-06, + "loss": 4.9397, + "step": 9578 + }, + { + "epoch": 0.8164152390692917, + "grad_norm": 48.49092385597328, + "learning_rate": 9.124505166849808e-06, + "loss": 4.1229, + "step": 9579 + }, + { + "epoch": 0.8165004687633172, + "grad_norm": 90.37187096595201, + "learning_rate": 9.124224855110484e-06, + "loss": 4.7194, + "step": 9580 + }, + { + "epoch": 0.8165856984573425, + "grad_norm": 73.2735957369362, + "learning_rate": 9.12394450281105e-06, + "loss": 5.1138, + "step": 9581 + }, + { + "epoch": 0.816670928151368, + "grad_norm": 31.997523683551254, + "learning_rate": 9.123664109954257e-06, + "loss": 2.8927, + "step": 9582 + }, + { + "epoch": 0.8167561578453933, + "grad_norm": 35.68470663994906, + "learning_rate": 9.123383676542867e-06, + "loss": 3.9237, + "step": 9583 + }, + { + "epoch": 0.8168413875394187, + "grad_norm": 33.45197189721488, + "learning_rate": 9.123103202579638e-06, + "loss": 3.3382, + "step": 9584 + }, + { + "epoch": 0.8169266172334442, + "grad_norm": 31.84753614175869, + "learning_rate": 9.122822688067325e-06, + "loss": 3.972, + "step": 9585 + }, + { + "epoch": 0.8170118469274695, + "grad_norm": 36.7738340078246, + "learning_rate": 9.122542133008692e-06, + "loss": 3.735, + "step": 9586 + }, + { + "epoch": 0.8170970766214949, + "grad_norm": 70.46401906501066, + "learning_rate": 9.122261537406493e-06, + "loss": 3.885, + "step": 9587 + }, + { + "epoch": 0.8171823063155204, + "grad_norm": 26.48348081482623, + "learning_rate": 9.121980901263492e-06, + "loss": 3.0758, + "step": 9588 + }, + { + "epoch": 0.8172675360095457, + "grad_norm": 33.28289179207342, + "learning_rate": 9.121700224582445e-06, + "loss": 3.9226, + "step": 9589 + }, + { + "epoch": 0.8173527657035711, + "grad_norm": 43.92464735919446, + "learning_rate": 9.121419507366114e-06, + "loss": 3.8125, + "step": 9590 + }, + { + "epoch": 0.8174379953975965, + "grad_norm": 48.26111249389667, + "learning_rate": 9.12113874961726e-06, + "loss": 3.6468, + "step": 9591 + }, + { + "epoch": 0.8175232250916219, + "grad_norm": 29.492419964786308, + "learning_rate": 9.120857951338644e-06, + "loss": 2.4852, + "step": 9592 + }, + { + "epoch": 0.8176084547856474, + "grad_norm": 64.06456207375602, + "learning_rate": 9.120577112533029e-06, + "loss": 3.8412, + "step": 9593 + }, + { + "epoch": 0.8176936844796727, + "grad_norm": 142.73984795950938, + "learning_rate": 9.120296233203174e-06, + "loss": 4.7103, + "step": 9594 + }, + { + "epoch": 0.8177789141736981, + "grad_norm": 52.296683734920066, + "learning_rate": 9.120015313351845e-06, + "loss": 3.9332, + "step": 9595 + }, + { + "epoch": 0.8178641438677235, + "grad_norm": 41.96595332202774, + "learning_rate": 9.1197343529818e-06, + "loss": 4.6107, + "step": 9596 + }, + { + "epoch": 0.8179493735617489, + "grad_norm": 33.61200452511677, + "learning_rate": 9.119453352095807e-06, + "loss": 3.0953, + "step": 9597 + }, + { + "epoch": 0.8180346032557743, + "grad_norm": 60.38708231436847, + "learning_rate": 9.119172310696624e-06, + "loss": 4.5362, + "step": 9598 + }, + { + "epoch": 0.8181198329497997, + "grad_norm": 48.79876304965003, + "learning_rate": 9.11889122878702e-06, + "loss": 3.9578, + "step": 9599 + }, + { + "epoch": 0.8182050626438251, + "grad_norm": 66.81782463722276, + "learning_rate": 9.118610106369757e-06, + "loss": 5.5737, + "step": 9600 + }, + { + "epoch": 0.8182902923378506, + "grad_norm": 31.373746985570342, + "learning_rate": 9.118328943447601e-06, + "loss": 4.1853, + "step": 9601 + }, + { + "epoch": 0.8183755220318759, + "grad_norm": 39.49017145894489, + "learning_rate": 9.118047740023314e-06, + "loss": 3.9237, + "step": 9602 + }, + { + "epoch": 0.8184607517259013, + "grad_norm": 35.48925383727671, + "learning_rate": 9.117766496099667e-06, + "loss": 3.6256, + "step": 9603 + }, + { + "epoch": 0.8185459814199267, + "grad_norm": 32.29595803715406, + "learning_rate": 9.117485211679421e-06, + "loss": 3.0802, + "step": 9604 + }, + { + "epoch": 0.8186312111139521, + "grad_norm": 47.39180002732242, + "learning_rate": 9.117203886765345e-06, + "loss": 4.0761, + "step": 9605 + }, + { + "epoch": 0.8187164408079775, + "grad_norm": 42.208648057847675, + "learning_rate": 9.116922521360203e-06, + "loss": 3.4435, + "step": 9606 + }, + { + "epoch": 0.8188016705020029, + "grad_norm": 34.212349600617195, + "learning_rate": 9.116641115466766e-06, + "loss": 4.1339, + "step": 9607 + }, + { + "epoch": 0.8188869001960283, + "grad_norm": 34.760485642736064, + "learning_rate": 9.116359669087797e-06, + "loss": 3.0348, + "step": 9608 + }, + { + "epoch": 0.8189721298900537, + "grad_norm": 32.062025671434334, + "learning_rate": 9.11607818222607e-06, + "loss": 4.2438, + "step": 9609 + }, + { + "epoch": 0.8190573595840791, + "grad_norm": 27.42618257142054, + "learning_rate": 9.115796654884346e-06, + "loss": 2.5684, + "step": 9610 + }, + { + "epoch": 0.8191425892781045, + "grad_norm": 31.12465233563377, + "learning_rate": 9.115515087065399e-06, + "loss": 2.8343, + "step": 9611 + }, + { + "epoch": 0.8192278189721299, + "grad_norm": 55.15712651863986, + "learning_rate": 9.115233478771996e-06, + "loss": 3.8602, + "step": 9612 + }, + { + "epoch": 0.8193130486661553, + "grad_norm": 42.454118809353766, + "learning_rate": 9.114951830006907e-06, + "loss": 3.2493, + "step": 9613 + }, + { + "epoch": 0.8193982783601806, + "grad_norm": 58.07515895170141, + "learning_rate": 9.114670140772904e-06, + "loss": 4.4689, + "step": 9614 + }, + { + "epoch": 0.8194835080542061, + "grad_norm": 50.68475571007066, + "learning_rate": 9.114388411072752e-06, + "loss": 4.421, + "step": 9615 + }, + { + "epoch": 0.8195687377482315, + "grad_norm": 184.04045738569744, + "learning_rate": 9.114106640909226e-06, + "loss": 4.6593, + "step": 9616 + }, + { + "epoch": 0.8196539674422569, + "grad_norm": 72.06600276375802, + "learning_rate": 9.113824830285096e-06, + "loss": 4.363, + "step": 9617 + }, + { + "epoch": 0.8197391971362823, + "grad_norm": 78.86476023887083, + "learning_rate": 9.113542979203133e-06, + "loss": 4.288, + "step": 9618 + }, + { + "epoch": 0.8198244268303077, + "grad_norm": 54.82933130966876, + "learning_rate": 9.11326108766611e-06, + "loss": 3.853, + "step": 9619 + }, + { + "epoch": 0.8199096565243331, + "grad_norm": 138.4571912801859, + "learning_rate": 9.1129791556768e-06, + "loss": 4.222, + "step": 9620 + }, + { + "epoch": 0.8199948862183585, + "grad_norm": 53.28463184247099, + "learning_rate": 9.112697183237972e-06, + "loss": 4.5161, + "step": 9621 + }, + { + "epoch": 0.8200801159123838, + "grad_norm": 49.796352917197275, + "learning_rate": 9.112415170352402e-06, + "loss": 3.5815, + "step": 9622 + }, + { + "epoch": 0.8201653456064093, + "grad_norm": 142.64766888213407, + "learning_rate": 9.112133117022864e-06, + "loss": 4.2676, + "step": 9623 + }, + { + "epoch": 0.8202505753004347, + "grad_norm": 83.08245644726938, + "learning_rate": 9.11185102325213e-06, + "loss": 4.9302, + "step": 9624 + }, + { + "epoch": 0.82033580499446, + "grad_norm": 28.760219764223798, + "learning_rate": 9.111568889042976e-06, + "loss": 2.5949, + "step": 9625 + }, + { + "epoch": 0.8204210346884855, + "grad_norm": 165.4852408078249, + "learning_rate": 9.111286714398174e-06, + "loss": 5.4169, + "step": 9626 + }, + { + "epoch": 0.8205062643825108, + "grad_norm": 64.68833176914116, + "learning_rate": 9.111004499320502e-06, + "loss": 4.9362, + "step": 9627 + }, + { + "epoch": 0.8205914940765363, + "grad_norm": 67.2072366011745, + "learning_rate": 9.110722243812734e-06, + "loss": 3.7235, + "step": 9628 + }, + { + "epoch": 0.8206767237705617, + "grad_norm": 81.46889033031518, + "learning_rate": 9.110439947877646e-06, + "loss": 3.6793, + "step": 9629 + }, + { + "epoch": 0.820761953464587, + "grad_norm": 60.20972299562206, + "learning_rate": 9.110157611518015e-06, + "loss": 4.7456, + "step": 9630 + }, + { + "epoch": 0.8208471831586125, + "grad_norm": 43.13351185130302, + "learning_rate": 9.109875234736617e-06, + "loss": 4.1522, + "step": 9631 + }, + { + "epoch": 0.8209324128526378, + "grad_norm": 37.973203964802714, + "learning_rate": 9.109592817536227e-06, + "loss": 3.1831, + "step": 9632 + }, + { + "epoch": 0.8210176425466632, + "grad_norm": 46.46965401708871, + "learning_rate": 9.109310359919627e-06, + "loss": 4.601, + "step": 9633 + }, + { + "epoch": 0.8211028722406887, + "grad_norm": 76.41755867332735, + "learning_rate": 9.109027861889593e-06, + "loss": 5.2764, + "step": 9634 + }, + { + "epoch": 0.821188101934714, + "grad_norm": 170.96868944462284, + "learning_rate": 9.108745323448902e-06, + "loss": 3.8074, + "step": 9635 + }, + { + "epoch": 0.8212733316287395, + "grad_norm": 48.54668790996321, + "learning_rate": 9.108462744600333e-06, + "loss": 2.9608, + "step": 9636 + }, + { + "epoch": 0.8213585613227649, + "grad_norm": 45.37830495804321, + "learning_rate": 9.108180125346667e-06, + "loss": 3.3212, + "step": 9637 + }, + { + "epoch": 0.8214437910167902, + "grad_norm": 43.61557585589827, + "learning_rate": 9.10789746569068e-06, + "loss": 3.557, + "step": 9638 + }, + { + "epoch": 0.8215290207108157, + "grad_norm": 36.218735837856066, + "learning_rate": 9.107614765635154e-06, + "loss": 3.8218, + "step": 9639 + }, + { + "epoch": 0.821614250404841, + "grad_norm": 25.136495181075993, + "learning_rate": 9.10733202518287e-06, + "loss": 2.5654, + "step": 9640 + }, + { + "epoch": 0.8216994800988664, + "grad_norm": 44.913945805624735, + "learning_rate": 9.107049244336605e-06, + "loss": 3.6948, + "step": 9641 + }, + { + "epoch": 0.8217847097928919, + "grad_norm": 37.23732357952145, + "learning_rate": 9.106766423099145e-06, + "loss": 3.3884, + "step": 9642 + }, + { + "epoch": 0.8218699394869172, + "grad_norm": 65.0930474417245, + "learning_rate": 9.106483561473269e-06, + "loss": 4.5163, + "step": 9643 + }, + { + "epoch": 0.8219551691809427, + "grad_norm": 38.55266913674876, + "learning_rate": 9.10620065946176e-06, + "loss": 4.3843, + "step": 9644 + }, + { + "epoch": 0.822040398874968, + "grad_norm": 37.061187387472, + "learning_rate": 9.105917717067397e-06, + "loss": 3.7846, + "step": 9645 + }, + { + "epoch": 0.8221256285689934, + "grad_norm": 43.410871785428846, + "learning_rate": 9.105634734292965e-06, + "loss": 4.0242, + "step": 9646 + }, + { + "epoch": 0.8222108582630189, + "grad_norm": 62.00109763552184, + "learning_rate": 9.105351711141248e-06, + "loss": 4.2112, + "step": 9647 + }, + { + "epoch": 0.8222960879570442, + "grad_norm": 83.9545993170453, + "learning_rate": 9.105068647615028e-06, + "loss": 4.1379, + "step": 9648 + }, + { + "epoch": 0.8223813176510696, + "grad_norm": 77.42660008388364, + "learning_rate": 9.104785543717088e-06, + "loss": 3.9583, + "step": 9649 + }, + { + "epoch": 0.8224665473450951, + "grad_norm": 43.43012916432568, + "learning_rate": 9.104502399450212e-06, + "loss": 4.0239, + "step": 9650 + }, + { + "epoch": 0.8225517770391204, + "grad_norm": 39.03469102737257, + "learning_rate": 9.10421921481719e-06, + "loss": 4.4396, + "step": 9651 + }, + { + "epoch": 0.8226370067331459, + "grad_norm": 39.9912123660761, + "learning_rate": 9.1039359898208e-06, + "loss": 3.6666, + "step": 9652 + }, + { + "epoch": 0.8227222364271712, + "grad_norm": 51.82905674336156, + "learning_rate": 9.10365272446383e-06, + "loss": 4.2651, + "step": 9653 + }, + { + "epoch": 0.8228074661211966, + "grad_norm": 37.44595828486444, + "learning_rate": 9.103369418749067e-06, + "loss": 3.683, + "step": 9654 + }, + { + "epoch": 0.8228926958152221, + "grad_norm": 55.729027699053574, + "learning_rate": 9.103086072679297e-06, + "loss": 5.2731, + "step": 9655 + }, + { + "epoch": 0.8229779255092474, + "grad_norm": 71.90363029736504, + "learning_rate": 9.102802686257304e-06, + "loss": 4.9272, + "step": 9656 + }, + { + "epoch": 0.8230631552032728, + "grad_norm": 36.375439379810054, + "learning_rate": 9.102519259485879e-06, + "loss": 4.044, + "step": 9657 + }, + { + "epoch": 0.8231483848972982, + "grad_norm": 61.386204300775724, + "learning_rate": 9.102235792367806e-06, + "loss": 4.55, + "step": 9658 + }, + { + "epoch": 0.8232336145913236, + "grad_norm": 49.46214386494238, + "learning_rate": 9.101952284905875e-06, + "loss": 3.8222, + "step": 9659 + }, + { + "epoch": 0.823318844285349, + "grad_norm": 64.51389859657164, + "learning_rate": 9.101668737102872e-06, + "loss": 3.6714, + "step": 9660 + }, + { + "epoch": 0.8234040739793744, + "grad_norm": 33.59273926063477, + "learning_rate": 9.101385148961587e-06, + "loss": 5.0998, + "step": 9661 + }, + { + "epoch": 0.8234893036733998, + "grad_norm": 41.92865390447663, + "learning_rate": 9.10110152048481e-06, + "loss": 4.1728, + "step": 9662 + }, + { + "epoch": 0.8235745333674253, + "grad_norm": 35.98556395872414, + "learning_rate": 9.100817851675327e-06, + "loss": 4.0273, + "step": 9663 + }, + { + "epoch": 0.8236597630614506, + "grad_norm": 51.248547544754004, + "learning_rate": 9.100534142535933e-06, + "loss": 3.5769, + "step": 9664 + }, + { + "epoch": 0.823744992755476, + "grad_norm": 31.147184437079865, + "learning_rate": 9.100250393069413e-06, + "loss": 2.4812, + "step": 9665 + }, + { + "epoch": 0.8238302224495014, + "grad_norm": 45.31257209326706, + "learning_rate": 9.09996660327856e-06, + "loss": 3.8745, + "step": 9666 + }, + { + "epoch": 0.8239154521435268, + "grad_norm": 86.36717156168002, + "learning_rate": 9.099682773166165e-06, + "loss": 4.4462, + "step": 9667 + }, + { + "epoch": 0.8240006818375522, + "grad_norm": 34.66681966368305, + "learning_rate": 9.099398902735019e-06, + "loss": 3.6055, + "step": 9668 + }, + { + "epoch": 0.8240859115315776, + "grad_norm": 45.289455802392524, + "learning_rate": 9.099114991987914e-06, + "loss": 3.9514, + "step": 9669 + }, + { + "epoch": 0.824171141225603, + "grad_norm": 60.74121274332526, + "learning_rate": 9.09883104092764e-06, + "loss": 3.6424, + "step": 9670 + }, + { + "epoch": 0.8242563709196284, + "grad_norm": 49.071749004272505, + "learning_rate": 9.098547049556992e-06, + "loss": 3.8564, + "step": 9671 + }, + { + "epoch": 0.8243416006136538, + "grad_norm": 82.93151144765187, + "learning_rate": 9.098263017878764e-06, + "loss": 4.4874, + "step": 9672 + }, + { + "epoch": 0.8244268303076792, + "grad_norm": 85.79539593241347, + "learning_rate": 9.097978945895748e-06, + "loss": 4.175, + "step": 9673 + }, + { + "epoch": 0.8245120600017046, + "grad_norm": 55.24950309493895, + "learning_rate": 9.097694833610736e-06, + "loss": 3.3717, + "step": 9674 + }, + { + "epoch": 0.82459728969573, + "grad_norm": 31.555025807048587, + "learning_rate": 9.097410681026524e-06, + "loss": 3.3556, + "step": 9675 + }, + { + "epoch": 0.8246825193897553, + "grad_norm": 43.8847785157127, + "learning_rate": 9.097126488145905e-06, + "loss": 2.9331, + "step": 9676 + }, + { + "epoch": 0.8247677490837808, + "grad_norm": 69.3086133876194, + "learning_rate": 9.096842254971676e-06, + "loss": 3.8112, + "step": 9677 + }, + { + "epoch": 0.8248529787778062, + "grad_norm": 65.47343375673549, + "learning_rate": 9.09655798150663e-06, + "loss": 4.0512, + "step": 9678 + }, + { + "epoch": 0.8249382084718316, + "grad_norm": 37.30926194347695, + "learning_rate": 9.096273667753566e-06, + "loss": 4.0824, + "step": 9679 + }, + { + "epoch": 0.825023438165857, + "grad_norm": 25.69726692793497, + "learning_rate": 9.095989313715276e-06, + "loss": 2.4904, + "step": 9680 + }, + { + "epoch": 0.8251086678598823, + "grad_norm": 39.76806681723436, + "learning_rate": 9.095704919394561e-06, + "loss": 3.6658, + "step": 9681 + }, + { + "epoch": 0.8251938975539078, + "grad_norm": 32.32778951273658, + "learning_rate": 9.095420484794216e-06, + "loss": 3.5995, + "step": 9682 + }, + { + "epoch": 0.8252791272479332, + "grad_norm": 52.49787906121623, + "learning_rate": 9.095136009917037e-06, + "loss": 4.7813, + "step": 9683 + }, + { + "epoch": 0.8253643569419585, + "grad_norm": 26.01790594518988, + "learning_rate": 9.094851494765822e-06, + "loss": 3.712, + "step": 9684 + }, + { + "epoch": 0.825449586635984, + "grad_norm": 77.43843453521647, + "learning_rate": 9.09456693934337e-06, + "loss": 4.0224, + "step": 9685 + }, + { + "epoch": 0.8255348163300094, + "grad_norm": 26.848692209243143, + "learning_rate": 9.094282343652479e-06, + "loss": 3.4873, + "step": 9686 + }, + { + "epoch": 0.8256200460240348, + "grad_norm": 38.06541990273552, + "learning_rate": 9.093997707695948e-06, + "loss": 3.2028, + "step": 9687 + }, + { + "epoch": 0.8257052757180602, + "grad_norm": 89.43143516208544, + "learning_rate": 9.093713031476576e-06, + "loss": 5.5311, + "step": 9688 + }, + { + "epoch": 0.8257905054120855, + "grad_norm": 84.76927620765257, + "learning_rate": 9.093428314997164e-06, + "loss": 5.4755, + "step": 9689 + }, + { + "epoch": 0.825875735106111, + "grad_norm": 81.91684337082663, + "learning_rate": 9.093143558260512e-06, + "loss": 4.6826, + "step": 9690 + }, + { + "epoch": 0.8259609648001364, + "grad_norm": 49.39647459720291, + "learning_rate": 9.092858761269417e-06, + "loss": 3.7066, + "step": 9691 + }, + { + "epoch": 0.8260461944941617, + "grad_norm": 95.00264452816045, + "learning_rate": 9.092573924026683e-06, + "loss": 3.9465, + "step": 9692 + }, + { + "epoch": 0.8261314241881872, + "grad_norm": 40.23361099181432, + "learning_rate": 9.092289046535112e-06, + "loss": 3.0964, + "step": 9693 + }, + { + "epoch": 0.8262166538822125, + "grad_norm": 31.82922263510414, + "learning_rate": 9.092004128797502e-06, + "loss": 3.7593, + "step": 9694 + }, + { + "epoch": 0.826301883576238, + "grad_norm": 43.25406126433876, + "learning_rate": 9.09171917081666e-06, + "loss": 3.1285, + "step": 9695 + }, + { + "epoch": 0.8263871132702634, + "grad_norm": 41.30280039193419, + "learning_rate": 9.091434172595386e-06, + "loss": 3.8756, + "step": 9696 + }, + { + "epoch": 0.8264723429642887, + "grad_norm": 30.516854268275367, + "learning_rate": 9.09114913413648e-06, + "loss": 2.9888, + "step": 9697 + }, + { + "epoch": 0.8265575726583142, + "grad_norm": 41.207919688391954, + "learning_rate": 9.090864055442751e-06, + "loss": 3.9263, + "step": 9698 + }, + { + "epoch": 0.8266428023523396, + "grad_norm": 38.427481071791796, + "learning_rate": 9.090578936516996e-06, + "loss": 3.1491, + "step": 9699 + }, + { + "epoch": 0.8267280320463649, + "grad_norm": 33.169444879985996, + "learning_rate": 9.090293777362027e-06, + "loss": 3.4448, + "step": 9700 + }, + { + "epoch": 0.8268132617403904, + "grad_norm": 66.75636218090929, + "learning_rate": 9.09000857798064e-06, + "loss": 3.6244, + "step": 9701 + }, + { + "epoch": 0.8268984914344157, + "grad_norm": 57.696558193493985, + "learning_rate": 9.089723338375647e-06, + "loss": 4.7912, + "step": 9702 + }, + { + "epoch": 0.8269837211284411, + "grad_norm": 32.976368062751696, + "learning_rate": 9.089438058549848e-06, + "loss": 3.2441, + "step": 9703 + }, + { + "epoch": 0.8270689508224666, + "grad_norm": 24.32614247743665, + "learning_rate": 9.089152738506051e-06, + "loss": 2.7948, + "step": 9704 + }, + { + "epoch": 0.8271541805164919, + "grad_norm": 39.575853896250905, + "learning_rate": 9.088867378247062e-06, + "loss": 4.039, + "step": 9705 + }, + { + "epoch": 0.8272394102105174, + "grad_norm": 54.093337077952285, + "learning_rate": 9.088581977775687e-06, + "loss": 4.4105, + "step": 9706 + }, + { + "epoch": 0.8273246399045427, + "grad_norm": 119.01862657409735, + "learning_rate": 9.088296537094733e-06, + "loss": 5.2021, + "step": 9707 + }, + { + "epoch": 0.8274098695985681, + "grad_norm": 37.22877716080698, + "learning_rate": 9.088011056207008e-06, + "loss": 4.1197, + "step": 9708 + }, + { + "epoch": 0.8274950992925936, + "grad_norm": 56.82173168483761, + "learning_rate": 9.087725535115317e-06, + "loss": 4.7578, + "step": 9709 + }, + { + "epoch": 0.8275803289866189, + "grad_norm": 126.64311368874458, + "learning_rate": 9.08743997382247e-06, + "loss": 2.9216, + "step": 9710 + }, + { + "epoch": 0.8276655586806443, + "grad_norm": 65.19213462388261, + "learning_rate": 9.087154372331275e-06, + "loss": 5.2996, + "step": 9711 + }, + { + "epoch": 0.8277507883746698, + "grad_norm": 42.94433196050254, + "learning_rate": 9.086868730644542e-06, + "loss": 4.0027, + "step": 9712 + }, + { + "epoch": 0.8278360180686951, + "grad_norm": 62.98612738307855, + "learning_rate": 9.086583048765078e-06, + "loss": 4.4098, + "step": 9713 + }, + { + "epoch": 0.8279212477627206, + "grad_norm": 34.45814739721965, + "learning_rate": 9.086297326695695e-06, + "loss": 4.0679, + "step": 9714 + }, + { + "epoch": 0.8280064774567459, + "grad_norm": 51.328959845774044, + "learning_rate": 9.0860115644392e-06, + "loss": 3.6231, + "step": 9715 + }, + { + "epoch": 0.8280917071507713, + "grad_norm": 38.05433748796454, + "learning_rate": 9.085725761998405e-06, + "loss": 3.2375, + "step": 9716 + }, + { + "epoch": 0.8281769368447968, + "grad_norm": 31.34301104530622, + "learning_rate": 9.08543991937612e-06, + "loss": 2.9459, + "step": 9717 + }, + { + "epoch": 0.8282621665388221, + "grad_norm": 55.8328945973222, + "learning_rate": 9.085154036575159e-06, + "loss": 4.7183, + "step": 9718 + }, + { + "epoch": 0.8283473962328475, + "grad_norm": 52.85419294861298, + "learning_rate": 9.084868113598329e-06, + "loss": 4.1298, + "step": 9719 + }, + { + "epoch": 0.828432625926873, + "grad_norm": 43.618920611266155, + "learning_rate": 9.084582150448446e-06, + "loss": 3.6679, + "step": 9720 + }, + { + "epoch": 0.8285178556208983, + "grad_norm": 35.097879747913495, + "learning_rate": 9.08429614712832e-06, + "loss": 3.2754, + "step": 9721 + }, + { + "epoch": 0.8286030853149238, + "grad_norm": 34.26279887653101, + "learning_rate": 9.084010103640763e-06, + "loss": 4.4222, + "step": 9722 + }, + { + "epoch": 0.8286883150089491, + "grad_norm": 29.547792753066375, + "learning_rate": 9.083724019988592e-06, + "loss": 3.9915, + "step": 9723 + }, + { + "epoch": 0.8287735447029745, + "grad_norm": 78.5751052584053, + "learning_rate": 9.083437896174615e-06, + "loss": 4.4106, + "step": 9724 + }, + { + "epoch": 0.828858774397, + "grad_norm": 34.69462195398285, + "learning_rate": 9.083151732201652e-06, + "loss": 3.7025, + "step": 9725 + }, + { + "epoch": 0.8289440040910253, + "grad_norm": 36.91045705393361, + "learning_rate": 9.082865528072511e-06, + "loss": 3.4041, + "step": 9726 + }, + { + "epoch": 0.8290292337850507, + "grad_norm": 54.57522318204374, + "learning_rate": 9.082579283790013e-06, + "loss": 5.0256, + "step": 9727 + }, + { + "epoch": 0.8291144634790761, + "grad_norm": 55.35831146383902, + "learning_rate": 9.082292999356967e-06, + "loss": 3.8036, + "step": 9728 + }, + { + "epoch": 0.8291996931731015, + "grad_norm": 27.78659874506736, + "learning_rate": 9.082006674776194e-06, + "loss": 2.3938, + "step": 9729 + }, + { + "epoch": 0.829284922867127, + "grad_norm": 53.78154552649302, + "learning_rate": 9.081720310050508e-06, + "loss": 4.1877, + "step": 9730 + }, + { + "epoch": 0.8293701525611523, + "grad_norm": 55.90763882674652, + "learning_rate": 9.081433905182722e-06, + "loss": 4.8111, + "step": 9731 + }, + { + "epoch": 0.8294553822551777, + "grad_norm": 73.79275542963562, + "learning_rate": 9.081147460175657e-06, + "loss": 5.1907, + "step": 9732 + }, + { + "epoch": 0.8295406119492031, + "grad_norm": 38.78349008740831, + "learning_rate": 9.080860975032128e-06, + "loss": 3.4948, + "step": 9733 + }, + { + "epoch": 0.8296258416432285, + "grad_norm": 50.25371105829556, + "learning_rate": 9.080574449754952e-06, + "loss": 4.904, + "step": 9734 + }, + { + "epoch": 0.8297110713372539, + "grad_norm": 35.29067732040365, + "learning_rate": 9.080287884346949e-06, + "loss": 3.7345, + "step": 9735 + }, + { + "epoch": 0.8297963010312793, + "grad_norm": 82.0497347232739, + "learning_rate": 9.080001278810934e-06, + "loss": 3.9929, + "step": 9736 + }, + { + "epoch": 0.8298815307253047, + "grad_norm": 39.061025755209045, + "learning_rate": 9.07971463314973e-06, + "loss": 4.1151, + "step": 9737 + }, + { + "epoch": 0.82996676041933, + "grad_norm": 30.180433136502252, + "learning_rate": 9.079427947366153e-06, + "loss": 3.7411, + "step": 9738 + }, + { + "epoch": 0.8300519901133555, + "grad_norm": 151.7854831627526, + "learning_rate": 9.079141221463023e-06, + "loss": 4.9547, + "step": 9739 + }, + { + "epoch": 0.8301372198073809, + "grad_norm": 36.81268039759944, + "learning_rate": 9.078854455443161e-06, + "loss": 3.7134, + "step": 9740 + }, + { + "epoch": 0.8302224495014063, + "grad_norm": 56.22890892249071, + "learning_rate": 9.078567649309385e-06, + "loss": 4.0834, + "step": 9741 + }, + { + "epoch": 0.8303076791954317, + "grad_norm": 224.39692753490877, + "learning_rate": 9.078280803064517e-06, + "loss": 1.6582, + "step": 9742 + }, + { + "epoch": 0.830392908889457, + "grad_norm": 45.27540005310117, + "learning_rate": 9.077993916711378e-06, + "loss": 4.6227, + "step": 9743 + }, + { + "epoch": 0.8304781385834825, + "grad_norm": 89.15823472134551, + "learning_rate": 9.077706990252789e-06, + "loss": 5.0235, + "step": 9744 + }, + { + "epoch": 0.8305633682775079, + "grad_norm": 40.255638291282864, + "learning_rate": 9.077420023691573e-06, + "loss": 3.6373, + "step": 9745 + }, + { + "epoch": 0.8306485979715332, + "grad_norm": 30.64643961807543, + "learning_rate": 9.077133017030551e-06, + "loss": 2.8043, + "step": 9746 + }, + { + "epoch": 0.8307338276655587, + "grad_norm": 50.743319566427864, + "learning_rate": 9.076845970272545e-06, + "loss": 4.3265, + "step": 9747 + }, + { + "epoch": 0.8308190573595841, + "grad_norm": 60.97491226986646, + "learning_rate": 9.076558883420379e-06, + "loss": 4.2008, + "step": 9748 + }, + { + "epoch": 0.8309042870536095, + "grad_norm": 46.725372230138774, + "learning_rate": 9.076271756476877e-06, + "loss": 4.5309, + "step": 9749 + }, + { + "epoch": 0.8309895167476349, + "grad_norm": 31.36208156484393, + "learning_rate": 9.07598458944486e-06, + "loss": 3.7567, + "step": 9750 + }, + { + "epoch": 0.8310747464416602, + "grad_norm": 50.466835844872584, + "learning_rate": 9.075697382327157e-06, + "loss": 4.1122, + "step": 9751 + }, + { + "epoch": 0.8311599761356857, + "grad_norm": 33.342490871652764, + "learning_rate": 9.075410135126587e-06, + "loss": 3.5749, + "step": 9752 + }, + { + "epoch": 0.8312452058297111, + "grad_norm": 33.0575135917658, + "learning_rate": 9.075122847845981e-06, + "loss": 3.1333, + "step": 9753 + }, + { + "epoch": 0.8313304355237364, + "grad_norm": 77.36053001308561, + "learning_rate": 9.074835520488159e-06, + "loss": 4.6945, + "step": 9754 + }, + { + "epoch": 0.8314156652177619, + "grad_norm": 47.52846493360014, + "learning_rate": 9.074548153055949e-06, + "loss": 3.049, + "step": 9755 + }, + { + "epoch": 0.8315008949117872, + "grad_norm": 31.544562464994996, + "learning_rate": 9.074260745552177e-06, + "loss": 4.0746, + "step": 9756 + }, + { + "epoch": 0.8315861246058127, + "grad_norm": 106.60616294915211, + "learning_rate": 9.07397329797967e-06, + "loss": 6.0511, + "step": 9757 + }, + { + "epoch": 0.8316713542998381, + "grad_norm": 51.08140339717136, + "learning_rate": 9.073685810341254e-06, + "loss": 4.5791, + "step": 9758 + }, + { + "epoch": 0.8317565839938634, + "grad_norm": 32.291360923032045, + "learning_rate": 9.073398282639758e-06, + "loss": 2.3317, + "step": 9759 + }, + { + "epoch": 0.8318418136878889, + "grad_norm": 32.94182007575091, + "learning_rate": 9.073110714878007e-06, + "loss": 2.8539, + "step": 9760 + }, + { + "epoch": 0.8319270433819143, + "grad_norm": 46.11219371757803, + "learning_rate": 9.07282310705883e-06, + "loss": 4.2803, + "step": 9761 + }, + { + "epoch": 0.8320122730759396, + "grad_norm": 86.20428161819245, + "learning_rate": 9.072535459185057e-06, + "loss": 3.6678, + "step": 9762 + }, + { + "epoch": 0.8320975027699651, + "grad_norm": 47.30197572362226, + "learning_rate": 9.072247771259517e-06, + "loss": 3.7834, + "step": 9763 + }, + { + "epoch": 0.8321827324639904, + "grad_norm": 34.70712881407965, + "learning_rate": 9.071960043285038e-06, + "loss": 3.5437, + "step": 9764 + }, + { + "epoch": 0.8322679621580159, + "grad_norm": 126.60918044323061, + "learning_rate": 9.071672275264449e-06, + "loss": 5.1741, + "step": 9765 + }, + { + "epoch": 0.8323531918520413, + "grad_norm": 31.00619048651289, + "learning_rate": 9.07138446720058e-06, + "loss": 4.5434, + "step": 9766 + }, + { + "epoch": 0.8324384215460666, + "grad_norm": 93.56127049868572, + "learning_rate": 9.071096619096265e-06, + "loss": 4.4315, + "step": 9767 + }, + { + "epoch": 0.8325236512400921, + "grad_norm": 53.77594787998698, + "learning_rate": 9.070808730954332e-06, + "loss": 4.0717, + "step": 9768 + }, + { + "epoch": 0.8326088809341174, + "grad_norm": 70.5976370697583, + "learning_rate": 9.070520802777612e-06, + "loss": 5.1645, + "step": 9769 + }, + { + "epoch": 0.8326941106281428, + "grad_norm": 66.64149513212409, + "learning_rate": 9.070232834568938e-06, + "loss": 3.6237, + "step": 9770 + }, + { + "epoch": 0.8327793403221683, + "grad_norm": 41.83689048445048, + "learning_rate": 9.069944826331142e-06, + "loss": 3.5604, + "step": 9771 + }, + { + "epoch": 0.8328645700161936, + "grad_norm": 68.09754863121167, + "learning_rate": 9.069656778067056e-06, + "loss": 2.9672, + "step": 9772 + }, + { + "epoch": 0.832949799710219, + "grad_norm": 71.257347525084, + "learning_rate": 9.06936868977951e-06, + "loss": 4.9358, + "step": 9773 + }, + { + "epoch": 0.8330350294042445, + "grad_norm": 36.085464804802896, + "learning_rate": 9.069080561471342e-06, + "loss": 3.6822, + "step": 9774 + }, + { + "epoch": 0.8331202590982698, + "grad_norm": 62.333047912749464, + "learning_rate": 9.068792393145382e-06, + "loss": 4.3467, + "step": 9775 + }, + { + "epoch": 0.8332054887922953, + "grad_norm": 45.36208040986968, + "learning_rate": 9.068504184804468e-06, + "loss": 4.4407, + "step": 9776 + }, + { + "epoch": 0.8332907184863206, + "grad_norm": 287.95039168799883, + "learning_rate": 9.06821593645143e-06, + "loss": 4.8374, + "step": 9777 + }, + { + "epoch": 0.833375948180346, + "grad_norm": 60.1479987359384, + "learning_rate": 9.067927648089107e-06, + "loss": 3.9015, + "step": 9778 + }, + { + "epoch": 0.8334611778743715, + "grad_norm": 103.04902037260854, + "learning_rate": 9.067639319720329e-06, + "loss": 5.6275, + "step": 9779 + }, + { + "epoch": 0.8335464075683968, + "grad_norm": 67.31395871156434, + "learning_rate": 9.067350951347937e-06, + "loss": 3.2351, + "step": 9780 + }, + { + "epoch": 0.8336316372624222, + "grad_norm": 41.22653755163007, + "learning_rate": 9.067062542974765e-06, + "loss": 3.5122, + "step": 9781 + }, + { + "epoch": 0.8337168669564476, + "grad_norm": 39.94573195324239, + "learning_rate": 9.066774094603648e-06, + "loss": 3.0813, + "step": 9782 + }, + { + "epoch": 0.833802096650473, + "grad_norm": 45.60601670707077, + "learning_rate": 9.066485606237423e-06, + "loss": 5.029, + "step": 9783 + }, + { + "epoch": 0.8338873263444985, + "grad_norm": 51.46393279108712, + "learning_rate": 9.066197077878928e-06, + "loss": 3.5208, + "step": 9784 + }, + { + "epoch": 0.8339725560385238, + "grad_norm": 53.00873158624648, + "learning_rate": 9.065908509531e-06, + "loss": 4.4693, + "step": 9785 + }, + { + "epoch": 0.8340577857325492, + "grad_norm": 51.27672289789463, + "learning_rate": 9.06561990119648e-06, + "loss": 4.0223, + "step": 9786 + }, + { + "epoch": 0.8341430154265747, + "grad_norm": 44.01435527699301, + "learning_rate": 9.065331252878203e-06, + "loss": 4.3721, + "step": 9787 + }, + { + "epoch": 0.8342282451206, + "grad_norm": 38.446724460569506, + "learning_rate": 9.065042564579008e-06, + "loss": 3.522, + "step": 9788 + }, + { + "epoch": 0.8343134748146254, + "grad_norm": 61.279545928816795, + "learning_rate": 9.064753836301732e-06, + "loss": 4.2016, + "step": 9789 + }, + { + "epoch": 0.8343987045086508, + "grad_norm": 81.71815900985254, + "learning_rate": 9.064465068049221e-06, + "loss": 5.2649, + "step": 9790 + }, + { + "epoch": 0.8344839342026762, + "grad_norm": 90.34768246310894, + "learning_rate": 9.064176259824309e-06, + "loss": 4.9898, + "step": 9791 + }, + { + "epoch": 0.8345691638967017, + "grad_norm": 39.13513434727815, + "learning_rate": 9.06388741162984e-06, + "loss": 4.1262, + "step": 9792 + }, + { + "epoch": 0.834654393590727, + "grad_norm": 58.137863100966484, + "learning_rate": 9.063598523468652e-06, + "loss": 5.0782, + "step": 9793 + }, + { + "epoch": 0.8347396232847524, + "grad_norm": 20.457556127664667, + "learning_rate": 9.063309595343588e-06, + "loss": 2.7382, + "step": 9794 + }, + { + "epoch": 0.8348248529787778, + "grad_norm": 27.60508646969135, + "learning_rate": 9.063020627257489e-06, + "loss": 2.2189, + "step": 9795 + }, + { + "epoch": 0.8349100826728032, + "grad_norm": 42.409540890513895, + "learning_rate": 9.062731619213194e-06, + "loss": 4.2556, + "step": 9796 + }, + { + "epoch": 0.8349953123668286, + "grad_norm": 26.45284821252311, + "learning_rate": 9.06244257121355e-06, + "loss": 2.9277, + "step": 9797 + }, + { + "epoch": 0.835080542060854, + "grad_norm": 43.41454957352177, + "learning_rate": 9.062153483261398e-06, + "loss": 4.09, + "step": 9798 + }, + { + "epoch": 0.8351657717548794, + "grad_norm": 48.10205787514216, + "learning_rate": 9.06186435535958e-06, + "loss": 4.1836, + "step": 9799 + }, + { + "epoch": 0.8352510014489049, + "grad_norm": 50.45429868313226, + "learning_rate": 9.061575187510941e-06, + "loss": 4.963, + "step": 9800 + }, + { + "epoch": 0.8353362311429302, + "grad_norm": 77.53234072689382, + "learning_rate": 9.061285979718322e-06, + "loss": 5.586, + "step": 9801 + }, + { + "epoch": 0.8354214608369556, + "grad_norm": 37.84361789546683, + "learning_rate": 9.060996731984571e-06, + "loss": 4.0733, + "step": 9802 + }, + { + "epoch": 0.835506690530981, + "grad_norm": 58.19408460024999, + "learning_rate": 9.06070744431253e-06, + "loss": 3.6882, + "step": 9803 + }, + { + "epoch": 0.8355919202250064, + "grad_norm": 52.949055183555885, + "learning_rate": 9.060418116705045e-06, + "loss": 4.4254, + "step": 9804 + }, + { + "epoch": 0.8356771499190317, + "grad_norm": 37.54571176500128, + "learning_rate": 9.06012874916496e-06, + "loss": 3.8055, + "step": 9805 + }, + { + "epoch": 0.8357623796130572, + "grad_norm": 36.09348214497412, + "learning_rate": 9.059839341695125e-06, + "loss": 2.9392, + "step": 9806 + }, + { + "epoch": 0.8358476093070826, + "grad_norm": 48.74805938486146, + "learning_rate": 9.059549894298381e-06, + "loss": 3.8675, + "step": 9807 + }, + { + "epoch": 0.835932839001108, + "grad_norm": 44.12618400665845, + "learning_rate": 9.059260406977576e-06, + "loss": 4.2806, + "step": 9808 + }, + { + "epoch": 0.8360180686951334, + "grad_norm": 87.14557757458967, + "learning_rate": 9.05897087973556e-06, + "loss": 3.606, + "step": 9809 + }, + { + "epoch": 0.8361032983891588, + "grad_norm": 53.1294065951142, + "learning_rate": 9.058681312575178e-06, + "loss": 3.7063, + "step": 9810 + }, + { + "epoch": 0.8361885280831842, + "grad_norm": 165.44075556339692, + "learning_rate": 9.058391705499278e-06, + "loss": 5.0697, + "step": 9811 + }, + { + "epoch": 0.8362737577772096, + "grad_norm": 45.4364074940368, + "learning_rate": 9.058102058510708e-06, + "loss": 3.8583, + "step": 9812 + }, + { + "epoch": 0.8363589874712349, + "grad_norm": 41.418340907762534, + "learning_rate": 9.057812371612315e-06, + "loss": 3.5841, + "step": 9813 + }, + { + "epoch": 0.8364442171652604, + "grad_norm": 39.29331641004044, + "learning_rate": 9.057522644806953e-06, + "loss": 3.7731, + "step": 9814 + }, + { + "epoch": 0.8365294468592858, + "grad_norm": 40.9458668861381, + "learning_rate": 9.057232878097467e-06, + "loss": 4.5723, + "step": 9815 + }, + { + "epoch": 0.8366146765533111, + "grad_norm": 42.32460135744381, + "learning_rate": 9.056943071486707e-06, + "loss": 4.2535, + "step": 9816 + }, + { + "epoch": 0.8366999062473366, + "grad_norm": 72.46659763336686, + "learning_rate": 9.056653224977525e-06, + "loss": 3.6068, + "step": 9817 + }, + { + "epoch": 0.836785135941362, + "grad_norm": 74.16849047970972, + "learning_rate": 9.05636333857277e-06, + "loss": 4.5443, + "step": 9818 + }, + { + "epoch": 0.8368703656353874, + "grad_norm": 61.37584252983151, + "learning_rate": 9.056073412275292e-06, + "loss": 5.2053, + "step": 9819 + }, + { + "epoch": 0.8369555953294128, + "grad_norm": 35.75688428347882, + "learning_rate": 9.055783446087944e-06, + "loss": 3.5642, + "step": 9820 + }, + { + "epoch": 0.8370408250234381, + "grad_norm": 54.02142392567076, + "learning_rate": 9.05549344001358e-06, + "loss": 4.0113, + "step": 9821 + }, + { + "epoch": 0.8371260547174636, + "grad_norm": 44.13731200708491, + "learning_rate": 9.055203394055047e-06, + "loss": 5.0853, + "step": 9822 + }, + { + "epoch": 0.837211284411489, + "grad_norm": 51.145847364989436, + "learning_rate": 9.0549133082152e-06, + "loss": 3.7007, + "step": 9823 + }, + { + "epoch": 0.8372965141055143, + "grad_norm": 55.24046117548866, + "learning_rate": 9.054623182496893e-06, + "loss": 4.9764, + "step": 9824 + }, + { + "epoch": 0.8373817437995398, + "grad_norm": 37.83370734450862, + "learning_rate": 9.054333016902979e-06, + "loss": 3.8136, + "step": 9825 + }, + { + "epoch": 0.8374669734935651, + "grad_norm": 41.38528705624263, + "learning_rate": 9.05404281143631e-06, + "loss": 4.1019, + "step": 9826 + }, + { + "epoch": 0.8375522031875906, + "grad_norm": 31.996872159999462, + "learning_rate": 9.05375256609974e-06, + "loss": 3.2553, + "step": 9827 + }, + { + "epoch": 0.837637432881616, + "grad_norm": 44.74833603724537, + "learning_rate": 9.053462280896125e-06, + "loss": 3.6401, + "step": 9828 + }, + { + "epoch": 0.8377226625756413, + "grad_norm": 29.208344540456366, + "learning_rate": 9.05317195582832e-06, + "loss": 3.21, + "step": 9829 + }, + { + "epoch": 0.8378078922696668, + "grad_norm": 41.18207777346146, + "learning_rate": 9.052881590899177e-06, + "loss": 4.433, + "step": 9830 + }, + { + "epoch": 0.8378931219636921, + "grad_norm": 43.229137831019045, + "learning_rate": 9.052591186111557e-06, + "loss": 2.8772, + "step": 9831 + }, + { + "epoch": 0.8379783516577175, + "grad_norm": 51.23927294583583, + "learning_rate": 9.05230074146831e-06, + "loss": 4.6496, + "step": 9832 + }, + { + "epoch": 0.838063581351743, + "grad_norm": 39.15957925366214, + "learning_rate": 9.052010256972296e-06, + "loss": 3.2985, + "step": 9833 + }, + { + "epoch": 0.8381488110457683, + "grad_norm": 37.45196733089393, + "learning_rate": 9.051719732626374e-06, + "loss": 3.7641, + "step": 9834 + }, + { + "epoch": 0.8382340407397938, + "grad_norm": 152.9834988715558, + "learning_rate": 9.051429168433396e-06, + "loss": 3.4626, + "step": 9835 + }, + { + "epoch": 0.8383192704338192, + "grad_norm": 48.47642207387035, + "learning_rate": 9.051138564396223e-06, + "loss": 4.1569, + "step": 9836 + }, + { + "epoch": 0.8384045001278445, + "grad_norm": 28.68828791148103, + "learning_rate": 9.05084792051771e-06, + "loss": 2.9752, + "step": 9837 + }, + { + "epoch": 0.83848972982187, + "grad_norm": 39.73444884918647, + "learning_rate": 9.05055723680072e-06, + "loss": 4.1611, + "step": 9838 + }, + { + "epoch": 0.8385749595158953, + "grad_norm": 45.510232386933545, + "learning_rate": 9.050266513248107e-06, + "loss": 3.8936, + "step": 9839 + }, + { + "epoch": 0.8386601892099207, + "grad_norm": 53.39496627325814, + "learning_rate": 9.049975749862733e-06, + "loss": 4.7691, + "step": 9840 + }, + { + "epoch": 0.8387454189039462, + "grad_norm": 113.67148561288673, + "learning_rate": 9.04968494664746e-06, + "loss": 4.9757, + "step": 9841 + }, + { + "epoch": 0.8388306485979715, + "grad_norm": 45.56405710058647, + "learning_rate": 9.04939410360514e-06, + "loss": 3.5434, + "step": 9842 + }, + { + "epoch": 0.838915878291997, + "grad_norm": 55.08995114163752, + "learning_rate": 9.04910322073864e-06, + "loss": 4.6349, + "step": 9843 + }, + { + "epoch": 0.8390011079860223, + "grad_norm": 52.62640232392161, + "learning_rate": 9.04881229805082e-06, + "loss": 4.2151, + "step": 9844 + }, + { + "epoch": 0.8390863376800477, + "grad_norm": 36.781370502359614, + "learning_rate": 9.048521335544537e-06, + "loss": 3.948, + "step": 9845 + }, + { + "epoch": 0.8391715673740732, + "grad_norm": 60.877918801649706, + "learning_rate": 9.048230333222658e-06, + "loss": 5.0882, + "step": 9846 + }, + { + "epoch": 0.8392567970680985, + "grad_norm": 51.70183819522719, + "learning_rate": 9.047939291088041e-06, + "loss": 5.1275, + "step": 9847 + }, + { + "epoch": 0.8393420267621239, + "grad_norm": 38.009739247896825, + "learning_rate": 9.047648209143549e-06, + "loss": 3.8378, + "step": 9848 + }, + { + "epoch": 0.8394272564561494, + "grad_norm": 40.792783450296255, + "learning_rate": 9.047357087392047e-06, + "loss": 3.9272, + "step": 9849 + }, + { + "epoch": 0.8395124861501747, + "grad_norm": 88.89849394646018, + "learning_rate": 9.047065925836394e-06, + "loss": 4.9521, + "step": 9850 + }, + { + "epoch": 0.8395977158442001, + "grad_norm": 64.52178643555085, + "learning_rate": 9.046774724479457e-06, + "loss": 4.2517, + "step": 9851 + }, + { + "epoch": 0.8396829455382255, + "grad_norm": 70.29854586492701, + "learning_rate": 9.046483483324099e-06, + "loss": 5.0765, + "step": 9852 + }, + { + "epoch": 0.8397681752322509, + "grad_norm": 40.06348288710746, + "learning_rate": 9.046192202373184e-06, + "loss": 3.7508, + "step": 9853 + }, + { + "epoch": 0.8398534049262764, + "grad_norm": 71.10670498824769, + "learning_rate": 9.045900881629575e-06, + "loss": 4.6783, + "step": 9854 + }, + { + "epoch": 0.8399386346203017, + "grad_norm": 48.12605710887641, + "learning_rate": 9.04560952109614e-06, + "loss": 4.0233, + "step": 9855 + }, + { + "epoch": 0.8400238643143271, + "grad_norm": 29.838526285559997, + "learning_rate": 9.045318120775743e-06, + "loss": 3.5586, + "step": 9856 + }, + { + "epoch": 0.8401090940083525, + "grad_norm": 39.417014487316365, + "learning_rate": 9.04502668067125e-06, + "loss": 3.9281, + "step": 9857 + }, + { + "epoch": 0.8401943237023779, + "grad_norm": 88.37332356421729, + "learning_rate": 9.044735200785525e-06, + "loss": 3.9597, + "step": 9858 + }, + { + "epoch": 0.8402795533964033, + "grad_norm": 84.52116334757952, + "learning_rate": 9.044443681121438e-06, + "loss": 4.5534, + "step": 9859 + }, + { + "epoch": 0.8403647830904287, + "grad_norm": 51.85973104506263, + "learning_rate": 9.044152121681854e-06, + "loss": 4.1536, + "step": 9860 + }, + { + "epoch": 0.8404500127844541, + "grad_norm": 81.93517870101128, + "learning_rate": 9.043860522469641e-06, + "loss": 4.5981, + "step": 9861 + }, + { + "epoch": 0.8405352424784795, + "grad_norm": 47.93239099090834, + "learning_rate": 9.043568883487667e-06, + "loss": 4.2424, + "step": 9862 + }, + { + "epoch": 0.8406204721725049, + "grad_norm": 41.28160973216145, + "learning_rate": 9.0432772047388e-06, + "loss": 3.5251, + "step": 9863 + }, + { + "epoch": 0.8407057018665303, + "grad_norm": 34.403242592335964, + "learning_rate": 9.042985486225908e-06, + "loss": 3.6479, + "step": 9864 + }, + { + "epoch": 0.8407909315605557, + "grad_norm": 28.440393675958994, + "learning_rate": 9.04269372795186e-06, + "loss": 3.6616, + "step": 9865 + }, + { + "epoch": 0.8408761612545811, + "grad_norm": 55.931493436856684, + "learning_rate": 9.042401929919523e-06, + "loss": 4.2029, + "step": 9866 + }, + { + "epoch": 0.8409613909486064, + "grad_norm": 61.863322822460454, + "learning_rate": 9.042110092131772e-06, + "loss": 4.5359, + "step": 9867 + }, + { + "epoch": 0.8410466206426319, + "grad_norm": 49.26055158382096, + "learning_rate": 9.041818214591475e-06, + "loss": 3.0367, + "step": 9868 + }, + { + "epoch": 0.8411318503366573, + "grad_norm": 74.01800161981399, + "learning_rate": 9.041526297301501e-06, + "loss": 4.1405, + "step": 9869 + }, + { + "epoch": 0.8412170800306827, + "grad_norm": 78.67333806849715, + "learning_rate": 9.041234340264724e-06, + "loss": 4.5433, + "step": 9870 + }, + { + "epoch": 0.8413023097247081, + "grad_norm": 39.140100609530904, + "learning_rate": 9.04094234348401e-06, + "loss": 3.4873, + "step": 9871 + }, + { + "epoch": 0.8413875394187335, + "grad_norm": 58.635777727312686, + "learning_rate": 9.040650306962236e-06, + "loss": 4.2346, + "step": 9872 + }, + { + "epoch": 0.8414727691127589, + "grad_norm": 30.314235848577603, + "learning_rate": 9.04035823070227e-06, + "loss": 3.3472, + "step": 9873 + }, + { + "epoch": 0.8415579988067843, + "grad_norm": 74.0616313548309, + "learning_rate": 9.040066114706988e-06, + "loss": 4.576, + "step": 9874 + }, + { + "epoch": 0.8416432285008096, + "grad_norm": 80.27641980543288, + "learning_rate": 9.03977395897926e-06, + "loss": 4.8142, + "step": 9875 + }, + { + "epoch": 0.8417284581948351, + "grad_norm": 58.838852584400655, + "learning_rate": 9.03948176352196e-06, + "loss": 3.7248, + "step": 9876 + }, + { + "epoch": 0.8418136878888605, + "grad_norm": 46.59331468631892, + "learning_rate": 9.039189528337964e-06, + "loss": 4.1695, + "step": 9877 + }, + { + "epoch": 0.8418989175828859, + "grad_norm": 112.54617933186009, + "learning_rate": 9.038897253430141e-06, + "loss": 5.3043, + "step": 9878 + }, + { + "epoch": 0.8419841472769113, + "grad_norm": 62.71734481776431, + "learning_rate": 9.03860493880137e-06, + "loss": 4.8635, + "step": 9879 + }, + { + "epoch": 0.8420693769709366, + "grad_norm": 65.36879836391547, + "learning_rate": 9.038312584454525e-06, + "loss": 3.9186, + "step": 9880 + }, + { + "epoch": 0.8421546066649621, + "grad_norm": 35.510820415272434, + "learning_rate": 9.03802019039248e-06, + "loss": 3.9808, + "step": 9881 + }, + { + "epoch": 0.8422398363589875, + "grad_norm": 46.93212929498806, + "learning_rate": 9.037727756618111e-06, + "loss": 3.9041, + "step": 9882 + }, + { + "epoch": 0.8423250660530128, + "grad_norm": 63.00970994466689, + "learning_rate": 9.037435283134293e-06, + "loss": 4.5169, + "step": 9883 + }, + { + "epoch": 0.8424102957470383, + "grad_norm": 34.816738728949716, + "learning_rate": 9.037142769943903e-06, + "loss": 3.2702, + "step": 9884 + }, + { + "epoch": 0.8424955254410637, + "grad_norm": 37.98961131169055, + "learning_rate": 9.036850217049819e-06, + "loss": 4.062, + "step": 9885 + }, + { + "epoch": 0.842580755135089, + "grad_norm": 40.054606928225006, + "learning_rate": 9.036557624454918e-06, + "loss": 4.186, + "step": 9886 + }, + { + "epoch": 0.8426659848291145, + "grad_norm": 38.26638212649672, + "learning_rate": 9.036264992162075e-06, + "loss": 3.4772, + "step": 9887 + }, + { + "epoch": 0.8427512145231398, + "grad_norm": 37.15689948892702, + "learning_rate": 9.035972320174171e-06, + "loss": 4.2182, + "step": 9888 + }, + { + "epoch": 0.8428364442171653, + "grad_norm": 95.2906735260043, + "learning_rate": 9.035679608494083e-06, + "loss": 5.7251, + "step": 9889 + }, + { + "epoch": 0.8429216739111907, + "grad_norm": 64.08171173476714, + "learning_rate": 9.035386857124687e-06, + "loss": 4.1331, + "step": 9890 + }, + { + "epoch": 0.843006903605216, + "grad_norm": 55.361658083383865, + "learning_rate": 9.035094066068868e-06, + "loss": 4.9469, + "step": 9891 + }, + { + "epoch": 0.8430921332992415, + "grad_norm": 24.7546443702798, + "learning_rate": 9.034801235329502e-06, + "loss": 3.0935, + "step": 9892 + }, + { + "epoch": 0.8431773629932668, + "grad_norm": 59.39268321926329, + "learning_rate": 9.034508364909468e-06, + "loss": 4.5693, + "step": 9893 + }, + { + "epoch": 0.8432625926872922, + "grad_norm": 42.2217673615603, + "learning_rate": 9.034215454811647e-06, + "loss": 4.1573, + "step": 9894 + }, + { + "epoch": 0.8433478223813177, + "grad_norm": 33.53062062017795, + "learning_rate": 9.033922505038921e-06, + "loss": 4.3671, + "step": 9895 + }, + { + "epoch": 0.843433052075343, + "grad_norm": 56.75021624240357, + "learning_rate": 9.03362951559417e-06, + "loss": 3.5161, + "step": 9896 + }, + { + "epoch": 0.8435182817693685, + "grad_norm": 34.840044082199405, + "learning_rate": 9.033336486480275e-06, + "loss": 2.7702, + "step": 9897 + }, + { + "epoch": 0.8436035114633939, + "grad_norm": 48.46116468026635, + "learning_rate": 9.03304341770012e-06, + "loss": 3.3648, + "step": 9898 + }, + { + "epoch": 0.8436887411574192, + "grad_norm": 45.44856832906552, + "learning_rate": 9.032750309256583e-06, + "loss": 3.9098, + "step": 9899 + }, + { + "epoch": 0.8437739708514447, + "grad_norm": 77.75928546061392, + "learning_rate": 9.032457161152551e-06, + "loss": 4.1197, + "step": 9900 + }, + { + "epoch": 0.84385920054547, + "grad_norm": 34.43188312603491, + "learning_rate": 9.032163973390903e-06, + "loss": 3.8668, + "step": 9901 + }, + { + "epoch": 0.8439444302394954, + "grad_norm": 38.589663668356195, + "learning_rate": 9.031870745974527e-06, + "loss": 3.5278, + "step": 9902 + }, + { + "epoch": 0.8440296599335209, + "grad_norm": 42.51190765450824, + "learning_rate": 9.031577478906303e-06, + "loss": 3.6287, + "step": 9903 + }, + { + "epoch": 0.8441148896275462, + "grad_norm": 40.543317134060295, + "learning_rate": 9.031284172189117e-06, + "loss": 4.1511, + "step": 9904 + }, + { + "epoch": 0.8442001193215717, + "grad_norm": 70.71442664697439, + "learning_rate": 9.030990825825854e-06, + "loss": 6.0401, + "step": 9905 + }, + { + "epoch": 0.844285349015597, + "grad_norm": 44.85033489837379, + "learning_rate": 9.030697439819396e-06, + "loss": 3.0068, + "step": 9906 + }, + { + "epoch": 0.8443705787096224, + "grad_norm": 44.08076523932327, + "learning_rate": 9.030404014172632e-06, + "loss": 2.3902, + "step": 9907 + }, + { + "epoch": 0.8444558084036479, + "grad_norm": 343.40614228898477, + "learning_rate": 9.030110548888445e-06, + "loss": 4.2036, + "step": 9908 + }, + { + "epoch": 0.8445410380976732, + "grad_norm": 87.94529520318093, + "learning_rate": 9.029817043969722e-06, + "loss": 3.6897, + "step": 9909 + }, + { + "epoch": 0.8446262677916986, + "grad_norm": 47.660973619654925, + "learning_rate": 9.02952349941935e-06, + "loss": 5.267, + "step": 9910 + }, + { + "epoch": 0.844711497485724, + "grad_norm": 58.91574365818134, + "learning_rate": 9.029229915240217e-06, + "loss": 4.4974, + "step": 9911 + }, + { + "epoch": 0.8447967271797494, + "grad_norm": 81.95810964115144, + "learning_rate": 9.028936291435207e-06, + "loss": 4.7919, + "step": 9912 + }, + { + "epoch": 0.8448819568737749, + "grad_norm": 122.0312665416512, + "learning_rate": 9.028642628007209e-06, + "loss": 5.3355, + "step": 9913 + }, + { + "epoch": 0.8449671865678002, + "grad_norm": 54.533943478890784, + "learning_rate": 9.028348924959114e-06, + "loss": 3.8385, + "step": 9914 + }, + { + "epoch": 0.8450524162618256, + "grad_norm": 47.68600802849665, + "learning_rate": 9.028055182293806e-06, + "loss": 2.6887, + "step": 9915 + }, + { + "epoch": 0.8451376459558511, + "grad_norm": 47.581294542283466, + "learning_rate": 9.027761400014176e-06, + "loss": 4.3103, + "step": 9916 + }, + { + "epoch": 0.8452228756498764, + "grad_norm": 34.00180902027831, + "learning_rate": 9.027467578123115e-06, + "loss": 3.4308, + "step": 9917 + }, + { + "epoch": 0.8453081053439018, + "grad_norm": 71.22096519326468, + "learning_rate": 9.027173716623509e-06, + "loss": 4.3735, + "step": 9918 + }, + { + "epoch": 0.8453933350379272, + "grad_norm": 33.957747962555445, + "learning_rate": 9.02687981551825e-06, + "loss": 3.9159, + "step": 9919 + }, + { + "epoch": 0.8454785647319526, + "grad_norm": 53.83457375332956, + "learning_rate": 9.026585874810227e-06, + "loss": 5.0282, + "step": 9920 + }, + { + "epoch": 0.8455637944259781, + "grad_norm": 63.660276916570254, + "learning_rate": 9.026291894502334e-06, + "loss": 5.254, + "step": 9921 + }, + { + "epoch": 0.8456490241200034, + "grad_norm": 106.40928861048312, + "learning_rate": 9.025997874597459e-06, + "loss": 3.5736, + "step": 9922 + }, + { + "epoch": 0.8457342538140288, + "grad_norm": 54.759163710350414, + "learning_rate": 9.025703815098496e-06, + "loss": 3.321, + "step": 9923 + }, + { + "epoch": 0.8458194835080542, + "grad_norm": 54.46225581996852, + "learning_rate": 9.025409716008332e-06, + "loss": 4.7006, + "step": 9924 + }, + { + "epoch": 0.8459047132020796, + "grad_norm": 61.053007787712396, + "learning_rate": 9.025115577329866e-06, + "loss": 3.6143, + "step": 9925 + }, + { + "epoch": 0.845989942896105, + "grad_norm": 72.74592312808842, + "learning_rate": 9.024821399065988e-06, + "loss": 5.3985, + "step": 9926 + }, + { + "epoch": 0.8460751725901304, + "grad_norm": 97.83343212317092, + "learning_rate": 9.02452718121959e-06, + "loss": 4.7339, + "step": 9927 + }, + { + "epoch": 0.8461604022841558, + "grad_norm": 88.82519911676816, + "learning_rate": 9.024232923793568e-06, + "loss": 4.9796, + "step": 9928 + }, + { + "epoch": 0.8462456319781811, + "grad_norm": 34.9306429004379, + "learning_rate": 9.02393862679081e-06, + "loss": 4.2105, + "step": 9929 + }, + { + "epoch": 0.8463308616722066, + "grad_norm": 404.1637561147887, + "learning_rate": 9.02364429021422e-06, + "loss": 4.529, + "step": 9930 + }, + { + "epoch": 0.846416091366232, + "grad_norm": 64.56680589994025, + "learning_rate": 9.023349914066684e-06, + "loss": 3.8273, + "step": 9931 + }, + { + "epoch": 0.8465013210602574, + "grad_norm": 43.291685441659666, + "learning_rate": 9.023055498351101e-06, + "loss": 3.7518, + "step": 9932 + }, + { + "epoch": 0.8465865507542828, + "grad_norm": 62.64878950186746, + "learning_rate": 9.022761043070364e-06, + "loss": 4.1654, + "step": 9933 + }, + { + "epoch": 0.8466717804483082, + "grad_norm": 59.521116122792215, + "learning_rate": 9.022466548227373e-06, + "loss": 4.5035, + "step": 9934 + }, + { + "epoch": 0.8467570101423336, + "grad_norm": 67.42801518701054, + "learning_rate": 9.022172013825023e-06, + "loss": 4.4626, + "step": 9935 + }, + { + "epoch": 0.846842239836359, + "grad_norm": 33.67455687627578, + "learning_rate": 9.021877439866206e-06, + "loss": 2.1747, + "step": 9936 + }, + { + "epoch": 0.8469274695303843, + "grad_norm": 32.85554057140279, + "learning_rate": 9.021582826353825e-06, + "loss": 3.8064, + "step": 9937 + }, + { + "epoch": 0.8470126992244098, + "grad_norm": 78.2688531969134, + "learning_rate": 9.021288173290774e-06, + "loss": 4.4419, + "step": 9938 + }, + { + "epoch": 0.8470979289184352, + "grad_norm": 42.025817582478, + "learning_rate": 9.020993480679952e-06, + "loss": 3.397, + "step": 9939 + }, + { + "epoch": 0.8471831586124606, + "grad_norm": 33.28257925826595, + "learning_rate": 9.020698748524257e-06, + "loss": 3.6527, + "step": 9940 + }, + { + "epoch": 0.847268388306486, + "grad_norm": 48.087324650602895, + "learning_rate": 9.020403976826587e-06, + "loss": 4.3774, + "step": 9941 + }, + { + "epoch": 0.8473536180005113, + "grad_norm": 99.1875236221382, + "learning_rate": 9.02010916558984e-06, + "loss": 4.3718, + "step": 9942 + }, + { + "epoch": 0.8474388476945368, + "grad_norm": 29.69793641369024, + "learning_rate": 9.01981431481692e-06, + "loss": 2.3597, + "step": 9943 + }, + { + "epoch": 0.8475240773885622, + "grad_norm": 68.64665475071575, + "learning_rate": 9.01951942451072e-06, + "loss": 5.1175, + "step": 9944 + }, + { + "epoch": 0.8476093070825875, + "grad_norm": 77.12153718132403, + "learning_rate": 9.019224494674149e-06, + "loss": 4.9123, + "step": 9945 + }, + { + "epoch": 0.847694536776613, + "grad_norm": 46.99233771421303, + "learning_rate": 9.018929525310097e-06, + "loss": 4.8434, + "step": 9946 + }, + { + "epoch": 0.8477797664706384, + "grad_norm": 64.37414463418021, + "learning_rate": 9.018634516421472e-06, + "loss": 5.2118, + "step": 9947 + }, + { + "epoch": 0.8478649961646638, + "grad_norm": 48.35618219632182, + "learning_rate": 9.018339468011173e-06, + "loss": 3.6262, + "step": 9948 + }, + { + "epoch": 0.8479502258586892, + "grad_norm": 80.1246412125775, + "learning_rate": 9.018044380082102e-06, + "loss": 6.0234, + "step": 9949 + }, + { + "epoch": 0.8480354555527145, + "grad_norm": 36.12441310887027, + "learning_rate": 9.017749252637162e-06, + "loss": 3.045, + "step": 9950 + }, + { + "epoch": 0.84812068524674, + "grad_norm": 59.97612555596453, + "learning_rate": 9.017454085679253e-06, + "loss": 4.4869, + "step": 9951 + }, + { + "epoch": 0.8482059149407654, + "grad_norm": 36.47836963147053, + "learning_rate": 9.017158879211282e-06, + "loss": 3.7965, + "step": 9952 + }, + { + "epoch": 0.8482911446347907, + "grad_norm": 43.203484924648755, + "learning_rate": 9.016863633236148e-06, + "loss": 3.6471, + "step": 9953 + }, + { + "epoch": 0.8483763743288162, + "grad_norm": 40.497064749985974, + "learning_rate": 9.016568347756756e-06, + "loss": 3.7042, + "step": 9954 + }, + { + "epoch": 0.8484616040228415, + "grad_norm": 32.6600795931815, + "learning_rate": 9.016273022776013e-06, + "loss": 3.6168, + "step": 9955 + }, + { + "epoch": 0.848546833716867, + "grad_norm": 137.14945676899364, + "learning_rate": 9.015977658296817e-06, + "loss": 4.3022, + "step": 9956 + }, + { + "epoch": 0.8486320634108924, + "grad_norm": 73.93451669663072, + "learning_rate": 9.015682254322078e-06, + "loss": 4.4728, + "step": 9957 + }, + { + "epoch": 0.8487172931049177, + "grad_norm": 97.36586058561856, + "learning_rate": 9.0153868108547e-06, + "loss": 5.0569, + "step": 9958 + }, + { + "epoch": 0.8488025227989432, + "grad_norm": 36.48168724277909, + "learning_rate": 9.015091327897587e-06, + "loss": 2.9813, + "step": 9959 + }, + { + "epoch": 0.8488877524929686, + "grad_norm": 59.331553615452904, + "learning_rate": 9.014795805453648e-06, + "loss": 4.7777, + "step": 9960 + }, + { + "epoch": 0.8489729821869939, + "grad_norm": 37.21928162111068, + "learning_rate": 9.014500243525787e-06, + "loss": 3.5148, + "step": 9961 + }, + { + "epoch": 0.8490582118810194, + "grad_norm": 194.81289133850925, + "learning_rate": 9.014204642116912e-06, + "loss": 3.0092, + "step": 9962 + }, + { + "epoch": 0.8491434415750447, + "grad_norm": 68.57511718804793, + "learning_rate": 9.013909001229927e-06, + "loss": 5.4805, + "step": 9963 + }, + { + "epoch": 0.8492286712690701, + "grad_norm": 82.36897491447458, + "learning_rate": 9.013613320867744e-06, + "loss": 4.2465, + "step": 9964 + }, + { + "epoch": 0.8493139009630956, + "grad_norm": 33.036090600153535, + "learning_rate": 9.013317601033268e-06, + "loss": 4.2929, + "step": 9965 + }, + { + "epoch": 0.8493991306571209, + "grad_norm": 43.682593021909156, + "learning_rate": 9.01302184172941e-06, + "loss": 4.3884, + "step": 9966 + }, + { + "epoch": 0.8494843603511464, + "grad_norm": 26.745971609142675, + "learning_rate": 9.012726042959075e-06, + "loss": 3.4901, + "step": 9967 + }, + { + "epoch": 0.8495695900451717, + "grad_norm": 52.348274632139336, + "learning_rate": 9.012430204725174e-06, + "loss": 3.9652, + "step": 9968 + }, + { + "epoch": 0.8496548197391971, + "grad_norm": 44.294752505805626, + "learning_rate": 9.012134327030618e-06, + "loss": 3.7067, + "step": 9969 + }, + { + "epoch": 0.8497400494332226, + "grad_norm": 46.33237657211083, + "learning_rate": 9.011838409878313e-06, + "loss": 3.3518, + "step": 9970 + }, + { + "epoch": 0.8498252791272479, + "grad_norm": 42.58196930634531, + "learning_rate": 9.011542453271172e-06, + "loss": 4.0746, + "step": 9971 + }, + { + "epoch": 0.8499105088212733, + "grad_norm": 38.63094524661202, + "learning_rate": 9.011246457212103e-06, + "loss": 4.2367, + "step": 9972 + }, + { + "epoch": 0.8499957385152987, + "grad_norm": 36.53800311308925, + "learning_rate": 9.010950421704022e-06, + "loss": 3.432, + "step": 9973 + }, + { + "epoch": 0.8500809682093241, + "grad_norm": 33.530483734454045, + "learning_rate": 9.010654346749836e-06, + "loss": 3.7144, + "step": 9974 + }, + { + "epoch": 0.8501661979033496, + "grad_norm": 30.964491174754517, + "learning_rate": 9.01035823235246e-06, + "loss": 3.1956, + "step": 9975 + }, + { + "epoch": 0.8502514275973749, + "grad_norm": 112.58399630242705, + "learning_rate": 9.010062078514802e-06, + "loss": 5.5536, + "step": 9976 + }, + { + "epoch": 0.8503366572914003, + "grad_norm": 54.98230507313532, + "learning_rate": 9.009765885239778e-06, + "loss": 3.1533, + "step": 9977 + }, + { + "epoch": 0.8504218869854258, + "grad_norm": 35.9261927040606, + "learning_rate": 9.0094696525303e-06, + "loss": 2.2045, + "step": 9978 + }, + { + "epoch": 0.8505071166794511, + "grad_norm": 47.05623556029963, + "learning_rate": 9.00917338038928e-06, + "loss": 4.2403, + "step": 9979 + }, + { + "epoch": 0.8505923463734765, + "grad_norm": 36.12174130182421, + "learning_rate": 9.008877068819634e-06, + "loss": 3.1165, + "step": 9980 + }, + { + "epoch": 0.8506775760675019, + "grad_norm": 70.24894656704285, + "learning_rate": 9.008580717824276e-06, + "loss": 3.9336, + "step": 9981 + }, + { + "epoch": 0.8507628057615273, + "grad_norm": 46.659684877155854, + "learning_rate": 9.008284327406118e-06, + "loss": 3.4791, + "step": 9982 + }, + { + "epoch": 0.8508480354555528, + "grad_norm": 37.15134639035996, + "learning_rate": 9.007987897568077e-06, + "loss": 3.7767, + "step": 9983 + }, + { + "epoch": 0.8509332651495781, + "grad_norm": 34.41643126483846, + "learning_rate": 9.007691428313068e-06, + "loss": 2.5393, + "step": 9984 + }, + { + "epoch": 0.8510184948436035, + "grad_norm": 88.459871901333, + "learning_rate": 9.007394919644004e-06, + "loss": 3.9144, + "step": 9985 + }, + { + "epoch": 0.851103724537629, + "grad_norm": 72.72947277818378, + "learning_rate": 9.007098371563806e-06, + "loss": 1.8656, + "step": 9986 + }, + { + "epoch": 0.8511889542316543, + "grad_norm": 43.30599284207871, + "learning_rate": 9.006801784075388e-06, + "loss": 4.3295, + "step": 9987 + }, + { + "epoch": 0.8512741839256797, + "grad_norm": 38.70469691425848, + "learning_rate": 9.006505157181664e-06, + "loss": 4.4521, + "step": 9988 + }, + { + "epoch": 0.8513594136197051, + "grad_norm": 83.73180683090922, + "learning_rate": 9.006208490885555e-06, + "loss": 5.2546, + "step": 9989 + }, + { + "epoch": 0.8514446433137305, + "grad_norm": 74.00995301452419, + "learning_rate": 9.005911785189979e-06, + "loss": 4.6481, + "step": 9990 + }, + { + "epoch": 0.851529873007756, + "grad_norm": 69.0171462321386, + "learning_rate": 9.00561504009785e-06, + "loss": 3.9876, + "step": 9991 + }, + { + "epoch": 0.8516151027017813, + "grad_norm": 105.15048251818516, + "learning_rate": 9.00531825561209e-06, + "loss": 4.3297, + "step": 9992 + }, + { + "epoch": 0.8517003323958067, + "grad_norm": 55.482050223077266, + "learning_rate": 9.005021431735617e-06, + "loss": 5.1063, + "step": 9993 + }, + { + "epoch": 0.8517855620898321, + "grad_norm": 25.73626119271586, + "learning_rate": 9.004724568471347e-06, + "loss": 3.0959, + "step": 9994 + }, + { + "epoch": 0.8518707917838575, + "grad_norm": 50.51437905622518, + "learning_rate": 9.004427665822203e-06, + "loss": 4.3293, + "step": 9995 + }, + { + "epoch": 0.8519560214778829, + "grad_norm": 75.02431679873908, + "learning_rate": 9.004130723791106e-06, + "loss": 4.6237, + "step": 9996 + }, + { + "epoch": 0.8520412511719083, + "grad_norm": 39.75198452108586, + "learning_rate": 9.003833742380973e-06, + "loss": 3.8042, + "step": 9997 + }, + { + "epoch": 0.8521264808659337, + "grad_norm": 55.34278466901574, + "learning_rate": 9.003536721594728e-06, + "loss": 4.8314, + "step": 9998 + }, + { + "epoch": 0.852211710559959, + "grad_norm": 56.99414867705067, + "learning_rate": 9.00323966143529e-06, + "loss": 3.8571, + "step": 9999 + }, + { + "epoch": 0.8522969402539845, + "grad_norm": 27.932048150497337, + "learning_rate": 9.002942561905579e-06, + "loss": 3.0481, + "step": 10000 + }, + { + "epoch": 0.8523821699480099, + "grad_norm": 72.85331630788589, + "learning_rate": 9.002645423008519e-06, + "loss": 4.6068, + "step": 10001 + }, + { + "epoch": 0.8524673996420353, + "grad_norm": 69.75885491043229, + "learning_rate": 9.002348244747033e-06, + "loss": 4.0009, + "step": 10002 + }, + { + "epoch": 0.8525526293360607, + "grad_norm": 44.0495990745093, + "learning_rate": 9.002051027124039e-06, + "loss": 4.4333, + "step": 10003 + }, + { + "epoch": 0.852637859030086, + "grad_norm": 37.06381535266376, + "learning_rate": 9.001753770142466e-06, + "loss": 3.8773, + "step": 10004 + }, + { + "epoch": 0.8527230887241115, + "grad_norm": 38.160560418406675, + "learning_rate": 9.001456473805235e-06, + "loss": 4.0647, + "step": 10005 + }, + { + "epoch": 0.8528083184181369, + "grad_norm": 33.80755861841109, + "learning_rate": 9.00115913811527e-06, + "loss": 4.1112, + "step": 10006 + }, + { + "epoch": 0.8528935481121622, + "grad_norm": 53.89386398111997, + "learning_rate": 9.000861763075493e-06, + "loss": 4.3088, + "step": 10007 + }, + { + "epoch": 0.8529787778061877, + "grad_norm": 80.14840917185268, + "learning_rate": 9.000564348688832e-06, + "loss": 4.5126, + "step": 10008 + }, + { + "epoch": 0.853064007500213, + "grad_norm": 62.40607281704602, + "learning_rate": 9.00026689495821e-06, + "loss": 3.8257, + "step": 10009 + }, + { + "epoch": 0.8531492371942385, + "grad_norm": 97.766028088919, + "learning_rate": 8.999969401886551e-06, + "loss": 3.982, + "step": 10010 + }, + { + "epoch": 0.8532344668882639, + "grad_norm": 39.53949333087845, + "learning_rate": 8.999671869476783e-06, + "loss": 3.7879, + "step": 10011 + }, + { + "epoch": 0.8533196965822892, + "grad_norm": 55.37037734758711, + "learning_rate": 8.999374297731831e-06, + "loss": 4.2571, + "step": 10012 + }, + { + "epoch": 0.8534049262763147, + "grad_norm": 42.314329554904205, + "learning_rate": 8.999076686654624e-06, + "loss": 4.0153, + "step": 10013 + }, + { + "epoch": 0.8534901559703401, + "grad_norm": 114.70927690421351, + "learning_rate": 8.998779036248086e-06, + "loss": 5.3289, + "step": 10014 + }, + { + "epoch": 0.8535753856643654, + "grad_norm": 45.3778713535547, + "learning_rate": 8.998481346515144e-06, + "loss": 3.7911, + "step": 10015 + }, + { + "epoch": 0.8536606153583909, + "grad_norm": 51.89640305046642, + "learning_rate": 8.99818361745873e-06, + "loss": 2.2976, + "step": 10016 + }, + { + "epoch": 0.8537458450524162, + "grad_norm": 58.25377084091259, + "learning_rate": 8.997885849081765e-06, + "loss": 4.0585, + "step": 10017 + }, + { + "epoch": 0.8538310747464417, + "grad_norm": 39.38066888630656, + "learning_rate": 8.997588041387182e-06, + "loss": 3.386, + "step": 10018 + }, + { + "epoch": 0.8539163044404671, + "grad_norm": 34.001081194195805, + "learning_rate": 8.99729019437791e-06, + "loss": 3.3138, + "step": 10019 + }, + { + "epoch": 0.8540015341344924, + "grad_norm": 69.34996338491035, + "learning_rate": 8.996992308056877e-06, + "loss": 3.3205, + "step": 10020 + }, + { + "epoch": 0.8540867638285179, + "grad_norm": 142.1676083996533, + "learning_rate": 8.996694382427015e-06, + "loss": 4.3966, + "step": 10021 + }, + { + "epoch": 0.8541719935225432, + "grad_norm": 30.388821169725112, + "learning_rate": 8.99639641749125e-06, + "loss": 2.9948, + "step": 10022 + }, + { + "epoch": 0.8542572232165686, + "grad_norm": 39.61315659254151, + "learning_rate": 8.996098413252514e-06, + "loss": 3.4885, + "step": 10023 + }, + { + "epoch": 0.8543424529105941, + "grad_norm": 46.70257772837448, + "learning_rate": 8.995800369713738e-06, + "loss": 3.4291, + "step": 10024 + }, + { + "epoch": 0.8544276826046194, + "grad_norm": 38.84114138006464, + "learning_rate": 8.995502286877854e-06, + "loss": 2.6938, + "step": 10025 + }, + { + "epoch": 0.8545129122986449, + "grad_norm": 70.81341202080719, + "learning_rate": 8.995204164747792e-06, + "loss": 4.9012, + "step": 10026 + }, + { + "epoch": 0.8545981419926703, + "grad_norm": 60.86266989474801, + "learning_rate": 8.994906003326485e-06, + "loss": 4.3227, + "step": 10027 + }, + { + "epoch": 0.8546833716866956, + "grad_norm": 58.33366592931789, + "learning_rate": 8.994607802616863e-06, + "loss": 4.931, + "step": 10028 + }, + { + "epoch": 0.8547686013807211, + "grad_norm": 34.43601332639176, + "learning_rate": 8.994309562621863e-06, + "loss": 2.3105, + "step": 10029 + }, + { + "epoch": 0.8548538310747464, + "grad_norm": 45.333751194153194, + "learning_rate": 8.994011283344416e-06, + "loss": 4.1372, + "step": 10030 + }, + { + "epoch": 0.8549390607687718, + "grad_norm": 37.26864797284089, + "learning_rate": 8.993712964787453e-06, + "loss": 3.4149, + "step": 10031 + }, + { + "epoch": 0.8550242904627973, + "grad_norm": 46.740334203084224, + "learning_rate": 8.993414606953909e-06, + "loss": 5.2729, + "step": 10032 + }, + { + "epoch": 0.8551095201568226, + "grad_norm": 31.162586874894373, + "learning_rate": 8.993116209846721e-06, + "loss": 4.1972, + "step": 10033 + }, + { + "epoch": 0.8551947498508481, + "grad_norm": 34.88306510599483, + "learning_rate": 8.992817773468823e-06, + "loss": 3.3296, + "step": 10034 + }, + { + "epoch": 0.8552799795448734, + "grad_norm": 35.63303905168673, + "learning_rate": 8.992519297823148e-06, + "loss": 4.0159, + "step": 10035 + }, + { + "epoch": 0.8553652092388988, + "grad_norm": 66.999033390413, + "learning_rate": 8.992220782912633e-06, + "loss": 4.9019, + "step": 10036 + }, + { + "epoch": 0.8554504389329243, + "grad_norm": 32.89975953585537, + "learning_rate": 8.99192222874021e-06, + "loss": 3.9141, + "step": 10037 + }, + { + "epoch": 0.8555356686269496, + "grad_norm": 33.42449530035779, + "learning_rate": 8.991623635308819e-06, + "loss": 2.6588, + "step": 10038 + }, + { + "epoch": 0.855620898320975, + "grad_norm": 40.33860534955243, + "learning_rate": 8.991325002621397e-06, + "loss": 2.9414, + "step": 10039 + }, + { + "epoch": 0.8557061280150005, + "grad_norm": 276.8318973724485, + "learning_rate": 8.991026330680879e-06, + "loss": 4.1562, + "step": 10040 + }, + { + "epoch": 0.8557913577090258, + "grad_norm": 29.331619940141817, + "learning_rate": 8.990727619490203e-06, + "loss": 3.8041, + "step": 10041 + }, + { + "epoch": 0.8558765874030512, + "grad_norm": 59.549690064596405, + "learning_rate": 8.990428869052307e-06, + "loss": 3.0694, + "step": 10042 + }, + { + "epoch": 0.8559618170970766, + "grad_norm": 57.98155947102173, + "learning_rate": 8.990130079370127e-06, + "loss": 5.7615, + "step": 10043 + }, + { + "epoch": 0.856047046791102, + "grad_norm": 47.142064058720436, + "learning_rate": 8.989831250446605e-06, + "loss": 4.2719, + "step": 10044 + }, + { + "epoch": 0.8561322764851275, + "grad_norm": 56.10213013093515, + "learning_rate": 8.989532382284678e-06, + "loss": 4.0931, + "step": 10045 + }, + { + "epoch": 0.8562175061791528, + "grad_norm": 38.12737323157151, + "learning_rate": 8.989233474887284e-06, + "loss": 3.4626, + "step": 10046 + }, + { + "epoch": 0.8563027358731782, + "grad_norm": 42.147861477358084, + "learning_rate": 8.988934528257365e-06, + "loss": 3.9949, + "step": 10047 + }, + { + "epoch": 0.8563879655672036, + "grad_norm": 61.50055435842744, + "learning_rate": 8.988635542397861e-06, + "loss": 5.005, + "step": 10048 + }, + { + "epoch": 0.856473195261229, + "grad_norm": 30.24147197067601, + "learning_rate": 8.98833651731171e-06, + "loss": 3.8048, + "step": 10049 + }, + { + "epoch": 0.8565584249552544, + "grad_norm": 45.275341482211154, + "learning_rate": 8.988037453001854e-06, + "loss": 4.1141, + "step": 10050 + }, + { + "epoch": 0.8566436546492798, + "grad_norm": 27.505686001859956, + "learning_rate": 8.987738349471235e-06, + "loss": 3.4856, + "step": 10051 + }, + { + "epoch": 0.8567288843433052, + "grad_norm": 59.39222754086637, + "learning_rate": 8.987439206722793e-06, + "loss": 4.1599, + "step": 10052 + }, + { + "epoch": 0.8568141140373307, + "grad_norm": 44.28025829326886, + "learning_rate": 8.987140024759471e-06, + "loss": 3.5549, + "step": 10053 + }, + { + "epoch": 0.856899343731356, + "grad_norm": 33.73970491714685, + "learning_rate": 8.986840803584212e-06, + "loss": 3.3191, + "step": 10054 + }, + { + "epoch": 0.8569845734253814, + "grad_norm": 41.2622666887411, + "learning_rate": 8.986541543199957e-06, + "loss": 4.7056, + "step": 10055 + }, + { + "epoch": 0.8570698031194068, + "grad_norm": 49.24561649621777, + "learning_rate": 8.98624224360965e-06, + "loss": 3.8593, + "step": 10056 + }, + { + "epoch": 0.8571550328134322, + "grad_norm": 54.68850169520663, + "learning_rate": 8.985942904816235e-06, + "loss": 4.3665, + "step": 10057 + }, + { + "epoch": 0.8572402625074576, + "grad_norm": 56.217097371532724, + "learning_rate": 8.985643526822657e-06, + "loss": 3.5929, + "step": 10058 + }, + { + "epoch": 0.857325492201483, + "grad_norm": 44.22978714039422, + "learning_rate": 8.985344109631856e-06, + "loss": 3.0349, + "step": 10059 + }, + { + "epoch": 0.8574107218955084, + "grad_norm": 42.863427793801804, + "learning_rate": 8.985044653246781e-06, + "loss": 5.5205, + "step": 10060 + }, + { + "epoch": 0.8574959515895338, + "grad_norm": 59.64582287373757, + "learning_rate": 8.984745157670374e-06, + "loss": 3.8251, + "step": 10061 + }, + { + "epoch": 0.8575811812835592, + "grad_norm": 27.603161946103867, + "learning_rate": 8.984445622905583e-06, + "loss": 3.2288, + "step": 10062 + }, + { + "epoch": 0.8576664109775846, + "grad_norm": 22.76471277603925, + "learning_rate": 8.984146048955352e-06, + "loss": 3.8136, + "step": 10063 + }, + { + "epoch": 0.85775164067161, + "grad_norm": 57.07581137431777, + "learning_rate": 8.983846435822627e-06, + "loss": 4.4319, + "step": 10064 + }, + { + "epoch": 0.8578368703656354, + "grad_norm": 48.20598053674434, + "learning_rate": 8.983546783510357e-06, + "loss": 3.9306, + "step": 10065 + }, + { + "epoch": 0.8579221000596607, + "grad_norm": 82.5611902369194, + "learning_rate": 8.983247092021486e-06, + "loss": 6.3573, + "step": 10066 + }, + { + "epoch": 0.8580073297536862, + "grad_norm": 45.16046901736582, + "learning_rate": 8.982947361358962e-06, + "loss": 4.332, + "step": 10067 + }, + { + "epoch": 0.8580925594477116, + "grad_norm": 42.68546142961108, + "learning_rate": 8.982647591525734e-06, + "loss": 4.1983, + "step": 10068 + }, + { + "epoch": 0.858177789141737, + "grad_norm": 37.44270626957084, + "learning_rate": 8.982347782524752e-06, + "loss": 3.5798, + "step": 10069 + }, + { + "epoch": 0.8582630188357624, + "grad_norm": 46.70813469898316, + "learning_rate": 8.98204793435896e-06, + "loss": 4.409, + "step": 10070 + }, + { + "epoch": 0.8583482485297877, + "grad_norm": 33.59975639165927, + "learning_rate": 8.981748047031308e-06, + "loss": 3.5723, + "step": 10071 + }, + { + "epoch": 0.8584334782238132, + "grad_norm": 38.18759574135594, + "learning_rate": 8.981448120544747e-06, + "loss": 3.3534, + "step": 10072 + }, + { + "epoch": 0.8585187079178386, + "grad_norm": 33.62936915052215, + "learning_rate": 8.981148154902225e-06, + "loss": 3.7255, + "step": 10073 + }, + { + "epoch": 0.8586039376118639, + "grad_norm": 30.49321692355622, + "learning_rate": 8.980848150106693e-06, + "loss": 3.1297, + "step": 10074 + }, + { + "epoch": 0.8586891673058894, + "grad_norm": 120.43493714760992, + "learning_rate": 8.980548106161102e-06, + "loss": 4.6574, + "step": 10075 + }, + { + "epoch": 0.8587743969999148, + "grad_norm": 36.81808549733623, + "learning_rate": 8.980248023068402e-06, + "loss": 4.0051, + "step": 10076 + }, + { + "epoch": 0.8588596266939401, + "grad_norm": 70.43173335482521, + "learning_rate": 8.979947900831546e-06, + "loss": 4.2949, + "step": 10077 + }, + { + "epoch": 0.8589448563879656, + "grad_norm": 52.288075186720484, + "learning_rate": 8.979647739453481e-06, + "loss": 4.4752, + "step": 10078 + }, + { + "epoch": 0.8590300860819909, + "grad_norm": 51.71512515747339, + "learning_rate": 8.979347538937162e-06, + "loss": 4.9988, + "step": 10079 + }, + { + "epoch": 0.8591153157760164, + "grad_norm": 48.63079772359001, + "learning_rate": 8.979047299285543e-06, + "loss": 3.7317, + "step": 10080 + }, + { + "epoch": 0.8592005454700418, + "grad_norm": 50.58118182467473, + "learning_rate": 8.978747020501575e-06, + "loss": 3.9706, + "step": 10081 + }, + { + "epoch": 0.8592857751640671, + "grad_norm": 40.34894729187555, + "learning_rate": 8.978446702588208e-06, + "loss": 4.1299, + "step": 10082 + }, + { + "epoch": 0.8593710048580926, + "grad_norm": 60.027423568680064, + "learning_rate": 8.978146345548403e-06, + "loss": 4.5734, + "step": 10083 + }, + { + "epoch": 0.859456234552118, + "grad_norm": 30.100396716063297, + "learning_rate": 8.977845949385106e-06, + "loss": 1.7413, + "step": 10084 + }, + { + "epoch": 0.8595414642461433, + "grad_norm": 31.5180875909614, + "learning_rate": 8.977545514101275e-06, + "loss": 2.7723, + "step": 10085 + }, + { + "epoch": 0.8596266939401688, + "grad_norm": 50.011405063890486, + "learning_rate": 8.977245039699866e-06, + "loss": 4.3196, + "step": 10086 + }, + { + "epoch": 0.8597119236341941, + "grad_norm": 41.33208325169685, + "learning_rate": 8.97694452618383e-06, + "loss": 4.078, + "step": 10087 + }, + { + "epoch": 0.8597971533282196, + "grad_norm": 98.69306580822578, + "learning_rate": 8.976643973556128e-06, + "loss": 4.2963, + "step": 10088 + }, + { + "epoch": 0.859882383022245, + "grad_norm": 32.785429203582126, + "learning_rate": 8.976343381819712e-06, + "loss": 3.0139, + "step": 10089 + }, + { + "epoch": 0.8599676127162703, + "grad_norm": 150.07445736863914, + "learning_rate": 8.976042750977536e-06, + "loss": 3.4924, + "step": 10090 + }, + { + "epoch": 0.8600528424102958, + "grad_norm": 30.266452852667406, + "learning_rate": 8.975742081032562e-06, + "loss": 3.6913, + "step": 10091 + }, + { + "epoch": 0.8601380721043211, + "grad_norm": 43.46046772636909, + "learning_rate": 8.975441371987744e-06, + "loss": 4.2008, + "step": 10092 + }, + { + "epoch": 0.8602233017983465, + "grad_norm": 45.84257021219796, + "learning_rate": 8.97514062384604e-06, + "loss": 4.3308, + "step": 10093 + }, + { + "epoch": 0.860308531492372, + "grad_norm": 50.73332442976, + "learning_rate": 8.974839836610408e-06, + "loss": 4.2533, + "step": 10094 + }, + { + "epoch": 0.8603937611863973, + "grad_norm": 35.22924539250801, + "learning_rate": 8.974539010283805e-06, + "loss": 3.9021, + "step": 10095 + }, + { + "epoch": 0.8604789908804228, + "grad_norm": 63.89534091430368, + "learning_rate": 8.97423814486919e-06, + "loss": 4.1974, + "step": 10096 + }, + { + "epoch": 0.8605642205744481, + "grad_norm": 62.527334645530104, + "learning_rate": 8.97393724036952e-06, + "loss": 4.488, + "step": 10097 + }, + { + "epoch": 0.8606494502684735, + "grad_norm": 53.793655926755136, + "learning_rate": 8.97363629678776e-06, + "loss": 4.171, + "step": 10098 + }, + { + "epoch": 0.860734679962499, + "grad_norm": 30.451561100027646, + "learning_rate": 8.973335314126864e-06, + "loss": 3.3442, + "step": 10099 + }, + { + "epoch": 0.8608199096565243, + "grad_norm": 62.21839117762054, + "learning_rate": 8.973034292389795e-06, + "loss": 4.1314, + "step": 10100 + }, + { + "epoch": 0.8609051393505497, + "grad_norm": 37.96905721773958, + "learning_rate": 8.972733231579512e-06, + "loss": 3.8875, + "step": 10101 + }, + { + "epoch": 0.8609903690445752, + "grad_norm": 79.78531000384889, + "learning_rate": 8.972432131698976e-06, + "loss": 2.5746, + "step": 10102 + }, + { + "epoch": 0.8610755987386005, + "grad_norm": 27.86884006168514, + "learning_rate": 8.97213099275115e-06, + "loss": 2.2805, + "step": 10103 + }, + { + "epoch": 0.861160828432626, + "grad_norm": 47.356931475556564, + "learning_rate": 8.971829814738993e-06, + "loss": 3.6269, + "step": 10104 + }, + { + "epoch": 0.8612460581266513, + "grad_norm": 58.75491265930219, + "learning_rate": 8.971528597665467e-06, + "loss": 4.2841, + "step": 10105 + }, + { + "epoch": 0.8613312878206767, + "grad_norm": 50.00026980425793, + "learning_rate": 8.971227341533536e-06, + "loss": 4.5843, + "step": 10106 + }, + { + "epoch": 0.8614165175147022, + "grad_norm": 38.65868351114015, + "learning_rate": 8.970926046346162e-06, + "loss": 3.9962, + "step": 10107 + }, + { + "epoch": 0.8615017472087275, + "grad_norm": 43.70834071534109, + "learning_rate": 8.970624712106309e-06, + "loss": 4.643, + "step": 10108 + }, + { + "epoch": 0.8615869769027529, + "grad_norm": 32.530268671520496, + "learning_rate": 8.97032333881694e-06, + "loss": 3.4872, + "step": 10109 + }, + { + "epoch": 0.8616722065967783, + "grad_norm": 63.82105442998296, + "learning_rate": 8.970021926481017e-06, + "loss": 5.4537, + "step": 10110 + }, + { + "epoch": 0.8617574362908037, + "grad_norm": 31.323073431867606, + "learning_rate": 8.969720475101506e-06, + "loss": 2.7617, + "step": 10111 + }, + { + "epoch": 0.8618426659848291, + "grad_norm": 35.995138892633335, + "learning_rate": 8.969418984681373e-06, + "loss": 4.1791, + "step": 10112 + }, + { + "epoch": 0.8619278956788545, + "grad_norm": 24.574412024183175, + "learning_rate": 8.969117455223582e-06, + "loss": 3.0051, + "step": 10113 + }, + { + "epoch": 0.8620131253728799, + "grad_norm": 44.92933492955829, + "learning_rate": 8.968815886731095e-06, + "loss": 4.3891, + "step": 10114 + }, + { + "epoch": 0.8620983550669054, + "grad_norm": 42.05638743544194, + "learning_rate": 8.968514279206883e-06, + "loss": 3.4459, + "step": 10115 + }, + { + "epoch": 0.8621835847609307, + "grad_norm": 39.38548835849348, + "learning_rate": 8.96821263265391e-06, + "loss": 4.3495, + "step": 10116 + }, + { + "epoch": 0.8622688144549561, + "grad_norm": 43.373182626989156, + "learning_rate": 8.967910947075143e-06, + "loss": 3.9514, + "step": 10117 + }, + { + "epoch": 0.8623540441489815, + "grad_norm": 59.02822942194443, + "learning_rate": 8.967609222473547e-06, + "loss": 4.4848, + "step": 10118 + }, + { + "epoch": 0.8624392738430069, + "grad_norm": 113.56004619706407, + "learning_rate": 8.967307458852092e-06, + "loss": 5.2377, + "step": 10119 + }, + { + "epoch": 0.8625245035370322, + "grad_norm": 45.98733378908821, + "learning_rate": 8.967005656213744e-06, + "loss": 4.0446, + "step": 10120 + }, + { + "epoch": 0.8626097332310577, + "grad_norm": 40.89570058501888, + "learning_rate": 8.966703814561472e-06, + "loss": 3.9266, + "step": 10121 + }, + { + "epoch": 0.8626949629250831, + "grad_norm": 73.99323851274818, + "learning_rate": 8.966401933898245e-06, + "loss": 3.4203, + "step": 10122 + }, + { + "epoch": 0.8627801926191085, + "grad_norm": 47.41158515783145, + "learning_rate": 8.96610001422703e-06, + "loss": 4.3275, + "step": 10123 + }, + { + "epoch": 0.8628654223131339, + "grad_norm": 84.58552925725596, + "learning_rate": 8.965798055550797e-06, + "loss": 4.1845, + "step": 10124 + }, + { + "epoch": 0.8629506520071593, + "grad_norm": 89.21716636770097, + "learning_rate": 8.965496057872516e-06, + "loss": 4.194, + "step": 10125 + }, + { + "epoch": 0.8630358817011847, + "grad_norm": 64.54146431223278, + "learning_rate": 8.965194021195158e-06, + "loss": 4.3418, + "step": 10126 + }, + { + "epoch": 0.8631211113952101, + "grad_norm": 69.68635749905185, + "learning_rate": 8.964891945521691e-06, + "loss": 4.1926, + "step": 10127 + }, + { + "epoch": 0.8632063410892354, + "grad_norm": 30.398443393724463, + "learning_rate": 8.96458983085509e-06, + "loss": 3.5237, + "step": 10128 + }, + { + "epoch": 0.8632915707832609, + "grad_norm": 31.0333884884759, + "learning_rate": 8.964287677198321e-06, + "loss": 3.1866, + "step": 10129 + }, + { + "epoch": 0.8633768004772863, + "grad_norm": 46.00718777542498, + "learning_rate": 8.963985484554359e-06, + "loss": 4.4189, + "step": 10130 + }, + { + "epoch": 0.8634620301713117, + "grad_norm": 31.711342927510724, + "learning_rate": 8.963683252926175e-06, + "loss": 3.6033, + "step": 10131 + }, + { + "epoch": 0.8635472598653371, + "grad_norm": 35.10190882290828, + "learning_rate": 8.963380982316738e-06, + "loss": 3.3591, + "step": 10132 + }, + { + "epoch": 0.8636324895593624, + "grad_norm": 65.08126132953979, + "learning_rate": 8.963078672729027e-06, + "loss": 4.1696, + "step": 10133 + }, + { + "epoch": 0.8637177192533879, + "grad_norm": 74.638022830281, + "learning_rate": 8.962776324166012e-06, + "loss": 3.6166, + "step": 10134 + }, + { + "epoch": 0.8638029489474133, + "grad_norm": 73.7056518437967, + "learning_rate": 8.962473936630666e-06, + "loss": 4.531, + "step": 10135 + }, + { + "epoch": 0.8638881786414386, + "grad_norm": 34.785096148925035, + "learning_rate": 8.962171510125963e-06, + "loss": 4.0452, + "step": 10136 + }, + { + "epoch": 0.8639734083354641, + "grad_norm": 47.61939725331213, + "learning_rate": 8.961869044654879e-06, + "loss": 3.6532, + "step": 10137 + }, + { + "epoch": 0.8640586380294895, + "grad_norm": 54.566549561256124, + "learning_rate": 8.961566540220386e-06, + "loss": 4.3556, + "step": 10138 + }, + { + "epoch": 0.8641438677235149, + "grad_norm": 60.56224259780247, + "learning_rate": 8.96126399682546e-06, + "loss": 3.2039, + "step": 10139 + }, + { + "epoch": 0.8642290974175403, + "grad_norm": 53.36477230431247, + "learning_rate": 8.960961414473077e-06, + "loss": 4.231, + "step": 10140 + }, + { + "epoch": 0.8643143271115656, + "grad_norm": 28.049043535679896, + "learning_rate": 8.960658793166213e-06, + "loss": 3.7518, + "step": 10141 + }, + { + "epoch": 0.8643995568055911, + "grad_norm": 89.80724277336545, + "learning_rate": 8.960356132907844e-06, + "loss": 4.5795, + "step": 10142 + }, + { + "epoch": 0.8644847864996165, + "grad_norm": 26.820685047078207, + "learning_rate": 8.960053433700944e-06, + "loss": 2.4184, + "step": 10143 + }, + { + "epoch": 0.8645700161936418, + "grad_norm": 77.36866186906309, + "learning_rate": 8.959750695548494e-06, + "loss": 5.0685, + "step": 10144 + }, + { + "epoch": 0.8646552458876673, + "grad_norm": 58.01400836653678, + "learning_rate": 8.959447918453469e-06, + "loss": 4.2233, + "step": 10145 + }, + { + "epoch": 0.8647404755816926, + "grad_norm": 67.79316059943001, + "learning_rate": 8.959145102418847e-06, + "loss": 3.242, + "step": 10146 + }, + { + "epoch": 0.8648257052757181, + "grad_norm": 42.5366991534537, + "learning_rate": 8.958842247447605e-06, + "loss": 3.8904, + "step": 10147 + }, + { + "epoch": 0.8649109349697435, + "grad_norm": 67.98149398744451, + "learning_rate": 8.958539353542723e-06, + "loss": 5.4863, + "step": 10148 + }, + { + "epoch": 0.8649961646637688, + "grad_norm": 42.30022630624333, + "learning_rate": 8.95823642070718e-06, + "loss": 3.9837, + "step": 10149 + }, + { + "epoch": 0.8650813943577943, + "grad_norm": 94.82835065562375, + "learning_rate": 8.957933448943955e-06, + "loss": 4.9742, + "step": 10150 + }, + { + "epoch": 0.8651666240518197, + "grad_norm": 77.47901780016086, + "learning_rate": 8.957630438256027e-06, + "loss": 5.5963, + "step": 10151 + }, + { + "epoch": 0.865251853745845, + "grad_norm": 34.219461269604466, + "learning_rate": 8.957327388646376e-06, + "loss": 4.4489, + "step": 10152 + }, + { + "epoch": 0.8653370834398705, + "grad_norm": 34.561984308036195, + "learning_rate": 8.957024300117983e-06, + "loss": 3.6372, + "step": 10153 + }, + { + "epoch": 0.8654223131338958, + "grad_norm": 41.995048314999856, + "learning_rate": 8.956721172673829e-06, + "loss": 3.9361, + "step": 10154 + }, + { + "epoch": 0.8655075428279212, + "grad_norm": 67.36736708824901, + "learning_rate": 8.956418006316893e-06, + "loss": 4.9514, + "step": 10155 + }, + { + "epoch": 0.8655927725219467, + "grad_norm": 45.69172051766725, + "learning_rate": 8.956114801050158e-06, + "loss": 3.465, + "step": 10156 + }, + { + "epoch": 0.865678002215972, + "grad_norm": 67.27577476325901, + "learning_rate": 8.955811556876605e-06, + "loss": 3.721, + "step": 10157 + }, + { + "epoch": 0.8657632319099975, + "grad_norm": 50.40238835646157, + "learning_rate": 8.95550827379922e-06, + "loss": 4.531, + "step": 10158 + }, + { + "epoch": 0.8658484616040228, + "grad_norm": 28.805170017963494, + "learning_rate": 8.955204951820982e-06, + "loss": 3.0425, + "step": 10159 + }, + { + "epoch": 0.8659336912980482, + "grad_norm": 25.804283697041008, + "learning_rate": 8.954901590944874e-06, + "loss": 3.7313, + "step": 10160 + }, + { + "epoch": 0.8660189209920737, + "grad_norm": 53.132362722703654, + "learning_rate": 8.954598191173883e-06, + "loss": 4.1629, + "step": 10161 + }, + { + "epoch": 0.866104150686099, + "grad_norm": 42.750441550054184, + "learning_rate": 8.954294752510986e-06, + "loss": 4.0993, + "step": 10162 + }, + { + "epoch": 0.8661893803801244, + "grad_norm": 52.85469430675415, + "learning_rate": 8.953991274959175e-06, + "loss": 4.1421, + "step": 10163 + }, + { + "epoch": 0.8662746100741499, + "grad_norm": 104.93558812953614, + "learning_rate": 8.953687758521428e-06, + "loss": 6.1952, + "step": 10164 + }, + { + "epoch": 0.8663598397681752, + "grad_norm": 61.91193763967894, + "learning_rate": 8.953384203200735e-06, + "loss": 3.3897, + "step": 10165 + }, + { + "epoch": 0.8664450694622007, + "grad_norm": 57.7749580325813, + "learning_rate": 8.953080609000078e-06, + "loss": 4.2823, + "step": 10166 + }, + { + "epoch": 0.866530299156226, + "grad_norm": 73.21424916394785, + "learning_rate": 8.952776975922445e-06, + "loss": 3.8458, + "step": 10167 + }, + { + "epoch": 0.8666155288502514, + "grad_norm": 35.05600415696205, + "learning_rate": 8.952473303970821e-06, + "loss": 3.3727, + "step": 10168 + }, + { + "epoch": 0.8667007585442769, + "grad_norm": 55.745244282442535, + "learning_rate": 8.952169593148193e-06, + "loss": 4.5221, + "step": 10169 + }, + { + "epoch": 0.8667859882383022, + "grad_norm": 36.60651974888778, + "learning_rate": 8.951865843457547e-06, + "loss": 3.3492, + "step": 10170 + }, + { + "epoch": 0.8668712179323276, + "grad_norm": 32.93208046814637, + "learning_rate": 8.95156205490187e-06, + "loss": 3.6009, + "step": 10171 + }, + { + "epoch": 0.866956447626353, + "grad_norm": 37.39958416539252, + "learning_rate": 8.951258227484153e-06, + "loss": 3.5852, + "step": 10172 + }, + { + "epoch": 0.8670416773203784, + "grad_norm": 46.75176611991759, + "learning_rate": 8.950954361207379e-06, + "loss": 4.8124, + "step": 10173 + }, + { + "epoch": 0.8671269070144039, + "grad_norm": 33.72427730067331, + "learning_rate": 8.950650456074538e-06, + "loss": 3.7536, + "step": 10174 + }, + { + "epoch": 0.8672121367084292, + "grad_norm": 35.535073242272794, + "learning_rate": 8.95034651208862e-06, + "loss": 3.2209, + "step": 10175 + }, + { + "epoch": 0.8672973664024546, + "grad_norm": 31.297234984263806, + "learning_rate": 8.950042529252617e-06, + "loss": 3.707, + "step": 10176 + }, + { + "epoch": 0.86738259609648, + "grad_norm": 36.828279038007835, + "learning_rate": 8.949738507569512e-06, + "loss": 4.1311, + "step": 10177 + }, + { + "epoch": 0.8674678257905054, + "grad_norm": 67.99668570854352, + "learning_rate": 8.9494344470423e-06, + "loss": 4.6673, + "step": 10178 + }, + { + "epoch": 0.8675530554845308, + "grad_norm": 46.59979326647757, + "learning_rate": 8.94913034767397e-06, + "loss": 4.0448, + "step": 10179 + }, + { + "epoch": 0.8676382851785562, + "grad_norm": 47.60591129684647, + "learning_rate": 8.94882620946751e-06, + "loss": 4.0463, + "step": 10180 + }, + { + "epoch": 0.8677235148725816, + "grad_norm": 56.207347194730154, + "learning_rate": 8.948522032425915e-06, + "loss": 2.8495, + "step": 10181 + }, + { + "epoch": 0.8678087445666071, + "grad_norm": 52.44444637429105, + "learning_rate": 8.948217816552174e-06, + "loss": 3.3415, + "step": 10182 + }, + { + "epoch": 0.8678939742606324, + "grad_norm": 57.56862744687803, + "learning_rate": 8.94791356184928e-06, + "loss": 4.5307, + "step": 10183 + }, + { + "epoch": 0.8679792039546578, + "grad_norm": 64.7564344126234, + "learning_rate": 8.947609268320222e-06, + "loss": 5.0674, + "step": 10184 + }, + { + "epoch": 0.8680644336486832, + "grad_norm": 43.33144283356925, + "learning_rate": 8.947304935967999e-06, + "loss": 3.5539, + "step": 10185 + }, + { + "epoch": 0.8681496633427086, + "grad_norm": 27.4576122305398, + "learning_rate": 8.9470005647956e-06, + "loss": 2.3127, + "step": 10186 + }, + { + "epoch": 0.868234893036734, + "grad_norm": 46.42087695520784, + "learning_rate": 8.94669615480602e-06, + "loss": 4.0714, + "step": 10187 + }, + { + "epoch": 0.8683201227307594, + "grad_norm": 45.71620821532348, + "learning_rate": 8.94639170600225e-06, + "loss": 3.3642, + "step": 10188 + }, + { + "epoch": 0.8684053524247848, + "grad_norm": 41.59597835062089, + "learning_rate": 8.946087218387284e-06, + "loss": 2.9468, + "step": 10189 + }, + { + "epoch": 0.8684905821188101, + "grad_norm": 32.85400876965397, + "learning_rate": 8.94578269196412e-06, + "loss": 3.9814, + "step": 10190 + }, + { + "epoch": 0.8685758118128356, + "grad_norm": 49.5731746505159, + "learning_rate": 8.945478126735752e-06, + "loss": 5.1022, + "step": 10191 + }, + { + "epoch": 0.868661041506861, + "grad_norm": 33.966208672574304, + "learning_rate": 8.945173522705173e-06, + "loss": 3.3939, + "step": 10192 + }, + { + "epoch": 0.8687462712008864, + "grad_norm": 68.56405059436203, + "learning_rate": 8.944868879875381e-06, + "loss": 4.3434, + "step": 10193 + }, + { + "epoch": 0.8688315008949118, + "grad_norm": 70.67705925124272, + "learning_rate": 8.94456419824937e-06, + "loss": 5.0517, + "step": 10194 + }, + { + "epoch": 0.8689167305889371, + "grad_norm": 28.52193132258948, + "learning_rate": 8.94425947783014e-06, + "loss": 2.8005, + "step": 10195 + }, + { + "epoch": 0.8690019602829626, + "grad_norm": 85.44602007882723, + "learning_rate": 8.943954718620683e-06, + "loss": 4.1903, + "step": 10196 + }, + { + "epoch": 0.869087189976988, + "grad_norm": 31.02901271014607, + "learning_rate": 8.943649920623999e-06, + "loss": 3.8252, + "step": 10197 + }, + { + "epoch": 0.8691724196710133, + "grad_norm": 29.38011610512972, + "learning_rate": 8.943345083843085e-06, + "loss": 2.9495, + "step": 10198 + }, + { + "epoch": 0.8692576493650388, + "grad_norm": 31.657885292814957, + "learning_rate": 8.94304020828094e-06, + "loss": 3.3155, + "step": 10199 + }, + { + "epoch": 0.8693428790590642, + "grad_norm": 89.86653827043148, + "learning_rate": 8.94273529394056e-06, + "loss": 5.244, + "step": 10200 + }, + { + "epoch": 0.8694281087530896, + "grad_norm": 36.53951934367895, + "learning_rate": 8.942430340824946e-06, + "loss": 3.7628, + "step": 10201 + }, + { + "epoch": 0.869513338447115, + "grad_norm": 47.44214551226184, + "learning_rate": 8.942125348937097e-06, + "loss": 3.3158, + "step": 10202 + }, + { + "epoch": 0.8695985681411403, + "grad_norm": 56.63675834630762, + "learning_rate": 8.941820318280009e-06, + "loss": 3.4656, + "step": 10203 + }, + { + "epoch": 0.8696837978351658, + "grad_norm": 61.342727234386274, + "learning_rate": 8.941515248856686e-06, + "loss": 5.3362, + "step": 10204 + }, + { + "epoch": 0.8697690275291912, + "grad_norm": 74.09493577246327, + "learning_rate": 8.941210140670127e-06, + "loss": 4.9192, + "step": 10205 + }, + { + "epoch": 0.8698542572232165, + "grad_norm": 104.77934710160687, + "learning_rate": 8.940904993723331e-06, + "loss": 4.7589, + "step": 10206 + }, + { + "epoch": 0.869939486917242, + "grad_norm": 40.16603838866819, + "learning_rate": 8.940599808019302e-06, + "loss": 3.3678, + "step": 10207 + }, + { + "epoch": 0.8700247166112673, + "grad_norm": 88.60950547191833, + "learning_rate": 8.940294583561038e-06, + "loss": 4.0806, + "step": 10208 + }, + { + "epoch": 0.8701099463052928, + "grad_norm": 57.582112033061584, + "learning_rate": 8.939989320351544e-06, + "loss": 3.5505, + "step": 10209 + }, + { + "epoch": 0.8701951759993182, + "grad_norm": 47.23442774852139, + "learning_rate": 8.939684018393822e-06, + "loss": 4.28, + "step": 10210 + }, + { + "epoch": 0.8702804056933435, + "grad_norm": 57.040137833138125, + "learning_rate": 8.93937867769087e-06, + "loss": 4.4645, + "step": 10211 + }, + { + "epoch": 0.870365635387369, + "grad_norm": 49.78029808851564, + "learning_rate": 8.939073298245695e-06, + "loss": 4.1553, + "step": 10212 + }, + { + "epoch": 0.8704508650813944, + "grad_norm": 33.16174474929821, + "learning_rate": 8.938767880061301e-06, + "loss": 4.14, + "step": 10213 + }, + { + "epoch": 0.8705360947754197, + "grad_norm": 74.45403709336485, + "learning_rate": 8.938462423140688e-06, + "loss": 5.0397, + "step": 10214 + }, + { + "epoch": 0.8706213244694452, + "grad_norm": 55.27967665692586, + "learning_rate": 8.938156927486862e-06, + "loss": 5.3264, + "step": 10215 + }, + { + "epoch": 0.8707065541634705, + "grad_norm": 45.20555543790387, + "learning_rate": 8.93785139310283e-06, + "loss": 4.0944, + "step": 10216 + }, + { + "epoch": 0.870791783857496, + "grad_norm": 50.97536001872556, + "learning_rate": 8.937545819991592e-06, + "loss": 4.3011, + "step": 10217 + }, + { + "epoch": 0.8708770135515214, + "grad_norm": 49.736382419399135, + "learning_rate": 8.937240208156157e-06, + "loss": 3.8171, + "step": 10218 + }, + { + "epoch": 0.8709622432455467, + "grad_norm": 65.4550297955134, + "learning_rate": 8.936934557599528e-06, + "loss": 4.2803, + "step": 10219 + }, + { + "epoch": 0.8710474729395722, + "grad_norm": 56.89074608748382, + "learning_rate": 8.936628868324711e-06, + "loss": 4.5903, + "step": 10220 + }, + { + "epoch": 0.8711327026335975, + "grad_norm": 22.025696547574444, + "learning_rate": 8.936323140334715e-06, + "loss": 2.9781, + "step": 10221 + }, + { + "epoch": 0.8712179323276229, + "grad_norm": 137.50516797876324, + "learning_rate": 8.936017373632545e-06, + "loss": 4.9347, + "step": 10222 + }, + { + "epoch": 0.8713031620216484, + "grad_norm": 28.116477435864248, + "learning_rate": 8.935711568221209e-06, + "loss": 2.3036, + "step": 10223 + }, + { + "epoch": 0.8713883917156737, + "grad_norm": 92.40529341243312, + "learning_rate": 8.935405724103712e-06, + "loss": 3.6094, + "step": 10224 + }, + { + "epoch": 0.8714736214096991, + "grad_norm": 60.23091781504803, + "learning_rate": 8.935099841283063e-06, + "loss": 4.5903, + "step": 10225 + }, + { + "epoch": 0.8715588511037246, + "grad_norm": 106.660066763079, + "learning_rate": 8.934793919762273e-06, + "loss": 4.0208, + "step": 10226 + }, + { + "epoch": 0.8716440807977499, + "grad_norm": 39.548166752299906, + "learning_rate": 8.934487959544346e-06, + "loss": 4.0361, + "step": 10227 + }, + { + "epoch": 0.8717293104917754, + "grad_norm": 33.03053537371332, + "learning_rate": 8.934181960632293e-06, + "loss": 3.6288, + "step": 10228 + }, + { + "epoch": 0.8718145401858007, + "grad_norm": 51.7935302647853, + "learning_rate": 8.933875923029125e-06, + "loss": 5.5825, + "step": 10229 + }, + { + "epoch": 0.8718997698798261, + "grad_norm": 39.23390208090273, + "learning_rate": 8.93356984673785e-06, + "loss": 3.8161, + "step": 10230 + }, + { + "epoch": 0.8719849995738516, + "grad_norm": 31.466052518588526, + "learning_rate": 8.93326373176148e-06, + "loss": 3.4914, + "step": 10231 + }, + { + "epoch": 0.8720702292678769, + "grad_norm": 39.8040164050193, + "learning_rate": 8.932957578103024e-06, + "loss": 3.5108, + "step": 10232 + }, + { + "epoch": 0.8721554589619023, + "grad_norm": 42.526198466108205, + "learning_rate": 8.932651385765492e-06, + "loss": 4.4329, + "step": 10233 + }, + { + "epoch": 0.8722406886559277, + "grad_norm": 62.20152941864713, + "learning_rate": 8.932345154751895e-06, + "loss": 2.87, + "step": 10234 + }, + { + "epoch": 0.8723259183499531, + "grad_norm": 61.096277633240724, + "learning_rate": 8.93203888506525e-06, + "loss": 4.9643, + "step": 10235 + }, + { + "epoch": 0.8724111480439786, + "grad_norm": 41.605358880763646, + "learning_rate": 8.93173257670856e-06, + "loss": 4.0018, + "step": 10236 + }, + { + "epoch": 0.8724963777380039, + "grad_norm": 86.8680993745269, + "learning_rate": 8.931426229684846e-06, + "loss": 4.5848, + "step": 10237 + }, + { + "epoch": 0.8725816074320293, + "grad_norm": 356.3605187237333, + "learning_rate": 8.931119843997115e-06, + "loss": 3.8128, + "step": 10238 + }, + { + "epoch": 0.8726668371260548, + "grad_norm": 24.64736797649567, + "learning_rate": 8.930813419648384e-06, + "loss": 2.5759, + "step": 10239 + }, + { + "epoch": 0.8727520668200801, + "grad_norm": 107.22096173863825, + "learning_rate": 8.930506956641665e-06, + "loss": 5.7979, + "step": 10240 + }, + { + "epoch": 0.8728372965141055, + "grad_norm": 42.296130814943915, + "learning_rate": 8.930200454979971e-06, + "loss": 3.0516, + "step": 10241 + }, + { + "epoch": 0.8729225262081309, + "grad_norm": 43.85552783591197, + "learning_rate": 8.929893914666317e-06, + "loss": 4.229, + "step": 10242 + }, + { + "epoch": 0.8730077559021563, + "grad_norm": 42.06743293408128, + "learning_rate": 8.929587335703719e-06, + "loss": 3.4021, + "step": 10243 + }, + { + "epoch": 0.8730929855961818, + "grad_norm": 45.057279241126615, + "learning_rate": 8.92928071809519e-06, + "loss": 3.9677, + "step": 10244 + }, + { + "epoch": 0.8731782152902071, + "grad_norm": 29.25360311014999, + "learning_rate": 8.928974061843747e-06, + "loss": 3.4288, + "step": 10245 + }, + { + "epoch": 0.8732634449842325, + "grad_norm": 33.53956516723743, + "learning_rate": 8.928667366952406e-06, + "loss": 3.5416, + "step": 10246 + }, + { + "epoch": 0.8733486746782579, + "grad_norm": 54.37833957816054, + "learning_rate": 8.928360633424182e-06, + "loss": 4.5119, + "step": 10247 + }, + { + "epoch": 0.8734339043722833, + "grad_norm": 56.822788474903504, + "learning_rate": 8.928053861262091e-06, + "loss": 3.8392, + "step": 10248 + }, + { + "epoch": 0.8735191340663087, + "grad_norm": 81.75471529065513, + "learning_rate": 8.927747050469153e-06, + "loss": 3.4943, + "step": 10249 + }, + { + "epoch": 0.8736043637603341, + "grad_norm": 34.14705392145715, + "learning_rate": 8.927440201048382e-06, + "loss": 3.7649, + "step": 10250 + }, + { + "epoch": 0.8736895934543595, + "grad_norm": 26.920002721827192, + "learning_rate": 8.927133313002797e-06, + "loss": 3.6652, + "step": 10251 + }, + { + "epoch": 0.873774823148385, + "grad_norm": 35.62507693647544, + "learning_rate": 8.926826386335419e-06, + "loss": 4.228, + "step": 10252 + }, + { + "epoch": 0.8738600528424103, + "grad_norm": 58.12731582319902, + "learning_rate": 8.926519421049262e-06, + "loss": 3.2384, + "step": 10253 + }, + { + "epoch": 0.8739452825364357, + "grad_norm": 36.58244959608479, + "learning_rate": 8.926212417147347e-06, + "loss": 3.9533, + "step": 10254 + }, + { + "epoch": 0.8740305122304611, + "grad_norm": 41.499695961022766, + "learning_rate": 8.925905374632694e-06, + "loss": 3.3216, + "step": 10255 + }, + { + "epoch": 0.8741157419244865, + "grad_norm": 35.88421393619774, + "learning_rate": 8.92559829350832e-06, + "loss": 4.715, + "step": 10256 + }, + { + "epoch": 0.8742009716185118, + "grad_norm": 54.0363028579493, + "learning_rate": 8.925291173777247e-06, + "loss": 4.4361, + "step": 10257 + }, + { + "epoch": 0.8742862013125373, + "grad_norm": 52.75242310028524, + "learning_rate": 8.924984015442494e-06, + "loss": 3.6026, + "step": 10258 + }, + { + "epoch": 0.8743714310065627, + "grad_norm": 43.54991042458119, + "learning_rate": 8.924676818507085e-06, + "loss": 3.5528, + "step": 10259 + }, + { + "epoch": 0.8744566607005881, + "grad_norm": 45.47572549809163, + "learning_rate": 8.924369582974037e-06, + "loss": 3.7559, + "step": 10260 + }, + { + "epoch": 0.8745418903946135, + "grad_norm": 38.56033289125657, + "learning_rate": 8.924062308846376e-06, + "loss": 3.5915, + "step": 10261 + }, + { + "epoch": 0.8746271200886389, + "grad_norm": 42.0092620687784, + "learning_rate": 8.923754996127119e-06, + "loss": 3.8967, + "step": 10262 + }, + { + "epoch": 0.8747123497826643, + "grad_norm": 57.92697803822402, + "learning_rate": 8.923447644819291e-06, + "loss": 5.0061, + "step": 10263 + }, + { + "epoch": 0.8747975794766897, + "grad_norm": 82.3037295656821, + "learning_rate": 8.923140254925918e-06, + "loss": 4.565, + "step": 10264 + }, + { + "epoch": 0.874882809170715, + "grad_norm": 46.71651674398158, + "learning_rate": 8.922832826450016e-06, + "loss": 4.6204, + "step": 10265 + }, + { + "epoch": 0.8749680388647405, + "grad_norm": 58.24350846339067, + "learning_rate": 8.922525359394612e-06, + "loss": 4.2429, + "step": 10266 + }, + { + "epoch": 0.8750532685587659, + "grad_norm": 34.79254740313303, + "learning_rate": 8.922217853762731e-06, + "loss": 4.5309, + "step": 10267 + }, + { + "epoch": 0.8751384982527912, + "grad_norm": 35.97060477916401, + "learning_rate": 8.921910309557395e-06, + "loss": 4.3455, + "step": 10268 + }, + { + "epoch": 0.8752237279468167, + "grad_norm": 36.2295413739184, + "learning_rate": 8.921602726781631e-06, + "loss": 3.6618, + "step": 10269 + }, + { + "epoch": 0.875308957640842, + "grad_norm": 37.451711146972556, + "learning_rate": 8.921295105438461e-06, + "loss": 4.0134, + "step": 10270 + }, + { + "epoch": 0.8753941873348675, + "grad_norm": 41.19902565795938, + "learning_rate": 8.920987445530912e-06, + "loss": 3.7017, + "step": 10271 + }, + { + "epoch": 0.8754794170288929, + "grad_norm": 23.82447086028476, + "learning_rate": 8.920679747062009e-06, + "loss": 3.2597, + "step": 10272 + }, + { + "epoch": 0.8755646467229182, + "grad_norm": 64.95528812860677, + "learning_rate": 8.920372010034781e-06, + "loss": 4.0082, + "step": 10273 + }, + { + "epoch": 0.8756498764169437, + "grad_norm": 94.29852059670357, + "learning_rate": 8.92006423445225e-06, + "loss": 5.2208, + "step": 10274 + }, + { + "epoch": 0.875735106110969, + "grad_norm": 34.06823448552555, + "learning_rate": 8.919756420317446e-06, + "loss": 4.1315, + "step": 10275 + }, + { + "epoch": 0.8758203358049944, + "grad_norm": 42.177384748223616, + "learning_rate": 8.919448567633393e-06, + "loss": 4.3113, + "step": 10276 + }, + { + "epoch": 0.8759055654990199, + "grad_norm": 36.29328968462571, + "learning_rate": 8.919140676403122e-06, + "loss": 3.6304, + "step": 10277 + }, + { + "epoch": 0.8759907951930452, + "grad_norm": 47.50420940875107, + "learning_rate": 8.91883274662966e-06, + "loss": 4.0279, + "step": 10278 + }, + { + "epoch": 0.8760760248870707, + "grad_norm": 37.16124120916681, + "learning_rate": 8.918524778316036e-06, + "loss": 3.2191, + "step": 10279 + }, + { + "epoch": 0.8761612545810961, + "grad_norm": 47.41799273505472, + "learning_rate": 8.918216771465276e-06, + "loss": 5.1121, + "step": 10280 + }, + { + "epoch": 0.8762464842751214, + "grad_norm": 36.46997998511313, + "learning_rate": 8.917908726080413e-06, + "loss": 3.6283, + "step": 10281 + }, + { + "epoch": 0.8763317139691469, + "grad_norm": 55.9501412566133, + "learning_rate": 8.917600642164474e-06, + "loss": 4.2948, + "step": 10282 + }, + { + "epoch": 0.8764169436631722, + "grad_norm": 44.00065288955428, + "learning_rate": 8.917292519720489e-06, + "loss": 3.8272, + "step": 10283 + }, + { + "epoch": 0.8765021733571976, + "grad_norm": 30.724511168284593, + "learning_rate": 8.916984358751488e-06, + "loss": 3.9123, + "step": 10284 + }, + { + "epoch": 0.8765874030512231, + "grad_norm": 49.78898334661988, + "learning_rate": 8.916676159260502e-06, + "loss": 3.4437, + "step": 10285 + }, + { + "epoch": 0.8766726327452484, + "grad_norm": 67.51650483249111, + "learning_rate": 8.916367921250563e-06, + "loss": 3.8571, + "step": 10286 + }, + { + "epoch": 0.8767578624392739, + "grad_norm": 35.46135151250245, + "learning_rate": 8.916059644724703e-06, + "loss": 2.2736, + "step": 10287 + }, + { + "epoch": 0.8768430921332993, + "grad_norm": 33.36603559635833, + "learning_rate": 8.915751329685951e-06, + "loss": 4.0224, + "step": 10288 + }, + { + "epoch": 0.8769283218273246, + "grad_norm": 114.342656032308, + "learning_rate": 8.91544297613734e-06, + "loss": 6.0088, + "step": 10289 + }, + { + "epoch": 0.8770135515213501, + "grad_norm": 88.59733754855021, + "learning_rate": 8.915134584081906e-06, + "loss": 5.609, + "step": 10290 + }, + { + "epoch": 0.8770987812153754, + "grad_norm": 61.46361893524475, + "learning_rate": 8.914826153522676e-06, + "loss": 4.9375, + "step": 10291 + }, + { + "epoch": 0.8771840109094008, + "grad_norm": 33.531814180647324, + "learning_rate": 8.914517684462689e-06, + "loss": 2.9959, + "step": 10292 + }, + { + "epoch": 0.8772692406034263, + "grad_norm": 61.115822783939564, + "learning_rate": 8.914209176904974e-06, + "loss": 4.2191, + "step": 10293 + }, + { + "epoch": 0.8773544702974516, + "grad_norm": 63.681726144127225, + "learning_rate": 8.913900630852568e-06, + "loss": 4.073, + "step": 10294 + }, + { + "epoch": 0.8774396999914771, + "grad_norm": 47.20673063034711, + "learning_rate": 8.913592046308503e-06, + "loss": 3.9584, + "step": 10295 + }, + { + "epoch": 0.8775249296855024, + "grad_norm": 78.84820242697693, + "learning_rate": 8.913283423275818e-06, + "loss": 5.3185, + "step": 10296 + }, + { + "epoch": 0.8776101593795278, + "grad_norm": 53.817442471984144, + "learning_rate": 8.912974761757544e-06, + "loss": 4.3611, + "step": 10297 + }, + { + "epoch": 0.8776953890735533, + "grad_norm": 47.53016774605702, + "learning_rate": 8.912666061756719e-06, + "loss": 3.5118, + "step": 10298 + }, + { + "epoch": 0.8777806187675786, + "grad_norm": 60.65857795192553, + "learning_rate": 8.912357323276377e-06, + "loss": 4.6705, + "step": 10299 + }, + { + "epoch": 0.877865848461604, + "grad_norm": 31.579419740408223, + "learning_rate": 8.912048546319556e-06, + "loss": 3.8635, + "step": 10300 + }, + { + "epoch": 0.8779510781556294, + "grad_norm": 225.50409052700962, + "learning_rate": 8.91173973088929e-06, + "loss": 4.0871, + "step": 10301 + }, + { + "epoch": 0.8780363078496548, + "grad_norm": 61.277363180716996, + "learning_rate": 8.911430876988621e-06, + "loss": 4.9473, + "step": 10302 + }, + { + "epoch": 0.8781215375436802, + "grad_norm": 70.10314552513161, + "learning_rate": 8.911121984620584e-06, + "loss": 4.3181, + "step": 10303 + }, + { + "epoch": 0.8782067672377056, + "grad_norm": 49.073502181300555, + "learning_rate": 8.910813053788213e-06, + "loss": 4.8981, + "step": 10304 + }, + { + "epoch": 0.878291996931731, + "grad_norm": 55.23933767913596, + "learning_rate": 8.910504084494552e-06, + "loss": 4.1295, + "step": 10305 + }, + { + "epoch": 0.8783772266257565, + "grad_norm": 32.12038598997696, + "learning_rate": 8.910195076742636e-06, + "loss": 3.0629, + "step": 10306 + }, + { + "epoch": 0.8784624563197818, + "grad_norm": 27.7481509475559, + "learning_rate": 8.909886030535506e-06, + "loss": 2.8997, + "step": 10307 + }, + { + "epoch": 0.8785476860138072, + "grad_norm": 60.70806423691908, + "learning_rate": 8.909576945876201e-06, + "loss": 5.4684, + "step": 10308 + }, + { + "epoch": 0.8786329157078326, + "grad_norm": 65.47087543442228, + "learning_rate": 8.909267822767758e-06, + "loss": 3.5467, + "step": 10309 + }, + { + "epoch": 0.878718145401858, + "grad_norm": 32.92946272234009, + "learning_rate": 8.908958661213222e-06, + "loss": 3.5339, + "step": 10310 + }, + { + "epoch": 0.8788033750958834, + "grad_norm": 26.95877998485759, + "learning_rate": 8.908649461215629e-06, + "loss": 3.5036, + "step": 10311 + }, + { + "epoch": 0.8788886047899088, + "grad_norm": 32.30102084512409, + "learning_rate": 8.908340222778024e-06, + "loss": 3.5687, + "step": 10312 + }, + { + "epoch": 0.8789738344839342, + "grad_norm": 52.006811310716905, + "learning_rate": 8.908030945903443e-06, + "loss": 4.1609, + "step": 10313 + }, + { + "epoch": 0.8790590641779596, + "grad_norm": 31.44583294703062, + "learning_rate": 8.907721630594932e-06, + "loss": 2.7486, + "step": 10314 + }, + { + "epoch": 0.879144293871985, + "grad_norm": 51.98946756407174, + "learning_rate": 8.907412276855531e-06, + "loss": 3.9319, + "step": 10315 + }, + { + "epoch": 0.8792295235660104, + "grad_norm": 54.58408910036957, + "learning_rate": 8.907102884688285e-06, + "loss": 4.4152, + "step": 10316 + }, + { + "epoch": 0.8793147532600358, + "grad_norm": 53.98198183642313, + "learning_rate": 8.906793454096232e-06, + "loss": 4.0961, + "step": 10317 + }, + { + "epoch": 0.8793999829540612, + "grad_norm": 47.8947168520704, + "learning_rate": 8.90648398508242e-06, + "loss": 4.7764, + "step": 10318 + }, + { + "epoch": 0.8794852126480865, + "grad_norm": 78.69102478527147, + "learning_rate": 8.906174477649889e-06, + "loss": 3.6582, + "step": 10319 + }, + { + "epoch": 0.879570442342112, + "grad_norm": 42.93993912599185, + "learning_rate": 8.905864931801684e-06, + "loss": 4.1887, + "step": 10320 + }, + { + "epoch": 0.8796556720361374, + "grad_norm": 35.846110292700835, + "learning_rate": 8.905555347540849e-06, + "loss": 3.7755, + "step": 10321 + }, + { + "epoch": 0.8797409017301628, + "grad_norm": 45.4963830534417, + "learning_rate": 8.90524572487043e-06, + "loss": 3.8729, + "step": 10322 + }, + { + "epoch": 0.8798261314241882, + "grad_norm": 60.63132436897707, + "learning_rate": 8.904936063793472e-06, + "loss": 5.1434, + "step": 10323 + }, + { + "epoch": 0.8799113611182136, + "grad_norm": 39.58485472351123, + "learning_rate": 8.904626364313018e-06, + "loss": 4.7232, + "step": 10324 + }, + { + "epoch": 0.879996590812239, + "grad_norm": 29.121080924177953, + "learning_rate": 8.904316626432114e-06, + "loss": 3.9466, + "step": 10325 + }, + { + "epoch": 0.8800818205062644, + "grad_norm": 27.84805781058624, + "learning_rate": 8.904006850153811e-06, + "loss": 3.9184, + "step": 10326 + }, + { + "epoch": 0.8801670502002897, + "grad_norm": 65.25538113696713, + "learning_rate": 8.903697035481149e-06, + "loss": 4.442, + "step": 10327 + }, + { + "epoch": 0.8802522798943152, + "grad_norm": 48.4540330179628, + "learning_rate": 8.90338718241718e-06, + "loss": 4.7177, + "step": 10328 + }, + { + "epoch": 0.8803375095883406, + "grad_norm": 50.61305686268943, + "learning_rate": 8.903077290964948e-06, + "loss": 4.3547, + "step": 10329 + }, + { + "epoch": 0.880422739282366, + "grad_norm": 83.90614907385127, + "learning_rate": 8.9027673611275e-06, + "loss": 6.1523, + "step": 10330 + }, + { + "epoch": 0.8805079689763914, + "grad_norm": 50.18264446379576, + "learning_rate": 8.902457392907888e-06, + "loss": 3.8583, + "step": 10331 + }, + { + "epoch": 0.8805931986704167, + "grad_norm": 41.30812549062029, + "learning_rate": 8.902147386309158e-06, + "loss": 3.1847, + "step": 10332 + }, + { + "epoch": 0.8806784283644422, + "grad_norm": 51.31468838627754, + "learning_rate": 8.901837341334359e-06, + "loss": 4.1061, + "step": 10333 + }, + { + "epoch": 0.8807636580584676, + "grad_norm": 58.79466501244712, + "learning_rate": 8.90152725798654e-06, + "loss": 3.5567, + "step": 10334 + }, + { + "epoch": 0.8808488877524929, + "grad_norm": 52.15457124386917, + "learning_rate": 8.90121713626875e-06, + "loss": 4.5721, + "step": 10335 + }, + { + "epoch": 0.8809341174465184, + "grad_norm": 72.974784930002, + "learning_rate": 8.900906976184041e-06, + "loss": 4.1661, + "step": 10336 + }, + { + "epoch": 0.8810193471405438, + "grad_norm": 60.90817421105464, + "learning_rate": 8.90059677773546e-06, + "loss": 3.3873, + "step": 10337 + }, + { + "epoch": 0.8811045768345691, + "grad_norm": 48.89795704234368, + "learning_rate": 8.900286540926062e-06, + "loss": 4.3317, + "step": 10338 + }, + { + "epoch": 0.8811898065285946, + "grad_norm": 37.02519696778595, + "learning_rate": 8.899976265758893e-06, + "loss": 3.4234, + "step": 10339 + }, + { + "epoch": 0.8812750362226199, + "grad_norm": 25.412136499931783, + "learning_rate": 8.899665952237009e-06, + "loss": 2.2947, + "step": 10340 + }, + { + "epoch": 0.8813602659166454, + "grad_norm": 52.885289775951115, + "learning_rate": 8.89935560036346e-06, + "loss": 3.7643, + "step": 10341 + }, + { + "epoch": 0.8814454956106708, + "grad_norm": 33.05983997508758, + "learning_rate": 8.899045210141299e-06, + "loss": 2.5185, + "step": 10342 + }, + { + "epoch": 0.8815307253046961, + "grad_norm": 29.36516904054598, + "learning_rate": 8.898734781573575e-06, + "loss": 3.0481, + "step": 10343 + }, + { + "epoch": 0.8816159549987216, + "grad_norm": 85.40239883400506, + "learning_rate": 8.898424314663345e-06, + "loss": 4.4071, + "step": 10344 + }, + { + "epoch": 0.8817011846927469, + "grad_norm": 66.177236992073, + "learning_rate": 8.898113809413662e-06, + "loss": 4.5373, + "step": 10345 + }, + { + "epoch": 0.8817864143867723, + "grad_norm": 25.389256404785186, + "learning_rate": 8.897803265827578e-06, + "loss": 3.1266, + "step": 10346 + }, + { + "epoch": 0.8818716440807978, + "grad_norm": 52.07332142807681, + "learning_rate": 8.897492683908146e-06, + "loss": 4.6804, + "step": 10347 + }, + { + "epoch": 0.8819568737748231, + "grad_norm": 49.31589220764077, + "learning_rate": 8.897182063658425e-06, + "loss": 2.8567, + "step": 10348 + }, + { + "epoch": 0.8820421034688486, + "grad_norm": 37.526604614488534, + "learning_rate": 8.896871405081466e-06, + "loss": 3.9541, + "step": 10349 + }, + { + "epoch": 0.882127333162874, + "grad_norm": 46.23070809119739, + "learning_rate": 8.896560708180325e-06, + "loss": 3.7489, + "step": 10350 + }, + { + "epoch": 0.8822125628568993, + "grad_norm": 198.00208936447243, + "learning_rate": 8.896249972958056e-06, + "loss": 3.9905, + "step": 10351 + }, + { + "epoch": 0.8822977925509248, + "grad_norm": 50.196958731344324, + "learning_rate": 8.895939199417718e-06, + "loss": 2.558, + "step": 10352 + }, + { + "epoch": 0.8823830222449501, + "grad_norm": 77.88341809427082, + "learning_rate": 8.895628387562368e-06, + "loss": 3.4783, + "step": 10353 + }, + { + "epoch": 0.8824682519389755, + "grad_norm": 77.33073461505164, + "learning_rate": 8.895317537395058e-06, + "loss": 4.323, + "step": 10354 + }, + { + "epoch": 0.882553481633001, + "grad_norm": 66.92515108855585, + "learning_rate": 8.89500664891885e-06, + "loss": 2.9553, + "step": 10355 + }, + { + "epoch": 0.8826387113270263, + "grad_norm": 93.33792103311406, + "learning_rate": 8.894695722136797e-06, + "loss": 4.6673, + "step": 10356 + }, + { + "epoch": 0.8827239410210518, + "grad_norm": 72.454017271102, + "learning_rate": 8.894384757051962e-06, + "loss": 5.0248, + "step": 10357 + }, + { + "epoch": 0.8828091707150771, + "grad_norm": 27.30805596247779, + "learning_rate": 8.894073753667399e-06, + "loss": 2.9197, + "step": 10358 + }, + { + "epoch": 0.8828944004091025, + "grad_norm": 43.09235760056022, + "learning_rate": 8.893762711986167e-06, + "loss": 4.0143, + "step": 10359 + }, + { + "epoch": 0.882979630103128, + "grad_norm": 67.35630665029156, + "learning_rate": 8.893451632011326e-06, + "loss": 2.452, + "step": 10360 + }, + { + "epoch": 0.8830648597971533, + "grad_norm": 31.193023636729105, + "learning_rate": 8.893140513745937e-06, + "loss": 3.2885, + "step": 10361 + }, + { + "epoch": 0.8831500894911787, + "grad_norm": 45.47630785774522, + "learning_rate": 8.892829357193057e-06, + "loss": 3.3065, + "step": 10362 + }, + { + "epoch": 0.8832353191852041, + "grad_norm": 63.86923347028688, + "learning_rate": 8.892518162355747e-06, + "loss": 4.5476, + "step": 10363 + }, + { + "epoch": 0.8833205488792295, + "grad_norm": 37.02873067059776, + "learning_rate": 8.892206929237069e-06, + "loss": 3.2856, + "step": 10364 + }, + { + "epoch": 0.883405778573255, + "grad_norm": 73.16897992011683, + "learning_rate": 8.89189565784008e-06, + "loss": 4.9424, + "step": 10365 + }, + { + "epoch": 0.8834910082672803, + "grad_norm": 52.22002450632272, + "learning_rate": 8.891584348167846e-06, + "loss": 2.8664, + "step": 10366 + }, + { + "epoch": 0.8835762379613057, + "grad_norm": 92.60652687875725, + "learning_rate": 8.891273000223425e-06, + "loss": 4.5746, + "step": 10367 + }, + { + "epoch": 0.8836614676553312, + "grad_norm": 33.13287324071947, + "learning_rate": 8.89096161400988e-06, + "loss": 3.5987, + "step": 10368 + }, + { + "epoch": 0.8837466973493565, + "grad_norm": 90.9385347196911, + "learning_rate": 8.890650189530274e-06, + "loss": 5.4421, + "step": 10369 + }, + { + "epoch": 0.8838319270433819, + "grad_norm": 71.15420731878068, + "learning_rate": 8.89033872678767e-06, + "loss": 4.0561, + "step": 10370 + }, + { + "epoch": 0.8839171567374073, + "grad_norm": 53.66334631860279, + "learning_rate": 8.89002722578513e-06, + "loss": 3.8609, + "step": 10371 + }, + { + "epoch": 0.8840023864314327, + "grad_norm": 60.432920914357865, + "learning_rate": 8.889715686525718e-06, + "loss": 4.6359, + "step": 10372 + }, + { + "epoch": 0.8840876161254582, + "grad_norm": 33.955361766410995, + "learning_rate": 8.889404109012497e-06, + "loss": 2.3167, + "step": 10373 + }, + { + "epoch": 0.8841728458194835, + "grad_norm": 63.11305390066284, + "learning_rate": 8.889092493248535e-06, + "loss": 4.6718, + "step": 10374 + }, + { + "epoch": 0.8842580755135089, + "grad_norm": 59.965896782837866, + "learning_rate": 8.88878083923689e-06, + "loss": 3.7969, + "step": 10375 + }, + { + "epoch": 0.8843433052075343, + "grad_norm": 75.64057813589524, + "learning_rate": 8.888469146980633e-06, + "loss": 6.3904, + "step": 10376 + }, + { + "epoch": 0.8844285349015597, + "grad_norm": 39.906931538088735, + "learning_rate": 8.888157416482827e-06, + "loss": 3.8914, + "step": 10377 + }, + { + "epoch": 0.8845137645955851, + "grad_norm": 82.76605572214912, + "learning_rate": 8.887845647746538e-06, + "loss": 4.623, + "step": 10378 + }, + { + "epoch": 0.8845989942896105, + "grad_norm": 68.98913164595857, + "learning_rate": 8.887533840774832e-06, + "loss": 2.7891, + "step": 10379 + }, + { + "epoch": 0.8846842239836359, + "grad_norm": 34.209796669182005, + "learning_rate": 8.887221995570774e-06, + "loss": 4.1806, + "step": 10380 + }, + { + "epoch": 0.8847694536776612, + "grad_norm": 60.380884800732495, + "learning_rate": 8.886910112137432e-06, + "loss": 4.3224, + "step": 10381 + }, + { + "epoch": 0.8848546833716867, + "grad_norm": 33.76210554433796, + "learning_rate": 8.886598190477876e-06, + "loss": 3.4603, + "step": 10382 + }, + { + "epoch": 0.8849399130657121, + "grad_norm": 69.48696098806663, + "learning_rate": 8.886286230595169e-06, + "loss": 4.753, + "step": 10383 + }, + { + "epoch": 0.8850251427597375, + "grad_norm": 94.6775397216419, + "learning_rate": 8.885974232492382e-06, + "loss": 4.7466, + "step": 10384 + }, + { + "epoch": 0.8851103724537629, + "grad_norm": 56.356104168585574, + "learning_rate": 8.885662196172581e-06, + "loss": 4.007, + "step": 10385 + }, + { + "epoch": 0.8851956021477883, + "grad_norm": 63.21073754793325, + "learning_rate": 8.885350121638838e-06, + "loss": 4.7177, + "step": 10386 + }, + { + "epoch": 0.8852808318418137, + "grad_norm": 48.055813434984124, + "learning_rate": 8.885038008894221e-06, + "loss": 4.2138, + "step": 10387 + }, + { + "epoch": 0.8853660615358391, + "grad_norm": 36.95515141310326, + "learning_rate": 8.884725857941798e-06, + "loss": 3.4623, + "step": 10388 + }, + { + "epoch": 0.8854512912298644, + "grad_norm": 27.048149819254302, + "learning_rate": 8.88441366878464e-06, + "loss": 4.2493, + "step": 10389 + }, + { + "epoch": 0.8855365209238899, + "grad_norm": 25.32582217180532, + "learning_rate": 8.884101441425818e-06, + "loss": 2.9906, + "step": 10390 + }, + { + "epoch": 0.8856217506179153, + "grad_norm": 26.648076924837344, + "learning_rate": 8.8837891758684e-06, + "loss": 2.8269, + "step": 10391 + }, + { + "epoch": 0.8857069803119407, + "grad_norm": 62.76196142963507, + "learning_rate": 8.88347687211546e-06, + "loss": 4.2687, + "step": 10392 + }, + { + "epoch": 0.8857922100059661, + "grad_norm": 54.5628586604325, + "learning_rate": 8.883164530170068e-06, + "loss": 5.3769, + "step": 10393 + }, + { + "epoch": 0.8858774396999914, + "grad_norm": 68.50250630924887, + "learning_rate": 8.882852150035295e-06, + "loss": 5.1198, + "step": 10394 + }, + { + "epoch": 0.8859626693940169, + "grad_norm": 25.22206691726932, + "learning_rate": 8.882539731714214e-06, + "loss": 3.2381, + "step": 10395 + }, + { + "epoch": 0.8860478990880423, + "grad_norm": 33.09106247190086, + "learning_rate": 8.882227275209898e-06, + "loss": 3.7821, + "step": 10396 + }, + { + "epoch": 0.8861331287820676, + "grad_norm": 29.114791536632918, + "learning_rate": 8.88191478052542e-06, + "loss": 3.4085, + "step": 10397 + }, + { + "epoch": 0.8862183584760931, + "grad_norm": 29.874258734962158, + "learning_rate": 8.881602247663852e-06, + "loss": 3.4678, + "step": 10398 + }, + { + "epoch": 0.8863035881701185, + "grad_norm": 35.25399660794742, + "learning_rate": 8.881289676628267e-06, + "loss": 3.0095, + "step": 10399 + }, + { + "epoch": 0.8863888178641439, + "grad_norm": 75.00399549166909, + "learning_rate": 8.880977067421743e-06, + "loss": 4.5243, + "step": 10400 + }, + { + "epoch": 0.8864740475581693, + "grad_norm": 47.465532848832076, + "learning_rate": 8.88066442004735e-06, + "loss": 4.4493, + "step": 10401 + }, + { + "epoch": 0.8865592772521946, + "grad_norm": 46.65582051551574, + "learning_rate": 8.880351734508164e-06, + "loss": 3.9824, + "step": 10402 + }, + { + "epoch": 0.8866445069462201, + "grad_norm": 128.58237051440787, + "learning_rate": 8.880039010807263e-06, + "loss": 3.0622, + "step": 10403 + }, + { + "epoch": 0.8867297366402455, + "grad_norm": 36.771835005342865, + "learning_rate": 8.879726248947718e-06, + "loss": 4.2538, + "step": 10404 + }, + { + "epoch": 0.8868149663342708, + "grad_norm": 39.76844297750219, + "learning_rate": 8.879413448932606e-06, + "loss": 3.6185, + "step": 10405 + }, + { + "epoch": 0.8869001960282963, + "grad_norm": 77.56157776362107, + "learning_rate": 8.879100610765007e-06, + "loss": 4.0015, + "step": 10406 + }, + { + "epoch": 0.8869854257223216, + "grad_norm": 27.534870173341663, + "learning_rate": 8.878787734447993e-06, + "loss": 3.9656, + "step": 10407 + }, + { + "epoch": 0.8870706554163471, + "grad_norm": 90.4770026051034, + "learning_rate": 8.878474819984644e-06, + "loss": 5.4539, + "step": 10408 + }, + { + "epoch": 0.8871558851103725, + "grad_norm": 29.886873220528056, + "learning_rate": 8.878161867378034e-06, + "loss": 3.8927, + "step": 10409 + }, + { + "epoch": 0.8872411148043978, + "grad_norm": 104.61791536638205, + "learning_rate": 8.877848876631246e-06, + "loss": 1.779, + "step": 10410 + }, + { + "epoch": 0.8873263444984233, + "grad_norm": 32.31724607389823, + "learning_rate": 8.877535847747354e-06, + "loss": 3.7147, + "step": 10411 + }, + { + "epoch": 0.8874115741924486, + "grad_norm": 81.75970357823009, + "learning_rate": 8.877222780729438e-06, + "loss": 4.5855, + "step": 10412 + }, + { + "epoch": 0.887496803886474, + "grad_norm": 32.7420319299426, + "learning_rate": 8.876909675580576e-06, + "loss": 3.7872, + "step": 10413 + }, + { + "epoch": 0.8875820335804995, + "grad_norm": 33.084940882596996, + "learning_rate": 8.876596532303849e-06, + "loss": 3.5056, + "step": 10414 + }, + { + "epoch": 0.8876672632745248, + "grad_norm": 36.72349806771335, + "learning_rate": 8.876283350902335e-06, + "loss": 5.029, + "step": 10415 + }, + { + "epoch": 0.8877524929685502, + "grad_norm": 66.47707117005774, + "learning_rate": 8.875970131379115e-06, + "loss": 4.8435, + "step": 10416 + }, + { + "epoch": 0.8878377226625757, + "grad_norm": 45.33150840529535, + "learning_rate": 8.875656873737267e-06, + "loss": 4.2721, + "step": 10417 + }, + { + "epoch": 0.887922952356601, + "grad_norm": 44.29666485865966, + "learning_rate": 8.875343577979876e-06, + "loss": 4.5273, + "step": 10418 + }, + { + "epoch": 0.8880081820506265, + "grad_norm": 34.296453462433846, + "learning_rate": 8.87503024411002e-06, + "loss": 4.3969, + "step": 10419 + }, + { + "epoch": 0.8880934117446518, + "grad_norm": 74.37372674486181, + "learning_rate": 8.87471687213078e-06, + "loss": 4.2293, + "step": 10420 + }, + { + "epoch": 0.8881786414386772, + "grad_norm": 30.87438220882202, + "learning_rate": 8.87440346204524e-06, + "loss": 4.0465, + "step": 10421 + }, + { + "epoch": 0.8882638711327027, + "grad_norm": 32.80740367196984, + "learning_rate": 8.874090013856483e-06, + "loss": 3.178, + "step": 10422 + }, + { + "epoch": 0.888349100826728, + "grad_norm": 68.47069938383648, + "learning_rate": 8.873776527567587e-06, + "loss": 4.425, + "step": 10423 + }, + { + "epoch": 0.8884343305207534, + "grad_norm": 26.29762367086026, + "learning_rate": 8.873463003181642e-06, + "loss": 2.9913, + "step": 10424 + }, + { + "epoch": 0.8885195602147788, + "grad_norm": 50.3144576034952, + "learning_rate": 8.873149440701726e-06, + "loss": 4.2472, + "step": 10425 + }, + { + "epoch": 0.8886047899088042, + "grad_norm": 26.563626501524435, + "learning_rate": 8.872835840130923e-06, + "loss": 3.8225, + "step": 10426 + }, + { + "epoch": 0.8886900196028297, + "grad_norm": 38.185498412558374, + "learning_rate": 8.87252220147232e-06, + "loss": 4.1028, + "step": 10427 + }, + { + "epoch": 0.888775249296855, + "grad_norm": 110.32155390146997, + "learning_rate": 8.872208524728998e-06, + "loss": 4.67, + "step": 10428 + }, + { + "epoch": 0.8888604789908804, + "grad_norm": 45.606716230432134, + "learning_rate": 8.871894809904045e-06, + "loss": 5.1197, + "step": 10429 + }, + { + "epoch": 0.8889457086849059, + "grad_norm": 47.64495254165959, + "learning_rate": 8.871581057000546e-06, + "loss": 4.0182, + "step": 10430 + }, + { + "epoch": 0.8890309383789312, + "grad_norm": 108.73644887961896, + "learning_rate": 8.871267266021584e-06, + "loss": 3.9441, + "step": 10431 + }, + { + "epoch": 0.8891161680729566, + "grad_norm": 34.0737936215478, + "learning_rate": 8.870953436970247e-06, + "loss": 4.6996, + "step": 10432 + }, + { + "epoch": 0.889201397766982, + "grad_norm": 51.78321356171162, + "learning_rate": 8.870639569849622e-06, + "loss": 3.4075, + "step": 10433 + }, + { + "epoch": 0.8892866274610074, + "grad_norm": 41.71698991550041, + "learning_rate": 8.870325664662795e-06, + "loss": 3.5379, + "step": 10434 + }, + { + "epoch": 0.8893718571550329, + "grad_norm": 33.973302271854344, + "learning_rate": 8.870011721412852e-06, + "loss": 2.8613, + "step": 10435 + }, + { + "epoch": 0.8894570868490582, + "grad_norm": 41.55643662040644, + "learning_rate": 8.869697740102882e-06, + "loss": 4.0482, + "step": 10436 + }, + { + "epoch": 0.8895423165430836, + "grad_norm": 36.597954742858825, + "learning_rate": 8.869383720735973e-06, + "loss": 3.7225, + "step": 10437 + }, + { + "epoch": 0.889627546237109, + "grad_norm": 121.7661600130304, + "learning_rate": 8.869069663315212e-06, + "loss": 2.8999, + "step": 10438 + }, + { + "epoch": 0.8897127759311344, + "grad_norm": 35.78196975764647, + "learning_rate": 8.868755567843689e-06, + "loss": 4.1141, + "step": 10439 + }, + { + "epoch": 0.8897980056251598, + "grad_norm": 31.27148739969308, + "learning_rate": 8.86844143432449e-06, + "loss": 3.5762, + "step": 10440 + }, + { + "epoch": 0.8898832353191852, + "grad_norm": 48.112294661948, + "learning_rate": 8.86812726276071e-06, + "loss": 4.2145, + "step": 10441 + }, + { + "epoch": 0.8899684650132106, + "grad_norm": 58.09171692155485, + "learning_rate": 8.867813053155435e-06, + "loss": 3.9975, + "step": 10442 + }, + { + "epoch": 0.890053694707236, + "grad_norm": 37.745565442096506, + "learning_rate": 8.867498805511754e-06, + "loss": 3.311, + "step": 10443 + }, + { + "epoch": 0.8901389244012614, + "grad_norm": 48.89759347974239, + "learning_rate": 8.86718451983276e-06, + "loss": 4.2286, + "step": 10444 + }, + { + "epoch": 0.8902241540952868, + "grad_norm": 34.06363742414565, + "learning_rate": 8.86687019612154e-06, + "loss": 3.7187, + "step": 10445 + }, + { + "epoch": 0.8903093837893122, + "grad_norm": 34.00413052896808, + "learning_rate": 8.866555834381192e-06, + "loss": 4.0606, + "step": 10446 + }, + { + "epoch": 0.8903946134833376, + "grad_norm": 38.309893088124525, + "learning_rate": 8.866241434614801e-06, + "loss": 2.9973, + "step": 10447 + }, + { + "epoch": 0.890479843177363, + "grad_norm": 128.6168952154027, + "learning_rate": 8.865926996825464e-06, + "loss": 3.8815, + "step": 10448 + }, + { + "epoch": 0.8905650728713884, + "grad_norm": 33.862328698938505, + "learning_rate": 8.86561252101627e-06, + "loss": 4.1429, + "step": 10449 + }, + { + "epoch": 0.8906503025654138, + "grad_norm": 56.301469928594656, + "learning_rate": 8.865298007190313e-06, + "loss": 3.6696, + "step": 10450 + }, + { + "epoch": 0.8907355322594391, + "grad_norm": 31.46255001529664, + "learning_rate": 8.864983455350687e-06, + "loss": 3.3432, + "step": 10451 + }, + { + "epoch": 0.8908207619534646, + "grad_norm": 40.316838451353874, + "learning_rate": 8.864668865500484e-06, + "loss": 4.0665, + "step": 10452 + }, + { + "epoch": 0.89090599164749, + "grad_norm": 50.64716953975244, + "learning_rate": 8.864354237642796e-06, + "loss": 4.0642, + "step": 10453 + }, + { + "epoch": 0.8909912213415154, + "grad_norm": 27.44469686823986, + "learning_rate": 8.864039571780723e-06, + "loss": 3.2156, + "step": 10454 + }, + { + "epoch": 0.8910764510355408, + "grad_norm": 61.8589560549757, + "learning_rate": 8.863724867917355e-06, + "loss": 4.4551, + "step": 10455 + }, + { + "epoch": 0.8911616807295661, + "grad_norm": 39.080330947202825, + "learning_rate": 8.863410126055787e-06, + "loss": 4.2118, + "step": 10456 + }, + { + "epoch": 0.8912469104235916, + "grad_norm": 49.51668824926712, + "learning_rate": 8.863095346199116e-06, + "loss": 4.564, + "step": 10457 + }, + { + "epoch": 0.891332140117617, + "grad_norm": 40.11399392260315, + "learning_rate": 8.86278052835044e-06, + "loss": 3.7708, + "step": 10458 + }, + { + "epoch": 0.8914173698116423, + "grad_norm": 43.0747413471742, + "learning_rate": 8.862465672512848e-06, + "loss": 3.6061, + "step": 10459 + }, + { + "epoch": 0.8915025995056678, + "grad_norm": 37.07287665534016, + "learning_rate": 8.862150778689445e-06, + "loss": 4.3567, + "step": 10460 + }, + { + "epoch": 0.8915878291996931, + "grad_norm": 77.07034709440987, + "learning_rate": 8.861835846883322e-06, + "loss": 2.9556, + "step": 10461 + }, + { + "epoch": 0.8916730588937186, + "grad_norm": 32.33772472277365, + "learning_rate": 8.861520877097577e-06, + "loss": 3.7356, + "step": 10462 + }, + { + "epoch": 0.891758288587744, + "grad_norm": 37.33978822886419, + "learning_rate": 8.861205869335309e-06, + "loss": 3.0715, + "step": 10463 + }, + { + "epoch": 0.8918435182817693, + "grad_norm": 32.21725373086511, + "learning_rate": 8.860890823599618e-06, + "loss": 3.9327, + "step": 10464 + }, + { + "epoch": 0.8919287479757948, + "grad_norm": 58.57504103865836, + "learning_rate": 8.860575739893596e-06, + "loss": 4.283, + "step": 10465 + }, + { + "epoch": 0.8920139776698202, + "grad_norm": 31.25579677260734, + "learning_rate": 8.860260618220349e-06, + "loss": 2.9501, + "step": 10466 + }, + { + "epoch": 0.8920992073638455, + "grad_norm": 37.19179068125763, + "learning_rate": 8.85994545858297e-06, + "loss": 3.774, + "step": 10467 + }, + { + "epoch": 0.892184437057871, + "grad_norm": 72.09136627389704, + "learning_rate": 8.859630260984564e-06, + "loss": 4.4576, + "step": 10468 + }, + { + "epoch": 0.8922696667518963, + "grad_norm": 34.24072381909789, + "learning_rate": 8.859315025428225e-06, + "loss": 3.7729, + "step": 10469 + }, + { + "epoch": 0.8923548964459218, + "grad_norm": 49.12435991897473, + "learning_rate": 8.85899975191706e-06, + "loss": 4.0574, + "step": 10470 + }, + { + "epoch": 0.8924401261399472, + "grad_norm": 40.76597282537367, + "learning_rate": 8.858684440454163e-06, + "loss": 4.1025, + "step": 10471 + }, + { + "epoch": 0.8925253558339725, + "grad_norm": 78.78088723830935, + "learning_rate": 8.858369091042638e-06, + "loss": 5.5749, + "step": 10472 + }, + { + "epoch": 0.892610585527998, + "grad_norm": 59.156234039909016, + "learning_rate": 8.858053703685588e-06, + "loss": 4.4802, + "step": 10473 + }, + { + "epoch": 0.8926958152220233, + "grad_norm": 36.913955861991205, + "learning_rate": 8.857738278386112e-06, + "loss": 3.5794, + "step": 10474 + }, + { + "epoch": 0.8927810449160487, + "grad_norm": 55.5130584437511, + "learning_rate": 8.857422815147311e-06, + "loss": 4.4258, + "step": 10475 + }, + { + "epoch": 0.8928662746100742, + "grad_norm": 30.146790280571153, + "learning_rate": 8.857107313972292e-06, + "loss": 2.4712, + "step": 10476 + }, + { + "epoch": 0.8929515043040995, + "grad_norm": 61.61234845787928, + "learning_rate": 8.856791774864155e-06, + "loss": 5.072, + "step": 10477 + }, + { + "epoch": 0.893036733998125, + "grad_norm": 39.15692977495906, + "learning_rate": 8.856476197826003e-06, + "loss": 2.9125, + "step": 10478 + }, + { + "epoch": 0.8931219636921504, + "grad_norm": 40.25956075026867, + "learning_rate": 8.856160582860939e-06, + "loss": 3.6016, + "step": 10479 + }, + { + "epoch": 0.8932071933861757, + "grad_norm": 107.01393779329601, + "learning_rate": 8.855844929972068e-06, + "loss": 4.7489, + "step": 10480 + }, + { + "epoch": 0.8932924230802012, + "grad_norm": 23.90894702050363, + "learning_rate": 8.855529239162495e-06, + "loss": 2.2869, + "step": 10481 + }, + { + "epoch": 0.8933776527742265, + "grad_norm": 29.509743400182295, + "learning_rate": 8.855213510435324e-06, + "loss": 2.0353, + "step": 10482 + }, + { + "epoch": 0.8934628824682519, + "grad_norm": 41.519337195766234, + "learning_rate": 8.85489774379366e-06, + "loss": 4.3361, + "step": 10483 + }, + { + "epoch": 0.8935481121622774, + "grad_norm": 31.961579572768784, + "learning_rate": 8.854581939240608e-06, + "loss": 3.2548, + "step": 10484 + }, + { + "epoch": 0.8936333418563027, + "grad_norm": 212.82680599801353, + "learning_rate": 8.854266096779275e-06, + "loss": 4.5319, + "step": 10485 + }, + { + "epoch": 0.8937185715503282, + "grad_norm": 40.10266574743727, + "learning_rate": 8.853950216412766e-06, + "loss": 3.5265, + "step": 10486 + }, + { + "epoch": 0.8938038012443535, + "grad_norm": 40.44463935221351, + "learning_rate": 8.853634298144189e-06, + "loss": 3.302, + "step": 10487 + }, + { + "epoch": 0.8938890309383789, + "grad_norm": 77.40011112489597, + "learning_rate": 8.853318341976648e-06, + "loss": 4.2934, + "step": 10488 + }, + { + "epoch": 0.8939742606324044, + "grad_norm": 37.81899876224474, + "learning_rate": 8.853002347913254e-06, + "loss": 3.603, + "step": 10489 + }, + { + "epoch": 0.8940594903264297, + "grad_norm": 45.55630943949654, + "learning_rate": 8.852686315957114e-06, + "loss": 3.5428, + "step": 10490 + }, + { + "epoch": 0.8941447200204551, + "grad_norm": 54.87117858012576, + "learning_rate": 8.852370246111333e-06, + "loss": 3.4958, + "step": 10491 + }, + { + "epoch": 0.8942299497144806, + "grad_norm": 29.138368385363535, + "learning_rate": 8.85205413837902e-06, + "loss": 3.3462, + "step": 10492 + }, + { + "epoch": 0.8943151794085059, + "grad_norm": 40.56086276551836, + "learning_rate": 8.851737992763287e-06, + "loss": 3.3212, + "step": 10493 + }, + { + "epoch": 0.8944004091025313, + "grad_norm": 59.36697473450603, + "learning_rate": 8.851421809267242e-06, + "loss": 4.4125, + "step": 10494 + }, + { + "epoch": 0.8944856387965567, + "grad_norm": 51.72873933898612, + "learning_rate": 8.851105587893994e-06, + "loss": 4.5201, + "step": 10495 + }, + { + "epoch": 0.8945708684905821, + "grad_norm": 63.89200779908315, + "learning_rate": 8.850789328646652e-06, + "loss": 4.2963, + "step": 10496 + }, + { + "epoch": 0.8946560981846076, + "grad_norm": 45.952833525946524, + "learning_rate": 8.850473031528325e-06, + "loss": 3.7952, + "step": 10497 + }, + { + "epoch": 0.8947413278786329, + "grad_norm": 46.91838645361232, + "learning_rate": 8.85015669654213e-06, + "loss": 3.9307, + "step": 10498 + }, + { + "epoch": 0.8948265575726583, + "grad_norm": 51.93948687870097, + "learning_rate": 8.849840323691171e-06, + "loss": 4.0427, + "step": 10499 + }, + { + "epoch": 0.8949117872666837, + "grad_norm": 73.45968042126303, + "learning_rate": 8.849523912978564e-06, + "loss": 4.758, + "step": 10500 + }, + { + "epoch": 0.8949970169607091, + "grad_norm": 81.01688768270571, + "learning_rate": 8.849207464407417e-06, + "loss": 4.5681, + "step": 10501 + }, + { + "epoch": 0.8950822466547345, + "grad_norm": 44.25247760634497, + "learning_rate": 8.848890977980845e-06, + "loss": 2.2957, + "step": 10502 + }, + { + "epoch": 0.8951674763487599, + "grad_norm": 55.445766698557904, + "learning_rate": 8.84857445370196e-06, + "loss": 3.4214, + "step": 10503 + }, + { + "epoch": 0.8952527060427853, + "grad_norm": 47.63402503275056, + "learning_rate": 8.848257891573874e-06, + "loss": 4.7373, + "step": 10504 + }, + { + "epoch": 0.8953379357368108, + "grad_norm": 105.06735642619267, + "learning_rate": 8.8479412915997e-06, + "loss": 5.2662, + "step": 10505 + }, + { + "epoch": 0.8954231654308361, + "grad_norm": 52.202470497606704, + "learning_rate": 8.847624653782554e-06, + "loss": 5.1506, + "step": 10506 + }, + { + "epoch": 0.8955083951248615, + "grad_norm": 52.617573611943456, + "learning_rate": 8.847307978125548e-06, + "loss": 3.7966, + "step": 10507 + }, + { + "epoch": 0.8955936248188869, + "grad_norm": 37.199302598915644, + "learning_rate": 8.846991264631797e-06, + "loss": 3.6621, + "step": 10508 + }, + { + "epoch": 0.8956788545129123, + "grad_norm": 31.00224413587038, + "learning_rate": 8.846674513304414e-06, + "loss": 2.3562, + "step": 10509 + }, + { + "epoch": 0.8957640842069376, + "grad_norm": 62.755434860314494, + "learning_rate": 8.846357724146516e-06, + "loss": 3.7684, + "step": 10510 + }, + { + "epoch": 0.8958493139009631, + "grad_norm": 32.712701285306586, + "learning_rate": 8.846040897161222e-06, + "loss": 3.1432, + "step": 10511 + }, + { + "epoch": 0.8959345435949885, + "grad_norm": 42.86267236893408, + "learning_rate": 8.845724032351639e-06, + "loss": 3.0395, + "step": 10512 + }, + { + "epoch": 0.8960197732890139, + "grad_norm": 42.78198607717438, + "learning_rate": 8.845407129720891e-06, + "loss": 3.6788, + "step": 10513 + }, + { + "epoch": 0.8961050029830393, + "grad_norm": 80.88841101019831, + "learning_rate": 8.84509018927209e-06, + "loss": 4.8265, + "step": 10514 + }, + { + "epoch": 0.8961902326770647, + "grad_norm": 106.58726943867103, + "learning_rate": 8.844773211008357e-06, + "loss": 4.7306, + "step": 10515 + }, + { + "epoch": 0.8962754623710901, + "grad_norm": 48.88777845556862, + "learning_rate": 8.844456194932806e-06, + "loss": 3.7121, + "step": 10516 + }, + { + "epoch": 0.8963606920651155, + "grad_norm": 48.61135848019655, + "learning_rate": 8.844139141048557e-06, + "loss": 5.0611, + "step": 10517 + }, + { + "epoch": 0.8964459217591408, + "grad_norm": 64.0885971271889, + "learning_rate": 8.843822049358726e-06, + "loss": 4.4381, + "step": 10518 + }, + { + "epoch": 0.8965311514531663, + "grad_norm": 43.93117078226734, + "learning_rate": 8.843504919866434e-06, + "loss": 4.6244, + "step": 10519 + }, + { + "epoch": 0.8966163811471917, + "grad_norm": 30.426922352494987, + "learning_rate": 8.843187752574797e-06, + "loss": 3.5182, + "step": 10520 + }, + { + "epoch": 0.8967016108412171, + "grad_norm": 31.081647134675432, + "learning_rate": 8.842870547486936e-06, + "loss": 3.2394, + "step": 10521 + }, + { + "epoch": 0.8967868405352425, + "grad_norm": 33.503211655911535, + "learning_rate": 8.842553304605969e-06, + "loss": 3.4508, + "step": 10522 + }, + { + "epoch": 0.8968720702292678, + "grad_norm": 22.785776120911574, + "learning_rate": 8.842236023935018e-06, + "loss": 2.9723, + "step": 10523 + }, + { + "epoch": 0.8969572999232933, + "grad_norm": 41.49889866295254, + "learning_rate": 8.841918705477204e-06, + "loss": 3.1375, + "step": 10524 + }, + { + "epoch": 0.8970425296173187, + "grad_norm": 86.04803652305021, + "learning_rate": 8.841601349235644e-06, + "loss": 4.6077, + "step": 10525 + }, + { + "epoch": 0.897127759311344, + "grad_norm": 37.81342509958631, + "learning_rate": 8.841283955213462e-06, + "loss": 4.3814, + "step": 10526 + }, + { + "epoch": 0.8972129890053695, + "grad_norm": 55.32239894519668, + "learning_rate": 8.840966523413778e-06, + "loss": 4.7302, + "step": 10527 + }, + { + "epoch": 0.8972982186993949, + "grad_norm": 36.69200168653771, + "learning_rate": 8.840649053839717e-06, + "loss": 3.9862, + "step": 10528 + }, + { + "epoch": 0.8973834483934202, + "grad_norm": 35.90029562993257, + "learning_rate": 8.840331546494396e-06, + "loss": 4.3341, + "step": 10529 + }, + { + "epoch": 0.8974686780874457, + "grad_norm": 30.70353449583699, + "learning_rate": 8.840014001380942e-06, + "loss": 3.5751, + "step": 10530 + }, + { + "epoch": 0.897553907781471, + "grad_norm": 41.476927463057294, + "learning_rate": 8.839696418502474e-06, + "loss": 3.9103, + "step": 10531 + }, + { + "epoch": 0.8976391374754965, + "grad_norm": 92.72248689915942, + "learning_rate": 8.839378797862119e-06, + "loss": 4.9802, + "step": 10532 + }, + { + "epoch": 0.8977243671695219, + "grad_norm": 73.1606616094459, + "learning_rate": 8.839061139462999e-06, + "loss": 3.0429, + "step": 10533 + }, + { + "epoch": 0.8978095968635472, + "grad_norm": 42.89648828112029, + "learning_rate": 8.838743443308237e-06, + "loss": 3.7514, + "step": 10534 + }, + { + "epoch": 0.8978948265575727, + "grad_norm": 78.27103827109491, + "learning_rate": 8.838425709400959e-06, + "loss": 5.3848, + "step": 10535 + }, + { + "epoch": 0.897980056251598, + "grad_norm": 35.22911668420871, + "learning_rate": 8.83810793774429e-06, + "loss": 2.9059, + "step": 10536 + }, + { + "epoch": 0.8980652859456234, + "grad_norm": 69.25930733072663, + "learning_rate": 8.837790128341356e-06, + "loss": 4.8308, + "step": 10537 + }, + { + "epoch": 0.8981505156396489, + "grad_norm": 108.81176543338624, + "learning_rate": 8.83747228119528e-06, + "loss": 5.1436, + "step": 10538 + }, + { + "epoch": 0.8982357453336742, + "grad_norm": 63.609283882527734, + "learning_rate": 8.837154396309189e-06, + "loss": 3.0878, + "step": 10539 + }, + { + "epoch": 0.8983209750276997, + "grad_norm": 49.900658882236996, + "learning_rate": 8.836836473686208e-06, + "loss": 4.8891, + "step": 10540 + }, + { + "epoch": 0.898406204721725, + "grad_norm": 56.81159839364318, + "learning_rate": 8.836518513329466e-06, + "loss": 5.2185, + "step": 10541 + }, + { + "epoch": 0.8984914344157504, + "grad_norm": 74.15873638872145, + "learning_rate": 8.836200515242088e-06, + "loss": 4.6013, + "step": 10542 + }, + { + "epoch": 0.8985766641097759, + "grad_norm": 42.89196427515866, + "learning_rate": 8.835882479427205e-06, + "loss": 3.378, + "step": 10543 + }, + { + "epoch": 0.8986618938038012, + "grad_norm": 96.94365856805365, + "learning_rate": 8.835564405887938e-06, + "loss": 4.527, + "step": 10544 + }, + { + "epoch": 0.8987471234978266, + "grad_norm": 65.35708940028056, + "learning_rate": 8.835246294627423e-06, + "loss": 3.5196, + "step": 10545 + }, + { + "epoch": 0.8988323531918521, + "grad_norm": 70.76501634635194, + "learning_rate": 8.834928145648783e-06, + "loss": 3.6294, + "step": 10546 + }, + { + "epoch": 0.8989175828858774, + "grad_norm": 62.497128036402856, + "learning_rate": 8.834609958955147e-06, + "loss": 5.3118, + "step": 10547 + }, + { + "epoch": 0.8990028125799029, + "grad_norm": 51.65528215614387, + "learning_rate": 8.834291734549647e-06, + "loss": 4.2154, + "step": 10548 + }, + { + "epoch": 0.8990880422739282, + "grad_norm": 77.41657106184057, + "learning_rate": 8.833973472435411e-06, + "loss": 6.3021, + "step": 10549 + }, + { + "epoch": 0.8991732719679536, + "grad_norm": 41.43021642004363, + "learning_rate": 8.833655172615572e-06, + "loss": 4.0773, + "step": 10550 + }, + { + "epoch": 0.8992585016619791, + "grad_norm": 139.50396172843963, + "learning_rate": 8.833336835093255e-06, + "loss": 5.2984, + "step": 10551 + }, + { + "epoch": 0.8993437313560044, + "grad_norm": 50.08643395426232, + "learning_rate": 8.833018459871594e-06, + "loss": 3.4888, + "step": 10552 + }, + { + "epoch": 0.8994289610500298, + "grad_norm": 61.401521751828014, + "learning_rate": 8.832700046953719e-06, + "loss": 4.8847, + "step": 10553 + }, + { + "epoch": 0.8995141907440553, + "grad_norm": 60.7085412924356, + "learning_rate": 8.832381596342762e-06, + "loss": 3.3816, + "step": 10554 + }, + { + "epoch": 0.8995994204380806, + "grad_norm": 28.80707682727611, + "learning_rate": 8.832063108041856e-06, + "loss": 2.9705, + "step": 10555 + }, + { + "epoch": 0.8996846501321061, + "grad_norm": 83.23913566464839, + "learning_rate": 8.831744582054132e-06, + "loss": 4.2706, + "step": 10556 + }, + { + "epoch": 0.8997698798261314, + "grad_norm": 74.9333346263327, + "learning_rate": 8.831426018382723e-06, + "loss": 4.6975, + "step": 10557 + }, + { + "epoch": 0.8998551095201568, + "grad_norm": 55.104020044476464, + "learning_rate": 8.83110741703076e-06, + "loss": 4.6976, + "step": 10558 + }, + { + "epoch": 0.8999403392141823, + "grad_norm": 59.512958672146674, + "learning_rate": 8.830788778001377e-06, + "loss": 4.8858, + "step": 10559 + }, + { + "epoch": 0.9000255689082076, + "grad_norm": 73.74078974717492, + "learning_rate": 8.830470101297713e-06, + "loss": 4.8181, + "step": 10560 + }, + { + "epoch": 0.900110798602233, + "grad_norm": 30.61416771979781, + "learning_rate": 8.830151386922893e-06, + "loss": 3.1691, + "step": 10561 + }, + { + "epoch": 0.9001960282962584, + "grad_norm": 58.63776061904155, + "learning_rate": 8.82983263488006e-06, + "loss": 5.3562, + "step": 10562 + }, + { + "epoch": 0.9002812579902838, + "grad_norm": 31.35459962770679, + "learning_rate": 8.829513845172342e-06, + "loss": 2.2139, + "step": 10563 + }, + { + "epoch": 0.9003664876843092, + "grad_norm": 59.635391726675195, + "learning_rate": 8.829195017802877e-06, + "loss": 4.7914, + "step": 10564 + }, + { + "epoch": 0.9004517173783346, + "grad_norm": 36.66350276295853, + "learning_rate": 8.828876152774802e-06, + "loss": 3.9289, + "step": 10565 + }, + { + "epoch": 0.90053694707236, + "grad_norm": 64.55779360411793, + "learning_rate": 8.82855725009125e-06, + "loss": 6.4693, + "step": 10566 + }, + { + "epoch": 0.9006221767663855, + "grad_norm": 30.921371359648926, + "learning_rate": 8.82823830975536e-06, + "loss": 3.3403, + "step": 10567 + }, + { + "epoch": 0.9007074064604108, + "grad_norm": 42.923278610092275, + "learning_rate": 8.827919331770267e-06, + "loss": 3.2889, + "step": 10568 + }, + { + "epoch": 0.9007926361544362, + "grad_norm": 31.322122609891117, + "learning_rate": 8.827600316139108e-06, + "loss": 2.9713, + "step": 10569 + }, + { + "epoch": 0.9008778658484616, + "grad_norm": 30.462106113847618, + "learning_rate": 8.827281262865022e-06, + "loss": 2.8579, + "step": 10570 + }, + { + "epoch": 0.900963095542487, + "grad_norm": 80.69857219430055, + "learning_rate": 8.826962171951144e-06, + "loss": 4.4963, + "step": 10571 + }, + { + "epoch": 0.9010483252365123, + "grad_norm": 140.14541401419623, + "learning_rate": 8.826643043400615e-06, + "loss": 4.3892, + "step": 10572 + }, + { + "epoch": 0.9011335549305378, + "grad_norm": 44.01556377833539, + "learning_rate": 8.826323877216572e-06, + "loss": 4.3772, + "step": 10573 + }, + { + "epoch": 0.9012187846245632, + "grad_norm": 60.785732809992155, + "learning_rate": 8.826004673402153e-06, + "loss": 3.3792, + "step": 10574 + }, + { + "epoch": 0.9013040143185886, + "grad_norm": 49.36762798920561, + "learning_rate": 8.825685431960501e-06, + "loss": 4.8426, + "step": 10575 + }, + { + "epoch": 0.901389244012614, + "grad_norm": 39.61399197786364, + "learning_rate": 8.82536615289475e-06, + "loss": 3.8622, + "step": 10576 + }, + { + "epoch": 0.9014744737066394, + "grad_norm": 42.14671851039923, + "learning_rate": 8.825046836208044e-06, + "loss": 4.6121, + "step": 10577 + }, + { + "epoch": 0.9015597034006648, + "grad_norm": 39.40492766787781, + "learning_rate": 8.824727481903523e-06, + "loss": 3.4619, + "step": 10578 + }, + { + "epoch": 0.9016449330946902, + "grad_norm": 30.779220118514225, + "learning_rate": 8.824408089984327e-06, + "loss": 3.3086, + "step": 10579 + }, + { + "epoch": 0.9017301627887155, + "grad_norm": 42.61735380356479, + "learning_rate": 8.824088660453596e-06, + "loss": 4.0951, + "step": 10580 + }, + { + "epoch": 0.901815392482741, + "grad_norm": 26.534178677729734, + "learning_rate": 8.823769193314472e-06, + "loss": 2.5516, + "step": 10581 + }, + { + "epoch": 0.9019006221767664, + "grad_norm": 36.399897254542914, + "learning_rate": 8.8234496885701e-06, + "loss": 4.0386, + "step": 10582 + }, + { + "epoch": 0.9019858518707918, + "grad_norm": 72.0544442708803, + "learning_rate": 8.823130146223617e-06, + "loss": 5.1287, + "step": 10583 + }, + { + "epoch": 0.9020710815648172, + "grad_norm": 31.685634690618585, + "learning_rate": 8.82281056627817e-06, + "loss": 2.6835, + "step": 10584 + }, + { + "epoch": 0.9021563112588425, + "grad_norm": 71.92212274677358, + "learning_rate": 8.8224909487369e-06, + "loss": 3.1989, + "step": 10585 + }, + { + "epoch": 0.902241540952868, + "grad_norm": 72.99790641328286, + "learning_rate": 8.822171293602948e-06, + "loss": 5.1171, + "step": 10586 + }, + { + "epoch": 0.9023267706468934, + "grad_norm": 49.022957534099184, + "learning_rate": 8.821851600879462e-06, + "loss": 3.7961, + "step": 10587 + }, + { + "epoch": 0.9024120003409187, + "grad_norm": 35.88542061071366, + "learning_rate": 8.821531870569586e-06, + "loss": 2.6462, + "step": 10588 + }, + { + "epoch": 0.9024972300349442, + "grad_norm": 47.4927607253819, + "learning_rate": 8.82121210267646e-06, + "loss": 4.4577, + "step": 10589 + }, + { + "epoch": 0.9025824597289696, + "grad_norm": 37.70059641107814, + "learning_rate": 8.820892297203231e-06, + "loss": 3.3167, + "step": 10590 + }, + { + "epoch": 0.902667689422995, + "grad_norm": 42.55797858756201, + "learning_rate": 8.820572454153047e-06, + "loss": 3.9156, + "step": 10591 + }, + { + "epoch": 0.9027529191170204, + "grad_norm": 68.02108171735826, + "learning_rate": 8.820252573529049e-06, + "loss": 5.4248, + "step": 10592 + }, + { + "epoch": 0.9028381488110457, + "grad_norm": 47.05379985876589, + "learning_rate": 8.819932655334386e-06, + "loss": 4.0874, + "step": 10593 + }, + { + "epoch": 0.9029233785050712, + "grad_norm": 79.73224467117859, + "learning_rate": 8.819612699572203e-06, + "loss": 4.7174, + "step": 10594 + }, + { + "epoch": 0.9030086081990966, + "grad_norm": 39.16489385716022, + "learning_rate": 8.819292706245646e-06, + "loss": 2.3213, + "step": 10595 + }, + { + "epoch": 0.9030938378931219, + "grad_norm": 103.56288096795967, + "learning_rate": 8.818972675357864e-06, + "loss": 4.0079, + "step": 10596 + }, + { + "epoch": 0.9031790675871474, + "grad_norm": 41.35296070456367, + "learning_rate": 8.818652606912003e-06, + "loss": 3.8922, + "step": 10597 + }, + { + "epoch": 0.9032642972811727, + "grad_norm": 53.170200365130725, + "learning_rate": 8.818332500911211e-06, + "loss": 5.5002, + "step": 10598 + }, + { + "epoch": 0.9033495269751982, + "grad_norm": 49.521836420355335, + "learning_rate": 8.818012357358637e-06, + "loss": 5.3257, + "step": 10599 + }, + { + "epoch": 0.9034347566692236, + "grad_norm": 113.03402218655526, + "learning_rate": 8.817692176257426e-06, + "loss": 4.8458, + "step": 10600 + }, + { + "epoch": 0.9035199863632489, + "grad_norm": 27.035863690605403, + "learning_rate": 8.817371957610732e-06, + "loss": 2.0234, + "step": 10601 + }, + { + "epoch": 0.9036052160572744, + "grad_norm": 85.37513162822917, + "learning_rate": 8.817051701421699e-06, + "loss": 5.0471, + "step": 10602 + }, + { + "epoch": 0.9036904457512998, + "grad_norm": 47.08568021797136, + "learning_rate": 8.816731407693481e-06, + "loss": 4.4158, + "step": 10603 + }, + { + "epoch": 0.9037756754453251, + "grad_norm": 37.4990096460723, + "learning_rate": 8.816411076429227e-06, + "loss": 2.9582, + "step": 10604 + }, + { + "epoch": 0.9038609051393506, + "grad_norm": 49.291658377844094, + "learning_rate": 8.816090707632086e-06, + "loss": 3.8176, + "step": 10605 + }, + { + "epoch": 0.9039461348333759, + "grad_norm": 47.7036471641252, + "learning_rate": 8.815770301305209e-06, + "loss": 4.0186, + "step": 10606 + }, + { + "epoch": 0.9040313645274013, + "grad_norm": 43.82054710991117, + "learning_rate": 8.815449857451748e-06, + "loss": 3.8493, + "step": 10607 + }, + { + "epoch": 0.9041165942214268, + "grad_norm": 89.23405295482377, + "learning_rate": 8.815129376074852e-06, + "loss": 4.9687, + "step": 10608 + }, + { + "epoch": 0.9042018239154521, + "grad_norm": 57.8099662466928, + "learning_rate": 8.814808857177677e-06, + "loss": 3.7781, + "step": 10609 + }, + { + "epoch": 0.9042870536094776, + "grad_norm": 41.3860376096881, + "learning_rate": 8.81448830076337e-06, + "loss": 4.2718, + "step": 10610 + }, + { + "epoch": 0.9043722833035029, + "grad_norm": 33.31264153597836, + "learning_rate": 8.814167706835089e-06, + "loss": 3.8737, + "step": 10611 + }, + { + "epoch": 0.9044575129975283, + "grad_norm": 46.69634069632796, + "learning_rate": 8.813847075395982e-06, + "loss": 3.8797, + "step": 10612 + }, + { + "epoch": 0.9045427426915538, + "grad_norm": 49.96747448819431, + "learning_rate": 8.813526406449206e-06, + "loss": 3.7202, + "step": 10613 + }, + { + "epoch": 0.9046279723855791, + "grad_norm": 49.94642582041874, + "learning_rate": 8.813205699997912e-06, + "loss": 3.206, + "step": 10614 + }, + { + "epoch": 0.9047132020796045, + "grad_norm": 53.38641414617966, + "learning_rate": 8.812884956045255e-06, + "loss": 4.2767, + "step": 10615 + }, + { + "epoch": 0.90479843177363, + "grad_norm": 41.523571008399564, + "learning_rate": 8.812564174594391e-06, + "loss": 4.6857, + "step": 10616 + }, + { + "epoch": 0.9048836614676553, + "grad_norm": 34.978550751709236, + "learning_rate": 8.812243355648471e-06, + "loss": 3.2253, + "step": 10617 + }, + { + "epoch": 0.9049688911616808, + "grad_norm": 29.90740637570106, + "learning_rate": 8.811922499210654e-06, + "loss": 3.7446, + "step": 10618 + }, + { + "epoch": 0.9050541208557061, + "grad_norm": 28.973202320405576, + "learning_rate": 8.811601605284094e-06, + "loss": 3.1669, + "step": 10619 + }, + { + "epoch": 0.9051393505497315, + "grad_norm": 52.55529239347685, + "learning_rate": 8.811280673871947e-06, + "loss": 4.2874, + "step": 10620 + }, + { + "epoch": 0.905224580243757, + "grad_norm": 44.87884495920788, + "learning_rate": 8.810959704977369e-06, + "loss": 2.764, + "step": 10621 + }, + { + "epoch": 0.9053098099377823, + "grad_norm": 42.12645714691195, + "learning_rate": 8.810638698603517e-06, + "loss": 4.0665, + "step": 10622 + }, + { + "epoch": 0.9053950396318077, + "grad_norm": 33.137437934378745, + "learning_rate": 8.810317654753547e-06, + "loss": 2.825, + "step": 10623 + }, + { + "epoch": 0.9054802693258331, + "grad_norm": 39.94778197079136, + "learning_rate": 8.809996573430617e-06, + "loss": 3.4426, + "step": 10624 + }, + { + "epoch": 0.9055654990198585, + "grad_norm": 79.85567754304545, + "learning_rate": 8.809675454637883e-06, + "loss": 3.7429, + "step": 10625 + }, + { + "epoch": 0.905650728713884, + "grad_norm": 81.90454729270212, + "learning_rate": 8.809354298378508e-06, + "loss": 5.0592, + "step": 10626 + }, + { + "epoch": 0.9057359584079093, + "grad_norm": 51.31689985989131, + "learning_rate": 8.809033104655644e-06, + "loss": 4.5368, + "step": 10627 + }, + { + "epoch": 0.9058211881019347, + "grad_norm": 33.68808739217703, + "learning_rate": 8.808711873472455e-06, + "loss": 3.9266, + "step": 10628 + }, + { + "epoch": 0.9059064177959602, + "grad_norm": 90.00305408402309, + "learning_rate": 8.808390604832099e-06, + "loss": 3.5305, + "step": 10629 + }, + { + "epoch": 0.9059916474899855, + "grad_norm": 44.5664943714832, + "learning_rate": 8.80806929873773e-06, + "loss": 3.796, + "step": 10630 + }, + { + "epoch": 0.9060768771840109, + "grad_norm": 43.805634089406574, + "learning_rate": 8.807747955192518e-06, + "loss": 4.0377, + "step": 10631 + }, + { + "epoch": 0.9061621068780363, + "grad_norm": 52.452667914495365, + "learning_rate": 8.807426574199616e-06, + "loss": 4.445, + "step": 10632 + }, + { + "epoch": 0.9062473365720617, + "grad_norm": 53.15614737655847, + "learning_rate": 8.807105155762186e-06, + "loss": 5.392, + "step": 10633 + }, + { + "epoch": 0.9063325662660872, + "grad_norm": 35.90998499289088, + "learning_rate": 8.80678369988339e-06, + "loss": 3.4922, + "step": 10634 + }, + { + "epoch": 0.9064177959601125, + "grad_norm": 36.52378208076253, + "learning_rate": 8.806462206566388e-06, + "loss": 3.8705, + "step": 10635 + }, + { + "epoch": 0.9065030256541379, + "grad_norm": 47.40102873317028, + "learning_rate": 8.806140675814344e-06, + "loss": 4.1134, + "step": 10636 + }, + { + "epoch": 0.9065882553481633, + "grad_norm": 33.04398271901555, + "learning_rate": 8.805819107630418e-06, + "loss": 3.3636, + "step": 10637 + }, + { + "epoch": 0.9066734850421887, + "grad_norm": 37.94154714384589, + "learning_rate": 8.805497502017775e-06, + "loss": 2.6714, + "step": 10638 + }, + { + "epoch": 0.906758714736214, + "grad_norm": 45.150246511750105, + "learning_rate": 8.805175858979574e-06, + "loss": 4.5829, + "step": 10639 + }, + { + "epoch": 0.9068439444302395, + "grad_norm": 25.866738877143455, + "learning_rate": 8.804854178518982e-06, + "loss": 1.8031, + "step": 10640 + }, + { + "epoch": 0.9069291741242649, + "grad_norm": 48.737959525133185, + "learning_rate": 8.80453246063916e-06, + "loss": 3.7468, + "step": 10641 + }, + { + "epoch": 0.9070144038182902, + "grad_norm": 36.936403436167474, + "learning_rate": 8.804210705343275e-06, + "loss": 3.0978, + "step": 10642 + }, + { + "epoch": 0.9070996335123157, + "grad_norm": 23.739794149304945, + "learning_rate": 8.803888912634488e-06, + "loss": 2.4159, + "step": 10643 + }, + { + "epoch": 0.9071848632063411, + "grad_norm": 47.33046107061893, + "learning_rate": 8.803567082515964e-06, + "loss": 3.1104, + "step": 10644 + }, + { + "epoch": 0.9072700929003665, + "grad_norm": 44.395597763838296, + "learning_rate": 8.803245214990869e-06, + "loss": 4.6632, + "step": 10645 + }, + { + "epoch": 0.9073553225943919, + "grad_norm": 31.00311175979833, + "learning_rate": 8.802923310062371e-06, + "loss": 2.9536, + "step": 10646 + }, + { + "epoch": 0.9074405522884172, + "grad_norm": 46.22310480552174, + "learning_rate": 8.802601367733632e-06, + "loss": 3.22, + "step": 10647 + }, + { + "epoch": 0.9075257819824427, + "grad_norm": 29.7164314938909, + "learning_rate": 8.80227938800782e-06, + "loss": 4.2034, + "step": 10648 + }, + { + "epoch": 0.9076110116764681, + "grad_norm": 43.7780477384, + "learning_rate": 8.801957370888101e-06, + "loss": 2.63, + "step": 10649 + }, + { + "epoch": 0.9076962413704934, + "grad_norm": 51.1014020350219, + "learning_rate": 8.80163531637764e-06, + "loss": 4.0879, + "step": 10650 + }, + { + "epoch": 0.9077814710645189, + "grad_norm": 56.63135624756747, + "learning_rate": 8.80131322447961e-06, + "loss": 5.0734, + "step": 10651 + }, + { + "epoch": 0.9078667007585443, + "grad_norm": 81.73707560980903, + "learning_rate": 8.800991095197174e-06, + "loss": 5.2664, + "step": 10652 + }, + { + "epoch": 0.9079519304525697, + "grad_norm": 32.060039558838376, + "learning_rate": 8.800668928533498e-06, + "loss": 2.8316, + "step": 10653 + }, + { + "epoch": 0.9080371601465951, + "grad_norm": 24.908889012866307, + "learning_rate": 8.800346724491756e-06, + "loss": 2.5154, + "step": 10654 + }, + { + "epoch": 0.9081223898406204, + "grad_norm": 92.34560439409972, + "learning_rate": 8.800024483075114e-06, + "loss": 4.6837, + "step": 10655 + }, + { + "epoch": 0.9082076195346459, + "grad_norm": 23.663598671271828, + "learning_rate": 8.799702204286739e-06, + "loss": 1.9587, + "step": 10656 + }, + { + "epoch": 0.9082928492286713, + "grad_norm": 60.60295377743999, + "learning_rate": 8.799379888129803e-06, + "loss": 4.761, + "step": 10657 + }, + { + "epoch": 0.9083780789226966, + "grad_norm": 46.862506558531464, + "learning_rate": 8.799057534607478e-06, + "loss": 4.0499, + "step": 10658 + }, + { + "epoch": 0.9084633086167221, + "grad_norm": 32.4901974867012, + "learning_rate": 8.798735143722931e-06, + "loss": 2.9993, + "step": 10659 + }, + { + "epoch": 0.9085485383107474, + "grad_norm": 39.590906704277934, + "learning_rate": 8.798412715479333e-06, + "loss": 3.6082, + "step": 10660 + }, + { + "epoch": 0.9086337680047729, + "grad_norm": 50.18352746656878, + "learning_rate": 8.798090249879855e-06, + "loss": 3.124, + "step": 10661 + }, + { + "epoch": 0.9087189976987983, + "grad_norm": 44.09802239541272, + "learning_rate": 8.797767746927668e-06, + "loss": 4.3985, + "step": 10662 + }, + { + "epoch": 0.9088042273928236, + "grad_norm": 32.9058092965225, + "learning_rate": 8.797445206625945e-06, + "loss": 3.4208, + "step": 10663 + }, + { + "epoch": 0.9088894570868491, + "grad_norm": 38.483922951476195, + "learning_rate": 8.797122628977858e-06, + "loss": 4.2847, + "step": 10664 + }, + { + "epoch": 0.9089746867808745, + "grad_norm": 69.11054937503992, + "learning_rate": 8.796800013986578e-06, + "loss": 2.4742, + "step": 10665 + }, + { + "epoch": 0.9090599164748998, + "grad_norm": 77.29403335242743, + "learning_rate": 8.796477361655279e-06, + "loss": 4.4836, + "step": 10666 + }, + { + "epoch": 0.9091451461689253, + "grad_norm": 35.62445018661074, + "learning_rate": 8.796154671987134e-06, + "loss": 3.3789, + "step": 10667 + }, + { + "epoch": 0.9092303758629506, + "grad_norm": 146.8122241647815, + "learning_rate": 8.795831944985314e-06, + "loss": 4.7069, + "step": 10668 + }, + { + "epoch": 0.9093156055569761, + "grad_norm": 30.94628517625946, + "learning_rate": 8.795509180652997e-06, + "loss": 3.0598, + "step": 10669 + }, + { + "epoch": 0.9094008352510015, + "grad_norm": 73.41161244126069, + "learning_rate": 8.795186378993356e-06, + "loss": 4.3462, + "step": 10670 + }, + { + "epoch": 0.9094860649450268, + "grad_norm": 39.61474111018912, + "learning_rate": 8.794863540009564e-06, + "loss": 4.1459, + "step": 10671 + }, + { + "epoch": 0.9095712946390523, + "grad_norm": 48.35927184916606, + "learning_rate": 8.794540663704797e-06, + "loss": 3.8389, + "step": 10672 + }, + { + "epoch": 0.9096565243330776, + "grad_norm": 32.03134041448226, + "learning_rate": 8.79421775008223e-06, + "loss": 3.6551, + "step": 10673 + }, + { + "epoch": 0.909741754027103, + "grad_norm": 71.81854353652344, + "learning_rate": 8.79389479914504e-06, + "loss": 5.105, + "step": 10674 + }, + { + "epoch": 0.9098269837211285, + "grad_norm": 60.55674628259444, + "learning_rate": 8.793571810896401e-06, + "loss": 4.3497, + "step": 10675 + }, + { + "epoch": 0.9099122134151538, + "grad_norm": 33.43768727008304, + "learning_rate": 8.793248785339493e-06, + "loss": 3.9569, + "step": 10676 + }, + { + "epoch": 0.9099974431091792, + "grad_norm": 31.3832754861161, + "learning_rate": 8.79292572247749e-06, + "loss": 3.2251, + "step": 10677 + }, + { + "epoch": 0.9100826728032047, + "grad_norm": 51.17900466179823, + "learning_rate": 8.792602622313568e-06, + "loss": 4.1776, + "step": 10678 + }, + { + "epoch": 0.91016790249723, + "grad_norm": 49.819830065078634, + "learning_rate": 8.792279484850907e-06, + "loss": 3.6033, + "step": 10679 + }, + { + "epoch": 0.9102531321912555, + "grad_norm": 66.22923047066763, + "learning_rate": 8.791956310092685e-06, + "loss": 3.6856, + "step": 10680 + }, + { + "epoch": 0.9103383618852808, + "grad_norm": 35.507692505177694, + "learning_rate": 8.791633098042077e-06, + "loss": 3.8925, + "step": 10681 + }, + { + "epoch": 0.9104235915793062, + "grad_norm": 48.26939850400589, + "learning_rate": 8.791309848702266e-06, + "loss": 4.5245, + "step": 10682 + }, + { + "epoch": 0.9105088212733317, + "grad_norm": 64.47729516178667, + "learning_rate": 8.790986562076428e-06, + "loss": 4.4277, + "step": 10683 + }, + { + "epoch": 0.910594050967357, + "grad_norm": 35.116853726241274, + "learning_rate": 8.790663238167744e-06, + "loss": 3.0121, + "step": 10684 + }, + { + "epoch": 0.9106792806613824, + "grad_norm": 30.0616790301766, + "learning_rate": 8.790339876979395e-06, + "loss": 2.9721, + "step": 10685 + }, + { + "epoch": 0.9107645103554078, + "grad_norm": 41.8586683370504, + "learning_rate": 8.790016478514558e-06, + "loss": 3.5409, + "step": 10686 + }, + { + "epoch": 0.9108497400494332, + "grad_norm": 24.753392977921127, + "learning_rate": 8.789693042776415e-06, + "loss": 1.5639, + "step": 10687 + }, + { + "epoch": 0.9109349697434587, + "grad_norm": 37.156695647731304, + "learning_rate": 8.789369569768146e-06, + "loss": 4.1383, + "step": 10688 + }, + { + "epoch": 0.911020199437484, + "grad_norm": 48.216915449546576, + "learning_rate": 8.789046059492935e-06, + "loss": 4.1258, + "step": 10689 + }, + { + "epoch": 0.9111054291315094, + "grad_norm": 53.405493110860476, + "learning_rate": 8.78872251195396e-06, + "loss": 4.1571, + "step": 10690 + }, + { + "epoch": 0.9111906588255348, + "grad_norm": 73.48199115506614, + "learning_rate": 8.788398927154405e-06, + "loss": 5.1214, + "step": 10691 + }, + { + "epoch": 0.9112758885195602, + "grad_norm": 33.97796493732964, + "learning_rate": 8.788075305097454e-06, + "loss": 3.7864, + "step": 10692 + }, + { + "epoch": 0.9113611182135856, + "grad_norm": 42.69926095293364, + "learning_rate": 8.787751645786284e-06, + "loss": 3.7482, + "step": 10693 + }, + { + "epoch": 0.911446347907611, + "grad_norm": 36.88509466256158, + "learning_rate": 8.787427949224083e-06, + "loss": 3.1053, + "step": 10694 + }, + { + "epoch": 0.9115315776016364, + "grad_norm": 41.4166288128931, + "learning_rate": 8.787104215414035e-06, + "loss": 2.3871, + "step": 10695 + }, + { + "epoch": 0.9116168072956619, + "grad_norm": 61.43257385445981, + "learning_rate": 8.786780444359319e-06, + "loss": 4.6694, + "step": 10696 + }, + { + "epoch": 0.9117020369896872, + "grad_norm": 53.2486718146995, + "learning_rate": 8.786456636063125e-06, + "loss": 4.6666, + "step": 10697 + }, + { + "epoch": 0.9117872666837126, + "grad_norm": 39.700974957679996, + "learning_rate": 8.786132790528633e-06, + "loss": 3.8572, + "step": 10698 + }, + { + "epoch": 0.911872496377738, + "grad_norm": 78.82000075692268, + "learning_rate": 8.785808907759028e-06, + "loss": 3.6295, + "step": 10699 + }, + { + "epoch": 0.9119577260717634, + "grad_norm": 75.16075194156242, + "learning_rate": 8.785484987757499e-06, + "loss": 4.4956, + "step": 10700 + }, + { + "epoch": 0.9120429557657888, + "grad_norm": 36.0059106730309, + "learning_rate": 8.785161030527229e-06, + "loss": 3.261, + "step": 10701 + }, + { + "epoch": 0.9121281854598142, + "grad_norm": 39.76882612261687, + "learning_rate": 8.784837036071403e-06, + "loss": 4.322, + "step": 10702 + }, + { + "epoch": 0.9122134151538396, + "grad_norm": 53.08684679345402, + "learning_rate": 8.784513004393208e-06, + "loss": 2.2203, + "step": 10703 + }, + { + "epoch": 0.912298644847865, + "grad_norm": 44.24217314296038, + "learning_rate": 8.784188935495833e-06, + "loss": 3.0607, + "step": 10704 + }, + { + "epoch": 0.9123838745418904, + "grad_norm": 30.9270662768507, + "learning_rate": 8.783864829382465e-06, + "loss": 3.3747, + "step": 10705 + }, + { + "epoch": 0.9124691042359158, + "grad_norm": 69.94680509305533, + "learning_rate": 8.783540686056288e-06, + "loss": 4.7065, + "step": 10706 + }, + { + "epoch": 0.9125543339299412, + "grad_norm": 38.73299460528091, + "learning_rate": 8.783216505520493e-06, + "loss": 4.0223, + "step": 10707 + }, + { + "epoch": 0.9126395636239666, + "grad_norm": 35.33425248435194, + "learning_rate": 8.782892287778266e-06, + "loss": 3.1401, + "step": 10708 + }, + { + "epoch": 0.912724793317992, + "grad_norm": 36.864528738664305, + "learning_rate": 8.782568032832797e-06, + "loss": 1.9634, + "step": 10709 + }, + { + "epoch": 0.9128100230120174, + "grad_norm": 96.91311774152373, + "learning_rate": 8.782243740687273e-06, + "loss": 5.2016, + "step": 10710 + }, + { + "epoch": 0.9128952527060428, + "grad_norm": 61.305427358139276, + "learning_rate": 8.781919411344886e-06, + "loss": 4.3449, + "step": 10711 + }, + { + "epoch": 0.9129804824000682, + "grad_norm": 32.98516632869914, + "learning_rate": 8.781595044808824e-06, + "loss": 3.3107, + "step": 10712 + }, + { + "epoch": 0.9130657120940936, + "grad_norm": 84.42209085420058, + "learning_rate": 8.781270641082278e-06, + "loss": 3.8142, + "step": 10713 + }, + { + "epoch": 0.913150941788119, + "grad_norm": 94.86023244409763, + "learning_rate": 8.780946200168438e-06, + "loss": 4.7912, + "step": 10714 + }, + { + "epoch": 0.9132361714821444, + "grad_norm": 41.11356125059686, + "learning_rate": 8.780621722070494e-06, + "loss": 4.2863, + "step": 10715 + }, + { + "epoch": 0.9133214011761698, + "grad_norm": 31.828038265028955, + "learning_rate": 8.780297206791637e-06, + "loss": 2.9411, + "step": 10716 + }, + { + "epoch": 0.9134066308701951, + "grad_norm": 32.70660402269958, + "learning_rate": 8.77997265433506e-06, + "loss": 3.5541, + "step": 10717 + }, + { + "epoch": 0.9134918605642206, + "grad_norm": 70.76501231381955, + "learning_rate": 8.779648064703953e-06, + "loss": 4.5423, + "step": 10718 + }, + { + "epoch": 0.913577090258246, + "grad_norm": 38.420848100793116, + "learning_rate": 8.779323437901511e-06, + "loss": 4.7175, + "step": 10719 + }, + { + "epoch": 0.9136623199522713, + "grad_norm": 40.089322224010964, + "learning_rate": 8.778998773930922e-06, + "loss": 4.0521, + "step": 10720 + }, + { + "epoch": 0.9137475496462968, + "grad_norm": 100.60352463755856, + "learning_rate": 8.778674072795384e-06, + "loss": 5.4355, + "step": 10721 + }, + { + "epoch": 0.9138327793403221, + "grad_norm": 33.62485987194701, + "learning_rate": 8.778349334498087e-06, + "loss": 4.5414, + "step": 10722 + }, + { + "epoch": 0.9139180090343476, + "grad_norm": 38.92887546249238, + "learning_rate": 8.778024559042227e-06, + "loss": 4.4168, + "step": 10723 + }, + { + "epoch": 0.914003238728373, + "grad_norm": 79.86478796269485, + "learning_rate": 8.777699746430994e-06, + "loss": 5.3216, + "step": 10724 + }, + { + "epoch": 0.9140884684223983, + "grad_norm": 71.53852375741573, + "learning_rate": 8.777374896667588e-06, + "loss": 5.0649, + "step": 10725 + }, + { + "epoch": 0.9141736981164238, + "grad_norm": 101.75247742532754, + "learning_rate": 8.7770500097552e-06, + "loss": 4.1285, + "step": 10726 + }, + { + "epoch": 0.9142589278104492, + "grad_norm": 27.87742583452745, + "learning_rate": 8.776725085697024e-06, + "loss": 3.1332, + "step": 10727 + }, + { + "epoch": 0.9143441575044745, + "grad_norm": 31.69254930007461, + "learning_rate": 8.77640012449626e-06, + "loss": 4.049, + "step": 10728 + }, + { + "epoch": 0.9144293871985, + "grad_norm": 89.2876927575301, + "learning_rate": 8.7760751261561e-06, + "loss": 5.0779, + "step": 10729 + }, + { + "epoch": 0.9145146168925253, + "grad_norm": 45.84703924668268, + "learning_rate": 8.775750090679742e-06, + "loss": 4.2763, + "step": 10730 + }, + { + "epoch": 0.9145998465865508, + "grad_norm": 33.44116838675246, + "learning_rate": 8.775425018070383e-06, + "loss": 3.9615, + "step": 10731 + }, + { + "epoch": 0.9146850762805762, + "grad_norm": 37.10182862247014, + "learning_rate": 8.775099908331218e-06, + "loss": 3.5565, + "step": 10732 + }, + { + "epoch": 0.9147703059746015, + "grad_norm": 28.311636098179985, + "learning_rate": 8.774774761465445e-06, + "loss": 2.809, + "step": 10733 + }, + { + "epoch": 0.914855535668627, + "grad_norm": 75.54907666875292, + "learning_rate": 8.774449577476265e-06, + "loss": 4.8889, + "step": 10734 + }, + { + "epoch": 0.9149407653626523, + "grad_norm": 88.93048422664057, + "learning_rate": 8.77412435636687e-06, + "loss": 5.6583, + "step": 10735 + }, + { + "epoch": 0.9150259950566777, + "grad_norm": 21.3983653683593, + "learning_rate": 8.773799098140462e-06, + "loss": 2.5916, + "step": 10736 + }, + { + "epoch": 0.9151112247507032, + "grad_norm": 37.97653690701972, + "learning_rate": 8.77347380280024e-06, + "loss": 3.0476, + "step": 10737 + }, + { + "epoch": 0.9151964544447285, + "grad_norm": 61.338361121972795, + "learning_rate": 8.773148470349403e-06, + "loss": 4.269, + "step": 10738 + }, + { + "epoch": 0.915281684138754, + "grad_norm": 42.93671569743629, + "learning_rate": 8.772823100791152e-06, + "loss": 4.6137, + "step": 10739 + }, + { + "epoch": 0.9153669138327793, + "grad_norm": 45.35629983402191, + "learning_rate": 8.772497694128684e-06, + "loss": 4.7757, + "step": 10740 + }, + { + "epoch": 0.9154521435268047, + "grad_norm": 45.49068984199324, + "learning_rate": 8.772172250365198e-06, + "loss": 4.4476, + "step": 10741 + }, + { + "epoch": 0.9155373732208302, + "grad_norm": 45.05457642918348, + "learning_rate": 8.771846769503898e-06, + "loss": 3.9897, + "step": 10742 + }, + { + "epoch": 0.9156226029148555, + "grad_norm": 33.69098723484483, + "learning_rate": 8.771521251547984e-06, + "loss": 2.6968, + "step": 10743 + }, + { + "epoch": 0.9157078326088809, + "grad_norm": 47.10735381787039, + "learning_rate": 8.771195696500655e-06, + "loss": 3.7957, + "step": 10744 + }, + { + "epoch": 0.9157930623029064, + "grad_norm": 38.20080462896119, + "learning_rate": 8.770870104365119e-06, + "loss": 3.3447, + "step": 10745 + }, + { + "epoch": 0.9158782919969317, + "grad_norm": 42.23579404032229, + "learning_rate": 8.77054447514457e-06, + "loss": 4.6169, + "step": 10746 + }, + { + "epoch": 0.9159635216909572, + "grad_norm": 25.34340398629825, + "learning_rate": 8.770218808842218e-06, + "loss": 2.3263, + "step": 10747 + }, + { + "epoch": 0.9160487513849825, + "grad_norm": 142.27870260582785, + "learning_rate": 8.76989310546126e-06, + "loss": 3.9269, + "step": 10748 + }, + { + "epoch": 0.9161339810790079, + "grad_norm": 70.54477777159755, + "learning_rate": 8.769567365004901e-06, + "loss": 4.1169, + "step": 10749 + }, + { + "epoch": 0.9162192107730334, + "grad_norm": 57.45164132801591, + "learning_rate": 8.769241587476346e-06, + "loss": 3.6434, + "step": 10750 + }, + { + "epoch": 0.9163044404670587, + "grad_norm": 40.623626619440344, + "learning_rate": 8.768915772878797e-06, + "loss": 4.2948, + "step": 10751 + }, + { + "epoch": 0.9163896701610841, + "grad_norm": 25.532490452618333, + "learning_rate": 8.768589921215458e-06, + "loss": 3.2394, + "step": 10752 + }, + { + "epoch": 0.9164748998551095, + "grad_norm": 39.40825804348448, + "learning_rate": 8.768264032489538e-06, + "loss": 3.0407, + "step": 10753 + }, + { + "epoch": 0.9165601295491349, + "grad_norm": 20.32938420332369, + "learning_rate": 8.767938106704235e-06, + "loss": 2.4783, + "step": 10754 + }, + { + "epoch": 0.9166453592431603, + "grad_norm": 58.902060201587915, + "learning_rate": 8.767612143862761e-06, + "loss": 4.018, + "step": 10755 + }, + { + "epoch": 0.9167305889371857, + "grad_norm": 31.35603441768772, + "learning_rate": 8.767286143968315e-06, + "loss": 1.7416, + "step": 10756 + }, + { + "epoch": 0.9168158186312111, + "grad_norm": 34.15050316296714, + "learning_rate": 8.76696010702411e-06, + "loss": 2.6553, + "step": 10757 + }, + { + "epoch": 0.9169010483252366, + "grad_norm": 64.90233465954567, + "learning_rate": 8.766634033033348e-06, + "loss": 4.7932, + "step": 10758 + }, + { + "epoch": 0.9169862780192619, + "grad_norm": 38.60184283511307, + "learning_rate": 8.766307921999236e-06, + "loss": 4.0696, + "step": 10759 + }, + { + "epoch": 0.9170715077132873, + "grad_norm": 42.73076387671773, + "learning_rate": 8.765981773924983e-06, + "loss": 3.3373, + "step": 10760 + }, + { + "epoch": 0.9171567374073127, + "grad_norm": 27.23583122894786, + "learning_rate": 8.765655588813796e-06, + "loss": 3.0433, + "step": 10761 + }, + { + "epoch": 0.9172419671013381, + "grad_norm": 44.40241994396733, + "learning_rate": 8.765329366668882e-06, + "loss": 4.242, + "step": 10762 + }, + { + "epoch": 0.9173271967953635, + "grad_norm": 121.0488187992895, + "learning_rate": 8.765003107493451e-06, + "loss": 3.5345, + "step": 10763 + }, + { + "epoch": 0.9174124264893889, + "grad_norm": 33.937863647729145, + "learning_rate": 8.76467681129071e-06, + "loss": 3.8252, + "step": 10764 + }, + { + "epoch": 0.9174976561834143, + "grad_norm": 49.291034257474976, + "learning_rate": 8.764350478063868e-06, + "loss": 4.7071, + "step": 10765 + }, + { + "epoch": 0.9175828858774397, + "grad_norm": 36.29558442522994, + "learning_rate": 8.764024107816135e-06, + "loss": 2.5903, + "step": 10766 + }, + { + "epoch": 0.9176681155714651, + "grad_norm": 32.88175513590895, + "learning_rate": 8.763697700550722e-06, + "loss": 3.5496, + "step": 10767 + }, + { + "epoch": 0.9177533452654905, + "grad_norm": 36.35087089404147, + "learning_rate": 8.763371256270837e-06, + "loss": 3.1381, + "step": 10768 + }, + { + "epoch": 0.9178385749595159, + "grad_norm": 52.13716430105211, + "learning_rate": 8.763044774979689e-06, + "loss": 4.0786, + "step": 10769 + }, + { + "epoch": 0.9179238046535413, + "grad_norm": 38.3741110217038, + "learning_rate": 8.762718256680493e-06, + "loss": 3.5192, + "step": 10770 + }, + { + "epoch": 0.9180090343475666, + "grad_norm": 27.223446115203842, + "learning_rate": 8.762391701376457e-06, + "loss": 3.3347, + "step": 10771 + }, + { + "epoch": 0.9180942640415921, + "grad_norm": 34.54070050737676, + "learning_rate": 8.762065109070794e-06, + "loss": 3.973, + "step": 10772 + }, + { + "epoch": 0.9181794937356175, + "grad_norm": 44.87784563769294, + "learning_rate": 8.761738479766715e-06, + "loss": 3.7941, + "step": 10773 + }, + { + "epoch": 0.9182647234296429, + "grad_norm": 89.17045558780056, + "learning_rate": 8.761411813467436e-06, + "loss": 4.3188, + "step": 10774 + }, + { + "epoch": 0.9183499531236683, + "grad_norm": 68.4574848115755, + "learning_rate": 8.761085110176163e-06, + "loss": 4.0239, + "step": 10775 + }, + { + "epoch": 0.9184351828176937, + "grad_norm": 52.08672870277408, + "learning_rate": 8.760758369896113e-06, + "loss": 4.0901, + "step": 10776 + }, + { + "epoch": 0.9185204125117191, + "grad_norm": 27.56272339058928, + "learning_rate": 8.7604315926305e-06, + "loss": 3.1852, + "step": 10777 + }, + { + "epoch": 0.9186056422057445, + "grad_norm": 34.19261648975765, + "learning_rate": 8.760104778382535e-06, + "loss": 3.5242, + "step": 10778 + }, + { + "epoch": 0.9186908718997698, + "grad_norm": 50.707620708381455, + "learning_rate": 8.759777927155434e-06, + "loss": 4.2496, + "step": 10779 + }, + { + "epoch": 0.9187761015937953, + "grad_norm": 31.59327372536332, + "learning_rate": 8.759451038952412e-06, + "loss": 3.6517, + "step": 10780 + }, + { + "epoch": 0.9188613312878207, + "grad_norm": 53.352975202005894, + "learning_rate": 8.759124113776682e-06, + "loss": 3.8232, + "step": 10781 + }, + { + "epoch": 0.9189465609818461, + "grad_norm": 35.63273068175829, + "learning_rate": 8.758797151631462e-06, + "loss": 2.9432, + "step": 10782 + }, + { + "epoch": 0.9190317906758715, + "grad_norm": 41.841836597939626, + "learning_rate": 8.758470152519963e-06, + "loss": 3.7289, + "step": 10783 + }, + { + "epoch": 0.9191170203698968, + "grad_norm": 136.10635924654252, + "learning_rate": 8.758143116445405e-06, + "loss": 5.4037, + "step": 10784 + }, + { + "epoch": 0.9192022500639223, + "grad_norm": 33.57947673053278, + "learning_rate": 8.757816043411003e-06, + "loss": 3.3855, + "step": 10785 + }, + { + "epoch": 0.9192874797579477, + "grad_norm": 45.74821984342089, + "learning_rate": 8.757488933419972e-06, + "loss": 3.5922, + "step": 10786 + }, + { + "epoch": 0.919372709451973, + "grad_norm": 39.275457353339384, + "learning_rate": 8.757161786475533e-06, + "loss": 4.4965, + "step": 10787 + }, + { + "epoch": 0.9194579391459985, + "grad_norm": 31.029546691747395, + "learning_rate": 8.756834602580897e-06, + "loss": 3.203, + "step": 10788 + }, + { + "epoch": 0.9195431688400239, + "grad_norm": 38.985848654665496, + "learning_rate": 8.756507381739288e-06, + "loss": 3.0173, + "step": 10789 + }, + { + "epoch": 0.9196283985340492, + "grad_norm": 51.06484482232102, + "learning_rate": 8.756180123953922e-06, + "loss": 3.3598, + "step": 10790 + }, + { + "epoch": 0.9197136282280747, + "grad_norm": 73.76989563687914, + "learning_rate": 8.755852829228016e-06, + "loss": 6.0133, + "step": 10791 + }, + { + "epoch": 0.9197988579221, + "grad_norm": 53.5828322882372, + "learning_rate": 8.75552549756479e-06, + "loss": 4.1115, + "step": 10792 + }, + { + "epoch": 0.9198840876161255, + "grad_norm": 34.60094326697911, + "learning_rate": 8.755198128967461e-06, + "loss": 3.9599, + "step": 10793 + }, + { + "epoch": 0.9199693173101509, + "grad_norm": 26.506081993776306, + "learning_rate": 8.754870723439254e-06, + "loss": 3.8826, + "step": 10794 + }, + { + "epoch": 0.9200545470041762, + "grad_norm": 27.129007527256444, + "learning_rate": 8.754543280983383e-06, + "loss": 2.7693, + "step": 10795 + }, + { + "epoch": 0.9201397766982017, + "grad_norm": 72.83615156644865, + "learning_rate": 8.754215801603071e-06, + "loss": 4.813, + "step": 10796 + }, + { + "epoch": 0.920225006392227, + "grad_norm": 39.599939665880434, + "learning_rate": 8.75388828530154e-06, + "loss": 3.3461, + "step": 10797 + }, + { + "epoch": 0.9203102360862524, + "grad_norm": 25.932871811302935, + "learning_rate": 8.753560732082009e-06, + "loss": 3.4779, + "step": 10798 + }, + { + "epoch": 0.9203954657802779, + "grad_norm": 35.801489200545845, + "learning_rate": 8.753233141947699e-06, + "loss": 3.8851, + "step": 10799 + }, + { + "epoch": 0.9204806954743032, + "grad_norm": 33.20202187528521, + "learning_rate": 8.752905514901833e-06, + "loss": 2.3924, + "step": 10800 + }, + { + "epoch": 0.9205659251683287, + "grad_norm": 37.99220991849827, + "learning_rate": 8.752577850947632e-06, + "loss": 3.918, + "step": 10801 + }, + { + "epoch": 0.920651154862354, + "grad_norm": 54.8339147692998, + "learning_rate": 8.75225015008832e-06, + "loss": 4.3924, + "step": 10802 + }, + { + "epoch": 0.9207363845563794, + "grad_norm": 71.28498686730472, + "learning_rate": 8.751922412327118e-06, + "loss": 4.9586, + "step": 10803 + }, + { + "epoch": 0.9208216142504049, + "grad_norm": 60.79174950671734, + "learning_rate": 8.75159463766725e-06, + "loss": 3.9241, + "step": 10804 + }, + { + "epoch": 0.9209068439444302, + "grad_norm": 80.22392906923793, + "learning_rate": 8.751266826111939e-06, + "loss": 4.8244, + "step": 10805 + }, + { + "epoch": 0.9209920736384556, + "grad_norm": 80.45226935736599, + "learning_rate": 8.75093897766441e-06, + "loss": 4.4062, + "step": 10806 + }, + { + "epoch": 0.9210773033324811, + "grad_norm": 31.66458046857385, + "learning_rate": 8.750611092327884e-06, + "loss": 3.5132, + "step": 10807 + }, + { + "epoch": 0.9211625330265064, + "grad_norm": 100.72421881796438, + "learning_rate": 8.750283170105592e-06, + "loss": 5.4015, + "step": 10808 + }, + { + "epoch": 0.9212477627205319, + "grad_norm": 47.494830798539645, + "learning_rate": 8.749955211000755e-06, + "loss": 4.5933, + "step": 10809 + }, + { + "epoch": 0.9213329924145572, + "grad_norm": 54.639928231552545, + "learning_rate": 8.749627215016597e-06, + "loss": 4.5388, + "step": 10810 + }, + { + "epoch": 0.9214182221085826, + "grad_norm": 36.82021709642605, + "learning_rate": 8.749299182156346e-06, + "loss": 4.106, + "step": 10811 + }, + { + "epoch": 0.9215034518026081, + "grad_norm": 55.77644522020897, + "learning_rate": 8.748971112423228e-06, + "loss": 2.7922, + "step": 10812 + }, + { + "epoch": 0.9215886814966334, + "grad_norm": 69.06155616184984, + "learning_rate": 8.748643005820467e-06, + "loss": 4.8153, + "step": 10813 + }, + { + "epoch": 0.9216739111906588, + "grad_norm": 197.12564533003788, + "learning_rate": 8.748314862351293e-06, + "loss": 4.962, + "step": 10814 + }, + { + "epoch": 0.9217591408846842, + "grad_norm": 38.29299770844889, + "learning_rate": 8.74798668201893e-06, + "loss": 4.217, + "step": 10815 + }, + { + "epoch": 0.9218443705787096, + "grad_norm": 44.83714593373584, + "learning_rate": 8.747658464826609e-06, + "loss": 4.3906, + "step": 10816 + }, + { + "epoch": 0.9219296002727351, + "grad_norm": 33.14780576210076, + "learning_rate": 8.747330210777556e-06, + "loss": 3.7661, + "step": 10817 + }, + { + "epoch": 0.9220148299667604, + "grad_norm": 38.73469003855545, + "learning_rate": 8.747001919875e-06, + "loss": 3.7742, + "step": 10818 + }, + { + "epoch": 0.9221000596607858, + "grad_norm": 40.921258272028645, + "learning_rate": 8.746673592122168e-06, + "loss": 3.56, + "step": 10819 + }, + { + "epoch": 0.9221852893548113, + "grad_norm": 34.903857395281705, + "learning_rate": 8.74634522752229e-06, + "loss": 3.5482, + "step": 10820 + }, + { + "epoch": 0.9222705190488366, + "grad_norm": 50.75720453911423, + "learning_rate": 8.746016826078595e-06, + "loss": 5.392, + "step": 10821 + }, + { + "epoch": 0.922355748742862, + "grad_norm": 27.74035865042211, + "learning_rate": 8.745688387794313e-06, + "loss": 3.3252, + "step": 10822 + }, + { + "epoch": 0.9224409784368874, + "grad_norm": 52.84170616605628, + "learning_rate": 8.745359912672675e-06, + "loss": 4.223, + "step": 10823 + }, + { + "epoch": 0.9225262081309128, + "grad_norm": 36.00565457000539, + "learning_rate": 8.745031400716911e-06, + "loss": 3.3614, + "step": 10824 + }, + { + "epoch": 0.9226114378249383, + "grad_norm": 42.61155680286956, + "learning_rate": 8.74470285193025e-06, + "loss": 4.2954, + "step": 10825 + }, + { + "epoch": 0.9226966675189636, + "grad_norm": 73.45144315981027, + "learning_rate": 8.744374266315925e-06, + "loss": 5.7823, + "step": 10826 + }, + { + "epoch": 0.922781897212989, + "grad_norm": 54.16486372947258, + "learning_rate": 8.744045643877166e-06, + "loss": 5.1935, + "step": 10827 + }, + { + "epoch": 0.9228671269070144, + "grad_norm": 79.68182634353468, + "learning_rate": 8.743716984617208e-06, + "loss": 5.69, + "step": 10828 + }, + { + "epoch": 0.9229523566010398, + "grad_norm": 30.85744684415262, + "learning_rate": 8.74338828853928e-06, + "loss": 2.72, + "step": 10829 + }, + { + "epoch": 0.9230375862950652, + "grad_norm": 62.55132745270822, + "learning_rate": 8.743059555646616e-06, + "loss": 4.8777, + "step": 10830 + }, + { + "epoch": 0.9231228159890906, + "grad_norm": 78.73923397612937, + "learning_rate": 8.742730785942446e-06, + "loss": 4.753, + "step": 10831 + }, + { + "epoch": 0.923208045683116, + "grad_norm": 33.822705654579416, + "learning_rate": 8.742401979430008e-06, + "loss": 3.2527, + "step": 10832 + }, + { + "epoch": 0.9232932753771413, + "grad_norm": 34.68690330634546, + "learning_rate": 8.742073136112535e-06, + "loss": 3.4447, + "step": 10833 + }, + { + "epoch": 0.9233785050711668, + "grad_norm": 59.77803007040078, + "learning_rate": 8.741744255993259e-06, + "loss": 3.8036, + "step": 10834 + }, + { + "epoch": 0.9234637347651922, + "grad_norm": 66.15217397061451, + "learning_rate": 8.741415339075414e-06, + "loss": 4.7541, + "step": 10835 + }, + { + "epoch": 0.9235489644592176, + "grad_norm": 41.39964368354999, + "learning_rate": 8.741086385362237e-06, + "loss": 4.8381, + "step": 10836 + }, + { + "epoch": 0.923634194153243, + "grad_norm": 44.30858425382368, + "learning_rate": 8.74075739485696e-06, + "loss": 4.5133, + "step": 10837 + }, + { + "epoch": 0.9237194238472684, + "grad_norm": 30.58725829385963, + "learning_rate": 8.74042836756282e-06, + "loss": 4.3529, + "step": 10838 + }, + { + "epoch": 0.9238046535412938, + "grad_norm": 44.91761112730217, + "learning_rate": 8.740099303483055e-06, + "loss": 2.4478, + "step": 10839 + }, + { + "epoch": 0.9238898832353192, + "grad_norm": 64.01833182322673, + "learning_rate": 8.739770202620901e-06, + "loss": 4.2678, + "step": 10840 + }, + { + "epoch": 0.9239751129293445, + "grad_norm": 30.199905400013147, + "learning_rate": 8.73944106497959e-06, + "loss": 4.0527, + "step": 10841 + }, + { + "epoch": 0.92406034262337, + "grad_norm": 73.09510899703159, + "learning_rate": 8.739111890562364e-06, + "loss": 4.8835, + "step": 10842 + }, + { + "epoch": 0.9241455723173954, + "grad_norm": 29.11405964769285, + "learning_rate": 8.73878267937246e-06, + "loss": 4.0068, + "step": 10843 + }, + { + "epoch": 0.9242308020114208, + "grad_norm": 21.471231755975772, + "learning_rate": 8.73845343141311e-06, + "loss": 2.5918, + "step": 10844 + }, + { + "epoch": 0.9243160317054462, + "grad_norm": 39.85611023280934, + "learning_rate": 8.738124146687559e-06, + "loss": 3.6136, + "step": 10845 + }, + { + "epoch": 0.9244012613994715, + "grad_norm": 40.67710136399755, + "learning_rate": 8.737794825199041e-06, + "loss": 3.8052, + "step": 10846 + }, + { + "epoch": 0.924486491093497, + "grad_norm": 24.33577052708745, + "learning_rate": 8.737465466950797e-06, + "loss": 3.2335, + "step": 10847 + }, + { + "epoch": 0.9245717207875224, + "grad_norm": 34.2754754608065, + "learning_rate": 8.737136071946064e-06, + "loss": 3.0073, + "step": 10848 + }, + { + "epoch": 0.9246569504815477, + "grad_norm": 69.31074097454683, + "learning_rate": 8.736806640188083e-06, + "loss": 4.7747, + "step": 10849 + }, + { + "epoch": 0.9247421801755732, + "grad_norm": 46.98769834365052, + "learning_rate": 8.736477171680094e-06, + "loss": 3.4649, + "step": 10850 + }, + { + "epoch": 0.9248274098695985, + "grad_norm": 32.44837726802777, + "learning_rate": 8.736147666425336e-06, + "loss": 3.5401, + "step": 10851 + }, + { + "epoch": 0.924912639563624, + "grad_norm": 47.20006156477777, + "learning_rate": 8.73581812442705e-06, + "loss": 3.3181, + "step": 10852 + }, + { + "epoch": 0.9249978692576494, + "grad_norm": 35.117154915394686, + "learning_rate": 8.735488545688481e-06, + "loss": 3.4741, + "step": 10853 + }, + { + "epoch": 0.9250830989516747, + "grad_norm": 40.651518016949964, + "learning_rate": 8.735158930212864e-06, + "loss": 3.9897, + "step": 10854 + }, + { + "epoch": 0.9251683286457002, + "grad_norm": 64.46410287140517, + "learning_rate": 8.734829278003443e-06, + "loss": 4.6509, + "step": 10855 + }, + { + "epoch": 0.9252535583397256, + "grad_norm": 60.78389884880561, + "learning_rate": 8.734499589063459e-06, + "loss": 5.5282, + "step": 10856 + }, + { + "epoch": 0.9253387880337509, + "grad_norm": 35.03570588295688, + "learning_rate": 8.734169863396157e-06, + "loss": 3.9941, + "step": 10857 + }, + { + "epoch": 0.9254240177277764, + "grad_norm": 27.659418563903714, + "learning_rate": 8.733840101004778e-06, + "loss": 2.4921, + "step": 10858 + }, + { + "epoch": 0.9255092474218017, + "grad_norm": 40.30094063411248, + "learning_rate": 8.733510301892566e-06, + "loss": 2.9853, + "step": 10859 + }, + { + "epoch": 0.9255944771158272, + "grad_norm": 48.640766123742566, + "learning_rate": 8.73318046606276e-06, + "loss": 4.4872, + "step": 10860 + }, + { + "epoch": 0.9256797068098526, + "grad_norm": 114.71352081137083, + "learning_rate": 8.73285059351861e-06, + "loss": 5.352, + "step": 10861 + }, + { + "epoch": 0.9257649365038779, + "grad_norm": 86.7232934285406, + "learning_rate": 8.732520684263359e-06, + "loss": 4.2852, + "step": 10862 + }, + { + "epoch": 0.9258501661979034, + "grad_norm": 60.250124382965176, + "learning_rate": 8.73219073830025e-06, + "loss": 3.7354, + "step": 10863 + }, + { + "epoch": 0.9259353958919287, + "grad_norm": 73.71824866199341, + "learning_rate": 8.731860755632526e-06, + "loss": 4.7693, + "step": 10864 + }, + { + "epoch": 0.9260206255859541, + "grad_norm": 39.50869570475163, + "learning_rate": 8.731530736263436e-06, + "loss": 3.5278, + "step": 10865 + }, + { + "epoch": 0.9261058552799796, + "grad_norm": 55.99531535625878, + "learning_rate": 8.731200680196224e-06, + "loss": 4.1632, + "step": 10866 + }, + { + "epoch": 0.9261910849740049, + "grad_norm": 76.58098600670928, + "learning_rate": 8.730870587434136e-06, + "loss": 5.5221, + "step": 10867 + }, + { + "epoch": 0.9262763146680303, + "grad_norm": 74.57339190999704, + "learning_rate": 8.730540457980418e-06, + "loss": 4.794, + "step": 10868 + }, + { + "epoch": 0.9263615443620558, + "grad_norm": 46.101604753823125, + "learning_rate": 8.730210291838316e-06, + "loss": 4.4975, + "step": 10869 + }, + { + "epoch": 0.9264467740560811, + "grad_norm": 33.759213796362715, + "learning_rate": 8.729880089011078e-06, + "loss": 2.5442, + "step": 10870 + }, + { + "epoch": 0.9265320037501066, + "grad_norm": 34.874144355534014, + "learning_rate": 8.729549849501954e-06, + "loss": 3.2137, + "step": 10871 + }, + { + "epoch": 0.9266172334441319, + "grad_norm": 47.26333094943713, + "learning_rate": 8.729219573314187e-06, + "loss": 3.8744, + "step": 10872 + }, + { + "epoch": 0.9267024631381573, + "grad_norm": 36.47250598903108, + "learning_rate": 8.728889260451027e-06, + "loss": 4.0524, + "step": 10873 + }, + { + "epoch": 0.9267876928321828, + "grad_norm": 51.22934590119851, + "learning_rate": 8.728558910915725e-06, + "loss": 4.1878, + "step": 10874 + }, + { + "epoch": 0.9268729225262081, + "grad_norm": 100.54564384220998, + "learning_rate": 8.728228524711526e-06, + "loss": 4.5697, + "step": 10875 + }, + { + "epoch": 0.9269581522202335, + "grad_norm": 37.04195387476839, + "learning_rate": 8.727898101841681e-06, + "loss": 3.8016, + "step": 10876 + }, + { + "epoch": 0.927043381914259, + "grad_norm": 53.361825014794334, + "learning_rate": 8.72756764230944e-06, + "loss": 3.2069, + "step": 10877 + }, + { + "epoch": 0.9271286116082843, + "grad_norm": 50.51540374236654, + "learning_rate": 8.727237146118053e-06, + "loss": 3.8132, + "step": 10878 + }, + { + "epoch": 0.9272138413023098, + "grad_norm": 27.02296611158146, + "learning_rate": 8.726906613270767e-06, + "loss": 3.6384, + "step": 10879 + }, + { + "epoch": 0.9272990709963351, + "grad_norm": 39.36298774050243, + "learning_rate": 8.726576043770837e-06, + "loss": 4.1684, + "step": 10880 + }, + { + "epoch": 0.9273843006903605, + "grad_norm": 38.57434172357283, + "learning_rate": 8.726245437621515e-06, + "loss": 3.6134, + "step": 10881 + }, + { + "epoch": 0.927469530384386, + "grad_norm": 47.52035124504132, + "learning_rate": 8.725914794826047e-06, + "loss": 3.3617, + "step": 10882 + }, + { + "epoch": 0.9275547600784113, + "grad_norm": 42.62733779753851, + "learning_rate": 8.725584115387688e-06, + "loss": 3.7334, + "step": 10883 + }, + { + "epoch": 0.9276399897724367, + "grad_norm": 53.53492653967661, + "learning_rate": 8.72525339930969e-06, + "loss": 3.025, + "step": 10884 + }, + { + "epoch": 0.9277252194664621, + "grad_norm": 760.1885065282105, + "learning_rate": 8.724922646595304e-06, + "loss": 5.4797, + "step": 10885 + }, + { + "epoch": 0.9278104491604875, + "grad_norm": 55.444757617310486, + "learning_rate": 8.724591857247783e-06, + "loss": 4.3627, + "step": 10886 + }, + { + "epoch": 0.927895678854513, + "grad_norm": 24.442425717203516, + "learning_rate": 8.724261031270385e-06, + "loss": 2.67, + "step": 10887 + }, + { + "epoch": 0.9279809085485383, + "grad_norm": 41.57374765358963, + "learning_rate": 8.723930168666357e-06, + "loss": 2.9366, + "step": 10888 + }, + { + "epoch": 0.9280661382425637, + "grad_norm": 36.62866821449663, + "learning_rate": 8.723599269438956e-06, + "loss": 3.1415, + "step": 10889 + }, + { + "epoch": 0.9281513679365891, + "grad_norm": 33.39665237399379, + "learning_rate": 8.723268333591435e-06, + "loss": 2.71, + "step": 10890 + }, + { + "epoch": 0.9282365976306145, + "grad_norm": 98.64058808605425, + "learning_rate": 8.722937361127051e-06, + "loss": 4.7389, + "step": 10891 + }, + { + "epoch": 0.9283218273246399, + "grad_norm": 99.25595860439364, + "learning_rate": 8.722606352049056e-06, + "loss": 5.0347, + "step": 10892 + }, + { + "epoch": 0.9284070570186653, + "grad_norm": 45.87810906563446, + "learning_rate": 8.722275306360707e-06, + "loss": 3.9628, + "step": 10893 + }, + { + "epoch": 0.9284922867126907, + "grad_norm": 33.831259407636715, + "learning_rate": 8.72194422406526e-06, + "loss": 3.0387, + "step": 10894 + }, + { + "epoch": 0.9285775164067162, + "grad_norm": 40.188698655398234, + "learning_rate": 8.72161310516597e-06, + "loss": 3.3762, + "step": 10895 + }, + { + "epoch": 0.9286627461007415, + "grad_norm": 33.64130583717301, + "learning_rate": 8.721281949666093e-06, + "loss": 2.1029, + "step": 10896 + }, + { + "epoch": 0.9287479757947669, + "grad_norm": 53.08856711890764, + "learning_rate": 8.720950757568889e-06, + "loss": 4.4645, + "step": 10897 + }, + { + "epoch": 0.9288332054887923, + "grad_norm": 72.03510573090927, + "learning_rate": 8.720619528877612e-06, + "loss": 3.4911, + "step": 10898 + }, + { + "epoch": 0.9289184351828177, + "grad_norm": 56.701289621496855, + "learning_rate": 8.72028826359552e-06, + "loss": 4.0673, + "step": 10899 + }, + { + "epoch": 0.929003664876843, + "grad_norm": 49.87045639782382, + "learning_rate": 8.719956961725871e-06, + "loss": 4.1629, + "step": 10900 + }, + { + "epoch": 0.9290888945708685, + "grad_norm": 46.30259898464032, + "learning_rate": 8.719625623271924e-06, + "loss": 4.6498, + "step": 10901 + }, + { + "epoch": 0.9291741242648939, + "grad_norm": 45.89922417132901, + "learning_rate": 8.719294248236939e-06, + "loss": 3.9342, + "step": 10902 + }, + { + "epoch": 0.9292593539589193, + "grad_norm": 122.29003961753793, + "learning_rate": 8.718962836624168e-06, + "loss": 4.7182, + "step": 10903 + }, + { + "epoch": 0.9293445836529447, + "grad_norm": 60.0546286096544, + "learning_rate": 8.718631388436878e-06, + "loss": 4.3819, + "step": 10904 + }, + { + "epoch": 0.9294298133469701, + "grad_norm": 29.62321491367339, + "learning_rate": 8.718299903678326e-06, + "loss": 2.7459, + "step": 10905 + }, + { + "epoch": 0.9295150430409955, + "grad_norm": 51.30432250924798, + "learning_rate": 8.717968382351773e-06, + "loss": 4.9593, + "step": 10906 + }, + { + "epoch": 0.9296002727350209, + "grad_norm": 43.15899267607192, + "learning_rate": 8.717636824460478e-06, + "loss": 3.8356, + "step": 10907 + }, + { + "epoch": 0.9296855024290462, + "grad_norm": 27.72518581022667, + "learning_rate": 8.7173052300077e-06, + "loss": 3.1485, + "step": 10908 + }, + { + "epoch": 0.9297707321230717, + "grad_norm": 31.8006388822728, + "learning_rate": 8.716973598996704e-06, + "loss": 4.7229, + "step": 10909 + }, + { + "epoch": 0.9298559618170971, + "grad_norm": 52.841665717991745, + "learning_rate": 8.716641931430749e-06, + "loss": 4.1051, + "step": 10910 + }, + { + "epoch": 0.9299411915111224, + "grad_norm": 80.1726210107899, + "learning_rate": 8.716310227313098e-06, + "loss": 3.2028, + "step": 10911 + }, + { + "epoch": 0.9300264212051479, + "grad_norm": 64.69807785402723, + "learning_rate": 8.715978486647012e-06, + "loss": 4.0699, + "step": 10912 + }, + { + "epoch": 0.9301116508991732, + "grad_norm": 33.930705483360406, + "learning_rate": 8.715646709435754e-06, + "loss": 3.4638, + "step": 10913 + }, + { + "epoch": 0.9301968805931987, + "grad_norm": 78.00192385158505, + "learning_rate": 8.715314895682589e-06, + "loss": 4.1812, + "step": 10914 + }, + { + "epoch": 0.9302821102872241, + "grad_norm": 39.2713234154537, + "learning_rate": 8.714983045390775e-06, + "loss": 3.8286, + "step": 10915 + }, + { + "epoch": 0.9303673399812494, + "grad_norm": 44.846738077307094, + "learning_rate": 8.714651158563583e-06, + "loss": 4.0201, + "step": 10916 + }, + { + "epoch": 0.9304525696752749, + "grad_norm": 48.13924976771399, + "learning_rate": 8.714319235204269e-06, + "loss": 3.9311, + "step": 10917 + }, + { + "epoch": 0.9305377993693003, + "grad_norm": 61.55916329653664, + "learning_rate": 8.713987275316102e-06, + "loss": 4.7126, + "step": 10918 + }, + { + "epoch": 0.9306230290633256, + "grad_norm": 49.12556485772111, + "learning_rate": 8.713655278902348e-06, + "loss": 3.7418, + "step": 10919 + }, + { + "epoch": 0.9307082587573511, + "grad_norm": 59.38095726693109, + "learning_rate": 8.713323245966269e-06, + "loss": 4.1722, + "step": 10920 + }, + { + "epoch": 0.9307934884513764, + "grad_norm": 49.82728019064077, + "learning_rate": 8.71299117651113e-06, + "loss": 3.8988, + "step": 10921 + }, + { + "epoch": 0.9308787181454019, + "grad_norm": 52.54129098468973, + "learning_rate": 8.7126590705402e-06, + "loss": 4.5938, + "step": 10922 + }, + { + "epoch": 0.9309639478394273, + "grad_norm": 73.2370963355134, + "learning_rate": 8.712326928056742e-06, + "loss": 3.124, + "step": 10923 + }, + { + "epoch": 0.9310491775334526, + "grad_norm": 131.83424301938524, + "learning_rate": 8.711994749064025e-06, + "loss": 7.2946, + "step": 10924 + }, + { + "epoch": 0.9311344072274781, + "grad_norm": 32.46338769124296, + "learning_rate": 8.711662533565314e-06, + "loss": 1.6452, + "step": 10925 + }, + { + "epoch": 0.9312196369215034, + "grad_norm": 44.83683945001246, + "learning_rate": 8.711330281563878e-06, + "loss": 3.0603, + "step": 10926 + }, + { + "epoch": 0.9313048666155288, + "grad_norm": 54.10646543289803, + "learning_rate": 8.710997993062981e-06, + "loss": 3.9398, + "step": 10927 + }, + { + "epoch": 0.9313900963095543, + "grad_norm": 27.967456042371225, + "learning_rate": 8.710665668065896e-06, + "loss": 3.5438, + "step": 10928 + }, + { + "epoch": 0.9314753260035796, + "grad_norm": 34.65154168318616, + "learning_rate": 8.710333306575888e-06, + "loss": 3.2634, + "step": 10929 + }, + { + "epoch": 0.9315605556976051, + "grad_norm": 63.01838004053873, + "learning_rate": 8.710000908596226e-06, + "loss": 3.7316, + "step": 10930 + }, + { + "epoch": 0.9316457853916305, + "grad_norm": 21.93058530233285, + "learning_rate": 8.709668474130178e-06, + "loss": 2.7569, + "step": 10931 + }, + { + "epoch": 0.9317310150856558, + "grad_norm": 32.42948468858783, + "learning_rate": 8.709336003181015e-06, + "loss": 3.5274, + "step": 10932 + }, + { + "epoch": 0.9318162447796813, + "grad_norm": 40.1183509887687, + "learning_rate": 8.709003495752008e-06, + "loss": 3.9591, + "step": 10933 + }, + { + "epoch": 0.9319014744737066, + "grad_norm": 90.2443359152672, + "learning_rate": 8.708670951846423e-06, + "loss": 6.0677, + "step": 10934 + }, + { + "epoch": 0.931986704167732, + "grad_norm": 33.263322809144846, + "learning_rate": 8.708338371467536e-06, + "loss": 3.5836, + "step": 10935 + }, + { + "epoch": 0.9320719338617575, + "grad_norm": 109.42113168010646, + "learning_rate": 8.70800575461861e-06, + "loss": 4.1339, + "step": 10936 + }, + { + "epoch": 0.9321571635557828, + "grad_norm": 36.24384642511883, + "learning_rate": 8.707673101302925e-06, + "loss": 3.7478, + "step": 10937 + }, + { + "epoch": 0.9322423932498083, + "grad_norm": 59.429076863334096, + "learning_rate": 8.707340411523747e-06, + "loss": 3.738, + "step": 10938 + }, + { + "epoch": 0.9323276229438336, + "grad_norm": 55.28786170326042, + "learning_rate": 8.70700768528435e-06, + "loss": 3.0498, + "step": 10939 + }, + { + "epoch": 0.932412852637859, + "grad_norm": 217.01358865732817, + "learning_rate": 8.706674922588005e-06, + "loss": 4.2898, + "step": 10940 + }, + { + "epoch": 0.9324980823318845, + "grad_norm": 31.777512664494196, + "learning_rate": 8.706342123437986e-06, + "loss": 3.0173, + "step": 10941 + }, + { + "epoch": 0.9325833120259098, + "grad_norm": 59.622670580021676, + "learning_rate": 8.706009287837563e-06, + "loss": 3.9062, + "step": 10942 + }, + { + "epoch": 0.9326685417199352, + "grad_norm": 51.23878134399547, + "learning_rate": 8.705676415790012e-06, + "loss": 3.7403, + "step": 10943 + }, + { + "epoch": 0.9327537714139607, + "grad_norm": 35.929506871921255, + "learning_rate": 8.705343507298605e-06, + "loss": 3.6359, + "step": 10944 + }, + { + "epoch": 0.932839001107986, + "grad_norm": 37.23748264885565, + "learning_rate": 8.705010562366618e-06, + "loss": 4.2599, + "step": 10945 + }, + { + "epoch": 0.9329242308020114, + "grad_norm": 29.510899278206228, + "learning_rate": 8.704677580997326e-06, + "loss": 2.6214, + "step": 10946 + }, + { + "epoch": 0.9330094604960368, + "grad_norm": 44.443083507011984, + "learning_rate": 8.704344563193998e-06, + "loss": 3.4858, + "step": 10947 + }, + { + "epoch": 0.9330946901900622, + "grad_norm": 30.810872235255694, + "learning_rate": 8.704011508959916e-06, + "loss": 2.9465, + "step": 10948 + }, + { + "epoch": 0.9331799198840877, + "grad_norm": 33.70192337864107, + "learning_rate": 8.703678418298352e-06, + "loss": 3.6005, + "step": 10949 + }, + { + "epoch": 0.933265149578113, + "grad_norm": 55.68353017947403, + "learning_rate": 8.703345291212583e-06, + "loss": 3.7631, + "step": 10950 + }, + { + "epoch": 0.9333503792721384, + "grad_norm": 41.37178421949372, + "learning_rate": 8.703012127705884e-06, + "loss": 2.2126, + "step": 10951 + }, + { + "epoch": 0.9334356089661638, + "grad_norm": 50.2382063376218, + "learning_rate": 8.702678927781532e-06, + "loss": 5.1303, + "step": 10952 + }, + { + "epoch": 0.9335208386601892, + "grad_norm": 77.26396803440157, + "learning_rate": 8.702345691442805e-06, + "loss": 5.0587, + "step": 10953 + }, + { + "epoch": 0.9336060683542146, + "grad_norm": 31.90082838031202, + "learning_rate": 8.702012418692979e-06, + "loss": 3.5517, + "step": 10954 + }, + { + "epoch": 0.93369129804824, + "grad_norm": 41.34061761844632, + "learning_rate": 8.701679109535332e-06, + "loss": 2.6706, + "step": 10955 + }, + { + "epoch": 0.9337765277422654, + "grad_norm": 78.96397798917127, + "learning_rate": 8.701345763973141e-06, + "loss": 4.7104, + "step": 10956 + }, + { + "epoch": 0.9338617574362909, + "grad_norm": 94.33476853171187, + "learning_rate": 8.701012382009685e-06, + "loss": 4.1502, + "step": 10957 + }, + { + "epoch": 0.9339469871303162, + "grad_norm": 35.76222040871814, + "learning_rate": 8.700678963648243e-06, + "loss": 3.704, + "step": 10958 + }, + { + "epoch": 0.9340322168243416, + "grad_norm": 74.13884331157286, + "learning_rate": 8.700345508892096e-06, + "loss": 4.4973, + "step": 10959 + }, + { + "epoch": 0.934117446518367, + "grad_norm": 26.462373368261606, + "learning_rate": 8.70001201774452e-06, + "loss": 3.2303, + "step": 10960 + }, + { + "epoch": 0.9342026762123924, + "grad_norm": 35.67697848508617, + "learning_rate": 8.699678490208796e-06, + "loss": 3.4005, + "step": 10961 + }, + { + "epoch": 0.9342879059064177, + "grad_norm": 47.00771569882567, + "learning_rate": 8.699344926288203e-06, + "loss": 4.0727, + "step": 10962 + }, + { + "epoch": 0.9343731356004432, + "grad_norm": 25.262209903810756, + "learning_rate": 8.699011325986025e-06, + "loss": 2.5556, + "step": 10963 + }, + { + "epoch": 0.9344583652944686, + "grad_norm": 65.73842108034235, + "learning_rate": 8.698677689305537e-06, + "loss": 4.4805, + "step": 10964 + }, + { + "epoch": 0.934543594988494, + "grad_norm": 61.36810326503725, + "learning_rate": 8.698344016250027e-06, + "loss": 4.2809, + "step": 10965 + }, + { + "epoch": 0.9346288246825194, + "grad_norm": 48.62563094702984, + "learning_rate": 8.69801030682277e-06, + "loss": 3.5287, + "step": 10966 + }, + { + "epoch": 0.9347140543765448, + "grad_norm": 35.14174222325311, + "learning_rate": 8.697676561027054e-06, + "loss": 3.8083, + "step": 10967 + }, + { + "epoch": 0.9347992840705702, + "grad_norm": 58.3099059652203, + "learning_rate": 8.697342778866157e-06, + "loss": 4.9981, + "step": 10968 + }, + { + "epoch": 0.9348845137645956, + "grad_norm": 49.07023932980542, + "learning_rate": 8.697008960343362e-06, + "loss": 5.1346, + "step": 10969 + }, + { + "epoch": 0.9349697434586209, + "grad_norm": 199.2927061229301, + "learning_rate": 8.696675105461956e-06, + "loss": 4.2981, + "step": 10970 + }, + { + "epoch": 0.9350549731526464, + "grad_norm": 85.7258604077852, + "learning_rate": 8.696341214225217e-06, + "loss": 5.1485, + "step": 10971 + }, + { + "epoch": 0.9351402028466718, + "grad_norm": 47.818642544185266, + "learning_rate": 8.69600728663643e-06, + "loss": 3.9585, + "step": 10972 + }, + { + "epoch": 0.9352254325406972, + "grad_norm": 32.63531853688719, + "learning_rate": 8.695673322698882e-06, + "loss": 3.9942, + "step": 10973 + }, + { + "epoch": 0.9353106622347226, + "grad_norm": 65.12468648180825, + "learning_rate": 8.695339322415854e-06, + "loss": 4.6987, + "step": 10974 + }, + { + "epoch": 0.935395891928748, + "grad_norm": 27.10105783801978, + "learning_rate": 8.695005285790633e-06, + "loss": 3.0257, + "step": 10975 + }, + { + "epoch": 0.9354811216227734, + "grad_norm": 53.485226852823544, + "learning_rate": 8.694671212826504e-06, + "loss": 4.16, + "step": 10976 + }, + { + "epoch": 0.9355663513167988, + "grad_norm": 31.172007363734185, + "learning_rate": 8.69433710352675e-06, + "loss": 3.092, + "step": 10977 + }, + { + "epoch": 0.9356515810108241, + "grad_norm": 85.81741521790237, + "learning_rate": 8.694002957894661e-06, + "loss": 4.8977, + "step": 10978 + }, + { + "epoch": 0.9357368107048496, + "grad_norm": 36.477759705187765, + "learning_rate": 8.693668775933519e-06, + "loss": 3.5921, + "step": 10979 + }, + { + "epoch": 0.935822040398875, + "grad_norm": 68.72998811769226, + "learning_rate": 8.69333455764661e-06, + "loss": 4.8406, + "step": 10980 + }, + { + "epoch": 0.9359072700929003, + "grad_norm": 41.09494984973199, + "learning_rate": 8.693000303037227e-06, + "loss": 4.7819, + "step": 10981 + }, + { + "epoch": 0.9359924997869258, + "grad_norm": 58.00772397508805, + "learning_rate": 8.692666012108652e-06, + "loss": 4.3076, + "step": 10982 + }, + { + "epoch": 0.9360777294809511, + "grad_norm": 44.73560716832341, + "learning_rate": 8.692331684864174e-06, + "loss": 4.2796, + "step": 10983 + }, + { + "epoch": 0.9361629591749766, + "grad_norm": 51.72563888567797, + "learning_rate": 8.691997321307082e-06, + "loss": 4.2445, + "step": 10984 + }, + { + "epoch": 0.936248188869002, + "grad_norm": 33.863981655996874, + "learning_rate": 8.691662921440663e-06, + "loss": 3.7075, + "step": 10985 + }, + { + "epoch": 0.9363334185630273, + "grad_norm": 40.63460948592932, + "learning_rate": 8.691328485268205e-06, + "loss": 3.8422, + "step": 10986 + }, + { + "epoch": 0.9364186482570528, + "grad_norm": 36.800463374828404, + "learning_rate": 8.690994012793e-06, + "loss": 4.1473, + "step": 10987 + }, + { + "epoch": 0.9365038779510781, + "grad_norm": 78.50247935699248, + "learning_rate": 8.690659504018333e-06, + "loss": 4.617, + "step": 10988 + }, + { + "epoch": 0.9365891076451035, + "grad_norm": 113.90884864574733, + "learning_rate": 8.690324958947498e-06, + "loss": 3.9192, + "step": 10989 + }, + { + "epoch": 0.936674337339129, + "grad_norm": 43.34681109581543, + "learning_rate": 8.689990377583784e-06, + "loss": 2.9234, + "step": 10990 + }, + { + "epoch": 0.9367595670331543, + "grad_norm": 25.56075288786055, + "learning_rate": 8.68965575993048e-06, + "loss": 1.8949, + "step": 10991 + }, + { + "epoch": 0.9368447967271798, + "grad_norm": 80.19017021108387, + "learning_rate": 8.689321105990878e-06, + "loss": 3.6509, + "step": 10992 + }, + { + "epoch": 0.9369300264212052, + "grad_norm": 86.87619786243908, + "learning_rate": 8.688986415768268e-06, + "loss": 3.9527, + "step": 10993 + }, + { + "epoch": 0.9370152561152305, + "grad_norm": 33.36881102966502, + "learning_rate": 8.688651689265944e-06, + "loss": 2.8843, + "step": 10994 + }, + { + "epoch": 0.937100485809256, + "grad_norm": 69.63933348492965, + "learning_rate": 8.688316926487194e-06, + "loss": 3.5011, + "step": 10995 + }, + { + "epoch": 0.9371857155032813, + "grad_norm": 19.031501371930464, + "learning_rate": 8.687982127435315e-06, + "loss": 2.2193, + "step": 10996 + }, + { + "epoch": 0.9372709451973067, + "grad_norm": 51.96069989321445, + "learning_rate": 8.687647292113596e-06, + "loss": 4.1221, + "step": 10997 + }, + { + "epoch": 0.9373561748913322, + "grad_norm": 29.974361732448994, + "learning_rate": 8.687312420525333e-06, + "loss": 3.7406, + "step": 10998 + }, + { + "epoch": 0.9374414045853575, + "grad_norm": 41.551165364434404, + "learning_rate": 8.686977512673815e-06, + "loss": 3.9752, + "step": 10999 + }, + { + "epoch": 0.937526634279383, + "grad_norm": 89.93742441159301, + "learning_rate": 8.68664256856234e-06, + "loss": 4.2858, + "step": 11000 + }, + { + "epoch": 0.9376118639734083, + "grad_norm": 52.25021803694539, + "learning_rate": 8.6863075881942e-06, + "loss": 3.4924, + "step": 11001 + }, + { + "epoch": 0.9376970936674337, + "grad_norm": 32.09086275890427, + "learning_rate": 8.68597257157269e-06, + "loss": 3.004, + "step": 11002 + }, + { + "epoch": 0.9377823233614592, + "grad_norm": 80.40879776050576, + "learning_rate": 8.685637518701103e-06, + "loss": 4.7109, + "step": 11003 + }, + { + "epoch": 0.9378675530554845, + "grad_norm": 43.1165729405149, + "learning_rate": 8.685302429582736e-06, + "loss": 3.7871, + "step": 11004 + }, + { + "epoch": 0.9379527827495099, + "grad_norm": 124.21879368768897, + "learning_rate": 8.684967304220884e-06, + "loss": 4.6126, + "step": 11005 + }, + { + "epoch": 0.9380380124435354, + "grad_norm": 71.61124059344615, + "learning_rate": 8.684632142618843e-06, + "loss": 3.9197, + "step": 11006 + }, + { + "epoch": 0.9381232421375607, + "grad_norm": 36.6060929567224, + "learning_rate": 8.684296944779909e-06, + "loss": 2.8421, + "step": 11007 + }, + { + "epoch": 0.9382084718315862, + "grad_norm": 38.87268703930769, + "learning_rate": 8.683961710707379e-06, + "loss": 4.3141, + "step": 11008 + }, + { + "epoch": 0.9382937015256115, + "grad_norm": 31.504916970215582, + "learning_rate": 8.68362644040455e-06, + "loss": 2.8889, + "step": 11009 + }, + { + "epoch": 0.9383789312196369, + "grad_norm": 34.548769279693104, + "learning_rate": 8.683291133874717e-06, + "loss": 3.4813, + "step": 11010 + }, + { + "epoch": 0.9384641609136624, + "grad_norm": 49.770720008773104, + "learning_rate": 8.68295579112118e-06, + "loss": 4.7324, + "step": 11011 + }, + { + "epoch": 0.9385493906076877, + "grad_norm": 81.45438859351106, + "learning_rate": 8.682620412147235e-06, + "loss": 6.144, + "step": 11012 + }, + { + "epoch": 0.9386346203017131, + "grad_norm": 51.8794290271632, + "learning_rate": 8.682284996956184e-06, + "loss": 3.6369, + "step": 11013 + }, + { + "epoch": 0.9387198499957385, + "grad_norm": 47.35239843191153, + "learning_rate": 8.681949545551322e-06, + "loss": 4.2476, + "step": 11014 + }, + { + "epoch": 0.9388050796897639, + "grad_norm": 39.331078202948554, + "learning_rate": 8.681614057935949e-06, + "loss": 4.7986, + "step": 11015 + }, + { + "epoch": 0.9388903093837894, + "grad_norm": 32.59222141266588, + "learning_rate": 8.681278534113367e-06, + "loss": 3.526, + "step": 11016 + }, + { + "epoch": 0.9389755390778147, + "grad_norm": 49.80523380944982, + "learning_rate": 8.68094297408687e-06, + "loss": 2.4584, + "step": 11017 + }, + { + "epoch": 0.9390607687718401, + "grad_norm": 50.491305652399745, + "learning_rate": 8.680607377859763e-06, + "loss": 4.0937, + "step": 11018 + }, + { + "epoch": 0.9391459984658656, + "grad_norm": 46.4812332199248, + "learning_rate": 8.680271745435344e-06, + "loss": 3.8924, + "step": 11019 + }, + { + "epoch": 0.9392312281598909, + "grad_norm": 50.97750751454309, + "learning_rate": 8.679936076816916e-06, + "loss": 2.5579, + "step": 11020 + }, + { + "epoch": 0.9393164578539163, + "grad_norm": 79.76104847253804, + "learning_rate": 8.679600372007779e-06, + "loss": 5.1737, + "step": 11021 + }, + { + "epoch": 0.9394016875479417, + "grad_norm": 45.450232913659654, + "learning_rate": 8.679264631011234e-06, + "loss": 4.4021, + "step": 11022 + }, + { + "epoch": 0.9394869172419671, + "grad_norm": 45.3960208959989, + "learning_rate": 8.678928853830584e-06, + "loss": 4.4598, + "step": 11023 + }, + { + "epoch": 0.9395721469359924, + "grad_norm": 35.164008829284604, + "learning_rate": 8.67859304046913e-06, + "loss": 4.0218, + "step": 11024 + }, + { + "epoch": 0.9396573766300179, + "grad_norm": 45.26689465316558, + "learning_rate": 8.678257190930176e-06, + "loss": 4.0553, + "step": 11025 + }, + { + "epoch": 0.9397426063240433, + "grad_norm": 35.08694968526676, + "learning_rate": 8.677921305217023e-06, + "loss": 3.2476, + "step": 11026 + }, + { + "epoch": 0.9398278360180687, + "grad_norm": 36.32119649441666, + "learning_rate": 8.677585383332975e-06, + "loss": 3.1855, + "step": 11027 + }, + { + "epoch": 0.9399130657120941, + "grad_norm": 38.61989720511463, + "learning_rate": 8.677249425281337e-06, + "loss": 3.8412, + "step": 11028 + }, + { + "epoch": 0.9399982954061195, + "grad_norm": 62.29529232306893, + "learning_rate": 8.676913431065413e-06, + "loss": 4.3807, + "step": 11029 + }, + { + "epoch": 0.9400835251001449, + "grad_norm": 29.40542964280408, + "learning_rate": 8.676577400688507e-06, + "loss": 2.6858, + "step": 11030 + }, + { + "epoch": 0.9401687547941703, + "grad_norm": 62.430318275661726, + "learning_rate": 8.67624133415392e-06, + "loss": 3.6684, + "step": 11031 + }, + { + "epoch": 0.9402539844881956, + "grad_norm": 108.90901339634944, + "learning_rate": 8.675905231464963e-06, + "loss": 4.0778, + "step": 11032 + }, + { + "epoch": 0.9403392141822211, + "grad_norm": 81.71505221838646, + "learning_rate": 8.675569092624937e-06, + "loss": 4.2055, + "step": 11033 + }, + { + "epoch": 0.9404244438762465, + "grad_norm": 54.08043976285241, + "learning_rate": 8.67523291763715e-06, + "loss": 3.3887, + "step": 11034 + }, + { + "epoch": 0.9405096735702719, + "grad_norm": 30.17813372423778, + "learning_rate": 8.674896706504909e-06, + "loss": 4.3001, + "step": 11035 + }, + { + "epoch": 0.9405949032642973, + "grad_norm": 33.36771258519969, + "learning_rate": 8.674560459231517e-06, + "loss": 4.1769, + "step": 11036 + }, + { + "epoch": 0.9406801329583226, + "grad_norm": 44.02756742503662, + "learning_rate": 8.674224175820284e-06, + "loss": 4.4327, + "step": 11037 + }, + { + "epoch": 0.9407653626523481, + "grad_norm": 45.934718205752155, + "learning_rate": 8.673887856274517e-06, + "loss": 3.4019, + "step": 11038 + }, + { + "epoch": 0.9408505923463735, + "grad_norm": 63.80547356347118, + "learning_rate": 8.673551500597521e-06, + "loss": 3.9122, + "step": 11039 + }, + { + "epoch": 0.9409358220403988, + "grad_norm": 69.75026106967277, + "learning_rate": 8.673215108792608e-06, + "loss": 4.3011, + "step": 11040 + }, + { + "epoch": 0.9410210517344243, + "grad_norm": 68.60579748004665, + "learning_rate": 8.672878680863082e-06, + "loss": 4.1, + "step": 11041 + }, + { + "epoch": 0.9411062814284497, + "grad_norm": 40.5266382070193, + "learning_rate": 8.672542216812254e-06, + "loss": 2.5404, + "step": 11042 + }, + { + "epoch": 0.9411915111224751, + "grad_norm": 66.62289683560653, + "learning_rate": 8.672205716643433e-06, + "loss": 4.4769, + "step": 11043 + }, + { + "epoch": 0.9412767408165005, + "grad_norm": 78.32927395865826, + "learning_rate": 8.67186918035993e-06, + "loss": 4.3489, + "step": 11044 + }, + { + "epoch": 0.9413619705105258, + "grad_norm": 64.82313284716523, + "learning_rate": 8.67153260796505e-06, + "loss": 4.6948, + "step": 11045 + }, + { + "epoch": 0.9414472002045513, + "grad_norm": 144.93949047340175, + "learning_rate": 8.671195999462105e-06, + "loss": 4.7459, + "step": 11046 + }, + { + "epoch": 0.9415324298985767, + "grad_norm": 67.12687633649533, + "learning_rate": 8.670859354854407e-06, + "loss": 5.2802, + "step": 11047 + }, + { + "epoch": 0.941617659592602, + "grad_norm": 43.97392866892739, + "learning_rate": 8.670522674145267e-06, + "loss": 4.5214, + "step": 11048 + }, + { + "epoch": 0.9417028892866275, + "grad_norm": 51.55258558297006, + "learning_rate": 8.670185957337993e-06, + "loss": 4.3332, + "step": 11049 + }, + { + "epoch": 0.9417881189806528, + "grad_norm": 70.31099412583706, + "learning_rate": 8.6698492044359e-06, + "loss": 3.0626, + "step": 11050 + }, + { + "epoch": 0.9418733486746783, + "grad_norm": 39.05818502758747, + "learning_rate": 8.6695124154423e-06, + "loss": 3.5035, + "step": 11051 + }, + { + "epoch": 0.9419585783687037, + "grad_norm": 68.3423391452902, + "learning_rate": 8.669175590360502e-06, + "loss": 3.5942, + "step": 11052 + }, + { + "epoch": 0.942043808062729, + "grad_norm": 42.628467027262325, + "learning_rate": 8.668838729193818e-06, + "loss": 4.1181, + "step": 11053 + }, + { + "epoch": 0.9421290377567545, + "grad_norm": 59.13780036757093, + "learning_rate": 8.668501831945565e-06, + "loss": 4.6729, + "step": 11054 + }, + { + "epoch": 0.9422142674507799, + "grad_norm": 70.92577651303755, + "learning_rate": 8.668164898619054e-06, + "loss": 4.1414, + "step": 11055 + }, + { + "epoch": 0.9422994971448052, + "grad_norm": 37.961097780754905, + "learning_rate": 8.667827929217599e-06, + "loss": 3.7648, + "step": 11056 + }, + { + "epoch": 0.9423847268388307, + "grad_norm": 52.904229094937506, + "learning_rate": 8.667490923744514e-06, + "loss": 3.7208, + "step": 11057 + }, + { + "epoch": 0.942469956532856, + "grad_norm": 82.66806292951968, + "learning_rate": 8.667153882203113e-06, + "loss": 3.7825, + "step": 11058 + }, + { + "epoch": 0.9425551862268814, + "grad_norm": 44.302649723002894, + "learning_rate": 8.666816804596709e-06, + "loss": 3.6439, + "step": 11059 + }, + { + "epoch": 0.9426404159209069, + "grad_norm": 38.28981969418386, + "learning_rate": 8.666479690928622e-06, + "loss": 2.9067, + "step": 11060 + }, + { + "epoch": 0.9427256456149322, + "grad_norm": 50.996274642407, + "learning_rate": 8.66614254120216e-06, + "loss": 3.2118, + "step": 11061 + }, + { + "epoch": 0.9428108753089577, + "grad_norm": 36.179025182905335, + "learning_rate": 8.665805355420647e-06, + "loss": 3.8275, + "step": 11062 + }, + { + "epoch": 0.942896105002983, + "grad_norm": 50.39146901507895, + "learning_rate": 8.665468133587392e-06, + "loss": 3.6246, + "step": 11063 + }, + { + "epoch": 0.9429813346970084, + "grad_norm": 51.10734543565582, + "learning_rate": 8.665130875705715e-06, + "loss": 4.6453, + "step": 11064 + }, + { + "epoch": 0.9430665643910339, + "grad_norm": 28.740519909652892, + "learning_rate": 8.664793581778935e-06, + "loss": 3.046, + "step": 11065 + }, + { + "epoch": 0.9431517940850592, + "grad_norm": 40.624697996307944, + "learning_rate": 8.664456251810362e-06, + "loss": 3.6185, + "step": 11066 + }, + { + "epoch": 0.9432370237790846, + "grad_norm": 59.14679202781184, + "learning_rate": 8.66411888580332e-06, + "loss": 4.2005, + "step": 11067 + }, + { + "epoch": 0.94332225347311, + "grad_norm": 40.91163946540699, + "learning_rate": 8.663781483761128e-06, + "loss": 3.8424, + "step": 11068 + }, + { + "epoch": 0.9434074831671354, + "grad_norm": 55.23042119204571, + "learning_rate": 8.663444045687095e-06, + "loss": 3.6357, + "step": 11069 + }, + { + "epoch": 0.9434927128611609, + "grad_norm": 36.80792053733356, + "learning_rate": 8.66310657158455e-06, + "loss": 4.2902, + "step": 11070 + }, + { + "epoch": 0.9435779425551862, + "grad_norm": 84.14139297250587, + "learning_rate": 8.662769061456804e-06, + "loss": 4.7468, + "step": 11071 + }, + { + "epoch": 0.9436631722492116, + "grad_norm": 32.978292275748395, + "learning_rate": 8.662431515307183e-06, + "loss": 3.8425, + "step": 11072 + }, + { + "epoch": 0.9437484019432371, + "grad_norm": 111.40320512371761, + "learning_rate": 8.662093933139e-06, + "loss": 5.1211, + "step": 11073 + }, + { + "epoch": 0.9438336316372624, + "grad_norm": 52.38419361999757, + "learning_rate": 8.66175631495558e-06, + "loss": 4.7629, + "step": 11074 + }, + { + "epoch": 0.9439188613312878, + "grad_norm": 56.94280269257253, + "learning_rate": 8.661418660760244e-06, + "loss": 5.1223, + "step": 11075 + }, + { + "epoch": 0.9440040910253132, + "grad_norm": 30.47536683645046, + "learning_rate": 8.661080970556307e-06, + "loss": 3.069, + "step": 11076 + }, + { + "epoch": 0.9440893207193386, + "grad_norm": 55.93788205178849, + "learning_rate": 8.660743244347093e-06, + "loss": 4.4158, + "step": 11077 + }, + { + "epoch": 0.9441745504133641, + "grad_norm": 48.36271062124633, + "learning_rate": 8.660405482135927e-06, + "loss": 3.751, + "step": 11078 + }, + { + "epoch": 0.9442597801073894, + "grad_norm": 44.18861458218908, + "learning_rate": 8.660067683926125e-06, + "loss": 3.6624, + "step": 11079 + }, + { + "epoch": 0.9443450098014148, + "grad_norm": 39.9134426085869, + "learning_rate": 8.659729849721015e-06, + "loss": 3.6286, + "step": 11080 + }, + { + "epoch": 0.9444302394954402, + "grad_norm": 35.30736627885141, + "learning_rate": 8.659391979523912e-06, + "loss": 4.2315, + "step": 11081 + }, + { + "epoch": 0.9445154691894656, + "grad_norm": 30.337448825851304, + "learning_rate": 8.659054073338146e-06, + "loss": 2.8761, + "step": 11082 + }, + { + "epoch": 0.944600698883491, + "grad_norm": 41.57855085568578, + "learning_rate": 8.658716131167036e-06, + "loss": 4.503, + "step": 11083 + }, + { + "epoch": 0.9446859285775164, + "grad_norm": 57.615160848855325, + "learning_rate": 8.658378153013906e-06, + "loss": 4.2255, + "step": 11084 + }, + { + "epoch": 0.9447711582715418, + "grad_norm": 34.75564408092268, + "learning_rate": 8.658040138882082e-06, + "loss": 3.2701, + "step": 11085 + }, + { + "epoch": 0.9448563879655673, + "grad_norm": 43.7404076561284, + "learning_rate": 8.657702088774886e-06, + "loss": 3.9756, + "step": 11086 + }, + { + "epoch": 0.9449416176595926, + "grad_norm": 45.601360042107814, + "learning_rate": 8.657364002695643e-06, + "loss": 3.7237, + "step": 11087 + }, + { + "epoch": 0.945026847353618, + "grad_norm": 35.585745452416205, + "learning_rate": 8.657025880647679e-06, + "loss": 3.5653, + "step": 11088 + }, + { + "epoch": 0.9451120770476434, + "grad_norm": 36.94086253822609, + "learning_rate": 8.656687722634318e-06, + "loss": 3.729, + "step": 11089 + }, + { + "epoch": 0.9451973067416688, + "grad_norm": 40.19501956858171, + "learning_rate": 8.656349528658888e-06, + "loss": 3.906, + "step": 11090 + }, + { + "epoch": 0.9452825364356942, + "grad_norm": 47.866297147590615, + "learning_rate": 8.656011298724712e-06, + "loss": 4.5546, + "step": 11091 + }, + { + "epoch": 0.9453677661297196, + "grad_norm": 73.52646287916353, + "learning_rate": 8.655673032835117e-06, + "loss": 4.4687, + "step": 11092 + }, + { + "epoch": 0.945452995823745, + "grad_norm": 34.589581729820466, + "learning_rate": 8.65533473099343e-06, + "loss": 3.0773, + "step": 11093 + }, + { + "epoch": 0.9455382255177703, + "grad_norm": 42.45876308488692, + "learning_rate": 8.654996393202979e-06, + "loss": 4.8042, + "step": 11094 + }, + { + "epoch": 0.9456234552117958, + "grad_norm": 44.61747895410002, + "learning_rate": 8.654658019467091e-06, + "loss": 4.3565, + "step": 11095 + }, + { + "epoch": 0.9457086849058212, + "grad_norm": 38.742199408412574, + "learning_rate": 8.654319609789095e-06, + "loss": 3.5242, + "step": 11096 + }, + { + "epoch": 0.9457939145998466, + "grad_norm": 60.385002049369646, + "learning_rate": 8.653981164172316e-06, + "loss": 4.2054, + "step": 11097 + }, + { + "epoch": 0.945879144293872, + "grad_norm": 43.85354522359222, + "learning_rate": 8.653642682620084e-06, + "loss": 3.2267, + "step": 11098 + }, + { + "epoch": 0.9459643739878973, + "grad_norm": 31.390364135353092, + "learning_rate": 8.653304165135729e-06, + "loss": 3.7062, + "step": 11099 + }, + { + "epoch": 0.9460496036819228, + "grad_norm": 75.3534354051077, + "learning_rate": 8.652965611722579e-06, + "loss": 5.0761, + "step": 11100 + }, + { + "epoch": 0.9461348333759482, + "grad_norm": 36.30186306976455, + "learning_rate": 8.652627022383963e-06, + "loss": 3.8807, + "step": 11101 + }, + { + "epoch": 0.9462200630699735, + "grad_norm": 43.809491219456845, + "learning_rate": 8.652288397123212e-06, + "loss": 4.3767, + "step": 11102 + }, + { + "epoch": 0.946305292763999, + "grad_norm": 56.43202941029422, + "learning_rate": 8.651949735943658e-06, + "loss": 4.3785, + "step": 11103 + }, + { + "epoch": 0.9463905224580244, + "grad_norm": 26.208480873402504, + "learning_rate": 8.651611038848626e-06, + "loss": 2.9166, + "step": 11104 + }, + { + "epoch": 0.9464757521520498, + "grad_norm": 64.8432932445168, + "learning_rate": 8.651272305841454e-06, + "loss": 5.163, + "step": 11105 + }, + { + "epoch": 0.9465609818460752, + "grad_norm": 30.641308804321863, + "learning_rate": 8.650933536925468e-06, + "loss": 3.6919, + "step": 11106 + }, + { + "epoch": 0.9466462115401005, + "grad_norm": 105.37704622516529, + "learning_rate": 8.650594732104e-06, + "loss": 4.0445, + "step": 11107 + }, + { + "epoch": 0.946731441234126, + "grad_norm": 56.77437309645633, + "learning_rate": 8.650255891380387e-06, + "loss": 4.6193, + "step": 11108 + }, + { + "epoch": 0.9468166709281514, + "grad_norm": 49.01541726806804, + "learning_rate": 8.649917014757953e-06, + "loss": 3.5432, + "step": 11109 + }, + { + "epoch": 0.9469019006221767, + "grad_norm": 112.92072450823281, + "learning_rate": 8.649578102240039e-06, + "loss": 4.1763, + "step": 11110 + }, + { + "epoch": 0.9469871303162022, + "grad_norm": 37.11892812602318, + "learning_rate": 8.649239153829974e-06, + "loss": 4.3513, + "step": 11111 + }, + { + "epoch": 0.9470723600102275, + "grad_norm": 30.55813033154181, + "learning_rate": 8.648900169531091e-06, + "loss": 3.6214, + "step": 11112 + }, + { + "epoch": 0.947157589704253, + "grad_norm": 33.1599443371318, + "learning_rate": 8.648561149346725e-06, + "loss": 2.923, + "step": 11113 + }, + { + "epoch": 0.9472428193982784, + "grad_norm": 56.28794153172234, + "learning_rate": 8.64822209328021e-06, + "loss": 3.9952, + "step": 11114 + }, + { + "epoch": 0.9473280490923037, + "grad_norm": 56.84035086525151, + "learning_rate": 8.64788300133488e-06, + "loss": 4.6667, + "step": 11115 + }, + { + "epoch": 0.9474132787863292, + "grad_norm": 89.06563151494694, + "learning_rate": 8.647543873514071e-06, + "loss": 4.4435, + "step": 11116 + }, + { + "epoch": 0.9474985084803546, + "grad_norm": 53.68205415223154, + "learning_rate": 8.647204709821118e-06, + "loss": 3.0275, + "step": 11117 + }, + { + "epoch": 0.9475837381743799, + "grad_norm": 37.339247761011784, + "learning_rate": 8.646865510259355e-06, + "loss": 4.6431, + "step": 11118 + }, + { + "epoch": 0.9476689678684054, + "grad_norm": 32.51165688170734, + "learning_rate": 8.646526274832119e-06, + "loss": 4.3982, + "step": 11119 + }, + { + "epoch": 0.9477541975624307, + "grad_norm": 32.04817505290133, + "learning_rate": 8.646187003542746e-06, + "loss": 4.0875, + "step": 11120 + }, + { + "epoch": 0.9478394272564562, + "grad_norm": 53.08792009871782, + "learning_rate": 8.645847696394573e-06, + "loss": 5.1116, + "step": 11121 + }, + { + "epoch": 0.9479246569504816, + "grad_norm": 47.77296324347203, + "learning_rate": 8.645508353390935e-06, + "loss": 3.5282, + "step": 11122 + }, + { + "epoch": 0.9480098866445069, + "grad_norm": 61.10545135522013, + "learning_rate": 8.645168974535173e-06, + "loss": 4.4545, + "step": 11123 + }, + { + "epoch": 0.9480951163385324, + "grad_norm": 37.97544021885301, + "learning_rate": 8.64482955983062e-06, + "loss": 3.6881, + "step": 11124 + }, + { + "epoch": 0.9481803460325577, + "grad_norm": 39.905482613609664, + "learning_rate": 8.64449010928062e-06, + "loss": 3.606, + "step": 11125 + }, + { + "epoch": 0.9482655757265831, + "grad_norm": 37.1346406659319, + "learning_rate": 8.644150622888506e-06, + "loss": 3.6493, + "step": 11126 + }, + { + "epoch": 0.9483508054206086, + "grad_norm": 66.37841606184463, + "learning_rate": 8.64381110065762e-06, + "loss": 5.9312, + "step": 11127 + }, + { + "epoch": 0.9484360351146339, + "grad_norm": 57.92491919927671, + "learning_rate": 8.643471542591297e-06, + "loss": 4.5464, + "step": 11128 + }, + { + "epoch": 0.9485212648086594, + "grad_norm": 69.71217518962628, + "learning_rate": 8.643131948692883e-06, + "loss": 3.7104, + "step": 11129 + }, + { + "epoch": 0.9486064945026847, + "grad_norm": 46.93339516609922, + "learning_rate": 8.64279231896571e-06, + "loss": 4.5186, + "step": 11130 + }, + { + "epoch": 0.9486917241967101, + "grad_norm": 52.23995421566484, + "learning_rate": 8.642452653413124e-06, + "loss": 3.5709, + "step": 11131 + }, + { + "epoch": 0.9487769538907356, + "grad_norm": 53.63665829953077, + "learning_rate": 8.642112952038464e-06, + "loss": 3.7207, + "step": 11132 + }, + { + "epoch": 0.9488621835847609, + "grad_norm": 51.8750435709737, + "learning_rate": 8.641773214845069e-06, + "loss": 4.0113, + "step": 11133 + }, + { + "epoch": 0.9489474132787863, + "grad_norm": 83.04223473783021, + "learning_rate": 8.641433441836284e-06, + "loss": 4.9593, + "step": 11134 + }, + { + "epoch": 0.9490326429728118, + "grad_norm": 44.3993654643134, + "learning_rate": 8.641093633015445e-06, + "loss": 4.0953, + "step": 11135 + }, + { + "epoch": 0.9491178726668371, + "grad_norm": 63.96776905226384, + "learning_rate": 8.640753788385899e-06, + "loss": 2.5733, + "step": 11136 + }, + { + "epoch": 0.9492031023608625, + "grad_norm": 39.263156882342095, + "learning_rate": 8.640413907950986e-06, + "loss": 3.5989, + "step": 11137 + }, + { + "epoch": 0.9492883320548879, + "grad_norm": 40.239466415743316, + "learning_rate": 8.640073991714047e-06, + "loss": 3.3406, + "step": 11138 + }, + { + "epoch": 0.9493735617489133, + "grad_norm": 79.39775780391334, + "learning_rate": 8.639734039678428e-06, + "loss": 3.9029, + "step": 11139 + }, + { + "epoch": 0.9494587914429388, + "grad_norm": 36.68798503312954, + "learning_rate": 8.639394051847472e-06, + "loss": 3.9996, + "step": 11140 + }, + { + "epoch": 0.9495440211369641, + "grad_norm": 38.26992583545703, + "learning_rate": 8.63905402822452e-06, + "loss": 3.1445, + "step": 11141 + }, + { + "epoch": 0.9496292508309895, + "grad_norm": 40.58536791379287, + "learning_rate": 8.638713968812917e-06, + "loss": 3.9624, + "step": 11142 + }, + { + "epoch": 0.949714480525015, + "grad_norm": 71.79865491649242, + "learning_rate": 8.63837387361601e-06, + "loss": 5.7887, + "step": 11143 + }, + { + "epoch": 0.9497997102190403, + "grad_norm": 64.53142181946616, + "learning_rate": 8.638033742637142e-06, + "loss": 4.693, + "step": 11144 + }, + { + "epoch": 0.9498849399130657, + "grad_norm": 61.65281405340965, + "learning_rate": 8.637693575879655e-06, + "loss": 4.4657, + "step": 11145 + }, + { + "epoch": 0.9499701696070911, + "grad_norm": 60.201164780914795, + "learning_rate": 8.6373533733469e-06, + "loss": 3.9702, + "step": 11146 + }, + { + "epoch": 0.9500553993011165, + "grad_norm": 54.52688356413426, + "learning_rate": 8.637013135042218e-06, + "loss": 4.2239, + "step": 11147 + }, + { + "epoch": 0.950140628995142, + "grad_norm": 33.378057645050475, + "learning_rate": 8.636672860968958e-06, + "loss": 4.2246, + "step": 11148 + }, + { + "epoch": 0.9502258586891673, + "grad_norm": 75.63182093224654, + "learning_rate": 8.636332551130466e-06, + "loss": 4.4031, + "step": 11149 + }, + { + "epoch": 0.9503110883831927, + "grad_norm": 46.30930373423039, + "learning_rate": 8.635992205530088e-06, + "loss": 3.8824, + "step": 11150 + }, + { + "epoch": 0.9503963180772181, + "grad_norm": 57.59241811128239, + "learning_rate": 8.635651824171171e-06, + "loss": 3.5853, + "step": 11151 + }, + { + "epoch": 0.9504815477712435, + "grad_norm": 73.47779166028579, + "learning_rate": 8.635311407057063e-06, + "loss": 4.719, + "step": 11152 + }, + { + "epoch": 0.9505667774652689, + "grad_norm": 77.03731463495146, + "learning_rate": 8.634970954191112e-06, + "loss": 3.5107, + "step": 11153 + }, + { + "epoch": 0.9506520071592943, + "grad_norm": 30.389805637187735, + "learning_rate": 8.634630465576666e-06, + "loss": 3.0824, + "step": 11154 + }, + { + "epoch": 0.9507372368533197, + "grad_norm": 59.61622188972792, + "learning_rate": 8.634289941217073e-06, + "loss": 4.4838, + "step": 11155 + }, + { + "epoch": 0.9508224665473451, + "grad_norm": 135.63997601378, + "learning_rate": 8.633949381115683e-06, + "loss": 4.99, + "step": 11156 + }, + { + "epoch": 0.9509076962413705, + "grad_norm": 55.20666514061963, + "learning_rate": 8.633608785275845e-06, + "loss": 4.3714, + "step": 11157 + }, + { + "epoch": 0.9509929259353959, + "grad_norm": 60.01642541361965, + "learning_rate": 8.63326815370091e-06, + "loss": 3.8452, + "step": 11158 + }, + { + "epoch": 0.9510781556294213, + "grad_norm": 43.407539011900084, + "learning_rate": 8.632927486394222e-06, + "loss": 3.7332, + "step": 11159 + }, + { + "epoch": 0.9511633853234467, + "grad_norm": 61.264550447164126, + "learning_rate": 8.63258678335914e-06, + "loss": 4.4622, + "step": 11160 + }, + { + "epoch": 0.951248615017472, + "grad_norm": 34.575996121561985, + "learning_rate": 8.632246044599008e-06, + "loss": 3.6944, + "step": 11161 + }, + { + "epoch": 0.9513338447114975, + "grad_norm": 45.47340745245181, + "learning_rate": 8.631905270117182e-06, + "loss": 3.9478, + "step": 11162 + }, + { + "epoch": 0.9514190744055229, + "grad_norm": 59.61068941882142, + "learning_rate": 8.631564459917008e-06, + "loss": 4.9197, + "step": 11163 + }, + { + "epoch": 0.9515043040995483, + "grad_norm": 39.50197857397219, + "learning_rate": 8.631223614001843e-06, + "loss": 3.4776, + "step": 11164 + }, + { + "epoch": 0.9515895337935737, + "grad_norm": 64.39315742114373, + "learning_rate": 8.630882732375035e-06, + "loss": 4.0307, + "step": 11165 + }, + { + "epoch": 0.951674763487599, + "grad_norm": 48.83383556028253, + "learning_rate": 8.630541815039939e-06, + "loss": 5.4188, + "step": 11166 + }, + { + "epoch": 0.9517599931816245, + "grad_norm": 81.00052371801475, + "learning_rate": 8.630200861999907e-06, + "loss": 5.0669, + "step": 11167 + }, + { + "epoch": 0.9518452228756499, + "grad_norm": 52.92204521023751, + "learning_rate": 8.62985987325829e-06, + "loss": 4.1258, + "step": 11168 + }, + { + "epoch": 0.9519304525696752, + "grad_norm": 44.50699318267539, + "learning_rate": 8.629518848818445e-06, + "loss": 4.0728, + "step": 11169 + }, + { + "epoch": 0.9520156822637007, + "grad_norm": 29.126918701584728, + "learning_rate": 8.629177788683726e-06, + "loss": 2.923, + "step": 11170 + }, + { + "epoch": 0.9521009119577261, + "grad_norm": 49.84325860475952, + "learning_rate": 8.628836692857484e-06, + "loss": 4.726, + "step": 11171 + }, + { + "epoch": 0.9521861416517514, + "grad_norm": 35.824868247660156, + "learning_rate": 8.628495561343076e-06, + "loss": 3.9139, + "step": 11172 + }, + { + "epoch": 0.9522713713457769, + "grad_norm": 33.99911024999839, + "learning_rate": 8.628154394143853e-06, + "loss": 4.1591, + "step": 11173 + }, + { + "epoch": 0.9523566010398022, + "grad_norm": 47.38689664394788, + "learning_rate": 8.627813191263176e-06, + "loss": 4.1262, + "step": 11174 + }, + { + "epoch": 0.9524418307338277, + "grad_norm": 51.334280925167846, + "learning_rate": 8.627471952704398e-06, + "loss": 3.3017, + "step": 11175 + }, + { + "epoch": 0.9525270604278531, + "grad_norm": 47.35509430895806, + "learning_rate": 8.627130678470873e-06, + "loss": 3.7128, + "step": 11176 + }, + { + "epoch": 0.9526122901218784, + "grad_norm": 80.48181966054264, + "learning_rate": 8.62678936856596e-06, + "loss": 4.1797, + "step": 11177 + }, + { + "epoch": 0.9526975198159039, + "grad_norm": 52.72888984009288, + "learning_rate": 8.626448022993015e-06, + "loss": 3.0787, + "step": 11178 + }, + { + "epoch": 0.9527827495099292, + "grad_norm": 37.72175647435178, + "learning_rate": 8.626106641755393e-06, + "loss": 3.5175, + "step": 11179 + }, + { + "epoch": 0.9528679792039546, + "grad_norm": 20.136852467369856, + "learning_rate": 8.625765224856455e-06, + "loss": 3.0476, + "step": 11180 + }, + { + "epoch": 0.9529532088979801, + "grad_norm": 48.718302952683125, + "learning_rate": 8.625423772299557e-06, + "loss": 3.8547, + "step": 11181 + }, + { + "epoch": 0.9530384385920054, + "grad_norm": 40.32531982322977, + "learning_rate": 8.625082284088056e-06, + "loss": 3.7124, + "step": 11182 + }, + { + "epoch": 0.9531236682860309, + "grad_norm": 38.71976549849337, + "learning_rate": 8.62474076022531e-06, + "loss": 3.2773, + "step": 11183 + }, + { + "epoch": 0.9532088979800563, + "grad_norm": 26.707877874042644, + "learning_rate": 8.624399200714681e-06, + "loss": 2.3798, + "step": 11184 + }, + { + "epoch": 0.9532941276740816, + "grad_norm": 86.86507176281611, + "learning_rate": 8.624057605559524e-06, + "loss": 4.239, + "step": 11185 + }, + { + "epoch": 0.9533793573681071, + "grad_norm": 123.62548827891683, + "learning_rate": 8.623715974763203e-06, + "loss": 4.2569, + "step": 11186 + }, + { + "epoch": 0.9534645870621324, + "grad_norm": 58.19895297671851, + "learning_rate": 8.623374308329073e-06, + "loss": 3.882, + "step": 11187 + }, + { + "epoch": 0.9535498167561578, + "grad_norm": 39.50366211664893, + "learning_rate": 8.623032606260498e-06, + "loss": 3.1798, + "step": 11188 + }, + { + "epoch": 0.9536350464501833, + "grad_norm": 54.9466210413659, + "learning_rate": 8.622690868560837e-06, + "loss": 4.0426, + "step": 11189 + }, + { + "epoch": 0.9537202761442086, + "grad_norm": 41.093543082218254, + "learning_rate": 8.622349095233449e-06, + "loss": 4.0235, + "step": 11190 + }, + { + "epoch": 0.9538055058382341, + "grad_norm": 38.53828294828686, + "learning_rate": 8.622007286281699e-06, + "loss": 3.8484, + "step": 11191 + }, + { + "epoch": 0.9538907355322594, + "grad_norm": 70.93136670424032, + "learning_rate": 8.621665441708946e-06, + "loss": 4.318, + "step": 11192 + }, + { + "epoch": 0.9539759652262848, + "grad_norm": 53.97481537182599, + "learning_rate": 8.62132356151855e-06, + "loss": 3.5224, + "step": 11193 + }, + { + "epoch": 0.9540611949203103, + "grad_norm": 29.30194407189563, + "learning_rate": 8.620981645713879e-06, + "loss": 3.3327, + "step": 11194 + }, + { + "epoch": 0.9541464246143356, + "grad_norm": 39.31753909031858, + "learning_rate": 8.620639694298292e-06, + "loss": 4.1733, + "step": 11195 + }, + { + "epoch": 0.954231654308361, + "grad_norm": 29.165487067130694, + "learning_rate": 8.620297707275152e-06, + "loss": 3.0707, + "step": 11196 + }, + { + "epoch": 0.9543168840023865, + "grad_norm": 58.74690189435481, + "learning_rate": 8.619955684647823e-06, + "loss": 4.3752, + "step": 11197 + }, + { + "epoch": 0.9544021136964118, + "grad_norm": 115.15997083052162, + "learning_rate": 8.619613626419666e-06, + "loss": 6.5945, + "step": 11198 + }, + { + "epoch": 0.9544873433904373, + "grad_norm": 56.21018921526102, + "learning_rate": 8.619271532594049e-06, + "loss": 3.9992, + "step": 11199 + }, + { + "epoch": 0.9545725730844626, + "grad_norm": 65.39742414581173, + "learning_rate": 8.618929403174334e-06, + "loss": 3.579, + "step": 11200 + }, + { + "epoch": 0.954657802778488, + "grad_norm": 38.929592026765306, + "learning_rate": 8.618587238163888e-06, + "loss": 3.43, + "step": 11201 + }, + { + "epoch": 0.9547430324725135, + "grad_norm": 32.36430468691417, + "learning_rate": 8.618245037566074e-06, + "loss": 3.0165, + "step": 11202 + }, + { + "epoch": 0.9548282621665388, + "grad_norm": 38.8376494328342, + "learning_rate": 8.617902801384256e-06, + "loss": 4.7337, + "step": 11203 + }, + { + "epoch": 0.9549134918605642, + "grad_norm": 33.1153288194891, + "learning_rate": 8.617560529621802e-06, + "loss": 3.6265, + "step": 11204 + }, + { + "epoch": 0.9549987215545896, + "grad_norm": 33.94170787064997, + "learning_rate": 8.61721822228208e-06, + "loss": 3.0195, + "step": 11205 + }, + { + "epoch": 0.955083951248615, + "grad_norm": 38.30294521113394, + "learning_rate": 8.61687587936845e-06, + "loss": 3.6132, + "step": 11206 + }, + { + "epoch": 0.9551691809426404, + "grad_norm": 56.75075788951218, + "learning_rate": 8.616533500884288e-06, + "loss": 4.0887, + "step": 11207 + }, + { + "epoch": 0.9552544106366658, + "grad_norm": 29.335663261640274, + "learning_rate": 8.616191086832952e-06, + "loss": 3.9531, + "step": 11208 + }, + { + "epoch": 0.9553396403306912, + "grad_norm": 73.11228224069058, + "learning_rate": 8.615848637217815e-06, + "loss": 5.8526, + "step": 11209 + }, + { + "epoch": 0.9554248700247167, + "grad_norm": 47.00099543339088, + "learning_rate": 8.615506152042243e-06, + "loss": 3.2789, + "step": 11210 + }, + { + "epoch": 0.955510099718742, + "grad_norm": 70.19460204658873, + "learning_rate": 8.615163631309606e-06, + "loss": 3.7037, + "step": 11211 + }, + { + "epoch": 0.9555953294127674, + "grad_norm": 36.46146032764056, + "learning_rate": 8.61482107502327e-06, + "loss": 3.5304, + "step": 11212 + }, + { + "epoch": 0.9556805591067928, + "grad_norm": 75.45683949541058, + "learning_rate": 8.614478483186606e-06, + "loss": 4.6702, + "step": 11213 + }, + { + "epoch": 0.9557657888008182, + "grad_norm": 42.802353904043564, + "learning_rate": 8.614135855802981e-06, + "loss": 3.4519, + "step": 11214 + }, + { + "epoch": 0.9558510184948436, + "grad_norm": 59.54667473049165, + "learning_rate": 8.613793192875768e-06, + "loss": 4.3111, + "step": 11215 + }, + { + "epoch": 0.955936248188869, + "grad_norm": 40.0355161535729, + "learning_rate": 8.613450494408333e-06, + "loss": 4.4772, + "step": 11216 + }, + { + "epoch": 0.9560214778828944, + "grad_norm": 59.81933819869481, + "learning_rate": 8.61310776040405e-06, + "loss": 4.659, + "step": 11217 + }, + { + "epoch": 0.9561067075769198, + "grad_norm": 32.89749689580449, + "learning_rate": 8.612764990866285e-06, + "loss": 3.6946, + "step": 11218 + }, + { + "epoch": 0.9561919372709452, + "grad_norm": 75.95899652584299, + "learning_rate": 8.612422185798414e-06, + "loss": 4.8766, + "step": 11219 + }, + { + "epoch": 0.9562771669649706, + "grad_norm": 23.204449533058153, + "learning_rate": 8.612079345203806e-06, + "loss": 2.4638, + "step": 11220 + }, + { + "epoch": 0.956362396658996, + "grad_norm": 62.74644256431461, + "learning_rate": 8.611736469085833e-06, + "loss": 4.381, + "step": 11221 + }, + { + "epoch": 0.9564476263530214, + "grad_norm": 62.45356770254757, + "learning_rate": 8.611393557447867e-06, + "loss": 3.6705, + "step": 11222 + }, + { + "epoch": 0.9565328560470467, + "grad_norm": 73.26436618165943, + "learning_rate": 8.61105061029328e-06, + "loss": 4.4159, + "step": 11223 + }, + { + "epoch": 0.9566180857410722, + "grad_norm": 33.21052890300632, + "learning_rate": 8.610707627625445e-06, + "loss": 3.9307, + "step": 11224 + }, + { + "epoch": 0.9567033154350976, + "grad_norm": 92.8353966215278, + "learning_rate": 8.610364609447737e-06, + "loss": 4.4161, + "step": 11225 + }, + { + "epoch": 0.956788545129123, + "grad_norm": 47.54880347396343, + "learning_rate": 8.610021555763526e-06, + "loss": 3.3863, + "step": 11226 + }, + { + "epoch": 0.9568737748231484, + "grad_norm": 79.27950778618336, + "learning_rate": 8.609678466576188e-06, + "loss": 3.0833, + "step": 11227 + }, + { + "epoch": 0.9569590045171738, + "grad_norm": 30.23239880181668, + "learning_rate": 8.609335341889096e-06, + "loss": 3.4358, + "step": 11228 + }, + { + "epoch": 0.9570442342111992, + "grad_norm": 71.27009464849029, + "learning_rate": 8.608992181705626e-06, + "loss": 5.3031, + "step": 11229 + }, + { + "epoch": 0.9571294639052246, + "grad_norm": 75.87699620784782, + "learning_rate": 8.60864898602915e-06, + "loss": 6.1699, + "step": 11230 + }, + { + "epoch": 0.9572146935992499, + "grad_norm": 89.4083344059836, + "learning_rate": 8.608305754863048e-06, + "loss": 4.143, + "step": 11231 + }, + { + "epoch": 0.9572999232932754, + "grad_norm": 56.19903105003254, + "learning_rate": 8.607962488210691e-06, + "loss": 4.8745, + "step": 11232 + }, + { + "epoch": 0.9573851529873008, + "grad_norm": 42.47590390620451, + "learning_rate": 8.607619186075458e-06, + "loss": 4.4231, + "step": 11233 + }, + { + "epoch": 0.9574703826813262, + "grad_norm": 29.930178941748856, + "learning_rate": 8.607275848460723e-06, + "loss": 3.3631, + "step": 11234 + }, + { + "epoch": 0.9575556123753516, + "grad_norm": 25.542575611691838, + "learning_rate": 8.606932475369862e-06, + "loss": 3.9166, + "step": 11235 + }, + { + "epoch": 0.9576408420693769, + "grad_norm": 64.34935073078591, + "learning_rate": 8.606589066806254e-06, + "loss": 4.406, + "step": 11236 + }, + { + "epoch": 0.9577260717634024, + "grad_norm": 60.365600216133075, + "learning_rate": 8.606245622773277e-06, + "loss": 4.7258, + "step": 11237 + }, + { + "epoch": 0.9578113014574278, + "grad_norm": 72.96211373474449, + "learning_rate": 8.605902143274307e-06, + "loss": 5.8608, + "step": 11238 + }, + { + "epoch": 0.9578965311514531, + "grad_norm": 73.90742619828572, + "learning_rate": 8.605558628312723e-06, + "loss": 3.5384, + "step": 11239 + }, + { + "epoch": 0.9579817608454786, + "grad_norm": 57.54134013789088, + "learning_rate": 8.6052150778919e-06, + "loss": 4.2865, + "step": 11240 + }, + { + "epoch": 0.958066990539504, + "grad_norm": 125.84324621141974, + "learning_rate": 8.60487149201522e-06, + "loss": 3.8452, + "step": 11241 + }, + { + "epoch": 0.9581522202335294, + "grad_norm": 37.2305249349935, + "learning_rate": 8.604527870686064e-06, + "loss": 3.0891, + "step": 11242 + }, + { + "epoch": 0.9582374499275548, + "grad_norm": 68.94580634473138, + "learning_rate": 8.604184213907804e-06, + "loss": 4.9883, + "step": 11243 + }, + { + "epoch": 0.9583226796215801, + "grad_norm": 45.127416317276364, + "learning_rate": 8.603840521683828e-06, + "loss": 3.5683, + "step": 11244 + }, + { + "epoch": 0.9584079093156056, + "grad_norm": 33.22728521884597, + "learning_rate": 8.603496794017511e-06, + "loss": 3.278, + "step": 11245 + }, + { + "epoch": 0.958493139009631, + "grad_norm": 94.849204146998, + "learning_rate": 8.603153030912238e-06, + "loss": 4.1427, + "step": 11246 + }, + { + "epoch": 0.9585783687036563, + "grad_norm": 35.826286313233595, + "learning_rate": 8.602809232371383e-06, + "loss": 3.394, + "step": 11247 + }, + { + "epoch": 0.9586635983976818, + "grad_norm": 53.44753758332551, + "learning_rate": 8.602465398398332e-06, + "loss": 4.3421, + "step": 11248 + }, + { + "epoch": 0.9587488280917071, + "grad_norm": 22.606696331667436, + "learning_rate": 8.602121528996466e-06, + "loss": 2.8883, + "step": 11249 + }, + { + "epoch": 0.9588340577857325, + "grad_norm": 31.65561715247751, + "learning_rate": 8.601777624169165e-06, + "loss": 3.7539, + "step": 11250 + }, + { + "epoch": 0.958919287479758, + "grad_norm": 37.95511433274675, + "learning_rate": 8.601433683919813e-06, + "loss": 4.3958, + "step": 11251 + }, + { + "epoch": 0.9590045171737833, + "grad_norm": 220.91930658200087, + "learning_rate": 8.60108970825179e-06, + "loss": 3.8362, + "step": 11252 + }, + { + "epoch": 0.9590897468678088, + "grad_norm": 58.56522511955536, + "learning_rate": 8.600745697168482e-06, + "loss": 4.8656, + "step": 11253 + }, + { + "epoch": 0.9591749765618341, + "grad_norm": 34.022224207257864, + "learning_rate": 8.600401650673272e-06, + "loss": 2.791, + "step": 11254 + }, + { + "epoch": 0.9592602062558595, + "grad_norm": 29.564691794606464, + "learning_rate": 8.600057568769542e-06, + "loss": 3.0743, + "step": 11255 + }, + { + "epoch": 0.959345435949885, + "grad_norm": 60.527886779962095, + "learning_rate": 8.599713451460676e-06, + "loss": 4.5795, + "step": 11256 + }, + { + "epoch": 0.9594306656439103, + "grad_norm": 70.26978258351626, + "learning_rate": 8.599369298750058e-06, + "loss": 5.5462, + "step": 11257 + }, + { + "epoch": 0.9595158953379357, + "grad_norm": 58.03202453801915, + "learning_rate": 8.599025110641073e-06, + "loss": 4.1195, + "step": 11258 + }, + { + "epoch": 0.9596011250319612, + "grad_norm": 34.59096612894098, + "learning_rate": 8.598680887137106e-06, + "loss": 3.5821, + "step": 11259 + }, + { + "epoch": 0.9596863547259865, + "grad_norm": 81.36311146292071, + "learning_rate": 8.598336628241544e-06, + "loss": 3.9987, + "step": 11260 + }, + { + "epoch": 0.959771584420012, + "grad_norm": 40.51144194223512, + "learning_rate": 8.59799233395777e-06, + "loss": 4.1221, + "step": 11261 + }, + { + "epoch": 0.9598568141140373, + "grad_norm": 45.80763375012873, + "learning_rate": 8.597648004289172e-06, + "loss": 3.584, + "step": 11262 + }, + { + "epoch": 0.9599420438080627, + "grad_norm": 24.120238832679565, + "learning_rate": 8.597303639239134e-06, + "loss": 3.2531, + "step": 11263 + }, + { + "epoch": 0.9600272735020882, + "grad_norm": 57.04473156315878, + "learning_rate": 8.596959238811045e-06, + "loss": 4.1162, + "step": 11264 + }, + { + "epoch": 0.9601125031961135, + "grad_norm": 45.572450951506255, + "learning_rate": 8.596614803008291e-06, + "loss": 3.7619, + "step": 11265 + }, + { + "epoch": 0.9601977328901389, + "grad_norm": 33.18018085260521, + "learning_rate": 8.59627033183426e-06, + "loss": 2.8694, + "step": 11266 + }, + { + "epoch": 0.9602829625841643, + "grad_norm": 76.52163362353004, + "learning_rate": 8.595925825292339e-06, + "loss": 5.269, + "step": 11267 + }, + { + "epoch": 0.9603681922781897, + "grad_norm": 60.368062261499176, + "learning_rate": 8.595581283385917e-06, + "loss": 2.2563, + "step": 11268 + }, + { + "epoch": 0.9604534219722152, + "grad_norm": 53.898727305963085, + "learning_rate": 8.59523670611838e-06, + "loss": 3.6389, + "step": 11269 + }, + { + "epoch": 0.9605386516662405, + "grad_norm": 61.191351915790484, + "learning_rate": 8.59489209349312e-06, + "loss": 3.351, + "step": 11270 + }, + { + "epoch": 0.9606238813602659, + "grad_norm": 37.90212177628853, + "learning_rate": 8.594547445513525e-06, + "loss": 4.4623, + "step": 11271 + }, + { + "epoch": 0.9607091110542914, + "grad_norm": 37.080074071307195, + "learning_rate": 8.594202762182984e-06, + "loss": 3.759, + "step": 11272 + }, + { + "epoch": 0.9607943407483167, + "grad_norm": 99.74050738351049, + "learning_rate": 8.593858043504885e-06, + "loss": 4.5442, + "step": 11273 + }, + { + "epoch": 0.9608795704423421, + "grad_norm": 75.31169641920366, + "learning_rate": 8.593513289482623e-06, + "loss": 5.7164, + "step": 11274 + }, + { + "epoch": 0.9609648001363675, + "grad_norm": 31.710430899458807, + "learning_rate": 8.593168500119584e-06, + "loss": 3.9509, + "step": 11275 + }, + { + "epoch": 0.9610500298303929, + "grad_norm": 29.1888733759726, + "learning_rate": 8.592823675419162e-06, + "loss": 3.1, + "step": 11276 + }, + { + "epoch": 0.9611352595244184, + "grad_norm": 52.175637870654384, + "learning_rate": 8.592478815384746e-06, + "loss": 4.3164, + "step": 11277 + }, + { + "epoch": 0.9612204892184437, + "grad_norm": 29.86258346359973, + "learning_rate": 8.592133920019729e-06, + "loss": 4.1928, + "step": 11278 + }, + { + "epoch": 0.9613057189124691, + "grad_norm": 42.2652213015669, + "learning_rate": 8.591788989327501e-06, + "loss": 4.1032, + "step": 11279 + }, + { + "epoch": 0.9613909486064945, + "grad_norm": 48.7932732868287, + "learning_rate": 8.591444023311455e-06, + "loss": 4.1872, + "step": 11280 + }, + { + "epoch": 0.9614761783005199, + "grad_norm": 23.481373305521874, + "learning_rate": 8.591099021974986e-06, + "loss": 2.4898, + "step": 11281 + }, + { + "epoch": 0.9615614079945453, + "grad_norm": 47.209394031208674, + "learning_rate": 8.590753985321485e-06, + "loss": 4.1545, + "step": 11282 + }, + { + "epoch": 0.9616466376885707, + "grad_norm": 23.724723953308363, + "learning_rate": 8.590408913354345e-06, + "loss": 2.7295, + "step": 11283 + }, + { + "epoch": 0.9617318673825961, + "grad_norm": 49.4241707298376, + "learning_rate": 8.590063806076961e-06, + "loss": 4.7549, + "step": 11284 + }, + { + "epoch": 0.9618170970766214, + "grad_norm": 46.84688992377699, + "learning_rate": 8.589718663492723e-06, + "loss": 3.9663, + "step": 11285 + }, + { + "epoch": 0.9619023267706469, + "grad_norm": 46.585780812272645, + "learning_rate": 8.589373485605031e-06, + "loss": 4.0593, + "step": 11286 + }, + { + "epoch": 0.9619875564646723, + "grad_norm": 42.5931423935424, + "learning_rate": 8.589028272417275e-06, + "loss": 3.0304, + "step": 11287 + }, + { + "epoch": 0.9620727861586977, + "grad_norm": 81.04402898041697, + "learning_rate": 8.588683023932853e-06, + "loss": 4.13, + "step": 11288 + }, + { + "epoch": 0.9621580158527231, + "grad_norm": 76.14794081371699, + "learning_rate": 8.588337740155162e-06, + "loss": 4.9239, + "step": 11289 + }, + { + "epoch": 0.9622432455467484, + "grad_norm": 41.02501871371747, + "learning_rate": 8.587992421087593e-06, + "loss": 4.156, + "step": 11290 + }, + { + "epoch": 0.9623284752407739, + "grad_norm": 36.606668539253015, + "learning_rate": 8.587647066733544e-06, + "loss": 4.2845, + "step": 11291 + }, + { + "epoch": 0.9624137049347993, + "grad_norm": 42.6586124445885, + "learning_rate": 8.587301677096411e-06, + "loss": 3.3313, + "step": 11292 + }, + { + "epoch": 0.9624989346288246, + "grad_norm": 78.9495717755637, + "learning_rate": 8.586956252179592e-06, + "loss": 3.7651, + "step": 11293 + }, + { + "epoch": 0.9625841643228501, + "grad_norm": 28.884420227011574, + "learning_rate": 8.586610791986484e-06, + "loss": 2.954, + "step": 11294 + }, + { + "epoch": 0.9626693940168755, + "grad_norm": 59.26395326120716, + "learning_rate": 8.586265296520485e-06, + "loss": 6.0336, + "step": 11295 + }, + { + "epoch": 0.9627546237109009, + "grad_norm": 37.36379160910231, + "learning_rate": 8.585919765784992e-06, + "loss": 4.2831, + "step": 11296 + }, + { + "epoch": 0.9628398534049263, + "grad_norm": 49.43200657792156, + "learning_rate": 8.5855741997834e-06, + "loss": 4.3109, + "step": 11297 + }, + { + "epoch": 0.9629250830989516, + "grad_norm": 66.87046159205558, + "learning_rate": 8.585228598519114e-06, + "loss": 4.2483, + "step": 11298 + }, + { + "epoch": 0.9630103127929771, + "grad_norm": 37.84875616608242, + "learning_rate": 8.584882961995527e-06, + "loss": 4.2279, + "step": 11299 + }, + { + "epoch": 0.9630955424870025, + "grad_norm": 46.410660971703294, + "learning_rate": 8.584537290216041e-06, + "loss": 4.2759, + "step": 11300 + }, + { + "epoch": 0.9631807721810278, + "grad_norm": 46.402221384440836, + "learning_rate": 8.584191583184056e-06, + "loss": 4.6586, + "step": 11301 + }, + { + "epoch": 0.9632660018750533, + "grad_norm": 38.96542691838995, + "learning_rate": 8.583845840902969e-06, + "loss": 3.8351, + "step": 11302 + }, + { + "epoch": 0.9633512315690786, + "grad_norm": 43.54650327643991, + "learning_rate": 8.583500063376184e-06, + "loss": 4.2295, + "step": 11303 + }, + { + "epoch": 0.9634364612631041, + "grad_norm": 38.30390257375504, + "learning_rate": 8.5831542506071e-06, + "loss": 3.4969, + "step": 11304 + }, + { + "epoch": 0.9635216909571295, + "grad_norm": 34.96808866177714, + "learning_rate": 8.582808402599116e-06, + "loss": 4.7969, + "step": 11305 + }, + { + "epoch": 0.9636069206511548, + "grad_norm": 71.43356568843883, + "learning_rate": 8.582462519355638e-06, + "loss": 4.9327, + "step": 11306 + }, + { + "epoch": 0.9636921503451803, + "grad_norm": 34.19811431113633, + "learning_rate": 8.582116600880061e-06, + "loss": 3.4215, + "step": 11307 + }, + { + "epoch": 0.9637773800392057, + "grad_norm": 29.77952789563288, + "learning_rate": 8.581770647175794e-06, + "loss": 3.9636, + "step": 11308 + }, + { + "epoch": 0.963862609733231, + "grad_norm": 68.9306155170284, + "learning_rate": 8.581424658246233e-06, + "loss": 4.4477, + "step": 11309 + }, + { + "epoch": 0.9639478394272565, + "grad_norm": 27.8294566767274, + "learning_rate": 8.581078634094786e-06, + "loss": 3.6849, + "step": 11310 + }, + { + "epoch": 0.9640330691212818, + "grad_norm": 51.43702480342914, + "learning_rate": 8.580732574724853e-06, + "loss": 4.7124, + "step": 11311 + }, + { + "epoch": 0.9641182988153073, + "grad_norm": 60.73411018673972, + "learning_rate": 8.580386480139837e-06, + "loss": 4.2232, + "step": 11312 + }, + { + "epoch": 0.9642035285093327, + "grad_norm": 163.99880094339264, + "learning_rate": 8.580040350343144e-06, + "loss": 4.3085, + "step": 11313 + }, + { + "epoch": 0.964288758203358, + "grad_norm": 29.172264764368194, + "learning_rate": 8.579694185338177e-06, + "loss": 3.6094, + "step": 11314 + }, + { + "epoch": 0.9643739878973835, + "grad_norm": 54.294496607051805, + "learning_rate": 8.579347985128338e-06, + "loss": 3.4668, + "step": 11315 + }, + { + "epoch": 0.9644592175914088, + "grad_norm": 38.33555036297991, + "learning_rate": 8.579001749717035e-06, + "loss": 3.3546, + "step": 11316 + }, + { + "epoch": 0.9645444472854342, + "grad_norm": 50.81592578075325, + "learning_rate": 8.578655479107671e-06, + "loss": 3.8403, + "step": 11317 + }, + { + "epoch": 0.9646296769794597, + "grad_norm": 31.469272196948648, + "learning_rate": 8.578309173303654e-06, + "loss": 3.7793, + "step": 11318 + }, + { + "epoch": 0.964714906673485, + "grad_norm": 54.92411573408345, + "learning_rate": 8.577962832308388e-06, + "loss": 4.1061, + "step": 11319 + }, + { + "epoch": 0.9648001363675104, + "grad_norm": 35.973781367275606, + "learning_rate": 8.57761645612528e-06, + "loss": 4.4084, + "step": 11320 + }, + { + "epoch": 0.9648853660615359, + "grad_norm": 61.44228961473879, + "learning_rate": 8.577270044757732e-06, + "loss": 4.7881, + "step": 11321 + }, + { + "epoch": 0.9649705957555612, + "grad_norm": 118.19708359257906, + "learning_rate": 8.576923598209159e-06, + "loss": 3.9063, + "step": 11322 + }, + { + "epoch": 0.9650558254495867, + "grad_norm": 35.940027069804685, + "learning_rate": 8.576577116482962e-06, + "loss": 3.8165, + "step": 11323 + }, + { + "epoch": 0.965141055143612, + "grad_norm": 59.721215865906, + "learning_rate": 8.57623059958255e-06, + "loss": 4.2233, + "step": 11324 + }, + { + "epoch": 0.9652262848376374, + "grad_norm": 89.63981897118633, + "learning_rate": 8.575884047511332e-06, + "loss": 3.7937, + "step": 11325 + }, + { + "epoch": 0.9653115145316629, + "grad_norm": 34.06675734379415, + "learning_rate": 8.575537460272714e-06, + "loss": 3.7969, + "step": 11326 + }, + { + "epoch": 0.9653967442256882, + "grad_norm": 28.92030375856374, + "learning_rate": 8.575190837870107e-06, + "loss": 2.1981, + "step": 11327 + }, + { + "epoch": 0.9654819739197136, + "grad_norm": 32.69275217528337, + "learning_rate": 8.574844180306918e-06, + "loss": 3.5646, + "step": 11328 + }, + { + "epoch": 0.965567203613739, + "grad_norm": 34.96965634216925, + "learning_rate": 8.574497487586556e-06, + "loss": 3.1596, + "step": 11329 + }, + { + "epoch": 0.9656524333077644, + "grad_norm": 31.314495152098036, + "learning_rate": 8.574150759712432e-06, + "loss": 3.449, + "step": 11330 + }, + { + "epoch": 0.9657376630017899, + "grad_norm": 41.63826734866096, + "learning_rate": 8.573803996687957e-06, + "loss": 4.0085, + "step": 11331 + }, + { + "epoch": 0.9658228926958152, + "grad_norm": 38.97951688575567, + "learning_rate": 8.57345719851654e-06, + "loss": 3.6699, + "step": 11332 + }, + { + "epoch": 0.9659081223898406, + "grad_norm": 66.47258135059153, + "learning_rate": 8.573110365201589e-06, + "loss": 3.8762, + "step": 11333 + }, + { + "epoch": 0.965993352083866, + "grad_norm": 76.2970777271943, + "learning_rate": 8.57276349674652e-06, + "loss": 4.2733, + "step": 11334 + }, + { + "epoch": 0.9660785817778914, + "grad_norm": 59.99905934191728, + "learning_rate": 8.57241659315474e-06, + "loss": 5.1853, + "step": 11335 + }, + { + "epoch": 0.9661638114719168, + "grad_norm": 44.50551105476749, + "learning_rate": 8.572069654429663e-06, + "loss": 3.3459, + "step": 11336 + }, + { + "epoch": 0.9662490411659422, + "grad_norm": 58.0068110305859, + "learning_rate": 8.5717226805747e-06, + "loss": 5.1573, + "step": 11337 + }, + { + "epoch": 0.9663342708599676, + "grad_norm": 46.40391908981176, + "learning_rate": 8.571375671593265e-06, + "loss": 3.0496, + "step": 11338 + }, + { + "epoch": 0.9664195005539931, + "grad_norm": 75.68838871143826, + "learning_rate": 8.571028627488766e-06, + "loss": 5.2372, + "step": 11339 + }, + { + "epoch": 0.9665047302480184, + "grad_norm": 90.09871529484985, + "learning_rate": 8.570681548264624e-06, + "loss": 5.7564, + "step": 11340 + }, + { + "epoch": 0.9665899599420438, + "grad_norm": 37.77509787550908, + "learning_rate": 8.570334433924245e-06, + "loss": 3.2541, + "step": 11341 + }, + { + "epoch": 0.9666751896360692, + "grad_norm": 38.51529345116559, + "learning_rate": 8.569987284471047e-06, + "loss": 3.6403, + "step": 11342 + }, + { + "epoch": 0.9667604193300946, + "grad_norm": 43.84083043827391, + "learning_rate": 8.56964009990844e-06, + "loss": 4.3988, + "step": 11343 + }, + { + "epoch": 0.96684564902412, + "grad_norm": 71.40421780457585, + "learning_rate": 8.569292880239843e-06, + "loss": 4.8065, + "step": 11344 + }, + { + "epoch": 0.9669308787181454, + "grad_norm": 105.36095704541998, + "learning_rate": 8.56894562546867e-06, + "loss": 3.208, + "step": 11345 + }, + { + "epoch": 0.9670161084121708, + "grad_norm": 31.040361900178812, + "learning_rate": 8.568598335598332e-06, + "loss": 3.7772, + "step": 11346 + }, + { + "epoch": 0.9671013381061963, + "grad_norm": 30.534592445616173, + "learning_rate": 8.568251010632249e-06, + "loss": 4.3136, + "step": 11347 + }, + { + "epoch": 0.9671865678002216, + "grad_norm": 37.38580689757697, + "learning_rate": 8.567903650573834e-06, + "loss": 2.5163, + "step": 11348 + }, + { + "epoch": 0.967271797494247, + "grad_norm": 31.36850465639077, + "learning_rate": 8.567556255426506e-06, + "loss": 3.0073, + "step": 11349 + }, + { + "epoch": 0.9673570271882724, + "grad_norm": 37.80683252694545, + "learning_rate": 8.56720882519368e-06, + "loss": 3.1395, + "step": 11350 + }, + { + "epoch": 0.9674422568822978, + "grad_norm": 30.623397421445198, + "learning_rate": 8.566861359878772e-06, + "loss": 3.8705, + "step": 11351 + }, + { + "epoch": 0.9675274865763231, + "grad_norm": 32.636863767356, + "learning_rate": 8.5665138594852e-06, + "loss": 3.3308, + "step": 11352 + }, + { + "epoch": 0.9676127162703486, + "grad_norm": 48.29025464777401, + "learning_rate": 8.56616632401638e-06, + "loss": 5.4778, + "step": 11353 + }, + { + "epoch": 0.967697945964374, + "grad_norm": 46.59168006448831, + "learning_rate": 8.565818753475733e-06, + "loss": 4.1915, + "step": 11354 + }, + { + "epoch": 0.9677831756583994, + "grad_norm": 48.30987150618491, + "learning_rate": 8.565471147866676e-06, + "loss": 4.7608, + "step": 11355 + }, + { + "epoch": 0.9678684053524248, + "grad_norm": 28.597830363278025, + "learning_rate": 8.565123507192626e-06, + "loss": 3.0829, + "step": 11356 + }, + { + "epoch": 0.9679536350464502, + "grad_norm": 29.3070021555648, + "learning_rate": 8.564775831457002e-06, + "loss": 2.142, + "step": 11357 + }, + { + "epoch": 0.9680388647404756, + "grad_norm": 47.045471426054206, + "learning_rate": 8.564428120663225e-06, + "loss": 3.7738, + "step": 11358 + }, + { + "epoch": 0.968124094434501, + "grad_norm": 32.79392314020031, + "learning_rate": 8.564080374814714e-06, + "loss": 3.2903, + "step": 11359 + }, + { + "epoch": 0.9682093241285263, + "grad_norm": 67.05781240075167, + "learning_rate": 8.563732593914888e-06, + "loss": 2.5664, + "step": 11360 + }, + { + "epoch": 0.9682945538225518, + "grad_norm": 33.774422735259954, + "learning_rate": 8.56338477796717e-06, + "loss": 3.6798, + "step": 11361 + }, + { + "epoch": 0.9683797835165772, + "grad_norm": 81.0345012717534, + "learning_rate": 8.563036926974978e-06, + "loss": 6.3993, + "step": 11362 + }, + { + "epoch": 0.9684650132106025, + "grad_norm": 31.46394014041239, + "learning_rate": 8.562689040941732e-06, + "loss": 3.0894, + "step": 11363 + }, + { + "epoch": 0.968550242904628, + "grad_norm": 31.434200125796572, + "learning_rate": 8.562341119870856e-06, + "loss": 3.4172, + "step": 11364 + }, + { + "epoch": 0.9686354725986533, + "grad_norm": 29.490604218425393, + "learning_rate": 8.56199316376577e-06, + "loss": 2.343, + "step": 11365 + }, + { + "epoch": 0.9687207022926788, + "grad_norm": 47.648624168845, + "learning_rate": 8.561645172629898e-06, + "loss": 4.5813, + "step": 11366 + }, + { + "epoch": 0.9688059319867042, + "grad_norm": 46.331436304399176, + "learning_rate": 8.561297146466661e-06, + "loss": 3.7213, + "step": 11367 + }, + { + "epoch": 0.9688911616807295, + "grad_norm": 57.39961006114845, + "learning_rate": 8.560949085279479e-06, + "loss": 3.8956, + "step": 11368 + }, + { + "epoch": 0.968976391374755, + "grad_norm": 49.442073556632025, + "learning_rate": 8.560600989071781e-06, + "loss": 4.6192, + "step": 11369 + }, + { + "epoch": 0.9690616210687804, + "grad_norm": 44.67713774024535, + "learning_rate": 8.560252857846985e-06, + "loss": 3.4231, + "step": 11370 + }, + { + "epoch": 0.9691468507628057, + "grad_norm": 33.186122918108204, + "learning_rate": 8.559904691608519e-06, + "loss": 3.3902, + "step": 11371 + }, + { + "epoch": 0.9692320804568312, + "grad_norm": 37.10456396563853, + "learning_rate": 8.559556490359802e-06, + "loss": 3.7051, + "step": 11372 + }, + { + "epoch": 0.9693173101508565, + "grad_norm": 86.28318879977977, + "learning_rate": 8.559208254104263e-06, + "loss": 4.2429, + "step": 11373 + }, + { + "epoch": 0.969402539844882, + "grad_norm": 42.44179299616674, + "learning_rate": 8.558859982845324e-06, + "loss": 3.7382, + "step": 11374 + }, + { + "epoch": 0.9694877695389074, + "grad_norm": 43.917534869049405, + "learning_rate": 8.558511676586413e-06, + "loss": 2.9371, + "step": 11375 + }, + { + "epoch": 0.9695729992329327, + "grad_norm": 149.087469468157, + "learning_rate": 8.558163335330953e-06, + "loss": 4.2499, + "step": 11376 + }, + { + "epoch": 0.9696582289269582, + "grad_norm": 32.79066480913152, + "learning_rate": 8.55781495908237e-06, + "loss": 2.9422, + "step": 11377 + }, + { + "epoch": 0.9697434586209835, + "grad_norm": 40.760832317781286, + "learning_rate": 8.557466547844091e-06, + "loss": 4.1482, + "step": 11378 + }, + { + "epoch": 0.9698286883150089, + "grad_norm": 27.89191840864219, + "learning_rate": 8.557118101619541e-06, + "loss": 4.3109, + "step": 11379 + }, + { + "epoch": 0.9699139180090344, + "grad_norm": 74.53557039790813, + "learning_rate": 8.556769620412151e-06, + "loss": 5.1554, + "step": 11380 + }, + { + "epoch": 0.9699991477030597, + "grad_norm": 18.678445700106757, + "learning_rate": 8.556421104225343e-06, + "loss": 2.4411, + "step": 11381 + }, + { + "epoch": 0.9700843773970852, + "grad_norm": 43.297558762527004, + "learning_rate": 8.556072553062546e-06, + "loss": 4.1889, + "step": 11382 + }, + { + "epoch": 0.9701696070911106, + "grad_norm": 80.84504163055874, + "learning_rate": 8.55572396692719e-06, + "loss": 4.1428, + "step": 11383 + }, + { + "epoch": 0.9702548367851359, + "grad_norm": 47.29613186931303, + "learning_rate": 8.555375345822701e-06, + "loss": 4.5758, + "step": 11384 + }, + { + "epoch": 0.9703400664791614, + "grad_norm": 36.11953377823128, + "learning_rate": 8.555026689752508e-06, + "loss": 3.6127, + "step": 11385 + }, + { + "epoch": 0.9704252961731867, + "grad_norm": 71.10194682667739, + "learning_rate": 8.55467799872004e-06, + "loss": 3.1852, + "step": 11386 + }, + { + "epoch": 0.9705105258672121, + "grad_norm": 22.74549293426004, + "learning_rate": 8.554329272728725e-06, + "loss": 3.2271, + "step": 11387 + }, + { + "epoch": 0.9705957555612376, + "grad_norm": 41.20682686955063, + "learning_rate": 8.553980511781996e-06, + "loss": 2.3394, + "step": 11388 + }, + { + "epoch": 0.9706809852552629, + "grad_norm": 32.007231368346766, + "learning_rate": 8.553631715883282e-06, + "loss": 3.8593, + "step": 11389 + }, + { + "epoch": 0.9707662149492884, + "grad_norm": 38.598621872099926, + "learning_rate": 8.55328288503601e-06, + "loss": 3.8509, + "step": 11390 + }, + { + "epoch": 0.9708514446433137, + "grad_norm": 42.15874974705101, + "learning_rate": 8.552934019243613e-06, + "loss": 1.787, + "step": 11391 + }, + { + "epoch": 0.9709366743373391, + "grad_norm": 42.781037978125845, + "learning_rate": 8.552585118509522e-06, + "loss": 3.791, + "step": 11392 + }, + { + "epoch": 0.9710219040313646, + "grad_norm": 192.54977862664302, + "learning_rate": 8.552236182837169e-06, + "loss": 3.2013, + "step": 11393 + }, + { + "epoch": 0.9711071337253899, + "grad_norm": 101.48591270846654, + "learning_rate": 8.551887212229983e-06, + "loss": 5.1449, + "step": 11394 + }, + { + "epoch": 0.9711923634194153, + "grad_norm": 47.67686996837462, + "learning_rate": 8.5515382066914e-06, + "loss": 4.417, + "step": 11395 + }, + { + "epoch": 0.9712775931134408, + "grad_norm": 76.4407124470466, + "learning_rate": 8.551189166224848e-06, + "loss": 3.8304, + "step": 11396 + }, + { + "epoch": 0.9713628228074661, + "grad_norm": 64.94377409868721, + "learning_rate": 8.55084009083376e-06, + "loss": 5.4751, + "step": 11397 + }, + { + "epoch": 0.9714480525014915, + "grad_norm": 122.2174876457461, + "learning_rate": 8.550490980521574e-06, + "loss": 4.6825, + "step": 11398 + }, + { + "epoch": 0.9715332821955169, + "grad_norm": 90.51993363435216, + "learning_rate": 8.550141835291719e-06, + "loss": 5.1864, + "step": 11399 + }, + { + "epoch": 0.9716185118895423, + "grad_norm": 37.59169587910452, + "learning_rate": 8.549792655147629e-06, + "loss": 3.6898, + "step": 11400 + }, + { + "epoch": 0.9717037415835678, + "grad_norm": 41.24585181775269, + "learning_rate": 8.54944344009274e-06, + "loss": 3.621, + "step": 11401 + }, + { + "epoch": 0.9717889712775931, + "grad_norm": 63.30644928090416, + "learning_rate": 8.549094190130483e-06, + "loss": 3.8958, + "step": 11402 + }, + { + "epoch": 0.9718742009716185, + "grad_norm": 40.08059249401169, + "learning_rate": 8.548744905264296e-06, + "loss": 3.8951, + "step": 11403 + }, + { + "epoch": 0.9719594306656439, + "grad_norm": 48.8018894876076, + "learning_rate": 8.548395585497614e-06, + "loss": 4.341, + "step": 11404 + }, + { + "epoch": 0.9720446603596693, + "grad_norm": 63.473488360305176, + "learning_rate": 8.54804623083387e-06, + "loss": 5.8051, + "step": 11405 + }, + { + "epoch": 0.9721298900536947, + "grad_norm": 32.29129323129928, + "learning_rate": 8.547696841276503e-06, + "loss": 4.9059, + "step": 11406 + }, + { + "epoch": 0.9722151197477201, + "grad_norm": 40.30113509808738, + "learning_rate": 8.547347416828946e-06, + "loss": 3.9776, + "step": 11407 + }, + { + "epoch": 0.9723003494417455, + "grad_norm": 36.00080470730599, + "learning_rate": 8.546997957494637e-06, + "loss": 4.5874, + "step": 11408 + }, + { + "epoch": 0.972385579135771, + "grad_norm": 71.24332125720842, + "learning_rate": 8.546648463277012e-06, + "loss": 3.8832, + "step": 11409 + }, + { + "epoch": 0.9724708088297963, + "grad_norm": 39.93768362103277, + "learning_rate": 8.54629893417951e-06, + "loss": 3.1672, + "step": 11410 + }, + { + "epoch": 0.9725560385238217, + "grad_norm": 43.04174208597937, + "learning_rate": 8.545949370205566e-06, + "loss": 3.4825, + "step": 11411 + }, + { + "epoch": 0.9726412682178471, + "grad_norm": 67.03007227752701, + "learning_rate": 8.54559977135862e-06, + "loss": 4.6758, + "step": 11412 + }, + { + "epoch": 0.9727264979118725, + "grad_norm": 43.57638621057074, + "learning_rate": 8.545250137642108e-06, + "loss": 3.9104, + "step": 11413 + }, + { + "epoch": 0.9728117276058978, + "grad_norm": 37.02483838642878, + "learning_rate": 8.54490046905947e-06, + "loss": 3.6123, + "step": 11414 + }, + { + "epoch": 0.9728969572999233, + "grad_norm": 63.73604485729071, + "learning_rate": 8.544550765614144e-06, + "loss": 4.0697, + "step": 11415 + }, + { + "epoch": 0.9729821869939487, + "grad_norm": 68.48116048113839, + "learning_rate": 8.544201027309572e-06, + "loss": 4.3646, + "step": 11416 + }, + { + "epoch": 0.9730674166879741, + "grad_norm": 35.89529187460823, + "learning_rate": 8.543851254149187e-06, + "loss": 3.2618, + "step": 11417 + }, + { + "epoch": 0.9731526463819995, + "grad_norm": 30.107169459846208, + "learning_rate": 8.543501446136436e-06, + "loss": 3.5327, + "step": 11418 + }, + { + "epoch": 0.9732378760760249, + "grad_norm": 35.087075958444764, + "learning_rate": 8.543151603274755e-06, + "loss": 3.1562, + "step": 11419 + }, + { + "epoch": 0.9733231057700503, + "grad_norm": 40.495270532624616, + "learning_rate": 8.542801725567587e-06, + "loss": 3.4975, + "step": 11420 + }, + { + "epoch": 0.9734083354640757, + "grad_norm": 31.087401882756875, + "learning_rate": 8.542451813018372e-06, + "loss": 3.147, + "step": 11421 + }, + { + "epoch": 0.973493565158101, + "grad_norm": 43.112718618859354, + "learning_rate": 8.54210186563055e-06, + "loss": 3.4884, + "step": 11422 + }, + { + "epoch": 0.9735787948521265, + "grad_norm": 66.21733658123746, + "learning_rate": 8.541751883407565e-06, + "loss": 4.3078, + "step": 11423 + }, + { + "epoch": 0.9736640245461519, + "grad_norm": 35.024317498629365, + "learning_rate": 8.541401866352856e-06, + "loss": 4.3258, + "step": 11424 + }, + { + "epoch": 0.9737492542401773, + "grad_norm": 38.26260205613921, + "learning_rate": 8.54105181446987e-06, + "loss": 2.4492, + "step": 11425 + }, + { + "epoch": 0.9738344839342027, + "grad_norm": 33.30931751928733, + "learning_rate": 8.540701727762044e-06, + "loss": 3.7492, + "step": 11426 + }, + { + "epoch": 0.973919713628228, + "grad_norm": 47.934842054941974, + "learning_rate": 8.540351606232822e-06, + "loss": 4.4024, + "step": 11427 + }, + { + "epoch": 0.9740049433222535, + "grad_norm": 49.78244629037212, + "learning_rate": 8.540001449885652e-06, + "loss": 3.5505, + "step": 11428 + }, + { + "epoch": 0.9740901730162789, + "grad_norm": 28.674884906049062, + "learning_rate": 8.53965125872397e-06, + "loss": 3.6174, + "step": 11429 + }, + { + "epoch": 0.9741754027103042, + "grad_norm": 44.827675699256496, + "learning_rate": 8.539301032751228e-06, + "loss": 3.1584, + "step": 11430 + }, + { + "epoch": 0.9742606324043297, + "grad_norm": 45.773219996271436, + "learning_rate": 8.538950771970867e-06, + "loss": 2.5274, + "step": 11431 + }, + { + "epoch": 0.974345862098355, + "grad_norm": 39.0725773328893, + "learning_rate": 8.538600476386329e-06, + "loss": 3.6479, + "step": 11432 + }, + { + "epoch": 0.9744310917923804, + "grad_norm": 33.75600153133126, + "learning_rate": 8.538250146001064e-06, + "loss": 3.2986, + "step": 11433 + }, + { + "epoch": 0.9745163214864059, + "grad_norm": 167.46953876152037, + "learning_rate": 8.537899780818512e-06, + "loss": 5.9332, + "step": 11434 + }, + { + "epoch": 0.9746015511804312, + "grad_norm": 30.31878515102019, + "learning_rate": 8.537549380842124e-06, + "loss": 3.1974, + "step": 11435 + }, + { + "epoch": 0.9746867808744567, + "grad_norm": 37.38495522289492, + "learning_rate": 8.537198946075342e-06, + "loss": 3.8484, + "step": 11436 + }, + { + "epoch": 0.9747720105684821, + "grad_norm": 54.76991797949736, + "learning_rate": 8.536848476521616e-06, + "loss": 3.7363, + "step": 11437 + }, + { + "epoch": 0.9748572402625074, + "grad_norm": 38.94997899628408, + "learning_rate": 8.536497972184388e-06, + "loss": 3.7353, + "step": 11438 + }, + { + "epoch": 0.9749424699565329, + "grad_norm": 38.36536224879352, + "learning_rate": 8.536147433067109e-06, + "loss": 4.4801, + "step": 11439 + }, + { + "epoch": 0.9750276996505582, + "grad_norm": 32.2296915219442, + "learning_rate": 8.535796859173224e-06, + "loss": 3.4164, + "step": 11440 + }, + { + "epoch": 0.9751129293445836, + "grad_norm": 32.83861727852214, + "learning_rate": 8.535446250506183e-06, + "loss": 3.8258, + "step": 11441 + }, + { + "epoch": 0.9751981590386091, + "grad_norm": 39.3396175763137, + "learning_rate": 8.535095607069432e-06, + "loss": 3.615, + "step": 11442 + }, + { + "epoch": 0.9752833887326344, + "grad_norm": 30.782239534283423, + "learning_rate": 8.534744928866422e-06, + "loss": 2.9958, + "step": 11443 + }, + { + "epoch": 0.9753686184266599, + "grad_norm": 49.94623263896482, + "learning_rate": 8.534394215900598e-06, + "loss": 4.5836, + "step": 11444 + }, + { + "epoch": 0.9754538481206853, + "grad_norm": 56.05747801640022, + "learning_rate": 8.534043468175414e-06, + "loss": 5.1768, + "step": 11445 + }, + { + "epoch": 0.9755390778147106, + "grad_norm": 34.15510783913783, + "learning_rate": 8.533692685694315e-06, + "loss": 3.7002, + "step": 11446 + }, + { + "epoch": 0.9756243075087361, + "grad_norm": 56.500950294268904, + "learning_rate": 8.533341868460754e-06, + "loss": 4.6969, + "step": 11447 + }, + { + "epoch": 0.9757095372027614, + "grad_norm": 46.91852099217514, + "learning_rate": 8.532991016478179e-06, + "loss": 3.4261, + "step": 11448 + }, + { + "epoch": 0.9757947668967868, + "grad_norm": 34.05444395617053, + "learning_rate": 8.532640129750041e-06, + "loss": 4.0882, + "step": 11449 + }, + { + "epoch": 0.9758799965908123, + "grad_norm": 38.983268059577945, + "learning_rate": 8.53228920827979e-06, + "loss": 3.3018, + "step": 11450 + }, + { + "epoch": 0.9759652262848376, + "grad_norm": 44.12692778625681, + "learning_rate": 8.531938252070879e-06, + "loss": 4.3876, + "step": 11451 + }, + { + "epoch": 0.9760504559788631, + "grad_norm": 29.93010297924025, + "learning_rate": 8.531587261126759e-06, + "loss": 3.4225, + "step": 11452 + }, + { + "epoch": 0.9761356856728884, + "grad_norm": 42.45922056279002, + "learning_rate": 8.53123623545088e-06, + "loss": 3.4967, + "step": 11453 + }, + { + "epoch": 0.9762209153669138, + "grad_norm": 50.18615424762317, + "learning_rate": 8.5308851750467e-06, + "loss": 4.1519, + "step": 11454 + }, + { + "epoch": 0.9763061450609393, + "grad_norm": 20.68503469676828, + "learning_rate": 8.530534079917664e-06, + "loss": 2.6317, + "step": 11455 + }, + { + "epoch": 0.9763913747549646, + "grad_norm": 49.48440022397437, + "learning_rate": 8.53018295006723e-06, + "loss": 4.6519, + "step": 11456 + }, + { + "epoch": 0.97647660444899, + "grad_norm": 38.87435210716184, + "learning_rate": 8.529831785498847e-06, + "loss": 3.809, + "step": 11457 + }, + { + "epoch": 0.9765618341430155, + "grad_norm": 90.32319284039822, + "learning_rate": 8.529480586215972e-06, + "loss": 3.6148, + "step": 11458 + }, + { + "epoch": 0.9766470638370408, + "grad_norm": 59.97855968022985, + "learning_rate": 8.529129352222058e-06, + "loss": 4.1296, + "step": 11459 + }, + { + "epoch": 0.9767322935310663, + "grad_norm": 31.747711980229568, + "learning_rate": 8.52877808352056e-06, + "loss": 2.958, + "step": 11460 + }, + { + "epoch": 0.9768175232250916, + "grad_norm": 64.86912655425526, + "learning_rate": 8.528426780114931e-06, + "loss": 5.0722, + "step": 11461 + }, + { + "epoch": 0.976902752919117, + "grad_norm": 25.608993368910234, + "learning_rate": 8.528075442008628e-06, + "loss": 3.3507, + "step": 11462 + }, + { + "epoch": 0.9769879826131425, + "grad_norm": 46.51641845480393, + "learning_rate": 8.527724069205103e-06, + "loss": 4.1149, + "step": 11463 + }, + { + "epoch": 0.9770732123071678, + "grad_norm": 56.29852511926101, + "learning_rate": 8.527372661707814e-06, + "loss": 4.355, + "step": 11464 + }, + { + "epoch": 0.9771584420011932, + "grad_norm": 52.98819876979663, + "learning_rate": 8.527021219520217e-06, + "loss": 3.5928, + "step": 11465 + }, + { + "epoch": 0.9772436716952186, + "grad_norm": 17.72672112149774, + "learning_rate": 8.526669742645768e-06, + "loss": 2.036, + "step": 11466 + }, + { + "epoch": 0.977328901389244, + "grad_norm": 83.47159287634369, + "learning_rate": 8.526318231087924e-06, + "loss": 5.4754, + "step": 11467 + }, + { + "epoch": 0.9774141310832695, + "grad_norm": 37.995780820204196, + "learning_rate": 8.525966684850139e-06, + "loss": 3.5471, + "step": 11468 + }, + { + "epoch": 0.9774993607772948, + "grad_norm": 57.90525570638177, + "learning_rate": 8.525615103935875e-06, + "loss": 5.1711, + "step": 11469 + }, + { + "epoch": 0.9775845904713202, + "grad_norm": 33.60857173208037, + "learning_rate": 8.525263488348587e-06, + "loss": 2.818, + "step": 11470 + }, + { + "epoch": 0.9776698201653456, + "grad_norm": 47.07876277964723, + "learning_rate": 8.524911838091734e-06, + "loss": 3.9066, + "step": 11471 + }, + { + "epoch": 0.977755049859371, + "grad_norm": 42.33591590065146, + "learning_rate": 8.524560153168773e-06, + "loss": 3.9024, + "step": 11472 + }, + { + "epoch": 0.9778402795533964, + "grad_norm": 38.096995996483116, + "learning_rate": 8.524208433583163e-06, + "loss": 2.9697, + "step": 11473 + }, + { + "epoch": 0.9779255092474218, + "grad_norm": 52.375325220231545, + "learning_rate": 8.523856679338363e-06, + "loss": 4.3588, + "step": 11474 + }, + { + "epoch": 0.9780107389414472, + "grad_norm": 58.79532878228758, + "learning_rate": 8.523504890437834e-06, + "loss": 4.9554, + "step": 11475 + }, + { + "epoch": 0.9780959686354725, + "grad_norm": 47.58690373450339, + "learning_rate": 8.523153066885035e-06, + "loss": 4.2516, + "step": 11476 + }, + { + "epoch": 0.978181198329498, + "grad_norm": 62.65620528880854, + "learning_rate": 8.522801208683423e-06, + "loss": 4.1476, + "step": 11477 + }, + { + "epoch": 0.9782664280235234, + "grad_norm": 44.17937255318767, + "learning_rate": 8.522449315836462e-06, + "loss": 3.1878, + "step": 11478 + }, + { + "epoch": 0.9783516577175488, + "grad_norm": 125.08890680943601, + "learning_rate": 8.522097388347613e-06, + "loss": 2.7326, + "step": 11479 + }, + { + "epoch": 0.9784368874115742, + "grad_norm": 30.359339945425436, + "learning_rate": 8.521745426220336e-06, + "loss": 2.6694, + "step": 11480 + }, + { + "epoch": 0.9785221171055996, + "grad_norm": 31.743803338799918, + "learning_rate": 8.52139342945809e-06, + "loss": 3.7628, + "step": 11481 + }, + { + "epoch": 0.978607346799625, + "grad_norm": 55.651237804650435, + "learning_rate": 8.52104139806434e-06, + "loss": 4.5292, + "step": 11482 + }, + { + "epoch": 0.9786925764936504, + "grad_norm": 67.78961871887104, + "learning_rate": 8.520689332042546e-06, + "loss": 4.0577, + "step": 11483 + }, + { + "epoch": 0.9787778061876757, + "grad_norm": 51.52782865983822, + "learning_rate": 8.520337231396172e-06, + "loss": 3.2048, + "step": 11484 + }, + { + "epoch": 0.9788630358817012, + "grad_norm": 93.76309907895035, + "learning_rate": 8.519985096128682e-06, + "loss": 4.2435, + "step": 11485 + }, + { + "epoch": 0.9789482655757266, + "grad_norm": 41.6016237535203, + "learning_rate": 8.519632926243535e-06, + "loss": 3.0831, + "step": 11486 + }, + { + "epoch": 0.979033495269752, + "grad_norm": 53.672244011948536, + "learning_rate": 8.519280721744198e-06, + "loss": 4.7556, + "step": 11487 + }, + { + "epoch": 0.9791187249637774, + "grad_norm": 43.7143973419985, + "learning_rate": 8.518928482634133e-06, + "loss": 3.8739, + "step": 11488 + }, + { + "epoch": 0.9792039546578027, + "grad_norm": 30.001780119017738, + "learning_rate": 8.518576208916805e-06, + "loss": 3.6983, + "step": 11489 + }, + { + "epoch": 0.9792891843518282, + "grad_norm": 33.73573826920355, + "learning_rate": 8.518223900595678e-06, + "loss": 3.7789, + "step": 11490 + }, + { + "epoch": 0.9793744140458536, + "grad_norm": 38.10798217357058, + "learning_rate": 8.517871557674215e-06, + "loss": 3.6388, + "step": 11491 + }, + { + "epoch": 0.9794596437398789, + "grad_norm": 35.54157547063222, + "learning_rate": 8.517519180155886e-06, + "loss": 3.477, + "step": 11492 + }, + { + "epoch": 0.9795448734339044, + "grad_norm": 32.23465458483856, + "learning_rate": 8.517166768044151e-06, + "loss": 2.4589, + "step": 11493 + }, + { + "epoch": 0.9796301031279298, + "grad_norm": 47.74564047766921, + "learning_rate": 8.516814321342482e-06, + "loss": 5.0466, + "step": 11494 + }, + { + "epoch": 0.9797153328219552, + "grad_norm": 37.54298169008075, + "learning_rate": 8.516461840054338e-06, + "loss": 4.0622, + "step": 11495 + }, + { + "epoch": 0.9798005625159806, + "grad_norm": 106.74720508392086, + "learning_rate": 8.516109324183191e-06, + "loss": 3.7901, + "step": 11496 + }, + { + "epoch": 0.9798857922100059, + "grad_norm": 59.91737605274263, + "learning_rate": 8.515756773732507e-06, + "loss": 4.9866, + "step": 11497 + }, + { + "epoch": 0.9799710219040314, + "grad_norm": 62.7680225343307, + "learning_rate": 8.51540418870575e-06, + "loss": 4.1586, + "step": 11498 + }, + { + "epoch": 0.9800562515980568, + "grad_norm": 40.50509903675385, + "learning_rate": 8.515051569106389e-06, + "loss": 4.0071, + "step": 11499 + }, + { + "epoch": 0.9801414812920821, + "grad_norm": 29.757059238777554, + "learning_rate": 8.514698914937894e-06, + "loss": 3.8106, + "step": 11500 + }, + { + "epoch": 0.9802267109861076, + "grad_norm": 184.7046987376714, + "learning_rate": 8.514346226203733e-06, + "loss": 5.2688, + "step": 11501 + }, + { + "epoch": 0.9803119406801329, + "grad_norm": 56.43376989806921, + "learning_rate": 8.513993502907372e-06, + "loss": 4.604, + "step": 11502 + }, + { + "epoch": 0.9803971703741584, + "grad_norm": 46.6850341160854, + "learning_rate": 8.513640745052281e-06, + "loss": 3.5193, + "step": 11503 + }, + { + "epoch": 0.9804824000681838, + "grad_norm": 77.21121798888935, + "learning_rate": 8.513287952641928e-06, + "loss": 4.0945, + "step": 11504 + }, + { + "epoch": 0.9805676297622091, + "grad_norm": 42.774005715026085, + "learning_rate": 8.512935125679786e-06, + "loss": 3.5237, + "step": 11505 + }, + { + "epoch": 0.9806528594562346, + "grad_norm": 38.46858589280889, + "learning_rate": 8.512582264169325e-06, + "loss": 4.2214, + "step": 11506 + }, + { + "epoch": 0.98073808915026, + "grad_norm": 27.57620352870993, + "learning_rate": 8.51222936811401e-06, + "loss": 2.7387, + "step": 11507 + }, + { + "epoch": 0.9808233188442853, + "grad_norm": 32.91702250759723, + "learning_rate": 8.511876437517315e-06, + "loss": 3.3344, + "step": 11508 + }, + { + "epoch": 0.9809085485383108, + "grad_norm": 56.22182758018048, + "learning_rate": 8.511523472382712e-06, + "loss": 3.0344, + "step": 11509 + }, + { + "epoch": 0.9809937782323361, + "grad_norm": 49.52606513063515, + "learning_rate": 8.51117047271367e-06, + "loss": 3.4608, + "step": 11510 + }, + { + "epoch": 0.9810790079263615, + "grad_norm": 39.572955119873974, + "learning_rate": 8.510817438513661e-06, + "loss": 3.2897, + "step": 11511 + }, + { + "epoch": 0.981164237620387, + "grad_norm": 28.884952381117877, + "learning_rate": 8.510464369786159e-06, + "loss": 2.3041, + "step": 11512 + }, + { + "epoch": 0.9812494673144123, + "grad_norm": 95.23170441222427, + "learning_rate": 8.510111266534635e-06, + "loss": 3.4613, + "step": 11513 + }, + { + "epoch": 0.9813346970084378, + "grad_norm": 50.38066517994914, + "learning_rate": 8.50975812876256e-06, + "loss": 3.7708, + "step": 11514 + }, + { + "epoch": 0.9814199267024631, + "grad_norm": 29.598947520669498, + "learning_rate": 8.50940495647341e-06, + "loss": 3.9121, + "step": 11515 + }, + { + "epoch": 0.9815051563964885, + "grad_norm": 78.203151114363, + "learning_rate": 8.509051749670656e-06, + "loss": 5.3424, + "step": 11516 + }, + { + "epoch": 0.981590386090514, + "grad_norm": 39.92505938444354, + "learning_rate": 8.508698508357772e-06, + "loss": 3.1938, + "step": 11517 + }, + { + "epoch": 0.9816756157845393, + "grad_norm": 89.356111223093, + "learning_rate": 8.508345232538233e-06, + "loss": 4.3249, + "step": 11518 + }, + { + "epoch": 0.9817608454785647, + "grad_norm": 57.463963921196445, + "learning_rate": 8.507991922215513e-06, + "loss": 4.704, + "step": 11519 + }, + { + "epoch": 0.9818460751725901, + "grad_norm": 52.534971394864385, + "learning_rate": 8.507638577393085e-06, + "loss": 3.6852, + "step": 11520 + }, + { + "epoch": 0.9819313048666155, + "grad_norm": 87.69336953426642, + "learning_rate": 8.507285198074428e-06, + "loss": 4.455, + "step": 11521 + }, + { + "epoch": 0.982016534560641, + "grad_norm": 55.30768916431327, + "learning_rate": 8.506931784263011e-06, + "loss": 3.9931, + "step": 11522 + }, + { + "epoch": 0.9821017642546663, + "grad_norm": 73.38525175149363, + "learning_rate": 8.506578335962317e-06, + "loss": 4.1659, + "step": 11523 + }, + { + "epoch": 0.9821869939486917, + "grad_norm": 35.51479576450778, + "learning_rate": 8.506224853175815e-06, + "loss": 4.1895, + "step": 11524 + }, + { + "epoch": 0.9822722236427172, + "grad_norm": 44.98848290205526, + "learning_rate": 8.505871335906989e-06, + "loss": 3.8491, + "step": 11525 + }, + { + "epoch": 0.9823574533367425, + "grad_norm": 48.156393515431944, + "learning_rate": 8.505517784159308e-06, + "loss": 4.2552, + "step": 11526 + }, + { + "epoch": 0.9824426830307679, + "grad_norm": 43.461553337983126, + "learning_rate": 8.505164197936254e-06, + "loss": 4.135, + "step": 11527 + }, + { + "epoch": 0.9825279127247933, + "grad_norm": 72.24776089673578, + "learning_rate": 8.504810577241303e-06, + "loss": 4.5334, + "step": 11528 + }, + { + "epoch": 0.9826131424188187, + "grad_norm": 37.051800407769406, + "learning_rate": 8.504456922077933e-06, + "loss": 3.056, + "step": 11529 + }, + { + "epoch": 0.9826983721128442, + "grad_norm": 114.33748674671777, + "learning_rate": 8.50410323244962e-06, + "loss": 5.4659, + "step": 11530 + }, + { + "epoch": 0.9827836018068695, + "grad_norm": 34.741792241177244, + "learning_rate": 8.503749508359846e-06, + "loss": 3.7102, + "step": 11531 + }, + { + "epoch": 0.9828688315008949, + "grad_norm": 43.82641259799215, + "learning_rate": 8.503395749812086e-06, + "loss": 3.4194, + "step": 11532 + }, + { + "epoch": 0.9829540611949203, + "grad_norm": 63.33586236456392, + "learning_rate": 8.503041956809824e-06, + "loss": 3.9818, + "step": 11533 + }, + { + "epoch": 0.9830392908889457, + "grad_norm": 42.61993471548191, + "learning_rate": 8.502688129356533e-06, + "loss": 3.5914, + "step": 11534 + }, + { + "epoch": 0.9831245205829711, + "grad_norm": 26.251395645826765, + "learning_rate": 8.502334267455698e-06, + "loss": 2.9922, + "step": 11535 + }, + { + "epoch": 0.9832097502769965, + "grad_norm": 33.87425035322568, + "learning_rate": 8.501980371110795e-06, + "loss": 3.4031, + "step": 11536 + }, + { + "epoch": 0.9832949799710219, + "grad_norm": 57.14636188606865, + "learning_rate": 8.501626440325308e-06, + "loss": 4.3535, + "step": 11537 + }, + { + "epoch": 0.9833802096650474, + "grad_norm": 71.11311676340088, + "learning_rate": 8.501272475102718e-06, + "loss": 4.2181, + "step": 11538 + }, + { + "epoch": 0.9834654393590727, + "grad_norm": 46.54275323948228, + "learning_rate": 8.500918475446503e-06, + "loss": 3.6627, + "step": 11539 + }, + { + "epoch": 0.9835506690530981, + "grad_norm": 66.16477497247486, + "learning_rate": 8.500564441360145e-06, + "loss": 4.258, + "step": 11540 + }, + { + "epoch": 0.9836358987471235, + "grad_norm": 95.07223733545528, + "learning_rate": 8.500210372847128e-06, + "loss": 5.2216, + "step": 11541 + }, + { + "epoch": 0.9837211284411489, + "grad_norm": 65.87381238060395, + "learning_rate": 8.499856269910932e-06, + "loss": 4.6477, + "step": 11542 + }, + { + "epoch": 0.9838063581351743, + "grad_norm": 37.12698906132067, + "learning_rate": 8.499502132555041e-06, + "loss": 3.4127, + "step": 11543 + }, + { + "epoch": 0.9838915878291997, + "grad_norm": 34.22271044128615, + "learning_rate": 8.499147960782936e-06, + "loss": 3.5925, + "step": 11544 + }, + { + "epoch": 0.9839768175232251, + "grad_norm": 35.684184607748676, + "learning_rate": 8.498793754598103e-06, + "loss": 3.3514, + "step": 11545 + }, + { + "epoch": 0.9840620472172504, + "grad_norm": 30.239870473314983, + "learning_rate": 8.498439514004022e-06, + "loss": 4.1723, + "step": 11546 + }, + { + "epoch": 0.9841472769112759, + "grad_norm": 31.658896865713537, + "learning_rate": 8.49808523900418e-06, + "loss": 3.1759, + "step": 11547 + }, + { + "epoch": 0.9842325066053013, + "grad_norm": 32.741726487996374, + "learning_rate": 8.497730929602058e-06, + "loss": 3.5494, + "step": 11548 + }, + { + "epoch": 0.9843177362993267, + "grad_norm": 37.69123399102332, + "learning_rate": 8.497376585801144e-06, + "loss": 4.0768, + "step": 11549 + }, + { + "epoch": 0.9844029659933521, + "grad_norm": 56.067727039550185, + "learning_rate": 8.49702220760492e-06, + "loss": 4.7375, + "step": 11550 + }, + { + "epoch": 0.9844881956873774, + "grad_norm": 57.2838628063865, + "learning_rate": 8.496667795016872e-06, + "loss": 4.8415, + "step": 11551 + }, + { + "epoch": 0.9845734253814029, + "grad_norm": 36.00804219949727, + "learning_rate": 8.496313348040486e-06, + "loss": 3.4303, + "step": 11552 + }, + { + "epoch": 0.9846586550754283, + "grad_norm": 29.399341177417504, + "learning_rate": 8.495958866679248e-06, + "loss": 3.405, + "step": 11553 + }, + { + "epoch": 0.9847438847694536, + "grad_norm": 30.495129761506785, + "learning_rate": 8.495604350936643e-06, + "loss": 3.4597, + "step": 11554 + }, + { + "epoch": 0.9848291144634791, + "grad_norm": 88.40926914654185, + "learning_rate": 8.495249800816158e-06, + "loss": 4.8232, + "step": 11555 + }, + { + "epoch": 0.9849143441575045, + "grad_norm": 41.920843118483965, + "learning_rate": 8.49489521632128e-06, + "loss": 4.5602, + "step": 11556 + }, + { + "epoch": 0.9849995738515299, + "grad_norm": 35.26277477378282, + "learning_rate": 8.494540597455497e-06, + "loss": 3.8915, + "step": 11557 + }, + { + "epoch": 0.9850848035455553, + "grad_norm": 74.11373058898603, + "learning_rate": 8.494185944222295e-06, + "loss": 4.7254, + "step": 11558 + }, + { + "epoch": 0.9851700332395806, + "grad_norm": 24.61692971349528, + "learning_rate": 8.493831256625165e-06, + "loss": 3.2663, + "step": 11559 + }, + { + "epoch": 0.9852552629336061, + "grad_norm": 44.01342381363855, + "learning_rate": 8.493476534667591e-06, + "loss": 3.7143, + "step": 11560 + }, + { + "epoch": 0.9853404926276315, + "grad_norm": 37.44685002942893, + "learning_rate": 8.493121778353062e-06, + "loss": 3.3662, + "step": 11561 + }, + { + "epoch": 0.9854257223216568, + "grad_norm": 32.243399999974436, + "learning_rate": 8.49276698768507e-06, + "loss": 3.4275, + "step": 11562 + }, + { + "epoch": 0.9855109520156823, + "grad_norm": 38.920630238997916, + "learning_rate": 8.492412162667104e-06, + "loss": 3.2519, + "step": 11563 + }, + { + "epoch": 0.9855961817097076, + "grad_norm": 48.9055583185276, + "learning_rate": 8.49205730330265e-06, + "loss": 3.6101, + "step": 11564 + }, + { + "epoch": 0.9856814114037331, + "grad_norm": 97.67244424527013, + "learning_rate": 8.4917024095952e-06, + "loss": 4.7593, + "step": 11565 + }, + { + "epoch": 0.9857666410977585, + "grad_norm": 106.6874280661308, + "learning_rate": 8.491347481548247e-06, + "loss": 5.8094, + "step": 11566 + }, + { + "epoch": 0.9858518707917838, + "grad_norm": 132.57738033534258, + "learning_rate": 8.490992519165276e-06, + "loss": 5.3756, + "step": 11567 + }, + { + "epoch": 0.9859371004858093, + "grad_norm": 60.539929683547754, + "learning_rate": 8.49063752244978e-06, + "loss": 3.5922, + "step": 11568 + }, + { + "epoch": 0.9860223301798346, + "grad_norm": 33.97371074502893, + "learning_rate": 8.490282491405253e-06, + "loss": 3.6015, + "step": 11569 + }, + { + "epoch": 0.98610755987386, + "grad_norm": 90.93134626750157, + "learning_rate": 8.489927426035186e-06, + "loss": 3.5106, + "step": 11570 + }, + { + "epoch": 0.9861927895678855, + "grad_norm": 28.933604587382387, + "learning_rate": 8.489572326343066e-06, + "loss": 3.7676, + "step": 11571 + }, + { + "epoch": 0.9862780192619108, + "grad_norm": 39.60961399537734, + "learning_rate": 8.489217192332391e-06, + "loss": 4.3365, + "step": 11572 + }, + { + "epoch": 0.9863632489559363, + "grad_norm": 41.646064860314155, + "learning_rate": 8.48886202400665e-06, + "loss": 4.1762, + "step": 11573 + }, + { + "epoch": 0.9864484786499617, + "grad_norm": 38.76562829222921, + "learning_rate": 8.48850682136934e-06, + "loss": 4.3668, + "step": 11574 + }, + { + "epoch": 0.986533708343987, + "grad_norm": 31.4162582267457, + "learning_rate": 8.488151584423948e-06, + "loss": 4.0096, + "step": 11575 + }, + { + "epoch": 0.9866189380380125, + "grad_norm": 63.687007649450145, + "learning_rate": 8.487796313173975e-06, + "loss": 4.6255, + "step": 11576 + }, + { + "epoch": 0.9867041677320378, + "grad_norm": 31.052611183682195, + "learning_rate": 8.487441007622908e-06, + "loss": 3.4369, + "step": 11577 + }, + { + "epoch": 0.9867893974260632, + "grad_norm": 48.18625935337507, + "learning_rate": 8.487085667774246e-06, + "loss": 4.2746, + "step": 11578 + }, + { + "epoch": 0.9868746271200887, + "grad_norm": 67.001528994656, + "learning_rate": 8.486730293631483e-06, + "loss": 4.0199, + "step": 11579 + }, + { + "epoch": 0.986959856814114, + "grad_norm": 51.94892687018091, + "learning_rate": 8.48637488519811e-06, + "loss": 5.1184, + "step": 11580 + }, + { + "epoch": 0.9870450865081395, + "grad_norm": 52.11316015280495, + "learning_rate": 8.486019442477628e-06, + "loss": 3.5729, + "step": 11581 + }, + { + "epoch": 0.9871303162021648, + "grad_norm": 64.40282569119198, + "learning_rate": 8.48566396547353e-06, + "loss": 4.6315, + "step": 11582 + }, + { + "epoch": 0.9872155458961902, + "grad_norm": 34.299764639715235, + "learning_rate": 8.485308454189309e-06, + "loss": 3.5316, + "step": 11583 + }, + { + "epoch": 0.9873007755902157, + "grad_norm": 65.56253699000193, + "learning_rate": 8.484952908628468e-06, + "loss": 6.1831, + "step": 11584 + }, + { + "epoch": 0.987386005284241, + "grad_norm": 54.10625022719822, + "learning_rate": 8.484597328794497e-06, + "loss": 4.3532, + "step": 11585 + }, + { + "epoch": 0.9874712349782664, + "grad_norm": 27.66704914406248, + "learning_rate": 8.484241714690898e-06, + "loss": 2.1811, + "step": 11586 + }, + { + "epoch": 0.9875564646722919, + "grad_norm": 37.36210901449704, + "learning_rate": 8.483886066321166e-06, + "loss": 3.0365, + "step": 11587 + }, + { + "epoch": 0.9876416943663172, + "grad_norm": 36.28412651725217, + "learning_rate": 8.483530383688798e-06, + "loss": 3.3514, + "step": 11588 + }, + { + "epoch": 0.9877269240603426, + "grad_norm": 54.15938075590862, + "learning_rate": 8.483174666797293e-06, + "loss": 4.9029, + "step": 11589 + }, + { + "epoch": 0.987812153754368, + "grad_norm": 31.512524305151203, + "learning_rate": 8.48281891565015e-06, + "loss": 3.4361, + "step": 11590 + }, + { + "epoch": 0.9878973834483934, + "grad_norm": 36.394842799078525, + "learning_rate": 8.482463130250866e-06, + "loss": 4.0232, + "step": 11591 + }, + { + "epoch": 0.9879826131424189, + "grad_norm": 34.683194718506215, + "learning_rate": 8.482107310602942e-06, + "loss": 3.0302, + "step": 11592 + }, + { + "epoch": 0.9880678428364442, + "grad_norm": 37.92044707762541, + "learning_rate": 8.481751456709875e-06, + "loss": 3.3726, + "step": 11593 + }, + { + "epoch": 0.9881530725304696, + "grad_norm": 47.963991763950034, + "learning_rate": 8.481395568575167e-06, + "loss": 4.7594, + "step": 11594 + }, + { + "epoch": 0.988238302224495, + "grad_norm": 116.40818028248972, + "learning_rate": 8.481039646202315e-06, + "loss": 5.6871, + "step": 11595 + }, + { + "epoch": 0.9883235319185204, + "grad_norm": 30.973697035315112, + "learning_rate": 8.480683689594823e-06, + "loss": 3.3964, + "step": 11596 + }, + { + "epoch": 0.9884087616125458, + "grad_norm": 36.40567612742515, + "learning_rate": 8.48032769875619e-06, + "loss": 3.7794, + "step": 11597 + }, + { + "epoch": 0.9884939913065712, + "grad_norm": 41.88916320940357, + "learning_rate": 8.479971673689917e-06, + "loss": 4.0574, + "step": 11598 + }, + { + "epoch": 0.9885792210005966, + "grad_norm": 39.03419212212006, + "learning_rate": 8.479615614399506e-06, + "loss": 3.576, + "step": 11599 + }, + { + "epoch": 0.988664450694622, + "grad_norm": 88.34488127227557, + "learning_rate": 8.479259520888457e-06, + "loss": 4.8006, + "step": 11600 + }, + { + "epoch": 0.9887496803886474, + "grad_norm": 29.704911004405858, + "learning_rate": 8.478903393160274e-06, + "loss": 2.9576, + "step": 11601 + }, + { + "epoch": 0.9888349100826728, + "grad_norm": 60.118904548136136, + "learning_rate": 8.478547231218457e-06, + "loss": 5.0894, + "step": 11602 + }, + { + "epoch": 0.9889201397766982, + "grad_norm": 36.7226169621531, + "learning_rate": 8.47819103506651e-06, + "loss": 3.3523, + "step": 11603 + }, + { + "epoch": 0.9890053694707236, + "grad_norm": 91.1687814751161, + "learning_rate": 8.477834804707939e-06, + "loss": 4.2631, + "step": 11604 + }, + { + "epoch": 0.989090599164749, + "grad_norm": 32.376718923977364, + "learning_rate": 8.477478540146242e-06, + "loss": 4.1983, + "step": 11605 + }, + { + "epoch": 0.9891758288587744, + "grad_norm": 57.8814741046346, + "learning_rate": 8.477122241384928e-06, + "loss": 3.2522, + "step": 11606 + }, + { + "epoch": 0.9892610585527998, + "grad_norm": 29.38851329826472, + "learning_rate": 8.476765908427497e-06, + "loss": 3.0625, + "step": 11607 + }, + { + "epoch": 0.9893462882468252, + "grad_norm": 54.413237569667835, + "learning_rate": 8.476409541277454e-06, + "loss": 4.1333, + "step": 11608 + }, + { + "epoch": 0.9894315179408506, + "grad_norm": 51.97011727999051, + "learning_rate": 8.476053139938305e-06, + "loss": 4.1689, + "step": 11609 + }, + { + "epoch": 0.989516747634876, + "grad_norm": 47.86399028048781, + "learning_rate": 8.475696704413554e-06, + "loss": 3.9694, + "step": 11610 + }, + { + "epoch": 0.9896019773289014, + "grad_norm": 37.86106174380667, + "learning_rate": 8.47534023470671e-06, + "loss": 3.7385, + "step": 11611 + }, + { + "epoch": 0.9896872070229268, + "grad_norm": 44.72018190491415, + "learning_rate": 8.474983730821272e-06, + "loss": 4.0526, + "step": 11612 + }, + { + "epoch": 0.9897724367169521, + "grad_norm": 55.07996264823395, + "learning_rate": 8.474627192760752e-06, + "loss": 3.9615, + "step": 11613 + }, + { + "epoch": 0.9898576664109776, + "grad_norm": 71.93691328387827, + "learning_rate": 8.474270620528654e-06, + "loss": 4.8116, + "step": 11614 + }, + { + "epoch": 0.989942896105003, + "grad_norm": 54.046247407840404, + "learning_rate": 8.473914014128484e-06, + "loss": 4.7023, + "step": 11615 + }, + { + "epoch": 0.9900281257990284, + "grad_norm": 37.64019477915692, + "learning_rate": 8.473557373563751e-06, + "loss": 3.5061, + "step": 11616 + }, + { + "epoch": 0.9901133554930538, + "grad_norm": 30.710175177641965, + "learning_rate": 8.473200698837962e-06, + "loss": 2.5418, + "step": 11617 + }, + { + "epoch": 0.9901985851870791, + "grad_norm": 38.53685000942444, + "learning_rate": 8.472843989954623e-06, + "loss": 3.4634, + "step": 11618 + }, + { + "epoch": 0.9902838148811046, + "grad_norm": 47.442403712353894, + "learning_rate": 8.472487246917245e-06, + "loss": 2.3593, + "step": 11619 + }, + { + "epoch": 0.99036904457513, + "grad_norm": 53.747078794143924, + "learning_rate": 8.472130469729333e-06, + "loss": 3.6999, + "step": 11620 + }, + { + "epoch": 0.9904542742691553, + "grad_norm": 41.83229397971842, + "learning_rate": 8.471773658394397e-06, + "loss": 4.4947, + "step": 11621 + }, + { + "epoch": 0.9905395039631808, + "grad_norm": 88.05079962248735, + "learning_rate": 8.47141681291595e-06, + "loss": 4.6454, + "step": 11622 + }, + { + "epoch": 0.9906247336572062, + "grad_norm": 57.38576471227245, + "learning_rate": 8.471059933297494e-06, + "loss": 4.8792, + "step": 11623 + }, + { + "epoch": 0.9907099633512315, + "grad_norm": 54.90386761305821, + "learning_rate": 8.470703019542546e-06, + "loss": 4.0121, + "step": 11624 + }, + { + "epoch": 0.990795193045257, + "grad_norm": 39.12679029890812, + "learning_rate": 8.47034607165461e-06, + "loss": 3.8694, + "step": 11625 + }, + { + "epoch": 0.9908804227392823, + "grad_norm": 24.59807619097821, + "learning_rate": 8.4699890896372e-06, + "loss": 3.1647, + "step": 11626 + }, + { + "epoch": 0.9909656524333078, + "grad_norm": 23.09912765627295, + "learning_rate": 8.469632073493827e-06, + "loss": 3.2159, + "step": 11627 + }, + { + "epoch": 0.9910508821273332, + "grad_norm": 65.27128003081025, + "learning_rate": 8.469275023228004e-06, + "loss": 3.1144, + "step": 11628 + }, + { + "epoch": 0.9911361118213585, + "grad_norm": 58.29974451342334, + "learning_rate": 8.468917938843236e-06, + "loss": 4.2735, + "step": 11629 + }, + { + "epoch": 0.991221341515384, + "grad_norm": 34.7298736613226, + "learning_rate": 8.468560820343038e-06, + "loss": 3.5479, + "step": 11630 + }, + { + "epoch": 0.9913065712094093, + "grad_norm": 69.70824110677705, + "learning_rate": 8.468203667730925e-06, + "loss": 4.69, + "step": 11631 + }, + { + "epoch": 0.9913918009034347, + "grad_norm": 45.340775922089094, + "learning_rate": 8.467846481010407e-06, + "loss": 2.9608, + "step": 11632 + }, + { + "epoch": 0.9914770305974602, + "grad_norm": 35.28695585550392, + "learning_rate": 8.467489260184996e-06, + "loss": 3.8923, + "step": 11633 + }, + { + "epoch": 0.9915622602914855, + "grad_norm": 31.532524568390976, + "learning_rate": 8.467132005258206e-06, + "loss": 3.4824, + "step": 11634 + }, + { + "epoch": 0.991647489985511, + "grad_norm": 74.37516801816447, + "learning_rate": 8.466774716233551e-06, + "loss": 3.6105, + "step": 11635 + }, + { + "epoch": 0.9917327196795364, + "grad_norm": 31.42176746577896, + "learning_rate": 8.466417393114542e-06, + "loss": 3.7841, + "step": 11636 + }, + { + "epoch": 0.9918179493735617, + "grad_norm": 39.41074595170864, + "learning_rate": 8.466060035904697e-06, + "loss": 3.5753, + "step": 11637 + }, + { + "epoch": 0.9919031790675872, + "grad_norm": 31.616666541494826, + "learning_rate": 8.46570264460753e-06, + "loss": 2.3906, + "step": 11638 + }, + { + "epoch": 0.9919884087616125, + "grad_norm": 67.06773879838637, + "learning_rate": 8.465345219226553e-06, + "loss": 5.2418, + "step": 11639 + }, + { + "epoch": 0.9920736384556379, + "grad_norm": 84.86376541367805, + "learning_rate": 8.464987759765281e-06, + "loss": 4.4723, + "step": 11640 + }, + { + "epoch": 0.9921588681496634, + "grad_norm": 48.62521492855115, + "learning_rate": 8.464630266227235e-06, + "loss": 4.8084, + "step": 11641 + }, + { + "epoch": 0.9922440978436887, + "grad_norm": 43.47423401327703, + "learning_rate": 8.464272738615925e-06, + "loss": 3.9402, + "step": 11642 + }, + { + "epoch": 0.9923293275377142, + "grad_norm": 29.65384918912519, + "learning_rate": 8.463915176934869e-06, + "loss": 3.0145, + "step": 11643 + }, + { + "epoch": 0.9924145572317395, + "grad_norm": 47.616372900206514, + "learning_rate": 8.463557581187585e-06, + "loss": 4.1557, + "step": 11644 + }, + { + "epoch": 0.9924997869257649, + "grad_norm": 25.725702191574282, + "learning_rate": 8.463199951377587e-06, + "loss": 3.3945, + "step": 11645 + }, + { + "epoch": 0.9925850166197904, + "grad_norm": 67.71932144607989, + "learning_rate": 8.462842287508395e-06, + "loss": 5.3298, + "step": 11646 + }, + { + "epoch": 0.9926702463138157, + "grad_norm": 187.28994048789144, + "learning_rate": 8.462484589583526e-06, + "loss": 4.4191, + "step": 11647 + }, + { + "epoch": 0.9927554760078411, + "grad_norm": 38.32880308125177, + "learning_rate": 8.462126857606494e-06, + "loss": 3.8753, + "step": 11648 + }, + { + "epoch": 0.9928407057018666, + "grad_norm": 76.42273970760264, + "learning_rate": 8.461769091580822e-06, + "loss": 5.3167, + "step": 11649 + }, + { + "epoch": 0.9929259353958919, + "grad_norm": 26.468633146674726, + "learning_rate": 8.461411291510026e-06, + "loss": 2.3655, + "step": 11650 + }, + { + "epoch": 0.9930111650899174, + "grad_norm": 51.17038305505378, + "learning_rate": 8.461053457397626e-06, + "loss": 3.6055, + "step": 11651 + }, + { + "epoch": 0.9930963947839427, + "grad_norm": 36.01649887666329, + "learning_rate": 8.460695589247138e-06, + "loss": 3.4532, + "step": 11652 + }, + { + "epoch": 0.9931816244779681, + "grad_norm": 49.96475557646039, + "learning_rate": 8.460337687062087e-06, + "loss": 5.3664, + "step": 11653 + }, + { + "epoch": 0.9932668541719936, + "grad_norm": 71.64246602444008, + "learning_rate": 8.45997975084599e-06, + "loss": 3.3796, + "step": 11654 + }, + { + "epoch": 0.9933520838660189, + "grad_norm": 62.487753735788765, + "learning_rate": 8.459621780602366e-06, + "loss": 4.8872, + "step": 11655 + }, + { + "epoch": 0.9934373135600443, + "grad_norm": 24.869923757490277, + "learning_rate": 8.459263776334735e-06, + "loss": 3.1017, + "step": 11656 + }, + { + "epoch": 0.9935225432540697, + "grad_norm": 82.31908140911777, + "learning_rate": 8.45890573804662e-06, + "loss": 4.2587, + "step": 11657 + }, + { + "epoch": 0.9936077729480951, + "grad_norm": 31.33640664356427, + "learning_rate": 8.458547665741543e-06, + "loss": 3.9395, + "step": 11658 + }, + { + "epoch": 0.9936930026421205, + "grad_norm": 83.42470118929957, + "learning_rate": 8.458189559423021e-06, + "loss": 5.4433, + "step": 11659 + }, + { + "epoch": 0.9937782323361459, + "grad_norm": 38.369011723679094, + "learning_rate": 8.457831419094583e-06, + "loss": 3.8882, + "step": 11660 + }, + { + "epoch": 0.9938634620301713, + "grad_norm": 35.089822691976224, + "learning_rate": 8.457473244759744e-06, + "loss": 3.5597, + "step": 11661 + }, + { + "epoch": 0.9939486917241968, + "grad_norm": 31.43539460710417, + "learning_rate": 8.457115036422032e-06, + "loss": 4.037, + "step": 11662 + }, + { + "epoch": 0.9940339214182221, + "grad_norm": 28.369647518214688, + "learning_rate": 8.456756794084965e-06, + "loss": 3.2009, + "step": 11663 + }, + { + "epoch": 0.9941191511122475, + "grad_norm": 74.84111347277792, + "learning_rate": 8.456398517752071e-06, + "loss": 5.3715, + "step": 11664 + }, + { + "epoch": 0.9942043808062729, + "grad_norm": 109.63947806800236, + "learning_rate": 8.456040207426867e-06, + "loss": 4.9434, + "step": 11665 + }, + { + "epoch": 0.9942896105002983, + "grad_norm": 37.212393882374094, + "learning_rate": 8.455681863112884e-06, + "loss": 3.4402, + "step": 11666 + }, + { + "epoch": 0.9943748401943237, + "grad_norm": 34.40605271601622, + "learning_rate": 8.455323484813642e-06, + "loss": 3.2501, + "step": 11667 + }, + { + "epoch": 0.9944600698883491, + "grad_norm": 48.14849645265382, + "learning_rate": 8.454965072532668e-06, + "loss": 3.7364, + "step": 11668 + }, + { + "epoch": 0.9945452995823745, + "grad_norm": 49.55230616220338, + "learning_rate": 8.454606626273485e-06, + "loss": 4.3472, + "step": 11669 + }, + { + "epoch": 0.9946305292763999, + "grad_norm": 100.83006156051071, + "learning_rate": 8.454248146039616e-06, + "loss": 4.5455, + "step": 11670 + }, + { + "epoch": 0.9947157589704253, + "grad_norm": 43.73439585783493, + "learning_rate": 8.453889631834592e-06, + "loss": 3.9284, + "step": 11671 + }, + { + "epoch": 0.9948009886644507, + "grad_norm": 81.96020634682934, + "learning_rate": 8.453531083661935e-06, + "loss": 4.5657, + "step": 11672 + }, + { + "epoch": 0.9948862183584761, + "grad_norm": 40.68580882860081, + "learning_rate": 8.453172501525171e-06, + "loss": 3.771, + "step": 11673 + }, + { + "epoch": 0.9949714480525015, + "grad_norm": 51.755822269999115, + "learning_rate": 8.452813885427828e-06, + "loss": 3.5624, + "step": 11674 + }, + { + "epoch": 0.9950566777465268, + "grad_norm": 45.60216120925011, + "learning_rate": 8.452455235373433e-06, + "loss": 2.9296, + "step": 11675 + }, + { + "epoch": 0.9951419074405523, + "grad_norm": 27.503929941855326, + "learning_rate": 8.452096551365511e-06, + "loss": 3.3917, + "step": 11676 + }, + { + "epoch": 0.9952271371345777, + "grad_norm": 59.89950683952417, + "learning_rate": 8.451737833407594e-06, + "loss": 3.811, + "step": 11677 + }, + { + "epoch": 0.9953123668286031, + "grad_norm": 24.124780320699198, + "learning_rate": 8.451379081503205e-06, + "loss": 2.7673, + "step": 11678 + }, + { + "epoch": 0.9953975965226285, + "grad_norm": 62.455800018495715, + "learning_rate": 8.451020295655877e-06, + "loss": 5.1156, + "step": 11679 + }, + { + "epoch": 0.9954828262166538, + "grad_norm": 49.16776616187321, + "learning_rate": 8.450661475869132e-06, + "loss": 4.0457, + "step": 11680 + }, + { + "epoch": 0.9955680559106793, + "grad_norm": 35.49396304927185, + "learning_rate": 8.450302622146504e-06, + "loss": 3.776, + "step": 11681 + }, + { + "epoch": 0.9956532856047047, + "grad_norm": 57.97495696167167, + "learning_rate": 8.449943734491521e-06, + "loss": 4.2003, + "step": 11682 + }, + { + "epoch": 0.99573851529873, + "grad_norm": 79.10892345894173, + "learning_rate": 8.449584812907712e-06, + "loss": 3.9115, + "step": 11683 + }, + { + "epoch": 0.9958237449927555, + "grad_norm": 31.2093325608396, + "learning_rate": 8.449225857398607e-06, + "loss": 3.139, + "step": 11684 + }, + { + "epoch": 0.9959089746867809, + "grad_norm": 101.90984843540046, + "learning_rate": 8.448866867967736e-06, + "loss": 4.7321, + "step": 11685 + }, + { + "epoch": 0.9959942043808063, + "grad_norm": 34.37092632092332, + "learning_rate": 8.44850784461863e-06, + "loss": 2.8252, + "step": 11686 + }, + { + "epoch": 0.9960794340748317, + "grad_norm": 132.28206476391438, + "learning_rate": 8.44814878735482e-06, + "loss": 5.126, + "step": 11687 + }, + { + "epoch": 0.996164663768857, + "grad_norm": 28.25083954879613, + "learning_rate": 8.447789696179838e-06, + "loss": 3.2819, + "step": 11688 + }, + { + "epoch": 0.9962498934628825, + "grad_norm": 38.72772135038014, + "learning_rate": 8.447430571097213e-06, + "loss": 4.3686, + "step": 11689 + }, + { + "epoch": 0.9963351231569079, + "grad_norm": 32.94141828371351, + "learning_rate": 8.447071412110477e-06, + "loss": 3.5486, + "step": 11690 + }, + { + "epoch": 0.9964203528509332, + "grad_norm": 135.94309852698663, + "learning_rate": 8.446712219223165e-06, + "loss": 6.6052, + "step": 11691 + }, + { + "epoch": 0.9965055825449587, + "grad_norm": 41.66545800315277, + "learning_rate": 8.446352992438807e-06, + "loss": 3.8123, + "step": 11692 + }, + { + "epoch": 0.996590812238984, + "grad_norm": 24.875966147896474, + "learning_rate": 8.445993731760937e-06, + "loss": 2.9837, + "step": 11693 + }, + { + "epoch": 0.9966760419330095, + "grad_norm": 51.17553877811267, + "learning_rate": 8.445634437193088e-06, + "loss": 3.7444, + "step": 11694 + }, + { + "epoch": 0.9967612716270349, + "grad_norm": 84.10309362747344, + "learning_rate": 8.445275108738795e-06, + "loss": 5.5532, + "step": 11695 + }, + { + "epoch": 0.9968465013210602, + "grad_norm": 49.616377216205706, + "learning_rate": 8.444915746401588e-06, + "loss": 3.7381, + "step": 11696 + }, + { + "epoch": 0.9969317310150857, + "grad_norm": 33.52223893754867, + "learning_rate": 8.444556350185006e-06, + "loss": 3.41, + "step": 11697 + }, + { + "epoch": 0.997016960709111, + "grad_norm": 141.87895733560174, + "learning_rate": 8.444196920092578e-06, + "loss": 6.0968, + "step": 11698 + }, + { + "epoch": 0.9971021904031364, + "grad_norm": 36.04154595687007, + "learning_rate": 8.443837456127845e-06, + "loss": 4.1329, + "step": 11699 + }, + { + "epoch": 0.9971874200971619, + "grad_norm": 54.66707333093644, + "learning_rate": 8.443477958294337e-06, + "loss": 3.6656, + "step": 11700 + }, + { + "epoch": 0.9972726497911872, + "grad_norm": 52.70397483865323, + "learning_rate": 8.443118426595592e-06, + "loss": 4.709, + "step": 11701 + }, + { + "epoch": 0.9973578794852126, + "grad_norm": 31.49385956529928, + "learning_rate": 8.442758861035146e-06, + "loss": 2.9088, + "step": 11702 + }, + { + "epoch": 0.9974431091792381, + "grad_norm": 44.368958090491034, + "learning_rate": 8.442399261616534e-06, + "loss": 4.4243, + "step": 11703 + }, + { + "epoch": 0.9975283388732634, + "grad_norm": 36.55203427679761, + "learning_rate": 8.442039628343292e-06, + "loss": 3.8419, + "step": 11704 + }, + { + "epoch": 0.9976135685672889, + "grad_norm": 44.816487923541715, + "learning_rate": 8.44167996121896e-06, + "loss": 3.1627, + "step": 11705 + }, + { + "epoch": 0.9976987982613142, + "grad_norm": 86.14730288603623, + "learning_rate": 8.441320260247073e-06, + "loss": 4.0538, + "step": 11706 + }, + { + "epoch": 0.9977840279553396, + "grad_norm": 44.34450575351882, + "learning_rate": 8.440960525431168e-06, + "loss": 2.8239, + "step": 11707 + }, + { + "epoch": 0.9978692576493651, + "grad_norm": 31.786137848593096, + "learning_rate": 8.440600756774783e-06, + "loss": 3.9876, + "step": 11708 + }, + { + "epoch": 0.9979544873433904, + "grad_norm": 38.94021777443996, + "learning_rate": 8.440240954281457e-06, + "loss": 3.2592, + "step": 11709 + }, + { + "epoch": 0.9980397170374158, + "grad_norm": 149.5179869623931, + "learning_rate": 8.43988111795473e-06, + "loss": 4.7194, + "step": 11710 + }, + { + "epoch": 0.9981249467314413, + "grad_norm": 60.928387483757305, + "learning_rate": 8.439521247798137e-06, + "loss": 4.1033, + "step": 11711 + }, + { + "epoch": 0.9982101764254666, + "grad_norm": 1052.98899336688, + "learning_rate": 8.43916134381522e-06, + "loss": 4.9079, + "step": 11712 + }, + { + "epoch": 0.9982954061194921, + "grad_norm": 35.28875806393917, + "learning_rate": 8.438801406009518e-06, + "loss": 1.9249, + "step": 11713 + }, + { + "epoch": 0.9983806358135174, + "grad_norm": 33.8387340467472, + "learning_rate": 8.438441434384571e-06, + "loss": 3.5051, + "step": 11714 + }, + { + "epoch": 0.9984658655075428, + "grad_norm": 55.102499384061176, + "learning_rate": 8.438081428943918e-06, + "loss": 3.5721, + "step": 11715 + }, + { + "epoch": 0.9985510952015683, + "grad_norm": 70.87549557422577, + "learning_rate": 8.4377213896911e-06, + "loss": 4.9799, + "step": 11716 + }, + { + "epoch": 0.9986363248955936, + "grad_norm": 34.546211884404364, + "learning_rate": 8.43736131662966e-06, + "loss": 4.435, + "step": 11717 + }, + { + "epoch": 0.998721554589619, + "grad_norm": 67.97687897897762, + "learning_rate": 8.437001209763136e-06, + "loss": 5.1845, + "step": 11718 + }, + { + "epoch": 0.9988067842836444, + "grad_norm": 26.21605375002799, + "learning_rate": 8.43664106909507e-06, + "loss": 3.5198, + "step": 11719 + }, + { + "epoch": 0.9988920139776698, + "grad_norm": 53.747905145146674, + "learning_rate": 8.436280894629005e-06, + "loss": 4.6383, + "step": 11720 + }, + { + "epoch": 0.9989772436716953, + "grad_norm": 36.26690521750919, + "learning_rate": 8.435920686368485e-06, + "loss": 2.5964, + "step": 11721 + }, + { + "epoch": 0.9990624733657206, + "grad_norm": 36.77189250034284, + "learning_rate": 8.435560444317048e-06, + "loss": 3.2455, + "step": 11722 + }, + { + "epoch": 0.999147703059746, + "grad_norm": 67.58749680594319, + "learning_rate": 8.43520016847824e-06, + "loss": 4.8, + "step": 11723 + }, + { + "epoch": 0.9992329327537715, + "grad_norm": 35.27788076719091, + "learning_rate": 8.434839858855602e-06, + "loss": 3.3017, + "step": 11724 + }, + { + "epoch": 0.9993181624477968, + "grad_norm": 46.204190319961924, + "learning_rate": 8.43447951545268e-06, + "loss": 3.8927, + "step": 11725 + }, + { + "epoch": 0.9994033921418222, + "grad_norm": 31.526369753249966, + "learning_rate": 8.434119138273017e-06, + "loss": 2.6303, + "step": 11726 + }, + { + "epoch": 0.9994886218358476, + "grad_norm": 39.04157812379149, + "learning_rate": 8.433758727320157e-06, + "loss": 3.7493, + "step": 11727 + }, + { + "epoch": 0.999573851529873, + "grad_norm": 25.672544101733635, + "learning_rate": 8.433398282597643e-06, + "loss": 2.8595, + "step": 11728 + }, + { + "epoch": 0.9996590812238985, + "grad_norm": 65.17543115952465, + "learning_rate": 8.433037804109024e-06, + "loss": 4.9439, + "step": 11729 + }, + { + "epoch": 0.9997443109179238, + "grad_norm": 72.67214558972702, + "learning_rate": 8.43267729185784e-06, + "loss": 4.5555, + "step": 11730 + }, + { + "epoch": 0.9998295406119492, + "grad_norm": 49.8529208063336, + "learning_rate": 8.43231674584764e-06, + "loss": 4.5675, + "step": 11731 + }, + { + "epoch": 0.9999147703059746, + "grad_norm": 28.303724959938936, + "learning_rate": 8.431956166081968e-06, + "loss": 3.7022, + "step": 11732 + }, + { + "epoch": 1.0, + "grad_norm": 44.26687093531854, + "learning_rate": 8.43159555256437e-06, + "loss": 5.2332, + "step": 11733 + }, + { + "epoch": 1.0000852296940255, + "grad_norm": 45.38317090258041, + "learning_rate": 8.431234905298393e-06, + "loss": 3.4946, + "step": 11734 + }, + { + "epoch": 1.0001704593880507, + "grad_norm": 130.02274304854268, + "learning_rate": 8.430874224287587e-06, + "loss": 4.7758, + "step": 11735 + }, + { + "epoch": 1.0002556890820762, + "grad_norm": 54.3186875187529, + "learning_rate": 8.430513509535494e-06, + "loss": 2.9453, + "step": 11736 + }, + { + "epoch": 1.0003409187761017, + "grad_norm": 51.68463213385514, + "learning_rate": 8.430152761045664e-06, + "loss": 4.1073, + "step": 11737 + }, + { + "epoch": 1.000426148470127, + "grad_norm": 36.43007487675594, + "learning_rate": 8.429791978821645e-06, + "loss": 3.6715, + "step": 11738 + }, + { + "epoch": 1.0005113781641524, + "grad_norm": 45.01849271750971, + "learning_rate": 8.429431162866984e-06, + "loss": 3.8249, + "step": 11739 + }, + { + "epoch": 1.0005966078581778, + "grad_norm": 57.99790027827597, + "learning_rate": 8.429070313185233e-06, + "loss": 4.3453, + "step": 11740 + }, + { + "epoch": 1.000681837552203, + "grad_norm": 57.27768133519033, + "learning_rate": 8.428709429779936e-06, + "loss": 3.809, + "step": 11741 + }, + { + "epoch": 1.0007670672462285, + "grad_norm": 31.829653031543483, + "learning_rate": 8.428348512654643e-06, + "loss": 3.2493, + "step": 11742 + }, + { + "epoch": 1.000852296940254, + "grad_norm": 31.48106899203425, + "learning_rate": 8.427987561812906e-06, + "loss": 2.753, + "step": 11743 + }, + { + "epoch": 1.0009375266342795, + "grad_norm": 40.86309095072482, + "learning_rate": 8.427626577258274e-06, + "loss": 3.0483, + "step": 11744 + }, + { + "epoch": 1.0010227563283047, + "grad_norm": 30.121077139262216, + "learning_rate": 8.427265558994295e-06, + "loss": 2.6992, + "step": 11745 + }, + { + "epoch": 1.0011079860223302, + "grad_norm": 44.56693906096524, + "learning_rate": 8.426904507024523e-06, + "loss": 3.3907, + "step": 11746 + }, + { + "epoch": 1.0011932157163557, + "grad_norm": 49.620608306901616, + "learning_rate": 8.426543421352505e-06, + "loss": 3.039, + "step": 11747 + }, + { + "epoch": 1.001278445410381, + "grad_norm": 49.363967792299334, + "learning_rate": 8.426182301981796e-06, + "loss": 3.2393, + "step": 11748 + }, + { + "epoch": 1.0013636751044064, + "grad_norm": 39.85845145338479, + "learning_rate": 8.425821148915943e-06, + "loss": 2.5475, + "step": 11749 + }, + { + "epoch": 1.0014489047984318, + "grad_norm": 34.34466314400195, + "learning_rate": 8.425459962158503e-06, + "loss": 2.5507, + "step": 11750 + }, + { + "epoch": 1.001534134492457, + "grad_norm": 33.62685037154112, + "learning_rate": 8.425098741713024e-06, + "loss": 2.8626, + "step": 11751 + }, + { + "epoch": 1.0016193641864826, + "grad_norm": 46.45644769908343, + "learning_rate": 8.424737487583061e-06, + "loss": 3.7546, + "step": 11752 + }, + { + "epoch": 1.001704593880508, + "grad_norm": 57.59968847743513, + "learning_rate": 8.424376199772164e-06, + "loss": 4.0389, + "step": 11753 + }, + { + "epoch": 1.0017898235745333, + "grad_norm": 68.96003187934853, + "learning_rate": 8.424014878283888e-06, + "loss": 2.8714, + "step": 11754 + }, + { + "epoch": 1.0018750532685587, + "grad_norm": 89.72794456801151, + "learning_rate": 8.423653523121787e-06, + "loss": 3.6227, + "step": 11755 + }, + { + "epoch": 1.0019602829625842, + "grad_norm": 39.1783158797695, + "learning_rate": 8.423292134289414e-06, + "loss": 1.8591, + "step": 11756 + }, + { + "epoch": 1.0020455126566095, + "grad_norm": 33.406335570057486, + "learning_rate": 8.422930711790323e-06, + "loss": 3.67, + "step": 11757 + }, + { + "epoch": 1.002130742350635, + "grad_norm": 37.46141741828393, + "learning_rate": 8.422569255628069e-06, + "loss": 3.22, + "step": 11758 + }, + { + "epoch": 1.0022159720446604, + "grad_norm": 66.93333484470102, + "learning_rate": 8.422207765806206e-06, + "loss": 2.3638, + "step": 11759 + }, + { + "epoch": 1.0023012017386859, + "grad_norm": 62.25011715864257, + "learning_rate": 8.421846242328288e-06, + "loss": 3.6901, + "step": 11760 + }, + { + "epoch": 1.0023864314327111, + "grad_norm": 32.47930352605833, + "learning_rate": 8.421484685197874e-06, + "loss": 2.4981, + "step": 11761 + }, + { + "epoch": 1.0024716611267366, + "grad_norm": 57.96981096643527, + "learning_rate": 8.421123094418518e-06, + "loss": 2.884, + "step": 11762 + }, + { + "epoch": 1.002556890820762, + "grad_norm": 66.52191754123783, + "learning_rate": 8.420761469993775e-06, + "loss": 4.2701, + "step": 11763 + }, + { + "epoch": 1.0026421205147873, + "grad_norm": 34.63478305533989, + "learning_rate": 8.420399811927201e-06, + "loss": 2.4706, + "step": 11764 + }, + { + "epoch": 1.0027273502088128, + "grad_norm": 44.09632471960196, + "learning_rate": 8.420038120222356e-06, + "loss": 2.8705, + "step": 11765 + }, + { + "epoch": 1.0028125799028382, + "grad_norm": 57.259601017259406, + "learning_rate": 8.419676394882794e-06, + "loss": 2.8338, + "step": 11766 + }, + { + "epoch": 1.0028978095968635, + "grad_norm": 104.05763400975573, + "learning_rate": 8.419314635912073e-06, + "loss": 3.5433, + "step": 11767 + }, + { + "epoch": 1.002983039290889, + "grad_norm": 41.752068611906424, + "learning_rate": 8.418952843313753e-06, + "loss": 3.4127, + "step": 11768 + }, + { + "epoch": 1.0030682689849144, + "grad_norm": 45.583004208383784, + "learning_rate": 8.418591017091388e-06, + "loss": 2.0654, + "step": 11769 + }, + { + "epoch": 1.0031534986789397, + "grad_norm": 43.11087548816877, + "learning_rate": 8.41822915724854e-06, + "loss": 2.0858, + "step": 11770 + }, + { + "epoch": 1.0032387283729651, + "grad_norm": 36.4581190489276, + "learning_rate": 8.417867263788766e-06, + "loss": 3.6789, + "step": 11771 + }, + { + "epoch": 1.0033239580669906, + "grad_norm": 46.0909493622783, + "learning_rate": 8.417505336715626e-06, + "loss": 2.558, + "step": 11772 + }, + { + "epoch": 1.0034091877610158, + "grad_norm": 43.55862353160988, + "learning_rate": 8.41714337603268e-06, + "loss": 3.2301, + "step": 11773 + }, + { + "epoch": 1.0034944174550413, + "grad_norm": 42.47266946086762, + "learning_rate": 8.416781381743485e-06, + "loss": 2.8383, + "step": 11774 + }, + { + "epoch": 1.0035796471490668, + "grad_norm": 44.76292899066754, + "learning_rate": 8.416419353851603e-06, + "loss": 3.1312, + "step": 11775 + }, + { + "epoch": 1.003664876843092, + "grad_norm": 62.35162496721248, + "learning_rate": 8.416057292360596e-06, + "loss": 3.4829, + "step": 11776 + }, + { + "epoch": 1.0037501065371175, + "grad_norm": 47.9183514855339, + "learning_rate": 8.41569519727402e-06, + "loss": 3.3303, + "step": 11777 + }, + { + "epoch": 1.003835336231143, + "grad_norm": 53.79646232499907, + "learning_rate": 8.415333068595441e-06, + "loss": 2.8181, + "step": 11778 + }, + { + "epoch": 1.0039205659251684, + "grad_norm": 50.682108590133, + "learning_rate": 8.41497090632842e-06, + "loss": 3.549, + "step": 11779 + }, + { + "epoch": 1.0040057956191937, + "grad_norm": 60.4148478619957, + "learning_rate": 8.414608710476514e-06, + "loss": 3.5495, + "step": 11780 + }, + { + "epoch": 1.0040910253132191, + "grad_norm": 35.29169916921552, + "learning_rate": 8.41424648104329e-06, + "loss": 2.5511, + "step": 11781 + }, + { + "epoch": 1.0041762550072446, + "grad_norm": 26.59688388716961, + "learning_rate": 8.413884218032307e-06, + "loss": 2.1647, + "step": 11782 + }, + { + "epoch": 1.0042614847012699, + "grad_norm": 41.58799711461542, + "learning_rate": 8.41352192144713e-06, + "loss": 3.7486, + "step": 11783 + }, + { + "epoch": 1.0043467143952953, + "grad_norm": 161.29982288599794, + "learning_rate": 8.413159591291322e-06, + "loss": 3.396, + "step": 11784 + }, + { + "epoch": 1.0044319440893208, + "grad_norm": 33.30557000107089, + "learning_rate": 8.412797227568446e-06, + "loss": 3.1309, + "step": 11785 + }, + { + "epoch": 1.004517173783346, + "grad_norm": 45.187986785794855, + "learning_rate": 8.412434830282063e-06, + "loss": 2.5721, + "step": 11786 + }, + { + "epoch": 1.0046024034773715, + "grad_norm": 62.794338753610965, + "learning_rate": 8.412072399435742e-06, + "loss": 3.6655, + "step": 11787 + }, + { + "epoch": 1.004687633171397, + "grad_norm": 34.648743809659614, + "learning_rate": 8.411709935033044e-06, + "loss": 2.8315, + "step": 11788 + }, + { + "epoch": 1.0047728628654222, + "grad_norm": 66.42187263984235, + "learning_rate": 8.411347437077534e-06, + "loss": 3.3821, + "step": 11789 + }, + { + "epoch": 1.0048580925594477, + "grad_norm": 49.671630031745515, + "learning_rate": 8.410984905572776e-06, + "loss": 2.378, + "step": 11790 + }, + { + "epoch": 1.0049433222534732, + "grad_norm": 54.229260866000736, + "learning_rate": 8.410622340522341e-06, + "loss": 3.4493, + "step": 11791 + }, + { + "epoch": 1.0050285519474984, + "grad_norm": 50.40918180815792, + "learning_rate": 8.410259741929787e-06, + "loss": 2.5499, + "step": 11792 + }, + { + "epoch": 1.0051137816415239, + "grad_norm": 61.78971641175964, + "learning_rate": 8.409897109798684e-06, + "loss": 3.475, + "step": 11793 + }, + { + "epoch": 1.0051990113355493, + "grad_norm": 45.74321655965214, + "learning_rate": 8.4095344441326e-06, + "loss": 2.9174, + "step": 11794 + }, + { + "epoch": 1.0052842410295748, + "grad_norm": 40.58630161013869, + "learning_rate": 8.409171744935097e-06, + "loss": 3.0122, + "step": 11795 + }, + { + "epoch": 1.0053694707236, + "grad_norm": 72.36952972165258, + "learning_rate": 8.408809012209747e-06, + "loss": 3.3196, + "step": 11796 + }, + { + "epoch": 1.0054547004176255, + "grad_norm": 132.19878048853562, + "learning_rate": 8.408446245960112e-06, + "loss": 2.5521, + "step": 11797 + }, + { + "epoch": 1.005539930111651, + "grad_norm": 79.59562264243147, + "learning_rate": 8.408083446189765e-06, + "loss": 2.9469, + "step": 11798 + }, + { + "epoch": 1.0056251598056762, + "grad_norm": 34.8873075095413, + "learning_rate": 8.40772061290227e-06, + "loss": 1.5241, + "step": 11799 + }, + { + "epoch": 1.0057103894997017, + "grad_norm": 62.19939279615265, + "learning_rate": 8.407357746101198e-06, + "loss": 4.0373, + "step": 11800 + }, + { + "epoch": 1.0057956191937272, + "grad_norm": 55.9732218631737, + "learning_rate": 8.406994845790116e-06, + "loss": 2.3106, + "step": 11801 + }, + { + "epoch": 1.0058808488877524, + "grad_norm": 44.70104344093414, + "learning_rate": 8.406631911972593e-06, + "loss": 2.6848, + "step": 11802 + }, + { + "epoch": 1.005966078581778, + "grad_norm": 79.34230329090852, + "learning_rate": 8.406268944652199e-06, + "loss": 2.68, + "step": 11803 + }, + { + "epoch": 1.0060513082758034, + "grad_norm": 37.495771448559275, + "learning_rate": 8.405905943832505e-06, + "loss": 3.54, + "step": 11804 + }, + { + "epoch": 1.0061365379698286, + "grad_norm": 78.74482773140808, + "learning_rate": 8.405542909517076e-06, + "loss": 3.3979, + "step": 11805 + }, + { + "epoch": 1.006221767663854, + "grad_norm": 55.40685320530539, + "learning_rate": 8.40517984170949e-06, + "loss": 3.7887, + "step": 11806 + }, + { + "epoch": 1.0063069973578795, + "grad_norm": 37.842710999428995, + "learning_rate": 8.40481674041331e-06, + "loss": 3.1366, + "step": 11807 + }, + { + "epoch": 1.0063922270519048, + "grad_norm": 45.65186393048454, + "learning_rate": 8.404453605632112e-06, + "loss": 2.556, + "step": 11808 + }, + { + "epoch": 1.0064774567459303, + "grad_norm": 62.579597034469096, + "learning_rate": 8.404090437369466e-06, + "loss": 2.8011, + "step": 11809 + }, + { + "epoch": 1.0065626864399557, + "grad_norm": 69.81025953996306, + "learning_rate": 8.403727235628941e-06, + "loss": 2.3216, + "step": 11810 + }, + { + "epoch": 1.006647916133981, + "grad_norm": 79.2040088930018, + "learning_rate": 8.403364000414112e-06, + "loss": 3.0576, + "step": 11811 + }, + { + "epoch": 1.0067331458280064, + "grad_norm": 34.01446612168571, + "learning_rate": 8.403000731728552e-06, + "loss": 2.4834, + "step": 11812 + }, + { + "epoch": 1.006818375522032, + "grad_norm": 135.82159810284458, + "learning_rate": 8.40263742957583e-06, + "loss": 2.7206, + "step": 11813 + }, + { + "epoch": 1.0069036052160574, + "grad_norm": 57.467435772177225, + "learning_rate": 8.402274093959521e-06, + "loss": 3.5502, + "step": 11814 + }, + { + "epoch": 1.0069888349100826, + "grad_norm": 38.464565881190765, + "learning_rate": 8.4019107248832e-06, + "loss": 3.1072, + "step": 11815 + }, + { + "epoch": 1.007074064604108, + "grad_norm": 54.56190580576785, + "learning_rate": 8.401547322350439e-06, + "loss": 1.1905, + "step": 11816 + }, + { + "epoch": 1.0071592942981336, + "grad_norm": 46.447249898792805, + "learning_rate": 8.40118388636481e-06, + "loss": 3.076, + "step": 11817 + }, + { + "epoch": 1.0072445239921588, + "grad_norm": 80.66527427616957, + "learning_rate": 8.400820416929888e-06, + "loss": 2.7605, + "step": 11818 + }, + { + "epoch": 1.0073297536861843, + "grad_norm": 62.52585978199792, + "learning_rate": 8.40045691404925e-06, + "loss": 3.6504, + "step": 11819 + }, + { + "epoch": 1.0074149833802097, + "grad_norm": 70.22746598341024, + "learning_rate": 8.40009337772647e-06, + "loss": 2.9013, + "step": 11820 + }, + { + "epoch": 1.007500213074235, + "grad_norm": 41.51799547887639, + "learning_rate": 8.399729807965124e-06, + "loss": 2.0872, + "step": 11821 + }, + { + "epoch": 1.0075854427682605, + "grad_norm": 38.68191015260174, + "learning_rate": 8.399366204768783e-06, + "loss": 3.1009, + "step": 11822 + }, + { + "epoch": 1.007670672462286, + "grad_norm": 39.968422246568444, + "learning_rate": 8.399002568141028e-06, + "loss": 2.9174, + "step": 11823 + }, + { + "epoch": 1.0077559021563112, + "grad_norm": 87.62639172959209, + "learning_rate": 8.398638898085434e-06, + "loss": 2.3855, + "step": 11824 + }, + { + "epoch": 1.0078411318503366, + "grad_norm": 82.12400173213341, + "learning_rate": 8.398275194605576e-06, + "loss": 3.8342, + "step": 11825 + }, + { + "epoch": 1.007926361544362, + "grad_norm": 53.893537859687534, + "learning_rate": 8.397911457705032e-06, + "loss": 2.9342, + "step": 11826 + }, + { + "epoch": 1.0080115912383873, + "grad_norm": 89.13611735728713, + "learning_rate": 8.397547687387378e-06, + "loss": 3.6591, + "step": 11827 + }, + { + "epoch": 1.0080968209324128, + "grad_norm": 35.31937894314676, + "learning_rate": 8.397183883656195e-06, + "loss": 2.9233, + "step": 11828 + }, + { + "epoch": 1.0081820506264383, + "grad_norm": 134.07388590753055, + "learning_rate": 8.396820046515055e-06, + "loss": 3.3464, + "step": 11829 + }, + { + "epoch": 1.0082672803204638, + "grad_norm": 46.91740046374145, + "learning_rate": 8.396456175967542e-06, + "loss": 2.864, + "step": 11830 + }, + { + "epoch": 1.008352510014489, + "grad_norm": 36.205144003546536, + "learning_rate": 8.396092272017232e-06, + "loss": 2.5321, + "step": 11831 + }, + { + "epoch": 1.0084377397085145, + "grad_norm": 100.80033298781882, + "learning_rate": 8.395728334667703e-06, + "loss": 4.3832, + "step": 11832 + }, + { + "epoch": 1.00852296940254, + "grad_norm": 106.28660612277815, + "learning_rate": 8.395364363922536e-06, + "loss": 3.6596, + "step": 11833 + }, + { + "epoch": 1.0086081990965652, + "grad_norm": 40.62518312136755, + "learning_rate": 8.39500035978531e-06, + "loss": 2.8593, + "step": 11834 + }, + { + "epoch": 1.0086934287905907, + "grad_norm": 35.49202116005199, + "learning_rate": 8.394636322259602e-06, + "loss": 3.3032, + "step": 11835 + }, + { + "epoch": 1.0087786584846161, + "grad_norm": 51.99161590662399, + "learning_rate": 8.394272251348998e-06, + "loss": 3.2605, + "step": 11836 + }, + { + "epoch": 1.0088638881786414, + "grad_norm": 42.12646134230617, + "learning_rate": 8.393908147057072e-06, + "loss": 3.6083, + "step": 11837 + }, + { + "epoch": 1.0089491178726668, + "grad_norm": 44.9999102890555, + "learning_rate": 8.393544009387411e-06, + "loss": 2.7952, + "step": 11838 + }, + { + "epoch": 1.0090343475666923, + "grad_norm": 46.0318628168747, + "learning_rate": 8.39317983834359e-06, + "loss": 3.3837, + "step": 11839 + }, + { + "epoch": 1.0091195772607175, + "grad_norm": 23.952658611716558, + "learning_rate": 8.392815633929197e-06, + "loss": 2.4219, + "step": 11840 + }, + { + "epoch": 1.009204806954743, + "grad_norm": 67.74194061445864, + "learning_rate": 8.39245139614781e-06, + "loss": 2.8265, + "step": 11841 + }, + { + "epoch": 1.0092900366487685, + "grad_norm": 48.80135579853108, + "learning_rate": 8.392087125003008e-06, + "loss": 3.1415, + "step": 11842 + }, + { + "epoch": 1.0093752663427937, + "grad_norm": 28.258136128493838, + "learning_rate": 8.39172282049838e-06, + "loss": 1.9566, + "step": 11843 + }, + { + "epoch": 1.0094604960368192, + "grad_norm": 71.75568324257276, + "learning_rate": 8.391358482637506e-06, + "loss": 3.735, + "step": 11844 + }, + { + "epoch": 1.0095457257308447, + "grad_norm": 43.96126248622196, + "learning_rate": 8.390994111423967e-06, + "loss": 3.5903, + "step": 11845 + }, + { + "epoch": 1.00963095542487, + "grad_norm": 57.14193422588479, + "learning_rate": 8.39062970686135e-06, + "loss": 2.2765, + "step": 11846 + }, + { + "epoch": 1.0097161851188954, + "grad_norm": 37.03121929055126, + "learning_rate": 8.390265268953236e-06, + "loss": 2.2297, + "step": 11847 + }, + { + "epoch": 1.0098014148129209, + "grad_norm": 57.567963881469545, + "learning_rate": 8.389900797703211e-06, + "loss": 3.4807, + "step": 11848 + }, + { + "epoch": 1.0098866445069463, + "grad_norm": 78.08262376248787, + "learning_rate": 8.389536293114859e-06, + "loss": 3.5351, + "step": 11849 + }, + { + "epoch": 1.0099718742009716, + "grad_norm": 28.165157581552005, + "learning_rate": 8.389171755191763e-06, + "loss": 2.7238, + "step": 11850 + }, + { + "epoch": 1.010057103894997, + "grad_norm": 43.417933003895115, + "learning_rate": 8.38880718393751e-06, + "loss": 3.3144, + "step": 11851 + }, + { + "epoch": 1.0101423335890225, + "grad_norm": 44.6901689319238, + "learning_rate": 8.388442579355685e-06, + "loss": 2.5079, + "step": 11852 + }, + { + "epoch": 1.0102275632830477, + "grad_norm": 40.17181109516137, + "learning_rate": 8.388077941449874e-06, + "loss": 3.3151, + "step": 11853 + }, + { + "epoch": 1.0103127929770732, + "grad_norm": 40.43627748904806, + "learning_rate": 8.387713270223663e-06, + "loss": 3.4104, + "step": 11854 + }, + { + "epoch": 1.0103980226710987, + "grad_norm": 116.73394012258365, + "learning_rate": 8.387348565680637e-06, + "loss": 2.1341, + "step": 11855 + }, + { + "epoch": 1.010483252365124, + "grad_norm": 73.80596579421967, + "learning_rate": 8.386983827824386e-06, + "loss": 3.1754, + "step": 11856 + }, + { + "epoch": 1.0105684820591494, + "grad_norm": 55.20320433298376, + "learning_rate": 8.386619056658494e-06, + "loss": 3.5555, + "step": 11857 + }, + { + "epoch": 1.0106537117531749, + "grad_norm": 49.34257963060477, + "learning_rate": 8.386254252186549e-06, + "loss": 2.937, + "step": 11858 + }, + { + "epoch": 1.0107389414472001, + "grad_norm": 49.486326573343845, + "learning_rate": 8.385889414412139e-06, + "loss": 2.8902, + "step": 11859 + }, + { + "epoch": 1.0108241711412256, + "grad_norm": 37.24985500903645, + "learning_rate": 8.385524543338852e-06, + "loss": 2.8611, + "step": 11860 + }, + { + "epoch": 1.010909400835251, + "grad_norm": 44.30936508832461, + "learning_rate": 8.385159638970275e-06, + "loss": 3.0356, + "step": 11861 + }, + { + "epoch": 1.0109946305292763, + "grad_norm": 81.34484955007851, + "learning_rate": 8.384794701309999e-06, + "loss": 3.3781, + "step": 11862 + }, + { + "epoch": 1.0110798602233018, + "grad_norm": 87.79955346906387, + "learning_rate": 8.384429730361614e-06, + "loss": 2.6611, + "step": 11863 + }, + { + "epoch": 1.0111650899173272, + "grad_norm": 62.62186913147332, + "learning_rate": 8.384064726128706e-06, + "loss": 2.8366, + "step": 11864 + }, + { + "epoch": 1.0112503196113527, + "grad_norm": 51.72061649173721, + "learning_rate": 8.383699688614865e-06, + "loss": 2.688, + "step": 11865 + }, + { + "epoch": 1.011335549305378, + "grad_norm": 62.71929731278564, + "learning_rate": 8.383334617823685e-06, + "loss": 2.9285, + "step": 11866 + }, + { + "epoch": 1.0114207789994034, + "grad_norm": 26.860671963864345, + "learning_rate": 8.38296951375875e-06, + "loss": 2.0028, + "step": 11867 + }, + { + "epoch": 1.0115060086934289, + "grad_norm": 45.25716214564304, + "learning_rate": 8.382604376423655e-06, + "loss": 3.9385, + "step": 11868 + }, + { + "epoch": 1.0115912383874541, + "grad_norm": 49.3862105687932, + "learning_rate": 8.382239205821992e-06, + "loss": 3.6738, + "step": 11869 + }, + { + "epoch": 1.0116764680814796, + "grad_norm": 68.76395140166676, + "learning_rate": 8.381874001957348e-06, + "loss": 3.0755, + "step": 11870 + }, + { + "epoch": 1.011761697775505, + "grad_norm": 48.998205120263094, + "learning_rate": 8.381508764833319e-06, + "loss": 2.8202, + "step": 11871 + }, + { + "epoch": 1.0118469274695303, + "grad_norm": 39.142035190728826, + "learning_rate": 8.381143494453494e-06, + "loss": 2.9489, + "step": 11872 + }, + { + "epoch": 1.0119321571635558, + "grad_norm": 36.68975546139711, + "learning_rate": 8.380778190821467e-06, + "loss": 3.8244, + "step": 11873 + }, + { + "epoch": 1.0120173868575812, + "grad_norm": 126.09144611737784, + "learning_rate": 8.38041285394083e-06, + "loss": 3.275, + "step": 11874 + }, + { + "epoch": 1.0121026165516065, + "grad_norm": 52.973719266825526, + "learning_rate": 8.380047483815175e-06, + "loss": 2.6112, + "step": 11875 + }, + { + "epoch": 1.012187846245632, + "grad_norm": 47.69852045584731, + "learning_rate": 8.379682080448095e-06, + "loss": 2.9054, + "step": 11876 + }, + { + "epoch": 1.0122730759396574, + "grad_norm": 56.64890496521767, + "learning_rate": 8.379316643843187e-06, + "loss": 2.7893, + "step": 11877 + }, + { + "epoch": 1.0123583056336827, + "grad_norm": 42.12038283059093, + "learning_rate": 8.37895117400404e-06, + "loss": 3.5259, + "step": 11878 + }, + { + "epoch": 1.0124435353277081, + "grad_norm": 78.59627260151298, + "learning_rate": 8.378585670934254e-06, + "loss": 3.1634, + "step": 11879 + }, + { + "epoch": 1.0125287650217336, + "grad_norm": 25.198314003292978, + "learning_rate": 8.378220134637419e-06, + "loss": 2.9025, + "step": 11880 + }, + { + "epoch": 1.012613994715759, + "grad_norm": 58.01057856943254, + "learning_rate": 8.37785456511713e-06, + "loss": 4.0137, + "step": 11881 + }, + { + "epoch": 1.0126992244097843, + "grad_norm": 62.701006259240536, + "learning_rate": 8.377488962376986e-06, + "loss": 2.903, + "step": 11882 + }, + { + "epoch": 1.0127844541038098, + "grad_norm": 38.215509266418046, + "learning_rate": 8.377123326420578e-06, + "loss": 2.2338, + "step": 11883 + }, + { + "epoch": 1.0128696837978353, + "grad_norm": 113.71489076271125, + "learning_rate": 8.376757657251505e-06, + "loss": 4.1218, + "step": 11884 + }, + { + "epoch": 1.0129549134918605, + "grad_norm": 34.38301341979753, + "learning_rate": 8.376391954873362e-06, + "loss": 2.8988, + "step": 11885 + }, + { + "epoch": 1.013040143185886, + "grad_norm": 118.77006094907976, + "learning_rate": 8.376026219289746e-06, + "loss": 3.035, + "step": 11886 + }, + { + "epoch": 1.0131253728799114, + "grad_norm": 41.22560516450514, + "learning_rate": 8.375660450504252e-06, + "loss": 2.9024, + "step": 11887 + }, + { + "epoch": 1.0132106025739367, + "grad_norm": 48.53387764676707, + "learning_rate": 8.375294648520482e-06, + "loss": 3.3846, + "step": 11888 + }, + { + "epoch": 1.0132958322679622, + "grad_norm": 44.57360981146461, + "learning_rate": 8.374928813342029e-06, + "loss": 2.3373, + "step": 11889 + }, + { + "epoch": 1.0133810619619876, + "grad_norm": 61.34569237117728, + "learning_rate": 8.374562944972493e-06, + "loss": 2.6494, + "step": 11890 + }, + { + "epoch": 1.0134662916560129, + "grad_norm": 34.48070925078272, + "learning_rate": 8.374197043415469e-06, + "loss": 2.6314, + "step": 11891 + }, + { + "epoch": 1.0135515213500383, + "grad_norm": 41.1640147788467, + "learning_rate": 8.37383110867456e-06, + "loss": 3.4562, + "step": 11892 + }, + { + "epoch": 1.0136367510440638, + "grad_norm": 38.192383768605076, + "learning_rate": 8.37346514075336e-06, + "loss": 3.8856, + "step": 11893 + }, + { + "epoch": 1.013721980738089, + "grad_norm": 32.19946360895667, + "learning_rate": 8.373099139655474e-06, + "loss": 2.5541, + "step": 11894 + }, + { + "epoch": 1.0138072104321145, + "grad_norm": 33.715000224398146, + "learning_rate": 8.372733105384496e-06, + "loss": 2.841, + "step": 11895 + }, + { + "epoch": 1.01389244012614, + "grad_norm": 31.540018895001182, + "learning_rate": 8.37236703794403e-06, + "loss": 2.6193, + "step": 11896 + }, + { + "epoch": 1.0139776698201652, + "grad_norm": 47.63319393651158, + "learning_rate": 8.372000937337671e-06, + "loss": 2.8849, + "step": 11897 + }, + { + "epoch": 1.0140628995141907, + "grad_norm": 44.48890578804555, + "learning_rate": 8.371634803569026e-06, + "loss": 2.5512, + "step": 11898 + }, + { + "epoch": 1.0141481292082162, + "grad_norm": 30.393705446352996, + "learning_rate": 8.371268636641691e-06, + "loss": 2.6279, + "step": 11899 + }, + { + "epoch": 1.0142333589022416, + "grad_norm": 60.01733520489234, + "learning_rate": 8.37090243655927e-06, + "loss": 2.8484, + "step": 11900 + }, + { + "epoch": 1.014318588596267, + "grad_norm": 98.9866044079477, + "learning_rate": 8.37053620332536e-06, + "loss": 2.7572, + "step": 11901 + }, + { + "epoch": 1.0144038182902924, + "grad_norm": 35.224192037609306, + "learning_rate": 8.37016993694357e-06, + "loss": 2.3763, + "step": 11902 + }, + { + "epoch": 1.0144890479843178, + "grad_norm": 70.87671877903946, + "learning_rate": 8.369803637417494e-06, + "loss": 3.7533, + "step": 11903 + }, + { + "epoch": 1.014574277678343, + "grad_norm": 34.13306765660377, + "learning_rate": 8.369437304750741e-06, + "loss": 2.3285, + "step": 11904 + }, + { + "epoch": 1.0146595073723685, + "grad_norm": 37.555541122594036, + "learning_rate": 8.36907093894691e-06, + "loss": 2.8139, + "step": 11905 + }, + { + "epoch": 1.014744737066394, + "grad_norm": 44.92752606732865, + "learning_rate": 8.368704540009605e-06, + "loss": 2.4922, + "step": 11906 + }, + { + "epoch": 1.0148299667604193, + "grad_norm": 69.61516281383943, + "learning_rate": 8.36833810794243e-06, + "loss": 2.1507, + "step": 11907 + }, + { + "epoch": 1.0149151964544447, + "grad_norm": 24.249537940761112, + "learning_rate": 8.367971642748986e-06, + "loss": 2.3322, + "step": 11908 + }, + { + "epoch": 1.0150004261484702, + "grad_norm": 46.002418160054035, + "learning_rate": 8.367605144432881e-06, + "loss": 2.4981, + "step": 11909 + }, + { + "epoch": 1.0150856558424954, + "grad_norm": 152.05255604062927, + "learning_rate": 8.367238612997718e-06, + "loss": 4.1298, + "step": 11910 + }, + { + "epoch": 1.015170885536521, + "grad_norm": 60.56121297526588, + "learning_rate": 8.3668720484471e-06, + "loss": 3.0944, + "step": 11911 + }, + { + "epoch": 1.0152561152305464, + "grad_norm": 37.408306770903714, + "learning_rate": 8.366505450784633e-06, + "loss": 2.8407, + "step": 11912 + }, + { + "epoch": 1.0153413449245716, + "grad_norm": 64.48721570780701, + "learning_rate": 8.366138820013924e-06, + "loss": 2.9306, + "step": 11913 + }, + { + "epoch": 1.015426574618597, + "grad_norm": 48.884905284911795, + "learning_rate": 8.365772156138576e-06, + "loss": 3.3747, + "step": 11914 + }, + { + "epoch": 1.0155118043126226, + "grad_norm": 36.64970930118617, + "learning_rate": 8.365405459162197e-06, + "loss": 1.2119, + "step": 11915 + }, + { + "epoch": 1.015597034006648, + "grad_norm": 55.474459889448234, + "learning_rate": 8.365038729088393e-06, + "loss": 3.1944, + "step": 11916 + }, + { + "epoch": 1.0156822637006733, + "grad_norm": 58.261853268592255, + "learning_rate": 8.36467196592077e-06, + "loss": 3.6196, + "step": 11917 + }, + { + "epoch": 1.0157674933946987, + "grad_norm": 40.253587743019175, + "learning_rate": 8.364305169662935e-06, + "loss": 2.4655, + "step": 11918 + }, + { + "epoch": 1.0158527230887242, + "grad_norm": 32.43604712570426, + "learning_rate": 8.363938340318496e-06, + "loss": 2.3871, + "step": 11919 + }, + { + "epoch": 1.0159379527827495, + "grad_norm": 36.996788574749225, + "learning_rate": 8.363571477891059e-06, + "loss": 2.5138, + "step": 11920 + }, + { + "epoch": 1.016023182476775, + "grad_norm": 32.62410492288877, + "learning_rate": 8.363204582384232e-06, + "loss": 3.19, + "step": 11921 + }, + { + "epoch": 1.0161084121708004, + "grad_norm": 40.34523205476701, + "learning_rate": 8.362837653801627e-06, + "loss": 2.6444, + "step": 11922 + }, + { + "epoch": 1.0161936418648256, + "grad_norm": 52.731690909404264, + "learning_rate": 8.362470692146848e-06, + "loss": 2.8947, + "step": 11923 + }, + { + "epoch": 1.016278871558851, + "grad_norm": 61.83186168910114, + "learning_rate": 8.362103697423509e-06, + "loss": 3.6219, + "step": 11924 + }, + { + "epoch": 1.0163641012528766, + "grad_norm": 76.373202582022, + "learning_rate": 8.361736669635212e-06, + "loss": 1.9373, + "step": 11925 + }, + { + "epoch": 1.0164493309469018, + "grad_norm": 52.92808106428857, + "learning_rate": 8.361369608785574e-06, + "loss": 2.068, + "step": 11926 + }, + { + "epoch": 1.0165345606409273, + "grad_norm": 124.86541835604223, + "learning_rate": 8.3610025148782e-06, + "loss": 3.0433, + "step": 11927 + }, + { + "epoch": 1.0166197903349528, + "grad_norm": 70.38469414648408, + "learning_rate": 8.360635387916702e-06, + "loss": 3.4406, + "step": 11928 + }, + { + "epoch": 1.016705020028978, + "grad_norm": 69.9508701963521, + "learning_rate": 8.36026822790469e-06, + "loss": 3.0674, + "step": 11929 + }, + { + "epoch": 1.0167902497230035, + "grad_norm": 34.83903560778063, + "learning_rate": 8.359901034845774e-06, + "loss": 2.6178, + "step": 11930 + }, + { + "epoch": 1.016875479417029, + "grad_norm": 80.22742939338384, + "learning_rate": 8.35953380874357e-06, + "loss": 2.8143, + "step": 11931 + }, + { + "epoch": 1.0169607091110542, + "grad_norm": 35.10683779631871, + "learning_rate": 8.359166549601683e-06, + "loss": 3.1849, + "step": 11932 + }, + { + "epoch": 1.0170459388050797, + "grad_norm": 354.7634050844316, + "learning_rate": 8.358799257423729e-06, + "loss": 2.7702, + "step": 11933 + }, + { + "epoch": 1.0171311684991051, + "grad_norm": 71.30314061351247, + "learning_rate": 8.358431932213317e-06, + "loss": 3.1125, + "step": 11934 + }, + { + "epoch": 1.0172163981931306, + "grad_norm": 66.20405120485066, + "learning_rate": 8.358064573974064e-06, + "loss": 3.7804, + "step": 11935 + }, + { + "epoch": 1.0173016278871558, + "grad_norm": 22.26719143993813, + "learning_rate": 8.357697182709579e-06, + "loss": 0.9469, + "step": 11936 + }, + { + "epoch": 1.0173868575811813, + "grad_norm": 29.65297635169156, + "learning_rate": 8.357329758423477e-06, + "loss": 2.5206, + "step": 11937 + }, + { + "epoch": 1.0174720872752068, + "grad_norm": 55.386415620194136, + "learning_rate": 8.35696230111937e-06, + "loss": 2.5646, + "step": 11938 + }, + { + "epoch": 1.017557316969232, + "grad_norm": 71.17948604420422, + "learning_rate": 8.356594810800873e-06, + "loss": 3.2769, + "step": 11939 + }, + { + "epoch": 1.0176425466632575, + "grad_norm": 28.779051596925655, + "learning_rate": 8.356227287471602e-06, + "loss": 2.7562, + "step": 11940 + }, + { + "epoch": 1.017727776357283, + "grad_norm": 76.53650438833319, + "learning_rate": 8.355859731135166e-06, + "loss": 2.8871, + "step": 11941 + }, + { + "epoch": 1.0178130060513082, + "grad_norm": 47.47397205276545, + "learning_rate": 8.355492141795185e-06, + "loss": 3.3207, + "step": 11942 + }, + { + "epoch": 1.0178982357453337, + "grad_norm": 55.164384063022645, + "learning_rate": 8.35512451945527e-06, + "loss": 3.1226, + "step": 11943 + }, + { + "epoch": 1.0179834654393591, + "grad_norm": 39.544851376783335, + "learning_rate": 8.354756864119041e-06, + "loss": 3.3188, + "step": 11944 + }, + { + "epoch": 1.0180686951333844, + "grad_norm": 34.07651551252048, + "learning_rate": 8.35438917579011e-06, + "loss": 2.84, + "step": 11945 + }, + { + "epoch": 1.0181539248274099, + "grad_norm": 73.98924440666518, + "learning_rate": 8.354021454472095e-06, + "loss": 2.9985, + "step": 11946 + }, + { + "epoch": 1.0182391545214353, + "grad_norm": 150.65262198323953, + "learning_rate": 8.35365370016861e-06, + "loss": 3.8928, + "step": 11947 + }, + { + "epoch": 1.0183243842154606, + "grad_norm": 55.536435762141146, + "learning_rate": 8.353285912883278e-06, + "loss": 3.3883, + "step": 11948 + }, + { + "epoch": 1.018409613909486, + "grad_norm": 33.437094219905816, + "learning_rate": 8.352918092619707e-06, + "loss": 3.2298, + "step": 11949 + }, + { + "epoch": 1.0184948436035115, + "grad_norm": 36.19835181942524, + "learning_rate": 8.352550239381521e-06, + "loss": 3.814, + "step": 11950 + }, + { + "epoch": 1.018580073297537, + "grad_norm": 47.080998641299416, + "learning_rate": 8.352182353172334e-06, + "loss": 2.8158, + "step": 11951 + }, + { + "epoch": 1.0186653029915622, + "grad_norm": 53.2234337782961, + "learning_rate": 8.351814433995767e-06, + "loss": 2.8536, + "step": 11952 + }, + { + "epoch": 1.0187505326855877, + "grad_norm": 79.65044428744245, + "learning_rate": 8.351446481855436e-06, + "loss": 2.6159, + "step": 11953 + }, + { + "epoch": 1.0188357623796132, + "grad_norm": 75.32449422093963, + "learning_rate": 8.351078496754962e-06, + "loss": 3.1276, + "step": 11954 + }, + { + "epoch": 1.0189209920736384, + "grad_norm": 57.533305412648645, + "learning_rate": 8.350710478697962e-06, + "loss": 3.7392, + "step": 11955 + }, + { + "epoch": 1.0190062217676639, + "grad_norm": 66.4873775052634, + "learning_rate": 8.350342427688054e-06, + "loss": 3.6075, + "step": 11956 + }, + { + "epoch": 1.0190914514616893, + "grad_norm": 28.125937182661136, + "learning_rate": 8.349974343728862e-06, + "loss": 2.0822, + "step": 11957 + }, + { + "epoch": 1.0191766811557146, + "grad_norm": 73.04915644043759, + "learning_rate": 8.349606226824002e-06, + "loss": 3.6436, + "step": 11958 + }, + { + "epoch": 1.01926191084974, + "grad_norm": 71.56102744208363, + "learning_rate": 8.349238076977097e-06, + "loss": 4.1744, + "step": 11959 + }, + { + "epoch": 1.0193471405437655, + "grad_norm": 30.510049635409285, + "learning_rate": 8.348869894191765e-06, + "loss": 2.1516, + "step": 11960 + }, + { + "epoch": 1.0194323702377908, + "grad_norm": 120.68071762093298, + "learning_rate": 8.34850167847163e-06, + "loss": 2.9233, + "step": 11961 + }, + { + "epoch": 1.0195175999318162, + "grad_norm": 33.666412710266606, + "learning_rate": 8.348133429820309e-06, + "loss": 3.5357, + "step": 11962 + }, + { + "epoch": 1.0196028296258417, + "grad_norm": 97.45853718134107, + "learning_rate": 8.347765148241429e-06, + "loss": 2.9795, + "step": 11963 + }, + { + "epoch": 1.019688059319867, + "grad_norm": 142.00598974224982, + "learning_rate": 8.347396833738606e-06, + "loss": 3.8161, + "step": 11964 + }, + { + "epoch": 1.0197732890138924, + "grad_norm": 40.05126282101368, + "learning_rate": 8.347028486315467e-06, + "loss": 2.9449, + "step": 11965 + }, + { + "epoch": 1.0198585187079179, + "grad_norm": 48.68350234296137, + "learning_rate": 8.346660105975633e-06, + "loss": 3.4528, + "step": 11966 + }, + { + "epoch": 1.0199437484019431, + "grad_norm": 126.63450420619617, + "learning_rate": 8.346291692722728e-06, + "loss": 3.5125, + "step": 11967 + }, + { + "epoch": 1.0200289780959686, + "grad_norm": 95.01082007880747, + "learning_rate": 8.34592324656037e-06, + "loss": 3.3785, + "step": 11968 + }, + { + "epoch": 1.020114207789994, + "grad_norm": 53.29411232041607, + "learning_rate": 8.34555476749219e-06, + "loss": 2.6555, + "step": 11969 + }, + { + "epoch": 1.0201994374840195, + "grad_norm": 57.881965974499664, + "learning_rate": 8.345186255521806e-06, + "loss": 3.4598, + "step": 11970 + }, + { + "epoch": 1.0202846671780448, + "grad_norm": 68.55780987691314, + "learning_rate": 8.344817710652847e-06, + "loss": 2.7919, + "step": 11971 + }, + { + "epoch": 1.0203698968720702, + "grad_norm": 59.59549544065441, + "learning_rate": 8.344449132888932e-06, + "loss": 2.453, + "step": 11972 + }, + { + "epoch": 1.0204551265660957, + "grad_norm": 51.006551732599405, + "learning_rate": 8.34408052223369e-06, + "loss": 3.4937, + "step": 11973 + }, + { + "epoch": 1.020540356260121, + "grad_norm": 36.441634392139285, + "learning_rate": 8.343711878690746e-06, + "loss": 2.7734, + "step": 11974 + }, + { + "epoch": 1.0206255859541464, + "grad_norm": 35.7225535765648, + "learning_rate": 8.343343202263722e-06, + "loss": 2.8239, + "step": 11975 + }, + { + "epoch": 1.020710815648172, + "grad_norm": 25.5158595796291, + "learning_rate": 8.342974492956248e-06, + "loss": 1.461, + "step": 11976 + }, + { + "epoch": 1.0207960453421971, + "grad_norm": 45.648844176484594, + "learning_rate": 8.342605750771948e-06, + "loss": 2.5045, + "step": 11977 + }, + { + "epoch": 1.0208812750362226, + "grad_norm": 37.254280313712655, + "learning_rate": 8.342236975714448e-06, + "loss": 2.8637, + "step": 11978 + }, + { + "epoch": 1.020966504730248, + "grad_norm": 44.24953467732467, + "learning_rate": 8.341868167787376e-06, + "loss": 2.8455, + "step": 11979 + }, + { + "epoch": 1.0210517344242733, + "grad_norm": 52.640530010698264, + "learning_rate": 8.341499326994357e-06, + "loss": 2.2946, + "step": 11980 + }, + { + "epoch": 1.0211369641182988, + "grad_norm": 49.48567708462841, + "learning_rate": 8.34113045333902e-06, + "loss": 4.3302, + "step": 11981 + }, + { + "epoch": 1.0212221938123243, + "grad_norm": 34.20554273638565, + "learning_rate": 8.340761546824996e-06, + "loss": 2.7765, + "step": 11982 + }, + { + "epoch": 1.0213074235063495, + "grad_norm": 65.61927887007245, + "learning_rate": 8.340392607455908e-06, + "loss": 4.6208, + "step": 11983 + }, + { + "epoch": 1.021392653200375, + "grad_norm": 59.70169781660392, + "learning_rate": 8.340023635235384e-06, + "loss": 3.3727, + "step": 11984 + }, + { + "epoch": 1.0214778828944004, + "grad_norm": 64.80301401903475, + "learning_rate": 8.339654630167056e-06, + "loss": 3.4048, + "step": 11985 + }, + { + "epoch": 1.021563112588426, + "grad_norm": 45.08764390523909, + "learning_rate": 8.339285592254553e-06, + "loss": 1.5458, + "step": 11986 + }, + { + "epoch": 1.0216483422824512, + "grad_norm": 38.65116513079606, + "learning_rate": 8.3389165215015e-06, + "loss": 2.1609, + "step": 11987 + }, + { + "epoch": 1.0217335719764766, + "grad_norm": 124.05542062671185, + "learning_rate": 8.338547417911533e-06, + "loss": 2.6138, + "step": 11988 + }, + { + "epoch": 1.021818801670502, + "grad_norm": 68.20225450150072, + "learning_rate": 8.338178281488278e-06, + "loss": 4.142, + "step": 11989 + }, + { + "epoch": 1.0219040313645273, + "grad_norm": 42.15173990845635, + "learning_rate": 8.337809112235365e-06, + "loss": 2.7802, + "step": 11990 + }, + { + "epoch": 1.0219892610585528, + "grad_norm": 74.0734684279512, + "learning_rate": 8.337439910156427e-06, + "loss": 3.4816, + "step": 11991 + }, + { + "epoch": 1.0220744907525783, + "grad_norm": 62.360832193600785, + "learning_rate": 8.337070675255092e-06, + "loss": 2.4664, + "step": 11992 + }, + { + "epoch": 1.0221597204466035, + "grad_norm": 29.142678893039836, + "learning_rate": 8.336701407534995e-06, + "loss": 2.7977, + "step": 11993 + }, + { + "epoch": 1.022244950140629, + "grad_norm": 66.66778020342284, + "learning_rate": 8.336332106999764e-06, + "loss": 2.6002, + "step": 11994 + }, + { + "epoch": 1.0223301798346545, + "grad_norm": 91.27456726506765, + "learning_rate": 8.335962773653032e-06, + "loss": 2.4939, + "step": 11995 + }, + { + "epoch": 1.0224154095286797, + "grad_norm": 45.6859503163368, + "learning_rate": 8.335593407498433e-06, + "loss": 2.2703, + "step": 11996 + }, + { + "epoch": 1.0225006392227052, + "grad_norm": 63.78433602491671, + "learning_rate": 8.335224008539598e-06, + "loss": 1.6989, + "step": 11997 + }, + { + "epoch": 1.0225858689167306, + "grad_norm": 70.54161171893968, + "learning_rate": 8.33485457678016e-06, + "loss": 2.6441, + "step": 11998 + }, + { + "epoch": 1.022671098610756, + "grad_norm": 47.737311223165484, + "learning_rate": 8.334485112223753e-06, + "loss": 3.2917, + "step": 11999 + }, + { + "epoch": 1.0227563283047814, + "grad_norm": 73.4590153803666, + "learning_rate": 8.33411561487401e-06, + "loss": 2.9723, + "step": 12000 + }, + { + "epoch": 1.0228415579988068, + "grad_norm": 55.68164565647488, + "learning_rate": 8.333746084734562e-06, + "loss": 2.638, + "step": 12001 + }, + { + "epoch": 1.022926787692832, + "grad_norm": 31.675132427498138, + "learning_rate": 8.333376521809048e-06, + "loss": 2.9389, + "step": 12002 + }, + { + "epoch": 1.0230120173868575, + "grad_norm": 43.97718354166046, + "learning_rate": 8.333006926101102e-06, + "loss": 3.6774, + "step": 12003 + }, + { + "epoch": 1.023097247080883, + "grad_norm": 46.754740818457904, + "learning_rate": 8.332637297614355e-06, + "loss": 3.963, + "step": 12004 + }, + { + "epoch": 1.0231824767749085, + "grad_norm": 45.93702269720376, + "learning_rate": 8.332267636352444e-06, + "loss": 3.4095, + "step": 12005 + }, + { + "epoch": 1.0232677064689337, + "grad_norm": 59.8108753192952, + "learning_rate": 8.331897942319006e-06, + "loss": 3.8245, + "step": 12006 + }, + { + "epoch": 1.0233529361629592, + "grad_norm": 40.16143314784885, + "learning_rate": 8.331528215517677e-06, + "loss": 3.7469, + "step": 12007 + }, + { + "epoch": 1.0234381658569847, + "grad_norm": 100.61400617706637, + "learning_rate": 8.33115845595209e-06, + "loss": 3.0512, + "step": 12008 + }, + { + "epoch": 1.02352339555101, + "grad_norm": 46.6261129295404, + "learning_rate": 8.330788663625884e-06, + "loss": 3.1204, + "step": 12009 + }, + { + "epoch": 1.0236086252450354, + "grad_norm": 89.50252740949149, + "learning_rate": 8.330418838542696e-06, + "loss": 5.0063, + "step": 12010 + }, + { + "epoch": 1.0236938549390608, + "grad_norm": 36.00387053812735, + "learning_rate": 8.33004898070616e-06, + "loss": 2.3764, + "step": 12011 + }, + { + "epoch": 1.023779084633086, + "grad_norm": 65.68598691090077, + "learning_rate": 8.329679090119918e-06, + "loss": 3.5188, + "step": 12012 + }, + { + "epoch": 1.0238643143271116, + "grad_norm": 42.58843296995172, + "learning_rate": 8.329309166787602e-06, + "loss": 3.3917, + "step": 12013 + }, + { + "epoch": 1.023949544021137, + "grad_norm": 60.975726910220665, + "learning_rate": 8.328939210712857e-06, + "loss": 2.2708, + "step": 12014 + }, + { + "epoch": 1.0240347737151623, + "grad_norm": 26.351851704316008, + "learning_rate": 8.328569221899315e-06, + "loss": 2.0519, + "step": 12015 + }, + { + "epoch": 1.0241200034091877, + "grad_norm": 38.970246121549714, + "learning_rate": 8.328199200350619e-06, + "loss": 2.781, + "step": 12016 + }, + { + "epoch": 1.0242052331032132, + "grad_norm": 33.56528675487952, + "learning_rate": 8.327829146070406e-06, + "loss": 1.6213, + "step": 12017 + }, + { + "epoch": 1.0242904627972385, + "grad_norm": 41.520667698051994, + "learning_rate": 8.327459059062314e-06, + "loss": 2.189, + "step": 12018 + }, + { + "epoch": 1.024375692491264, + "grad_norm": 36.016860138864516, + "learning_rate": 8.327088939329986e-06, + "loss": 2.5897, + "step": 12019 + }, + { + "epoch": 1.0244609221852894, + "grad_norm": 41.30154933455078, + "learning_rate": 8.326718786877059e-06, + "loss": 2.9835, + "step": 12020 + }, + { + "epoch": 1.0245461518793149, + "grad_norm": 29.969246409485297, + "learning_rate": 8.326348601707174e-06, + "loss": 1.9649, + "step": 12021 + }, + { + "epoch": 1.02463138157334, + "grad_norm": 30.905775572322874, + "learning_rate": 8.325978383823975e-06, + "loss": 2.4608, + "step": 12022 + }, + { + "epoch": 1.0247166112673656, + "grad_norm": 36.18749610081567, + "learning_rate": 8.325608133231099e-06, + "loss": 2.2164, + "step": 12023 + }, + { + "epoch": 1.024801840961391, + "grad_norm": 44.75772931134985, + "learning_rate": 8.325237849932188e-06, + "loss": 2.8359, + "step": 12024 + }, + { + "epoch": 1.0248870706554163, + "grad_norm": 105.17309077715474, + "learning_rate": 8.324867533930885e-06, + "loss": 2.7522, + "step": 12025 + }, + { + "epoch": 1.0249723003494418, + "grad_norm": 58.11475320374627, + "learning_rate": 8.324497185230832e-06, + "loss": 2.7369, + "step": 12026 + }, + { + "epoch": 1.0250575300434672, + "grad_norm": 39.79806568239663, + "learning_rate": 8.324126803835668e-06, + "loss": 2.0449, + "step": 12027 + }, + { + "epoch": 1.0251427597374925, + "grad_norm": 149.12534437672713, + "learning_rate": 8.323756389749037e-06, + "loss": 2.1207, + "step": 12028 + }, + { + "epoch": 1.025227989431518, + "grad_norm": 60.110376663089404, + "learning_rate": 8.323385942974585e-06, + "loss": 3.0433, + "step": 12029 + }, + { + "epoch": 1.0253132191255434, + "grad_norm": 26.800261086385717, + "learning_rate": 8.32301546351595e-06, + "loss": 1.5612, + "step": 12030 + }, + { + "epoch": 1.0253984488195687, + "grad_norm": 127.12279702405927, + "learning_rate": 8.322644951376782e-06, + "loss": 3.5096, + "step": 12031 + }, + { + "epoch": 1.0254836785135941, + "grad_norm": 71.73789828625517, + "learning_rate": 8.32227440656072e-06, + "loss": 3.0356, + "step": 12032 + }, + { + "epoch": 1.0255689082076196, + "grad_norm": 42.875077933528445, + "learning_rate": 8.321903829071408e-06, + "loss": 2.766, + "step": 12033 + }, + { + "epoch": 1.0256541379016448, + "grad_norm": 40.438345614111704, + "learning_rate": 8.321533218912492e-06, + "loss": 2.6417, + "step": 12034 + }, + { + "epoch": 1.0257393675956703, + "grad_norm": 57.81012977336028, + "learning_rate": 8.321162576087618e-06, + "loss": 2.6819, + "step": 12035 + }, + { + "epoch": 1.0258245972896958, + "grad_norm": 45.88918544223887, + "learning_rate": 8.32079190060043e-06, + "loss": 3.0564, + "step": 12036 + }, + { + "epoch": 1.025909826983721, + "grad_norm": 86.21657904074722, + "learning_rate": 8.320421192454571e-06, + "loss": 3.1222, + "step": 12037 + }, + { + "epoch": 1.0259950566777465, + "grad_norm": 129.309726656141, + "learning_rate": 8.32005045165369e-06, + "loss": 3.0698, + "step": 12038 + }, + { + "epoch": 1.026080286371772, + "grad_norm": 56.429247521654375, + "learning_rate": 8.319679678201432e-06, + "loss": 3.2758, + "step": 12039 + }, + { + "epoch": 1.0261655160657974, + "grad_norm": 47.90137718273909, + "learning_rate": 8.319308872101443e-06, + "loss": 2.2561, + "step": 12040 + }, + { + "epoch": 1.0262507457598227, + "grad_norm": 51.24999577375799, + "learning_rate": 8.318938033357372e-06, + "loss": 3.6064, + "step": 12041 + }, + { + "epoch": 1.0263359754538481, + "grad_norm": 55.75453080553084, + "learning_rate": 8.318567161972864e-06, + "loss": 3.2209, + "step": 12042 + }, + { + "epoch": 1.0264212051478736, + "grad_norm": 47.302897708394376, + "learning_rate": 8.318196257951565e-06, + "loss": 2.8715, + "step": 12043 + }, + { + "epoch": 1.0265064348418989, + "grad_norm": 51.30399873827621, + "learning_rate": 8.317825321297126e-06, + "loss": 2.2445, + "step": 12044 + }, + { + "epoch": 1.0265916645359243, + "grad_norm": 39.454631087151576, + "learning_rate": 8.317454352013192e-06, + "loss": 2.8715, + "step": 12045 + }, + { + "epoch": 1.0266768942299498, + "grad_norm": 21.04171520343813, + "learning_rate": 8.317083350103414e-06, + "loss": 1.2495, + "step": 12046 + }, + { + "epoch": 1.026762123923975, + "grad_norm": 89.30585292173834, + "learning_rate": 8.316712315571437e-06, + "loss": 4.8021, + "step": 12047 + }, + { + "epoch": 1.0268473536180005, + "grad_norm": 27.476643603398653, + "learning_rate": 8.316341248420914e-06, + "loss": 2.8705, + "step": 12048 + }, + { + "epoch": 1.026932583312026, + "grad_norm": 31.39488923162237, + "learning_rate": 8.315970148655493e-06, + "loss": 2.6723, + "step": 12049 + }, + { + "epoch": 1.0270178130060512, + "grad_norm": 73.1884282079927, + "learning_rate": 8.315599016278822e-06, + "loss": 3.0645, + "step": 12050 + }, + { + "epoch": 1.0271030427000767, + "grad_norm": 66.2693846376388, + "learning_rate": 8.315227851294553e-06, + "loss": 3.0912, + "step": 12051 + }, + { + "epoch": 1.0271882723941022, + "grad_norm": 36.41651069569498, + "learning_rate": 8.314856653706336e-06, + "loss": 2.9135, + "step": 12052 + }, + { + "epoch": 1.0272735020881274, + "grad_norm": 38.67795261562982, + "learning_rate": 8.314485423517819e-06, + "loss": 2.7374, + "step": 12053 + }, + { + "epoch": 1.0273587317821529, + "grad_norm": 27.91265909162827, + "learning_rate": 8.314114160732658e-06, + "loss": 2.7201, + "step": 12054 + }, + { + "epoch": 1.0274439614761783, + "grad_norm": 74.48789179261122, + "learning_rate": 8.3137428653545e-06, + "loss": 2.7952, + "step": 12055 + }, + { + "epoch": 1.0275291911702038, + "grad_norm": 35.01211622939492, + "learning_rate": 8.313371537386995e-06, + "loss": 2.9157, + "step": 12056 + }, + { + "epoch": 1.027614420864229, + "grad_norm": 46.616256815625675, + "learning_rate": 8.313000176833801e-06, + "loss": 2.5103, + "step": 12057 + }, + { + "epoch": 1.0276996505582545, + "grad_norm": 50.38354049487272, + "learning_rate": 8.312628783698565e-06, + "loss": 2.6798, + "step": 12058 + }, + { + "epoch": 1.02778488025228, + "grad_norm": 67.21983910198462, + "learning_rate": 8.31225735798494e-06, + "loss": 3.668, + "step": 12059 + }, + { + "epoch": 1.0278701099463052, + "grad_norm": 41.92208785384545, + "learning_rate": 8.311885899696582e-06, + "loss": 3.005, + "step": 12060 + }, + { + "epoch": 1.0279553396403307, + "grad_norm": 89.00338337196803, + "learning_rate": 8.311514408837142e-06, + "loss": 2.9891, + "step": 12061 + }, + { + "epoch": 1.0280405693343562, + "grad_norm": 49.48911778365489, + "learning_rate": 8.311142885410273e-06, + "loss": 3.6685, + "step": 12062 + }, + { + "epoch": 1.0281257990283814, + "grad_norm": 52.58560213148292, + "learning_rate": 8.31077132941963e-06, + "loss": 2.9745, + "step": 12063 + }, + { + "epoch": 1.0282110287224069, + "grad_norm": 52.31550868792085, + "learning_rate": 8.310399740868866e-06, + "loss": 3.1633, + "step": 12064 + }, + { + "epoch": 1.0282962584164324, + "grad_norm": 40.776796960870186, + "learning_rate": 8.310028119761638e-06, + "loss": 2.7213, + "step": 12065 + }, + { + "epoch": 1.0283814881104576, + "grad_norm": 40.43735546819658, + "learning_rate": 8.309656466101596e-06, + "loss": 2.3147, + "step": 12066 + }, + { + "epoch": 1.028466717804483, + "grad_norm": 31.537628463058187, + "learning_rate": 8.309284779892398e-06, + "loss": 3.0633, + "step": 12067 + }, + { + "epoch": 1.0285519474985085, + "grad_norm": 45.80619658833765, + "learning_rate": 8.3089130611377e-06, + "loss": 3.5824, + "step": 12068 + }, + { + "epoch": 1.0286371771925338, + "grad_norm": 106.1214819715821, + "learning_rate": 8.308541309841158e-06, + "loss": 4.4241, + "step": 12069 + }, + { + "epoch": 1.0287224068865592, + "grad_norm": 55.4619701091259, + "learning_rate": 8.308169526006425e-06, + "loss": 3.5519, + "step": 12070 + }, + { + "epoch": 1.0288076365805847, + "grad_norm": 39.454128039252005, + "learning_rate": 8.30779770963716e-06, + "loss": 3.0177, + "step": 12071 + }, + { + "epoch": 1.0288928662746102, + "grad_norm": 61.841616315692534, + "learning_rate": 8.30742586073702e-06, + "loss": 2.8892, + "step": 12072 + }, + { + "epoch": 1.0289780959686354, + "grad_norm": 45.745934024737856, + "learning_rate": 8.307053979309658e-06, + "loss": 2.8734, + "step": 12073 + }, + { + "epoch": 1.029063325662661, + "grad_norm": 32.46378440355535, + "learning_rate": 8.306682065358736e-06, + "loss": 1.9293, + "step": 12074 + }, + { + "epoch": 1.0291485553566864, + "grad_norm": 62.82925421679311, + "learning_rate": 8.30631011888791e-06, + "loss": 4.4954, + "step": 12075 + }, + { + "epoch": 1.0292337850507116, + "grad_norm": 50.83987653254775, + "learning_rate": 8.305938139900837e-06, + "loss": 2.4312, + "step": 12076 + }, + { + "epoch": 1.029319014744737, + "grad_norm": 58.54244026731671, + "learning_rate": 8.305566128401178e-06, + "loss": 3.1698, + "step": 12077 + }, + { + "epoch": 1.0294042444387626, + "grad_norm": 60.90974157840569, + "learning_rate": 8.305194084392588e-06, + "loss": 3.7036, + "step": 12078 + }, + { + "epoch": 1.0294894741327878, + "grad_norm": 53.50700031275279, + "learning_rate": 8.304822007878728e-06, + "loss": 3.5482, + "step": 12079 + }, + { + "epoch": 1.0295747038268133, + "grad_norm": 75.41780938783552, + "learning_rate": 8.304449898863256e-06, + "loss": 3.2272, + "step": 12080 + }, + { + "epoch": 1.0296599335208387, + "grad_norm": 45.061579920042576, + "learning_rate": 8.304077757349833e-06, + "loss": 3.9132, + "step": 12081 + }, + { + "epoch": 1.029745163214864, + "grad_norm": 27.79218769281735, + "learning_rate": 8.30370558334212e-06, + "loss": 1.2646, + "step": 12082 + }, + { + "epoch": 1.0298303929088894, + "grad_norm": 120.71790039925695, + "learning_rate": 8.303333376843772e-06, + "loss": 4.5193, + "step": 12083 + }, + { + "epoch": 1.029915622602915, + "grad_norm": 37.09426708758325, + "learning_rate": 8.302961137858454e-06, + "loss": 4.2033, + "step": 12084 + }, + { + "epoch": 1.0300008522969402, + "grad_norm": 31.494873757521887, + "learning_rate": 8.302588866389827e-06, + "loss": 3.1313, + "step": 12085 + }, + { + "epoch": 1.0300860819909656, + "grad_norm": 32.59436090048641, + "learning_rate": 8.30221656244155e-06, + "loss": 3.0113, + "step": 12086 + }, + { + "epoch": 1.030171311684991, + "grad_norm": 78.62304788833134, + "learning_rate": 8.301844226017284e-06, + "loss": 2.839, + "step": 12087 + }, + { + "epoch": 1.0302565413790163, + "grad_norm": 27.773154983534543, + "learning_rate": 8.301471857120694e-06, + "loss": 2.2133, + "step": 12088 + }, + { + "epoch": 1.0303417710730418, + "grad_norm": 28.792424671822783, + "learning_rate": 8.301099455755438e-06, + "loss": 2.6284, + "step": 12089 + }, + { + "epoch": 1.0304270007670673, + "grad_norm": 35.42224382680802, + "learning_rate": 8.300727021925184e-06, + "loss": 3.4008, + "step": 12090 + }, + { + "epoch": 1.0305122304610927, + "grad_norm": 55.1525642303998, + "learning_rate": 8.300354555633588e-06, + "loss": 3.2568, + "step": 12091 + }, + { + "epoch": 1.030597460155118, + "grad_norm": 36.38037769388634, + "learning_rate": 8.299982056884316e-06, + "loss": 3.0799, + "step": 12092 + }, + { + "epoch": 1.0306826898491435, + "grad_norm": 43.81914177197984, + "learning_rate": 8.299609525681035e-06, + "loss": 3.5719, + "step": 12093 + }, + { + "epoch": 1.030767919543169, + "grad_norm": 88.98025260154694, + "learning_rate": 8.299236962027404e-06, + "loss": 3.0516, + "step": 12094 + }, + { + "epoch": 1.0308531492371942, + "grad_norm": 30.755249604879996, + "learning_rate": 8.298864365927087e-06, + "loss": 2.03, + "step": 12095 + }, + { + "epoch": 1.0309383789312196, + "grad_norm": 68.7346594767212, + "learning_rate": 8.298491737383752e-06, + "loss": 3.7415, + "step": 12096 + }, + { + "epoch": 1.0310236086252451, + "grad_norm": 73.52357940409918, + "learning_rate": 8.29811907640106e-06, + "loss": 3.4497, + "step": 12097 + }, + { + "epoch": 1.0311088383192704, + "grad_norm": 87.47522335104755, + "learning_rate": 8.297746382982679e-06, + "loss": 2.5431, + "step": 12098 + }, + { + "epoch": 1.0311940680132958, + "grad_norm": 40.03515163702693, + "learning_rate": 8.297373657132271e-06, + "loss": 2.9981, + "step": 12099 + }, + { + "epoch": 1.0312792977073213, + "grad_norm": 39.035058723356066, + "learning_rate": 8.297000898853505e-06, + "loss": 3.2094, + "step": 12100 + }, + { + "epoch": 1.0313645274013465, + "grad_norm": 39.135417230639305, + "learning_rate": 8.296628108150042e-06, + "loss": 2.6842, + "step": 12101 + }, + { + "epoch": 1.031449757095372, + "grad_norm": 44.88806441526292, + "learning_rate": 8.296255285025554e-06, + "loss": 3.3504, + "step": 12102 + }, + { + "epoch": 1.0315349867893975, + "grad_norm": 67.06793639081697, + "learning_rate": 8.295882429483706e-06, + "loss": 4.1064, + "step": 12103 + }, + { + "epoch": 1.0316202164834227, + "grad_norm": 190.10754159372897, + "learning_rate": 8.295509541528162e-06, + "loss": 3.7139, + "step": 12104 + }, + { + "epoch": 1.0317054461774482, + "grad_norm": 77.97341453722024, + "learning_rate": 8.295136621162592e-06, + "loss": 4.7411, + "step": 12105 + }, + { + "epoch": 1.0317906758714737, + "grad_norm": 40.58776366963585, + "learning_rate": 8.294763668390662e-06, + "loss": 2.6259, + "step": 12106 + }, + { + "epoch": 1.031875905565499, + "grad_norm": 49.35868285244354, + "learning_rate": 8.294390683216042e-06, + "loss": 3.3312, + "step": 12107 + }, + { + "epoch": 1.0319611352595244, + "grad_norm": 30.082251947318923, + "learning_rate": 8.294017665642397e-06, + "loss": 1.8245, + "step": 12108 + }, + { + "epoch": 1.0320463649535498, + "grad_norm": 64.03879221240777, + "learning_rate": 8.293644615673399e-06, + "loss": 3.0372, + "step": 12109 + }, + { + "epoch": 1.0321315946475753, + "grad_norm": 53.06460515142394, + "learning_rate": 8.293271533312714e-06, + "loss": 3.0575, + "step": 12110 + }, + { + "epoch": 1.0322168243416006, + "grad_norm": 60.23121291687428, + "learning_rate": 8.292898418564012e-06, + "loss": 3.3947, + "step": 12111 + }, + { + "epoch": 1.032302054035626, + "grad_norm": 33.59365812796446, + "learning_rate": 8.292525271430962e-06, + "loss": 2.7568, + "step": 12112 + }, + { + "epoch": 1.0323872837296515, + "grad_norm": 40.591803557920244, + "learning_rate": 8.292152091917236e-06, + "loss": 3.1683, + "step": 12113 + }, + { + "epoch": 1.0324725134236767, + "grad_norm": 105.27973692048667, + "learning_rate": 8.291778880026499e-06, + "loss": 3.8201, + "step": 12114 + }, + { + "epoch": 1.0325577431177022, + "grad_norm": 43.40825568842732, + "learning_rate": 8.291405635762429e-06, + "loss": 3.3717, + "step": 12115 + }, + { + "epoch": 1.0326429728117277, + "grad_norm": 52.2936166540494, + "learning_rate": 8.29103235912869e-06, + "loss": 2.056, + "step": 12116 + }, + { + "epoch": 1.032728202505753, + "grad_norm": 29.9203110572437, + "learning_rate": 8.290659050128954e-06, + "loss": 2.617, + "step": 12117 + }, + { + "epoch": 1.0328134321997784, + "grad_norm": 78.22616621864691, + "learning_rate": 8.290285708766897e-06, + "loss": 2.9991, + "step": 12118 + }, + { + "epoch": 1.0328986618938039, + "grad_norm": 68.88434245755035, + "learning_rate": 8.289912335046186e-06, + "loss": 2.8536, + "step": 12119 + }, + { + "epoch": 1.032983891587829, + "grad_norm": 43.7060848287245, + "learning_rate": 8.289538928970495e-06, + "loss": 2.9166, + "step": 12120 + }, + { + "epoch": 1.0330691212818546, + "grad_norm": 87.94924141162493, + "learning_rate": 8.289165490543496e-06, + "loss": 3.2493, + "step": 12121 + }, + { + "epoch": 1.03315435097588, + "grad_norm": 61.10792227772566, + "learning_rate": 8.28879201976886e-06, + "loss": 3.3436, + "step": 12122 + }, + { + "epoch": 1.0332395806699053, + "grad_norm": 29.71701717182553, + "learning_rate": 8.288418516650263e-06, + "loss": 2.763, + "step": 12123 + }, + { + "epoch": 1.0333248103639308, + "grad_norm": 39.12608308785298, + "learning_rate": 8.288044981191375e-06, + "loss": 2.6939, + "step": 12124 + }, + { + "epoch": 1.0334100400579562, + "grad_norm": 77.16004697607302, + "learning_rate": 8.287671413395873e-06, + "loss": 3.427, + "step": 12125 + }, + { + "epoch": 1.0334952697519817, + "grad_norm": 100.47385187764769, + "learning_rate": 8.287297813267427e-06, + "loss": 3.0782, + "step": 12126 + }, + { + "epoch": 1.033580499446007, + "grad_norm": 60.06890982820693, + "learning_rate": 8.286924180809714e-06, + "loss": 2.6003, + "step": 12127 + }, + { + "epoch": 1.0336657291400324, + "grad_norm": 31.662871313036728, + "learning_rate": 8.286550516026408e-06, + "loss": 2.2428, + "step": 12128 + }, + { + "epoch": 1.0337509588340579, + "grad_norm": 30.894421028127805, + "learning_rate": 8.286176818921184e-06, + "loss": 2.3125, + "step": 12129 + }, + { + "epoch": 1.0338361885280831, + "grad_norm": 87.7637517775818, + "learning_rate": 8.285803089497717e-06, + "loss": 3.4624, + "step": 12130 + }, + { + "epoch": 1.0339214182221086, + "grad_norm": 57.14174964017084, + "learning_rate": 8.285429327759681e-06, + "loss": 3.2439, + "step": 12131 + }, + { + "epoch": 1.034006647916134, + "grad_norm": 61.94836739564466, + "learning_rate": 8.285055533710752e-06, + "loss": 2.3238, + "step": 12132 + }, + { + "epoch": 1.0340918776101593, + "grad_norm": 92.94299415317427, + "learning_rate": 8.284681707354612e-06, + "loss": 3.2177, + "step": 12133 + }, + { + "epoch": 1.0341771073041848, + "grad_norm": 29.584212923569336, + "learning_rate": 8.284307848694928e-06, + "loss": 2.7611, + "step": 12134 + }, + { + "epoch": 1.0342623369982102, + "grad_norm": 77.94189645442697, + "learning_rate": 8.283933957735384e-06, + "loss": 2.3605, + "step": 12135 + }, + { + "epoch": 1.0343475666922355, + "grad_norm": 88.21031937811698, + "learning_rate": 8.283560034479652e-06, + "loss": 3.715, + "step": 12136 + }, + { + "epoch": 1.034432796386261, + "grad_norm": 42.58983143150038, + "learning_rate": 8.283186078931413e-06, + "loss": 1.7881, + "step": 12137 + }, + { + "epoch": 1.0345180260802864, + "grad_norm": 121.62253567601516, + "learning_rate": 8.282812091094344e-06, + "loss": 3.4997, + "step": 12138 + }, + { + "epoch": 1.0346032557743117, + "grad_norm": 23.808774685878802, + "learning_rate": 8.282438070972123e-06, + "loss": 2.114, + "step": 12139 + }, + { + "epoch": 1.0346884854683371, + "grad_norm": 66.73950494682572, + "learning_rate": 8.282064018568426e-06, + "loss": 3.6591, + "step": 12140 + }, + { + "epoch": 1.0347737151623626, + "grad_norm": 41.01311106113405, + "learning_rate": 8.281689933886935e-06, + "loss": 2.2891, + "step": 12141 + }, + { + "epoch": 1.034858944856388, + "grad_norm": 43.46488286278063, + "learning_rate": 8.281315816931325e-06, + "loss": 2.504, + "step": 12142 + }, + { + "epoch": 1.0349441745504133, + "grad_norm": 36.326314562066216, + "learning_rate": 8.280941667705282e-06, + "loss": 2.3952, + "step": 12143 + }, + { + "epoch": 1.0350294042444388, + "grad_norm": 54.19403547279413, + "learning_rate": 8.280567486212478e-06, + "loss": 2.4816, + "step": 12144 + }, + { + "epoch": 1.0351146339384643, + "grad_norm": 77.48487619481895, + "learning_rate": 8.280193272456597e-06, + "loss": 2.7541, + "step": 12145 + }, + { + "epoch": 1.0351998636324895, + "grad_norm": 32.37793459945194, + "learning_rate": 8.27981902644132e-06, + "loss": 3.0471, + "step": 12146 + }, + { + "epoch": 1.035285093326515, + "grad_norm": 55.693866758668065, + "learning_rate": 8.279444748170327e-06, + "loss": 2.3478, + "step": 12147 + }, + { + "epoch": 1.0353703230205404, + "grad_norm": 82.2773618782188, + "learning_rate": 8.279070437647297e-06, + "loss": 2.9358, + "step": 12148 + }, + { + "epoch": 1.0354555527145657, + "grad_norm": 43.96349659864246, + "learning_rate": 8.278696094875911e-06, + "loss": 3.3119, + "step": 12149 + }, + { + "epoch": 1.0355407824085912, + "grad_norm": 47.70730883772513, + "learning_rate": 8.278321719859852e-06, + "loss": 3.7531, + "step": 12150 + }, + { + "epoch": 1.0356260121026166, + "grad_norm": 34.25884159093591, + "learning_rate": 8.277947312602803e-06, + "loss": 2.6121, + "step": 12151 + }, + { + "epoch": 1.0357112417966419, + "grad_norm": 37.19895680749179, + "learning_rate": 8.277572873108443e-06, + "loss": 3.2859, + "step": 12152 + }, + { + "epoch": 1.0357964714906673, + "grad_norm": 55.75896796778944, + "learning_rate": 8.277198401380456e-06, + "loss": 2.7541, + "step": 12153 + }, + { + "epoch": 1.0358817011846928, + "grad_norm": 80.72770884976103, + "learning_rate": 8.276823897422525e-06, + "loss": 1.7742, + "step": 12154 + }, + { + "epoch": 1.035966930878718, + "grad_norm": 58.03302862556559, + "learning_rate": 8.276449361238334e-06, + "loss": 3.3615, + "step": 12155 + }, + { + "epoch": 1.0360521605727435, + "grad_norm": 81.74542488318203, + "learning_rate": 8.276074792831566e-06, + "loss": 2.5188, + "step": 12156 + }, + { + "epoch": 1.036137390266769, + "grad_norm": 25.525056137924903, + "learning_rate": 8.275700192205904e-06, + "loss": 2.302, + "step": 12157 + }, + { + "epoch": 1.0362226199607942, + "grad_norm": 37.52416601408021, + "learning_rate": 8.275325559365033e-06, + "loss": 2.8707, + "step": 12158 + }, + { + "epoch": 1.0363078496548197, + "grad_norm": 76.73541305252233, + "learning_rate": 8.274950894312633e-06, + "loss": 3.2727, + "step": 12159 + }, + { + "epoch": 1.0363930793488452, + "grad_norm": 58.917608211806865, + "learning_rate": 8.274576197052396e-06, + "loss": 4.6359, + "step": 12160 + }, + { + "epoch": 1.0364783090428706, + "grad_norm": 81.69357179256977, + "learning_rate": 8.274201467588002e-06, + "loss": 3.5538, + "step": 12161 + }, + { + "epoch": 1.0365635387368959, + "grad_norm": 33.925780063585954, + "learning_rate": 8.273826705923136e-06, + "loss": 2.2728, + "step": 12162 + }, + { + "epoch": 1.0366487684309214, + "grad_norm": 61.274699932158235, + "learning_rate": 8.273451912061488e-06, + "loss": 2.1879, + "step": 12163 + }, + { + "epoch": 1.0367339981249468, + "grad_norm": 71.39845424071167, + "learning_rate": 8.27307708600674e-06, + "loss": 4.1202, + "step": 12164 + }, + { + "epoch": 1.036819227818972, + "grad_norm": 67.73588533598227, + "learning_rate": 8.272702227762579e-06, + "loss": 4.4866, + "step": 12165 + }, + { + "epoch": 1.0369044575129975, + "grad_norm": 45.87350606294019, + "learning_rate": 8.272327337332693e-06, + "loss": 2.7187, + "step": 12166 + }, + { + "epoch": 1.036989687207023, + "grad_norm": 53.88858362662162, + "learning_rate": 8.271952414720767e-06, + "loss": 3.618, + "step": 12167 + }, + { + "epoch": 1.0370749169010482, + "grad_norm": 48.065069543797975, + "learning_rate": 8.271577459930489e-06, + "loss": 3.2468, + "step": 12168 + }, + { + "epoch": 1.0371601465950737, + "grad_norm": 67.38401939881358, + "learning_rate": 8.271202472965547e-06, + "loss": 2.4885, + "step": 12169 + }, + { + "epoch": 1.0372453762890992, + "grad_norm": 50.11090355867709, + "learning_rate": 8.270827453829629e-06, + "loss": 3.7502, + "step": 12170 + }, + { + "epoch": 1.0373306059831244, + "grad_norm": 51.16715735784727, + "learning_rate": 8.270452402526423e-06, + "loss": 3.1488, + "step": 12171 + }, + { + "epoch": 1.03741583567715, + "grad_norm": 73.12230021324552, + "learning_rate": 8.270077319059616e-06, + "loss": 3.289, + "step": 12172 + }, + { + "epoch": 1.0375010653711754, + "grad_norm": 36.15556049605914, + "learning_rate": 8.2697022034329e-06, + "loss": 2.4248, + "step": 12173 + }, + { + "epoch": 1.0375862950652006, + "grad_norm": 65.45137814529086, + "learning_rate": 8.269327055649959e-06, + "loss": 4.5566, + "step": 12174 + }, + { + "epoch": 1.037671524759226, + "grad_norm": 92.9225494995293, + "learning_rate": 8.268951875714488e-06, + "loss": 2.3743, + "step": 12175 + }, + { + "epoch": 1.0377567544532516, + "grad_norm": 32.366089845208904, + "learning_rate": 8.268576663630174e-06, + "loss": 2.7812, + "step": 12176 + }, + { + "epoch": 1.0378419841472768, + "grad_norm": 34.41125762482978, + "learning_rate": 8.268201419400706e-06, + "loss": 3.3563, + "step": 12177 + }, + { + "epoch": 1.0379272138413023, + "grad_norm": 81.53889851125065, + "learning_rate": 8.267826143029777e-06, + "loss": 2.4966, + "step": 12178 + }, + { + "epoch": 1.0380124435353277, + "grad_norm": 34.848047846484754, + "learning_rate": 8.267450834521075e-06, + "loss": 2.4022, + "step": 12179 + }, + { + "epoch": 1.0380976732293532, + "grad_norm": 36.93902057765923, + "learning_rate": 8.267075493878293e-06, + "loss": 2.6924, + "step": 12180 + }, + { + "epoch": 1.0381829029233784, + "grad_norm": 47.31450533921139, + "learning_rate": 8.266700121105122e-06, + "loss": 3.1651, + "step": 12181 + }, + { + "epoch": 1.038268132617404, + "grad_norm": 76.49894756726589, + "learning_rate": 8.266324716205254e-06, + "loss": 2.3218, + "step": 12182 + }, + { + "epoch": 1.0383533623114294, + "grad_norm": 42.3955159594013, + "learning_rate": 8.265949279182379e-06, + "loss": 2.885, + "step": 12183 + }, + { + "epoch": 1.0384385920054546, + "grad_norm": 41.60446555562181, + "learning_rate": 8.265573810040191e-06, + "loss": 2.221, + "step": 12184 + }, + { + "epoch": 1.03852382169948, + "grad_norm": 85.19921580042002, + "learning_rate": 8.265198308782383e-06, + "loss": 4.2757, + "step": 12185 + }, + { + "epoch": 1.0386090513935056, + "grad_norm": 56.8101868530747, + "learning_rate": 8.264822775412647e-06, + "loss": 3.674, + "step": 12186 + }, + { + "epoch": 1.0386942810875308, + "grad_norm": 48.10083929706645, + "learning_rate": 8.264447209934673e-06, + "loss": 3.0798, + "step": 12187 + }, + { + "epoch": 1.0387795107815563, + "grad_norm": 62.95892417066675, + "learning_rate": 8.264071612352163e-06, + "loss": 2.8069, + "step": 12188 + }, + { + "epoch": 1.0388647404755817, + "grad_norm": 47.10048390720347, + "learning_rate": 8.263695982668802e-06, + "loss": 2.9342, + "step": 12189 + }, + { + "epoch": 1.038949970169607, + "grad_norm": 29.201501598633556, + "learning_rate": 8.26332032088829e-06, + "loss": 1.8246, + "step": 12190 + }, + { + "epoch": 1.0390351998636325, + "grad_norm": 93.8539832065586, + "learning_rate": 8.26294462701432e-06, + "loss": 3.6537, + "step": 12191 + }, + { + "epoch": 1.039120429557658, + "grad_norm": 56.469607670273426, + "learning_rate": 8.262568901050583e-06, + "loss": 3.6683, + "step": 12192 + }, + { + "epoch": 1.0392056592516834, + "grad_norm": 84.33708709094385, + "learning_rate": 8.26219314300078e-06, + "loss": 2.4693, + "step": 12193 + }, + { + "epoch": 1.0392908889457086, + "grad_norm": 89.15958401350191, + "learning_rate": 8.261817352868602e-06, + "loss": 3.6222, + "step": 12194 + }, + { + "epoch": 1.0393761186397341, + "grad_norm": 26.949915726290524, + "learning_rate": 8.261441530657748e-06, + "loss": 2.4346, + "step": 12195 + }, + { + "epoch": 1.0394613483337596, + "grad_norm": 55.234231298161504, + "learning_rate": 8.261065676371911e-06, + "loss": 2.1384, + "step": 12196 + }, + { + "epoch": 1.0395465780277848, + "grad_norm": 52.197223049944306, + "learning_rate": 8.260689790014791e-06, + "loss": 2.2414, + "step": 12197 + }, + { + "epoch": 1.0396318077218103, + "grad_norm": 79.58189639985815, + "learning_rate": 8.260313871590079e-06, + "loss": 3.2919, + "step": 12198 + }, + { + "epoch": 1.0397170374158358, + "grad_norm": 59.1878848057259, + "learning_rate": 8.259937921101479e-06, + "loss": 2.6712, + "step": 12199 + }, + { + "epoch": 1.039802267109861, + "grad_norm": 84.66589201890126, + "learning_rate": 8.259561938552685e-06, + "loss": 4.007, + "step": 12200 + }, + { + "epoch": 1.0398874968038865, + "grad_norm": 42.79954608006502, + "learning_rate": 8.259185923947392e-06, + "loss": 3.0068, + "step": 12201 + }, + { + "epoch": 1.039972726497912, + "grad_norm": 63.07539392542155, + "learning_rate": 8.258809877289302e-06, + "loss": 2.5227, + "step": 12202 + }, + { + "epoch": 1.0400579561919372, + "grad_norm": 68.0568464197457, + "learning_rate": 8.258433798582113e-06, + "loss": 3.0783, + "step": 12203 + }, + { + "epoch": 1.0401431858859627, + "grad_norm": 59.819798830272994, + "learning_rate": 8.258057687829519e-06, + "loss": 3.5568, + "step": 12204 + }, + { + "epoch": 1.0402284155799881, + "grad_norm": 71.2876645152556, + "learning_rate": 8.257681545035226e-06, + "loss": 2.6985, + "step": 12205 + }, + { + "epoch": 1.0403136452740134, + "grad_norm": 42.47665143993179, + "learning_rate": 8.257305370202927e-06, + "loss": 1.7838, + "step": 12206 + }, + { + "epoch": 1.0403988749680388, + "grad_norm": 65.64178855071286, + "learning_rate": 8.256929163336324e-06, + "loss": 3.7487, + "step": 12207 + }, + { + "epoch": 1.0404841046620643, + "grad_norm": 39.362826869601115, + "learning_rate": 8.256552924439118e-06, + "loss": 2.8482, + "step": 12208 + }, + { + "epoch": 1.0405693343560896, + "grad_norm": 62.02722870124476, + "learning_rate": 8.256176653515008e-06, + "loss": 1.8265, + "step": 12209 + }, + { + "epoch": 1.040654564050115, + "grad_norm": 44.15667191311648, + "learning_rate": 8.255800350567694e-06, + "loss": 2.4389, + "step": 12210 + }, + { + "epoch": 1.0407397937441405, + "grad_norm": 53.17125070889923, + "learning_rate": 8.255424015600878e-06, + "loss": 3.2054, + "step": 12211 + }, + { + "epoch": 1.040825023438166, + "grad_norm": 128.50727960384066, + "learning_rate": 8.25504764861826e-06, + "loss": 2.5336, + "step": 12212 + }, + { + "epoch": 1.0409102531321912, + "grad_norm": 56.939195747416626, + "learning_rate": 8.25467124962354e-06, + "loss": 2.3594, + "step": 12213 + }, + { + "epoch": 1.0409954828262167, + "grad_norm": 107.59664029900621, + "learning_rate": 8.254294818620423e-06, + "loss": 3.3482, + "step": 12214 + }, + { + "epoch": 1.0410807125202421, + "grad_norm": 61.6923095212438, + "learning_rate": 8.25391835561261e-06, + "loss": 3.9536, + "step": 12215 + }, + { + "epoch": 1.0411659422142674, + "grad_norm": 36.28043608093918, + "learning_rate": 8.253541860603803e-06, + "loss": 3.0384, + "step": 12216 + }, + { + "epoch": 1.0412511719082929, + "grad_norm": 76.01077056712498, + "learning_rate": 8.253165333597704e-06, + "loss": 5.0611, + "step": 12217 + }, + { + "epoch": 1.0413364016023183, + "grad_norm": 44.08218308210829, + "learning_rate": 8.252788774598017e-06, + "loss": 2.7401, + "step": 12218 + }, + { + "epoch": 1.0414216312963436, + "grad_norm": 57.1330587227507, + "learning_rate": 8.252412183608446e-06, + "loss": 3.0422, + "step": 12219 + }, + { + "epoch": 1.041506860990369, + "grad_norm": 43.57551784511266, + "learning_rate": 8.252035560632692e-06, + "loss": 3.4864, + "step": 12220 + }, + { + "epoch": 1.0415920906843945, + "grad_norm": 46.25754087201935, + "learning_rate": 8.25165890567446e-06, + "loss": 3.2001, + "step": 12221 + }, + { + "epoch": 1.0416773203784198, + "grad_norm": 34.52635488051408, + "learning_rate": 8.251282218737455e-06, + "loss": 2.3554, + "step": 12222 + }, + { + "epoch": 1.0417625500724452, + "grad_norm": 64.13689497626636, + "learning_rate": 8.250905499825383e-06, + "loss": 2.2294, + "step": 12223 + }, + { + "epoch": 1.0418477797664707, + "grad_norm": 40.75218507382318, + "learning_rate": 8.250528748941946e-06, + "loss": 2.8847, + "step": 12224 + }, + { + "epoch": 1.041933009460496, + "grad_norm": 34.36401564491208, + "learning_rate": 8.250151966090852e-06, + "loss": 2.0774, + "step": 12225 + }, + { + "epoch": 1.0420182391545214, + "grad_norm": 71.74353738122508, + "learning_rate": 8.2497751512758e-06, + "loss": 3.5669, + "step": 12226 + }, + { + "epoch": 1.0421034688485469, + "grad_norm": 41.28436707934247, + "learning_rate": 8.249398304500507e-06, + "loss": 2.8946, + "step": 12227 + }, + { + "epoch": 1.0421886985425721, + "grad_norm": 50.55171832039368, + "learning_rate": 8.249021425768669e-06, + "loss": 4.2093, + "step": 12228 + }, + { + "epoch": 1.0422739282365976, + "grad_norm": 77.95424481074107, + "learning_rate": 8.248644515083996e-06, + "loss": 3.559, + "step": 12229 + }, + { + "epoch": 1.042359157930623, + "grad_norm": 66.25532614906314, + "learning_rate": 8.248267572450197e-06, + "loss": 1.9529, + "step": 12230 + }, + { + "epoch": 1.0424443876246485, + "grad_norm": 75.90926426388411, + "learning_rate": 8.247890597870978e-06, + "loss": 3.2928, + "step": 12231 + }, + { + "epoch": 1.0425296173186738, + "grad_norm": 21.154987509216976, + "learning_rate": 8.247513591350042e-06, + "loss": 2.2189, + "step": 12232 + }, + { + "epoch": 1.0426148470126992, + "grad_norm": 66.25663418365933, + "learning_rate": 8.247136552891105e-06, + "loss": 3.8417, + "step": 12233 + }, + { + "epoch": 1.0427000767067247, + "grad_norm": 57.89051267894836, + "learning_rate": 8.246759482497866e-06, + "loss": 3.2935, + "step": 12234 + }, + { + "epoch": 1.04278530640075, + "grad_norm": 39.36399930861447, + "learning_rate": 8.24638238017404e-06, + "loss": 2.64, + "step": 12235 + }, + { + "epoch": 1.0428705360947754, + "grad_norm": 50.29316073088064, + "learning_rate": 8.246005245923332e-06, + "loss": 2.2568, + "step": 12236 + }, + { + "epoch": 1.042955765788801, + "grad_norm": 37.41662405524002, + "learning_rate": 8.245628079749452e-06, + "loss": 2.8199, + "step": 12237 + }, + { + "epoch": 1.0430409954828261, + "grad_norm": 56.284375667019816, + "learning_rate": 8.24525088165611e-06, + "loss": 3.5607, + "step": 12238 + }, + { + "epoch": 1.0431262251768516, + "grad_norm": 91.19102304138461, + "learning_rate": 8.244873651647016e-06, + "loss": 3.4295, + "step": 12239 + }, + { + "epoch": 1.043211454870877, + "grad_norm": 26.591729288614243, + "learning_rate": 8.24449638972588e-06, + "loss": 1.7495, + "step": 12240 + }, + { + "epoch": 1.0432966845649023, + "grad_norm": 33.41080344596049, + "learning_rate": 8.244119095896408e-06, + "loss": 1.5533, + "step": 12241 + }, + { + "epoch": 1.0433819142589278, + "grad_norm": 46.11090464701139, + "learning_rate": 8.243741770162316e-06, + "loss": 3.1475, + "step": 12242 + }, + { + "epoch": 1.0434671439529533, + "grad_norm": 50.059016415911394, + "learning_rate": 8.243364412527313e-06, + "loss": 3.7901, + "step": 12243 + }, + { + "epoch": 1.0435523736469785, + "grad_norm": 41.84130769100806, + "learning_rate": 8.242987022995108e-06, + "loss": 2.7553, + "step": 12244 + }, + { + "epoch": 1.043637603341004, + "grad_norm": 55.49240445380228, + "learning_rate": 8.242609601569415e-06, + "loss": 3.6451, + "step": 12245 + }, + { + "epoch": 1.0437228330350294, + "grad_norm": 29.136966866951916, + "learning_rate": 8.242232148253945e-06, + "loss": 2.6636, + "step": 12246 + }, + { + "epoch": 1.043808062729055, + "grad_norm": 42.43854734039109, + "learning_rate": 8.241854663052411e-06, + "loss": 2.8327, + "step": 12247 + }, + { + "epoch": 1.0438932924230802, + "grad_norm": 42.49068691728419, + "learning_rate": 8.241477145968523e-06, + "loss": 2.734, + "step": 12248 + }, + { + "epoch": 1.0439785221171056, + "grad_norm": 60.75529534041711, + "learning_rate": 8.241099597005998e-06, + "loss": 2.5079, + "step": 12249 + }, + { + "epoch": 1.044063751811131, + "grad_norm": 47.196076762615796, + "learning_rate": 8.240722016168543e-06, + "loss": 4.1936, + "step": 12250 + }, + { + "epoch": 1.0441489815051563, + "grad_norm": 56.25410746257083, + "learning_rate": 8.240344403459877e-06, + "loss": 2.971, + "step": 12251 + }, + { + "epoch": 1.0442342111991818, + "grad_norm": 30.09076608274687, + "learning_rate": 8.23996675888371e-06, + "loss": 2.6801, + "step": 12252 + }, + { + "epoch": 1.0443194408932073, + "grad_norm": 95.98762283861363, + "learning_rate": 8.239589082443758e-06, + "loss": 5.349, + "step": 12253 + }, + { + "epoch": 1.0444046705872325, + "grad_norm": 32.18404060843681, + "learning_rate": 8.239211374143734e-06, + "loss": 3.1669, + "step": 12254 + }, + { + "epoch": 1.044489900281258, + "grad_norm": 152.32262993710512, + "learning_rate": 8.238833633987356e-06, + "loss": 3.6525, + "step": 12255 + }, + { + "epoch": 1.0445751299752835, + "grad_norm": 92.74792968988777, + "learning_rate": 8.238455861978333e-06, + "loss": 3.5649, + "step": 12256 + }, + { + "epoch": 1.0446603596693087, + "grad_norm": 134.82037193571963, + "learning_rate": 8.238078058120385e-06, + "loss": 4.551, + "step": 12257 + }, + { + "epoch": 1.0447455893633342, + "grad_norm": 32.138483904867314, + "learning_rate": 8.237700222417226e-06, + "loss": 2.6368, + "step": 12258 + }, + { + "epoch": 1.0448308190573596, + "grad_norm": 61.19681993444605, + "learning_rate": 8.23732235487257e-06, + "loss": 2.1193, + "step": 12259 + }, + { + "epoch": 1.0449160487513849, + "grad_norm": 87.05777922427457, + "learning_rate": 8.236944455490137e-06, + "loss": 3.1102, + "step": 12260 + }, + { + "epoch": 1.0450012784454104, + "grad_norm": 40.818805274316865, + "learning_rate": 8.236566524273642e-06, + "loss": 2.8756, + "step": 12261 + }, + { + "epoch": 1.0450865081394358, + "grad_norm": 34.21877573245202, + "learning_rate": 8.236188561226799e-06, + "loss": 2.9869, + "step": 12262 + }, + { + "epoch": 1.0451717378334613, + "grad_norm": 25.66132943569737, + "learning_rate": 8.235810566353328e-06, + "loss": 1.8501, + "step": 12263 + }, + { + "epoch": 1.0452569675274865, + "grad_norm": 40.559453900186696, + "learning_rate": 8.235432539656947e-06, + "loss": 3.1663, + "step": 12264 + }, + { + "epoch": 1.045342197221512, + "grad_norm": 85.44794890112217, + "learning_rate": 8.235054481141373e-06, + "loss": 2.9423, + "step": 12265 + }, + { + "epoch": 1.0454274269155375, + "grad_norm": 63.80005547747782, + "learning_rate": 8.234676390810323e-06, + "loss": 2.3252, + "step": 12266 + }, + { + "epoch": 1.0455126566095627, + "grad_norm": 46.24996038362892, + "learning_rate": 8.234298268667515e-06, + "loss": 2.8285, + "step": 12267 + }, + { + "epoch": 1.0455978863035882, + "grad_norm": 28.517324276925486, + "learning_rate": 8.233920114716671e-06, + "loss": 2.2878, + "step": 12268 + }, + { + "epoch": 1.0456831159976137, + "grad_norm": 48.749322439215355, + "learning_rate": 8.233541928961507e-06, + "loss": 3.6451, + "step": 12269 + }, + { + "epoch": 1.045768345691639, + "grad_norm": 113.37969553474639, + "learning_rate": 8.23316371140574e-06, + "loss": 3.3333, + "step": 12270 + }, + { + "epoch": 1.0458535753856644, + "grad_norm": 19.908823314737624, + "learning_rate": 8.232785462053096e-06, + "loss": 1.5818, + "step": 12271 + }, + { + "epoch": 1.0459388050796898, + "grad_norm": 34.508918906722336, + "learning_rate": 8.232407180907291e-06, + "loss": 2.1542, + "step": 12272 + }, + { + "epoch": 1.046024034773715, + "grad_norm": 50.312221253562335, + "learning_rate": 8.232028867972044e-06, + "loss": 3.5855, + "step": 12273 + }, + { + "epoch": 1.0461092644677406, + "grad_norm": 82.41558259635927, + "learning_rate": 8.23165052325108e-06, + "loss": 3.1587, + "step": 12274 + }, + { + "epoch": 1.046194494161766, + "grad_norm": 69.45565085299192, + "learning_rate": 8.231272146748115e-06, + "loss": 2.386, + "step": 12275 + }, + { + "epoch": 1.0462797238557913, + "grad_norm": 57.218059977626716, + "learning_rate": 8.230893738466875e-06, + "loss": 3.1216, + "step": 12276 + }, + { + "epoch": 1.0463649535498167, + "grad_norm": 57.61875992427941, + "learning_rate": 8.230515298411074e-06, + "loss": 2.8469, + "step": 12277 + }, + { + "epoch": 1.0464501832438422, + "grad_norm": 122.87785556943642, + "learning_rate": 8.230136826584443e-06, + "loss": 3.5911, + "step": 12278 + }, + { + "epoch": 1.0465354129378674, + "grad_norm": 34.84217616321286, + "learning_rate": 8.229758322990699e-06, + "loss": 1.448, + "step": 12279 + }, + { + "epoch": 1.046620642631893, + "grad_norm": 26.62270169807395, + "learning_rate": 8.229379787633566e-06, + "loss": 1.8153, + "step": 12280 + }, + { + "epoch": 1.0467058723259184, + "grad_norm": 63.3407279734554, + "learning_rate": 8.229001220516766e-06, + "loss": 3.5834, + "step": 12281 + }, + { + "epoch": 1.0467911020199439, + "grad_norm": 67.3095415197277, + "learning_rate": 8.22862262164402e-06, + "loss": 4.8274, + "step": 12282 + }, + { + "epoch": 1.046876331713969, + "grad_norm": 63.83371467395872, + "learning_rate": 8.228243991019056e-06, + "loss": 2.1085, + "step": 12283 + }, + { + "epoch": 1.0469615614079946, + "grad_norm": 98.28802826577297, + "learning_rate": 8.227865328645592e-06, + "loss": 2.6547, + "step": 12284 + }, + { + "epoch": 1.04704679110202, + "grad_norm": 149.75205153212815, + "learning_rate": 8.227486634527357e-06, + "loss": 2.5802, + "step": 12285 + }, + { + "epoch": 1.0471320207960453, + "grad_norm": 36.18047556799608, + "learning_rate": 8.227107908668074e-06, + "loss": 2.9452, + "step": 12286 + }, + { + "epoch": 1.0472172504900708, + "grad_norm": 86.59491582210677, + "learning_rate": 8.226729151071467e-06, + "loss": 4.0716, + "step": 12287 + }, + { + "epoch": 1.0473024801840962, + "grad_norm": 112.7034153732447, + "learning_rate": 8.22635036174126e-06, + "loss": 4.6594, + "step": 12288 + }, + { + "epoch": 1.0473877098781215, + "grad_norm": 40.7283958063168, + "learning_rate": 8.225971540681179e-06, + "loss": 3.6203, + "step": 12289 + }, + { + "epoch": 1.047472939572147, + "grad_norm": 62.86722657628479, + "learning_rate": 8.22559268789495e-06, + "loss": 3.0649, + "step": 12290 + }, + { + "epoch": 1.0475581692661724, + "grad_norm": 73.68362205749256, + "learning_rate": 8.2252138033863e-06, + "loss": 2.6607, + "step": 12291 + }, + { + "epoch": 1.0476433989601976, + "grad_norm": 93.48391897544029, + "learning_rate": 8.224834887158952e-06, + "loss": 3.6458, + "step": 12292 + }, + { + "epoch": 1.0477286286542231, + "grad_norm": 36.44043683398488, + "learning_rate": 8.224455939216636e-06, + "loss": 3.8426, + "step": 12293 + }, + { + "epoch": 1.0478138583482486, + "grad_norm": 72.58796933379257, + "learning_rate": 8.224076959563076e-06, + "loss": 3.0678, + "step": 12294 + }, + { + "epoch": 1.0478990880422738, + "grad_norm": 30.808776085542352, + "learning_rate": 8.223697948201999e-06, + "loss": 2.4936, + "step": 12295 + }, + { + "epoch": 1.0479843177362993, + "grad_norm": 44.810009130294105, + "learning_rate": 8.223318905137136e-06, + "loss": 2.1605, + "step": 12296 + }, + { + "epoch": 1.0480695474303248, + "grad_norm": 38.06663145998961, + "learning_rate": 8.222939830372211e-06, + "loss": 3.1347, + "step": 12297 + }, + { + "epoch": 1.04815477712435, + "grad_norm": 58.42299043489317, + "learning_rate": 8.222560723910956e-06, + "loss": 2.5839, + "step": 12298 + }, + { + "epoch": 1.0482400068183755, + "grad_norm": 32.04729636925353, + "learning_rate": 8.222181585757094e-06, + "loss": 3.3115, + "step": 12299 + }, + { + "epoch": 1.048325236512401, + "grad_norm": 79.43327312492451, + "learning_rate": 8.221802415914357e-06, + "loss": 3.8673, + "step": 12300 + }, + { + "epoch": 1.0484104662064264, + "grad_norm": 31.336269263340686, + "learning_rate": 8.221423214386472e-06, + "loss": 2.4258, + "step": 12301 + }, + { + "epoch": 1.0484956959004517, + "grad_norm": 33.924026341873095, + "learning_rate": 8.221043981177173e-06, + "loss": 2.5612, + "step": 12302 + }, + { + "epoch": 1.0485809255944771, + "grad_norm": 77.57184731528449, + "learning_rate": 8.220664716290184e-06, + "loss": 2.9833, + "step": 12303 + }, + { + "epoch": 1.0486661552885026, + "grad_norm": 80.31392866587848, + "learning_rate": 8.220285419729237e-06, + "loss": 3.3638, + "step": 12304 + }, + { + "epoch": 1.0487513849825278, + "grad_norm": 82.77402561695872, + "learning_rate": 8.219906091498065e-06, + "loss": 3.8487, + "step": 12305 + }, + { + "epoch": 1.0488366146765533, + "grad_norm": 82.28359355011318, + "learning_rate": 8.219526731600394e-06, + "loss": 3.5355, + "step": 12306 + }, + { + "epoch": 1.0489218443705788, + "grad_norm": 35.96827688863511, + "learning_rate": 8.219147340039956e-06, + "loss": 3.1122, + "step": 12307 + }, + { + "epoch": 1.049007074064604, + "grad_norm": 56.169975773857615, + "learning_rate": 8.218767916820486e-06, + "loss": 2.59, + "step": 12308 + }, + { + "epoch": 1.0490923037586295, + "grad_norm": 71.29272835805055, + "learning_rate": 8.218388461945711e-06, + "loss": 2.2017, + "step": 12309 + }, + { + "epoch": 1.049177533452655, + "grad_norm": 41.786147794640144, + "learning_rate": 8.218008975419364e-06, + "loss": 1.2567, + "step": 12310 + }, + { + "epoch": 1.0492627631466802, + "grad_norm": 35.207445398695164, + "learning_rate": 8.217629457245178e-06, + "loss": 2.0586, + "step": 12311 + }, + { + "epoch": 1.0493479928407057, + "grad_norm": 81.36119186117621, + "learning_rate": 8.217249907426885e-06, + "loss": 3.3974, + "step": 12312 + }, + { + "epoch": 1.0494332225347311, + "grad_norm": 22.349350624341152, + "learning_rate": 8.216870325968218e-06, + "loss": 1.7061, + "step": 12313 + }, + { + "epoch": 1.0495184522287564, + "grad_norm": 74.86170107258542, + "learning_rate": 8.21649071287291e-06, + "loss": 3.7523, + "step": 12314 + }, + { + "epoch": 1.0496036819227819, + "grad_norm": 68.20328189199056, + "learning_rate": 8.216111068144693e-06, + "loss": 2.7123, + "step": 12315 + }, + { + "epoch": 1.0496889116168073, + "grad_norm": 36.08184862363077, + "learning_rate": 8.2157313917873e-06, + "loss": 3.4008, + "step": 12316 + }, + { + "epoch": 1.0497741413108328, + "grad_norm": 20.314217636082507, + "learning_rate": 8.215351683804469e-06, + "loss": 1.1232, + "step": 12317 + }, + { + "epoch": 1.049859371004858, + "grad_norm": 121.20136091190405, + "learning_rate": 8.21497194419993e-06, + "loss": 4.0579, + "step": 12318 + }, + { + "epoch": 1.0499446006988835, + "grad_norm": 73.14524760186856, + "learning_rate": 8.21459217297742e-06, + "loss": 2.8425, + "step": 12319 + }, + { + "epoch": 1.050029830392909, + "grad_norm": 39.70783451224723, + "learning_rate": 8.214212370140673e-06, + "loss": 1.8518, + "step": 12320 + }, + { + "epoch": 1.0501150600869342, + "grad_norm": 21.776548932449167, + "learning_rate": 8.213832535693426e-06, + "loss": 1.2441, + "step": 12321 + }, + { + "epoch": 1.0502002897809597, + "grad_norm": 47.55736850209521, + "learning_rate": 8.213452669639412e-06, + "loss": 3.2274, + "step": 12322 + }, + { + "epoch": 1.0502855194749852, + "grad_norm": 62.04855335352334, + "learning_rate": 8.213072771982368e-06, + "loss": 2.2327, + "step": 12323 + }, + { + "epoch": 1.0503707491690104, + "grad_norm": 47.8243402842579, + "learning_rate": 8.21269284272603e-06, + "loss": 2.5583, + "step": 12324 + }, + { + "epoch": 1.0504559788630359, + "grad_norm": 50.615201050925485, + "learning_rate": 8.212312881874135e-06, + "loss": 3.4671, + "step": 12325 + }, + { + "epoch": 1.0505412085570613, + "grad_norm": 61.25653258511651, + "learning_rate": 8.21193288943042e-06, + "loss": 2.7592, + "step": 12326 + }, + { + "epoch": 1.0506264382510866, + "grad_norm": 36.55115530475627, + "learning_rate": 8.21155286539862e-06, + "loss": 2.6258, + "step": 12327 + }, + { + "epoch": 1.050711667945112, + "grad_norm": 50.58043822711448, + "learning_rate": 8.211172809782473e-06, + "loss": 2.7293, + "step": 12328 + }, + { + "epoch": 1.0507968976391375, + "grad_norm": 61.52683371865312, + "learning_rate": 8.210792722585719e-06, + "loss": 3.8041, + "step": 12329 + }, + { + "epoch": 1.0508821273331628, + "grad_norm": 107.8678235693519, + "learning_rate": 8.210412603812095e-06, + "loss": 3.0156, + "step": 12330 + }, + { + "epoch": 1.0509673570271882, + "grad_norm": 58.71279936900484, + "learning_rate": 8.210032453465336e-06, + "loss": 3.4561, + "step": 12331 + }, + { + "epoch": 1.0510525867212137, + "grad_norm": 45.55422243875077, + "learning_rate": 8.209652271549185e-06, + "loss": 3.8077, + "step": 12332 + }, + { + "epoch": 1.0511378164152392, + "grad_norm": 37.47576321778675, + "learning_rate": 8.20927205806738e-06, + "loss": 2.8283, + "step": 12333 + }, + { + "epoch": 1.0512230461092644, + "grad_norm": 52.19931385215736, + "learning_rate": 8.208891813023659e-06, + "loss": 2.6966, + "step": 12334 + }, + { + "epoch": 1.05130827580329, + "grad_norm": 41.7094514326587, + "learning_rate": 8.208511536421762e-06, + "loss": 3.2589, + "step": 12335 + }, + { + "epoch": 1.0513935054973154, + "grad_norm": 82.18902108684576, + "learning_rate": 8.208131228265428e-06, + "loss": 3.2912, + "step": 12336 + }, + { + "epoch": 1.0514787351913406, + "grad_norm": 96.95984814690583, + "learning_rate": 8.207750888558398e-06, + "loss": 4.3806, + "step": 12337 + }, + { + "epoch": 1.051563964885366, + "grad_norm": 92.72098918469526, + "learning_rate": 8.207370517304415e-06, + "loss": 3.294, + "step": 12338 + }, + { + "epoch": 1.0516491945793915, + "grad_norm": 45.00512105256943, + "learning_rate": 8.206990114507217e-06, + "loss": 2.2879, + "step": 12339 + }, + { + "epoch": 1.0517344242734168, + "grad_norm": 59.830840594786686, + "learning_rate": 8.206609680170543e-06, + "loss": 2.8595, + "step": 12340 + }, + { + "epoch": 1.0518196539674423, + "grad_norm": 84.0182588530327, + "learning_rate": 8.206229214298138e-06, + "loss": 3.5769, + "step": 12341 + }, + { + "epoch": 1.0519048836614677, + "grad_norm": 44.49922445984304, + "learning_rate": 8.205848716893745e-06, + "loss": 3.0383, + "step": 12342 + }, + { + "epoch": 1.051990113355493, + "grad_norm": 94.2466705753641, + "learning_rate": 8.2054681879611e-06, + "loss": 3.5823, + "step": 12343 + }, + { + "epoch": 1.0520753430495184, + "grad_norm": 39.10593655957083, + "learning_rate": 8.205087627503952e-06, + "loss": 2.3999, + "step": 12344 + }, + { + "epoch": 1.052160572743544, + "grad_norm": 39.46584726620626, + "learning_rate": 8.204707035526041e-06, + "loss": 2.9838, + "step": 12345 + }, + { + "epoch": 1.0522458024375692, + "grad_norm": 22.890450468559123, + "learning_rate": 8.204326412031106e-06, + "loss": 2.4545, + "step": 12346 + }, + { + "epoch": 1.0523310321315946, + "grad_norm": 94.94497171545166, + "learning_rate": 8.2039457570229e-06, + "loss": 3.877, + "step": 12347 + }, + { + "epoch": 1.05241626182562, + "grad_norm": 44.5645879522767, + "learning_rate": 8.203565070505155e-06, + "loss": 2.6533, + "step": 12348 + }, + { + "epoch": 1.0525014915196453, + "grad_norm": 38.87762609672025, + "learning_rate": 8.203184352481623e-06, + "loss": 2.957, + "step": 12349 + }, + { + "epoch": 1.0525867212136708, + "grad_norm": 73.25445835292493, + "learning_rate": 8.202803602956047e-06, + "loss": 2.6182, + "step": 12350 + }, + { + "epoch": 1.0526719509076963, + "grad_norm": 43.421975780191865, + "learning_rate": 8.202422821932167e-06, + "loss": 3.1964, + "step": 12351 + }, + { + "epoch": 1.0527571806017217, + "grad_norm": 40.80852082530399, + "learning_rate": 8.202042009413735e-06, + "loss": 2.806, + "step": 12352 + }, + { + "epoch": 1.052842410295747, + "grad_norm": 28.057850484929475, + "learning_rate": 8.201661165404489e-06, + "loss": 2.2467, + "step": 12353 + }, + { + "epoch": 1.0529276399897725, + "grad_norm": 43.890464926815916, + "learning_rate": 8.201280289908179e-06, + "loss": 3.4861, + "step": 12354 + }, + { + "epoch": 1.053012869683798, + "grad_norm": 36.061226723551485, + "learning_rate": 8.200899382928549e-06, + "loss": 2.6348, + "step": 12355 + }, + { + "epoch": 1.0530980993778232, + "grad_norm": 25.317185236168456, + "learning_rate": 8.200518444469346e-06, + "loss": 2.3494, + "step": 12356 + }, + { + "epoch": 1.0531833290718486, + "grad_norm": 37.82557358277883, + "learning_rate": 8.200137474534314e-06, + "loss": 2.7614, + "step": 12357 + }, + { + "epoch": 1.053268558765874, + "grad_norm": 51.15171970970592, + "learning_rate": 8.199756473127204e-06, + "loss": 2.0043, + "step": 12358 + }, + { + "epoch": 1.0533537884598994, + "grad_norm": 35.83521089108219, + "learning_rate": 8.199375440251759e-06, + "loss": 2.6139, + "step": 12359 + }, + { + "epoch": 1.0534390181539248, + "grad_norm": 39.0409672635122, + "learning_rate": 8.198994375911729e-06, + "loss": 2.1842, + "step": 12360 + }, + { + "epoch": 1.0535242478479503, + "grad_norm": 42.72196185539563, + "learning_rate": 8.198613280110858e-06, + "loss": 2.9831, + "step": 12361 + }, + { + "epoch": 1.0536094775419755, + "grad_norm": 108.14113303702567, + "learning_rate": 8.198232152852899e-06, + "loss": 4.5294, + "step": 12362 + }, + { + "epoch": 1.053694707236001, + "grad_norm": 61.20821735863888, + "learning_rate": 8.197850994141597e-06, + "loss": 3.1271, + "step": 12363 + }, + { + "epoch": 1.0537799369300265, + "grad_norm": 53.03835804263051, + "learning_rate": 8.197469803980703e-06, + "loss": 3.0211, + "step": 12364 + }, + { + "epoch": 1.0538651666240517, + "grad_norm": 60.032628313644246, + "learning_rate": 8.19708858237396e-06, + "loss": 2.7471, + "step": 12365 + }, + { + "epoch": 1.0539503963180772, + "grad_norm": 44.20825371079868, + "learning_rate": 8.196707329325124e-06, + "loss": 3.5055, + "step": 12366 + }, + { + "epoch": 1.0540356260121027, + "grad_norm": 66.84899230866516, + "learning_rate": 8.19632604483794e-06, + "loss": 3.6516, + "step": 12367 + }, + { + "epoch": 1.054120855706128, + "grad_norm": 34.63714321122762, + "learning_rate": 8.19594472891616e-06, + "loss": 2.4251, + "step": 12368 + }, + { + "epoch": 1.0542060854001534, + "grad_norm": 44.73226416643598, + "learning_rate": 8.195563381563535e-06, + "loss": 3.5953, + "step": 12369 + }, + { + "epoch": 1.0542913150941788, + "grad_norm": 60.71753259614225, + "learning_rate": 8.195182002783813e-06, + "loss": 3.724, + "step": 12370 + }, + { + "epoch": 1.0543765447882043, + "grad_norm": 28.208950022775234, + "learning_rate": 8.194800592580745e-06, + "loss": 2.8233, + "step": 12371 + }, + { + "epoch": 1.0544617744822296, + "grad_norm": 39.128064862515195, + "learning_rate": 8.194419150958084e-06, + "loss": 3.0656, + "step": 12372 + }, + { + "epoch": 1.054547004176255, + "grad_norm": 37.493957222472616, + "learning_rate": 8.194037677919578e-06, + "loss": 2.7214, + "step": 12373 + }, + { + "epoch": 1.0546322338702805, + "grad_norm": 28.02763389793756, + "learning_rate": 8.193656173468982e-06, + "loss": 2.1243, + "step": 12374 + }, + { + "epoch": 1.0547174635643057, + "grad_norm": 58.77493545084015, + "learning_rate": 8.193274637610047e-06, + "loss": 2.8434, + "step": 12375 + }, + { + "epoch": 1.0548026932583312, + "grad_norm": 63.948227011498204, + "learning_rate": 8.192893070346523e-06, + "loss": 3.3627, + "step": 12376 + }, + { + "epoch": 1.0548879229523567, + "grad_norm": 82.5452302552461, + "learning_rate": 8.192511471682167e-06, + "loss": 3.4085, + "step": 12377 + }, + { + "epoch": 1.054973152646382, + "grad_norm": 43.464902932579555, + "learning_rate": 8.192129841620727e-06, + "loss": 3.0402, + "step": 12378 + }, + { + "epoch": 1.0550583823404074, + "grad_norm": 63.47680122068659, + "learning_rate": 8.191748180165959e-06, + "loss": 3.0199, + "step": 12379 + }, + { + "epoch": 1.0551436120344329, + "grad_norm": 65.73890813745099, + "learning_rate": 8.191366487321617e-06, + "loss": 2.8783, + "step": 12380 + }, + { + "epoch": 1.055228841728458, + "grad_norm": 63.23294746868061, + "learning_rate": 8.190984763091452e-06, + "loss": 2.6769, + "step": 12381 + }, + { + "epoch": 1.0553140714224836, + "grad_norm": 38.975879164712424, + "learning_rate": 8.19060300747922e-06, + "loss": 3.5103, + "step": 12382 + }, + { + "epoch": 1.055399301116509, + "grad_norm": 58.69155296776019, + "learning_rate": 8.190221220488677e-06, + "loss": 3.216, + "step": 12383 + }, + { + "epoch": 1.0554845308105343, + "grad_norm": 33.60078096319299, + "learning_rate": 8.189839402123575e-06, + "loss": 2.4017, + "step": 12384 + }, + { + "epoch": 1.0555697605045598, + "grad_norm": 42.74732348390728, + "learning_rate": 8.18945755238767e-06, + "loss": 3.231, + "step": 12385 + }, + { + "epoch": 1.0556549901985852, + "grad_norm": 31.184789286845398, + "learning_rate": 8.189075671284716e-06, + "loss": 2.5637, + "step": 12386 + }, + { + "epoch": 1.0557402198926107, + "grad_norm": 44.90493933530824, + "learning_rate": 8.188693758818471e-06, + "loss": 3.587, + "step": 12387 + }, + { + "epoch": 1.055825449586636, + "grad_norm": 44.80189051110293, + "learning_rate": 8.18831181499269e-06, + "loss": 3.4699, + "step": 12388 + }, + { + "epoch": 1.0559106792806614, + "grad_norm": 59.6371572815246, + "learning_rate": 8.18792983981113e-06, + "loss": 3.1251, + "step": 12389 + }, + { + "epoch": 1.0559959089746869, + "grad_norm": 39.72684740689227, + "learning_rate": 8.187547833277547e-06, + "loss": 2.8366, + "step": 12390 + }, + { + "epoch": 1.0560811386687121, + "grad_norm": 93.9620845185046, + "learning_rate": 8.187165795395696e-06, + "loss": 3.3391, + "step": 12391 + }, + { + "epoch": 1.0561663683627376, + "grad_norm": 38.890114677516365, + "learning_rate": 8.186783726169338e-06, + "loss": 3.3418, + "step": 12392 + }, + { + "epoch": 1.056251598056763, + "grad_norm": 47.560962232156854, + "learning_rate": 8.186401625602226e-06, + "loss": 2.9575, + "step": 12393 + }, + { + "epoch": 1.0563368277507883, + "grad_norm": 94.44840342105563, + "learning_rate": 8.186019493698122e-06, + "loss": 3.7405, + "step": 12394 + }, + { + "epoch": 1.0564220574448138, + "grad_norm": 43.064139449188254, + "learning_rate": 8.185637330460781e-06, + "loss": 2.5288, + "step": 12395 + }, + { + "epoch": 1.0565072871388392, + "grad_norm": 36.299807118853906, + "learning_rate": 8.185255135893964e-06, + "loss": 3.1189, + "step": 12396 + }, + { + "epoch": 1.0565925168328645, + "grad_norm": 41.03665202522496, + "learning_rate": 8.184872910001428e-06, + "loss": 3.5257, + "step": 12397 + }, + { + "epoch": 1.05667774652689, + "grad_norm": 114.58919325473067, + "learning_rate": 8.18449065278693e-06, + "loss": 4.3487, + "step": 12398 + }, + { + "epoch": 1.0567629762209154, + "grad_norm": 40.24810828305401, + "learning_rate": 8.184108364254236e-06, + "loss": 3.0458, + "step": 12399 + }, + { + "epoch": 1.0568482059149407, + "grad_norm": 39.13659178333236, + "learning_rate": 8.183726044407097e-06, + "loss": 2.5584, + "step": 12400 + }, + { + "epoch": 1.0569334356089661, + "grad_norm": 91.99142144555853, + "learning_rate": 8.183343693249282e-06, + "loss": 4.5842, + "step": 12401 + }, + { + "epoch": 1.0570186653029916, + "grad_norm": 37.268453123209035, + "learning_rate": 8.182961310784543e-06, + "loss": 2.968, + "step": 12402 + }, + { + "epoch": 1.057103894997017, + "grad_norm": 31.231558944967357, + "learning_rate": 8.182578897016646e-06, + "loss": 3.1407, + "step": 12403 + }, + { + "epoch": 1.0571891246910423, + "grad_norm": 39.56605422707412, + "learning_rate": 8.18219645194935e-06, + "loss": 3.1785, + "step": 12404 + }, + { + "epoch": 1.0572743543850678, + "grad_norm": 41.618297048513874, + "learning_rate": 8.181813975586417e-06, + "loss": 3.068, + "step": 12405 + }, + { + "epoch": 1.0573595840790933, + "grad_norm": 33.53866522613749, + "learning_rate": 8.181431467931606e-06, + "loss": 2.9479, + "step": 12406 + }, + { + "epoch": 1.0574448137731185, + "grad_norm": 89.11197369402763, + "learning_rate": 8.181048928988684e-06, + "loss": 3.9707, + "step": 12407 + }, + { + "epoch": 1.057530043467144, + "grad_norm": 38.800997481244984, + "learning_rate": 8.180666358761406e-06, + "loss": 3.3276, + "step": 12408 + }, + { + "epoch": 1.0576152731611694, + "grad_norm": 38.702145567834336, + "learning_rate": 8.18028375725354e-06, + "loss": 2.7523, + "step": 12409 + }, + { + "epoch": 1.0577005028551947, + "grad_norm": 36.71283646885339, + "learning_rate": 8.179901124468848e-06, + "loss": 3.0084, + "step": 12410 + }, + { + "epoch": 1.0577857325492201, + "grad_norm": 57.365548241381106, + "learning_rate": 8.17951846041109e-06, + "loss": 2.4457, + "step": 12411 + }, + { + "epoch": 1.0578709622432456, + "grad_norm": 35.72527567329069, + "learning_rate": 8.179135765084033e-06, + "loss": 3.5063, + "step": 12412 + }, + { + "epoch": 1.0579561919372709, + "grad_norm": 48.837638527227924, + "learning_rate": 8.178753038491439e-06, + "loss": 3.25, + "step": 12413 + }, + { + "epoch": 1.0580414216312963, + "grad_norm": 39.1495922620868, + "learning_rate": 8.178370280637071e-06, + "loss": 3.3231, + "step": 12414 + }, + { + "epoch": 1.0581266513253218, + "grad_norm": 55.85636319681912, + "learning_rate": 8.177987491524694e-06, + "loss": 2.9561, + "step": 12415 + }, + { + "epoch": 1.058211881019347, + "grad_norm": 51.15495517451609, + "learning_rate": 8.177604671158074e-06, + "loss": 2.749, + "step": 12416 + }, + { + "epoch": 1.0582971107133725, + "grad_norm": 24.11302678973945, + "learning_rate": 8.177221819540974e-06, + "loss": 1.9153, + "step": 12417 + }, + { + "epoch": 1.058382340407398, + "grad_norm": 77.53777057834182, + "learning_rate": 8.176838936677161e-06, + "loss": 2.5293, + "step": 12418 + }, + { + "epoch": 1.0584675701014232, + "grad_norm": 15.398351182682681, + "learning_rate": 8.176456022570397e-06, + "loss": 1.1597, + "step": 12419 + }, + { + "epoch": 1.0585527997954487, + "grad_norm": 74.30074005216059, + "learning_rate": 8.176073077224452e-06, + "loss": 2.3205, + "step": 12420 + }, + { + "epoch": 1.0586380294894742, + "grad_norm": 54.08811396306272, + "learning_rate": 8.17569010064309e-06, + "loss": 3.8448, + "step": 12421 + }, + { + "epoch": 1.0587232591834996, + "grad_norm": 89.57635435628785, + "learning_rate": 8.175307092830078e-06, + "loss": 4.8877, + "step": 12422 + }, + { + "epoch": 1.0588084888775249, + "grad_norm": 52.370491681892815, + "learning_rate": 8.174924053789182e-06, + "loss": 3.9656, + "step": 12423 + }, + { + "epoch": 1.0588937185715503, + "grad_norm": 39.078423435290404, + "learning_rate": 8.174540983524169e-06, + "loss": 3.536, + "step": 12424 + }, + { + "epoch": 1.0589789482655758, + "grad_norm": 142.71230887096604, + "learning_rate": 8.17415788203881e-06, + "loss": 3.8778, + "step": 12425 + }, + { + "epoch": 1.059064177959601, + "grad_norm": 64.81953159295931, + "learning_rate": 8.173774749336865e-06, + "loss": 2.4464, + "step": 12426 + }, + { + "epoch": 1.0591494076536265, + "grad_norm": 40.77955562218317, + "learning_rate": 8.17339158542211e-06, + "loss": 2.0762, + "step": 12427 + }, + { + "epoch": 1.059234637347652, + "grad_norm": 48.94994089231049, + "learning_rate": 8.173008390298308e-06, + "loss": 3.4613, + "step": 12428 + }, + { + "epoch": 1.0593198670416772, + "grad_norm": 82.10116496323346, + "learning_rate": 8.17262516396923e-06, + "loss": 2.7892, + "step": 12429 + }, + { + "epoch": 1.0594050967357027, + "grad_norm": 49.90471026367373, + "learning_rate": 8.172241906438642e-06, + "loss": 3.2039, + "step": 12430 + }, + { + "epoch": 1.0594903264297282, + "grad_norm": 43.992201518646034, + "learning_rate": 8.171858617710317e-06, + "loss": 2.1708, + "step": 12431 + }, + { + "epoch": 1.0595755561237534, + "grad_norm": 102.69202052270666, + "learning_rate": 8.171475297788024e-06, + "loss": 4.2676, + "step": 12432 + }, + { + "epoch": 1.059660785817779, + "grad_norm": 59.82153755169923, + "learning_rate": 8.17109194667553e-06, + "loss": 3.596, + "step": 12433 + }, + { + "epoch": 1.0597460155118044, + "grad_norm": 59.46359049346759, + "learning_rate": 8.170708564376607e-06, + "loss": 2.6177, + "step": 12434 + }, + { + "epoch": 1.0598312452058296, + "grad_norm": 323.0150410019383, + "learning_rate": 8.170325150895027e-06, + "loss": 2.2516, + "step": 12435 + }, + { + "epoch": 1.059916474899855, + "grad_norm": 36.20613596380276, + "learning_rate": 8.169941706234558e-06, + "loss": 2.7027, + "step": 12436 + }, + { + "epoch": 1.0600017045938805, + "grad_norm": 31.74330442905819, + "learning_rate": 8.169558230398971e-06, + "loss": 3.2638, + "step": 12437 + }, + { + "epoch": 1.060086934287906, + "grad_norm": 58.66118005264136, + "learning_rate": 8.16917472339204e-06, + "loss": 2.7248, + "step": 12438 + }, + { + "epoch": 1.0601721639819313, + "grad_norm": 90.06167096836897, + "learning_rate": 8.168791185217533e-06, + "loss": 2.7732, + "step": 12439 + }, + { + "epoch": 1.0602573936759567, + "grad_norm": 32.722227817345555, + "learning_rate": 8.168407615879225e-06, + "loss": 2.0434, + "step": 12440 + }, + { + "epoch": 1.0603426233699822, + "grad_norm": 29.08718043284042, + "learning_rate": 8.168024015380886e-06, + "loss": 2.0477, + "step": 12441 + }, + { + "epoch": 1.0604278530640074, + "grad_norm": 50.08444083486948, + "learning_rate": 8.16764038372629e-06, + "loss": 3.5524, + "step": 12442 + }, + { + "epoch": 1.060513082758033, + "grad_norm": 57.888068290805805, + "learning_rate": 8.16725672091921e-06, + "loss": 2.6965, + "step": 12443 + }, + { + "epoch": 1.0605983124520584, + "grad_norm": 30.71677485444328, + "learning_rate": 8.16687302696342e-06, + "loss": 3.0671, + "step": 12444 + }, + { + "epoch": 1.0606835421460836, + "grad_norm": 73.85059036728435, + "learning_rate": 8.16648930186269e-06, + "loss": 3.9544, + "step": 12445 + }, + { + "epoch": 1.060768771840109, + "grad_norm": 125.07253933237703, + "learning_rate": 8.166105545620798e-06, + "loss": 3.0986, + "step": 12446 + }, + { + "epoch": 1.0608540015341346, + "grad_norm": 34.857651542328476, + "learning_rate": 8.165721758241512e-06, + "loss": 3.1546, + "step": 12447 + }, + { + "epoch": 1.0609392312281598, + "grad_norm": 72.0463308821767, + "learning_rate": 8.165337939728615e-06, + "loss": 3.8534, + "step": 12448 + }, + { + "epoch": 1.0610244609221853, + "grad_norm": 52.7375472376749, + "learning_rate": 8.164954090085875e-06, + "loss": 3.636, + "step": 12449 + }, + { + "epoch": 1.0611096906162107, + "grad_norm": 81.67253657170592, + "learning_rate": 8.164570209317069e-06, + "loss": 2.2448, + "step": 12450 + }, + { + "epoch": 1.061194920310236, + "grad_norm": 86.28229097399719, + "learning_rate": 8.164186297425974e-06, + "loss": 3.0531, + "step": 12451 + }, + { + "epoch": 1.0612801500042615, + "grad_norm": 164.3578976690741, + "learning_rate": 8.163802354416361e-06, + "loss": 3.0208, + "step": 12452 + }, + { + "epoch": 1.061365379698287, + "grad_norm": 35.2042386121262, + "learning_rate": 8.163418380292013e-06, + "loss": 3.5885, + "step": 12453 + }, + { + "epoch": 1.0614506093923124, + "grad_norm": 49.42166316007936, + "learning_rate": 8.1630343750567e-06, + "loss": 3.0799, + "step": 12454 + }, + { + "epoch": 1.0615358390863376, + "grad_norm": 53.112929596319326, + "learning_rate": 8.1626503387142e-06, + "loss": 3.0804, + "step": 12455 + }, + { + "epoch": 1.061621068780363, + "grad_norm": 37.23776411244933, + "learning_rate": 8.162266271268292e-06, + "loss": 2.613, + "step": 12456 + }, + { + "epoch": 1.0617062984743886, + "grad_norm": 71.47580183791841, + "learning_rate": 8.161882172722752e-06, + "loss": 3.4266, + "step": 12457 + }, + { + "epoch": 1.0617915281684138, + "grad_norm": 113.18612956395705, + "learning_rate": 8.161498043081357e-06, + "loss": 2.2297, + "step": 12458 + }, + { + "epoch": 1.0618767578624393, + "grad_norm": 38.86443363202475, + "learning_rate": 8.161113882347885e-06, + "loss": 2.8654, + "step": 12459 + }, + { + "epoch": 1.0619619875564648, + "grad_norm": 50.344026146078264, + "learning_rate": 8.160729690526113e-06, + "loss": 2.9438, + "step": 12460 + }, + { + "epoch": 1.06204721725049, + "grad_norm": 57.183854883312684, + "learning_rate": 8.160345467619821e-06, + "loss": 3.0722, + "step": 12461 + }, + { + "epoch": 1.0621324469445155, + "grad_norm": 30.165279803182624, + "learning_rate": 8.159961213632788e-06, + "loss": 2.3883, + "step": 12462 + }, + { + "epoch": 1.062217676638541, + "grad_norm": 35.78406590080792, + "learning_rate": 8.159576928568791e-06, + "loss": 2.2116, + "step": 12463 + }, + { + "epoch": 1.0623029063325662, + "grad_norm": 73.85395615915633, + "learning_rate": 8.15919261243161e-06, + "loss": 2.9113, + "step": 12464 + }, + { + "epoch": 1.0623881360265917, + "grad_norm": 84.87271635372448, + "learning_rate": 8.158808265225027e-06, + "loss": 3.1434, + "step": 12465 + }, + { + "epoch": 1.0624733657206171, + "grad_norm": 67.22090398540563, + "learning_rate": 8.158423886952818e-06, + "loss": 3.3646, + "step": 12466 + }, + { + "epoch": 1.0625585954146424, + "grad_norm": 73.52984338273517, + "learning_rate": 8.158039477618767e-06, + "loss": 3.0109, + "step": 12467 + }, + { + "epoch": 1.0626438251086678, + "grad_norm": 34.65628692420082, + "learning_rate": 8.157655037226651e-06, + "loss": 2.3882, + "step": 12468 + }, + { + "epoch": 1.0627290548026933, + "grad_norm": 69.97902098217747, + "learning_rate": 8.157270565780253e-06, + "loss": 3.2185, + "step": 12469 + }, + { + "epoch": 1.0628142844967186, + "grad_norm": 31.818021853774212, + "learning_rate": 8.156886063283354e-06, + "loss": 2.9095, + "step": 12470 + }, + { + "epoch": 1.062899514190744, + "grad_norm": 36.48822684549608, + "learning_rate": 8.156501529739733e-06, + "loss": 3.0773, + "step": 12471 + }, + { + "epoch": 1.0629847438847695, + "grad_norm": 71.78346074753082, + "learning_rate": 8.156116965153176e-06, + "loss": 4.3118, + "step": 12472 + }, + { + "epoch": 1.063069973578795, + "grad_norm": 61.07116228916157, + "learning_rate": 8.155732369527462e-06, + "loss": 3.0143, + "step": 12473 + }, + { + "epoch": 1.0631552032728202, + "grad_norm": 104.45012733209734, + "learning_rate": 8.155347742866374e-06, + "loss": 3.9818, + "step": 12474 + }, + { + "epoch": 1.0632404329668457, + "grad_norm": 43.236991166392706, + "learning_rate": 8.154963085173696e-06, + "loss": 3.2191, + "step": 12475 + }, + { + "epoch": 1.0633256626608711, + "grad_norm": 57.92179638263502, + "learning_rate": 8.154578396453208e-06, + "loss": 4.0096, + "step": 12476 + }, + { + "epoch": 1.0634108923548964, + "grad_norm": 33.913884906168896, + "learning_rate": 8.154193676708696e-06, + "loss": 3.3002, + "step": 12477 + }, + { + "epoch": 1.0634961220489219, + "grad_norm": 140.6598070231648, + "learning_rate": 8.153808925943941e-06, + "loss": 2.4097, + "step": 12478 + }, + { + "epoch": 1.0635813517429473, + "grad_norm": 36.220709076228474, + "learning_rate": 8.153424144162729e-06, + "loss": 3.0306, + "step": 12479 + }, + { + "epoch": 1.0636665814369726, + "grad_norm": 52.92393726893262, + "learning_rate": 8.153039331368845e-06, + "loss": 2.4034, + "step": 12480 + }, + { + "epoch": 1.063751811130998, + "grad_norm": 43.627293786132505, + "learning_rate": 8.152654487566071e-06, + "loss": 3.3083, + "step": 12481 + }, + { + "epoch": 1.0638370408250235, + "grad_norm": 22.100823988421396, + "learning_rate": 8.152269612758193e-06, + "loss": 1.5205, + "step": 12482 + }, + { + "epoch": 1.0639222705190488, + "grad_norm": 45.06196705127682, + "learning_rate": 8.151884706948995e-06, + "loss": 2.1543, + "step": 12483 + }, + { + "epoch": 1.0640075002130742, + "grad_norm": 37.995660665311576, + "learning_rate": 8.151499770142265e-06, + "loss": 2.5435, + "step": 12484 + }, + { + "epoch": 1.0640927299070997, + "grad_norm": 78.00527029856906, + "learning_rate": 8.151114802341787e-06, + "loss": 2.7159, + "step": 12485 + }, + { + "epoch": 1.064177959601125, + "grad_norm": 74.1600563430279, + "learning_rate": 8.150729803551347e-06, + "loss": 3.3975, + "step": 12486 + }, + { + "epoch": 1.0642631892951504, + "grad_norm": 71.32640061943427, + "learning_rate": 8.15034477377473e-06, + "loss": 3.4081, + "step": 12487 + }, + { + "epoch": 1.0643484189891759, + "grad_norm": 70.55030630461393, + "learning_rate": 8.149959713015724e-06, + "loss": 3.3481, + "step": 12488 + }, + { + "epoch": 1.0644336486832011, + "grad_norm": 67.34332114471967, + "learning_rate": 8.149574621278117e-06, + "loss": 3.1598, + "step": 12489 + }, + { + "epoch": 1.0645188783772266, + "grad_norm": 100.85146875880295, + "learning_rate": 8.149189498565695e-06, + "loss": 3.3759, + "step": 12490 + }, + { + "epoch": 1.064604108071252, + "grad_norm": 68.67413377692333, + "learning_rate": 8.148804344882244e-06, + "loss": 3.5588, + "step": 12491 + }, + { + "epoch": 1.0646893377652775, + "grad_norm": 36.66674819273631, + "learning_rate": 8.148419160231556e-06, + "loss": 4.3637, + "step": 12492 + }, + { + "epoch": 1.0647745674593028, + "grad_norm": 45.739625560313854, + "learning_rate": 8.148033944617415e-06, + "loss": 3.1569, + "step": 12493 + }, + { + "epoch": 1.0648597971533282, + "grad_norm": 44.25791450233891, + "learning_rate": 8.147648698043612e-06, + "loss": 3.737, + "step": 12494 + }, + { + "epoch": 1.0649450268473537, + "grad_norm": 30.510770866143925, + "learning_rate": 8.147263420513934e-06, + "loss": 2.4355, + "step": 12495 + }, + { + "epoch": 1.065030256541379, + "grad_norm": 36.54270488610095, + "learning_rate": 8.146878112032172e-06, + "loss": 2.49, + "step": 12496 + }, + { + "epoch": 1.0651154862354044, + "grad_norm": 115.58843367677844, + "learning_rate": 8.146492772602113e-06, + "loss": 3.9676, + "step": 12497 + }, + { + "epoch": 1.0652007159294299, + "grad_norm": 33.4398858545094, + "learning_rate": 8.146107402227548e-06, + "loss": 2.5209, + "step": 12498 + }, + { + "epoch": 1.0652859456234551, + "grad_norm": 78.0905047834702, + "learning_rate": 8.145722000912267e-06, + "loss": 3.0399, + "step": 12499 + }, + { + "epoch": 1.0653711753174806, + "grad_norm": 45.33588769248083, + "learning_rate": 8.145336568660062e-06, + "loss": 2.6768, + "step": 12500 + }, + { + "epoch": 1.065456405011506, + "grad_norm": 77.2340132709695, + "learning_rate": 8.14495110547472e-06, + "loss": 2.2461, + "step": 12501 + }, + { + "epoch": 1.0655416347055313, + "grad_norm": 30.89289891635652, + "learning_rate": 8.144565611360034e-06, + "loss": 2.6892, + "step": 12502 + }, + { + "epoch": 1.0656268643995568, + "grad_norm": 39.30904459606259, + "learning_rate": 8.144180086319792e-06, + "loss": 2.2081, + "step": 12503 + }, + { + "epoch": 1.0657120940935823, + "grad_norm": 35.768249120738496, + "learning_rate": 8.143794530357791e-06, + "loss": 2.8567, + "step": 12504 + }, + { + "epoch": 1.0657973237876077, + "grad_norm": 47.787424472251026, + "learning_rate": 8.14340894347782e-06, + "loss": 3.0884, + "step": 12505 + }, + { + "epoch": 1.065882553481633, + "grad_norm": 63.37369622567745, + "learning_rate": 8.14302332568367e-06, + "loss": 3.0342, + "step": 12506 + }, + { + "epoch": 1.0659677831756584, + "grad_norm": 77.53306751086656, + "learning_rate": 8.142637676979135e-06, + "loss": 4.4055, + "step": 12507 + }, + { + "epoch": 1.0660530128696837, + "grad_norm": 44.92258084797436, + "learning_rate": 8.142251997368006e-06, + "loss": 2.4637, + "step": 12508 + }, + { + "epoch": 1.0661382425637091, + "grad_norm": 38.83841731501982, + "learning_rate": 8.14186628685408e-06, + "loss": 3.2118, + "step": 12509 + }, + { + "epoch": 1.0662234722577346, + "grad_norm": 16.09075359018139, + "learning_rate": 8.141480545441146e-06, + "loss": 1.6526, + "step": 12510 + }, + { + "epoch": 1.06630870195176, + "grad_norm": 41.36793696442867, + "learning_rate": 8.141094773132998e-06, + "loss": 1.6776, + "step": 12511 + }, + { + "epoch": 1.0663939316457853, + "grad_norm": 38.597539497083055, + "learning_rate": 8.140708969933433e-06, + "loss": 3.1107, + "step": 12512 + }, + { + "epoch": 1.0664791613398108, + "grad_norm": 37.837625214325946, + "learning_rate": 8.140323135846245e-06, + "loss": 4.3273, + "step": 12513 + }, + { + "epoch": 1.0665643910338363, + "grad_norm": 65.96610726491292, + "learning_rate": 8.139937270875222e-06, + "loss": 3.0773, + "step": 12514 + }, + { + "epoch": 1.0666496207278615, + "grad_norm": 36.49621871979925, + "learning_rate": 8.139551375024166e-06, + "loss": 2.8383, + "step": 12515 + }, + { + "epoch": 1.066734850421887, + "grad_norm": 55.649479611119936, + "learning_rate": 8.13916544829687e-06, + "loss": 3.1506, + "step": 12516 + }, + { + "epoch": 1.0668200801159125, + "grad_norm": 53.68274754735032, + "learning_rate": 8.138779490697128e-06, + "loss": 2.6879, + "step": 12517 + }, + { + "epoch": 1.0669053098099377, + "grad_norm": 40.955131557047615, + "learning_rate": 8.138393502228739e-06, + "loss": 2.8483, + "step": 12518 + }, + { + "epoch": 1.0669905395039632, + "grad_norm": 79.38311349098281, + "learning_rate": 8.138007482895496e-06, + "loss": 3.2769, + "step": 12519 + }, + { + "epoch": 1.0670757691979886, + "grad_norm": 49.740556194419774, + "learning_rate": 8.137621432701195e-06, + "loss": 1.6382, + "step": 12520 + }, + { + "epoch": 1.0671609988920139, + "grad_norm": 65.23968463213241, + "learning_rate": 8.137235351649636e-06, + "loss": 3.2659, + "step": 12521 + }, + { + "epoch": 1.0672462285860393, + "grad_norm": 45.752752968992404, + "learning_rate": 8.136849239744612e-06, + "loss": 3.2946, + "step": 12522 + }, + { + "epoch": 1.0673314582800648, + "grad_norm": 43.03784784750645, + "learning_rate": 8.136463096989924e-06, + "loss": 3.1596, + "step": 12523 + }, + { + "epoch": 1.0674166879740903, + "grad_norm": 33.88219044991112, + "learning_rate": 8.136076923389367e-06, + "loss": 3.0126, + "step": 12524 + }, + { + "epoch": 1.0675019176681155, + "grad_norm": 52.28035255462966, + "learning_rate": 8.135690718946741e-06, + "loss": 2.3303, + "step": 12525 + }, + { + "epoch": 1.067587147362141, + "grad_norm": 87.03309618715046, + "learning_rate": 8.135304483665841e-06, + "loss": 3.5887, + "step": 12526 + }, + { + "epoch": 1.0676723770561665, + "grad_norm": 53.77449553371847, + "learning_rate": 8.134918217550467e-06, + "loss": 2.4042, + "step": 12527 + }, + { + "epoch": 1.0677576067501917, + "grad_norm": 46.52357633031342, + "learning_rate": 8.134531920604418e-06, + "loss": 3.3371, + "step": 12528 + }, + { + "epoch": 1.0678428364442172, + "grad_norm": 95.28190599600265, + "learning_rate": 8.134145592831496e-06, + "loss": 3.1665, + "step": 12529 + }, + { + "epoch": 1.0679280661382426, + "grad_norm": 35.14038260854265, + "learning_rate": 8.133759234235494e-06, + "loss": 3.1447, + "step": 12530 + }, + { + "epoch": 1.068013295832268, + "grad_norm": 35.62310041243002, + "learning_rate": 8.133372844820215e-06, + "loss": 2.7498, + "step": 12531 + }, + { + "epoch": 1.0680985255262934, + "grad_norm": 65.09144847290169, + "learning_rate": 8.132986424589463e-06, + "loss": 2.9064, + "step": 12532 + }, + { + "epoch": 1.0681837552203188, + "grad_norm": 77.1787521331629, + "learning_rate": 8.132599973547031e-06, + "loss": 2.5817, + "step": 12533 + }, + { + "epoch": 1.068268984914344, + "grad_norm": 41.84238147964929, + "learning_rate": 8.132213491696722e-06, + "loss": 3.283, + "step": 12534 + }, + { + "epoch": 1.0683542146083695, + "grad_norm": 80.57933917531007, + "learning_rate": 8.131826979042341e-06, + "loss": 3.0109, + "step": 12535 + }, + { + "epoch": 1.068439444302395, + "grad_norm": 55.15629110828298, + "learning_rate": 8.131440435587685e-06, + "loss": 2.2899, + "step": 12536 + }, + { + "epoch": 1.0685246739964203, + "grad_norm": 165.40435545396474, + "learning_rate": 8.131053861336558e-06, + "loss": 1.8935, + "step": 12537 + }, + { + "epoch": 1.0686099036904457, + "grad_norm": 102.12509028097088, + "learning_rate": 8.130667256292759e-06, + "loss": 2.6872, + "step": 12538 + }, + { + "epoch": 1.0686951333844712, + "grad_norm": 71.15219140817803, + "learning_rate": 8.130280620460092e-06, + "loss": 3.5394, + "step": 12539 + }, + { + "epoch": 1.0687803630784964, + "grad_norm": 43.67698627450446, + "learning_rate": 8.129893953842358e-06, + "loss": 2.6401, + "step": 12540 + }, + { + "epoch": 1.068865592772522, + "grad_norm": 36.27519001572424, + "learning_rate": 8.129507256443362e-06, + "loss": 2.9882, + "step": 12541 + }, + { + "epoch": 1.0689508224665474, + "grad_norm": 45.53543900890299, + "learning_rate": 8.129120528266903e-06, + "loss": 2.7068, + "step": 12542 + }, + { + "epoch": 1.0690360521605728, + "grad_norm": 35.75705072181633, + "learning_rate": 8.12873376931679e-06, + "loss": 2.607, + "step": 12543 + }, + { + "epoch": 1.069121281854598, + "grad_norm": 48.11365165147992, + "learning_rate": 8.128346979596821e-06, + "loss": 2.5292, + "step": 12544 + }, + { + "epoch": 1.0692065115486236, + "grad_norm": 59.10026278510981, + "learning_rate": 8.127960159110805e-06, + "loss": 2.3405, + "step": 12545 + }, + { + "epoch": 1.069291741242649, + "grad_norm": 49.817876144729624, + "learning_rate": 8.127573307862544e-06, + "loss": 2.889, + "step": 12546 + }, + { + "epoch": 1.0693769709366743, + "grad_norm": 76.16455809886308, + "learning_rate": 8.127186425855842e-06, + "loss": 3.8258, + "step": 12547 + }, + { + "epoch": 1.0694622006306997, + "grad_norm": 60.246019264318036, + "learning_rate": 8.126799513094503e-06, + "loss": 2.8513, + "step": 12548 + }, + { + "epoch": 1.0695474303247252, + "grad_norm": 32.250166869035624, + "learning_rate": 8.126412569582333e-06, + "loss": 1.5315, + "step": 12549 + }, + { + "epoch": 1.0696326600187505, + "grad_norm": 34.281522884822195, + "learning_rate": 8.126025595323138e-06, + "loss": 3.1986, + "step": 12550 + }, + { + "epoch": 1.069717889712776, + "grad_norm": 66.97030371805744, + "learning_rate": 8.125638590320723e-06, + "loss": 2.2019, + "step": 12551 + }, + { + "epoch": 1.0698031194068014, + "grad_norm": 55.76710128778755, + "learning_rate": 8.125251554578896e-06, + "loss": 2.9488, + "step": 12552 + }, + { + "epoch": 1.0698883491008266, + "grad_norm": 86.39849928109021, + "learning_rate": 8.12486448810146e-06, + "loss": 3.9989, + "step": 12553 + }, + { + "epoch": 1.069973578794852, + "grad_norm": 64.9541267386603, + "learning_rate": 8.124477390892224e-06, + "loss": 3.1946, + "step": 12554 + }, + { + "epoch": 1.0700588084888776, + "grad_norm": 25.105104218909112, + "learning_rate": 8.124090262954995e-06, + "loss": 1.8477, + "step": 12555 + }, + { + "epoch": 1.0701440381829028, + "grad_norm": 42.33792462889796, + "learning_rate": 8.12370310429358e-06, + "loss": 4.0216, + "step": 12556 + }, + { + "epoch": 1.0702292678769283, + "grad_norm": 36.00792004921003, + "learning_rate": 8.123315914911787e-06, + "loss": 2.0445, + "step": 12557 + }, + { + "epoch": 1.0703144975709538, + "grad_norm": 44.285205765979995, + "learning_rate": 8.122928694813422e-06, + "loss": 3.4271, + "step": 12558 + }, + { + "epoch": 1.070399727264979, + "grad_norm": 45.96092515447465, + "learning_rate": 8.122541444002293e-06, + "loss": 3.5317, + "step": 12559 + }, + { + "epoch": 1.0704849569590045, + "grad_norm": 52.387354557930166, + "learning_rate": 8.122154162482211e-06, + "loss": 3.6336, + "step": 12560 + }, + { + "epoch": 1.07057018665303, + "grad_norm": 81.28072624736332, + "learning_rate": 8.121766850256983e-06, + "loss": 2.9004, + "step": 12561 + }, + { + "epoch": 1.0706554163470554, + "grad_norm": 53.8894355185121, + "learning_rate": 8.121379507330418e-06, + "loss": 2.9523, + "step": 12562 + }, + { + "epoch": 1.0707406460410807, + "grad_norm": 66.04780628687547, + "learning_rate": 8.120992133706326e-06, + "loss": 2.8102, + "step": 12563 + }, + { + "epoch": 1.0708258757351061, + "grad_norm": 41.528899527896826, + "learning_rate": 8.120604729388518e-06, + "loss": 3.2838, + "step": 12564 + }, + { + "epoch": 1.0709111054291316, + "grad_norm": 87.40557101681743, + "learning_rate": 8.1202172943808e-06, + "loss": 3.4061, + "step": 12565 + }, + { + "epoch": 1.0709963351231568, + "grad_norm": 39.32002976265192, + "learning_rate": 8.119829828686987e-06, + "loss": 2.145, + "step": 12566 + }, + { + "epoch": 1.0710815648171823, + "grad_norm": 38.24217533469822, + "learning_rate": 8.119442332310887e-06, + "loss": 3.8991, + "step": 12567 + }, + { + "epoch": 1.0711667945112078, + "grad_norm": 35.55455218288104, + "learning_rate": 8.11905480525631e-06, + "loss": 2.7699, + "step": 12568 + }, + { + "epoch": 1.071252024205233, + "grad_norm": 68.6067432542342, + "learning_rate": 8.11866724752707e-06, + "loss": 3.1271, + "step": 12569 + }, + { + "epoch": 1.0713372538992585, + "grad_norm": 220.6699136236478, + "learning_rate": 8.118279659126976e-06, + "loss": 4.1277, + "step": 12570 + }, + { + "epoch": 1.071422483593284, + "grad_norm": 54.799665402809666, + "learning_rate": 8.117892040059841e-06, + "loss": 3.3209, + "step": 12571 + }, + { + "epoch": 1.0715077132873092, + "grad_norm": 65.65131420553874, + "learning_rate": 8.117504390329476e-06, + "loss": 2.9449, + "step": 12572 + }, + { + "epoch": 1.0715929429813347, + "grad_norm": 38.28750277953313, + "learning_rate": 8.117116709939693e-06, + "loss": 2.4667, + "step": 12573 + }, + { + "epoch": 1.0716781726753601, + "grad_norm": 73.37620044931647, + "learning_rate": 8.11672899889431e-06, + "loss": 3.0354, + "step": 12574 + }, + { + "epoch": 1.0717634023693856, + "grad_norm": 53.64197217471979, + "learning_rate": 8.11634125719713e-06, + "loss": 3.5142, + "step": 12575 + }, + { + "epoch": 1.0718486320634109, + "grad_norm": 39.21501170795757, + "learning_rate": 8.115953484851977e-06, + "loss": 2.4535, + "step": 12576 + }, + { + "epoch": 1.0719338617574363, + "grad_norm": 34.15785359589428, + "learning_rate": 8.115565681862658e-06, + "loss": 3.2708, + "step": 12577 + }, + { + "epoch": 1.0720190914514618, + "grad_norm": 31.715365827096058, + "learning_rate": 8.115177848232987e-06, + "loss": 2.0976, + "step": 12578 + }, + { + "epoch": 1.072104321145487, + "grad_norm": 39.87349016304755, + "learning_rate": 8.114789983966783e-06, + "loss": 2.9618, + "step": 12579 + }, + { + "epoch": 1.0721895508395125, + "grad_norm": 49.81975107884668, + "learning_rate": 8.114402089067854e-06, + "loss": 2.1401, + "step": 12580 + }, + { + "epoch": 1.072274780533538, + "grad_norm": 54.65466331086424, + "learning_rate": 8.114014163540018e-06, + "loss": 3.483, + "step": 12581 + }, + { + "epoch": 1.0723600102275632, + "grad_norm": 52.171448123085646, + "learning_rate": 8.113626207387092e-06, + "loss": 3.1908, + "step": 12582 + }, + { + "epoch": 1.0724452399215887, + "grad_norm": 28.290763189672994, + "learning_rate": 8.113238220612888e-06, + "loss": 2.5376, + "step": 12583 + }, + { + "epoch": 1.0725304696156142, + "grad_norm": 80.79430626762921, + "learning_rate": 8.112850203221224e-06, + "loss": 2.9384, + "step": 12584 + }, + { + "epoch": 1.0726156993096394, + "grad_norm": 113.82982497976163, + "learning_rate": 8.112462155215916e-06, + "loss": 3.0417, + "step": 12585 + }, + { + "epoch": 1.0727009290036649, + "grad_norm": 27.30224172318463, + "learning_rate": 8.11207407660078e-06, + "loss": 2.5616, + "step": 12586 + }, + { + "epoch": 1.0727861586976903, + "grad_norm": 124.1422457791999, + "learning_rate": 8.111685967379629e-06, + "loss": 3.9442, + "step": 12587 + }, + { + "epoch": 1.0728713883917156, + "grad_norm": 32.51910870722093, + "learning_rate": 8.111297827556285e-06, + "loss": 2.4912, + "step": 12588 + }, + { + "epoch": 1.072956618085741, + "grad_norm": 51.08621950696432, + "learning_rate": 8.110909657134563e-06, + "loss": 2.8624, + "step": 12589 + }, + { + "epoch": 1.0730418477797665, + "grad_norm": 71.75052839077757, + "learning_rate": 8.11052145611828e-06, + "loss": 3.0911, + "step": 12590 + }, + { + "epoch": 1.0731270774737918, + "grad_norm": 59.36168399526382, + "learning_rate": 8.110133224511254e-06, + "loss": 3.1843, + "step": 12591 + }, + { + "epoch": 1.0732123071678172, + "grad_norm": 83.55071316149787, + "learning_rate": 8.109744962317306e-06, + "loss": 2.7214, + "step": 12592 + }, + { + "epoch": 1.0732975368618427, + "grad_norm": 40.85137016114741, + "learning_rate": 8.10935666954025e-06, + "loss": 3.1463, + "step": 12593 + }, + { + "epoch": 1.0733827665558682, + "grad_norm": 35.39851489075863, + "learning_rate": 8.108968346183908e-06, + "loss": 2.3299, + "step": 12594 + }, + { + "epoch": 1.0734679962498934, + "grad_norm": 84.00615704618284, + "learning_rate": 8.108579992252094e-06, + "loss": 3.3147, + "step": 12595 + }, + { + "epoch": 1.0735532259439189, + "grad_norm": 32.656861589699275, + "learning_rate": 8.108191607748635e-06, + "loss": 2.7785, + "step": 12596 + }, + { + "epoch": 1.0736384556379444, + "grad_norm": 40.70926129459551, + "learning_rate": 8.107803192677346e-06, + "loss": 2.9518, + "step": 12597 + }, + { + "epoch": 1.0737236853319696, + "grad_norm": 20.891789292933407, + "learning_rate": 8.107414747042046e-06, + "loss": 1.8397, + "step": 12598 + }, + { + "epoch": 1.073808915025995, + "grad_norm": 54.72615009917968, + "learning_rate": 8.107026270846559e-06, + "loss": 3.1214, + "step": 12599 + }, + { + "epoch": 1.0738941447200205, + "grad_norm": 52.24026489328776, + "learning_rate": 8.1066377640947e-06, + "loss": 2.883, + "step": 12600 + }, + { + "epoch": 1.0739793744140458, + "grad_norm": 61.5216157052783, + "learning_rate": 8.106249226790296e-06, + "loss": 3.0957, + "step": 12601 + }, + { + "epoch": 1.0740646041080713, + "grad_norm": 115.22329521256056, + "learning_rate": 8.105860658937164e-06, + "loss": 2.801, + "step": 12602 + }, + { + "epoch": 1.0741498338020967, + "grad_norm": 36.32816446221272, + "learning_rate": 8.105472060539128e-06, + "loss": 3.0069, + "step": 12603 + }, + { + "epoch": 1.074235063496122, + "grad_norm": 57.838458512610615, + "learning_rate": 8.105083431600007e-06, + "loss": 2.4695, + "step": 12604 + }, + { + "epoch": 1.0743202931901474, + "grad_norm": 38.696606320979896, + "learning_rate": 8.104694772123624e-06, + "loss": 2.944, + "step": 12605 + }, + { + "epoch": 1.074405522884173, + "grad_norm": 128.10760569669253, + "learning_rate": 8.104306082113803e-06, + "loss": 2.866, + "step": 12606 + }, + { + "epoch": 1.0744907525781981, + "grad_norm": 88.38746913539642, + "learning_rate": 8.103917361574362e-06, + "loss": 3.8827, + "step": 12607 + }, + { + "epoch": 1.0745759822722236, + "grad_norm": 79.08137351937665, + "learning_rate": 8.10352861050913e-06, + "loss": 3.3787, + "step": 12608 + }, + { + "epoch": 1.074661211966249, + "grad_norm": 40.34490060319418, + "learning_rate": 8.103139828921926e-06, + "loss": 3.6252, + "step": 12609 + }, + { + "epoch": 1.0747464416602743, + "grad_norm": 40.29060260262021, + "learning_rate": 8.102751016816576e-06, + "loss": 2.5545, + "step": 12610 + }, + { + "epoch": 1.0748316713542998, + "grad_norm": 135.3062762426118, + "learning_rate": 8.102362174196903e-06, + "loss": 3.6967, + "step": 12611 + }, + { + "epoch": 1.0749169010483253, + "grad_norm": 71.28068673431387, + "learning_rate": 8.101973301066729e-06, + "loss": 3.2725, + "step": 12612 + }, + { + "epoch": 1.0750021307423507, + "grad_norm": 30.516636655299024, + "learning_rate": 8.10158439742988e-06, + "loss": 2.3988, + "step": 12613 + }, + { + "epoch": 1.075087360436376, + "grad_norm": 78.0553666880936, + "learning_rate": 8.101195463290184e-06, + "loss": 2.9333, + "step": 12614 + }, + { + "epoch": 1.0751725901304015, + "grad_norm": 34.04403233439402, + "learning_rate": 8.100806498651459e-06, + "loss": 2.99, + "step": 12615 + }, + { + "epoch": 1.075257819824427, + "grad_norm": 55.24109301379161, + "learning_rate": 8.100417503517537e-06, + "loss": 3.3154, + "step": 12616 + }, + { + "epoch": 1.0753430495184522, + "grad_norm": 41.85118233414052, + "learning_rate": 8.100028477892238e-06, + "loss": 2.9212, + "step": 12617 + }, + { + "epoch": 1.0754282792124776, + "grad_norm": 32.71469271955977, + "learning_rate": 8.099639421779395e-06, + "loss": 2.8101, + "step": 12618 + }, + { + "epoch": 1.075513508906503, + "grad_norm": 55.2985878143348, + "learning_rate": 8.099250335182827e-06, + "loss": 2.6242, + "step": 12619 + }, + { + "epoch": 1.0755987386005283, + "grad_norm": 34.51713644519942, + "learning_rate": 8.098861218106364e-06, + "loss": 2.904, + "step": 12620 + }, + { + "epoch": 1.0756839682945538, + "grad_norm": 41.00161088678413, + "learning_rate": 8.098472070553832e-06, + "loss": 2.5328, + "step": 12621 + }, + { + "epoch": 1.0757691979885793, + "grad_norm": 52.328594175252505, + "learning_rate": 8.09808289252906e-06, + "loss": 3.1835, + "step": 12622 + }, + { + "epoch": 1.0758544276826045, + "grad_norm": 34.02484896993443, + "learning_rate": 8.097693684035873e-06, + "loss": 3.2126, + "step": 12623 + }, + { + "epoch": 1.07593965737663, + "grad_norm": 21.092155040464874, + "learning_rate": 8.097304445078098e-06, + "loss": 2.1546, + "step": 12624 + }, + { + "epoch": 1.0760248870706555, + "grad_norm": 48.57142615853576, + "learning_rate": 8.096915175659566e-06, + "loss": 2.6496, + "step": 12625 + }, + { + "epoch": 1.0761101167646807, + "grad_norm": 219.89408218148745, + "learning_rate": 8.096525875784103e-06, + "loss": 3.2286, + "step": 12626 + }, + { + "epoch": 1.0761953464587062, + "grad_norm": 40.36770529381452, + "learning_rate": 8.096136545455539e-06, + "loss": 3.1002, + "step": 12627 + }, + { + "epoch": 1.0762805761527316, + "grad_norm": 24.25888405102912, + "learning_rate": 8.095747184677702e-06, + "loss": 1.4033, + "step": 12628 + }, + { + "epoch": 1.076365805846757, + "grad_norm": 45.44840454273748, + "learning_rate": 8.095357793454423e-06, + "loss": 3.1605, + "step": 12629 + }, + { + "epoch": 1.0764510355407824, + "grad_norm": 67.51290630746976, + "learning_rate": 8.094968371789527e-06, + "loss": 3.1094, + "step": 12630 + }, + { + "epoch": 1.0765362652348078, + "grad_norm": 44.094461169734025, + "learning_rate": 8.094578919686851e-06, + "loss": 3.1912, + "step": 12631 + }, + { + "epoch": 1.0766214949288333, + "grad_norm": 30.84350754406667, + "learning_rate": 8.094189437150217e-06, + "loss": 3.155, + "step": 12632 + }, + { + "epoch": 1.0767067246228585, + "grad_norm": 45.55182863849657, + "learning_rate": 8.093799924183464e-06, + "loss": 3.8539, + "step": 12633 + }, + { + "epoch": 1.076791954316884, + "grad_norm": 33.87446789816899, + "learning_rate": 8.093410380790412e-06, + "loss": 2.4239, + "step": 12634 + }, + { + "epoch": 1.0768771840109095, + "grad_norm": 124.30204022804277, + "learning_rate": 8.093020806974902e-06, + "loss": 2.0696, + "step": 12635 + }, + { + "epoch": 1.0769624137049347, + "grad_norm": 96.52667652691687, + "learning_rate": 8.092631202740762e-06, + "loss": 2.612, + "step": 12636 + }, + { + "epoch": 1.0770476433989602, + "grad_norm": 82.92940347818877, + "learning_rate": 8.09224156809182e-06, + "loss": 2.7467, + "step": 12637 + }, + { + "epoch": 1.0771328730929857, + "grad_norm": 240.36763262468733, + "learning_rate": 8.091851903031914e-06, + "loss": 2.9763, + "step": 12638 + }, + { + "epoch": 1.077218102787011, + "grad_norm": 63.744525751723934, + "learning_rate": 8.091462207564873e-06, + "loss": 4.3426, + "step": 12639 + }, + { + "epoch": 1.0773033324810364, + "grad_norm": 41.2907142417591, + "learning_rate": 8.091072481694526e-06, + "loss": 2.6852, + "step": 12640 + }, + { + "epoch": 1.0773885621750618, + "grad_norm": 37.61961942788321, + "learning_rate": 8.090682725424713e-06, + "loss": 3.5788, + "step": 12641 + }, + { + "epoch": 1.077473791869087, + "grad_norm": 30.872476287922005, + "learning_rate": 8.090292938759262e-06, + "loss": 1.8413, + "step": 12642 + }, + { + "epoch": 1.0775590215631126, + "grad_norm": 60.37926895031401, + "learning_rate": 8.089903121702009e-06, + "loss": 4.0218, + "step": 12643 + }, + { + "epoch": 1.077644251257138, + "grad_norm": 18.915719735233104, + "learning_rate": 8.089513274256786e-06, + "loss": 0.9036, + "step": 12644 + }, + { + "epoch": 1.0777294809511635, + "grad_norm": 54.56739945976745, + "learning_rate": 8.089123396427428e-06, + "loss": 2.9458, + "step": 12645 + }, + { + "epoch": 1.0778147106451887, + "grad_norm": 43.494403011547185, + "learning_rate": 8.088733488217768e-06, + "loss": 2.523, + "step": 12646 + }, + { + "epoch": 1.0778999403392142, + "grad_norm": 104.08857223500937, + "learning_rate": 8.088343549631642e-06, + "loss": 3.839, + "step": 12647 + }, + { + "epoch": 1.0779851700332397, + "grad_norm": 44.48628529581617, + "learning_rate": 8.087953580672886e-06, + "loss": 2.6794, + "step": 12648 + }, + { + "epoch": 1.078070399727265, + "grad_norm": 31.53291321583257, + "learning_rate": 8.08756358134533e-06, + "loss": 1.8694, + "step": 12649 + }, + { + "epoch": 1.0781556294212904, + "grad_norm": 42.24248890011395, + "learning_rate": 8.087173551652815e-06, + "loss": 2.1, + "step": 12650 + }, + { + "epoch": 1.0782408591153159, + "grad_norm": 99.72498779023077, + "learning_rate": 8.086783491599175e-06, + "loss": 4.5362, + "step": 12651 + }, + { + "epoch": 1.078326088809341, + "grad_norm": 116.57499570590214, + "learning_rate": 8.086393401188247e-06, + "loss": 3.4327, + "step": 12652 + }, + { + "epoch": 1.0784113185033666, + "grad_norm": 35.65630832681583, + "learning_rate": 8.086003280423862e-06, + "loss": 3.3669, + "step": 12653 + }, + { + "epoch": 1.078496548197392, + "grad_norm": 59.69225195311005, + "learning_rate": 8.085613129309866e-06, + "loss": 2.5845, + "step": 12654 + }, + { + "epoch": 1.0785817778914173, + "grad_norm": 31.171508253799452, + "learning_rate": 8.085222947850089e-06, + "loss": 2.584, + "step": 12655 + }, + { + "epoch": 1.0786670075854428, + "grad_norm": 140.43274426873555, + "learning_rate": 8.084832736048368e-06, + "loss": 3.667, + "step": 12656 + }, + { + "epoch": 1.0787522372794682, + "grad_norm": 51.28052103721047, + "learning_rate": 8.084442493908547e-06, + "loss": 2.3627, + "step": 12657 + }, + { + "epoch": 1.0788374669734935, + "grad_norm": 39.804859308024724, + "learning_rate": 8.084052221434458e-06, + "loss": 3.4178, + "step": 12658 + }, + { + "epoch": 1.078922696667519, + "grad_norm": 56.617291591482, + "learning_rate": 8.08366191862994e-06, + "loss": 3.3781, + "step": 12659 + }, + { + "epoch": 1.0790079263615444, + "grad_norm": 35.385832725852886, + "learning_rate": 8.083271585498833e-06, + "loss": 2.9726, + "step": 12660 + }, + { + "epoch": 1.0790931560555697, + "grad_norm": 56.58372037831853, + "learning_rate": 8.082881222044976e-06, + "loss": 3.5461, + "step": 12661 + }, + { + "epoch": 1.0791783857495951, + "grad_norm": 94.38651246449535, + "learning_rate": 8.082490828272205e-06, + "loss": 3.0328, + "step": 12662 + }, + { + "epoch": 1.0792636154436206, + "grad_norm": 48.88414773941858, + "learning_rate": 8.082100404184363e-06, + "loss": 2.7844, + "step": 12663 + }, + { + "epoch": 1.079348845137646, + "grad_norm": 30.773247556466913, + "learning_rate": 8.081709949785288e-06, + "loss": 2.6674, + "step": 12664 + }, + { + "epoch": 1.0794340748316713, + "grad_norm": 71.571285479079, + "learning_rate": 8.08131946507882e-06, + "loss": 2.8783, + "step": 12665 + }, + { + "epoch": 1.0795193045256968, + "grad_norm": 40.65447913370701, + "learning_rate": 8.0809289500688e-06, + "loss": 2.054, + "step": 12666 + }, + { + "epoch": 1.0796045342197222, + "grad_norm": 47.30455073349712, + "learning_rate": 8.080538404759069e-06, + "loss": 2.938, + "step": 12667 + }, + { + "epoch": 1.0796897639137475, + "grad_norm": 35.96882272083455, + "learning_rate": 8.080147829153463e-06, + "loss": 2.8827, + "step": 12668 + }, + { + "epoch": 1.079774993607773, + "grad_norm": 68.68873968400837, + "learning_rate": 8.07975722325583e-06, + "loss": 2.2109, + "step": 12669 + }, + { + "epoch": 1.0798602233017984, + "grad_norm": 34.48362658984852, + "learning_rate": 8.07936658707001e-06, + "loss": 2.8743, + "step": 12670 + }, + { + "epoch": 1.0799454529958237, + "grad_norm": 39.749600932090296, + "learning_rate": 8.07897592059984e-06, + "loss": 2.1496, + "step": 12671 + }, + { + "epoch": 1.0800306826898491, + "grad_norm": 29.92297701839189, + "learning_rate": 8.078585223849166e-06, + "loss": 2.4302, + "step": 12672 + }, + { + "epoch": 1.0801159123838746, + "grad_norm": 43.67810784076426, + "learning_rate": 8.078194496821832e-06, + "loss": 2.5629, + "step": 12673 + }, + { + "epoch": 1.0802011420778999, + "grad_norm": 130.16339072033645, + "learning_rate": 8.077803739521677e-06, + "loss": 3.8272, + "step": 12674 + }, + { + "epoch": 1.0802863717719253, + "grad_norm": 83.91388362572712, + "learning_rate": 8.077412951952544e-06, + "loss": 3.8556, + "step": 12675 + }, + { + "epoch": 1.0803716014659508, + "grad_norm": 49.60341806701047, + "learning_rate": 8.07702213411828e-06, + "loss": 2.8981, + "step": 12676 + }, + { + "epoch": 1.080456831159976, + "grad_norm": 71.17048885984913, + "learning_rate": 8.076631286022723e-06, + "loss": 3.3793, + "step": 12677 + }, + { + "epoch": 1.0805420608540015, + "grad_norm": 38.05883344064434, + "learning_rate": 8.076240407669722e-06, + "loss": 3.274, + "step": 12678 + }, + { + "epoch": 1.080627290548027, + "grad_norm": 65.78108866692763, + "learning_rate": 8.075849499063117e-06, + "loss": 3.4986, + "step": 12679 + }, + { + "epoch": 1.0807125202420522, + "grad_norm": 59.28516131332199, + "learning_rate": 8.075458560206757e-06, + "loss": 4.0869, + "step": 12680 + }, + { + "epoch": 1.0807977499360777, + "grad_norm": 70.2671188162721, + "learning_rate": 8.075067591104481e-06, + "loss": 3.5092, + "step": 12681 + }, + { + "epoch": 1.0808829796301032, + "grad_norm": 38.63355002289888, + "learning_rate": 8.07467659176014e-06, + "loss": 3.1342, + "step": 12682 + }, + { + "epoch": 1.0809682093241286, + "grad_norm": 43.65925308859403, + "learning_rate": 8.074285562177575e-06, + "loss": 3.122, + "step": 12683 + }, + { + "epoch": 1.0810534390181539, + "grad_norm": 41.4607673501807, + "learning_rate": 8.073894502360632e-06, + "loss": 3.2155, + "step": 12684 + }, + { + "epoch": 1.0811386687121793, + "grad_norm": 39.76406248484396, + "learning_rate": 8.07350341231316e-06, + "loss": 2.848, + "step": 12685 + }, + { + "epoch": 1.0812238984062048, + "grad_norm": 94.66405196805007, + "learning_rate": 8.073112292039003e-06, + "loss": 2.2602, + "step": 12686 + }, + { + "epoch": 1.08130912810023, + "grad_norm": 44.310102681367844, + "learning_rate": 8.072721141542006e-06, + "loss": 2.7131, + "step": 12687 + }, + { + "epoch": 1.0813943577942555, + "grad_norm": 55.29437456980673, + "learning_rate": 8.07232996082602e-06, + "loss": 2.8861, + "step": 12688 + }, + { + "epoch": 1.081479587488281, + "grad_norm": 91.8531058044796, + "learning_rate": 8.071938749894889e-06, + "loss": 3.542, + "step": 12689 + }, + { + "epoch": 1.0815648171823062, + "grad_norm": 45.028347688965745, + "learning_rate": 8.07154750875246e-06, + "loss": 3.0828, + "step": 12690 + }, + { + "epoch": 1.0816500468763317, + "grad_norm": 107.48320877382162, + "learning_rate": 8.071156237402582e-06, + "loss": 4.2689, + "step": 12691 + }, + { + "epoch": 1.0817352765703572, + "grad_norm": 43.972026470317076, + "learning_rate": 8.070764935849104e-06, + "loss": 3.4125, + "step": 12692 + }, + { + "epoch": 1.0818205062643824, + "grad_norm": 24.9828073633605, + "learning_rate": 8.070373604095873e-06, + "loss": 1.9573, + "step": 12693 + }, + { + "epoch": 1.0819057359584079, + "grad_norm": 54.35662022965738, + "learning_rate": 8.069982242146736e-06, + "loss": 3.1328, + "step": 12694 + }, + { + "epoch": 1.0819909656524334, + "grad_norm": 126.18106576665926, + "learning_rate": 8.069590850005544e-06, + "loss": 4.1907, + "step": 12695 + }, + { + "epoch": 1.0820761953464586, + "grad_norm": 86.56314168580306, + "learning_rate": 8.069199427676147e-06, + "loss": 2.8794, + "step": 12696 + }, + { + "epoch": 1.082161425040484, + "grad_norm": 43.58599999125407, + "learning_rate": 8.068807975162392e-06, + "loss": 3.6982, + "step": 12697 + }, + { + "epoch": 1.0822466547345095, + "grad_norm": 62.76621103431781, + "learning_rate": 8.06841649246813e-06, + "loss": 2.9037, + "step": 12698 + }, + { + "epoch": 1.0823318844285348, + "grad_norm": 95.05057348169747, + "learning_rate": 8.06802497959721e-06, + "loss": 2.893, + "step": 12699 + }, + { + "epoch": 1.0824171141225603, + "grad_norm": 24.525682898808828, + "learning_rate": 8.067633436553485e-06, + "loss": 2.3212, + "step": 12700 + }, + { + "epoch": 1.0825023438165857, + "grad_norm": 31.13562816285627, + "learning_rate": 8.067241863340805e-06, + "loss": 2.2309, + "step": 12701 + }, + { + "epoch": 1.0825875735106112, + "grad_norm": 41.316365370371535, + "learning_rate": 8.06685025996302e-06, + "loss": 3.4682, + "step": 12702 + }, + { + "epoch": 1.0826728032046364, + "grad_norm": 63.9925946446448, + "learning_rate": 8.06645862642398e-06, + "loss": 3.6716, + "step": 12703 + }, + { + "epoch": 1.082758032898662, + "grad_norm": 31.140285743204036, + "learning_rate": 8.066066962727539e-06, + "loss": 2.8819, + "step": 12704 + }, + { + "epoch": 1.0828432625926874, + "grad_norm": 109.56842728967396, + "learning_rate": 8.065675268877546e-06, + "loss": 3.2932, + "step": 12705 + }, + { + "epoch": 1.0829284922867126, + "grad_norm": 57.655472066950914, + "learning_rate": 8.065283544877856e-06, + "loss": 3.9103, + "step": 12706 + }, + { + "epoch": 1.083013721980738, + "grad_norm": 43.8615847947105, + "learning_rate": 8.06489179073232e-06, + "loss": 2.8139, + "step": 12707 + }, + { + "epoch": 1.0830989516747636, + "grad_norm": 55.69812865842994, + "learning_rate": 8.064500006444791e-06, + "loss": 3.6368, + "step": 12708 + }, + { + "epoch": 1.0831841813687888, + "grad_norm": 171.15005344355504, + "learning_rate": 8.064108192019121e-06, + "loss": 3.4279, + "step": 12709 + }, + { + "epoch": 1.0832694110628143, + "grad_norm": 38.43241337164986, + "learning_rate": 8.063716347459167e-06, + "loss": 3.3311, + "step": 12710 + }, + { + "epoch": 1.0833546407568397, + "grad_norm": 104.25151403380242, + "learning_rate": 8.06332447276878e-06, + "loss": 3.698, + "step": 12711 + }, + { + "epoch": 1.083439870450865, + "grad_norm": 56.393398209630675, + "learning_rate": 8.062932567951813e-06, + "loss": 2.7751, + "step": 12712 + }, + { + "epoch": 1.0835251001448905, + "grad_norm": 47.471253092811, + "learning_rate": 8.062540633012122e-06, + "loss": 2.8234, + "step": 12713 + }, + { + "epoch": 1.083610329838916, + "grad_norm": 58.17209589697759, + "learning_rate": 8.06214866795356e-06, + "loss": 2.7343, + "step": 12714 + }, + { + "epoch": 1.0836955595329414, + "grad_norm": 50.99621471026559, + "learning_rate": 8.061756672779984e-06, + "loss": 3.0407, + "step": 12715 + }, + { + "epoch": 1.0837807892269666, + "grad_norm": 43.04496705623724, + "learning_rate": 8.061364647495246e-06, + "loss": 3.5316, + "step": 12716 + }, + { + "epoch": 1.083866018920992, + "grad_norm": 45.28477170905677, + "learning_rate": 8.060972592103204e-06, + "loss": 2.7047, + "step": 12717 + }, + { + "epoch": 1.0839512486150176, + "grad_norm": 30.51636820862711, + "learning_rate": 8.060580506607711e-06, + "loss": 2.4216, + "step": 12718 + }, + { + "epoch": 1.0840364783090428, + "grad_norm": 49.0455959960277, + "learning_rate": 8.06018839101263e-06, + "loss": 3.2399, + "step": 12719 + }, + { + "epoch": 1.0841217080030683, + "grad_norm": 59.806975646617126, + "learning_rate": 8.059796245321807e-06, + "loss": 3.1607, + "step": 12720 + }, + { + "epoch": 1.0842069376970938, + "grad_norm": 23.854423022205303, + "learning_rate": 8.059404069539105e-06, + "loss": 2.0945, + "step": 12721 + }, + { + "epoch": 1.084292167391119, + "grad_norm": 40.25952265201484, + "learning_rate": 8.05901186366838e-06, + "loss": 2.3149, + "step": 12722 + }, + { + "epoch": 1.0843773970851445, + "grad_norm": 50.25005001298682, + "learning_rate": 8.05861962771349e-06, + "loss": 3.7675, + "step": 12723 + }, + { + "epoch": 1.08446262677917, + "grad_norm": 41.62216167082917, + "learning_rate": 8.058227361678289e-06, + "loss": 3.5773, + "step": 12724 + }, + { + "epoch": 1.0845478564731952, + "grad_norm": 51.16637354127939, + "learning_rate": 8.057835065566639e-06, + "loss": 2.6303, + "step": 12725 + }, + { + "epoch": 1.0846330861672207, + "grad_norm": 76.99501243300604, + "learning_rate": 8.057442739382395e-06, + "loss": 3.3779, + "step": 12726 + }, + { + "epoch": 1.0847183158612461, + "grad_norm": 233.8144957940102, + "learning_rate": 8.057050383129419e-06, + "loss": 2.7662, + "step": 12727 + }, + { + "epoch": 1.0848035455552714, + "grad_norm": 34.279388408841385, + "learning_rate": 8.056657996811565e-06, + "loss": 2.4504, + "step": 12728 + }, + { + "epoch": 1.0848887752492968, + "grad_norm": 73.4370723333049, + "learning_rate": 8.056265580432694e-06, + "loss": 3.6492, + "step": 12729 + }, + { + "epoch": 1.0849740049433223, + "grad_norm": 31.187732762639616, + "learning_rate": 8.055873133996665e-06, + "loss": 2.8937, + "step": 12730 + }, + { + "epoch": 1.0850592346373475, + "grad_norm": 26.20984913736792, + "learning_rate": 8.055480657507341e-06, + "loss": 3.0383, + "step": 12731 + }, + { + "epoch": 1.085144464331373, + "grad_norm": 76.86747403701212, + "learning_rate": 8.055088150968577e-06, + "loss": 2.8095, + "step": 12732 + }, + { + "epoch": 1.0852296940253985, + "grad_norm": 39.632200601110206, + "learning_rate": 8.054695614384234e-06, + "loss": 3.438, + "step": 12733 + }, + { + "epoch": 1.085314923719424, + "grad_norm": 30.467706343067725, + "learning_rate": 8.054303047758176e-06, + "loss": 2.3168, + "step": 12734 + }, + { + "epoch": 1.0854001534134492, + "grad_norm": 52.43237900270578, + "learning_rate": 8.053910451094259e-06, + "loss": 3.8042, + "step": 12735 + }, + { + "epoch": 1.0854853831074747, + "grad_norm": 36.582387530589706, + "learning_rate": 8.053517824396346e-06, + "loss": 2.5656, + "step": 12736 + }, + { + "epoch": 1.0855706128015001, + "grad_norm": 92.63180141155117, + "learning_rate": 8.0531251676683e-06, + "loss": 3.2166, + "step": 12737 + }, + { + "epoch": 1.0856558424955254, + "grad_norm": 24.522996458340053, + "learning_rate": 8.05273248091398e-06, + "loss": 2.1629, + "step": 12738 + }, + { + "epoch": 1.0857410721895508, + "grad_norm": 24.84505959652426, + "learning_rate": 8.05233976413725e-06, + "loss": 2.054, + "step": 12739 + }, + { + "epoch": 1.0858263018835763, + "grad_norm": 40.632817682010405, + "learning_rate": 8.051947017341971e-06, + "loss": 3.3106, + "step": 12740 + }, + { + "epoch": 1.0859115315776016, + "grad_norm": 48.07296973499363, + "learning_rate": 8.051554240532005e-06, + "loss": 2.422, + "step": 12741 + }, + { + "epoch": 1.085996761271627, + "grad_norm": 34.68666867557357, + "learning_rate": 8.051161433711218e-06, + "loss": 2.3251, + "step": 12742 + }, + { + "epoch": 1.0860819909656525, + "grad_norm": 35.68897114245614, + "learning_rate": 8.050768596883467e-06, + "loss": 3.0978, + "step": 12743 + }, + { + "epoch": 1.0861672206596777, + "grad_norm": 111.493033988108, + "learning_rate": 8.050375730052622e-06, + "loss": 2.2772, + "step": 12744 + }, + { + "epoch": 1.0862524503537032, + "grad_norm": 44.26052497143705, + "learning_rate": 8.049982833222544e-06, + "loss": 3.1573, + "step": 12745 + }, + { + "epoch": 1.0863376800477287, + "grad_norm": 77.19005529232524, + "learning_rate": 8.049589906397095e-06, + "loss": 2.4836, + "step": 12746 + }, + { + "epoch": 1.086422909741754, + "grad_norm": 99.54350010084268, + "learning_rate": 8.049196949580143e-06, + "loss": 3.452, + "step": 12747 + }, + { + "epoch": 1.0865081394357794, + "grad_norm": 40.68942154971892, + "learning_rate": 8.048803962775549e-06, + "loss": 2.6692, + "step": 12748 + }, + { + "epoch": 1.0865933691298049, + "grad_norm": 23.141760961787565, + "learning_rate": 8.04841094598718e-06, + "loss": 2.6302, + "step": 12749 + }, + { + "epoch": 1.08667859882383, + "grad_norm": 69.84903382596156, + "learning_rate": 8.048017899218901e-06, + "loss": 2.3898, + "step": 12750 + }, + { + "epoch": 1.0867638285178556, + "grad_norm": 37.3170198626432, + "learning_rate": 8.047624822474575e-06, + "loss": 3.2272, + "step": 12751 + }, + { + "epoch": 1.086849058211881, + "grad_norm": 70.39906732475286, + "learning_rate": 8.047231715758073e-06, + "loss": 3.3346, + "step": 12752 + }, + { + "epoch": 1.0869342879059065, + "grad_norm": 158.7900833020051, + "learning_rate": 8.046838579073257e-06, + "loss": 4.0806, + "step": 12753 + }, + { + "epoch": 1.0870195175999318, + "grad_norm": 61.58096726554016, + "learning_rate": 8.046445412423994e-06, + "loss": 2.928, + "step": 12754 + }, + { + "epoch": 1.0871047472939572, + "grad_norm": 54.44509166155055, + "learning_rate": 8.04605221581415e-06, + "loss": 2.4681, + "step": 12755 + }, + { + "epoch": 1.0871899769879827, + "grad_norm": 150.42215228918099, + "learning_rate": 8.045658989247594e-06, + "loss": 2.8266, + "step": 12756 + }, + { + "epoch": 1.087275206682008, + "grad_norm": 36.387242078064176, + "learning_rate": 8.045265732728192e-06, + "loss": 2.4174, + "step": 12757 + }, + { + "epoch": 1.0873604363760334, + "grad_norm": 34.21580844267266, + "learning_rate": 8.04487244625981e-06, + "loss": 2.6271, + "step": 12758 + }, + { + "epoch": 1.0874456660700589, + "grad_norm": 40.20974723848696, + "learning_rate": 8.044479129846319e-06, + "loss": 3.7302, + "step": 12759 + }, + { + "epoch": 1.0875308957640841, + "grad_norm": 29.50864766034456, + "learning_rate": 8.044085783491585e-06, + "loss": 2.5929, + "step": 12760 + }, + { + "epoch": 1.0876161254581096, + "grad_norm": 48.04754008621765, + "learning_rate": 8.043692407199477e-06, + "loss": 3.3635, + "step": 12761 + }, + { + "epoch": 1.087701355152135, + "grad_norm": 38.19656488259305, + "learning_rate": 8.043299000973865e-06, + "loss": 1.9853, + "step": 12762 + }, + { + "epoch": 1.0877865848461603, + "grad_norm": 52.72908504410394, + "learning_rate": 8.042905564818614e-06, + "loss": 3.7127, + "step": 12763 + }, + { + "epoch": 1.0878718145401858, + "grad_norm": 124.42248463268949, + "learning_rate": 8.042512098737598e-06, + "loss": 3.6, + "step": 12764 + }, + { + "epoch": 1.0879570442342112, + "grad_norm": 35.6356199527527, + "learning_rate": 8.042118602734682e-06, + "loss": 3.2842, + "step": 12765 + }, + { + "epoch": 1.0880422739282367, + "grad_norm": 58.4516346523284, + "learning_rate": 8.041725076813742e-06, + "loss": 2.8379, + "step": 12766 + }, + { + "epoch": 1.088127503622262, + "grad_norm": 47.13457881257144, + "learning_rate": 8.041331520978641e-06, + "loss": 1.8835, + "step": 12767 + }, + { + "epoch": 1.0882127333162874, + "grad_norm": 43.01114105280228, + "learning_rate": 8.040937935233256e-06, + "loss": 2.899, + "step": 12768 + }, + { + "epoch": 1.088297963010313, + "grad_norm": 51.24851987805928, + "learning_rate": 8.040544319581454e-06, + "loss": 2.3595, + "step": 12769 + }, + { + "epoch": 1.0883831927043381, + "grad_norm": 76.06456458641064, + "learning_rate": 8.040150674027104e-06, + "loss": 2.0097, + "step": 12770 + }, + { + "epoch": 1.0884684223983636, + "grad_norm": 29.28013458633101, + "learning_rate": 8.039756998574082e-06, + "loss": 2.1981, + "step": 12771 + }, + { + "epoch": 1.088553652092389, + "grad_norm": 32.15492429566293, + "learning_rate": 8.03936329322626e-06, + "loss": 2.0597, + "step": 12772 + }, + { + "epoch": 1.0886388817864143, + "grad_norm": 28.384714523918873, + "learning_rate": 8.038969557987507e-06, + "loss": 2.1462, + "step": 12773 + }, + { + "epoch": 1.0887241114804398, + "grad_norm": 63.62021007353818, + "learning_rate": 8.038575792861693e-06, + "loss": 3.0074, + "step": 12774 + }, + { + "epoch": 1.0888093411744653, + "grad_norm": 40.78317172587001, + "learning_rate": 8.038181997852696e-06, + "loss": 2.9376, + "step": 12775 + }, + { + "epoch": 1.0888945708684905, + "grad_norm": 38.36124558660859, + "learning_rate": 8.037788172964385e-06, + "loss": 2.4792, + "step": 12776 + }, + { + "epoch": 1.088979800562516, + "grad_norm": 72.15052585155664, + "learning_rate": 8.037394318200636e-06, + "loss": 3.2485, + "step": 12777 + }, + { + "epoch": 1.0890650302565414, + "grad_norm": 122.77872439844636, + "learning_rate": 8.03700043356532e-06, + "loss": 4.028, + "step": 12778 + }, + { + "epoch": 1.0891502599505667, + "grad_norm": 57.583831907292264, + "learning_rate": 8.036606519062312e-06, + "loss": 3.1569, + "step": 12779 + }, + { + "epoch": 1.0892354896445922, + "grad_norm": 52.40020737974723, + "learning_rate": 8.036212574695484e-06, + "loss": 2.7957, + "step": 12780 + }, + { + "epoch": 1.0893207193386176, + "grad_norm": 87.18402159854477, + "learning_rate": 8.035818600468712e-06, + "loss": 2.6192, + "step": 12781 + }, + { + "epoch": 1.0894059490326429, + "grad_norm": 52.38856686583815, + "learning_rate": 8.035424596385871e-06, + "loss": 3.2384, + "step": 12782 + }, + { + "epoch": 1.0894911787266683, + "grad_norm": 159.93134195475653, + "learning_rate": 8.035030562450837e-06, + "loss": 3.4361, + "step": 12783 + }, + { + "epoch": 1.0895764084206938, + "grad_norm": 58.13483637704614, + "learning_rate": 8.03463649866748e-06, + "loss": 4.0739, + "step": 12784 + }, + { + "epoch": 1.0896616381147193, + "grad_norm": 33.29855003479144, + "learning_rate": 8.03424240503968e-06, + "loss": 3.0617, + "step": 12785 + }, + { + "epoch": 1.0897468678087445, + "grad_norm": 49.68971229916649, + "learning_rate": 8.033848281571313e-06, + "loss": 2.8771, + "step": 12786 + }, + { + "epoch": 1.08983209750277, + "grad_norm": 81.77291028099305, + "learning_rate": 8.033454128266255e-06, + "loss": 3.6879, + "step": 12787 + }, + { + "epoch": 1.0899173271967955, + "grad_norm": 45.29508992936784, + "learning_rate": 8.033059945128377e-06, + "loss": 4.2491, + "step": 12788 + }, + { + "epoch": 1.0900025568908207, + "grad_norm": 40.82758927740322, + "learning_rate": 8.032665732161562e-06, + "loss": 2.9936, + "step": 12789 + }, + { + "epoch": 1.0900877865848462, + "grad_norm": 28.616000231563813, + "learning_rate": 8.032271489369685e-06, + "loss": 2.6506, + "step": 12790 + }, + { + "epoch": 1.0901730162788716, + "grad_norm": 41.681798982854936, + "learning_rate": 8.031877216756621e-06, + "loss": 3.5465, + "step": 12791 + }, + { + "epoch": 1.0902582459728969, + "grad_norm": 42.008458949029254, + "learning_rate": 8.03148291432625e-06, + "loss": 3.0889, + "step": 12792 + }, + { + "epoch": 1.0903434756669224, + "grad_norm": 43.93734613872316, + "learning_rate": 8.03108858208245e-06, + "loss": 3.2047, + "step": 12793 + }, + { + "epoch": 1.0904287053609478, + "grad_norm": 46.41105566204027, + "learning_rate": 8.030694220029098e-06, + "loss": 3.6368, + "step": 12794 + }, + { + "epoch": 1.090513935054973, + "grad_norm": 74.15904112807804, + "learning_rate": 8.030299828170071e-06, + "loss": 4.4253, + "step": 12795 + }, + { + "epoch": 1.0905991647489985, + "grad_norm": 33.65762486669926, + "learning_rate": 8.029905406509253e-06, + "loss": 2.3935, + "step": 12796 + }, + { + "epoch": 1.090684394443024, + "grad_norm": 71.7161566927853, + "learning_rate": 8.029510955050516e-06, + "loss": 3.8397, + "step": 12797 + }, + { + "epoch": 1.0907696241370493, + "grad_norm": 51.92736198511787, + "learning_rate": 8.029116473797743e-06, + "loss": 2.2831, + "step": 12798 + }, + { + "epoch": 1.0908548538310747, + "grad_norm": 24.27961088289374, + "learning_rate": 8.028721962754814e-06, + "loss": 1.6927, + "step": 12799 + }, + { + "epoch": 1.0909400835251002, + "grad_norm": 86.21717964165435, + "learning_rate": 8.028327421925608e-06, + "loss": 2.6814, + "step": 12800 + }, + { + "epoch": 1.0910253132191254, + "grad_norm": 27.59132230994799, + "learning_rate": 8.027932851314005e-06, + "loss": 2.1239, + "step": 12801 + }, + { + "epoch": 1.091110542913151, + "grad_norm": 94.60879072347339, + "learning_rate": 8.027538250923885e-06, + "loss": 3.3766, + "step": 12802 + }, + { + "epoch": 1.0911957726071764, + "grad_norm": 25.492154114021478, + "learning_rate": 8.02714362075913e-06, + "loss": 3.0377, + "step": 12803 + }, + { + "epoch": 1.0912810023012018, + "grad_norm": 32.31674623315107, + "learning_rate": 8.02674896082362e-06, + "loss": 3.1817, + "step": 12804 + }, + { + "epoch": 1.091366231995227, + "grad_norm": 59.4460524562369, + "learning_rate": 8.026354271121237e-06, + "loss": 2.2587, + "step": 12805 + }, + { + "epoch": 1.0914514616892526, + "grad_norm": 38.906221238788945, + "learning_rate": 8.025959551655862e-06, + "loss": 2.9916, + "step": 12806 + }, + { + "epoch": 1.091536691383278, + "grad_norm": 77.13335057655183, + "learning_rate": 8.025564802431377e-06, + "loss": 2.9302, + "step": 12807 + }, + { + "epoch": 1.0916219210773033, + "grad_norm": 35.50759516781257, + "learning_rate": 8.025170023451664e-06, + "loss": 1.8826, + "step": 12808 + }, + { + "epoch": 1.0917071507713287, + "grad_norm": 48.75000435447878, + "learning_rate": 8.024775214720609e-06, + "loss": 3.4972, + "step": 12809 + }, + { + "epoch": 1.0917923804653542, + "grad_norm": 35.90055973649933, + "learning_rate": 8.024380376242087e-06, + "loss": 2.9939, + "step": 12810 + }, + { + "epoch": 1.0918776101593795, + "grad_norm": 43.04829428842963, + "learning_rate": 8.02398550801999e-06, + "loss": 2.4056, + "step": 12811 + }, + { + "epoch": 1.091962839853405, + "grad_norm": 35.281410502918874, + "learning_rate": 8.023590610058192e-06, + "loss": 2.5058, + "step": 12812 + }, + { + "epoch": 1.0920480695474304, + "grad_norm": 68.37815685300082, + "learning_rate": 8.023195682360587e-06, + "loss": 2.5706, + "step": 12813 + }, + { + "epoch": 1.0921332992414556, + "grad_norm": 59.58499467584864, + "learning_rate": 8.02280072493105e-06, + "loss": 2.529, + "step": 12814 + }, + { + "epoch": 1.092218528935481, + "grad_norm": 33.183631822001246, + "learning_rate": 8.022405737773473e-06, + "loss": 2.6724, + "step": 12815 + }, + { + "epoch": 1.0923037586295066, + "grad_norm": 32.448896598557454, + "learning_rate": 8.022010720891733e-06, + "loss": 2.3265, + "step": 12816 + }, + { + "epoch": 1.0923889883235318, + "grad_norm": 39.53573903990326, + "learning_rate": 8.02161567428972e-06, + "loss": 3.4796, + "step": 12817 + }, + { + "epoch": 1.0924742180175573, + "grad_norm": 40.79471436826281, + "learning_rate": 8.021220597971317e-06, + "loss": 3.1592, + "step": 12818 + }, + { + "epoch": 1.0925594477115828, + "grad_norm": 34.12840957006971, + "learning_rate": 8.020825491940411e-06, + "loss": 1.607, + "step": 12819 + }, + { + "epoch": 1.092644677405608, + "grad_norm": 33.57091737684885, + "learning_rate": 8.020430356200886e-06, + "loss": 2.9426, + "step": 12820 + }, + { + "epoch": 1.0927299070996335, + "grad_norm": 43.92029821088488, + "learning_rate": 8.020035190756628e-06, + "loss": 2.9986, + "step": 12821 + }, + { + "epoch": 1.092815136793659, + "grad_norm": 50.556154879697814, + "learning_rate": 8.019639995611524e-06, + "loss": 2.5289, + "step": 12822 + }, + { + "epoch": 1.0929003664876844, + "grad_norm": 72.77019194410735, + "learning_rate": 8.019244770769461e-06, + "loss": 3.692, + "step": 12823 + }, + { + "epoch": 1.0929855961817097, + "grad_norm": 43.84488776497517, + "learning_rate": 8.018849516234326e-06, + "loss": 2.4783, + "step": 12824 + }, + { + "epoch": 1.0930708258757351, + "grad_norm": 40.67488748279302, + "learning_rate": 8.018454232010004e-06, + "loss": 2.6903, + "step": 12825 + }, + { + "epoch": 1.0931560555697606, + "grad_norm": 73.52303318941694, + "learning_rate": 8.018058918100386e-06, + "loss": 3.2922, + "step": 12826 + }, + { + "epoch": 1.0932412852637858, + "grad_norm": 101.03856664182713, + "learning_rate": 8.017663574509356e-06, + "loss": 0.7202, + "step": 12827 + }, + { + "epoch": 1.0933265149578113, + "grad_norm": 58.273235592212686, + "learning_rate": 8.017268201240804e-06, + "loss": 3.5188, + "step": 12828 + }, + { + "epoch": 1.0934117446518368, + "grad_norm": 39.44757996092416, + "learning_rate": 8.016872798298618e-06, + "loss": 3.1871, + "step": 12829 + }, + { + "epoch": 1.093496974345862, + "grad_norm": 27.487867819112317, + "learning_rate": 8.016477365686689e-06, + "loss": 3.2654, + "step": 12830 + }, + { + "epoch": 1.0935822040398875, + "grad_norm": 34.06449766626487, + "learning_rate": 8.016081903408903e-06, + "loss": 2.6218, + "step": 12831 + }, + { + "epoch": 1.093667433733913, + "grad_norm": 70.71926446117378, + "learning_rate": 8.015686411469148e-06, + "loss": 3.6162, + "step": 12832 + }, + { + "epoch": 1.0937526634279382, + "grad_norm": 90.90148715737575, + "learning_rate": 8.015290889871317e-06, + "loss": 3.3703, + "step": 12833 + }, + { + "epoch": 1.0938378931219637, + "grad_norm": 40.733307011862756, + "learning_rate": 8.014895338619298e-06, + "loss": 2.8338, + "step": 12834 + }, + { + "epoch": 1.0939231228159891, + "grad_norm": 55.75259505763212, + "learning_rate": 8.014499757716982e-06, + "loss": 3.3649, + "step": 12835 + }, + { + "epoch": 1.0940083525100146, + "grad_norm": 124.02614634554418, + "learning_rate": 8.014104147168257e-06, + "loss": 2.3762, + "step": 12836 + }, + { + "epoch": 1.0940935822040398, + "grad_norm": 31.621114642878236, + "learning_rate": 8.013708506977016e-06, + "loss": 2.8374, + "step": 12837 + }, + { + "epoch": 1.0941788118980653, + "grad_norm": 47.19335872799237, + "learning_rate": 8.01331283714715e-06, + "loss": 3.2884, + "step": 12838 + }, + { + "epoch": 1.0942640415920908, + "grad_norm": 42.533398267750805, + "learning_rate": 8.01291713768255e-06, + "loss": 2.7305, + "step": 12839 + }, + { + "epoch": 1.094349271286116, + "grad_norm": 53.738931110846394, + "learning_rate": 8.012521408587105e-06, + "loss": 3.16, + "step": 12840 + }, + { + "epoch": 1.0944345009801415, + "grad_norm": 39.576666990134335, + "learning_rate": 8.012125649864713e-06, + "loss": 2.8259, + "step": 12841 + }, + { + "epoch": 1.094519730674167, + "grad_norm": 42.01117276221616, + "learning_rate": 8.01172986151926e-06, + "loss": 2.9313, + "step": 12842 + }, + { + "epoch": 1.0946049603681922, + "grad_norm": 33.61506931815774, + "learning_rate": 8.01133404355464e-06, + "loss": 3.0113, + "step": 12843 + }, + { + "epoch": 1.0946901900622177, + "grad_norm": 58.52531594571591, + "learning_rate": 8.010938195974747e-06, + "loss": 4.0563, + "step": 12844 + }, + { + "epoch": 1.0947754197562432, + "grad_norm": 39.16689827277185, + "learning_rate": 8.010542318783473e-06, + "loss": 3.352, + "step": 12845 + }, + { + "epoch": 1.0948606494502684, + "grad_norm": 43.485531022616264, + "learning_rate": 8.01014641198471e-06, + "loss": 3.0689, + "step": 12846 + }, + { + "epoch": 1.0949458791442939, + "grad_norm": 53.33185167749526, + "learning_rate": 8.009750475582355e-06, + "loss": 2.936, + "step": 12847 + }, + { + "epoch": 1.0950311088383193, + "grad_norm": 103.311701537811, + "learning_rate": 8.0093545095803e-06, + "loss": 3.9854, + "step": 12848 + }, + { + "epoch": 1.0951163385323446, + "grad_norm": 36.974155231244055, + "learning_rate": 8.00895851398244e-06, + "loss": 2.6979, + "step": 12849 + }, + { + "epoch": 1.09520156822637, + "grad_norm": 62.30439827079951, + "learning_rate": 8.008562488792665e-06, + "loss": 2.2108, + "step": 12850 + }, + { + "epoch": 1.0952867979203955, + "grad_norm": 84.36888555136314, + "learning_rate": 8.008166434014877e-06, + "loss": 3.0114, + "step": 12851 + }, + { + "epoch": 1.0953720276144208, + "grad_norm": 32.64475799732429, + "learning_rate": 8.007770349652967e-06, + "loss": 2.6119, + "step": 12852 + }, + { + "epoch": 1.0954572573084462, + "grad_norm": 54.100281934616355, + "learning_rate": 8.00737423571083e-06, + "loss": 2.807, + "step": 12853 + }, + { + "epoch": 1.0955424870024717, + "grad_norm": 51.93465422056738, + "learning_rate": 8.006978092192361e-06, + "loss": 2.3947, + "step": 12854 + }, + { + "epoch": 1.0956277166964972, + "grad_norm": 87.67283343486808, + "learning_rate": 8.00658191910146e-06, + "loss": 3.3519, + "step": 12855 + }, + { + "epoch": 1.0957129463905224, + "grad_norm": 84.69031517110257, + "learning_rate": 8.006185716442019e-06, + "loss": 2.8794, + "step": 12856 + }, + { + "epoch": 1.0957981760845479, + "grad_norm": 92.09707176224622, + "learning_rate": 8.005789484217937e-06, + "loss": 2.4041, + "step": 12857 + }, + { + "epoch": 1.0958834057785733, + "grad_norm": 72.7122337387025, + "learning_rate": 8.005393222433109e-06, + "loss": 2.654, + "step": 12858 + }, + { + "epoch": 1.0959686354725986, + "grad_norm": 104.61981615441672, + "learning_rate": 8.004996931091433e-06, + "loss": 3.6332, + "step": 12859 + }, + { + "epoch": 1.096053865166624, + "grad_norm": 34.609605782981504, + "learning_rate": 8.004600610196807e-06, + "loss": 3.0256, + "step": 12860 + }, + { + "epoch": 1.0961390948606495, + "grad_norm": 49.245770659882425, + "learning_rate": 8.004204259753126e-06, + "loss": 2.8611, + "step": 12861 + }, + { + "epoch": 1.0962243245546748, + "grad_norm": 62.696493477425506, + "learning_rate": 8.003807879764291e-06, + "loss": 3.1848, + "step": 12862 + }, + { + "epoch": 1.0963095542487002, + "grad_norm": 19.691318025005533, + "learning_rate": 8.0034114702342e-06, + "loss": 1.885, + "step": 12863 + }, + { + "epoch": 1.0963947839427257, + "grad_norm": 47.44381809768121, + "learning_rate": 8.00301503116675e-06, + "loss": 3.3075, + "step": 12864 + }, + { + "epoch": 1.096480013636751, + "grad_norm": 27.899239305225855, + "learning_rate": 8.00261856256584e-06, + "loss": 1.5147, + "step": 12865 + }, + { + "epoch": 1.0965652433307764, + "grad_norm": 619.8970298338114, + "learning_rate": 8.00222206443537e-06, + "loss": 3.9637, + "step": 12866 + }, + { + "epoch": 1.096650473024802, + "grad_norm": 68.36432649081951, + "learning_rate": 8.00182553677924e-06, + "loss": 2.5886, + "step": 12867 + }, + { + "epoch": 1.0967357027188271, + "grad_norm": 65.54980938839145, + "learning_rate": 8.001428979601346e-06, + "loss": 3.1329, + "step": 12868 + }, + { + "epoch": 1.0968209324128526, + "grad_norm": 34.55034541561068, + "learning_rate": 8.001032392905592e-06, + "loss": 2.3751, + "step": 12869 + }, + { + "epoch": 1.096906162106878, + "grad_norm": 93.11059948189121, + "learning_rate": 8.000635776695877e-06, + "loss": 3.5425, + "step": 12870 + }, + { + "epoch": 1.0969913918009033, + "grad_norm": 58.17839645976503, + "learning_rate": 8.000239130976102e-06, + "loss": 3.1031, + "step": 12871 + }, + { + "epoch": 1.0970766214949288, + "grad_norm": 52.97401635935738, + "learning_rate": 7.999842455750166e-06, + "loss": 3.7584, + "step": 12872 + }, + { + "epoch": 1.0971618511889543, + "grad_norm": 29.275329918605795, + "learning_rate": 7.999445751021973e-06, + "loss": 2.2805, + "step": 12873 + }, + { + "epoch": 1.0972470808829797, + "grad_norm": 104.23107432153023, + "learning_rate": 7.999049016795421e-06, + "loss": 2.4372, + "step": 12874 + }, + { + "epoch": 1.097332310577005, + "grad_norm": 32.12393000267326, + "learning_rate": 7.998652253074416e-06, + "loss": 2.8486, + "step": 12875 + }, + { + "epoch": 1.0974175402710304, + "grad_norm": 58.91049251528563, + "learning_rate": 7.998255459862855e-06, + "loss": 2.8218, + "step": 12876 + }, + { + "epoch": 1.097502769965056, + "grad_norm": 30.419685412889756, + "learning_rate": 7.997858637164646e-06, + "loss": 2.5759, + "step": 12877 + }, + { + "epoch": 1.0975879996590812, + "grad_norm": 34.570398964741365, + "learning_rate": 7.997461784983687e-06, + "loss": 2.2152, + "step": 12878 + }, + { + "epoch": 1.0976732293531066, + "grad_norm": 46.838707237914015, + "learning_rate": 7.997064903323882e-06, + "loss": 2.3632, + "step": 12879 + }, + { + "epoch": 1.097758459047132, + "grad_norm": 85.22710712710102, + "learning_rate": 7.996667992189135e-06, + "loss": 3.7038, + "step": 12880 + }, + { + "epoch": 1.0978436887411573, + "grad_norm": 40.55594779861783, + "learning_rate": 7.996271051583348e-06, + "loss": 2.368, + "step": 12881 + }, + { + "epoch": 1.0979289184351828, + "grad_norm": 74.61492044486701, + "learning_rate": 7.995874081510427e-06, + "loss": 4.1605, + "step": 12882 + }, + { + "epoch": 1.0980141481292083, + "grad_norm": 44.496492164443914, + "learning_rate": 7.995477081974275e-06, + "loss": 3.1325, + "step": 12883 + }, + { + "epoch": 1.0980993778232335, + "grad_norm": 60.94719519915775, + "learning_rate": 7.995080052978797e-06, + "loss": 2.6352, + "step": 12884 + }, + { + "epoch": 1.098184607517259, + "grad_norm": 40.10134635055312, + "learning_rate": 7.994682994527894e-06, + "loss": 3.3149, + "step": 12885 + }, + { + "epoch": 1.0982698372112845, + "grad_norm": 30.673255187749696, + "learning_rate": 7.994285906625476e-06, + "loss": 2.5672, + "step": 12886 + }, + { + "epoch": 1.0983550669053097, + "grad_norm": 114.38114929269557, + "learning_rate": 7.993888789275445e-06, + "loss": 3.2274, + "step": 12887 + }, + { + "epoch": 1.0984402965993352, + "grad_norm": 53.023155033392506, + "learning_rate": 7.993491642481708e-06, + "loss": 3.2212, + "step": 12888 + }, + { + "epoch": 1.0985255262933606, + "grad_norm": 37.46016355214459, + "learning_rate": 7.99309446624817e-06, + "loss": 3.4369, + "step": 12889 + }, + { + "epoch": 1.098610755987386, + "grad_norm": 51.607632988890536, + "learning_rate": 7.992697260578736e-06, + "loss": 2.8106, + "step": 12890 + }, + { + "epoch": 1.0986959856814114, + "grad_norm": 40.48093479467481, + "learning_rate": 7.992300025477316e-06, + "loss": 1.6867, + "step": 12891 + }, + { + "epoch": 1.0987812153754368, + "grad_norm": 36.563770789819344, + "learning_rate": 7.991902760947812e-06, + "loss": 1.831, + "step": 12892 + }, + { + "epoch": 1.0988664450694623, + "grad_norm": 36.58252878877307, + "learning_rate": 7.991505466994134e-06, + "loss": 2.6522, + "step": 12893 + }, + { + "epoch": 1.0989516747634875, + "grad_norm": 35.17825774970502, + "learning_rate": 7.99110814362019e-06, + "loss": 3.3755, + "step": 12894 + }, + { + "epoch": 1.099036904457513, + "grad_norm": 33.399975272736974, + "learning_rate": 7.990710790829886e-06, + "loss": 2.8944, + "step": 12895 + }, + { + "epoch": 1.0991221341515385, + "grad_norm": 44.73085833399081, + "learning_rate": 7.990313408627126e-06, + "loss": 3.2352, + "step": 12896 + }, + { + "epoch": 1.0992073638455637, + "grad_norm": 50.69485578203686, + "learning_rate": 7.989915997015825e-06, + "loss": 1.9963, + "step": 12897 + }, + { + "epoch": 1.0992925935395892, + "grad_norm": 71.34704265584938, + "learning_rate": 7.989518555999887e-06, + "loss": 3.2551, + "step": 12898 + }, + { + "epoch": 1.0993778232336147, + "grad_norm": 82.3856626378236, + "learning_rate": 7.989121085583222e-06, + "loss": 2.8222, + "step": 12899 + }, + { + "epoch": 1.09946305292764, + "grad_norm": 47.47359384062645, + "learning_rate": 7.98872358576974e-06, + "loss": 3.0749, + "step": 12900 + }, + { + "epoch": 1.0995482826216654, + "grad_norm": 118.99096529156918, + "learning_rate": 7.98832605656335e-06, + "loss": 3.224, + "step": 12901 + }, + { + "epoch": 1.0996335123156908, + "grad_norm": 35.74631323780272, + "learning_rate": 7.987928497967958e-06, + "loss": 2.9284, + "step": 12902 + }, + { + "epoch": 1.099718742009716, + "grad_norm": 522.6411839989526, + "learning_rate": 7.987530909987478e-06, + "loss": 2.8724, + "step": 12903 + }, + { + "epoch": 1.0998039717037416, + "grad_norm": 75.04285418284292, + "learning_rate": 7.987133292625818e-06, + "loss": 2.3773, + "step": 12904 + }, + { + "epoch": 1.099889201397767, + "grad_norm": 43.682271365972305, + "learning_rate": 7.98673564588689e-06, + "loss": 3.0203, + "step": 12905 + }, + { + "epoch": 1.0999744310917925, + "grad_norm": 40.27480373720414, + "learning_rate": 7.9863379697746e-06, + "loss": 2.4443, + "step": 12906 + }, + { + "epoch": 1.1000596607858177, + "grad_norm": 77.64025924830636, + "learning_rate": 7.985940264292867e-06, + "loss": 3.7412, + "step": 12907 + }, + { + "epoch": 1.1001448904798432, + "grad_norm": 37.401884180533585, + "learning_rate": 7.985542529445597e-06, + "loss": 2.2061, + "step": 12908 + }, + { + "epoch": 1.1002301201738687, + "grad_norm": 62.40623853575903, + "learning_rate": 7.985144765236703e-06, + "loss": 3.4777, + "step": 12909 + }, + { + "epoch": 1.100315349867894, + "grad_norm": 38.49395991756595, + "learning_rate": 7.984746971670096e-06, + "loss": 2.8534, + "step": 12910 + }, + { + "epoch": 1.1004005795619194, + "grad_norm": 106.94817188679002, + "learning_rate": 7.984349148749688e-06, + "loss": 3.5008, + "step": 12911 + }, + { + "epoch": 1.1004858092559449, + "grad_norm": 31.41326287710515, + "learning_rate": 7.983951296479391e-06, + "loss": 2.1988, + "step": 12912 + }, + { + "epoch": 1.10057103894997, + "grad_norm": 43.98295996697929, + "learning_rate": 7.983553414863121e-06, + "loss": 2.378, + "step": 12913 + }, + { + "epoch": 1.1006562686439956, + "grad_norm": 49.355773834638924, + "learning_rate": 7.983155503904786e-06, + "loss": 1.8352, + "step": 12914 + }, + { + "epoch": 1.100741498338021, + "grad_norm": 51.66291132267101, + "learning_rate": 7.982757563608305e-06, + "loss": 2.1066, + "step": 12915 + }, + { + "epoch": 1.1008267280320463, + "grad_norm": 101.65760788473717, + "learning_rate": 7.982359593977588e-06, + "loss": 2.2918, + "step": 12916 + }, + { + "epoch": 1.1009119577260718, + "grad_norm": 65.8426215590329, + "learning_rate": 7.981961595016548e-06, + "loss": 3.3821, + "step": 12917 + }, + { + "epoch": 1.1009971874200972, + "grad_norm": 38.68487564565365, + "learning_rate": 7.981563566729103e-06, + "loss": 2.2366, + "step": 12918 + }, + { + "epoch": 1.1010824171141225, + "grad_norm": 38.17302097406461, + "learning_rate": 7.981165509119162e-06, + "loss": 2.9454, + "step": 12919 + }, + { + "epoch": 1.101167646808148, + "grad_norm": 130.12350564862288, + "learning_rate": 7.980767422190645e-06, + "loss": 2.4996, + "step": 12920 + }, + { + "epoch": 1.1012528765021734, + "grad_norm": 46.03379038962009, + "learning_rate": 7.980369305947465e-06, + "loss": 3.3232, + "step": 12921 + }, + { + "epoch": 1.1013381061961987, + "grad_norm": 58.95418464234242, + "learning_rate": 7.979971160393534e-06, + "loss": 2.8761, + "step": 12922 + }, + { + "epoch": 1.1014233358902241, + "grad_norm": 50.80292837068642, + "learning_rate": 7.979572985532774e-06, + "loss": 3.2677, + "step": 12923 + }, + { + "epoch": 1.1015085655842496, + "grad_norm": 109.34553938281488, + "learning_rate": 7.979174781369097e-06, + "loss": 5.4096, + "step": 12924 + }, + { + "epoch": 1.101593795278275, + "grad_norm": 130.32960687072867, + "learning_rate": 7.97877654790642e-06, + "loss": 2.575, + "step": 12925 + }, + { + "epoch": 1.1016790249723003, + "grad_norm": 35.05148319495064, + "learning_rate": 7.978378285148657e-06, + "loss": 2.8996, + "step": 12926 + }, + { + "epoch": 1.1017642546663258, + "grad_norm": 32.17735977189511, + "learning_rate": 7.97797999309973e-06, + "loss": 3.0173, + "step": 12927 + }, + { + "epoch": 1.1018494843603512, + "grad_norm": 72.2297350851387, + "learning_rate": 7.977581671763553e-06, + "loss": 2.9349, + "step": 12928 + }, + { + "epoch": 1.1019347140543765, + "grad_norm": 46.16006020615655, + "learning_rate": 7.977183321144042e-06, + "loss": 2.9531, + "step": 12929 + }, + { + "epoch": 1.102019943748402, + "grad_norm": 68.97988942691724, + "learning_rate": 7.976784941245114e-06, + "loss": 2.915, + "step": 12930 + }, + { + "epoch": 1.1021051734424274, + "grad_norm": 41.89066189650515, + "learning_rate": 7.976386532070692e-06, + "loss": 3.3919, + "step": 12931 + }, + { + "epoch": 1.1021904031364527, + "grad_norm": 38.24749220503897, + "learning_rate": 7.975988093624691e-06, + "loss": 3.1223, + "step": 12932 + }, + { + "epoch": 1.1022756328304781, + "grad_norm": 85.45636654654909, + "learning_rate": 7.975589625911029e-06, + "loss": 2.9064, + "step": 12933 + }, + { + "epoch": 1.1023608625245036, + "grad_norm": 45.38357491401409, + "learning_rate": 7.975191128933625e-06, + "loss": 2.639, + "step": 12934 + }, + { + "epoch": 1.1024460922185289, + "grad_norm": 64.47028420611028, + "learning_rate": 7.9747926026964e-06, + "loss": 3.014, + "step": 12935 + }, + { + "epoch": 1.1025313219125543, + "grad_norm": 34.867142876482724, + "learning_rate": 7.974394047203272e-06, + "loss": 2.8412, + "step": 12936 + }, + { + "epoch": 1.1026165516065798, + "grad_norm": 45.49400662982399, + "learning_rate": 7.97399546245816e-06, + "loss": 2.9997, + "step": 12937 + }, + { + "epoch": 1.102701781300605, + "grad_norm": 24.727599820732735, + "learning_rate": 7.973596848464984e-06, + "loss": 2.7657, + "step": 12938 + }, + { + "epoch": 1.1027870109946305, + "grad_norm": 81.42871412367019, + "learning_rate": 7.973198205227663e-06, + "loss": 4.3919, + "step": 12939 + }, + { + "epoch": 1.102872240688656, + "grad_norm": 72.8337278674212, + "learning_rate": 7.972799532750122e-06, + "loss": 3.7751, + "step": 12940 + }, + { + "epoch": 1.1029574703826812, + "grad_norm": 38.3393133772382, + "learning_rate": 7.972400831036276e-06, + "loss": 2.7241, + "step": 12941 + }, + { + "epoch": 1.1030427000767067, + "grad_norm": 81.90887579857436, + "learning_rate": 7.972002100090051e-06, + "loss": 4.2559, + "step": 12942 + }, + { + "epoch": 1.1031279297707322, + "grad_norm": 38.647599237922414, + "learning_rate": 7.971603339915364e-06, + "loss": 2.59, + "step": 12943 + }, + { + "epoch": 1.1032131594647576, + "grad_norm": 39.47126460969784, + "learning_rate": 7.971204550516142e-06, + "loss": 3.488, + "step": 12944 + }, + { + "epoch": 1.1032983891587829, + "grad_norm": 32.75356297563771, + "learning_rate": 7.970805731896301e-06, + "loss": 2.8023, + "step": 12945 + }, + { + "epoch": 1.1033836188528083, + "grad_norm": 48.644248054067866, + "learning_rate": 7.970406884059768e-06, + "loss": 2.1629, + "step": 12946 + }, + { + "epoch": 1.1034688485468338, + "grad_norm": 33.11914742825909, + "learning_rate": 7.970008007010463e-06, + "loss": 2.9084, + "step": 12947 + }, + { + "epoch": 1.103554078240859, + "grad_norm": 48.94163954201425, + "learning_rate": 7.96960910075231e-06, + "loss": 3.1747, + "step": 12948 + }, + { + "epoch": 1.1036393079348845, + "grad_norm": 61.530914053460094, + "learning_rate": 7.96921016528923e-06, + "loss": 2.1493, + "step": 12949 + }, + { + "epoch": 1.10372453762891, + "grad_norm": 51.97374458424927, + "learning_rate": 7.96881120062515e-06, + "loss": 3.4252, + "step": 12950 + }, + { + "epoch": 1.1038097673229352, + "grad_norm": 78.20759390363759, + "learning_rate": 7.968412206763991e-06, + "loss": 3.6162, + "step": 12951 + }, + { + "epoch": 1.1038949970169607, + "grad_norm": 23.739139931619093, + "learning_rate": 7.968013183709677e-06, + "loss": 1.7339, + "step": 12952 + }, + { + "epoch": 1.1039802267109862, + "grad_norm": 78.25382509586272, + "learning_rate": 7.967614131466132e-06, + "loss": 3.244, + "step": 12953 + }, + { + "epoch": 1.1040654564050114, + "grad_norm": 38.98920005987177, + "learning_rate": 7.967215050037282e-06, + "loss": 2.9263, + "step": 12954 + }, + { + "epoch": 1.1041506860990369, + "grad_norm": 34.42270544620967, + "learning_rate": 7.966815939427051e-06, + "loss": 3.5818, + "step": 12955 + }, + { + "epoch": 1.1042359157930624, + "grad_norm": 73.32089835790194, + "learning_rate": 7.966416799639364e-06, + "loss": 2.5225, + "step": 12956 + }, + { + "epoch": 1.1043211454870878, + "grad_norm": 47.70763473408675, + "learning_rate": 7.966017630678147e-06, + "loss": 3.6927, + "step": 12957 + }, + { + "epoch": 1.104406375181113, + "grad_norm": 110.23126417901052, + "learning_rate": 7.965618432547323e-06, + "loss": 2.0783, + "step": 12958 + }, + { + "epoch": 1.1044916048751385, + "grad_norm": 35.39993300291631, + "learning_rate": 7.965219205250823e-06, + "loss": 3.1393, + "step": 12959 + }, + { + "epoch": 1.1045768345691638, + "grad_norm": 33.768882220393145, + "learning_rate": 7.964819948792567e-06, + "loss": 2.7273, + "step": 12960 + }, + { + "epoch": 1.1046620642631892, + "grad_norm": 34.57008692471336, + "learning_rate": 7.964420663176488e-06, + "loss": 3.3101, + "step": 12961 + }, + { + "epoch": 1.1047472939572147, + "grad_norm": 54.240968193451565, + "learning_rate": 7.96402134840651e-06, + "loss": 2.4669, + "step": 12962 + }, + { + "epoch": 1.1048325236512402, + "grad_norm": 25.973064188841462, + "learning_rate": 7.963622004486559e-06, + "loss": 2.4452, + "step": 12963 + }, + { + "epoch": 1.1049177533452654, + "grad_norm": 99.24920949936866, + "learning_rate": 7.963222631420562e-06, + "loss": 2.1872, + "step": 12964 + }, + { + "epoch": 1.105002983039291, + "grad_norm": 34.77629784259262, + "learning_rate": 7.962823229212449e-06, + "loss": 2.8308, + "step": 12965 + }, + { + "epoch": 1.1050882127333164, + "grad_norm": 37.68206888174741, + "learning_rate": 7.962423797866147e-06, + "loss": 3.0854, + "step": 12966 + }, + { + "epoch": 1.1051734424273416, + "grad_norm": 36.86002694014664, + "learning_rate": 7.962024337385583e-06, + "loss": 2.3323, + "step": 12967 + }, + { + "epoch": 1.105258672121367, + "grad_norm": 103.96341016710174, + "learning_rate": 7.961624847774688e-06, + "loss": 3.3628, + "step": 12968 + }, + { + "epoch": 1.1053439018153925, + "grad_norm": 38.14697381185951, + "learning_rate": 7.96122532903739e-06, + "loss": 3.3147, + "step": 12969 + }, + { + "epoch": 1.1054291315094178, + "grad_norm": 61.28366478205676, + "learning_rate": 7.960825781177616e-06, + "loss": 3.8253, + "step": 12970 + }, + { + "epoch": 1.1055143612034433, + "grad_norm": 66.49961889710912, + "learning_rate": 7.960426204199297e-06, + "loss": 3.9073, + "step": 12971 + }, + { + "epoch": 1.1055995908974687, + "grad_norm": 65.39183846906859, + "learning_rate": 7.960026598106362e-06, + "loss": 3.501, + "step": 12972 + }, + { + "epoch": 1.105684820591494, + "grad_norm": 55.865308068603326, + "learning_rate": 7.959626962902743e-06, + "loss": 4.2148, + "step": 12973 + }, + { + "epoch": 1.1057700502855194, + "grad_norm": 66.75195174874042, + "learning_rate": 7.959227298592369e-06, + "loss": 2.8523, + "step": 12974 + }, + { + "epoch": 1.105855279979545, + "grad_norm": 38.76539359707866, + "learning_rate": 7.958827605179169e-06, + "loss": 2.303, + "step": 12975 + }, + { + "epoch": 1.1059405096735704, + "grad_norm": 35.163761534004095, + "learning_rate": 7.958427882667077e-06, + "loss": 1.5735, + "step": 12976 + }, + { + "epoch": 1.1060257393675956, + "grad_norm": 28.129269559643923, + "learning_rate": 7.958028131060021e-06, + "loss": 2.4715, + "step": 12977 + }, + { + "epoch": 1.106110969061621, + "grad_norm": 43.46845413260692, + "learning_rate": 7.957628350361934e-06, + "loss": 2.4811, + "step": 12978 + }, + { + "epoch": 1.1061961987556466, + "grad_norm": 46.099472590015296, + "learning_rate": 7.957228540576748e-06, + "loss": 3.0679, + "step": 12979 + }, + { + "epoch": 1.1062814284496718, + "grad_norm": 36.678522865324425, + "learning_rate": 7.956828701708395e-06, + "loss": 2.7707, + "step": 12980 + }, + { + "epoch": 1.1063666581436973, + "grad_norm": 34.918961647466475, + "learning_rate": 7.956428833760805e-06, + "loss": 3.456, + "step": 12981 + }, + { + "epoch": 1.1064518878377227, + "grad_norm": 60.19032210688176, + "learning_rate": 7.956028936737912e-06, + "loss": 3.136, + "step": 12982 + }, + { + "epoch": 1.106537117531748, + "grad_norm": 68.4847122222956, + "learning_rate": 7.95562901064365e-06, + "loss": 2.713, + "step": 12983 + }, + { + "epoch": 1.1066223472257735, + "grad_norm": 58.707276055551475, + "learning_rate": 7.955229055481951e-06, + "loss": 2.269, + "step": 12984 + }, + { + "epoch": 1.106707576919799, + "grad_norm": 58.67641181053677, + "learning_rate": 7.954829071256748e-06, + "loss": 3.034, + "step": 12985 + }, + { + "epoch": 1.1067928066138242, + "grad_norm": 46.530522860007984, + "learning_rate": 7.954429057971976e-06, + "loss": 3.0808, + "step": 12986 + }, + { + "epoch": 1.1068780363078496, + "grad_norm": 57.72637018419205, + "learning_rate": 7.95402901563157e-06, + "loss": 3.6405, + "step": 12987 + }, + { + "epoch": 1.1069632660018751, + "grad_norm": 78.91017375511014, + "learning_rate": 7.953628944239459e-06, + "loss": 3.046, + "step": 12988 + }, + { + "epoch": 1.1070484956959004, + "grad_norm": 86.82610462598699, + "learning_rate": 7.953228843799582e-06, + "loss": 3.6283, + "step": 12989 + }, + { + "epoch": 1.1071337253899258, + "grad_norm": 43.021494628755164, + "learning_rate": 7.952828714315872e-06, + "loss": 2.9183, + "step": 12990 + }, + { + "epoch": 1.1072189550839513, + "grad_norm": 33.88003839806693, + "learning_rate": 7.952428555792268e-06, + "loss": 2.7429, + "step": 12991 + }, + { + "epoch": 1.1073041847779765, + "grad_norm": 48.75568013676511, + "learning_rate": 7.9520283682327e-06, + "loss": 2.3335, + "step": 12992 + }, + { + "epoch": 1.107389414472002, + "grad_norm": 87.51138896164238, + "learning_rate": 7.951628151641107e-06, + "loss": 2.7983, + "step": 12993 + }, + { + "epoch": 1.1074746441660275, + "grad_norm": 35.422160840629104, + "learning_rate": 7.951227906021423e-06, + "loss": 2.7884, + "step": 12994 + }, + { + "epoch": 1.107559873860053, + "grad_norm": 59.11594696645374, + "learning_rate": 7.950827631377585e-06, + "loss": 2.0754, + "step": 12995 + }, + { + "epoch": 1.1076451035540782, + "grad_norm": 54.72383646429362, + "learning_rate": 7.950427327713532e-06, + "loss": 3.2321, + "step": 12996 + }, + { + "epoch": 1.1077303332481037, + "grad_norm": 80.70993948788868, + "learning_rate": 7.950026995033196e-06, + "loss": 2.1479, + "step": 12997 + }, + { + "epoch": 1.1078155629421291, + "grad_norm": 70.33532669157289, + "learning_rate": 7.949626633340518e-06, + "loss": 3.7138, + "step": 12998 + }, + { + "epoch": 1.1079007926361544, + "grad_norm": 44.451060299324794, + "learning_rate": 7.949226242639435e-06, + "loss": 3.3299, + "step": 12999 + }, + { + "epoch": 1.1079860223301798, + "grad_norm": 63.07155035954625, + "learning_rate": 7.948825822933882e-06, + "loss": 2.8735, + "step": 13000 + }, + { + "epoch": 1.1080712520242053, + "grad_norm": 24.78191540586926, + "learning_rate": 7.9484253742278e-06, + "loss": 2.2387, + "step": 13001 + }, + { + "epoch": 1.1081564817182306, + "grad_norm": 69.17029934122048, + "learning_rate": 7.948024896525124e-06, + "loss": 3.4183, + "step": 13002 + }, + { + "epoch": 1.108241711412256, + "grad_norm": 22.149524599051638, + "learning_rate": 7.947624389829798e-06, + "loss": 1.9093, + "step": 13003 + }, + { + "epoch": 1.1083269411062815, + "grad_norm": 45.73225556940873, + "learning_rate": 7.947223854145754e-06, + "loss": 3.5185, + "step": 13004 + }, + { + "epoch": 1.1084121708003067, + "grad_norm": 63.88977855241037, + "learning_rate": 7.946823289476936e-06, + "loss": 2.6537, + "step": 13005 + }, + { + "epoch": 1.1084974004943322, + "grad_norm": 52.07992482339618, + "learning_rate": 7.946422695827282e-06, + "loss": 3.1299, + "step": 13006 + }, + { + "epoch": 1.1085826301883577, + "grad_norm": 52.97033149759757, + "learning_rate": 7.946022073200731e-06, + "loss": 3.8968, + "step": 13007 + }, + { + "epoch": 1.108667859882383, + "grad_norm": 68.50109332175239, + "learning_rate": 7.945621421601225e-06, + "loss": 2.6028, + "step": 13008 + }, + { + "epoch": 1.1087530895764084, + "grad_norm": 85.59425578623672, + "learning_rate": 7.945220741032703e-06, + "loss": 4.5387, + "step": 13009 + }, + { + "epoch": 1.1088383192704339, + "grad_norm": 65.96433182940352, + "learning_rate": 7.944820031499102e-06, + "loss": 3.0311, + "step": 13010 + }, + { + "epoch": 1.108923548964459, + "grad_norm": 24.019229608868574, + "learning_rate": 7.94441929300437e-06, + "loss": 2.0092, + "step": 13011 + }, + { + "epoch": 1.1090087786584846, + "grad_norm": 37.340491277020945, + "learning_rate": 7.944018525552442e-06, + "loss": 2.8681, + "step": 13012 + }, + { + "epoch": 1.10909400835251, + "grad_norm": 39.989725419449265, + "learning_rate": 7.943617729147263e-06, + "loss": 2.7628, + "step": 13013 + }, + { + "epoch": 1.1091792380465355, + "grad_norm": 68.70273490900092, + "learning_rate": 7.943216903792772e-06, + "loss": 3.6109, + "step": 13014 + }, + { + "epoch": 1.1092644677405608, + "grad_norm": 42.93559280248854, + "learning_rate": 7.942816049492913e-06, + "loss": 3.3193, + "step": 13015 + }, + { + "epoch": 1.1093496974345862, + "grad_norm": 43.925331799554215, + "learning_rate": 7.942415166251627e-06, + "loss": 3.2942, + "step": 13016 + }, + { + "epoch": 1.1094349271286117, + "grad_norm": 29.659632953600244, + "learning_rate": 7.942014254072858e-06, + "loss": 2.618, + "step": 13017 + }, + { + "epoch": 1.109520156822637, + "grad_norm": 72.49704506448721, + "learning_rate": 7.941613312960547e-06, + "loss": 2.3972, + "step": 13018 + }, + { + "epoch": 1.1096053865166624, + "grad_norm": 38.080445560825105, + "learning_rate": 7.941212342918638e-06, + "loss": 3.5058, + "step": 13019 + }, + { + "epoch": 1.1096906162106879, + "grad_norm": 23.653007614152116, + "learning_rate": 7.940811343951074e-06, + "loss": 1.7481, + "step": 13020 + }, + { + "epoch": 1.1097758459047131, + "grad_norm": 65.1928882889451, + "learning_rate": 7.9404103160618e-06, + "loss": 3.3504, + "step": 13021 + }, + { + "epoch": 1.1098610755987386, + "grad_norm": 62.882187174172486, + "learning_rate": 7.940009259254759e-06, + "loss": 2.7512, + "step": 13022 + }, + { + "epoch": 1.109946305292764, + "grad_norm": 58.04237447796106, + "learning_rate": 7.939608173533894e-06, + "loss": 3.079, + "step": 13023 + }, + { + "epoch": 1.1100315349867893, + "grad_norm": 30.43989756054866, + "learning_rate": 7.939207058903152e-06, + "loss": 3.1153, + "step": 13024 + }, + { + "epoch": 1.1101167646808148, + "grad_norm": 28.111475921695682, + "learning_rate": 7.938805915366476e-06, + "loss": 1.3952, + "step": 13025 + }, + { + "epoch": 1.1102019943748402, + "grad_norm": 84.8011312210596, + "learning_rate": 7.938404742927812e-06, + "loss": 2.8475, + "step": 13026 + }, + { + "epoch": 1.1102872240688657, + "grad_norm": 44.94800650497719, + "learning_rate": 7.938003541591102e-06, + "loss": 3.4222, + "step": 13027 + }, + { + "epoch": 1.110372453762891, + "grad_norm": 33.89577872085159, + "learning_rate": 7.937602311360298e-06, + "loss": 2.4347, + "step": 13028 + }, + { + "epoch": 1.1104576834569164, + "grad_norm": 59.67665607053407, + "learning_rate": 7.937201052239341e-06, + "loss": 3.9352, + "step": 13029 + }, + { + "epoch": 1.110542913150942, + "grad_norm": 70.11257903113037, + "learning_rate": 7.93679976423218e-06, + "loss": 3.8216, + "step": 13030 + }, + { + "epoch": 1.1106281428449671, + "grad_norm": 40.198487290183984, + "learning_rate": 7.936398447342759e-06, + "loss": 3.3622, + "step": 13031 + }, + { + "epoch": 1.1107133725389926, + "grad_norm": 36.92528628255152, + "learning_rate": 7.935997101575029e-06, + "loss": 3.0796, + "step": 13032 + }, + { + "epoch": 1.110798602233018, + "grad_norm": 38.700746155715514, + "learning_rate": 7.93559572693293e-06, + "loss": 4.0463, + "step": 13033 + }, + { + "epoch": 1.1108838319270433, + "grad_norm": 64.17239589114106, + "learning_rate": 7.935194323420417e-06, + "loss": 2.6774, + "step": 13034 + }, + { + "epoch": 1.1109690616210688, + "grad_norm": 49.36247630411848, + "learning_rate": 7.93479289104143e-06, + "loss": 4.1648, + "step": 13035 + }, + { + "epoch": 1.1110542913150943, + "grad_norm": 27.187387269595387, + "learning_rate": 7.934391429799925e-06, + "loss": 2.4124, + "step": 13036 + }, + { + "epoch": 1.1111395210091195, + "grad_norm": 249.50630042215926, + "learning_rate": 7.933989939699845e-06, + "loss": 2.4019, + "step": 13037 + }, + { + "epoch": 1.111224750703145, + "grad_norm": 79.35441827861897, + "learning_rate": 7.93358842074514e-06, + "loss": 3.8969, + "step": 13038 + }, + { + "epoch": 1.1113099803971704, + "grad_norm": 49.027032781867455, + "learning_rate": 7.933186872939757e-06, + "loss": 2.5554, + "step": 13039 + }, + { + "epoch": 1.1113952100911957, + "grad_norm": 43.26594128453438, + "learning_rate": 7.932785296287648e-06, + "loss": 3.244, + "step": 13040 + }, + { + "epoch": 1.1114804397852212, + "grad_norm": 37.70610923688364, + "learning_rate": 7.932383690792762e-06, + "loss": 2.3908, + "step": 13041 + }, + { + "epoch": 1.1115656694792466, + "grad_norm": 31.166665495655757, + "learning_rate": 7.931982056459046e-06, + "loss": 3.257, + "step": 13042 + }, + { + "epoch": 1.1116508991732719, + "grad_norm": 47.318229005846305, + "learning_rate": 7.931580393290453e-06, + "loss": 2.5511, + "step": 13043 + }, + { + "epoch": 1.1117361288672973, + "grad_norm": 33.24422749916298, + "learning_rate": 7.93117870129093e-06, + "loss": 2.2133, + "step": 13044 + }, + { + "epoch": 1.1118213585613228, + "grad_norm": 40.695377362877544, + "learning_rate": 7.93077698046443e-06, + "loss": 2.2552, + "step": 13045 + }, + { + "epoch": 1.1119065882553483, + "grad_norm": 56.60557511661558, + "learning_rate": 7.930375230814902e-06, + "loss": 2.3032, + "step": 13046 + }, + { + "epoch": 1.1119918179493735, + "grad_norm": 56.49193025519234, + "learning_rate": 7.9299734523463e-06, + "loss": 3.0614, + "step": 13047 + }, + { + "epoch": 1.112077047643399, + "grad_norm": 74.14078735948638, + "learning_rate": 7.929571645062573e-06, + "loss": 2.9953, + "step": 13048 + }, + { + "epoch": 1.1121622773374245, + "grad_norm": 48.81206812850332, + "learning_rate": 7.929169808967672e-06, + "loss": 3.5253, + "step": 13049 + }, + { + "epoch": 1.1122475070314497, + "grad_norm": 71.39498142436287, + "learning_rate": 7.92876794406555e-06, + "loss": 3.326, + "step": 13050 + }, + { + "epoch": 1.1123327367254752, + "grad_norm": 41.3519985736723, + "learning_rate": 7.92836605036016e-06, + "loss": 2.069, + "step": 13051 + }, + { + "epoch": 1.1124179664195006, + "grad_norm": 75.07919236240342, + "learning_rate": 7.927964127855452e-06, + "loss": 3.8364, + "step": 13052 + }, + { + "epoch": 1.1125031961135259, + "grad_norm": 149.71033099133896, + "learning_rate": 7.927562176555381e-06, + "loss": 3.3347, + "step": 13053 + }, + { + "epoch": 1.1125884258075514, + "grad_norm": 30.340563073312723, + "learning_rate": 7.927160196463898e-06, + "loss": 2.6044, + "step": 13054 + }, + { + "epoch": 1.1126736555015768, + "grad_norm": 51.70487741258667, + "learning_rate": 7.92675818758496e-06, + "loss": 2.7517, + "step": 13055 + }, + { + "epoch": 1.112758885195602, + "grad_norm": 42.70353085654481, + "learning_rate": 7.926356149922518e-06, + "loss": 2.8262, + "step": 13056 + }, + { + "epoch": 1.1128441148896275, + "grad_norm": 39.99265216600061, + "learning_rate": 7.925954083480526e-06, + "loss": 2.8653, + "step": 13057 + }, + { + "epoch": 1.112929344583653, + "grad_norm": 101.9987525782482, + "learning_rate": 7.925551988262937e-06, + "loss": 4.8196, + "step": 13058 + }, + { + "epoch": 1.1130145742776782, + "grad_norm": 45.24959864114492, + "learning_rate": 7.925149864273707e-06, + "loss": 3.3407, + "step": 13059 + }, + { + "epoch": 1.1130998039717037, + "grad_norm": 45.06928165936562, + "learning_rate": 7.924747711516789e-06, + "loss": 2.7351, + "step": 13060 + }, + { + "epoch": 1.1131850336657292, + "grad_norm": 55.689315020373336, + "learning_rate": 7.924345529996141e-06, + "loss": 3.4599, + "step": 13061 + }, + { + "epoch": 1.1132702633597544, + "grad_norm": 109.31223032147064, + "learning_rate": 7.923943319715716e-06, + "loss": 3.6054, + "step": 13062 + }, + { + "epoch": 1.11335549305378, + "grad_norm": 35.796953659300016, + "learning_rate": 7.923541080679472e-06, + "loss": 2.6401, + "step": 13063 + }, + { + "epoch": 1.1134407227478054, + "grad_norm": 37.89045264623175, + "learning_rate": 7.923138812891363e-06, + "loss": 2.1118, + "step": 13064 + }, + { + "epoch": 1.1135259524418308, + "grad_norm": 95.56071178286817, + "learning_rate": 7.922736516355345e-06, + "loss": 4.0171, + "step": 13065 + }, + { + "epoch": 1.113611182135856, + "grad_norm": 34.48959941550874, + "learning_rate": 7.922334191075374e-06, + "loss": 3.2933, + "step": 13066 + }, + { + "epoch": 1.1136964118298815, + "grad_norm": 71.53209660973403, + "learning_rate": 7.921931837055408e-06, + "loss": 4.2536, + "step": 13067 + }, + { + "epoch": 1.113781641523907, + "grad_norm": 48.800612722526715, + "learning_rate": 7.921529454299402e-06, + "loss": 3.2222, + "step": 13068 + }, + { + "epoch": 1.1138668712179323, + "grad_norm": 58.672696492066656, + "learning_rate": 7.921127042811317e-06, + "loss": 2.8546, + "step": 13069 + }, + { + "epoch": 1.1139521009119577, + "grad_norm": 76.81324085566551, + "learning_rate": 7.920724602595106e-06, + "loss": 2.8847, + "step": 13070 + }, + { + "epoch": 1.1140373306059832, + "grad_norm": 80.42808341493149, + "learning_rate": 7.920322133654733e-06, + "loss": 2.7088, + "step": 13071 + }, + { + "epoch": 1.1141225603000084, + "grad_norm": 53.5964036755895, + "learning_rate": 7.919919635994147e-06, + "loss": 2.936, + "step": 13072 + }, + { + "epoch": 1.114207789994034, + "grad_norm": 45.09765896514136, + "learning_rate": 7.919517109617315e-06, + "loss": 2.9516, + "step": 13073 + }, + { + "epoch": 1.1142930196880594, + "grad_norm": 38.095635886394746, + "learning_rate": 7.919114554528191e-06, + "loss": 2.5704, + "step": 13074 + }, + { + "epoch": 1.1143782493820846, + "grad_norm": 45.70965843341635, + "learning_rate": 7.918711970730736e-06, + "loss": 3.0749, + "step": 13075 + }, + { + "epoch": 1.11446347907611, + "grad_norm": 56.24413765924375, + "learning_rate": 7.918309358228908e-06, + "loss": 2.2421, + "step": 13076 + }, + { + "epoch": 1.1145487087701356, + "grad_norm": 70.3343141286419, + "learning_rate": 7.917906717026669e-06, + "loss": 3.007, + "step": 13077 + }, + { + "epoch": 1.1146339384641608, + "grad_norm": 41.08734193299067, + "learning_rate": 7.917504047127974e-06, + "loss": 3.0081, + "step": 13078 + }, + { + "epoch": 1.1147191681581863, + "grad_norm": 51.665886426299025, + "learning_rate": 7.917101348536787e-06, + "loss": 2.7416, + "step": 13079 + }, + { + "epoch": 1.1148043978522117, + "grad_norm": 43.236058241979215, + "learning_rate": 7.916698621257068e-06, + "loss": 2.1491, + "step": 13080 + }, + { + "epoch": 1.114889627546237, + "grad_norm": 64.40111625484272, + "learning_rate": 7.916295865292777e-06, + "loss": 2.9501, + "step": 13081 + }, + { + "epoch": 1.1149748572402625, + "grad_norm": 75.59793074355129, + "learning_rate": 7.915893080647875e-06, + "loss": 3.6603, + "step": 13082 + }, + { + "epoch": 1.115060086934288, + "grad_norm": 80.342764399033, + "learning_rate": 7.915490267326322e-06, + "loss": 3.2396, + "step": 13083 + }, + { + "epoch": 1.1151453166283134, + "grad_norm": 54.6499433524128, + "learning_rate": 7.915087425332081e-06, + "loss": 3.8343, + "step": 13084 + }, + { + "epoch": 1.1152305463223386, + "grad_norm": 39.528365818727465, + "learning_rate": 7.914684554669116e-06, + "loss": 2.4041, + "step": 13085 + }, + { + "epoch": 1.1153157760163641, + "grad_norm": 89.55419191883783, + "learning_rate": 7.914281655341385e-06, + "loss": 3.5977, + "step": 13086 + }, + { + "epoch": 1.1154010057103896, + "grad_norm": 43.00382299081665, + "learning_rate": 7.91387872735285e-06, + "loss": 2.2259, + "step": 13087 + }, + { + "epoch": 1.1154862354044148, + "grad_norm": 33.69391896194539, + "learning_rate": 7.913475770707479e-06, + "loss": 3.323, + "step": 13088 + }, + { + "epoch": 1.1155714650984403, + "grad_norm": 51.29331759320417, + "learning_rate": 7.913072785409229e-06, + "loss": 3.5153, + "step": 13089 + }, + { + "epoch": 1.1156566947924658, + "grad_norm": 62.91948663440792, + "learning_rate": 7.912669771462067e-06, + "loss": 3.0139, + "step": 13090 + }, + { + "epoch": 1.115741924486491, + "grad_norm": 62.84340661783588, + "learning_rate": 7.912266728869955e-06, + "loss": 4.0056, + "step": 13091 + }, + { + "epoch": 1.1158271541805165, + "grad_norm": 25.443080601573875, + "learning_rate": 7.911863657636856e-06, + "loss": 2.1202, + "step": 13092 + }, + { + "epoch": 1.115912383874542, + "grad_norm": 103.51764034143294, + "learning_rate": 7.911460557766736e-06, + "loss": 4.4619, + "step": 13093 + }, + { + "epoch": 1.1159976135685672, + "grad_norm": 30.402200114604735, + "learning_rate": 7.911057429263559e-06, + "loss": 2.4, + "step": 13094 + }, + { + "epoch": 1.1160828432625927, + "grad_norm": 70.74370701328512, + "learning_rate": 7.910654272131286e-06, + "loss": 2.7569, + "step": 13095 + }, + { + "epoch": 1.1161680729566181, + "grad_norm": 40.21722683622093, + "learning_rate": 7.910251086373888e-06, + "loss": 3.1352, + "step": 13096 + }, + { + "epoch": 1.1162533026506436, + "grad_norm": 62.061642704645664, + "learning_rate": 7.909847871995325e-06, + "loss": 3.1501, + "step": 13097 + }, + { + "epoch": 1.1163385323446688, + "grad_norm": 62.87728743266706, + "learning_rate": 7.909444628999565e-06, + "loss": 3.7231, + "step": 13098 + }, + { + "epoch": 1.1164237620386943, + "grad_norm": 43.30181369255359, + "learning_rate": 7.909041357390574e-06, + "loss": 3.4634, + "step": 13099 + }, + { + "epoch": 1.1165089917327198, + "grad_norm": 71.21421766197356, + "learning_rate": 7.908638057172315e-06, + "loss": 3.1631, + "step": 13100 + }, + { + "epoch": 1.116594221426745, + "grad_norm": 45.394991253612595, + "learning_rate": 7.908234728348759e-06, + "loss": 3.783, + "step": 13101 + }, + { + "epoch": 1.1166794511207705, + "grad_norm": 28.315233320809096, + "learning_rate": 7.907831370923868e-06, + "loss": 1.7982, + "step": 13102 + }, + { + "epoch": 1.116764680814796, + "grad_norm": 45.94916761348792, + "learning_rate": 7.907427984901612e-06, + "loss": 2.7397, + "step": 13103 + }, + { + "epoch": 1.1168499105088212, + "grad_norm": 33.958339496149605, + "learning_rate": 7.907024570285956e-06, + "loss": 3.0793, + "step": 13104 + }, + { + "epoch": 1.1169351402028467, + "grad_norm": 56.29200948931055, + "learning_rate": 7.906621127080872e-06, + "loss": 3.5162, + "step": 13105 + }, + { + "epoch": 1.1170203698968721, + "grad_norm": 51.2880153897323, + "learning_rate": 7.90621765529032e-06, + "loss": 2.9357, + "step": 13106 + }, + { + "epoch": 1.1171055995908974, + "grad_norm": 57.58530459668951, + "learning_rate": 7.905814154918274e-06, + "loss": 2.0987, + "step": 13107 + }, + { + "epoch": 1.1171908292849229, + "grad_norm": 31.986279823921304, + "learning_rate": 7.905410625968701e-06, + "loss": 3.1497, + "step": 13108 + }, + { + "epoch": 1.1172760589789483, + "grad_norm": 109.43078597065976, + "learning_rate": 7.905007068445568e-06, + "loss": 3.7246, + "step": 13109 + }, + { + "epoch": 1.1173612886729736, + "grad_norm": 37.89764501264005, + "learning_rate": 7.904603482352845e-06, + "loss": 2.6221, + "step": 13110 + }, + { + "epoch": 1.117446518366999, + "grad_norm": 72.57749857330272, + "learning_rate": 7.904199867694502e-06, + "loss": 2.7079, + "step": 13111 + }, + { + "epoch": 1.1175317480610245, + "grad_norm": 48.508436861045936, + "learning_rate": 7.903796224474506e-06, + "loss": 3.3474, + "step": 13112 + }, + { + "epoch": 1.1176169777550498, + "grad_norm": 58.91216789880498, + "learning_rate": 7.903392552696827e-06, + "loss": 2.9628, + "step": 13113 + }, + { + "epoch": 1.1177022074490752, + "grad_norm": 84.18982307176357, + "learning_rate": 7.902988852365439e-06, + "loss": 4.0512, + "step": 13114 + }, + { + "epoch": 1.1177874371431007, + "grad_norm": 73.19333227856819, + "learning_rate": 7.902585123484307e-06, + "loss": 2.8545, + "step": 13115 + }, + { + "epoch": 1.1178726668371262, + "grad_norm": 54.01302201161479, + "learning_rate": 7.902181366057404e-06, + "loss": 2.2935, + "step": 13116 + }, + { + "epoch": 1.1179578965311514, + "grad_norm": 36.13489423697809, + "learning_rate": 7.901777580088701e-06, + "loss": 2.9867, + "step": 13117 + }, + { + "epoch": 1.1180431262251769, + "grad_norm": 76.2574567066204, + "learning_rate": 7.901373765582167e-06, + "loss": 3.2628, + "step": 13118 + }, + { + "epoch": 1.1181283559192023, + "grad_norm": 54.2801306625028, + "learning_rate": 7.900969922541777e-06, + "loss": 3.9644, + "step": 13119 + }, + { + "epoch": 1.1182135856132276, + "grad_norm": 40.23511120943455, + "learning_rate": 7.900566050971499e-06, + "loss": 2.8133, + "step": 13120 + }, + { + "epoch": 1.118298815307253, + "grad_norm": 59.21345546551088, + "learning_rate": 7.900162150875308e-06, + "loss": 1.9042, + "step": 13121 + }, + { + "epoch": 1.1183840450012785, + "grad_norm": 39.877131799179985, + "learning_rate": 7.899758222257173e-06, + "loss": 1.9096, + "step": 13122 + }, + { + "epoch": 1.1184692746953038, + "grad_norm": 36.53823871027525, + "learning_rate": 7.899354265121069e-06, + "loss": 3.2226, + "step": 13123 + }, + { + "epoch": 1.1185545043893292, + "grad_norm": 87.9939553856293, + "learning_rate": 7.898950279470969e-06, + "loss": 3.5694, + "step": 13124 + }, + { + "epoch": 1.1186397340833547, + "grad_norm": 50.94472403218395, + "learning_rate": 7.898546265310843e-06, + "loss": 2.8014, + "step": 13125 + }, + { + "epoch": 1.11872496377738, + "grad_norm": 46.90121051116478, + "learning_rate": 7.898142222644666e-06, + "loss": 3.4627, + "step": 13126 + }, + { + "epoch": 1.1188101934714054, + "grad_norm": 36.44992702871733, + "learning_rate": 7.897738151476414e-06, + "loss": 3.1626, + "step": 13127 + }, + { + "epoch": 1.118895423165431, + "grad_norm": 43.62935937755476, + "learning_rate": 7.897334051810057e-06, + "loss": 2.9117, + "step": 13128 + }, + { + "epoch": 1.1189806528594561, + "grad_norm": 65.53463738173593, + "learning_rate": 7.896929923649573e-06, + "loss": 2.6607, + "step": 13129 + }, + { + "epoch": 1.1190658825534816, + "grad_norm": 30.89210400249206, + "learning_rate": 7.89652576699893e-06, + "loss": 2.8359, + "step": 13130 + }, + { + "epoch": 1.119151112247507, + "grad_norm": 49.510157986148755, + "learning_rate": 7.89612158186211e-06, + "loss": 3.3249, + "step": 13131 + }, + { + "epoch": 1.1192363419415323, + "grad_norm": 82.97056528671975, + "learning_rate": 7.895717368243086e-06, + "loss": 3.2116, + "step": 13132 + }, + { + "epoch": 1.1193215716355578, + "grad_norm": 35.04499956692049, + "learning_rate": 7.895313126145831e-06, + "loss": 2.0321, + "step": 13133 + }, + { + "epoch": 1.1194068013295833, + "grad_norm": 36.60289982198401, + "learning_rate": 7.89490885557432e-06, + "loss": 3.3789, + "step": 13134 + }, + { + "epoch": 1.1194920310236087, + "grad_norm": 37.84612652708879, + "learning_rate": 7.894504556532535e-06, + "loss": 2.5612, + "step": 13135 + }, + { + "epoch": 1.119577260717634, + "grad_norm": 65.41646829483952, + "learning_rate": 7.894100229024446e-06, + "loss": 2.6573, + "step": 13136 + }, + { + "epoch": 1.1196624904116594, + "grad_norm": 64.13550360706519, + "learning_rate": 7.89369587305403e-06, + "loss": 3.0234, + "step": 13137 + }, + { + "epoch": 1.119747720105685, + "grad_norm": 72.88205921543927, + "learning_rate": 7.893291488625265e-06, + "loss": 3.4318, + "step": 13138 + }, + { + "epoch": 1.1198329497997102, + "grad_norm": 53.73978749701798, + "learning_rate": 7.89288707574213e-06, + "loss": 2.5791, + "step": 13139 + }, + { + "epoch": 1.1199181794937356, + "grad_norm": 78.27787823511726, + "learning_rate": 7.892482634408597e-06, + "loss": 2.5491, + "step": 13140 + }, + { + "epoch": 1.120003409187761, + "grad_norm": 35.355969573422136, + "learning_rate": 7.89207816462865e-06, + "loss": 1.7943, + "step": 13141 + }, + { + "epoch": 1.1200886388817863, + "grad_norm": 25.16498506296677, + "learning_rate": 7.891673666406262e-06, + "loss": 1.9424, + "step": 13142 + }, + { + "epoch": 1.1201738685758118, + "grad_norm": 65.56593416620889, + "learning_rate": 7.891269139745412e-06, + "loss": 2.8665, + "step": 13143 + }, + { + "epoch": 1.1202590982698373, + "grad_norm": 41.85438597328573, + "learning_rate": 7.89086458465008e-06, + "loss": 3.3988, + "step": 13144 + }, + { + "epoch": 1.1203443279638625, + "grad_norm": 44.79307783171756, + "learning_rate": 7.890460001124242e-06, + "loss": 2.6754, + "step": 13145 + }, + { + "epoch": 1.120429557657888, + "grad_norm": 38.699468638732455, + "learning_rate": 7.89005538917188e-06, + "loss": 2.6449, + "step": 13146 + }, + { + "epoch": 1.1205147873519135, + "grad_norm": 23.662494538609824, + "learning_rate": 7.889650748796971e-06, + "loss": 2.2996, + "step": 13147 + }, + { + "epoch": 1.1206000170459387, + "grad_norm": 31.88587359617433, + "learning_rate": 7.889246080003495e-06, + "loss": 2.0171, + "step": 13148 + }, + { + "epoch": 1.1206852467399642, + "grad_norm": 29.77252461046874, + "learning_rate": 7.888841382795431e-06, + "loss": 2.6023, + "step": 13149 + }, + { + "epoch": 1.1207704764339896, + "grad_norm": 59.70352977972403, + "learning_rate": 7.88843665717676e-06, + "loss": 2.6428, + "step": 13150 + }, + { + "epoch": 1.1208557061280149, + "grad_norm": 38.48505681890631, + "learning_rate": 7.888031903151465e-06, + "loss": 3.0295, + "step": 13151 + }, + { + "epoch": 1.1209409358220404, + "grad_norm": 22.732298943401496, + "learning_rate": 7.887627120723523e-06, + "loss": 1.9262, + "step": 13152 + }, + { + "epoch": 1.1210261655160658, + "grad_norm": 28.55622005004971, + "learning_rate": 7.887222309896914e-06, + "loss": 2.7599, + "step": 13153 + }, + { + "epoch": 1.1211113952100913, + "grad_norm": 28.56216275164018, + "learning_rate": 7.886817470675621e-06, + "loss": 2.9499, + "step": 13154 + }, + { + "epoch": 1.1211966249041165, + "grad_norm": 47.778110081998854, + "learning_rate": 7.886412603063627e-06, + "loss": 3.5782, + "step": 13155 + }, + { + "epoch": 1.121281854598142, + "grad_norm": 42.45866656489256, + "learning_rate": 7.88600770706491e-06, + "loss": 2.5926, + "step": 13156 + }, + { + "epoch": 1.1213670842921675, + "grad_norm": 25.93362420905732, + "learning_rate": 7.885602782683454e-06, + "loss": 2.8641, + "step": 13157 + }, + { + "epoch": 1.1214523139861927, + "grad_norm": 73.11921451035917, + "learning_rate": 7.885197829923243e-06, + "loss": 2.3195, + "step": 13158 + }, + { + "epoch": 1.1215375436802182, + "grad_norm": 69.38474276041103, + "learning_rate": 7.884792848788256e-06, + "loss": 3.206, + "step": 13159 + }, + { + "epoch": 1.1216227733742437, + "grad_norm": 32.50458754099385, + "learning_rate": 7.884387839282479e-06, + "loss": 2.8145, + "step": 13160 + }, + { + "epoch": 1.121708003068269, + "grad_norm": 65.4663684373098, + "learning_rate": 7.883982801409893e-06, + "loss": 2.7725, + "step": 13161 + }, + { + "epoch": 1.1217932327622944, + "grad_norm": 50.74286222260616, + "learning_rate": 7.883577735174482e-06, + "loss": 2.7075, + "step": 13162 + }, + { + "epoch": 1.1218784624563198, + "grad_norm": 91.21294109357005, + "learning_rate": 7.88317264058023e-06, + "loss": 3.122, + "step": 13163 + }, + { + "epoch": 1.121963692150345, + "grad_norm": 68.82876433944413, + "learning_rate": 7.88276751763112e-06, + "loss": 2.9286, + "step": 13164 + }, + { + "epoch": 1.1220489218443706, + "grad_norm": 50.64869214865081, + "learning_rate": 7.88236236633114e-06, + "loss": 2.6929, + "step": 13165 + }, + { + "epoch": 1.122134151538396, + "grad_norm": 61.26277216011559, + "learning_rate": 7.881957186684268e-06, + "loss": 3.8944, + "step": 13166 + }, + { + "epoch": 1.1222193812324215, + "grad_norm": 35.65369244826238, + "learning_rate": 7.881551978694495e-06, + "loss": 2.8283, + "step": 13167 + }, + { + "epoch": 1.1223046109264467, + "grad_norm": 40.90266632178496, + "learning_rate": 7.8811467423658e-06, + "loss": 2.5246, + "step": 13168 + }, + { + "epoch": 1.1223898406204722, + "grad_norm": 96.79126710690271, + "learning_rate": 7.880741477702174e-06, + "loss": 2.8751, + "step": 13169 + }, + { + "epoch": 1.1224750703144977, + "grad_norm": 39.65281437508653, + "learning_rate": 7.8803361847076e-06, + "loss": 3.6051, + "step": 13170 + }, + { + "epoch": 1.122560300008523, + "grad_norm": 38.93614505066526, + "learning_rate": 7.879930863386064e-06, + "loss": 2.8651, + "step": 13171 + }, + { + "epoch": 1.1226455297025484, + "grad_norm": 46.238105507453184, + "learning_rate": 7.879525513741552e-06, + "loss": 3.3254, + "step": 13172 + }, + { + "epoch": 1.1227307593965739, + "grad_norm": 74.23430887326462, + "learning_rate": 7.879120135778052e-06, + "loss": 3.5076, + "step": 13173 + }, + { + "epoch": 1.122815989090599, + "grad_norm": 76.83995224674682, + "learning_rate": 7.878714729499548e-06, + "loss": 3.0856, + "step": 13174 + }, + { + "epoch": 1.1229012187846246, + "grad_norm": 76.72760895967531, + "learning_rate": 7.87830929491003e-06, + "loss": 4.6798, + "step": 13175 + }, + { + "epoch": 1.12298644847865, + "grad_norm": 39.66573066365389, + "learning_rate": 7.877903832013482e-06, + "loss": 3.0279, + "step": 13176 + }, + { + "epoch": 1.1230716781726753, + "grad_norm": 39.61952316720885, + "learning_rate": 7.877498340813894e-06, + "loss": 2.0326, + "step": 13177 + }, + { + "epoch": 1.1231569078667007, + "grad_norm": 46.53318394872048, + "learning_rate": 7.877092821315253e-06, + "loss": 3.3685, + "step": 13178 + }, + { + "epoch": 1.1232421375607262, + "grad_norm": 76.27082248873089, + "learning_rate": 7.876687273521548e-06, + "loss": 3.2193, + "step": 13179 + }, + { + "epoch": 1.1233273672547515, + "grad_norm": 29.764218714016174, + "learning_rate": 7.876281697436767e-06, + "loss": 2.036, + "step": 13180 + }, + { + "epoch": 1.123412596948777, + "grad_norm": 47.86025443661767, + "learning_rate": 7.875876093064899e-06, + "loss": 3.122, + "step": 13181 + }, + { + "epoch": 1.1234978266428024, + "grad_norm": 32.753955775329196, + "learning_rate": 7.875470460409931e-06, + "loss": 2.534, + "step": 13182 + }, + { + "epoch": 1.1235830563368276, + "grad_norm": 57.384864959003544, + "learning_rate": 7.875064799475854e-06, + "loss": 3.9458, + "step": 13183 + }, + { + "epoch": 1.1236682860308531, + "grad_norm": 37.309901057664305, + "learning_rate": 7.874659110266656e-06, + "loss": 3.1263, + "step": 13184 + }, + { + "epoch": 1.1237535157248786, + "grad_norm": 73.29964137638467, + "learning_rate": 7.87425339278633e-06, + "loss": 3.747, + "step": 13185 + }, + { + "epoch": 1.123838745418904, + "grad_norm": 98.0054687439915, + "learning_rate": 7.873847647038862e-06, + "loss": 4.9102, + "step": 13186 + }, + { + "epoch": 1.1239239751129293, + "grad_norm": 45.55928400479358, + "learning_rate": 7.873441873028246e-06, + "loss": 3.357, + "step": 13187 + }, + { + "epoch": 1.1240092048069548, + "grad_norm": 46.13013316229662, + "learning_rate": 7.87303607075847e-06, + "loss": 3.2565, + "step": 13188 + }, + { + "epoch": 1.1240944345009802, + "grad_norm": 42.104555656837384, + "learning_rate": 7.872630240233528e-06, + "loss": 2.7439, + "step": 13189 + }, + { + "epoch": 1.1241796641950055, + "grad_norm": 30.861803799456084, + "learning_rate": 7.872224381457405e-06, + "loss": 2.1329, + "step": 13190 + }, + { + "epoch": 1.124264893889031, + "grad_norm": 42.38612067503013, + "learning_rate": 7.871818494434101e-06, + "loss": 2.897, + "step": 13191 + }, + { + "epoch": 1.1243501235830564, + "grad_norm": 45.07250134242765, + "learning_rate": 7.8714125791676e-06, + "loss": 1.994, + "step": 13192 + }, + { + "epoch": 1.1244353532770817, + "grad_norm": 97.00963200268568, + "learning_rate": 7.871006635661897e-06, + "loss": 2.2971, + "step": 13193 + }, + { + "epoch": 1.1245205829711071, + "grad_norm": 48.86423265672883, + "learning_rate": 7.870600663920986e-06, + "loss": 3.2893, + "step": 13194 + }, + { + "epoch": 1.1246058126651326, + "grad_norm": 72.87768129149086, + "learning_rate": 7.87019466394886e-06, + "loss": 3.8593, + "step": 13195 + }, + { + "epoch": 1.1246910423591578, + "grad_norm": 71.17320995376494, + "learning_rate": 7.869788635749507e-06, + "loss": 2.4943, + "step": 13196 + }, + { + "epoch": 1.1247762720531833, + "grad_norm": 30.246327115911377, + "learning_rate": 7.869382579326926e-06, + "loss": 2.1494, + "step": 13197 + }, + { + "epoch": 1.1248615017472088, + "grad_norm": 44.924993576223066, + "learning_rate": 7.868976494685103e-06, + "loss": 2.9162, + "step": 13198 + }, + { + "epoch": 1.124946731441234, + "grad_norm": 76.18862019506972, + "learning_rate": 7.86857038182804e-06, + "loss": 3.6534, + "step": 13199 + }, + { + "epoch": 1.1250319611352595, + "grad_norm": 45.5748518743853, + "learning_rate": 7.868164240759725e-06, + "loss": 3.2248, + "step": 13200 + }, + { + "epoch": 1.125117190829285, + "grad_norm": 30.553676915206058, + "learning_rate": 7.867758071484156e-06, + "loss": 1.8813, + "step": 13201 + }, + { + "epoch": 1.1252024205233102, + "grad_norm": 65.53702702250196, + "learning_rate": 7.867351874005325e-06, + "loss": 3.8166, + "step": 13202 + }, + { + "epoch": 1.1252876502173357, + "grad_norm": 66.9893520547154, + "learning_rate": 7.86694564832723e-06, + "loss": 3.5917, + "step": 13203 + }, + { + "epoch": 1.1253728799113611, + "grad_norm": 50.8709844544745, + "learning_rate": 7.86653939445386e-06, + "loss": 2.9577, + "step": 13204 + }, + { + "epoch": 1.1254581096053866, + "grad_norm": 39.61253665964135, + "learning_rate": 7.866133112389217e-06, + "loss": 1.5581, + "step": 13205 + }, + { + "epoch": 1.1255433392994119, + "grad_norm": 28.41684986220385, + "learning_rate": 7.865726802137292e-06, + "loss": 2.1076, + "step": 13206 + }, + { + "epoch": 1.1256285689934373, + "grad_norm": 74.41295093612237, + "learning_rate": 7.865320463702084e-06, + "loss": 3.8157, + "step": 13207 + }, + { + "epoch": 1.1257137986874628, + "grad_norm": 33.08813862268011, + "learning_rate": 7.864914097087585e-06, + "loss": 2.4023, + "step": 13208 + }, + { + "epoch": 1.125799028381488, + "grad_norm": 57.59953206937913, + "learning_rate": 7.864507702297796e-06, + "loss": 2.2225, + "step": 13209 + }, + { + "epoch": 1.1258842580755135, + "grad_norm": 98.11326427721332, + "learning_rate": 7.864101279336715e-06, + "loss": 4.355, + "step": 13210 + }, + { + "epoch": 1.125969487769539, + "grad_norm": 84.80362309694816, + "learning_rate": 7.863694828208332e-06, + "loss": 2.3099, + "step": 13211 + }, + { + "epoch": 1.1260547174635642, + "grad_norm": 26.970202011788988, + "learning_rate": 7.86328834891665e-06, + "loss": 2.6502, + "step": 13212 + }, + { + "epoch": 1.1261399471575897, + "grad_norm": 40.24535405461996, + "learning_rate": 7.862881841465664e-06, + "loss": 2.3698, + "step": 13213 + }, + { + "epoch": 1.1262251768516152, + "grad_norm": 41.079870045287336, + "learning_rate": 7.862475305859374e-06, + "loss": 2.9126, + "step": 13214 + }, + { + "epoch": 1.1263104065456404, + "grad_norm": 66.7865103218153, + "learning_rate": 7.862068742101775e-06, + "loss": 2.6241, + "step": 13215 + }, + { + "epoch": 1.1263956362396659, + "grad_norm": 66.56432847323136, + "learning_rate": 7.861662150196869e-06, + "loss": 3.0032, + "step": 13216 + }, + { + "epoch": 1.1264808659336913, + "grad_norm": 32.32339271244482, + "learning_rate": 7.86125553014865e-06, + "loss": 2.7374, + "step": 13217 + }, + { + "epoch": 1.1265660956277168, + "grad_norm": 57.865575578691136, + "learning_rate": 7.860848881961122e-06, + "loss": 2.5956, + "step": 13218 + }, + { + "epoch": 1.126651325321742, + "grad_norm": 52.54342689580853, + "learning_rate": 7.860442205638283e-06, + "loss": 3.1918, + "step": 13219 + }, + { + "epoch": 1.1267365550157675, + "grad_norm": 43.76617576227144, + "learning_rate": 7.86003550118413e-06, + "loss": 3.4411, + "step": 13220 + }, + { + "epoch": 1.1268217847097928, + "grad_norm": 37.58709754311339, + "learning_rate": 7.859628768602665e-06, + "loss": 2.7887, + "step": 13221 + }, + { + "epoch": 1.1269070144038182, + "grad_norm": 39.27513973963801, + "learning_rate": 7.859222007897887e-06, + "loss": 3.1037, + "step": 13222 + }, + { + "epoch": 1.1269922440978437, + "grad_norm": 57.977497675308, + "learning_rate": 7.858815219073797e-06, + "loss": 3.2956, + "step": 13223 + }, + { + "epoch": 1.1270774737918692, + "grad_norm": 58.70041920579241, + "learning_rate": 7.858408402134393e-06, + "loss": 3.1559, + "step": 13224 + }, + { + "epoch": 1.1271627034858944, + "grad_norm": 44.253647249813746, + "learning_rate": 7.858001557083682e-06, + "loss": 3.8116, + "step": 13225 + }, + { + "epoch": 1.12724793317992, + "grad_norm": 61.50492701120066, + "learning_rate": 7.85759468392566e-06, + "loss": 3.5044, + "step": 13226 + }, + { + "epoch": 1.1273331628739454, + "grad_norm": 43.010574663413436, + "learning_rate": 7.857187782664328e-06, + "loss": 3.2148, + "step": 13227 + }, + { + "epoch": 1.1274183925679706, + "grad_norm": 57.895685646831836, + "learning_rate": 7.85678085330369e-06, + "loss": 2.9057, + "step": 13228 + }, + { + "epoch": 1.127503622261996, + "grad_norm": 103.48578635393342, + "learning_rate": 7.85637389584775e-06, + "loss": 3.9866, + "step": 13229 + }, + { + "epoch": 1.1275888519560215, + "grad_norm": 45.46604554372579, + "learning_rate": 7.855966910300507e-06, + "loss": 2.7917, + "step": 13230 + }, + { + "epoch": 1.1276740816500468, + "grad_norm": 28.029690826876614, + "learning_rate": 7.855559896665964e-06, + "loss": 1.9907, + "step": 13231 + }, + { + "epoch": 1.1277593113440723, + "grad_norm": 46.504537468010476, + "learning_rate": 7.855152854948122e-06, + "loss": 2.8099, + "step": 13232 + }, + { + "epoch": 1.1278445410380977, + "grad_norm": 42.0342929597926, + "learning_rate": 7.854745785150988e-06, + "loss": 2.8244, + "step": 13233 + }, + { + "epoch": 1.127929770732123, + "grad_norm": 128.8163021900645, + "learning_rate": 7.854338687278563e-06, + "loss": 3.5536, + "step": 13234 + }, + { + "epoch": 1.1280150004261484, + "grad_norm": 39.431448534045195, + "learning_rate": 7.85393156133485e-06, + "loss": 2.7507, + "step": 13235 + }, + { + "epoch": 1.128100230120174, + "grad_norm": 57.422962754052676, + "learning_rate": 7.853524407323858e-06, + "loss": 2.9463, + "step": 13236 + }, + { + "epoch": 1.1281854598141994, + "grad_norm": 68.65871204200974, + "learning_rate": 7.853117225249584e-06, + "loss": 2.343, + "step": 13237 + }, + { + "epoch": 1.1282706895082246, + "grad_norm": 60.42051976109034, + "learning_rate": 7.852710015116036e-06, + "loss": 3.4832, + "step": 13238 + }, + { + "epoch": 1.12835591920225, + "grad_norm": 24.703009849866582, + "learning_rate": 7.852302776927218e-06, + "loss": 2.6838, + "step": 13239 + }, + { + "epoch": 1.1284411488962756, + "grad_norm": 37.63886790401956, + "learning_rate": 7.851895510687138e-06, + "loss": 3.3341, + "step": 13240 + }, + { + "epoch": 1.1285263785903008, + "grad_norm": 46.27207815701591, + "learning_rate": 7.8514882163998e-06, + "loss": 3.2514, + "step": 13241 + }, + { + "epoch": 1.1286116082843263, + "grad_norm": 26.460742341113882, + "learning_rate": 7.851080894069206e-06, + "loss": 2.674, + "step": 13242 + }, + { + "epoch": 1.1286968379783517, + "grad_norm": 42.92880324869662, + "learning_rate": 7.850673543699363e-06, + "loss": 2.6909, + "step": 13243 + }, + { + "epoch": 1.128782067672377, + "grad_norm": 50.99487117716424, + "learning_rate": 7.85026616529428e-06, + "loss": 3.567, + "step": 13244 + }, + { + "epoch": 1.1288672973664025, + "grad_norm": 30.402529242622084, + "learning_rate": 7.849858758857962e-06, + "loss": 2.9569, + "step": 13245 + }, + { + "epoch": 1.128952527060428, + "grad_norm": 36.63883697107798, + "learning_rate": 7.849451324394414e-06, + "loss": 3.0672, + "step": 13246 + }, + { + "epoch": 1.1290377567544532, + "grad_norm": 35.10329123050427, + "learning_rate": 7.849043861907647e-06, + "loss": 2.5181, + "step": 13247 + }, + { + "epoch": 1.1291229864484786, + "grad_norm": 48.663042091134145, + "learning_rate": 7.848636371401665e-06, + "loss": 2.931, + "step": 13248 + }, + { + "epoch": 1.129208216142504, + "grad_norm": 29.977142007834807, + "learning_rate": 7.848228852880475e-06, + "loss": 3.2135, + "step": 13249 + }, + { + "epoch": 1.1292934458365294, + "grad_norm": 88.32171537975222, + "learning_rate": 7.847821306348087e-06, + "loss": 5.0095, + "step": 13250 + }, + { + "epoch": 1.1293786755305548, + "grad_norm": 38.906596052902785, + "learning_rate": 7.847413731808507e-06, + "loss": 2.7261, + "step": 13251 + }, + { + "epoch": 1.1294639052245803, + "grad_norm": 45.3464510193682, + "learning_rate": 7.847006129265747e-06, + "loss": 3.018, + "step": 13252 + }, + { + "epoch": 1.1295491349186055, + "grad_norm": 38.447673504433375, + "learning_rate": 7.84659849872381e-06, + "loss": 2.0319, + "step": 13253 + }, + { + "epoch": 1.129634364612631, + "grad_norm": 55.356183845409284, + "learning_rate": 7.846190840186709e-06, + "loss": 2.2692, + "step": 13254 + }, + { + "epoch": 1.1297195943066565, + "grad_norm": 28.042554827768555, + "learning_rate": 7.845783153658453e-06, + "loss": 2.472, + "step": 13255 + }, + { + "epoch": 1.129804824000682, + "grad_norm": 57.589307764749236, + "learning_rate": 7.845375439143048e-06, + "loss": 3.8716, + "step": 13256 + }, + { + "epoch": 1.1298900536947072, + "grad_norm": 37.53873875247089, + "learning_rate": 7.844967696644509e-06, + "loss": 2.9794, + "step": 13257 + }, + { + "epoch": 1.1299752833887327, + "grad_norm": 35.40150275986363, + "learning_rate": 7.844559926166843e-06, + "loss": 3.1195, + "step": 13258 + }, + { + "epoch": 1.1300605130827581, + "grad_norm": 35.51291658947302, + "learning_rate": 7.844152127714058e-06, + "loss": 3.5237, + "step": 13259 + }, + { + "epoch": 1.1301457427767834, + "grad_norm": 132.38470885361207, + "learning_rate": 7.843744301290171e-06, + "loss": 3.5361, + "step": 13260 + }, + { + "epoch": 1.1302309724708088, + "grad_norm": 32.210606600181045, + "learning_rate": 7.843336446899184e-06, + "loss": 1.2849, + "step": 13261 + }, + { + "epoch": 1.1303162021648343, + "grad_norm": 88.22445706450854, + "learning_rate": 7.842928564545115e-06, + "loss": 4.5133, + "step": 13262 + }, + { + "epoch": 1.1304014318588596, + "grad_norm": 55.89773431912981, + "learning_rate": 7.842520654231975e-06, + "loss": 3.5168, + "step": 13263 + }, + { + "epoch": 1.130486661552885, + "grad_norm": 96.68587616139574, + "learning_rate": 7.842112715963772e-06, + "loss": 5.031, + "step": 13264 + }, + { + "epoch": 1.1305718912469105, + "grad_norm": 40.36020244926477, + "learning_rate": 7.841704749744519e-06, + "loss": 3.3411, + "step": 13265 + }, + { + "epoch": 1.1306571209409357, + "grad_norm": 74.41712519583407, + "learning_rate": 7.84129675557823e-06, + "loss": 3.5675, + "step": 13266 + }, + { + "epoch": 1.1307423506349612, + "grad_norm": 81.80357815433271, + "learning_rate": 7.840888733468916e-06, + "loss": 3.7353, + "step": 13267 + }, + { + "epoch": 1.1308275803289867, + "grad_norm": 54.80625636297915, + "learning_rate": 7.840480683420591e-06, + "loss": 3.4022, + "step": 13268 + }, + { + "epoch": 1.1309128100230121, + "grad_norm": 52.77143558833041, + "learning_rate": 7.840072605437268e-06, + "loss": 2.6009, + "step": 13269 + }, + { + "epoch": 1.1309980397170374, + "grad_norm": 58.52302998095306, + "learning_rate": 7.839664499522958e-06, + "loss": 3.7467, + "step": 13270 + }, + { + "epoch": 1.1310832694110629, + "grad_norm": 55.196069251846815, + "learning_rate": 7.839256365681675e-06, + "loss": 3.2648, + "step": 13271 + }, + { + "epoch": 1.131168499105088, + "grad_norm": 47.94568765047889, + "learning_rate": 7.838848203917436e-06, + "loss": 2.4662, + "step": 13272 + }, + { + "epoch": 1.1312537287991136, + "grad_norm": 30.49320584254683, + "learning_rate": 7.838440014234252e-06, + "loss": 2.1205, + "step": 13273 + }, + { + "epoch": 1.131338958493139, + "grad_norm": 44.11856567156678, + "learning_rate": 7.838031796636139e-06, + "loss": 3.0766, + "step": 13274 + }, + { + "epoch": 1.1314241881871645, + "grad_norm": 28.432467879566566, + "learning_rate": 7.837623551127109e-06, + "loss": 2.171, + "step": 13275 + }, + { + "epoch": 1.1315094178811897, + "grad_norm": 58.02711922116504, + "learning_rate": 7.837215277711181e-06, + "loss": 2.5784, + "step": 13276 + }, + { + "epoch": 1.1315946475752152, + "grad_norm": 35.820280924485175, + "learning_rate": 7.836806976392369e-06, + "loss": 2.3316, + "step": 13277 + }, + { + "epoch": 1.1316798772692407, + "grad_norm": 31.466176569467514, + "learning_rate": 7.836398647174684e-06, + "loss": 3.0167, + "step": 13278 + }, + { + "epoch": 1.131765106963266, + "grad_norm": 60.32430463439753, + "learning_rate": 7.835990290062147e-06, + "loss": 2.8355, + "step": 13279 + }, + { + "epoch": 1.1318503366572914, + "grad_norm": 34.65577505955233, + "learning_rate": 7.835581905058773e-06, + "loss": 2.7933, + "step": 13280 + }, + { + "epoch": 1.1319355663513169, + "grad_norm": 25.175268246133385, + "learning_rate": 7.835173492168579e-06, + "loss": 1.8247, + "step": 13281 + }, + { + "epoch": 1.1320207960453421, + "grad_norm": 55.84852193080909, + "learning_rate": 7.834765051395578e-06, + "loss": 3.2115, + "step": 13282 + }, + { + "epoch": 1.1321060257393676, + "grad_norm": 65.32150943197443, + "learning_rate": 7.83435658274379e-06, + "loss": 3.1346, + "step": 13283 + }, + { + "epoch": 1.132191255433393, + "grad_norm": 61.30097628429188, + "learning_rate": 7.833948086217232e-06, + "loss": 2.7664, + "step": 13284 + }, + { + "epoch": 1.1322764851274183, + "grad_norm": 53.32042261678805, + "learning_rate": 7.83353956181992e-06, + "loss": 3.1166, + "step": 13285 + }, + { + "epoch": 1.1323617148214438, + "grad_norm": 41.51751053003765, + "learning_rate": 7.833131009555873e-06, + "loss": 2.1409, + "step": 13286 + }, + { + "epoch": 1.1324469445154692, + "grad_norm": 87.47318556714343, + "learning_rate": 7.832722429429106e-06, + "loss": 4.1472, + "step": 13287 + }, + { + "epoch": 1.1325321742094947, + "grad_norm": 41.87588622282004, + "learning_rate": 7.832313821443642e-06, + "loss": 3.2734, + "step": 13288 + }, + { + "epoch": 1.13261740390352, + "grad_norm": 47.45427707866683, + "learning_rate": 7.831905185603497e-06, + "loss": 2.6419, + "step": 13289 + }, + { + "epoch": 1.1327026335975454, + "grad_norm": 45.050115701635484, + "learning_rate": 7.831496521912688e-06, + "loss": 2.0756, + "step": 13290 + }, + { + "epoch": 1.1327878632915707, + "grad_norm": 47.39304625561108, + "learning_rate": 7.831087830375238e-06, + "loss": 3.7192, + "step": 13291 + }, + { + "epoch": 1.1328730929855961, + "grad_norm": 38.36551831514829, + "learning_rate": 7.830679110995162e-06, + "loss": 2.5664, + "step": 13292 + }, + { + "epoch": 1.1329583226796216, + "grad_norm": 87.6531020010791, + "learning_rate": 7.830270363776481e-06, + "loss": 2.6994, + "step": 13293 + }, + { + "epoch": 1.133043552373647, + "grad_norm": 50.29876615168946, + "learning_rate": 7.829861588723219e-06, + "loss": 3.3945, + "step": 13294 + }, + { + "epoch": 1.1331287820676723, + "grad_norm": 60.74946205052461, + "learning_rate": 7.829452785839391e-06, + "loss": 3.2101, + "step": 13295 + }, + { + "epoch": 1.1332140117616978, + "grad_norm": 51.59367057813232, + "learning_rate": 7.829043955129019e-06, + "loss": 2.8666, + "step": 13296 + }, + { + "epoch": 1.1332992414557232, + "grad_norm": 60.19488753522333, + "learning_rate": 7.828635096596124e-06, + "loss": 2.1451, + "step": 13297 + }, + { + "epoch": 1.1333844711497485, + "grad_norm": 56.555112317529925, + "learning_rate": 7.828226210244726e-06, + "loss": 2.6065, + "step": 13298 + }, + { + "epoch": 1.133469700843774, + "grad_norm": 33.678222543105285, + "learning_rate": 7.827817296078848e-06, + "loss": 1.9079, + "step": 13299 + }, + { + "epoch": 1.1335549305377994, + "grad_norm": 76.90156766882528, + "learning_rate": 7.82740835410251e-06, + "loss": 4.1794, + "step": 13300 + }, + { + "epoch": 1.1336401602318247, + "grad_norm": 39.61056681818338, + "learning_rate": 7.826999384319736e-06, + "loss": 2.499, + "step": 13301 + }, + { + "epoch": 1.1337253899258501, + "grad_norm": 41.73178940113657, + "learning_rate": 7.826590386734546e-06, + "loss": 2.9534, + "step": 13302 + }, + { + "epoch": 1.1338106196198756, + "grad_norm": 48.90061390971406, + "learning_rate": 7.826181361350962e-06, + "loss": 4.0431, + "step": 13303 + }, + { + "epoch": 1.1338958493139009, + "grad_norm": 101.11348089471245, + "learning_rate": 7.825772308173008e-06, + "loss": 3.332, + "step": 13304 + }, + { + "epoch": 1.1339810790079263, + "grad_norm": 28.925412209207316, + "learning_rate": 7.825363227204706e-06, + "loss": 2.5228, + "step": 13305 + }, + { + "epoch": 1.1340663087019518, + "grad_norm": 291.2522658398281, + "learning_rate": 7.824954118450076e-06, + "loss": 3.2117, + "step": 13306 + }, + { + "epoch": 1.1341515383959773, + "grad_norm": 97.43637702423689, + "learning_rate": 7.824544981913149e-06, + "loss": 4.3892, + "step": 13307 + }, + { + "epoch": 1.1342367680900025, + "grad_norm": 46.29891353036895, + "learning_rate": 7.824135817597943e-06, + "loss": 2.4049, + "step": 13308 + }, + { + "epoch": 1.134321997784028, + "grad_norm": 32.63867798801865, + "learning_rate": 7.823726625508484e-06, + "loss": 2.9049, + "step": 13309 + }, + { + "epoch": 1.1344072274780534, + "grad_norm": 63.85532896009824, + "learning_rate": 7.823317405648795e-06, + "loss": 2.5335, + "step": 13310 + }, + { + "epoch": 1.1344924571720787, + "grad_norm": 36.89075610853161, + "learning_rate": 7.822908158022901e-06, + "loss": 1.7064, + "step": 13311 + }, + { + "epoch": 1.1345776868661042, + "grad_norm": 32.98573634312098, + "learning_rate": 7.822498882634828e-06, + "loss": 2.4094, + "step": 13312 + }, + { + "epoch": 1.1346629165601296, + "grad_norm": 31.40551591756258, + "learning_rate": 7.822089579488598e-06, + "loss": 2.1154, + "step": 13313 + }, + { + "epoch": 1.1347481462541549, + "grad_norm": 55.17154121011631, + "learning_rate": 7.821680248588241e-06, + "loss": 2.9026, + "step": 13314 + }, + { + "epoch": 1.1348333759481803, + "grad_norm": 104.72839119323004, + "learning_rate": 7.821270889937777e-06, + "loss": 2.6578, + "step": 13315 + }, + { + "epoch": 1.1349186056422058, + "grad_norm": 34.95326302899744, + "learning_rate": 7.820861503541238e-06, + "loss": 3.3078, + "step": 13316 + }, + { + "epoch": 1.135003835336231, + "grad_norm": 45.314009984102114, + "learning_rate": 7.820452089402647e-06, + "loss": 2.0166, + "step": 13317 + }, + { + "epoch": 1.1350890650302565, + "grad_norm": 47.086775058010986, + "learning_rate": 7.820042647526027e-06, + "loss": 2.6256, + "step": 13318 + }, + { + "epoch": 1.135174294724282, + "grad_norm": 45.30411576540022, + "learning_rate": 7.81963317791541e-06, + "loss": 3.0065, + "step": 13319 + }, + { + "epoch": 1.1352595244183072, + "grad_norm": 64.86585768777714, + "learning_rate": 7.819223680574823e-06, + "loss": 2.8683, + "step": 13320 + }, + { + "epoch": 1.1353447541123327, + "grad_norm": 35.424078646676136, + "learning_rate": 7.818814155508289e-06, + "loss": 2.8541, + "step": 13321 + }, + { + "epoch": 1.1354299838063582, + "grad_norm": 52.37252370836923, + "learning_rate": 7.818404602719838e-06, + "loss": 2.5941, + "step": 13322 + }, + { + "epoch": 1.1355152135003834, + "grad_norm": 74.5104634578506, + "learning_rate": 7.817995022213498e-06, + "loss": 2.8738, + "step": 13323 + }, + { + "epoch": 1.135600443194409, + "grad_norm": 29.3006415897624, + "learning_rate": 7.817585413993298e-06, + "loss": 2.8176, + "step": 13324 + }, + { + "epoch": 1.1356856728884344, + "grad_norm": 80.8164897185801, + "learning_rate": 7.817175778063263e-06, + "loss": 3.1043, + "step": 13325 + }, + { + "epoch": 1.1357709025824598, + "grad_norm": 55.16793233097143, + "learning_rate": 7.816766114427425e-06, + "loss": 3.1414, + "step": 13326 + }, + { + "epoch": 1.135856132276485, + "grad_norm": 34.16974326671341, + "learning_rate": 7.816356423089811e-06, + "loss": 1.8258, + "step": 13327 + }, + { + "epoch": 1.1359413619705105, + "grad_norm": 36.03684927623122, + "learning_rate": 7.815946704054452e-06, + "loss": 3.2803, + "step": 13328 + }, + { + "epoch": 1.136026591664536, + "grad_norm": 62.370963618939605, + "learning_rate": 7.815536957325373e-06, + "loss": 3.724, + "step": 13329 + }, + { + "epoch": 1.1361118213585613, + "grad_norm": 34.38603019345097, + "learning_rate": 7.81512718290661e-06, + "loss": 2.3124, + "step": 13330 + }, + { + "epoch": 1.1361970510525867, + "grad_norm": 36.299332093936314, + "learning_rate": 7.814717380802187e-06, + "loss": 2.3433, + "step": 13331 + }, + { + "epoch": 1.1362822807466122, + "grad_norm": 23.912453149300884, + "learning_rate": 7.81430755101614e-06, + "loss": 2.0605, + "step": 13332 + }, + { + "epoch": 1.1363675104406374, + "grad_norm": 31.333986327940405, + "learning_rate": 7.813897693552495e-06, + "loss": 3.1654, + "step": 13333 + }, + { + "epoch": 1.136452740134663, + "grad_norm": 51.78854986017469, + "learning_rate": 7.813487808415284e-06, + "loss": 2.9242, + "step": 13334 + }, + { + "epoch": 1.1365379698286884, + "grad_norm": 114.18934760253997, + "learning_rate": 7.813077895608539e-06, + "loss": 5.7142, + "step": 13335 + }, + { + "epoch": 1.1366231995227136, + "grad_norm": 60.416986613029145, + "learning_rate": 7.81266795513629e-06, + "loss": 3.2084, + "step": 13336 + }, + { + "epoch": 1.136708429216739, + "grad_norm": 119.85057686997563, + "learning_rate": 7.81225798700257e-06, + "loss": 2.6827, + "step": 13337 + }, + { + "epoch": 1.1367936589107646, + "grad_norm": 37.18503697904206, + "learning_rate": 7.81184799121141e-06, + "loss": 2.7699, + "step": 13338 + }, + { + "epoch": 1.13687888860479, + "grad_norm": 45.28064364760504, + "learning_rate": 7.811437967766842e-06, + "loss": 2.7024, + "step": 13339 + }, + { + "epoch": 1.1369641182988153, + "grad_norm": 210.1310053218074, + "learning_rate": 7.8110279166729e-06, + "loss": 2.6946, + "step": 13340 + }, + { + "epoch": 1.1370493479928407, + "grad_norm": 31.755319455734003, + "learning_rate": 7.810617837933614e-06, + "loss": 2.9103, + "step": 13341 + }, + { + "epoch": 1.137134577686866, + "grad_norm": 35.52100069225313, + "learning_rate": 7.81020773155302e-06, + "loss": 2.749, + "step": 13342 + }, + { + "epoch": 1.1372198073808915, + "grad_norm": 47.74769632020591, + "learning_rate": 7.809797597535147e-06, + "loss": 2.716, + "step": 13343 + }, + { + "epoch": 1.137305037074917, + "grad_norm": 76.61871296466886, + "learning_rate": 7.809387435884033e-06, + "loss": 3.675, + "step": 13344 + }, + { + "epoch": 1.1373902667689424, + "grad_norm": 36.474454995238645, + "learning_rate": 7.80897724660371e-06, + "loss": 2.8301, + "step": 13345 + }, + { + "epoch": 1.1374754964629676, + "grad_norm": 46.11425854464556, + "learning_rate": 7.808567029698212e-06, + "loss": 3.305, + "step": 13346 + }, + { + "epoch": 1.137560726156993, + "grad_norm": 53.14276846844173, + "learning_rate": 7.808156785171572e-06, + "loss": 2.8096, + "step": 13347 + }, + { + "epoch": 1.1376459558510186, + "grad_norm": 74.19688307840522, + "learning_rate": 7.807746513027828e-06, + "loss": 3.6408, + "step": 13348 + }, + { + "epoch": 1.1377311855450438, + "grad_norm": 58.425077782342626, + "learning_rate": 7.80733621327101e-06, + "loss": 2.6054, + "step": 13349 + }, + { + "epoch": 1.1378164152390693, + "grad_norm": 39.24733507398272, + "learning_rate": 7.806925885905156e-06, + "loss": 3.241, + "step": 13350 + }, + { + "epoch": 1.1379016449330948, + "grad_norm": 49.35670066474755, + "learning_rate": 7.806515530934304e-06, + "loss": 3.9039, + "step": 13351 + }, + { + "epoch": 1.13798687462712, + "grad_norm": 97.91557035751354, + "learning_rate": 7.806105148362486e-06, + "loss": 3.3063, + "step": 13352 + }, + { + "epoch": 1.1380721043211455, + "grad_norm": 24.36900954077425, + "learning_rate": 7.805694738193737e-06, + "loss": 2.6401, + "step": 13353 + }, + { + "epoch": 1.138157334015171, + "grad_norm": 54.34083216262276, + "learning_rate": 7.805284300432096e-06, + "loss": 3.0907, + "step": 13354 + }, + { + "epoch": 1.1382425637091962, + "grad_norm": 71.71754066182845, + "learning_rate": 7.8048738350816e-06, + "loss": 3.1739, + "step": 13355 + }, + { + "epoch": 1.1383277934032217, + "grad_norm": 98.0729193894241, + "learning_rate": 7.804463342146284e-06, + "loss": 3.3929, + "step": 13356 + }, + { + "epoch": 1.1384130230972471, + "grad_norm": 33.127336452668615, + "learning_rate": 7.804052821630184e-06, + "loss": 2.9242, + "step": 13357 + }, + { + "epoch": 1.1384982527912726, + "grad_norm": 112.79107285028705, + "learning_rate": 7.80364227353734e-06, + "loss": 3.928, + "step": 13358 + }, + { + "epoch": 1.1385834824852978, + "grad_norm": 87.73160144095408, + "learning_rate": 7.803231697871789e-06, + "loss": 4.3265, + "step": 13359 + }, + { + "epoch": 1.1386687121793233, + "grad_norm": 60.16691083418668, + "learning_rate": 7.802821094637567e-06, + "loss": 3.069, + "step": 13360 + }, + { + "epoch": 1.1387539418733486, + "grad_norm": 74.14744918753433, + "learning_rate": 7.802410463838712e-06, + "loss": 3.3403, + "step": 13361 + }, + { + "epoch": 1.138839171567374, + "grad_norm": 30.464189084397443, + "learning_rate": 7.801999805479266e-06, + "loss": 2.327, + "step": 13362 + }, + { + "epoch": 1.1389244012613995, + "grad_norm": 68.83005282563536, + "learning_rate": 7.801589119563264e-06, + "loss": 2.5785, + "step": 13363 + }, + { + "epoch": 1.139009630955425, + "grad_norm": 76.31000275015627, + "learning_rate": 7.801178406094746e-06, + "loss": 3.1681, + "step": 13364 + }, + { + "epoch": 1.1390948606494502, + "grad_norm": 29.95549785074877, + "learning_rate": 7.800767665077752e-06, + "loss": 2.7828, + "step": 13365 + }, + { + "epoch": 1.1391800903434757, + "grad_norm": 81.71157263441431, + "learning_rate": 7.800356896516321e-06, + "loss": 2.8482, + "step": 13366 + }, + { + "epoch": 1.1392653200375011, + "grad_norm": 45.5729146753598, + "learning_rate": 7.799946100414493e-06, + "loss": 3.1774, + "step": 13367 + }, + { + "epoch": 1.1393505497315264, + "grad_norm": 43.24940575549171, + "learning_rate": 7.799535276776306e-06, + "loss": 2.9629, + "step": 13368 + }, + { + "epoch": 1.1394357794255519, + "grad_norm": 43.972013779782344, + "learning_rate": 7.799124425605804e-06, + "loss": 3.1538, + "step": 13369 + }, + { + "epoch": 1.1395210091195773, + "grad_norm": 59.5882332714499, + "learning_rate": 7.798713546907025e-06, + "loss": 2.7032, + "step": 13370 + }, + { + "epoch": 1.1396062388136026, + "grad_norm": 81.48806917658807, + "learning_rate": 7.79830264068401e-06, + "loss": 5.299, + "step": 13371 + }, + { + "epoch": 1.139691468507628, + "grad_norm": 44.91929431769982, + "learning_rate": 7.797891706940802e-06, + "loss": 3.1546, + "step": 13372 + }, + { + "epoch": 1.1397766982016535, + "grad_norm": 92.81078183929814, + "learning_rate": 7.797480745681438e-06, + "loss": 3.1127, + "step": 13373 + }, + { + "epoch": 1.1398619278956788, + "grad_norm": 36.98325502798998, + "learning_rate": 7.797069756909965e-06, + "loss": 3.6049, + "step": 13374 + }, + { + "epoch": 1.1399471575897042, + "grad_norm": 31.5906082966299, + "learning_rate": 7.796658740630421e-06, + "loss": 2.3132, + "step": 13375 + }, + { + "epoch": 1.1400323872837297, + "grad_norm": 48.93112163235855, + "learning_rate": 7.79624769684685e-06, + "loss": 2.2478, + "step": 13376 + }, + { + "epoch": 1.1401176169777552, + "grad_norm": 34.86452996790181, + "learning_rate": 7.795836625563294e-06, + "loss": 3.4052, + "step": 13377 + }, + { + "epoch": 1.1402028466717804, + "grad_norm": 106.90232657968454, + "learning_rate": 7.795425526783795e-06, + "loss": 2.9481, + "step": 13378 + }, + { + "epoch": 1.1402880763658059, + "grad_norm": 36.219685675691856, + "learning_rate": 7.795014400512398e-06, + "loss": 2.7205, + "step": 13379 + }, + { + "epoch": 1.1403733060598313, + "grad_norm": 32.34406353838518, + "learning_rate": 7.794603246753145e-06, + "loss": 2.3247, + "step": 13380 + }, + { + "epoch": 1.1404585357538566, + "grad_norm": 45.491095231821475, + "learning_rate": 7.794192065510077e-06, + "loss": 2.3453, + "step": 13381 + }, + { + "epoch": 1.140543765447882, + "grad_norm": 42.76102748163797, + "learning_rate": 7.793780856787243e-06, + "loss": 2.9806, + "step": 13382 + }, + { + "epoch": 1.1406289951419075, + "grad_norm": 45.4235452580448, + "learning_rate": 7.793369620588685e-06, + "loss": 3.1341, + "step": 13383 + }, + { + "epoch": 1.1407142248359328, + "grad_norm": 33.525442618497124, + "learning_rate": 7.792958356918445e-06, + "loss": 2.697, + "step": 13384 + }, + { + "epoch": 1.1407994545299582, + "grad_norm": 113.71887663069148, + "learning_rate": 7.792547065780569e-06, + "loss": 3.6802, + "step": 13385 + }, + { + "epoch": 1.1408846842239837, + "grad_norm": 74.1209278965644, + "learning_rate": 7.792135747179103e-06, + "loss": 3.239, + "step": 13386 + }, + { + "epoch": 1.140969913918009, + "grad_norm": 93.06393411193355, + "learning_rate": 7.791724401118093e-06, + "loss": 3.6636, + "step": 13387 + }, + { + "epoch": 1.1410551436120344, + "grad_norm": 36.45082206189806, + "learning_rate": 7.79131302760158e-06, + "loss": 2.6447, + "step": 13388 + }, + { + "epoch": 1.1411403733060599, + "grad_norm": 38.794458745800945, + "learning_rate": 7.790901626633615e-06, + "loss": 2.6057, + "step": 13389 + }, + { + "epoch": 1.1412256030000854, + "grad_norm": 58.873452016305066, + "learning_rate": 7.79049019821824e-06, + "loss": 2.7577, + "step": 13390 + }, + { + "epoch": 1.1413108326941106, + "grad_norm": 51.18991002601719, + "learning_rate": 7.790078742359504e-06, + "loss": 3.1403, + "step": 13391 + }, + { + "epoch": 1.141396062388136, + "grad_norm": 67.48036762071008, + "learning_rate": 7.789667259061451e-06, + "loss": 2.4049, + "step": 13392 + }, + { + "epoch": 1.1414812920821613, + "grad_norm": 29.941306040301104, + "learning_rate": 7.789255748328129e-06, + "loss": 2.9162, + "step": 13393 + }, + { + "epoch": 1.1415665217761868, + "grad_norm": 38.85703405452143, + "learning_rate": 7.788844210163585e-06, + "loss": 2.1521, + "step": 13394 + }, + { + "epoch": 1.1416517514702123, + "grad_norm": 34.93139530873503, + "learning_rate": 7.788432644571867e-06, + "loss": 2.4181, + "step": 13395 + }, + { + "epoch": 1.1417369811642377, + "grad_norm": 74.02103061958533, + "learning_rate": 7.788021051557022e-06, + "loss": 5.1592, + "step": 13396 + }, + { + "epoch": 1.141822210858263, + "grad_norm": 67.00581348376019, + "learning_rate": 7.787609431123098e-06, + "loss": 2.8393, + "step": 13397 + }, + { + "epoch": 1.1419074405522884, + "grad_norm": 42.6597286911536, + "learning_rate": 7.78719778327414e-06, + "loss": 3.8299, + "step": 13398 + }, + { + "epoch": 1.141992670246314, + "grad_norm": 72.61532441217356, + "learning_rate": 7.786786108014204e-06, + "loss": 3.2062, + "step": 13399 + }, + { + "epoch": 1.1420778999403391, + "grad_norm": 35.02736306641202, + "learning_rate": 7.786374405347332e-06, + "loss": 3.6372, + "step": 13400 + }, + { + "epoch": 1.1421631296343646, + "grad_norm": 34.98486782771247, + "learning_rate": 7.785962675277573e-06, + "loss": 2.9902, + "step": 13401 + }, + { + "epoch": 1.14224835932839, + "grad_norm": 38.44521235865142, + "learning_rate": 7.785550917808981e-06, + "loss": 3.0568, + "step": 13402 + }, + { + "epoch": 1.1423335890224153, + "grad_norm": 32.99858492720466, + "learning_rate": 7.785139132945602e-06, + "loss": 3.289, + "step": 13403 + }, + { + "epoch": 1.1424188187164408, + "grad_norm": 50.609055936544074, + "learning_rate": 7.784727320691486e-06, + "loss": 2.3911, + "step": 13404 + }, + { + "epoch": 1.1425040484104663, + "grad_norm": 33.037480140698406, + "learning_rate": 7.784315481050682e-06, + "loss": 1.8016, + "step": 13405 + }, + { + "epoch": 1.1425892781044915, + "grad_norm": 52.13216407553707, + "learning_rate": 7.783903614027245e-06, + "loss": 3.671, + "step": 13406 + }, + { + "epoch": 1.142674507798517, + "grad_norm": 46.40364875365664, + "learning_rate": 7.783491719625217e-06, + "loss": 3.0066, + "step": 13407 + }, + { + "epoch": 1.1427597374925424, + "grad_norm": 43.37238295412219, + "learning_rate": 7.783079797848659e-06, + "loss": 3.249, + "step": 13408 + }, + { + "epoch": 1.142844967186568, + "grad_norm": 31.669569375993422, + "learning_rate": 7.782667848701615e-06, + "loss": 2.7727, + "step": 13409 + }, + { + "epoch": 1.1429301968805932, + "grad_norm": 43.653320947044996, + "learning_rate": 7.782255872188139e-06, + "loss": 2.6508, + "step": 13410 + }, + { + "epoch": 1.1430154265746186, + "grad_norm": 49.47259205392835, + "learning_rate": 7.78184386831228e-06, + "loss": 2.6018, + "step": 13411 + }, + { + "epoch": 1.1431006562686439, + "grad_norm": 32.01976500868112, + "learning_rate": 7.781431837078092e-06, + "loss": 2.2083, + "step": 13412 + }, + { + "epoch": 1.1431858859626693, + "grad_norm": 47.24744583467288, + "learning_rate": 7.78101977848963e-06, + "loss": 2.5293, + "step": 13413 + }, + { + "epoch": 1.1432711156566948, + "grad_norm": 35.77674370549759, + "learning_rate": 7.780607692550942e-06, + "loss": 2.3833, + "step": 13414 + }, + { + "epoch": 1.1433563453507203, + "grad_norm": 54.93847047973297, + "learning_rate": 7.780195579266082e-06, + "loss": 2.4095, + "step": 13415 + }, + { + "epoch": 1.1434415750447455, + "grad_norm": 32.09927491540862, + "learning_rate": 7.779783438639103e-06, + "loss": 2.6203, + "step": 13416 + }, + { + "epoch": 1.143526804738771, + "grad_norm": 42.77170175200973, + "learning_rate": 7.779371270674058e-06, + "loss": 3.0738, + "step": 13417 + }, + { + "epoch": 1.1436120344327965, + "grad_norm": 136.04032685438074, + "learning_rate": 7.778959075375003e-06, + "loss": 4.9467, + "step": 13418 + }, + { + "epoch": 1.1436972641268217, + "grad_norm": 36.32317465072334, + "learning_rate": 7.778546852745989e-06, + "loss": 2.6372, + "step": 13419 + }, + { + "epoch": 1.1437824938208472, + "grad_norm": 54.7609292361159, + "learning_rate": 7.77813460279107e-06, + "loss": 3.1823, + "step": 13420 + }, + { + "epoch": 1.1438677235148726, + "grad_norm": 64.7578368934345, + "learning_rate": 7.777722325514303e-06, + "loss": 3.5843, + "step": 13421 + }, + { + "epoch": 1.143952953208898, + "grad_norm": 58.28351005031225, + "learning_rate": 7.777310020919738e-06, + "loss": 2.41, + "step": 13422 + }, + { + "epoch": 1.1440381829029234, + "grad_norm": 72.65776561507059, + "learning_rate": 7.776897689011435e-06, + "loss": 3.8982, + "step": 13423 + }, + { + "epoch": 1.1441234125969488, + "grad_norm": 34.32819207961493, + "learning_rate": 7.776485329793445e-06, + "loss": 2.9751, + "step": 13424 + }, + { + "epoch": 1.144208642290974, + "grad_norm": 40.876786911887265, + "learning_rate": 7.776072943269825e-06, + "loss": 3.3574, + "step": 13425 + }, + { + "epoch": 1.1442938719849995, + "grad_norm": 60.23363713383265, + "learning_rate": 7.775660529444632e-06, + "loss": 4.3776, + "step": 13426 + }, + { + "epoch": 1.144379101679025, + "grad_norm": 39.473533331529424, + "learning_rate": 7.775248088321919e-06, + "loss": 2.5679, + "step": 13427 + }, + { + "epoch": 1.1444643313730505, + "grad_norm": 37.07719378780631, + "learning_rate": 7.774835619905744e-06, + "loss": 3.1881, + "step": 13428 + }, + { + "epoch": 1.1445495610670757, + "grad_norm": 93.60344962429951, + "learning_rate": 7.774423124200166e-06, + "loss": 3.6807, + "step": 13429 + }, + { + "epoch": 1.1446347907611012, + "grad_norm": 39.737175068636944, + "learning_rate": 7.774010601209237e-06, + "loss": 3.0562, + "step": 13430 + }, + { + "epoch": 1.1447200204551264, + "grad_norm": 42.5333777949814, + "learning_rate": 7.773598050937015e-06, + "loss": 3.1158, + "step": 13431 + }, + { + "epoch": 1.144805250149152, + "grad_norm": 67.91700058751009, + "learning_rate": 7.773185473387559e-06, + "loss": 3.0588, + "step": 13432 + }, + { + "epoch": 1.1448904798431774, + "grad_norm": 78.75196680808683, + "learning_rate": 7.772772868564926e-06, + "loss": 2.6997, + "step": 13433 + }, + { + "epoch": 1.1449757095372028, + "grad_norm": 67.12112376124433, + "learning_rate": 7.772360236473175e-06, + "loss": 2.8733, + "step": 13434 + }, + { + "epoch": 1.145060939231228, + "grad_norm": 42.91791902244971, + "learning_rate": 7.77194757711636e-06, + "loss": 3.7446, + "step": 13435 + }, + { + "epoch": 1.1451461689252536, + "grad_norm": 40.75773970092485, + "learning_rate": 7.771534890498544e-06, + "loss": 3.0922, + "step": 13436 + }, + { + "epoch": 1.145231398619279, + "grad_norm": 32.75700945185721, + "learning_rate": 7.771122176623784e-06, + "loss": 2.3494, + "step": 13437 + }, + { + "epoch": 1.1453166283133043, + "grad_norm": 29.429375641611834, + "learning_rate": 7.770709435496138e-06, + "loss": 2.3463, + "step": 13438 + }, + { + "epoch": 1.1454018580073297, + "grad_norm": 31.53850291350907, + "learning_rate": 7.770296667119664e-06, + "loss": 2.7957, + "step": 13439 + }, + { + "epoch": 1.1454870877013552, + "grad_norm": 57.48229437726765, + "learning_rate": 7.769883871498427e-06, + "loss": 2.3252, + "step": 13440 + }, + { + "epoch": 1.1455723173953805, + "grad_norm": 30.972319340007555, + "learning_rate": 7.769471048636479e-06, + "loss": 2.5934, + "step": 13441 + }, + { + "epoch": 1.145657547089406, + "grad_norm": 69.16705957080158, + "learning_rate": 7.769058198537884e-06, + "loss": 3.1667, + "step": 13442 + }, + { + "epoch": 1.1457427767834314, + "grad_norm": 41.45746739275768, + "learning_rate": 7.768645321206703e-06, + "loss": 3.3086, + "step": 13443 + }, + { + "epoch": 1.1458280064774566, + "grad_norm": 73.28359959765099, + "learning_rate": 7.768232416646996e-06, + "loss": 3.294, + "step": 13444 + }, + { + "epoch": 1.145913236171482, + "grad_norm": 113.78870430560093, + "learning_rate": 7.767819484862822e-06, + "loss": 3.2523, + "step": 13445 + }, + { + "epoch": 1.1459984658655076, + "grad_norm": 25.41166863520208, + "learning_rate": 7.767406525858244e-06, + "loss": 2.3277, + "step": 13446 + }, + { + "epoch": 1.146083695559533, + "grad_norm": 103.59795972630546, + "learning_rate": 7.766993539637323e-06, + "loss": 3.8632, + "step": 13447 + }, + { + "epoch": 1.1461689252535583, + "grad_norm": 53.609009242544566, + "learning_rate": 7.766580526204119e-06, + "loss": 3.4808, + "step": 13448 + }, + { + "epoch": 1.1462541549475838, + "grad_norm": 33.56222071242145, + "learning_rate": 7.766167485562694e-06, + "loss": 2.7008, + "step": 13449 + }, + { + "epoch": 1.1463393846416092, + "grad_norm": 32.74141391643154, + "learning_rate": 7.765754417717114e-06, + "loss": 2.3756, + "step": 13450 + }, + { + "epoch": 1.1464246143356345, + "grad_norm": 39.352299643982455, + "learning_rate": 7.765341322671435e-06, + "loss": 2.8352, + "step": 13451 + }, + { + "epoch": 1.14650984402966, + "grad_norm": 40.12809437607833, + "learning_rate": 7.764928200429724e-06, + "loss": 3.1682, + "step": 13452 + }, + { + "epoch": 1.1465950737236854, + "grad_norm": 49.89164323437399, + "learning_rate": 7.764515050996043e-06, + "loss": 2.1055, + "step": 13453 + }, + { + "epoch": 1.1466803034177107, + "grad_norm": 32.187612681008126, + "learning_rate": 7.764101874374454e-06, + "loss": 2.9669, + "step": 13454 + }, + { + "epoch": 1.1467655331117361, + "grad_norm": 82.78169342829513, + "learning_rate": 7.763688670569024e-06, + "loss": 2.7985, + "step": 13455 + }, + { + "epoch": 1.1468507628057616, + "grad_norm": 39.464317126595645, + "learning_rate": 7.763275439583812e-06, + "loss": 3.0537, + "step": 13456 + }, + { + "epoch": 1.1469359924997868, + "grad_norm": 117.22739503385445, + "learning_rate": 7.762862181422884e-06, + "loss": 4.2044, + "step": 13457 + }, + { + "epoch": 1.1470212221938123, + "grad_norm": 62.075424563831845, + "learning_rate": 7.762448896090304e-06, + "loss": 3.8048, + "step": 13458 + }, + { + "epoch": 1.1471064518878378, + "grad_norm": 27.958569452627916, + "learning_rate": 7.76203558359014e-06, + "loss": 2.2126, + "step": 13459 + }, + { + "epoch": 1.1471916815818632, + "grad_norm": 65.50406237821704, + "learning_rate": 7.76162224392645e-06, + "loss": 2.9028, + "step": 13460 + }, + { + "epoch": 1.1472769112758885, + "grad_norm": 39.83321387895369, + "learning_rate": 7.761208877103303e-06, + "loss": 2.8589, + "step": 13461 + }, + { + "epoch": 1.147362140969914, + "grad_norm": 31.731100012981663, + "learning_rate": 7.760795483124766e-06, + "loss": 2.1064, + "step": 13462 + }, + { + "epoch": 1.1474473706639392, + "grad_norm": 59.046761723825064, + "learning_rate": 7.7603820619949e-06, + "loss": 4.2143, + "step": 13463 + }, + { + "epoch": 1.1475326003579647, + "grad_norm": 37.7139150897471, + "learning_rate": 7.759968613717776e-06, + "loss": 2.8331, + "step": 13464 + }, + { + "epoch": 1.1476178300519901, + "grad_norm": 95.77701424327256, + "learning_rate": 7.759555138297456e-06, + "loss": 3.1809, + "step": 13465 + }, + { + "epoch": 1.1477030597460156, + "grad_norm": 36.81204953366339, + "learning_rate": 7.759141635738007e-06, + "loss": 2.7717, + "step": 13466 + }, + { + "epoch": 1.1477882894400409, + "grad_norm": 37.65572522506195, + "learning_rate": 7.758728106043498e-06, + "loss": 2.6219, + "step": 13467 + }, + { + "epoch": 1.1478735191340663, + "grad_norm": 61.13445509308519, + "learning_rate": 7.758314549217992e-06, + "loss": 3.4457, + "step": 13468 + }, + { + "epoch": 1.1479587488280918, + "grad_norm": 35.876464871093305, + "learning_rate": 7.75790096526556e-06, + "loss": 3.0149, + "step": 13469 + }, + { + "epoch": 1.148043978522117, + "grad_norm": 41.01215756601422, + "learning_rate": 7.757487354190267e-06, + "loss": 3.283, + "step": 13470 + }, + { + "epoch": 1.1481292082161425, + "grad_norm": 28.20017809913763, + "learning_rate": 7.757073715996183e-06, + "loss": 1.8213, + "step": 13471 + }, + { + "epoch": 1.148214437910168, + "grad_norm": 27.319202225895772, + "learning_rate": 7.756660050687374e-06, + "loss": 2.1187, + "step": 13472 + }, + { + "epoch": 1.1482996676041932, + "grad_norm": 31.821567474132852, + "learning_rate": 7.756246358267907e-06, + "loss": 2.5644, + "step": 13473 + }, + { + "epoch": 1.1483848972982187, + "grad_norm": 71.04876893464704, + "learning_rate": 7.755832638741856e-06, + "loss": 2.916, + "step": 13474 + }, + { + "epoch": 1.1484701269922442, + "grad_norm": 60.7080623157111, + "learning_rate": 7.755418892113282e-06, + "loss": 3.4732, + "step": 13475 + }, + { + "epoch": 1.1485553566862694, + "grad_norm": 46.73262586143576, + "learning_rate": 7.755005118386261e-06, + "loss": 2.2522, + "step": 13476 + }, + { + "epoch": 1.1486405863802949, + "grad_norm": 31.72556882439679, + "learning_rate": 7.754591317564858e-06, + "loss": 3.0962, + "step": 13477 + }, + { + "epoch": 1.1487258160743203, + "grad_norm": 38.23598804490431, + "learning_rate": 7.754177489653145e-06, + "loss": 2.7329, + "step": 13478 + }, + { + "epoch": 1.1488110457683458, + "grad_norm": 60.81609848892615, + "learning_rate": 7.753763634655191e-06, + "loss": 1.7706, + "step": 13479 + }, + { + "epoch": 1.148896275462371, + "grad_norm": 63.09508816943104, + "learning_rate": 7.753349752575063e-06, + "loss": 3.2226, + "step": 13480 + }, + { + "epoch": 1.1489815051563965, + "grad_norm": 63.76618396567055, + "learning_rate": 7.752935843416837e-06, + "loss": 3.1912, + "step": 13481 + }, + { + "epoch": 1.1490667348504218, + "grad_norm": 35.80651870554892, + "learning_rate": 7.75252190718458e-06, + "loss": 3.2358, + "step": 13482 + }, + { + "epoch": 1.1491519645444472, + "grad_norm": 71.04241861644317, + "learning_rate": 7.752107943882365e-06, + "loss": 3.2156, + "step": 13483 + }, + { + "epoch": 1.1492371942384727, + "grad_norm": 58.24751774800761, + "learning_rate": 7.75169395351426e-06, + "loss": 4.0604, + "step": 13484 + }, + { + "epoch": 1.1493224239324982, + "grad_norm": 23.20320824356415, + "learning_rate": 7.75127993608434e-06, + "loss": 1.7336, + "step": 13485 + }, + { + "epoch": 1.1494076536265234, + "grad_norm": 46.26879114723326, + "learning_rate": 7.750865891596674e-06, + "loss": 2.3504, + "step": 13486 + }, + { + "epoch": 1.1494928833205489, + "grad_norm": 38.76640958400296, + "learning_rate": 7.750451820055337e-06, + "loss": 2.721, + "step": 13487 + }, + { + "epoch": 1.1495781130145744, + "grad_norm": 31.909602533528208, + "learning_rate": 7.750037721464397e-06, + "loss": 2.5935, + "step": 13488 + }, + { + "epoch": 1.1496633427085996, + "grad_norm": 74.37886069582002, + "learning_rate": 7.74962359582793e-06, + "loss": 2.9189, + "step": 13489 + }, + { + "epoch": 1.149748572402625, + "grad_norm": 33.46148430885429, + "learning_rate": 7.749209443150006e-06, + "loss": 2.2188, + "step": 13490 + }, + { + "epoch": 1.1498338020966505, + "grad_norm": 30.723201458399007, + "learning_rate": 7.748795263434701e-06, + "loss": 3.1154, + "step": 13491 + }, + { + "epoch": 1.1499190317906758, + "grad_norm": 40.57508180351432, + "learning_rate": 7.748381056686087e-06, + "loss": 3.4321, + "step": 13492 + }, + { + "epoch": 1.1500042614847013, + "grad_norm": 49.423066661682306, + "learning_rate": 7.747966822908236e-06, + "loss": 2.868, + "step": 13493 + }, + { + "epoch": 1.1500894911787267, + "grad_norm": 42.75183646971368, + "learning_rate": 7.747552562105225e-06, + "loss": 2.0712, + "step": 13494 + }, + { + "epoch": 1.150174720872752, + "grad_norm": 86.68110545312682, + "learning_rate": 7.747138274281125e-06, + "loss": 3.1998, + "step": 13495 + }, + { + "epoch": 1.1502599505667774, + "grad_norm": 31.168423019396126, + "learning_rate": 7.746723959440013e-06, + "loss": 2.5838, + "step": 13496 + }, + { + "epoch": 1.150345180260803, + "grad_norm": 53.71846239263523, + "learning_rate": 7.746309617585959e-06, + "loss": 3.24, + "step": 13497 + }, + { + "epoch": 1.1504304099548284, + "grad_norm": 39.540517732051036, + "learning_rate": 7.745895248723045e-06, + "loss": 2.7471, + "step": 13498 + }, + { + "epoch": 1.1505156396488536, + "grad_norm": 53.19661412331783, + "learning_rate": 7.745480852855341e-06, + "loss": 3.7881, + "step": 13499 + }, + { + "epoch": 1.150600869342879, + "grad_norm": 51.43348587534107, + "learning_rate": 7.745066429986923e-06, + "loss": 3.7471, + "step": 13500 + }, + { + "epoch": 1.1506860990369046, + "grad_norm": 61.72528799188296, + "learning_rate": 7.744651980121867e-06, + "loss": 1.9556, + "step": 13501 + }, + { + "epoch": 1.1507713287309298, + "grad_norm": 32.8981075753371, + "learning_rate": 7.744237503264252e-06, + "loss": 2.548, + "step": 13502 + }, + { + "epoch": 1.1508565584249553, + "grad_norm": 43.43896689914883, + "learning_rate": 7.74382299941815e-06, + "loss": 2.4406, + "step": 13503 + }, + { + "epoch": 1.1509417881189807, + "grad_norm": 123.87237106986409, + "learning_rate": 7.743408468587638e-06, + "loss": 3.2842, + "step": 13504 + }, + { + "epoch": 1.151027017813006, + "grad_norm": 90.30193136780746, + "learning_rate": 7.742993910776796e-06, + "loss": 3.3381, + "step": 13505 + }, + { + "epoch": 1.1511122475070314, + "grad_norm": 100.34066128485425, + "learning_rate": 7.742579325989698e-06, + "loss": 3.0062, + "step": 13506 + }, + { + "epoch": 1.151197477201057, + "grad_norm": 35.31421663112701, + "learning_rate": 7.742164714230422e-06, + "loss": 2.7001, + "step": 13507 + }, + { + "epoch": 1.1512827068950822, + "grad_norm": 41.02476967420718, + "learning_rate": 7.741750075503046e-06, + "loss": 2.2886, + "step": 13508 + }, + { + "epoch": 1.1513679365891076, + "grad_norm": 39.54661649736712, + "learning_rate": 7.741335409811647e-06, + "loss": 2.7631, + "step": 13509 + }, + { + "epoch": 1.151453166283133, + "grad_norm": 99.29744875354665, + "learning_rate": 7.740920717160303e-06, + "loss": 4.2998, + "step": 13510 + }, + { + "epoch": 1.1515383959771583, + "grad_norm": 43.86618940838387, + "learning_rate": 7.740505997553094e-06, + "loss": 3.4949, + "step": 13511 + }, + { + "epoch": 1.1516236256711838, + "grad_norm": 35.705801217057434, + "learning_rate": 7.740091250994097e-06, + "loss": 2.8602, + "step": 13512 + }, + { + "epoch": 1.1517088553652093, + "grad_norm": 27.56178251183173, + "learning_rate": 7.739676477487391e-06, + "loss": 1.792, + "step": 13513 + }, + { + "epoch": 1.1517940850592345, + "grad_norm": 42.88566188912125, + "learning_rate": 7.739261677037056e-06, + "loss": 3.6667, + "step": 13514 + }, + { + "epoch": 1.15187931475326, + "grad_norm": 68.80832927649563, + "learning_rate": 7.73884684964717e-06, + "loss": 3.9905, + "step": 13515 + }, + { + "epoch": 1.1519645444472855, + "grad_norm": 65.70124883923411, + "learning_rate": 7.738431995321816e-06, + "loss": 2.8957, + "step": 13516 + }, + { + "epoch": 1.152049774141311, + "grad_norm": 34.395350448550275, + "learning_rate": 7.738017114065068e-06, + "loss": 2.6646, + "step": 13517 + }, + { + "epoch": 1.1521350038353362, + "grad_norm": 54.94327089122905, + "learning_rate": 7.737602205881011e-06, + "loss": 1.6887, + "step": 13518 + }, + { + "epoch": 1.1522202335293616, + "grad_norm": 59.91336263915264, + "learning_rate": 7.737187270773725e-06, + "loss": 2.4093, + "step": 13519 + }, + { + "epoch": 1.1523054632233871, + "grad_norm": 35.427874882299776, + "learning_rate": 7.73677230874729e-06, + "loss": 2.3079, + "step": 13520 + }, + { + "epoch": 1.1523906929174124, + "grad_norm": 40.493125564337895, + "learning_rate": 7.736357319805787e-06, + "loss": 3.1031, + "step": 13521 + }, + { + "epoch": 1.1524759226114378, + "grad_norm": 84.79955247562108, + "learning_rate": 7.735942303953297e-06, + "loss": 4.0133, + "step": 13522 + }, + { + "epoch": 1.1525611523054633, + "grad_norm": 53.80876658468925, + "learning_rate": 7.7355272611939e-06, + "loss": 2.5217, + "step": 13523 + }, + { + "epoch": 1.1526463819994885, + "grad_norm": 65.94699594234618, + "learning_rate": 7.73511219153168e-06, + "loss": 2.4605, + "step": 13524 + }, + { + "epoch": 1.152731611693514, + "grad_norm": 96.25020277878495, + "learning_rate": 7.734697094970717e-06, + "loss": 2.6589, + "step": 13525 + }, + { + "epoch": 1.1528168413875395, + "grad_norm": 92.52272301081209, + "learning_rate": 7.734281971515099e-06, + "loss": 3.5562, + "step": 13526 + }, + { + "epoch": 1.1529020710815647, + "grad_norm": 58.343570551227394, + "learning_rate": 7.7338668211689e-06, + "loss": 2.2446, + "step": 13527 + }, + { + "epoch": 1.1529873007755902, + "grad_norm": 101.9985071959029, + "learning_rate": 7.73345164393621e-06, + "loss": 2.6154, + "step": 13528 + }, + { + "epoch": 1.1530725304696157, + "grad_norm": 39.45193324702146, + "learning_rate": 7.733036439821108e-06, + "loss": 2.9273, + "step": 13529 + }, + { + "epoch": 1.1531577601636411, + "grad_norm": 69.93702748106945, + "learning_rate": 7.73262120882768e-06, + "loss": 2.7526, + "step": 13530 + }, + { + "epoch": 1.1532429898576664, + "grad_norm": 77.4976321619019, + "learning_rate": 7.732205950960006e-06, + "loss": 2.9072, + "step": 13531 + }, + { + "epoch": 1.1533282195516918, + "grad_norm": 39.72718954400888, + "learning_rate": 7.731790666222173e-06, + "loss": 2.9468, + "step": 13532 + }, + { + "epoch": 1.153413449245717, + "grad_norm": 34.392057866813595, + "learning_rate": 7.731375354618265e-06, + "loss": 3.3095, + "step": 13533 + }, + { + "epoch": 1.1534986789397426, + "grad_norm": 38.865690979720696, + "learning_rate": 7.730960016152364e-06, + "loss": 2.7638, + "step": 13534 + }, + { + "epoch": 1.153583908633768, + "grad_norm": 34.99895278438502, + "learning_rate": 7.730544650828558e-06, + "loss": 3.0411, + "step": 13535 + }, + { + "epoch": 1.1536691383277935, + "grad_norm": 32.4461146146375, + "learning_rate": 7.73012925865093e-06, + "loss": 2.7195, + "step": 13536 + }, + { + "epoch": 1.1537543680218187, + "grad_norm": 38.7305417527391, + "learning_rate": 7.729713839623566e-06, + "loss": 2.8225, + "step": 13537 + }, + { + "epoch": 1.1538395977158442, + "grad_norm": 110.7977861700521, + "learning_rate": 7.72929839375055e-06, + "loss": 2.8893, + "step": 13538 + }, + { + "epoch": 1.1539248274098697, + "grad_norm": 51.280667644779086, + "learning_rate": 7.72888292103597e-06, + "loss": 2.7421, + "step": 13539 + }, + { + "epoch": 1.154010057103895, + "grad_norm": 29.348974456109488, + "learning_rate": 7.72846742148391e-06, + "loss": 1.6483, + "step": 13540 + }, + { + "epoch": 1.1540952867979204, + "grad_norm": 30.02480619563264, + "learning_rate": 7.728051895098458e-06, + "loss": 2.3601, + "step": 13541 + }, + { + "epoch": 1.1541805164919459, + "grad_norm": 27.863761109239025, + "learning_rate": 7.7276363418837e-06, + "loss": 2.7398, + "step": 13542 + }, + { + "epoch": 1.154265746185971, + "grad_norm": 18.675761406854964, + "learning_rate": 7.72722076184372e-06, + "loss": 1.3136, + "step": 13543 + }, + { + "epoch": 1.1543509758799966, + "grad_norm": 49.95068218902872, + "learning_rate": 7.726805154982608e-06, + "loss": 3.1114, + "step": 13544 + }, + { + "epoch": 1.154436205574022, + "grad_norm": 62.655001113286396, + "learning_rate": 7.726389521304453e-06, + "loss": 3.4436, + "step": 13545 + }, + { + "epoch": 1.1545214352680473, + "grad_norm": 34.420048627839186, + "learning_rate": 7.725973860813338e-06, + "loss": 2.7495, + "step": 13546 + }, + { + "epoch": 1.1546066649620728, + "grad_norm": 36.826748925273854, + "learning_rate": 7.725558173513355e-06, + "loss": 3.4468, + "step": 13547 + }, + { + "epoch": 1.1546918946560982, + "grad_norm": 54.117000415950955, + "learning_rate": 7.72514245940859e-06, + "loss": 3.6482, + "step": 13548 + }, + { + "epoch": 1.1547771243501237, + "grad_norm": 59.05301794052241, + "learning_rate": 7.724726718503131e-06, + "loss": 2.646, + "step": 13549 + }, + { + "epoch": 1.154862354044149, + "grad_norm": 51.78284091099767, + "learning_rate": 7.724310950801069e-06, + "loss": 2.5281, + "step": 13550 + }, + { + "epoch": 1.1549475837381744, + "grad_norm": 42.826631109946334, + "learning_rate": 7.72389515630649e-06, + "loss": 3.0724, + "step": 13551 + }, + { + "epoch": 1.1550328134321997, + "grad_norm": 36.249677415717585, + "learning_rate": 7.723479335023485e-06, + "loss": 2.8624, + "step": 13552 + }, + { + "epoch": 1.1551180431262251, + "grad_norm": 40.23508380175123, + "learning_rate": 7.723063486956142e-06, + "loss": 3.6073, + "step": 13553 + }, + { + "epoch": 1.1552032728202506, + "grad_norm": 35.77843562845759, + "learning_rate": 7.722647612108554e-06, + "loss": 2.0093, + "step": 13554 + }, + { + "epoch": 1.155288502514276, + "grad_norm": 27.588159646164414, + "learning_rate": 7.722231710484807e-06, + "loss": 1.9025, + "step": 13555 + }, + { + "epoch": 1.1553737322083013, + "grad_norm": 29.82152697415569, + "learning_rate": 7.721815782088992e-06, + "loss": 2.4871, + "step": 13556 + }, + { + "epoch": 1.1554589619023268, + "grad_norm": 96.81789664560846, + "learning_rate": 7.721399826925203e-06, + "loss": 3.3938, + "step": 13557 + }, + { + "epoch": 1.1555441915963522, + "grad_norm": 27.102977794252876, + "learning_rate": 7.720983844997525e-06, + "loss": 2.0056, + "step": 13558 + }, + { + "epoch": 1.1556294212903775, + "grad_norm": 42.294048742598, + "learning_rate": 7.720567836310054e-06, + "loss": 3.4214, + "step": 13559 + }, + { + "epoch": 1.155714650984403, + "grad_norm": 97.78137719159973, + "learning_rate": 7.720151800866877e-06, + "loss": 2.8434, + "step": 13560 + }, + { + "epoch": 1.1557998806784284, + "grad_norm": 64.06016067260384, + "learning_rate": 7.71973573867209e-06, + "loss": 2.5859, + "step": 13561 + }, + { + "epoch": 1.1558851103724537, + "grad_norm": 47.95260928655769, + "learning_rate": 7.719319649729784e-06, + "loss": 2.8987, + "step": 13562 + }, + { + "epoch": 1.1559703400664791, + "grad_norm": 49.485061352236954, + "learning_rate": 7.718903534044046e-06, + "loss": 2.426, + "step": 13563 + }, + { + "epoch": 1.1560555697605046, + "grad_norm": 42.050104274369005, + "learning_rate": 7.718487391618977e-06, + "loss": 2.92, + "step": 13564 + }, + { + "epoch": 1.1561407994545299, + "grad_norm": 54.114739707102686, + "learning_rate": 7.718071222458662e-06, + "loss": 2.4141, + "step": 13565 + }, + { + "epoch": 1.1562260291485553, + "grad_norm": 56.8503771586581, + "learning_rate": 7.717655026567196e-06, + "loss": 3.035, + "step": 13566 + }, + { + "epoch": 1.1563112588425808, + "grad_norm": 39.73507857349146, + "learning_rate": 7.717238803948674e-06, + "loss": 3.0702, + "step": 13567 + }, + { + "epoch": 1.1563964885366063, + "grad_norm": 38.923355594563546, + "learning_rate": 7.716822554607186e-06, + "loss": 2.1136, + "step": 13568 + }, + { + "epoch": 1.1564817182306315, + "grad_norm": 28.0606576545591, + "learning_rate": 7.716406278546831e-06, + "loss": 1.8856, + "step": 13569 + }, + { + "epoch": 1.156566947924657, + "grad_norm": 161.39191958483434, + "learning_rate": 7.715989975771696e-06, + "loss": 4.9217, + "step": 13570 + }, + { + "epoch": 1.1566521776186824, + "grad_norm": 61.66627822248925, + "learning_rate": 7.715573646285882e-06, + "loss": 2.698, + "step": 13571 + }, + { + "epoch": 1.1567374073127077, + "grad_norm": 92.02568507726043, + "learning_rate": 7.715157290093478e-06, + "loss": 3.1692, + "step": 13572 + }, + { + "epoch": 1.1568226370067332, + "grad_norm": 76.93424921186093, + "learning_rate": 7.714740907198584e-06, + "loss": 3.3083, + "step": 13573 + }, + { + "epoch": 1.1569078667007586, + "grad_norm": 34.72818749050556, + "learning_rate": 7.714324497605287e-06, + "loss": 2.0659, + "step": 13574 + }, + { + "epoch": 1.1569930963947839, + "grad_norm": 26.695857989660787, + "learning_rate": 7.713908061317692e-06, + "loss": 1.7156, + "step": 13575 + }, + { + "epoch": 1.1570783260888093, + "grad_norm": 51.51137284828775, + "learning_rate": 7.713491598339887e-06, + "loss": 2.0972, + "step": 13576 + }, + { + "epoch": 1.1571635557828348, + "grad_norm": 36.52550026942879, + "learning_rate": 7.713075108675971e-06, + "loss": 3.204, + "step": 13577 + }, + { + "epoch": 1.15724878547686, + "grad_norm": 32.753508523144546, + "learning_rate": 7.712658592330038e-06, + "loss": 3.2244, + "step": 13578 + }, + { + "epoch": 1.1573340151708855, + "grad_norm": 30.717186333002275, + "learning_rate": 7.712242049306188e-06, + "loss": 1.2857, + "step": 13579 + }, + { + "epoch": 1.157419244864911, + "grad_norm": 57.860768373852146, + "learning_rate": 7.711825479608515e-06, + "loss": 2.7472, + "step": 13580 + }, + { + "epoch": 1.1575044745589365, + "grad_norm": 48.91224221922134, + "learning_rate": 7.711408883241114e-06, + "loss": 3.1848, + "step": 13581 + }, + { + "epoch": 1.1575897042529617, + "grad_norm": 67.779423164405, + "learning_rate": 7.710992260208085e-06, + "loss": 2.7116, + "step": 13582 + }, + { + "epoch": 1.1576749339469872, + "grad_norm": 108.3245211336169, + "learning_rate": 7.710575610513524e-06, + "loss": 4.5599, + "step": 13583 + }, + { + "epoch": 1.1577601636410124, + "grad_norm": 82.2040029641516, + "learning_rate": 7.71015893416153e-06, + "loss": 3.0181, + "step": 13584 + }, + { + "epoch": 1.1578453933350379, + "grad_norm": 38.54163363791968, + "learning_rate": 7.7097422311562e-06, + "loss": 2.7867, + "step": 13585 + }, + { + "epoch": 1.1579306230290634, + "grad_norm": 61.50706757801673, + "learning_rate": 7.709325501501629e-06, + "loss": 2.9994, + "step": 13586 + }, + { + "epoch": 1.1580158527230888, + "grad_norm": 67.3503894542496, + "learning_rate": 7.708908745201921e-06, + "loss": 2.4341, + "step": 13587 + }, + { + "epoch": 1.158101082417114, + "grad_norm": 41.89885566980865, + "learning_rate": 7.708491962261173e-06, + "loss": 3.0247, + "step": 13588 + }, + { + "epoch": 1.1581863121111395, + "grad_norm": 31.744837344057352, + "learning_rate": 7.70807515268348e-06, + "loss": 3.1993, + "step": 13589 + }, + { + "epoch": 1.158271541805165, + "grad_norm": 58.04260180690922, + "learning_rate": 7.707658316472945e-06, + "loss": 3.8935, + "step": 13590 + }, + { + "epoch": 1.1583567714991903, + "grad_norm": 74.35765912233157, + "learning_rate": 7.707241453633667e-06, + "loss": 4.0272, + "step": 13591 + }, + { + "epoch": 1.1584420011932157, + "grad_norm": 36.07378224053762, + "learning_rate": 7.706824564169743e-06, + "loss": 2.9299, + "step": 13592 + }, + { + "epoch": 1.1585272308872412, + "grad_norm": 57.64920644365033, + "learning_rate": 7.706407648085278e-06, + "loss": 2.2796, + "step": 13593 + }, + { + "epoch": 1.1586124605812664, + "grad_norm": 37.28764632358204, + "learning_rate": 7.705990705384368e-06, + "loss": 2.6272, + "step": 13594 + }, + { + "epoch": 1.158697690275292, + "grad_norm": 74.62867047928619, + "learning_rate": 7.705573736071115e-06, + "loss": 3.6286, + "step": 13595 + }, + { + "epoch": 1.1587829199693174, + "grad_norm": 42.8880149236139, + "learning_rate": 7.705156740149617e-06, + "loss": 3.191, + "step": 13596 + }, + { + "epoch": 1.1588681496633426, + "grad_norm": 45.45049002454804, + "learning_rate": 7.70473971762398e-06, + "loss": 1.7555, + "step": 13597 + }, + { + "epoch": 1.158953379357368, + "grad_norm": 38.66124013859831, + "learning_rate": 7.704322668498302e-06, + "loss": 3.0434, + "step": 13598 + }, + { + "epoch": 1.1590386090513936, + "grad_norm": 38.34960147086683, + "learning_rate": 7.703905592776685e-06, + "loss": 2.8404, + "step": 13599 + }, + { + "epoch": 1.159123838745419, + "grad_norm": 45.9818486756133, + "learning_rate": 7.703488490463231e-06, + "loss": 2.93, + "step": 13600 + }, + { + "epoch": 1.1592090684394443, + "grad_norm": 34.69538482885223, + "learning_rate": 7.703071361562041e-06, + "loss": 2.3288, + "step": 13601 + }, + { + "epoch": 1.1592942981334697, + "grad_norm": 76.52210250357037, + "learning_rate": 7.70265420607722e-06, + "loss": 2.7031, + "step": 13602 + }, + { + "epoch": 1.159379527827495, + "grad_norm": 64.99576562342439, + "learning_rate": 7.702237024012867e-06, + "loss": 2.6442, + "step": 13603 + }, + { + "epoch": 1.1594647575215205, + "grad_norm": 42.65564273088822, + "learning_rate": 7.701819815373088e-06, + "loss": 3.0954, + "step": 13604 + }, + { + "epoch": 1.159549987215546, + "grad_norm": 52.32376128120649, + "learning_rate": 7.701402580161986e-06, + "loss": 3.0396, + "step": 13605 + }, + { + "epoch": 1.1596352169095714, + "grad_norm": 66.3437424877549, + "learning_rate": 7.700985318383661e-06, + "loss": 2.7388, + "step": 13606 + }, + { + "epoch": 1.1597204466035966, + "grad_norm": 59.88866103440107, + "learning_rate": 7.700568030042217e-06, + "loss": 2.7619, + "step": 13607 + }, + { + "epoch": 1.159805676297622, + "grad_norm": 34.12684585739894, + "learning_rate": 7.700150715141763e-06, + "loss": 2.8372, + "step": 13608 + }, + { + "epoch": 1.1598909059916476, + "grad_norm": 95.52211300421158, + "learning_rate": 7.699733373686398e-06, + "loss": 3.5303, + "step": 13609 + }, + { + "epoch": 1.1599761356856728, + "grad_norm": 42.30561810205686, + "learning_rate": 7.699316005680228e-06, + "loss": 3.3375, + "step": 13610 + }, + { + "epoch": 1.1600613653796983, + "grad_norm": 59.34845236038572, + "learning_rate": 7.698898611127356e-06, + "loss": 1.9472, + "step": 13611 + }, + { + "epoch": 1.1601465950737238, + "grad_norm": 30.86673815862774, + "learning_rate": 7.698481190031893e-06, + "loss": 2.987, + "step": 13612 + }, + { + "epoch": 1.160231824767749, + "grad_norm": 72.08061027079005, + "learning_rate": 7.698063742397936e-06, + "loss": 3.5267, + "step": 13613 + }, + { + "epoch": 1.1603170544617745, + "grad_norm": 21.88844971102711, + "learning_rate": 7.697646268229595e-06, + "loss": 1.6257, + "step": 13614 + }, + { + "epoch": 1.1604022841558, + "grad_norm": 60.82907933687577, + "learning_rate": 7.697228767530975e-06, + "loss": 2.4229, + "step": 13615 + }, + { + "epoch": 1.1604875138498252, + "grad_norm": 49.133650193605554, + "learning_rate": 7.696811240306184e-06, + "loss": 2.2434, + "step": 13616 + }, + { + "epoch": 1.1605727435438506, + "grad_norm": 258.00170047181325, + "learning_rate": 7.696393686559322e-06, + "loss": 4.4796, + "step": 13617 + }, + { + "epoch": 1.1606579732378761, + "grad_norm": 59.00279272085307, + "learning_rate": 7.695976106294503e-06, + "loss": 2.8282, + "step": 13618 + }, + { + "epoch": 1.1607432029319016, + "grad_norm": 71.11816226769902, + "learning_rate": 7.695558499515829e-06, + "loss": 2.8974, + "step": 13619 + }, + { + "epoch": 1.1608284326259268, + "grad_norm": 44.57435352269905, + "learning_rate": 7.695140866227408e-06, + "loss": 2.4506, + "step": 13620 + }, + { + "epoch": 1.1609136623199523, + "grad_norm": 84.28400114105241, + "learning_rate": 7.694723206433348e-06, + "loss": 2.998, + "step": 13621 + }, + { + "epoch": 1.1609988920139775, + "grad_norm": 41.20483912248173, + "learning_rate": 7.694305520137755e-06, + "loss": 3.3479, + "step": 13622 + }, + { + "epoch": 1.161084121708003, + "grad_norm": 62.644950673474646, + "learning_rate": 7.69388780734474e-06, + "loss": 3.198, + "step": 13623 + }, + { + "epoch": 1.1611693514020285, + "grad_norm": 41.33417156978883, + "learning_rate": 7.693470068058406e-06, + "loss": 3.5447, + "step": 13624 + }, + { + "epoch": 1.161254581096054, + "grad_norm": 107.8602206186845, + "learning_rate": 7.693052302282867e-06, + "loss": 2.6091, + "step": 13625 + }, + { + "epoch": 1.1613398107900792, + "grad_norm": 160.78066747427232, + "learning_rate": 7.692634510022227e-06, + "loss": 3.3026, + "step": 13626 + }, + { + "epoch": 1.1614250404841047, + "grad_norm": 27.50627830559935, + "learning_rate": 7.692216691280596e-06, + "loss": 2.5397, + "step": 13627 + }, + { + "epoch": 1.1615102701781301, + "grad_norm": 48.61095317946724, + "learning_rate": 7.691798846062086e-06, + "loss": 3.1194, + "step": 13628 + }, + { + "epoch": 1.1615954998721554, + "grad_norm": 39.15825555241062, + "learning_rate": 7.691380974370803e-06, + "loss": 2.7477, + "step": 13629 + }, + { + "epoch": 1.1616807295661808, + "grad_norm": 41.731800929252046, + "learning_rate": 7.690963076210856e-06, + "loss": 2.5036, + "step": 13630 + }, + { + "epoch": 1.1617659592602063, + "grad_norm": 46.581021471828834, + "learning_rate": 7.690545151586358e-06, + "loss": 2.3191, + "step": 13631 + }, + { + "epoch": 1.1618511889542316, + "grad_norm": 36.864467611558744, + "learning_rate": 7.690127200501416e-06, + "loss": 2.8096, + "step": 13632 + }, + { + "epoch": 1.161936418648257, + "grad_norm": 21.963680584320088, + "learning_rate": 7.689709222960144e-06, + "loss": 2.0424, + "step": 13633 + }, + { + "epoch": 1.1620216483422825, + "grad_norm": 76.02027203822824, + "learning_rate": 7.689291218966648e-06, + "loss": 2.7891, + "step": 13634 + }, + { + "epoch": 1.1621068780363077, + "grad_norm": 95.05199101278927, + "learning_rate": 7.688873188525044e-06, + "loss": 2.8941, + "step": 13635 + }, + { + "epoch": 1.1621921077303332, + "grad_norm": 24.855066943139708, + "learning_rate": 7.688455131639441e-06, + "loss": 1.8252, + "step": 13636 + }, + { + "epoch": 1.1622773374243587, + "grad_norm": 91.30783966047629, + "learning_rate": 7.688037048313947e-06, + "loss": 3.5858, + "step": 13637 + }, + { + "epoch": 1.1623625671183841, + "grad_norm": 95.26185163941003, + "learning_rate": 7.687618938552679e-06, + "loss": 4.2037, + "step": 13638 + }, + { + "epoch": 1.1624477968124094, + "grad_norm": 98.83613016237759, + "learning_rate": 7.687200802359747e-06, + "loss": 2.126, + "step": 13639 + }, + { + "epoch": 1.1625330265064349, + "grad_norm": 54.373500675837825, + "learning_rate": 7.686782639739263e-06, + "loss": 2.2626, + "step": 13640 + }, + { + "epoch": 1.1626182562004603, + "grad_norm": 37.65155328138133, + "learning_rate": 7.686364450695338e-06, + "loss": 2.6589, + "step": 13641 + }, + { + "epoch": 1.1627034858944856, + "grad_norm": 57.873247623436804, + "learning_rate": 7.685946235232088e-06, + "loss": 2.3889, + "step": 13642 + }, + { + "epoch": 1.162788715588511, + "grad_norm": 43.74389063084973, + "learning_rate": 7.685527993353622e-06, + "loss": 2.8465, + "step": 13643 + }, + { + "epoch": 1.1628739452825365, + "grad_norm": 34.77227043101249, + "learning_rate": 7.685109725064055e-06, + "loss": 2.221, + "step": 13644 + }, + { + "epoch": 1.1629591749765618, + "grad_norm": 30.044901582826, + "learning_rate": 7.684691430367501e-06, + "loss": 2.4044, + "step": 13645 + }, + { + "epoch": 1.1630444046705872, + "grad_norm": 94.70462617507438, + "learning_rate": 7.684273109268075e-06, + "loss": 3.4106, + "step": 13646 + }, + { + "epoch": 1.1631296343646127, + "grad_norm": 86.48170319155925, + "learning_rate": 7.68385476176989e-06, + "loss": 3.0594, + "step": 13647 + }, + { + "epoch": 1.163214864058638, + "grad_norm": 37.60478329204748, + "learning_rate": 7.683436387877058e-06, + "loss": 2.9442, + "step": 13648 + }, + { + "epoch": 1.1633000937526634, + "grad_norm": 60.75454881810907, + "learning_rate": 7.683017987593696e-06, + "loss": 2.7096, + "step": 13649 + }, + { + "epoch": 1.1633853234466889, + "grad_norm": 43.35640179706338, + "learning_rate": 7.682599560923917e-06, + "loss": 3.1618, + "step": 13650 + }, + { + "epoch": 1.1634705531407143, + "grad_norm": 37.003333164542966, + "learning_rate": 7.68218110787184e-06, + "loss": 2.1907, + "step": 13651 + }, + { + "epoch": 1.1635557828347396, + "grad_norm": 42.47776576644919, + "learning_rate": 7.681762628441576e-06, + "loss": 2.3079, + "step": 13652 + }, + { + "epoch": 1.163641012528765, + "grad_norm": 65.67709324846902, + "learning_rate": 7.681344122637243e-06, + "loss": 2.7417, + "step": 13653 + }, + { + "epoch": 1.1637262422227903, + "grad_norm": 108.85904281860938, + "learning_rate": 7.680925590462955e-06, + "loss": 3.9242, + "step": 13654 + }, + { + "epoch": 1.1638114719168158, + "grad_norm": 103.25592749345542, + "learning_rate": 7.68050703192283e-06, + "loss": 3.4705, + "step": 13655 + }, + { + "epoch": 1.1638967016108412, + "grad_norm": 32.53925855455111, + "learning_rate": 7.680088447020984e-06, + "loss": 1.9907, + "step": 13656 + }, + { + "epoch": 1.1639819313048667, + "grad_norm": 68.81064757879358, + "learning_rate": 7.679669835761531e-06, + "loss": 2.0874, + "step": 13657 + }, + { + "epoch": 1.164067160998892, + "grad_norm": 71.51532376396241, + "learning_rate": 7.679251198148591e-06, + "loss": 3.0159, + "step": 13658 + }, + { + "epoch": 1.1641523906929174, + "grad_norm": 41.83303883363893, + "learning_rate": 7.678832534186281e-06, + "loss": 3.2804, + "step": 13659 + }, + { + "epoch": 1.164237620386943, + "grad_norm": 79.73661108683096, + "learning_rate": 7.678413843878717e-06, + "loss": 3.5974, + "step": 13660 + }, + { + "epoch": 1.1643228500809681, + "grad_norm": 35.463481377055004, + "learning_rate": 7.677995127230015e-06, + "loss": 3.1727, + "step": 13661 + }, + { + "epoch": 1.1644080797749936, + "grad_norm": 42.78438610673103, + "learning_rate": 7.677576384244298e-06, + "loss": 3.0041, + "step": 13662 + }, + { + "epoch": 1.164493309469019, + "grad_norm": 47.39723231192181, + "learning_rate": 7.677157614925681e-06, + "loss": 3.3911, + "step": 13663 + }, + { + "epoch": 1.1645785391630443, + "grad_norm": 46.24009924117266, + "learning_rate": 7.67673881927828e-06, + "loss": 3.6386, + "step": 13664 + }, + { + "epoch": 1.1646637688570698, + "grad_norm": 36.04463061868952, + "learning_rate": 7.676319997306218e-06, + "loss": 3.043, + "step": 13665 + }, + { + "epoch": 1.1647489985510953, + "grad_norm": 40.37672553699558, + "learning_rate": 7.675901149013614e-06, + "loss": 2.4061, + "step": 13666 + }, + { + "epoch": 1.1648342282451205, + "grad_norm": 198.7805093343411, + "learning_rate": 7.675482274404584e-06, + "loss": 2.5862, + "step": 13667 + }, + { + "epoch": 1.164919457939146, + "grad_norm": 53.26121310427001, + "learning_rate": 7.67506337348325e-06, + "loss": 3.8007, + "step": 13668 + }, + { + "epoch": 1.1650046876331714, + "grad_norm": 76.4085801949789, + "learning_rate": 7.674644446253728e-06, + "loss": 3.1286, + "step": 13669 + }, + { + "epoch": 1.165089917327197, + "grad_norm": 58.27645943266262, + "learning_rate": 7.674225492720141e-06, + "loss": 3.1024, + "step": 13670 + }, + { + "epoch": 1.1651751470212222, + "grad_norm": 56.77933876497759, + "learning_rate": 7.673806512886612e-06, + "loss": 2.5457, + "step": 13671 + }, + { + "epoch": 1.1652603767152476, + "grad_norm": 56.484107347116044, + "learning_rate": 7.673387506757257e-06, + "loss": 2.5857, + "step": 13672 + }, + { + "epoch": 1.1653456064092729, + "grad_norm": 55.47492086187015, + "learning_rate": 7.672968474336196e-06, + "loss": 2.6714, + "step": 13673 + }, + { + "epoch": 1.1654308361032983, + "grad_norm": 78.51331439417139, + "learning_rate": 7.672549415627555e-06, + "loss": 2.9731, + "step": 13674 + }, + { + "epoch": 1.1655160657973238, + "grad_norm": 46.445926844089975, + "learning_rate": 7.672130330635453e-06, + "loss": 3.1657, + "step": 13675 + }, + { + "epoch": 1.1656012954913493, + "grad_norm": 53.912576569573154, + "learning_rate": 7.671711219364008e-06, + "loss": 3.1614, + "step": 13676 + }, + { + "epoch": 1.1656865251853745, + "grad_norm": 29.620026517619532, + "learning_rate": 7.671292081817345e-06, + "loss": 2.8704, + "step": 13677 + }, + { + "epoch": 1.1657717548794, + "grad_norm": 62.396247735977596, + "learning_rate": 7.670872917999588e-06, + "loss": 3.7766, + "step": 13678 + }, + { + "epoch": 1.1658569845734255, + "grad_norm": 50.916451976025584, + "learning_rate": 7.670453727914857e-06, + "loss": 2.4113, + "step": 13679 + }, + { + "epoch": 1.1659422142674507, + "grad_norm": 87.58502693184944, + "learning_rate": 7.670034511567274e-06, + "loss": 3.1272, + "step": 13680 + }, + { + "epoch": 1.1660274439614762, + "grad_norm": 44.69245826343002, + "learning_rate": 7.669615268960962e-06, + "loss": 3.2471, + "step": 13681 + }, + { + "epoch": 1.1661126736555016, + "grad_norm": 53.03570975686874, + "learning_rate": 7.669196000100045e-06, + "loss": 3.239, + "step": 13682 + }, + { + "epoch": 1.1661979033495269, + "grad_norm": 30.100453725489416, + "learning_rate": 7.668776704988647e-06, + "loss": 3.2063, + "step": 13683 + }, + { + "epoch": 1.1662831330435524, + "grad_norm": 43.40877696573102, + "learning_rate": 7.668357383630888e-06, + "loss": 3.3398, + "step": 13684 + }, + { + "epoch": 1.1663683627375778, + "grad_norm": 59.56852202392226, + "learning_rate": 7.667938036030898e-06, + "loss": 3.4408, + "step": 13685 + }, + { + "epoch": 1.166453592431603, + "grad_norm": 94.28330032504911, + "learning_rate": 7.667518662192794e-06, + "loss": 3.704, + "step": 13686 + }, + { + "epoch": 1.1665388221256285, + "grad_norm": 35.231653720061296, + "learning_rate": 7.667099262120705e-06, + "loss": 2.4877, + "step": 13687 + }, + { + "epoch": 1.166624051819654, + "grad_norm": 51.26760896434843, + "learning_rate": 7.666679835818756e-06, + "loss": 2.5424, + "step": 13688 + }, + { + "epoch": 1.1667092815136795, + "grad_norm": 47.80205743640497, + "learning_rate": 7.666260383291069e-06, + "loss": 3.1098, + "step": 13689 + }, + { + "epoch": 1.1667945112077047, + "grad_norm": 42.2109041748058, + "learning_rate": 7.665840904541771e-06, + "loss": 2.7195, + "step": 13690 + }, + { + "epoch": 1.1668797409017302, + "grad_norm": 44.74327780376709, + "learning_rate": 7.665421399574988e-06, + "loss": 2.9645, + "step": 13691 + }, + { + "epoch": 1.1669649705957557, + "grad_norm": 71.37517970954691, + "learning_rate": 7.665001868394843e-06, + "loss": 3.3754, + "step": 13692 + }, + { + "epoch": 1.167050200289781, + "grad_norm": 85.93711939150505, + "learning_rate": 7.664582311005465e-06, + "loss": 3.9016, + "step": 13693 + }, + { + "epoch": 1.1671354299838064, + "grad_norm": 40.27036173600915, + "learning_rate": 7.664162727410978e-06, + "loss": 2.9385, + "step": 13694 + }, + { + "epoch": 1.1672206596778318, + "grad_norm": 56.148773334512235, + "learning_rate": 7.66374311761551e-06, + "loss": 2.8479, + "step": 13695 + }, + { + "epoch": 1.167305889371857, + "grad_norm": 25.701320371460476, + "learning_rate": 7.663323481623186e-06, + "loss": 1.9555, + "step": 13696 + }, + { + "epoch": 1.1673911190658826, + "grad_norm": 60.97662080780539, + "learning_rate": 7.662903819438134e-06, + "loss": 2.4246, + "step": 13697 + }, + { + "epoch": 1.167476348759908, + "grad_norm": 31.585780855587277, + "learning_rate": 7.662484131064482e-06, + "loss": 2.469, + "step": 13698 + }, + { + "epoch": 1.1675615784539333, + "grad_norm": 55.395679931747964, + "learning_rate": 7.662064416506357e-06, + "loss": 3.5629, + "step": 13699 + }, + { + "epoch": 1.1676468081479587, + "grad_norm": 51.60671234188201, + "learning_rate": 7.661644675767883e-06, + "loss": 3.0451, + "step": 13700 + }, + { + "epoch": 1.1677320378419842, + "grad_norm": 56.89441731571241, + "learning_rate": 7.661224908853192e-06, + "loss": 3.3899, + "step": 13701 + }, + { + "epoch": 1.1678172675360095, + "grad_norm": 95.78505309352579, + "learning_rate": 7.660805115766414e-06, + "loss": 3.5904, + "step": 13702 + }, + { + "epoch": 1.167902497230035, + "grad_norm": 60.58671875557527, + "learning_rate": 7.660385296511674e-06, + "loss": 3.9556, + "step": 13703 + }, + { + "epoch": 1.1679877269240604, + "grad_norm": 44.63071974308823, + "learning_rate": 7.659965451093102e-06, + "loss": 4.0546, + "step": 13704 + }, + { + "epoch": 1.1680729566180856, + "grad_norm": 32.52247814004547, + "learning_rate": 7.659545579514826e-06, + "loss": 2.6035, + "step": 13705 + }, + { + "epoch": 1.168158186312111, + "grad_norm": 30.96975220437435, + "learning_rate": 7.659125681780977e-06, + "loss": 2.1534, + "step": 13706 + }, + { + "epoch": 1.1682434160061366, + "grad_norm": 42.22278958431205, + "learning_rate": 7.658705757895683e-06, + "loss": 2.3035, + "step": 13707 + }, + { + "epoch": 1.168328645700162, + "grad_norm": 36.361959515101454, + "learning_rate": 7.658285807863072e-06, + "loss": 2.7999, + "step": 13708 + }, + { + "epoch": 1.1684138753941873, + "grad_norm": 31.852763492007313, + "learning_rate": 7.65786583168728e-06, + "loss": 2.2263, + "step": 13709 + }, + { + "epoch": 1.1684991050882128, + "grad_norm": 52.62619549299175, + "learning_rate": 7.657445829372433e-06, + "loss": 2.1203, + "step": 13710 + }, + { + "epoch": 1.1685843347822382, + "grad_norm": 62.425939606127635, + "learning_rate": 7.657025800922662e-06, + "loss": 2.8815, + "step": 13711 + }, + { + "epoch": 1.1686695644762635, + "grad_norm": 21.27309664397605, + "learning_rate": 7.656605746342096e-06, + "loss": 2.1789, + "step": 13712 + }, + { + "epoch": 1.168754794170289, + "grad_norm": 53.22557670229934, + "learning_rate": 7.65618566563487e-06, + "loss": 2.9081, + "step": 13713 + }, + { + "epoch": 1.1688400238643144, + "grad_norm": 52.04173527282965, + "learning_rate": 7.655765558805113e-06, + "loss": 3.4844, + "step": 13714 + }, + { + "epoch": 1.1689252535583396, + "grad_norm": 37.07561644771996, + "learning_rate": 7.655345425856957e-06, + "loss": 2.4661, + "step": 13715 + }, + { + "epoch": 1.1690104832523651, + "grad_norm": 61.114921498421346, + "learning_rate": 7.654925266794533e-06, + "loss": 4.0582, + "step": 13716 + }, + { + "epoch": 1.1690957129463906, + "grad_norm": 64.4766402553707, + "learning_rate": 7.654505081621977e-06, + "loss": 2.7427, + "step": 13717 + }, + { + "epoch": 1.1691809426404158, + "grad_norm": 62.35603507061776, + "learning_rate": 7.654084870343416e-06, + "loss": 2.5858, + "step": 13718 + }, + { + "epoch": 1.1692661723344413, + "grad_norm": 37.5220389023811, + "learning_rate": 7.653664632962986e-06, + "loss": 3.18, + "step": 13719 + }, + { + "epoch": 1.1693514020284668, + "grad_norm": 36.77118920527712, + "learning_rate": 7.653244369484816e-06, + "loss": 3.0439, + "step": 13720 + }, + { + "epoch": 1.1694366317224922, + "grad_norm": 40.261094613958825, + "learning_rate": 7.652824079913045e-06, + "loss": 3.1499, + "step": 13721 + }, + { + "epoch": 1.1695218614165175, + "grad_norm": 79.70290659238061, + "learning_rate": 7.652403764251802e-06, + "loss": 3.0412, + "step": 13722 + }, + { + "epoch": 1.169607091110543, + "grad_norm": 29.43336494298292, + "learning_rate": 7.651983422505222e-06, + "loss": 2.3482, + "step": 13723 + }, + { + "epoch": 1.1696923208045682, + "grad_norm": 63.09606469609085, + "learning_rate": 7.651563054677437e-06, + "loss": 3.4319, + "step": 13724 + }, + { + "epoch": 1.1697775504985937, + "grad_norm": 35.35663830633522, + "learning_rate": 7.651142660772588e-06, + "loss": 3.0003, + "step": 13725 + }, + { + "epoch": 1.1698627801926191, + "grad_norm": 41.94829185979963, + "learning_rate": 7.650722240794802e-06, + "loss": 3.3844, + "step": 13726 + }, + { + "epoch": 1.1699480098866446, + "grad_norm": 37.504614010011046, + "learning_rate": 7.650301794748214e-06, + "loss": 2.5749, + "step": 13727 + }, + { + "epoch": 1.1700332395806698, + "grad_norm": 46.07663860548525, + "learning_rate": 7.649881322636963e-06, + "loss": 3.0399, + "step": 13728 + }, + { + "epoch": 1.1701184692746953, + "grad_norm": 64.02433029796268, + "learning_rate": 7.64946082446518e-06, + "loss": 4.2309, + "step": 13729 + }, + { + "epoch": 1.1702036989687208, + "grad_norm": 98.68205107200086, + "learning_rate": 7.649040300237005e-06, + "loss": 2.9481, + "step": 13730 + }, + { + "epoch": 1.170288928662746, + "grad_norm": 161.34236858802035, + "learning_rate": 7.64861974995657e-06, + "loss": 4.0097, + "step": 13731 + }, + { + "epoch": 1.1703741583567715, + "grad_norm": 66.27610301558445, + "learning_rate": 7.648199173628013e-06, + "loss": 3.4678, + "step": 13732 + }, + { + "epoch": 1.170459388050797, + "grad_norm": 44.96233122587993, + "learning_rate": 7.647778571255469e-06, + "loss": 3.0482, + "step": 13733 + }, + { + "epoch": 1.1705446177448222, + "grad_norm": 60.517595171425945, + "learning_rate": 7.647357942843075e-06, + "loss": 3.2522, + "step": 13734 + }, + { + "epoch": 1.1706298474388477, + "grad_norm": 39.44637436748044, + "learning_rate": 7.646937288394968e-06, + "loss": 2.8751, + "step": 13735 + }, + { + "epoch": 1.1707150771328731, + "grad_norm": 72.52091533262937, + "learning_rate": 7.646516607915285e-06, + "loss": 3.3204, + "step": 13736 + }, + { + "epoch": 1.1708003068268984, + "grad_norm": 131.35315480002743, + "learning_rate": 7.646095901408159e-06, + "loss": 3.2562, + "step": 13737 + }, + { + "epoch": 1.1708855365209239, + "grad_norm": 68.94682759104484, + "learning_rate": 7.645675168877735e-06, + "loss": 4.2622, + "step": 13738 + }, + { + "epoch": 1.1709707662149493, + "grad_norm": 78.41195208611411, + "learning_rate": 7.645254410328147e-06, + "loss": 2.1458, + "step": 13739 + }, + { + "epoch": 1.1710559959089748, + "grad_norm": 32.37898021011577, + "learning_rate": 7.644833625763532e-06, + "loss": 2.806, + "step": 13740 + }, + { + "epoch": 1.171141225603, + "grad_norm": 49.30671080176075, + "learning_rate": 7.644412815188029e-06, + "loss": 2.7382, + "step": 13741 + }, + { + "epoch": 1.1712264552970255, + "grad_norm": 96.78708058960527, + "learning_rate": 7.643991978605779e-06, + "loss": 3.2644, + "step": 13742 + }, + { + "epoch": 1.1713116849910508, + "grad_norm": 41.992236831752614, + "learning_rate": 7.643571116020915e-06, + "loss": 3.9397, + "step": 13743 + }, + { + "epoch": 1.1713969146850762, + "grad_norm": 41.458781209006794, + "learning_rate": 7.64315022743758e-06, + "loss": 3.0902, + "step": 13744 + }, + { + "epoch": 1.1714821443791017, + "grad_norm": 49.52867705770155, + "learning_rate": 7.642729312859915e-06, + "loss": 2.4303, + "step": 13745 + }, + { + "epoch": 1.1715673740731272, + "grad_norm": 60.05233938004381, + "learning_rate": 7.642308372292056e-06, + "loss": 2.7259, + "step": 13746 + }, + { + "epoch": 1.1716526037671524, + "grad_norm": 38.665016303809516, + "learning_rate": 7.641887405738145e-06, + "loss": 3.0589, + "step": 13747 + }, + { + "epoch": 1.1717378334611779, + "grad_norm": 51.96319378358068, + "learning_rate": 7.64146641320232e-06, + "loss": 3.7117, + "step": 13748 + }, + { + "epoch": 1.1718230631552033, + "grad_norm": 56.68065603364425, + "learning_rate": 7.641045394688724e-06, + "loss": 3.3351, + "step": 13749 + }, + { + "epoch": 1.1719082928492286, + "grad_norm": 35.8030529261217, + "learning_rate": 7.640624350201495e-06, + "loss": 3.0092, + "step": 13750 + }, + { + "epoch": 1.171993522543254, + "grad_norm": 59.95456424223146, + "learning_rate": 7.640203279744774e-06, + "loss": 4.1149, + "step": 13751 + }, + { + "epoch": 1.1720787522372795, + "grad_norm": 32.53793671500292, + "learning_rate": 7.639782183322702e-06, + "loss": 3.1884, + "step": 13752 + }, + { + "epoch": 1.1721639819313048, + "grad_norm": 27.99706031807394, + "learning_rate": 7.639361060939425e-06, + "loss": 2.308, + "step": 13753 + }, + { + "epoch": 1.1722492116253302, + "grad_norm": 59.2523218556742, + "learning_rate": 7.638939912599077e-06, + "loss": 2.1855, + "step": 13754 + }, + { + "epoch": 1.1723344413193557, + "grad_norm": 70.53724759306832, + "learning_rate": 7.638518738305805e-06, + "loss": 2.7073, + "step": 13755 + }, + { + "epoch": 1.172419671013381, + "grad_norm": 54.224515874735324, + "learning_rate": 7.638097538063752e-06, + "loss": 2.7836, + "step": 13756 + }, + { + "epoch": 1.1725049007074064, + "grad_norm": 55.39991001863358, + "learning_rate": 7.637676311877054e-06, + "loss": 2.6201, + "step": 13757 + }, + { + "epoch": 1.172590130401432, + "grad_norm": 37.195937233248884, + "learning_rate": 7.637255059749859e-06, + "loss": 2.601, + "step": 13758 + }, + { + "epoch": 1.1726753600954574, + "grad_norm": 105.46909113743457, + "learning_rate": 7.636833781686308e-06, + "loss": 3.0762, + "step": 13759 + }, + { + "epoch": 1.1727605897894826, + "grad_norm": 58.36118476103769, + "learning_rate": 7.636412477690548e-06, + "loss": 3.0271, + "step": 13760 + }, + { + "epoch": 1.172845819483508, + "grad_norm": 41.44089380846353, + "learning_rate": 7.635991147766715e-06, + "loss": 3.1308, + "step": 13761 + }, + { + "epoch": 1.1729310491775335, + "grad_norm": 43.59665121424405, + "learning_rate": 7.635569791918958e-06, + "loss": 3.8148, + "step": 13762 + }, + { + "epoch": 1.1730162788715588, + "grad_norm": 59.95642374000018, + "learning_rate": 7.635148410151419e-06, + "loss": 2.9501, + "step": 13763 + }, + { + "epoch": 1.1731015085655843, + "grad_norm": 72.46357164907984, + "learning_rate": 7.634727002468243e-06, + "loss": 2.7695, + "step": 13764 + }, + { + "epoch": 1.1731867382596097, + "grad_norm": 45.20619825762353, + "learning_rate": 7.634305568873573e-06, + "loss": 2.4782, + "step": 13765 + }, + { + "epoch": 1.173271967953635, + "grad_norm": 73.0282173492062, + "learning_rate": 7.633884109371555e-06, + "loss": 3.559, + "step": 13766 + }, + { + "epoch": 1.1733571976476604, + "grad_norm": 51.26261694522462, + "learning_rate": 7.633462623966333e-06, + "loss": 2.8724, + "step": 13767 + }, + { + "epoch": 1.173442427341686, + "grad_norm": 38.47940095616392, + "learning_rate": 7.633041112662052e-06, + "loss": 3.2366, + "step": 13768 + }, + { + "epoch": 1.1735276570357112, + "grad_norm": 71.48095846671085, + "learning_rate": 7.63261957546286e-06, + "loss": 3.6265, + "step": 13769 + }, + { + "epoch": 1.1736128867297366, + "grad_norm": 57.787260187278235, + "learning_rate": 7.632198012372899e-06, + "loss": 2.9234, + "step": 13770 + }, + { + "epoch": 1.173698116423762, + "grad_norm": 76.96492495472766, + "learning_rate": 7.631776423396317e-06, + "loss": 3.0889, + "step": 13771 + }, + { + "epoch": 1.1737833461177873, + "grad_norm": 62.77374708630708, + "learning_rate": 7.631354808537258e-06, + "loss": 3.2567, + "step": 13772 + }, + { + "epoch": 1.1738685758118128, + "grad_norm": 57.93167038776986, + "learning_rate": 7.630933167799873e-06, + "loss": 2.8608, + "step": 13773 + }, + { + "epoch": 1.1739538055058383, + "grad_norm": 65.19660791491094, + "learning_rate": 7.630511501188305e-06, + "loss": 2.821, + "step": 13774 + }, + { + "epoch": 1.1740390351998635, + "grad_norm": 48.18204489604491, + "learning_rate": 7.6300898087067e-06, + "loss": 2.2794, + "step": 13775 + }, + { + "epoch": 1.174124264893889, + "grad_norm": 74.93067077384013, + "learning_rate": 7.629668090359206e-06, + "loss": 4.467, + "step": 13776 + }, + { + "epoch": 1.1742094945879145, + "grad_norm": 50.5890398257893, + "learning_rate": 7.629246346149975e-06, + "loss": 3.1569, + "step": 13777 + }, + { + "epoch": 1.17429472428194, + "grad_norm": 49.937025768990054, + "learning_rate": 7.628824576083148e-06, + "loss": 3.6355, + "step": 13778 + }, + { + "epoch": 1.1743799539759652, + "grad_norm": 63.434226916387146, + "learning_rate": 7.628402780162877e-06, + "loss": 4.1593, + "step": 13779 + }, + { + "epoch": 1.1744651836699906, + "grad_norm": 19.016405917809184, + "learning_rate": 7.627980958393308e-06, + "loss": 1.1891, + "step": 13780 + }, + { + "epoch": 1.174550413364016, + "grad_norm": 27.587510405711186, + "learning_rate": 7.627559110778593e-06, + "loss": 2.7003, + "step": 13781 + }, + { + "epoch": 1.1746356430580414, + "grad_norm": 55.9571867540984, + "learning_rate": 7.627137237322876e-06, + "loss": 1.8403, + "step": 13782 + }, + { + "epoch": 1.1747208727520668, + "grad_norm": 31.990472560751364, + "learning_rate": 7.626715338030308e-06, + "loss": 2.6998, + "step": 13783 + }, + { + "epoch": 1.1748061024460923, + "grad_norm": 30.744518455789148, + "learning_rate": 7.626293412905041e-06, + "loss": 2.7621, + "step": 13784 + }, + { + "epoch": 1.1748913321401175, + "grad_norm": 37.27257316108493, + "learning_rate": 7.6258714619512195e-06, + "loss": 3.7329, + "step": 13785 + }, + { + "epoch": 1.174976561834143, + "grad_norm": 52.47477311467355, + "learning_rate": 7.625449485172995e-06, + "loss": 2.9297, + "step": 13786 + }, + { + "epoch": 1.1750617915281685, + "grad_norm": 73.76243252262005, + "learning_rate": 7.62502748257452e-06, + "loss": 2.802, + "step": 13787 + }, + { + "epoch": 1.1751470212221937, + "grad_norm": 37.61667689536003, + "learning_rate": 7.624605454159942e-06, + "loss": 2.2091, + "step": 13788 + }, + { + "epoch": 1.1752322509162192, + "grad_norm": 62.747462824238156, + "learning_rate": 7.624183399933413e-06, + "loss": 2.8594, + "step": 13789 + }, + { + "epoch": 1.1753174806102447, + "grad_norm": 28.173255926290928, + "learning_rate": 7.623761319899081e-06, + "loss": 1.7585, + "step": 13790 + }, + { + "epoch": 1.1754027103042701, + "grad_norm": 55.08996021509114, + "learning_rate": 7.6233392140611e-06, + "loss": 2.6383, + "step": 13791 + }, + { + "epoch": 1.1754879399982954, + "grad_norm": 63.09726568627999, + "learning_rate": 7.622917082423622e-06, + "loss": 3.1799, + "step": 13792 + }, + { + "epoch": 1.1755731696923208, + "grad_norm": 39.27283127953334, + "learning_rate": 7.622494924990796e-06, + "loss": 2.6784, + "step": 13793 + }, + { + "epoch": 1.175658399386346, + "grad_norm": 38.6217533952847, + "learning_rate": 7.622072741766773e-06, + "loss": 2.4956, + "step": 13794 + }, + { + "epoch": 1.1757436290803716, + "grad_norm": 94.73841593628909, + "learning_rate": 7.621650532755707e-06, + "loss": 2.9792, + "step": 13795 + }, + { + "epoch": 1.175828858774397, + "grad_norm": 61.08926834416499, + "learning_rate": 7.6212282979617505e-06, + "loss": 2.7449, + "step": 13796 + }, + { + "epoch": 1.1759140884684225, + "grad_norm": 34.46878832175953, + "learning_rate": 7.620806037389055e-06, + "loss": 2.3287, + "step": 13797 + }, + { + "epoch": 1.1759993181624477, + "grad_norm": 32.144400333808, + "learning_rate": 7.620383751041773e-06, + "loss": 3.2081, + "step": 13798 + }, + { + "epoch": 1.1760845478564732, + "grad_norm": 77.90046827563803, + "learning_rate": 7.6199614389240596e-06, + "loss": 3.0806, + "step": 13799 + }, + { + "epoch": 1.1761697775504987, + "grad_norm": 46.83907552760372, + "learning_rate": 7.6195391010400646e-06, + "loss": 2.6794, + "step": 13800 + }, + { + "epoch": 1.176255007244524, + "grad_norm": 57.32058024134834, + "learning_rate": 7.619116737393944e-06, + "loss": 3.1274, + "step": 13801 + }, + { + "epoch": 1.1763402369385494, + "grad_norm": 85.00690054699014, + "learning_rate": 7.61869434798985e-06, + "loss": 3.4391, + "step": 13802 + }, + { + "epoch": 1.1764254666325749, + "grad_norm": 51.85551447995358, + "learning_rate": 7.6182719328319395e-06, + "loss": 3.5444, + "step": 13803 + }, + { + "epoch": 1.1765106963266, + "grad_norm": 31.544878632997072, + "learning_rate": 7.617849491924365e-06, + "loss": 1.8647, + "step": 13804 + }, + { + "epoch": 1.1765959260206256, + "grad_norm": 35.349062240805075, + "learning_rate": 7.61742702527128e-06, + "loss": 3.2559, + "step": 13805 + }, + { + "epoch": 1.176681155714651, + "grad_norm": 71.12354594036724, + "learning_rate": 7.617004532876839e-06, + "loss": 2.2766, + "step": 13806 + }, + { + "epoch": 1.1767663854086763, + "grad_norm": 57.14043973925901, + "learning_rate": 7.616582014745202e-06, + "loss": 2.226, + "step": 13807 + }, + { + "epoch": 1.1768516151027018, + "grad_norm": 98.57420790467278, + "learning_rate": 7.616159470880518e-06, + "loss": 3.5207, + "step": 13808 + }, + { + "epoch": 1.1769368447967272, + "grad_norm": 68.36344394031502, + "learning_rate": 7.615736901286946e-06, + "loss": 2.6272, + "step": 13809 + }, + { + "epoch": 1.1770220744907527, + "grad_norm": 33.09001402163291, + "learning_rate": 7.615314305968639e-06, + "loss": 2.9808, + "step": 13810 + }, + { + "epoch": 1.177107304184778, + "grad_norm": 46.53168114621247, + "learning_rate": 7.614891684929757e-06, + "loss": 2.985, + "step": 13811 + }, + { + "epoch": 1.1771925338788034, + "grad_norm": 48.764246994452044, + "learning_rate": 7.614469038174454e-06, + "loss": 2.4053, + "step": 13812 + }, + { + "epoch": 1.1772777635728287, + "grad_norm": 39.822181973694526, + "learning_rate": 7.614046365706886e-06, + "loss": 2.9831, + "step": 13813 + }, + { + "epoch": 1.1773629932668541, + "grad_norm": 43.67155709839079, + "learning_rate": 7.613623667531214e-06, + "loss": 3.4385, + "step": 13814 + }, + { + "epoch": 1.1774482229608796, + "grad_norm": 70.48558243285115, + "learning_rate": 7.6132009436515885e-06, + "loss": 3.0619, + "step": 13815 + }, + { + "epoch": 1.177533452654905, + "grad_norm": 106.50221922716517, + "learning_rate": 7.612778194072171e-06, + "loss": 3.2201, + "step": 13816 + }, + { + "epoch": 1.1776186823489303, + "grad_norm": 24.920432505131075, + "learning_rate": 7.612355418797117e-06, + "loss": 2.4923, + "step": 13817 + }, + { + "epoch": 1.1777039120429558, + "grad_norm": 81.49981361239678, + "learning_rate": 7.611932617830588e-06, + "loss": 3.348, + "step": 13818 + }, + { + "epoch": 1.1777891417369812, + "grad_norm": 36.73096105677315, + "learning_rate": 7.611509791176736e-06, + "loss": 2.8589, + "step": 13819 + }, + { + "epoch": 1.1778743714310065, + "grad_norm": 37.80254605597318, + "learning_rate": 7.611086938839727e-06, + "loss": 3.0591, + "step": 13820 + }, + { + "epoch": 1.177959601125032, + "grad_norm": 82.81737198403582, + "learning_rate": 7.610664060823714e-06, + "loss": 4.2012, + "step": 13821 + }, + { + "epoch": 1.1780448308190574, + "grad_norm": 46.542810117361235, + "learning_rate": 7.610241157132856e-06, + "loss": 2.412, + "step": 13822 + }, + { + "epoch": 1.1781300605130827, + "grad_norm": 38.426372914888205, + "learning_rate": 7.609818227771314e-06, + "loss": 2.6133, + "step": 13823 + }, + { + "epoch": 1.1782152902071081, + "grad_norm": 50.47793039345965, + "learning_rate": 7.609395272743247e-06, + "loss": 2.8218, + "step": 13824 + }, + { + "epoch": 1.1783005199011336, + "grad_norm": 82.27277895827226, + "learning_rate": 7.608972292052815e-06, + "loss": 4.1464, + "step": 13825 + }, + { + "epoch": 1.1783857495951588, + "grad_norm": 58.78369643792833, + "learning_rate": 7.608549285704177e-06, + "loss": 2.9373, + "step": 13826 + }, + { + "epoch": 1.1784709792891843, + "grad_norm": 46.983523921188144, + "learning_rate": 7.6081262537014935e-06, + "loss": 2.3741, + "step": 13827 + }, + { + "epoch": 1.1785562089832098, + "grad_norm": 111.73044864865133, + "learning_rate": 7.607703196048923e-06, + "loss": 3.3608, + "step": 13828 + }, + { + "epoch": 1.1786414386772353, + "grad_norm": 56.46278302655001, + "learning_rate": 7.60728011275063e-06, + "loss": 2.3693, + "step": 13829 + }, + { + "epoch": 1.1787266683712605, + "grad_norm": 54.33905321870497, + "learning_rate": 7.606857003810771e-06, + "loss": 3.3044, + "step": 13830 + }, + { + "epoch": 1.178811898065286, + "grad_norm": 88.50051079610395, + "learning_rate": 7.606433869233511e-06, + "loss": 3.6524, + "step": 13831 + }, + { + "epoch": 1.1788971277593114, + "grad_norm": 68.60075049114332, + "learning_rate": 7.606010709023008e-06, + "loss": 3.1139, + "step": 13832 + }, + { + "epoch": 1.1789823574533367, + "grad_norm": 51.50925527804785, + "learning_rate": 7.605587523183426e-06, + "loss": 3.0541, + "step": 13833 + }, + { + "epoch": 1.1790675871473622, + "grad_norm": 100.73474022012027, + "learning_rate": 7.605164311718926e-06, + "loss": 4.0891, + "step": 13834 + }, + { + "epoch": 1.1791528168413876, + "grad_norm": 41.1859661591862, + "learning_rate": 7.604741074633673e-06, + "loss": 3.8689, + "step": 13835 + }, + { + "epoch": 1.1792380465354129, + "grad_norm": 87.76849818857531, + "learning_rate": 7.604317811931823e-06, + "loss": 2.7521, + "step": 13836 + }, + { + "epoch": 1.1793232762294383, + "grad_norm": 142.44455443177233, + "learning_rate": 7.603894523617543e-06, + "loss": 4.8607, + "step": 13837 + }, + { + "epoch": 1.1794085059234638, + "grad_norm": 40.71060025756964, + "learning_rate": 7.603471209694995e-06, + "loss": 2.6911, + "step": 13838 + }, + { + "epoch": 1.179493735617489, + "grad_norm": 29.132211415660503, + "learning_rate": 7.603047870168343e-06, + "loss": 2.4281, + "step": 13839 + }, + { + "epoch": 1.1795789653115145, + "grad_norm": 49.60002768807291, + "learning_rate": 7.602624505041748e-06, + "loss": 2.746, + "step": 13840 + }, + { + "epoch": 1.17966419500554, + "grad_norm": 83.36422287440092, + "learning_rate": 7.6022011143193766e-06, + "loss": 3.0997, + "step": 13841 + }, + { + "epoch": 1.1797494246995655, + "grad_norm": 44.71724154417855, + "learning_rate": 7.6017776980053905e-06, + "loss": 3.717, + "step": 13842 + }, + { + "epoch": 1.1798346543935907, + "grad_norm": 46.48442798633224, + "learning_rate": 7.6013542561039545e-06, + "loss": 2.5929, + "step": 13843 + }, + { + "epoch": 1.1799198840876162, + "grad_norm": 47.06469457538996, + "learning_rate": 7.600930788619234e-06, + "loss": 3.1796, + "step": 13844 + }, + { + "epoch": 1.1800051137816414, + "grad_norm": 29.972824684284063, + "learning_rate": 7.60050729555539e-06, + "loss": 2.4495, + "step": 13845 + }, + { + "epoch": 1.1800903434756669, + "grad_norm": 47.53451088065134, + "learning_rate": 7.600083776916594e-06, + "loss": 3.596, + "step": 13846 + }, + { + "epoch": 1.1801755731696923, + "grad_norm": 28.74127626884854, + "learning_rate": 7.599660232707005e-06, + "loss": 2.2895, + "step": 13847 + }, + { + "epoch": 1.1802608028637178, + "grad_norm": 65.4340253375124, + "learning_rate": 7.599236662930791e-06, + "loss": 3.3137, + "step": 13848 + }, + { + "epoch": 1.180346032557743, + "grad_norm": 52.4794754112699, + "learning_rate": 7.598813067592117e-06, + "loss": 2.999, + "step": 13849 + }, + { + "epoch": 1.1804312622517685, + "grad_norm": 27.255970816739573, + "learning_rate": 7.598389446695149e-06, + "loss": 2.8812, + "step": 13850 + }, + { + "epoch": 1.180516491945794, + "grad_norm": 84.68025329841328, + "learning_rate": 7.5979658002440534e-06, + "loss": 3.7906, + "step": 13851 + }, + { + "epoch": 1.1806017216398192, + "grad_norm": 39.69625464591955, + "learning_rate": 7.597542128242998e-06, + "loss": 3.1493, + "step": 13852 + }, + { + "epoch": 1.1806869513338447, + "grad_norm": 33.21484298347103, + "learning_rate": 7.597118430696146e-06, + "loss": 1.9003, + "step": 13853 + }, + { + "epoch": 1.1807721810278702, + "grad_norm": 46.94321963286928, + "learning_rate": 7.596694707607668e-06, + "loss": 3.8572, + "step": 13854 + }, + { + "epoch": 1.1808574107218954, + "grad_norm": 59.80093450502656, + "learning_rate": 7.596270958981728e-06, + "loss": 3.4832, + "step": 13855 + }, + { + "epoch": 1.180942640415921, + "grad_norm": 44.54912067309979, + "learning_rate": 7.595847184822494e-06, + "loss": 2.7299, + "step": 13856 + }, + { + "epoch": 1.1810278701099464, + "grad_norm": 53.86751034488923, + "learning_rate": 7.595423385134137e-06, + "loss": 2.4244, + "step": 13857 + }, + { + "epoch": 1.1811130998039716, + "grad_norm": 35.84934999649657, + "learning_rate": 7.59499955992082e-06, + "loss": 2.8524, + "step": 13858 + }, + { + "epoch": 1.181198329497997, + "grad_norm": 31.653254873422895, + "learning_rate": 7.594575709186716e-06, + "loss": 2.1034, + "step": 13859 + }, + { + "epoch": 1.1812835591920225, + "grad_norm": 36.06106447874619, + "learning_rate": 7.594151832935987e-06, + "loss": 2.7166, + "step": 13860 + }, + { + "epoch": 1.181368788886048, + "grad_norm": 64.00017609249673, + "learning_rate": 7.593727931172809e-06, + "loss": 2.9087, + "step": 13861 + }, + { + "epoch": 1.1814540185800733, + "grad_norm": 74.84463559372436, + "learning_rate": 7.593304003901345e-06, + "loss": 3.4013, + "step": 13862 + }, + { + "epoch": 1.1815392482740987, + "grad_norm": 35.596528585704135, + "learning_rate": 7.592880051125769e-06, + "loss": 2.8615, + "step": 13863 + }, + { + "epoch": 1.181624477968124, + "grad_norm": 62.93850420651414, + "learning_rate": 7.592456072850245e-06, + "loss": 3.4824, + "step": 13864 + }, + { + "epoch": 1.1817097076621494, + "grad_norm": 63.126214369284675, + "learning_rate": 7.592032069078948e-06, + "loss": 2.7018, + "step": 13865 + }, + { + "epoch": 1.181794937356175, + "grad_norm": 164.09797364096937, + "learning_rate": 7.5916080398160454e-06, + "loss": 4.4645, + "step": 13866 + }, + { + "epoch": 1.1818801670502004, + "grad_norm": 56.6287770289636, + "learning_rate": 7.591183985065708e-06, + "loss": 2.5227, + "step": 13867 + }, + { + "epoch": 1.1819653967442256, + "grad_norm": 72.80432711411058, + "learning_rate": 7.590759904832104e-06, + "loss": 3.1637, + "step": 13868 + }, + { + "epoch": 1.182050626438251, + "grad_norm": 35.519602868124736, + "learning_rate": 7.590335799119406e-06, + "loss": 2.3574, + "step": 13869 + }, + { + "epoch": 1.1821358561322766, + "grad_norm": 45.12334232600177, + "learning_rate": 7.589911667931786e-06, + "loss": 2.1002, + "step": 13870 + }, + { + "epoch": 1.1822210858263018, + "grad_norm": 71.7899561300006, + "learning_rate": 7.589487511273414e-06, + "loss": 2.717, + "step": 13871 + }, + { + "epoch": 1.1823063155203273, + "grad_norm": 53.54312008546548, + "learning_rate": 7.5890633291484605e-06, + "loss": 2.9726, + "step": 13872 + }, + { + "epoch": 1.1823915452143527, + "grad_norm": 70.87876730884749, + "learning_rate": 7.588639121561096e-06, + "loss": 2.0441, + "step": 13873 + }, + { + "epoch": 1.182476774908378, + "grad_norm": 42.5923876875865, + "learning_rate": 7.588214888515499e-06, + "loss": 2.3532, + "step": 13874 + }, + { + "epoch": 1.1825620046024035, + "grad_norm": 78.5081954594878, + "learning_rate": 7.587790630015833e-06, + "loss": 4.2648, + "step": 13875 + }, + { + "epoch": 1.182647234296429, + "grad_norm": 37.65033037885424, + "learning_rate": 7.587366346066276e-06, + "loss": 2.337, + "step": 13876 + }, + { + "epoch": 1.1827324639904542, + "grad_norm": 83.51210437966147, + "learning_rate": 7.586942036670999e-06, + "loss": 3.2065, + "step": 13877 + }, + { + "epoch": 1.1828176936844796, + "grad_norm": 46.25899562256947, + "learning_rate": 7.586517701834175e-06, + "loss": 3.9403, + "step": 13878 + }, + { + "epoch": 1.1829029233785051, + "grad_norm": 49.09715480989018, + "learning_rate": 7.586093341559977e-06, + "loss": 3.2472, + "step": 13879 + }, + { + "epoch": 1.1829881530725306, + "grad_norm": 43.37794067335251, + "learning_rate": 7.585668955852579e-06, + "loss": 4.42, + "step": 13880 + }, + { + "epoch": 1.1830733827665558, + "grad_norm": 27.474776963130594, + "learning_rate": 7.585244544716152e-06, + "loss": 1.9265, + "step": 13881 + }, + { + "epoch": 1.1831586124605813, + "grad_norm": 33.506317648787345, + "learning_rate": 7.584820108154875e-06, + "loss": 2.9173, + "step": 13882 + }, + { + "epoch": 1.1832438421546068, + "grad_norm": 21.96602914603602, + "learning_rate": 7.5843956461729195e-06, + "loss": 1.6645, + "step": 13883 + }, + { + "epoch": 1.183329071848632, + "grad_norm": 18.144789472366757, + "learning_rate": 7.5839711587744595e-06, + "loss": 1.7749, + "step": 13884 + }, + { + "epoch": 1.1834143015426575, + "grad_norm": 45.42003904557437, + "learning_rate": 7.5835466459636686e-06, + "loss": 2.3507, + "step": 13885 + }, + { + "epoch": 1.183499531236683, + "grad_norm": 34.517701334581794, + "learning_rate": 7.583122107744723e-06, + "loss": 2.677, + "step": 13886 + }, + { + "epoch": 1.1835847609307082, + "grad_norm": 67.22180414651646, + "learning_rate": 7.5826975441218e-06, + "loss": 3.4021, + "step": 13887 + }, + { + "epoch": 1.1836699906247337, + "grad_norm": 38.570298944446264, + "learning_rate": 7.58227295509907e-06, + "loss": 3.1289, + "step": 13888 + }, + { + "epoch": 1.1837552203187591, + "grad_norm": 34.23146545324738, + "learning_rate": 7.581848340680714e-06, + "loss": 2.8167, + "step": 13889 + }, + { + "epoch": 1.1838404500127844, + "grad_norm": 44.452357332346395, + "learning_rate": 7.581423700870905e-06, + "loss": 2.2142, + "step": 13890 + }, + { + "epoch": 1.1839256797068098, + "grad_norm": 37.41763064344112, + "learning_rate": 7.580999035673818e-06, + "loss": 3.0941, + "step": 13891 + }, + { + "epoch": 1.1840109094008353, + "grad_norm": 127.80351699073837, + "learning_rate": 7.580574345093632e-06, + "loss": 2.187, + "step": 13892 + }, + { + "epoch": 1.1840961390948606, + "grad_norm": 40.265996112817994, + "learning_rate": 7.580149629134523e-06, + "loss": 3.1375, + "step": 13893 + }, + { + "epoch": 1.184181368788886, + "grad_norm": 37.42185633843022, + "learning_rate": 7.579724887800668e-06, + "loss": 2.5775, + "step": 13894 + }, + { + "epoch": 1.1842665984829115, + "grad_norm": 46.533516448035556, + "learning_rate": 7.579300121096244e-06, + "loss": 3.5347, + "step": 13895 + }, + { + "epoch": 1.1843518281769367, + "grad_norm": 48.58114605107596, + "learning_rate": 7.578875329025427e-06, + "loss": 3.1691, + "step": 13896 + }, + { + "epoch": 1.1844370578709622, + "grad_norm": 67.83069808586782, + "learning_rate": 7.578450511592397e-06, + "loss": 3.3122, + "step": 13897 + }, + { + "epoch": 1.1845222875649877, + "grad_norm": 33.507466117633726, + "learning_rate": 7.57802566880133e-06, + "loss": 2.8183, + "step": 13898 + }, + { + "epoch": 1.1846075172590131, + "grad_norm": 75.32638013915799, + "learning_rate": 7.577600800656406e-06, + "loss": 3.7933, + "step": 13899 + }, + { + "epoch": 1.1846927469530384, + "grad_norm": 37.772850759483255, + "learning_rate": 7.577175907161801e-06, + "loss": 3.4133, + "step": 13900 + }, + { + "epoch": 1.1847779766470639, + "grad_norm": 39.126199909812996, + "learning_rate": 7.576750988321697e-06, + "loss": 2.3297, + "step": 13901 + }, + { + "epoch": 1.1848632063410893, + "grad_norm": 60.131549057814915, + "learning_rate": 7.576326044140269e-06, + "loss": 2.9115, + "step": 13902 + }, + { + "epoch": 1.1849484360351146, + "grad_norm": 63.312629082394096, + "learning_rate": 7.5759010746216985e-06, + "loss": 2.9745, + "step": 13903 + }, + { + "epoch": 1.18503366572914, + "grad_norm": 36.96779008221307, + "learning_rate": 7.5754760797701654e-06, + "loss": 2.4096, + "step": 13904 + }, + { + "epoch": 1.1851188954231655, + "grad_norm": 58.71631281878677, + "learning_rate": 7.575051059589848e-06, + "loss": 2.8673, + "step": 13905 + }, + { + "epoch": 1.1852041251171908, + "grad_norm": 53.739519328772076, + "learning_rate": 7.574626014084927e-06, + "loss": 3.1103, + "step": 13906 + }, + { + "epoch": 1.1852893548112162, + "grad_norm": 48.907331891335275, + "learning_rate": 7.57420094325958e-06, + "loss": 2.8144, + "step": 13907 + }, + { + "epoch": 1.1853745845052417, + "grad_norm": 70.27986457069211, + "learning_rate": 7.573775847117992e-06, + "loss": 2.804, + "step": 13908 + }, + { + "epoch": 1.185459814199267, + "grad_norm": 37.91179452599851, + "learning_rate": 7.57335072566434e-06, + "loss": 2.7472, + "step": 13909 + }, + { + "epoch": 1.1855450438932924, + "grad_norm": 45.73615474477137, + "learning_rate": 7.572925578902808e-06, + "loss": 2.6296, + "step": 13910 + }, + { + "epoch": 1.1856302735873179, + "grad_norm": 42.653545687596186, + "learning_rate": 7.572500406837573e-06, + "loss": 2.6963, + "step": 13911 + }, + { + "epoch": 1.1857155032813433, + "grad_norm": 52.03045823796845, + "learning_rate": 7.57207520947282e-06, + "loss": 3.3316, + "step": 13912 + }, + { + "epoch": 1.1858007329753686, + "grad_norm": 54.63843231729781, + "learning_rate": 7.571649986812728e-06, + "loss": 3.9513, + "step": 13913 + }, + { + "epoch": 1.185885962669394, + "grad_norm": 59.73551230815861, + "learning_rate": 7.571224738861483e-06, + "loss": 2.8389, + "step": 13914 + }, + { + "epoch": 1.1859711923634193, + "grad_norm": 42.38972817410551, + "learning_rate": 7.570799465623262e-06, + "loss": 3.0627, + "step": 13915 + }, + { + "epoch": 1.1860564220574448, + "grad_norm": 103.5689151363263, + "learning_rate": 7.57037416710225e-06, + "loss": 2.4011, + "step": 13916 + }, + { + "epoch": 1.1861416517514702, + "grad_norm": 91.71245123999958, + "learning_rate": 7.56994884330263e-06, + "loss": 3.8647, + "step": 13917 + }, + { + "epoch": 1.1862268814454957, + "grad_norm": 49.69756546886137, + "learning_rate": 7.5695234942285845e-06, + "loss": 3.0931, + "step": 13918 + }, + { + "epoch": 1.186312111139521, + "grad_norm": 39.078156581268715, + "learning_rate": 7.569098119884296e-06, + "loss": 3.3784, + "step": 13919 + }, + { + "epoch": 1.1863973408335464, + "grad_norm": 34.077802472919664, + "learning_rate": 7.5686727202739484e-06, + "loss": 2.2467, + "step": 13920 + }, + { + "epoch": 1.186482570527572, + "grad_norm": 49.273024744577, + "learning_rate": 7.5682472954017265e-06, + "loss": 2.9105, + "step": 13921 + }, + { + "epoch": 1.1865678002215971, + "grad_norm": 34.26249850085493, + "learning_rate": 7.567821845271811e-06, + "loss": 2.1627, + "step": 13922 + }, + { + "epoch": 1.1866530299156226, + "grad_norm": 56.855282766671635, + "learning_rate": 7.567396369888389e-06, + "loss": 2.8907, + "step": 13923 + }, + { + "epoch": 1.186738259609648, + "grad_norm": 39.443145582139316, + "learning_rate": 7.566970869255643e-06, + "loss": 2.999, + "step": 13924 + }, + { + "epoch": 1.1868234893036733, + "grad_norm": 68.26685454391395, + "learning_rate": 7.56654534337776e-06, + "loss": 4.3087, + "step": 13925 + }, + { + "epoch": 1.1869087189976988, + "grad_norm": 56.678384761638085, + "learning_rate": 7.566119792258924e-06, + "loss": 3.0173, + "step": 13926 + }, + { + "epoch": 1.1869939486917243, + "grad_norm": 61.08661512894285, + "learning_rate": 7.565694215903317e-06, + "loss": 4.5631, + "step": 13927 + }, + { + "epoch": 1.1870791783857495, + "grad_norm": 51.82546314865243, + "learning_rate": 7.565268614315128e-06, + "loss": 2.3246, + "step": 13928 + }, + { + "epoch": 1.187164408079775, + "grad_norm": 75.70541245357172, + "learning_rate": 7.564842987498542e-06, + "loss": 3.2469, + "step": 13929 + }, + { + "epoch": 1.1872496377738004, + "grad_norm": 64.3295711964557, + "learning_rate": 7.564417335457743e-06, + "loss": 3.5718, + "step": 13930 + }, + { + "epoch": 1.187334867467826, + "grad_norm": 68.19506375529778, + "learning_rate": 7.563991658196921e-06, + "loss": 3.293, + "step": 13931 + }, + { + "epoch": 1.1874200971618512, + "grad_norm": 35.73941501857819, + "learning_rate": 7.563565955720258e-06, + "loss": 2.4936, + "step": 13932 + }, + { + "epoch": 1.1875053268558766, + "grad_norm": 39.44711090449598, + "learning_rate": 7.563140228031944e-06, + "loss": 2.9393, + "step": 13933 + }, + { + "epoch": 1.1875905565499019, + "grad_norm": 99.8803359494448, + "learning_rate": 7.562714475136161e-06, + "loss": 1.993, + "step": 13934 + }, + { + "epoch": 1.1876757862439273, + "grad_norm": 51.74906570165192, + "learning_rate": 7.562288697037102e-06, + "loss": 3.3599, + "step": 13935 + }, + { + "epoch": 1.1877610159379528, + "grad_norm": 59.24617180821254, + "learning_rate": 7.561862893738953e-06, + "loss": 3.6501, + "step": 13936 + }, + { + "epoch": 1.1878462456319783, + "grad_norm": 36.092356883731824, + "learning_rate": 7.561437065245898e-06, + "loss": 3.2329, + "step": 13937 + }, + { + "epoch": 1.1879314753260035, + "grad_norm": 169.4415264675829, + "learning_rate": 7.561011211562128e-06, + "loss": 2.5889, + "step": 13938 + }, + { + "epoch": 1.188016705020029, + "grad_norm": 58.33442622827888, + "learning_rate": 7.56058533269183e-06, + "loss": 3.2136, + "step": 13939 + }, + { + "epoch": 1.1881019347140545, + "grad_norm": 97.25406046915741, + "learning_rate": 7.5601594286391955e-06, + "loss": 3.2234, + "step": 13940 + }, + { + "epoch": 1.1881871644080797, + "grad_norm": 31.635296549402028, + "learning_rate": 7.559733499408407e-06, + "loss": 1.8276, + "step": 13941 + }, + { + "epoch": 1.1882723941021052, + "grad_norm": 45.034858865024866, + "learning_rate": 7.559307545003658e-06, + "loss": 3.6582, + "step": 13942 + }, + { + "epoch": 1.1883576237961306, + "grad_norm": 35.69114706620539, + "learning_rate": 7.5588815654291345e-06, + "loss": 3.6184, + "step": 13943 + }, + { + "epoch": 1.1884428534901559, + "grad_norm": 24.938096057326213, + "learning_rate": 7.558455560689031e-06, + "loss": 2.5745, + "step": 13944 + }, + { + "epoch": 1.1885280831841813, + "grad_norm": 55.83837552355469, + "learning_rate": 7.558029530787532e-06, + "loss": 3.2318, + "step": 13945 + }, + { + "epoch": 1.1886133128782068, + "grad_norm": 146.92801723566905, + "learning_rate": 7.557603475728827e-06, + "loss": 3.8082, + "step": 13946 + }, + { + "epoch": 1.188698542572232, + "grad_norm": 72.31062045003942, + "learning_rate": 7.5571773955171124e-06, + "loss": 3.955, + "step": 13947 + }, + { + "epoch": 1.1887837722662575, + "grad_norm": 34.59207656675834, + "learning_rate": 7.556751290156571e-06, + "loss": 2.605, + "step": 13948 + }, + { + "epoch": 1.188869001960283, + "grad_norm": 60.48487211194281, + "learning_rate": 7.556325159651398e-06, + "loss": 4.3031, + "step": 13949 + }, + { + "epoch": 1.1889542316543085, + "grad_norm": 32.23733082279425, + "learning_rate": 7.555899004005781e-06, + "loss": 2.694, + "step": 13950 + }, + { + "epoch": 1.1890394613483337, + "grad_norm": 35.74827584988356, + "learning_rate": 7.555472823223915e-06, + "loss": 2.9038, + "step": 13951 + }, + { + "epoch": 1.1891246910423592, + "grad_norm": 55.15525842140474, + "learning_rate": 7.555046617309989e-06, + "loss": 3.0416, + "step": 13952 + }, + { + "epoch": 1.1892099207363847, + "grad_norm": 39.80541927236206, + "learning_rate": 7.554620386268194e-06, + "loss": 3.4476, + "step": 13953 + }, + { + "epoch": 1.18929515043041, + "grad_norm": 50.848582019607505, + "learning_rate": 7.554194130102722e-06, + "loss": 4.1553, + "step": 13954 + }, + { + "epoch": 1.1893803801244354, + "grad_norm": 32.69328863396648, + "learning_rate": 7.553767848817768e-06, + "loss": 1.8131, + "step": 13955 + }, + { + "epoch": 1.1894656098184608, + "grad_norm": 103.66136953878718, + "learning_rate": 7.55334154241752e-06, + "loss": 4.5011, + "step": 13956 + }, + { + "epoch": 1.189550839512486, + "grad_norm": 51.52386660679524, + "learning_rate": 7.552915210906173e-06, + "loss": 2.8569, + "step": 13957 + }, + { + "epoch": 1.1896360692065115, + "grad_norm": 36.13877249839106, + "learning_rate": 7.552488854287918e-06, + "loss": 3.2881, + "step": 13958 + }, + { + "epoch": 1.189721298900537, + "grad_norm": 60.80401291440621, + "learning_rate": 7.5520624725669525e-06, + "loss": 3.1967, + "step": 13959 + }, + { + "epoch": 1.1898065285945623, + "grad_norm": 39.29851799015351, + "learning_rate": 7.551636065747466e-06, + "loss": 2.6841, + "step": 13960 + }, + { + "epoch": 1.1898917582885877, + "grad_norm": 52.24735215501077, + "learning_rate": 7.55120963383365e-06, + "loss": 3.6869, + "step": 13961 + }, + { + "epoch": 1.1899769879826132, + "grad_norm": 46.52782558481333, + "learning_rate": 7.550783176829703e-06, + "loss": 2.5677, + "step": 13962 + }, + { + "epoch": 1.1900622176766384, + "grad_norm": 69.73523395550298, + "learning_rate": 7.550356694739815e-06, + "loss": 3.5499, + "step": 13963 + }, + { + "epoch": 1.190147447370664, + "grad_norm": 58.49926013111827, + "learning_rate": 7.549930187568186e-06, + "loss": 3.9892, + "step": 13964 + }, + { + "epoch": 1.1902326770646894, + "grad_norm": 29.303061668752584, + "learning_rate": 7.549503655319004e-06, + "loss": 2.3577, + "step": 13965 + }, + { + "epoch": 1.1903179067587146, + "grad_norm": 92.20785956996717, + "learning_rate": 7.549077097996468e-06, + "loss": 1.8291, + "step": 13966 + }, + { + "epoch": 1.19040313645274, + "grad_norm": 69.28072160685481, + "learning_rate": 7.548650515604769e-06, + "loss": 2.9288, + "step": 13967 + }, + { + "epoch": 1.1904883661467656, + "grad_norm": 44.872127329126634, + "learning_rate": 7.548223908148109e-06, + "loss": 2.9575, + "step": 13968 + }, + { + "epoch": 1.190573595840791, + "grad_norm": 130.11230942247457, + "learning_rate": 7.547797275630677e-06, + "loss": 2.4779, + "step": 13969 + }, + { + "epoch": 1.1906588255348163, + "grad_norm": 59.3303139228807, + "learning_rate": 7.547370618056672e-06, + "loss": 2.5256, + "step": 13970 + }, + { + "epoch": 1.1907440552288417, + "grad_norm": 62.99088247728135, + "learning_rate": 7.546943935430289e-06, + "loss": 3.7892, + "step": 13971 + }, + { + "epoch": 1.1908292849228672, + "grad_norm": 59.35733617043973, + "learning_rate": 7.546517227755724e-06, + "loss": 4.241, + "step": 13972 + }, + { + "epoch": 1.1909145146168925, + "grad_norm": 50.804379372235495, + "learning_rate": 7.546090495037175e-06, + "loss": 2.8277, + "step": 13973 + }, + { + "epoch": 1.190999744310918, + "grad_norm": 51.67881078881667, + "learning_rate": 7.545663737278837e-06, + "loss": 2.5357, + "step": 13974 + }, + { + "epoch": 1.1910849740049434, + "grad_norm": 49.61376974114693, + "learning_rate": 7.545236954484907e-06, + "loss": 3.4868, + "step": 13975 + }, + { + "epoch": 1.1911702036989686, + "grad_norm": 33.81816717485593, + "learning_rate": 7.544810146659583e-06, + "loss": 3.1388, + "step": 13976 + }, + { + "epoch": 1.1912554333929941, + "grad_norm": 38.95397436478045, + "learning_rate": 7.544383313807063e-06, + "loss": 2.778, + "step": 13977 + }, + { + "epoch": 1.1913406630870196, + "grad_norm": 74.9067844854888, + "learning_rate": 7.543956455931545e-06, + "loss": 3.3893, + "step": 13978 + }, + { + "epoch": 1.1914258927810448, + "grad_norm": 56.35433875896685, + "learning_rate": 7.543529573037225e-06, + "loss": 2.8079, + "step": 13979 + }, + { + "epoch": 1.1915111224750703, + "grad_norm": 77.61374924364857, + "learning_rate": 7.543102665128302e-06, + "loss": 3.0027, + "step": 13980 + }, + { + "epoch": 1.1915963521690958, + "grad_norm": 22.383946420179768, + "learning_rate": 7.542675732208975e-06, + "loss": 1.8666, + "step": 13981 + }, + { + "epoch": 1.1916815818631212, + "grad_norm": 54.737196374434035, + "learning_rate": 7.542248774283442e-06, + "loss": 3.1045, + "step": 13982 + }, + { + "epoch": 1.1917668115571465, + "grad_norm": 54.15889544091364, + "learning_rate": 7.541821791355906e-06, + "loss": 3.4614, + "step": 13983 + }, + { + "epoch": 1.191852041251172, + "grad_norm": 81.30886148545343, + "learning_rate": 7.541394783430559e-06, + "loss": 3.6079, + "step": 13984 + }, + { + "epoch": 1.1919372709451972, + "grad_norm": 63.37037977786618, + "learning_rate": 7.540967750511605e-06, + "loss": 3.4451, + "step": 13985 + }, + { + "epoch": 1.1920225006392227, + "grad_norm": 61.320194014865606, + "learning_rate": 7.540540692603243e-06, + "loss": 4.0192, + "step": 13986 + }, + { + "epoch": 1.1921077303332481, + "grad_norm": 110.5213879725629, + "learning_rate": 7.540113609709674e-06, + "loss": 2.8222, + "step": 13987 + }, + { + "epoch": 1.1921929600272736, + "grad_norm": 31.05794792270597, + "learning_rate": 7.539686501835095e-06, + "loss": 2.4357, + "step": 13988 + }, + { + "epoch": 1.1922781897212988, + "grad_norm": 45.679756976409486, + "learning_rate": 7.539259368983709e-06, + "loss": 1.612, + "step": 13989 + }, + { + "epoch": 1.1923634194153243, + "grad_norm": 25.43847654656822, + "learning_rate": 7.538832211159718e-06, + "loss": 1.959, + "step": 13990 + }, + { + "epoch": 1.1924486491093498, + "grad_norm": 53.6055349166001, + "learning_rate": 7.538405028367319e-06, + "loss": 2.9584, + "step": 13991 + }, + { + "epoch": 1.192533878803375, + "grad_norm": 34.8208962657424, + "learning_rate": 7.537977820610716e-06, + "loss": 2.8067, + "step": 13992 + }, + { + "epoch": 1.1926191084974005, + "grad_norm": 45.540103414744095, + "learning_rate": 7.537550587894109e-06, + "loss": 3.6874, + "step": 13993 + }, + { + "epoch": 1.192704338191426, + "grad_norm": 70.77520004021729, + "learning_rate": 7.537123330221701e-06, + "loss": 3.1055, + "step": 13994 + }, + { + "epoch": 1.1927895678854512, + "grad_norm": 32.64013703337983, + "learning_rate": 7.536696047597693e-06, + "loss": 2.6766, + "step": 13995 + }, + { + "epoch": 1.1928747975794767, + "grad_norm": 85.59749504067018, + "learning_rate": 7.536268740026287e-06, + "loss": 2.6276, + "step": 13996 + }, + { + "epoch": 1.1929600272735021, + "grad_norm": 61.109384099986514, + "learning_rate": 7.535841407511685e-06, + "loss": 3.8215, + "step": 13997 + }, + { + "epoch": 1.1930452569675274, + "grad_norm": 98.92780212915915, + "learning_rate": 7.535414050058092e-06, + "loss": 3.1358, + "step": 13998 + }, + { + "epoch": 1.1931304866615529, + "grad_norm": 64.85825389755935, + "learning_rate": 7.534986667669708e-06, + "loss": 2.6417, + "step": 13999 + }, + { + "epoch": 1.1932157163555783, + "grad_norm": 58.671032187443494, + "learning_rate": 7.53455926035074e-06, + "loss": 2.3936, + "step": 14000 + }, + { + "epoch": 1.1933009460496038, + "grad_norm": 36.56935286078576, + "learning_rate": 7.534131828105385e-06, + "loss": 2.9083, + "step": 14001 + }, + { + "epoch": 1.193386175743629, + "grad_norm": 57.087868075080216, + "learning_rate": 7.5337043709378535e-06, + "loss": 3.2065, + "step": 14002 + }, + { + "epoch": 1.1934714054376545, + "grad_norm": 38.897077354261384, + "learning_rate": 7.533276888852346e-06, + "loss": 2.1862, + "step": 14003 + }, + { + "epoch": 1.1935566351316798, + "grad_norm": 41.84952701453955, + "learning_rate": 7.532849381853064e-06, + "loss": 1.8297, + "step": 14004 + }, + { + "epoch": 1.1936418648257052, + "grad_norm": 33.21475653620581, + "learning_rate": 7.532421849944218e-06, + "loss": 2.6872, + "step": 14005 + }, + { + "epoch": 1.1937270945197307, + "grad_norm": 48.655240856689815, + "learning_rate": 7.531994293130007e-06, + "loss": 3.1467, + "step": 14006 + }, + { + "epoch": 1.1938123242137562, + "grad_norm": 22.793380869009436, + "learning_rate": 7.5315667114146405e-06, + "loss": 1.9158, + "step": 14007 + }, + { + "epoch": 1.1938975539077814, + "grad_norm": 57.60860512219087, + "learning_rate": 7.531139104802321e-06, + "loss": 2.8668, + "step": 14008 + }, + { + "epoch": 1.1939827836018069, + "grad_norm": 46.61177425748574, + "learning_rate": 7.530711473297252e-06, + "loss": 2.9302, + "step": 14009 + }, + { + "epoch": 1.1940680132958323, + "grad_norm": 68.79470798232211, + "learning_rate": 7.530283816903643e-06, + "loss": 3.159, + "step": 14010 + }, + { + "epoch": 1.1941532429898576, + "grad_norm": 66.74104948684244, + "learning_rate": 7.529856135625699e-06, + "loss": 4.0133, + "step": 14011 + }, + { + "epoch": 1.194238472683883, + "grad_norm": 38.64266983931882, + "learning_rate": 7.529428429467623e-06, + "loss": 2.9973, + "step": 14012 + }, + { + "epoch": 1.1943237023779085, + "grad_norm": 38.28529665027894, + "learning_rate": 7.529000698433624e-06, + "loss": 2.5368, + "step": 14013 + }, + { + "epoch": 1.1944089320719338, + "grad_norm": 44.400519126174714, + "learning_rate": 7.52857294252791e-06, + "loss": 2.4947, + "step": 14014 + }, + { + "epoch": 1.1944941617659592, + "grad_norm": 52.93189982750943, + "learning_rate": 7.528145161754684e-06, + "loss": 2.3078, + "step": 14015 + }, + { + "epoch": 1.1945793914599847, + "grad_norm": 36.88131749720555, + "learning_rate": 7.527717356118156e-06, + "loss": 2.6154, + "step": 14016 + }, + { + "epoch": 1.19466462115401, + "grad_norm": 49.90404178712014, + "learning_rate": 7.52728952562253e-06, + "loss": 2.5273, + "step": 14017 + }, + { + "epoch": 1.1947498508480354, + "grad_norm": 54.34682104222884, + "learning_rate": 7.526861670272019e-06, + "loss": 3.2933, + "step": 14018 + }, + { + "epoch": 1.194835080542061, + "grad_norm": 47.5421512120005, + "learning_rate": 7.526433790070825e-06, + "loss": 2.329, + "step": 14019 + }, + { + "epoch": 1.1949203102360864, + "grad_norm": 30.90742704494676, + "learning_rate": 7.52600588502316e-06, + "loss": 2.5291, + "step": 14020 + }, + { + "epoch": 1.1950055399301116, + "grad_norm": 42.010108380550946, + "learning_rate": 7.52557795513323e-06, + "loss": 3.2014, + "step": 14021 + }, + { + "epoch": 1.195090769624137, + "grad_norm": 37.349833176613345, + "learning_rate": 7.5251500004052446e-06, + "loss": 2.1808, + "step": 14022 + }, + { + "epoch": 1.1951759993181625, + "grad_norm": 31.340397830256048, + "learning_rate": 7.524722020843413e-06, + "loss": 2.3262, + "step": 14023 + }, + { + "epoch": 1.1952612290121878, + "grad_norm": 67.56759646080631, + "learning_rate": 7.524294016451943e-06, + "loss": 2.9543, + "step": 14024 + }, + { + "epoch": 1.1953464587062133, + "grad_norm": 50.55858249364835, + "learning_rate": 7.523865987235044e-06, + "loss": 3.4933, + "step": 14025 + }, + { + "epoch": 1.1954316884002387, + "grad_norm": 56.78981359052401, + "learning_rate": 7.523437933196927e-06, + "loss": 2.5418, + "step": 14026 + }, + { + "epoch": 1.195516918094264, + "grad_norm": 77.62092902324346, + "learning_rate": 7.523009854341798e-06, + "loss": 3.1514, + "step": 14027 + }, + { + "epoch": 1.1956021477882894, + "grad_norm": 44.350299019104135, + "learning_rate": 7.522581750673872e-06, + "loss": 3.4886, + "step": 14028 + }, + { + "epoch": 1.195687377482315, + "grad_norm": 51.885613997641684, + "learning_rate": 7.522153622197356e-06, + "loss": 3.1557, + "step": 14029 + }, + { + "epoch": 1.1957726071763402, + "grad_norm": 90.44752177309786, + "learning_rate": 7.521725468916463e-06, + "loss": 2.9685, + "step": 14030 + }, + { + "epoch": 1.1958578368703656, + "grad_norm": 43.25568139860958, + "learning_rate": 7.521297290835401e-06, + "loss": 2.8771, + "step": 14031 + }, + { + "epoch": 1.195943066564391, + "grad_norm": 32.87020701456878, + "learning_rate": 7.520869087958381e-06, + "loss": 2.5405, + "step": 14032 + }, + { + "epoch": 1.1960282962584166, + "grad_norm": 29.26273147428076, + "learning_rate": 7.520440860289617e-06, + "loss": 2.0708, + "step": 14033 + }, + { + "epoch": 1.1961135259524418, + "grad_norm": 72.085733279037, + "learning_rate": 7.520012607833318e-06, + "loss": 3.0427, + "step": 14034 + }, + { + "epoch": 1.1961987556464673, + "grad_norm": 42.66018862555831, + "learning_rate": 7.519584330593697e-06, + "loss": 2.9028, + "step": 14035 + }, + { + "epoch": 1.1962839853404925, + "grad_norm": 34.28048853994437, + "learning_rate": 7.519156028574962e-06, + "loss": 2.7149, + "step": 14036 + }, + { + "epoch": 1.196369215034518, + "grad_norm": 46.492332602834225, + "learning_rate": 7.518727701781332e-06, + "loss": 2.6505, + "step": 14037 + }, + { + "epoch": 1.1964544447285435, + "grad_norm": 96.59293918166557, + "learning_rate": 7.518299350217014e-06, + "loss": 3.4067, + "step": 14038 + }, + { + "epoch": 1.196539674422569, + "grad_norm": 49.912506996667, + "learning_rate": 7.517870973886224e-06, + "loss": 3.0264, + "step": 14039 + }, + { + "epoch": 1.1966249041165942, + "grad_norm": 80.39690382102812, + "learning_rate": 7.517442572793172e-06, + "loss": 3.0687, + "step": 14040 + }, + { + "epoch": 1.1967101338106196, + "grad_norm": 51.30315811643006, + "learning_rate": 7.517014146942074e-06, + "loss": 2.7033, + "step": 14041 + }, + { + "epoch": 1.196795363504645, + "grad_norm": 70.73203152814672, + "learning_rate": 7.516585696337141e-06, + "loss": 3.2588, + "step": 14042 + }, + { + "epoch": 1.1968805931986704, + "grad_norm": 50.40321415284726, + "learning_rate": 7.516157220982589e-06, + "loss": 3.5308, + "step": 14043 + }, + { + "epoch": 1.1969658228926958, + "grad_norm": 50.25787536482024, + "learning_rate": 7.515728720882629e-06, + "loss": 2.2697, + "step": 14044 + }, + { + "epoch": 1.1970510525867213, + "grad_norm": 45.47014114664084, + "learning_rate": 7.515300196041478e-06, + "loss": 3.5847, + "step": 14045 + }, + { + "epoch": 1.1971362822807465, + "grad_norm": 100.8400108706139, + "learning_rate": 7.514871646463348e-06, + "loss": 3.3992, + "step": 14046 + }, + { + "epoch": 1.197221511974772, + "grad_norm": 47.66269271854966, + "learning_rate": 7.514443072152455e-06, + "loss": 2.5107, + "step": 14047 + }, + { + "epoch": 1.1973067416687975, + "grad_norm": 75.0216736864066, + "learning_rate": 7.5140144731130135e-06, + "loss": 2.8329, + "step": 14048 + }, + { + "epoch": 1.1973919713628227, + "grad_norm": 28.826820936549808, + "learning_rate": 7.513585849349239e-06, + "loss": 2.4216, + "step": 14049 + }, + { + "epoch": 1.1974772010568482, + "grad_norm": 84.34778249594271, + "learning_rate": 7.513157200865347e-06, + "loss": 3.3396, + "step": 14050 + }, + { + "epoch": 1.1975624307508737, + "grad_norm": 38.09183309611167, + "learning_rate": 7.512728527665552e-06, + "loss": 2.8219, + "step": 14051 + }, + { + "epoch": 1.1976476604448991, + "grad_norm": 62.173558139675684, + "learning_rate": 7.512299829754069e-06, + "loss": 3.3906, + "step": 14052 + }, + { + "epoch": 1.1977328901389244, + "grad_norm": 43.14764050117336, + "learning_rate": 7.511871107135116e-06, + "loss": 2.357, + "step": 14053 + }, + { + "epoch": 1.1978181198329498, + "grad_norm": 52.901664357217406, + "learning_rate": 7.511442359812911e-06, + "loss": 2.3459, + "step": 14054 + }, + { + "epoch": 1.197903349526975, + "grad_norm": 32.6293423642531, + "learning_rate": 7.511013587791666e-06, + "loss": 2.6483, + "step": 14055 + }, + { + "epoch": 1.1979885792210005, + "grad_norm": 72.22936812869823, + "learning_rate": 7.5105847910756e-06, + "loss": 2.8734, + "step": 14056 + }, + { + "epoch": 1.198073808915026, + "grad_norm": 42.10442834627049, + "learning_rate": 7.510155969668931e-06, + "loss": 3.3238, + "step": 14057 + }, + { + "epoch": 1.1981590386090515, + "grad_norm": 67.25347710436256, + "learning_rate": 7.509727123575876e-06, + "loss": 2.7923, + "step": 14058 + }, + { + "epoch": 1.1982442683030767, + "grad_norm": 34.101310428327736, + "learning_rate": 7.509298252800652e-06, + "loss": 2.7953, + "step": 14059 + }, + { + "epoch": 1.1983294979971022, + "grad_norm": 37.79150610503473, + "learning_rate": 7.508869357347475e-06, + "loss": 3.3513, + "step": 14060 + }, + { + "epoch": 1.1984147276911277, + "grad_norm": 56.77679895972647, + "learning_rate": 7.5084404372205665e-06, + "loss": 3.3361, + "step": 14061 + }, + { + "epoch": 1.198499957385153, + "grad_norm": 145.933406662786, + "learning_rate": 7.508011492424143e-06, + "loss": 3.2697, + "step": 14062 + }, + { + "epoch": 1.1985851870791784, + "grad_norm": 55.327706553641185, + "learning_rate": 7.507582522962422e-06, + "loss": 3.3951, + "step": 14063 + }, + { + "epoch": 1.1986704167732039, + "grad_norm": 69.08118560471424, + "learning_rate": 7.507153528839625e-06, + "loss": 3.9812, + "step": 14064 + }, + { + "epoch": 1.198755646467229, + "grad_norm": 61.41097006897832, + "learning_rate": 7.506724510059968e-06, + "loss": 2.4662, + "step": 14065 + }, + { + "epoch": 1.1988408761612546, + "grad_norm": 37.014974297280546, + "learning_rate": 7.5062954666276725e-06, + "loss": 2.9356, + "step": 14066 + }, + { + "epoch": 1.19892610585528, + "grad_norm": 33.87490183458349, + "learning_rate": 7.5058663985469545e-06, + "loss": 2.8931, + "step": 14067 + }, + { + "epoch": 1.1990113355493053, + "grad_norm": 67.06824564241528, + "learning_rate": 7.505437305822039e-06, + "loss": 2.865, + "step": 14068 + }, + { + "epoch": 1.1990965652433307, + "grad_norm": 33.29304964469964, + "learning_rate": 7.5050081884571435e-06, + "loss": 1.786, + "step": 14069 + }, + { + "epoch": 1.1991817949373562, + "grad_norm": 30.464162313647186, + "learning_rate": 7.504579046456486e-06, + "loss": 2.2869, + "step": 14070 + }, + { + "epoch": 1.1992670246313817, + "grad_norm": 39.562425343109176, + "learning_rate": 7.5041498798242885e-06, + "loss": 3.0922, + "step": 14071 + }, + { + "epoch": 1.199352254325407, + "grad_norm": 40.96253114869511, + "learning_rate": 7.503720688564773e-06, + "loss": 2.0036, + "step": 14072 + }, + { + "epoch": 1.1994374840194324, + "grad_norm": 36.63392578440451, + "learning_rate": 7.503291472682159e-06, + "loss": 2.747, + "step": 14073 + }, + { + "epoch": 1.1995227137134576, + "grad_norm": 90.8997169444045, + "learning_rate": 7.502862232180669e-06, + "loss": 3.002, + "step": 14074 + }, + { + "epoch": 1.1996079434074831, + "grad_norm": 25.605682487943465, + "learning_rate": 7.502432967064522e-06, + "loss": 2.2396, + "step": 14075 + }, + { + "epoch": 1.1996931731015086, + "grad_norm": 33.62821960804165, + "learning_rate": 7.502003677337943e-06, + "loss": 2.4319, + "step": 14076 + }, + { + "epoch": 1.199778402795534, + "grad_norm": 32.37403188086232, + "learning_rate": 7.5015743630051506e-06, + "loss": 3.2327, + "step": 14077 + }, + { + "epoch": 1.1998636324895593, + "grad_norm": 111.3074719178361, + "learning_rate": 7.501145024070369e-06, + "loss": 2.6136, + "step": 14078 + }, + { + "epoch": 1.1999488621835848, + "grad_norm": 55.47068372248559, + "learning_rate": 7.500715660537818e-06, + "loss": 1.8814, + "step": 14079 + }, + { + "epoch": 1.2000340918776102, + "grad_norm": 65.15063239018023, + "learning_rate": 7.500286272411725e-06, + "loss": 3.2444, + "step": 14080 + }, + { + "epoch": 1.2001193215716355, + "grad_norm": 35.87910076349761, + "learning_rate": 7.49985685969631e-06, + "loss": 3.3111, + "step": 14081 + }, + { + "epoch": 1.200204551265661, + "grad_norm": 58.54718991315423, + "learning_rate": 7.499427422395795e-06, + "loss": 3.151, + "step": 14082 + }, + { + "epoch": 1.2002897809596864, + "grad_norm": 56.35110300662195, + "learning_rate": 7.498997960514402e-06, + "loss": 2.5182, + "step": 14083 + }, + { + "epoch": 1.2003750106537117, + "grad_norm": 51.146084767792644, + "learning_rate": 7.498568474056361e-06, + "loss": 3.4433, + "step": 14084 + }, + { + "epoch": 1.2004602403477371, + "grad_norm": 51.91686577775545, + "learning_rate": 7.498138963025891e-06, + "loss": 2.8118, + "step": 14085 + }, + { + "epoch": 1.2005454700417626, + "grad_norm": 46.58339506284948, + "learning_rate": 7.497709427427217e-06, + "loss": 3.3535, + "step": 14086 + }, + { + "epoch": 1.2006306997357878, + "grad_norm": 40.004539971916614, + "learning_rate": 7.497279867264561e-06, + "loss": 2.9666, + "step": 14087 + }, + { + "epoch": 1.2007159294298133, + "grad_norm": 56.31405128414443, + "learning_rate": 7.496850282542152e-06, + "loss": 2.33, + "step": 14088 + }, + { + "epoch": 1.2008011591238388, + "grad_norm": 38.143568930705534, + "learning_rate": 7.496420673264212e-06, + "loss": 3.058, + "step": 14089 + }, + { + "epoch": 1.2008863888178642, + "grad_norm": 39.206685279627614, + "learning_rate": 7.495991039434968e-06, + "loss": 2.7557, + "step": 14090 + }, + { + "epoch": 1.2009716185118895, + "grad_norm": 72.04112949942771, + "learning_rate": 7.495561381058643e-06, + "loss": 3.9381, + "step": 14091 + }, + { + "epoch": 1.201056848205915, + "grad_norm": 28.43542135203921, + "learning_rate": 7.4951316981394615e-06, + "loss": 2.8471, + "step": 14092 + }, + { + "epoch": 1.2011420778999404, + "grad_norm": 49.90334889742033, + "learning_rate": 7.494701990681654e-06, + "loss": 3.6288, + "step": 14093 + }, + { + "epoch": 1.2012273075939657, + "grad_norm": 46.68404783953467, + "learning_rate": 7.494272258689442e-06, + "loss": 4.6727, + "step": 14094 + }, + { + "epoch": 1.2013125372879911, + "grad_norm": 33.23137208028385, + "learning_rate": 7.493842502167055e-06, + "loss": 3.1226, + "step": 14095 + }, + { + "epoch": 1.2013977669820166, + "grad_norm": 41.313088927174924, + "learning_rate": 7.493412721118716e-06, + "loss": 2.438, + "step": 14096 + }, + { + "epoch": 1.2014829966760419, + "grad_norm": 95.3057206919621, + "learning_rate": 7.4929829155486555e-06, + "loss": 3.7985, + "step": 14097 + }, + { + "epoch": 1.2015682263700673, + "grad_norm": 81.34112339926294, + "learning_rate": 7.492553085461098e-06, + "loss": 3.9884, + "step": 14098 + }, + { + "epoch": 1.2016534560640928, + "grad_norm": 61.01046854956029, + "learning_rate": 7.492123230860271e-06, + "loss": 2.6924, + "step": 14099 + }, + { + "epoch": 1.201738685758118, + "grad_norm": 59.101938902145186, + "learning_rate": 7.491693351750402e-06, + "loss": 2.9601, + "step": 14100 + }, + { + "epoch": 1.2018239154521435, + "grad_norm": 52.576546240644284, + "learning_rate": 7.491263448135719e-06, + "loss": 3.6235, + "step": 14101 + }, + { + "epoch": 1.201909145146169, + "grad_norm": 27.92292144260744, + "learning_rate": 7.490833520020451e-06, + "loss": 2.1133, + "step": 14102 + }, + { + "epoch": 1.2019943748401944, + "grad_norm": 49.54830154633657, + "learning_rate": 7.490403567408824e-06, + "loss": 3.1862, + "step": 14103 + }, + { + "epoch": 1.2020796045342197, + "grad_norm": 30.814992439304195, + "learning_rate": 7.489973590305069e-06, + "loss": 1.2638, + "step": 14104 + }, + { + "epoch": 1.2021648342282452, + "grad_norm": 94.55598848544271, + "learning_rate": 7.489543588713412e-06, + "loss": 3.1004, + "step": 14105 + }, + { + "epoch": 1.2022500639222704, + "grad_norm": 70.22477462669781, + "learning_rate": 7.4891135626380825e-06, + "loss": 2.9447, + "step": 14106 + }, + { + "epoch": 1.2023352936162959, + "grad_norm": 81.83593740003226, + "learning_rate": 7.48868351208331e-06, + "loss": 3.2343, + "step": 14107 + }, + { + "epoch": 1.2024205233103213, + "grad_norm": 61.31006780553879, + "learning_rate": 7.488253437053327e-06, + "loss": 3.1062, + "step": 14108 + }, + { + "epoch": 1.2025057530043468, + "grad_norm": 30.43458244210003, + "learning_rate": 7.487823337552358e-06, + "loss": 2.6698, + "step": 14109 + }, + { + "epoch": 1.202590982698372, + "grad_norm": 69.18777166307238, + "learning_rate": 7.4873932135846335e-06, + "loss": 4.0018, + "step": 14110 + }, + { + "epoch": 1.2026762123923975, + "grad_norm": 27.607801450848324, + "learning_rate": 7.486963065154387e-06, + "loss": 2.4129, + "step": 14111 + }, + { + "epoch": 1.202761442086423, + "grad_norm": 40.87164020478128, + "learning_rate": 7.486532892265848e-06, + "loss": 2.6129, + "step": 14112 + }, + { + "epoch": 1.2028466717804482, + "grad_norm": 71.23607725026586, + "learning_rate": 7.486102694923244e-06, + "loss": 3.5532, + "step": 14113 + }, + { + "epoch": 1.2029319014744737, + "grad_norm": 57.7733890522828, + "learning_rate": 7.485672473130809e-06, + "loss": 3.0471, + "step": 14114 + }, + { + "epoch": 1.2030171311684992, + "grad_norm": 41.05701932672623, + "learning_rate": 7.485242226892772e-06, + "loss": 3.9671, + "step": 14115 + }, + { + "epoch": 1.2031023608625244, + "grad_norm": 34.54145298155406, + "learning_rate": 7.484811956213367e-06, + "loss": 3.0326, + "step": 14116 + }, + { + "epoch": 1.20318759055655, + "grad_norm": 28.32158670137648, + "learning_rate": 7.484381661096822e-06, + "loss": 3.4332, + "step": 14117 + }, + { + "epoch": 1.2032728202505754, + "grad_norm": 24.030928802804187, + "learning_rate": 7.483951341547371e-06, + "loss": 1.5877, + "step": 14118 + }, + { + "epoch": 1.2033580499446006, + "grad_norm": 46.056227946306215, + "learning_rate": 7.483520997569245e-06, + "loss": 2.5297, + "step": 14119 + }, + { + "epoch": 1.203443279638626, + "grad_norm": 73.3695802682914, + "learning_rate": 7.4830906291666785e-06, + "loss": 2.169, + "step": 14120 + }, + { + "epoch": 1.2035285093326515, + "grad_norm": 122.39168695211278, + "learning_rate": 7.482660236343903e-06, + "loss": 3.7165, + "step": 14121 + }, + { + "epoch": 1.203613739026677, + "grad_norm": 55.4618934276807, + "learning_rate": 7.482229819105148e-06, + "loss": 2.7016, + "step": 14122 + }, + { + "epoch": 1.2036989687207023, + "grad_norm": 78.83809649879946, + "learning_rate": 7.481799377454651e-06, + "loss": 2.4543, + "step": 14123 + }, + { + "epoch": 1.2037841984147277, + "grad_norm": 23.41728126675534, + "learning_rate": 7.4813689113966414e-06, + "loss": 2.0434, + "step": 14124 + }, + { + "epoch": 1.203869428108753, + "grad_norm": 52.23394492772566, + "learning_rate": 7.480938420935356e-06, + "loss": 3.255, + "step": 14125 + }, + { + "epoch": 1.2039546578027784, + "grad_norm": 59.87177073367255, + "learning_rate": 7.480507906075025e-06, + "loss": 4.3712, + "step": 14126 + }, + { + "epoch": 1.204039887496804, + "grad_norm": 68.36926590942768, + "learning_rate": 7.480077366819888e-06, + "loss": 2.4215, + "step": 14127 + }, + { + "epoch": 1.2041251171908294, + "grad_norm": 60.89224974876048, + "learning_rate": 7.479646803174173e-06, + "loss": 2.9985, + "step": 14128 + }, + { + "epoch": 1.2042103468848546, + "grad_norm": 64.2038754795527, + "learning_rate": 7.479216215142118e-06, + "loss": 3.047, + "step": 14129 + }, + { + "epoch": 1.20429557657888, + "grad_norm": 35.83612294831457, + "learning_rate": 7.478785602727956e-06, + "loss": 2.9485, + "step": 14130 + }, + { + "epoch": 1.2043808062729056, + "grad_norm": 38.11501651195281, + "learning_rate": 7.478354965935923e-06, + "loss": 3.271, + "step": 14131 + }, + { + "epoch": 1.2044660359669308, + "grad_norm": 35.053783498560335, + "learning_rate": 7.477924304770253e-06, + "loss": 2.3945, + "step": 14132 + }, + { + "epoch": 1.2045512656609563, + "grad_norm": 55.7116779121083, + "learning_rate": 7.477493619235184e-06, + "loss": 3.3204, + "step": 14133 + }, + { + "epoch": 1.2046364953549817, + "grad_norm": 25.366301068531286, + "learning_rate": 7.4770629093349465e-06, + "loss": 1.8903, + "step": 14134 + }, + { + "epoch": 1.204721725049007, + "grad_norm": 33.670977338931145, + "learning_rate": 7.4766321750737826e-06, + "loss": 2.8943, + "step": 14135 + }, + { + "epoch": 1.2048069547430325, + "grad_norm": 94.2358275985315, + "learning_rate": 7.4762014164559245e-06, + "loss": 3.3939, + "step": 14136 + }, + { + "epoch": 1.204892184437058, + "grad_norm": 76.36514463565634, + "learning_rate": 7.475770633485609e-06, + "loss": 3.5212, + "step": 14137 + }, + { + "epoch": 1.2049774141310832, + "grad_norm": 33.08846267946201, + "learning_rate": 7.475339826167074e-06, + "loss": 3.3742, + "step": 14138 + }, + { + "epoch": 1.2050626438251086, + "grad_norm": 53.24696207871396, + "learning_rate": 7.474908994504554e-06, + "loss": 2.3881, + "step": 14139 + }, + { + "epoch": 1.205147873519134, + "grad_norm": 48.77190091313094, + "learning_rate": 7.474478138502288e-06, + "loss": 1.9617, + "step": 14140 + }, + { + "epoch": 1.2052331032131596, + "grad_norm": 44.99760850228128, + "learning_rate": 7.474047258164513e-06, + "loss": 2.6534, + "step": 14141 + }, + { + "epoch": 1.2053183329071848, + "grad_norm": 54.65053860584325, + "learning_rate": 7.473616353495466e-06, + "loss": 1.8847, + "step": 14142 + }, + { + "epoch": 1.2054035626012103, + "grad_norm": 74.80139413665337, + "learning_rate": 7.473185424499385e-06, + "loss": 3.7963, + "step": 14143 + }, + { + "epoch": 1.2054887922952358, + "grad_norm": 57.38881817153751, + "learning_rate": 7.47275447118051e-06, + "loss": 2.1092, + "step": 14144 + }, + { + "epoch": 1.205574021989261, + "grad_norm": 41.80378701733189, + "learning_rate": 7.472323493543074e-06, + "loss": 3.23, + "step": 14145 + }, + { + "epoch": 1.2056592516832865, + "grad_norm": 60.10849005024671, + "learning_rate": 7.471892491591322e-06, + "loss": 2.9564, + "step": 14146 + }, + { + "epoch": 1.205744481377312, + "grad_norm": 67.51538556565961, + "learning_rate": 7.471461465329487e-06, + "loss": 3.2621, + "step": 14147 + }, + { + "epoch": 1.2058297110713372, + "grad_norm": 91.55891556063379, + "learning_rate": 7.471030414761813e-06, + "loss": 3.2195, + "step": 14148 + }, + { + "epoch": 1.2059149407653627, + "grad_norm": 115.70498604805871, + "learning_rate": 7.470599339892535e-06, + "loss": 2.6471, + "step": 14149 + }, + { + "epoch": 1.2060001704593881, + "grad_norm": 67.59264102270069, + "learning_rate": 7.470168240725895e-06, + "loss": 2.9002, + "step": 14150 + }, + { + "epoch": 1.2060854001534134, + "grad_norm": 42.536310877473454, + "learning_rate": 7.4697371172661315e-06, + "loss": 3.5842, + "step": 14151 + }, + { + "epoch": 1.2061706298474388, + "grad_norm": 229.58554561741028, + "learning_rate": 7.469305969517484e-06, + "loss": 2.6254, + "step": 14152 + }, + { + "epoch": 1.2062558595414643, + "grad_norm": 80.89614774871379, + "learning_rate": 7.468874797484194e-06, + "loss": 3.8456, + "step": 14153 + }, + { + "epoch": 1.2063410892354895, + "grad_norm": 50.92270354794413, + "learning_rate": 7.468443601170503e-06, + "loss": 2.4744, + "step": 14154 + }, + { + "epoch": 1.206426318929515, + "grad_norm": 61.77948313968789, + "learning_rate": 7.4680123805806495e-06, + "loss": 2.785, + "step": 14155 + }, + { + "epoch": 1.2065115486235405, + "grad_norm": 66.77732393717545, + "learning_rate": 7.467581135718874e-06, + "loss": 2.6058, + "step": 14156 + }, + { + "epoch": 1.2065967783175657, + "grad_norm": 32.53483313776938, + "learning_rate": 7.467149866589418e-06, + "loss": 2.7445, + "step": 14157 + }, + { + "epoch": 1.2066820080115912, + "grad_norm": 44.25890144930868, + "learning_rate": 7.466718573196524e-06, + "loss": 2.3544, + "step": 14158 + }, + { + "epoch": 1.2067672377056167, + "grad_norm": 30.94052328077017, + "learning_rate": 7.466287255544435e-06, + "loss": 2.7071, + "step": 14159 + }, + { + "epoch": 1.2068524673996421, + "grad_norm": 47.43658525164046, + "learning_rate": 7.465855913637388e-06, + "loss": 3.4956, + "step": 14160 + }, + { + "epoch": 1.2069376970936674, + "grad_norm": 57.98582013406426, + "learning_rate": 7.46542454747963e-06, + "loss": 3.7676, + "step": 14161 + }, + { + "epoch": 1.2070229267876929, + "grad_norm": 78.72424854838803, + "learning_rate": 7.464993157075399e-06, + "loss": 2.4438, + "step": 14162 + }, + { + "epoch": 1.2071081564817183, + "grad_norm": 46.53711862073889, + "learning_rate": 7.464561742428942e-06, + "loss": 2.6792, + "step": 14163 + }, + { + "epoch": 1.2071933861757436, + "grad_norm": 39.576094953030385, + "learning_rate": 7.464130303544499e-06, + "loss": 3.1721, + "step": 14164 + }, + { + "epoch": 1.207278615869769, + "grad_norm": 69.09844945974882, + "learning_rate": 7.463698840426312e-06, + "loss": 3.8138, + "step": 14165 + }, + { + "epoch": 1.2073638455637945, + "grad_norm": 38.174933343940495, + "learning_rate": 7.463267353078628e-06, + "loss": 2.8366, + "step": 14166 + }, + { + "epoch": 1.2074490752578197, + "grad_norm": 53.383360841871365, + "learning_rate": 7.462835841505687e-06, + "loss": 2.6826, + "step": 14167 + }, + { + "epoch": 1.2075343049518452, + "grad_norm": 34.03544023949841, + "learning_rate": 7.462404305711735e-06, + "loss": 2.3828, + "step": 14168 + }, + { + "epoch": 1.2076195346458707, + "grad_norm": 88.59479228044505, + "learning_rate": 7.461972745701014e-06, + "loss": 3.6865, + "step": 14169 + }, + { + "epoch": 1.207704764339896, + "grad_norm": 73.95700731999769, + "learning_rate": 7.461541161477771e-06, + "loss": 3.4537, + "step": 14170 + }, + { + "epoch": 1.2077899940339214, + "grad_norm": 65.03286763374561, + "learning_rate": 7.461109553046247e-06, + "loss": 3.5919, + "step": 14171 + }, + { + "epoch": 1.2078752237279469, + "grad_norm": 58.600418801982215, + "learning_rate": 7.460677920410691e-06, + "loss": 3.5837, + "step": 14172 + }, + { + "epoch": 1.2079604534219723, + "grad_norm": 66.25534850519585, + "learning_rate": 7.460246263575343e-06, + "loss": 3.8899, + "step": 14173 + }, + { + "epoch": 1.2080456831159976, + "grad_norm": 39.00940423793262, + "learning_rate": 7.4598145825444515e-06, + "loss": 2.4599, + "step": 14174 + }, + { + "epoch": 1.208130912810023, + "grad_norm": 59.283443318433314, + "learning_rate": 7.459382877322261e-06, + "loss": 3.1847, + "step": 14175 + }, + { + "epoch": 1.2082161425040483, + "grad_norm": 53.0887510873414, + "learning_rate": 7.4589511479130185e-06, + "loss": 2.7965, + "step": 14176 + }, + { + "epoch": 1.2083013721980738, + "grad_norm": 51.26542577775911, + "learning_rate": 7.458519394320966e-06, + "loss": 3.5583, + "step": 14177 + }, + { + "epoch": 1.2083866018920992, + "grad_norm": 31.64793923324556, + "learning_rate": 7.4580876165503535e-06, + "loss": 2.6843, + "step": 14178 + }, + { + "epoch": 1.2084718315861247, + "grad_norm": 41.005659535651304, + "learning_rate": 7.457655814605427e-06, + "loss": 2.9773, + "step": 14179 + }, + { + "epoch": 1.20855706128015, + "grad_norm": 47.35896790243581, + "learning_rate": 7.457223988490429e-06, + "loss": 2.7769, + "step": 14180 + }, + { + "epoch": 1.2086422909741754, + "grad_norm": 71.98985783837976, + "learning_rate": 7.456792138209612e-06, + "loss": 3.9288, + "step": 14181 + }, + { + "epoch": 1.2087275206682009, + "grad_norm": 103.92731809047112, + "learning_rate": 7.45636026376722e-06, + "loss": 3.4698, + "step": 14182 + }, + { + "epoch": 1.2088127503622261, + "grad_norm": 43.89132170413734, + "learning_rate": 7.455928365167502e-06, + "loss": 2.371, + "step": 14183 + }, + { + "epoch": 1.2088979800562516, + "grad_norm": 86.89120492640521, + "learning_rate": 7.455496442414703e-06, + "loss": 4.2257, + "step": 14184 + }, + { + "epoch": 1.208983209750277, + "grad_norm": 42.78673576478043, + "learning_rate": 7.455064495513072e-06, + "loss": 3.4949, + "step": 14185 + }, + { + "epoch": 1.2090684394443023, + "grad_norm": 34.2802509215112, + "learning_rate": 7.454632524466858e-06, + "loss": 3.0088, + "step": 14186 + }, + { + "epoch": 1.2091536691383278, + "grad_norm": 35.71182016901893, + "learning_rate": 7.454200529280309e-06, + "loss": 2.9786, + "step": 14187 + }, + { + "epoch": 1.2092388988323532, + "grad_norm": 59.47644484690275, + "learning_rate": 7.453768509957672e-06, + "loss": 3.5949, + "step": 14188 + }, + { + "epoch": 1.2093241285263785, + "grad_norm": 41.47095586355421, + "learning_rate": 7.453336466503196e-06, + "loss": 2.7707, + "step": 14189 + }, + { + "epoch": 1.209409358220404, + "grad_norm": 35.03835569874981, + "learning_rate": 7.4529043989211326e-06, + "loss": 2.9031, + "step": 14190 + }, + { + "epoch": 1.2094945879144294, + "grad_norm": 29.49922612196373, + "learning_rate": 7.452472307215728e-06, + "loss": 2.8537, + "step": 14191 + }, + { + "epoch": 1.209579817608455, + "grad_norm": 31.525840115116416, + "learning_rate": 7.452040191391233e-06, + "loss": 3.2975, + "step": 14192 + }, + { + "epoch": 1.2096650473024801, + "grad_norm": 84.41674644662714, + "learning_rate": 7.451608051451898e-06, + "loss": 2.7737, + "step": 14193 + }, + { + "epoch": 1.2097502769965056, + "grad_norm": 37.15798523097592, + "learning_rate": 7.4511758874019716e-06, + "loss": 2.7741, + "step": 14194 + }, + { + "epoch": 1.2098355066905309, + "grad_norm": 35.35947755681723, + "learning_rate": 7.450743699245703e-06, + "loss": 3.1561, + "step": 14195 + }, + { + "epoch": 1.2099207363845563, + "grad_norm": 38.88974191997474, + "learning_rate": 7.450311486987345e-06, + "loss": 3.9744, + "step": 14196 + }, + { + "epoch": 1.2100059660785818, + "grad_norm": 49.92023220652216, + "learning_rate": 7.449879250631147e-06, + "loss": 3.3156, + "step": 14197 + }, + { + "epoch": 1.2100911957726073, + "grad_norm": 31.00122716831125, + "learning_rate": 7.4494469901813605e-06, + "loss": 2.6978, + "step": 14198 + }, + { + "epoch": 1.2101764254666325, + "grad_norm": 38.430370443156434, + "learning_rate": 7.449014705642234e-06, + "loss": 2.3883, + "step": 14199 + }, + { + "epoch": 1.210261655160658, + "grad_norm": 32.30332429145797, + "learning_rate": 7.448582397018024e-06, + "loss": 2.5801, + "step": 14200 + }, + { + "epoch": 1.2103468848546834, + "grad_norm": 32.266027803658744, + "learning_rate": 7.448150064312976e-06, + "loss": 2.8079, + "step": 14201 + }, + { + "epoch": 1.2104321145487087, + "grad_norm": 64.95362129796321, + "learning_rate": 7.447717707531348e-06, + "loss": 3.6774, + "step": 14202 + }, + { + "epoch": 1.2105173442427342, + "grad_norm": 33.26116544995869, + "learning_rate": 7.4472853266773885e-06, + "loss": 3.4686, + "step": 14203 + }, + { + "epoch": 1.2106025739367596, + "grad_norm": 50.38167801114224, + "learning_rate": 7.446852921755349e-06, + "loss": 2.867, + "step": 14204 + }, + { + "epoch": 1.2106878036307849, + "grad_norm": 80.7229048402656, + "learning_rate": 7.446420492769483e-06, + "loss": 3.4407, + "step": 14205 + }, + { + "epoch": 1.2107730333248103, + "grad_norm": 68.53392550396484, + "learning_rate": 7.445988039724046e-06, + "loss": 3.4315, + "step": 14206 + }, + { + "epoch": 1.2108582630188358, + "grad_norm": 49.467164781769604, + "learning_rate": 7.445555562623286e-06, + "loss": 2.5171, + "step": 14207 + }, + { + "epoch": 1.210943492712861, + "grad_norm": 82.86803068724079, + "learning_rate": 7.44512306147146e-06, + "loss": 3.4132, + "step": 14208 + }, + { + "epoch": 1.2110287224068865, + "grad_norm": 43.536517494482894, + "learning_rate": 7.4446905362728205e-06, + "loss": 2.8233, + "step": 14209 + }, + { + "epoch": 1.211113952100912, + "grad_norm": 48.99743108174959, + "learning_rate": 7.44425798703162e-06, + "loss": 3.8136, + "step": 14210 + }, + { + "epoch": 1.2111991817949375, + "grad_norm": 38.751570257158605, + "learning_rate": 7.443825413752115e-06, + "loss": 3.8714, + "step": 14211 + }, + { + "epoch": 1.2112844114889627, + "grad_norm": 39.0899599512274, + "learning_rate": 7.443392816438555e-06, + "loss": 2.8388, + "step": 14212 + }, + { + "epoch": 1.2113696411829882, + "grad_norm": 40.46816649728874, + "learning_rate": 7.4429601950952e-06, + "loss": 3.2016, + "step": 14213 + }, + { + "epoch": 1.2114548708770136, + "grad_norm": 63.347424937331894, + "learning_rate": 7.442527549726301e-06, + "loss": 2.5635, + "step": 14214 + }, + { + "epoch": 1.211540100571039, + "grad_norm": 42.582845531056634, + "learning_rate": 7.442094880336117e-06, + "loss": 3.3291, + "step": 14215 + }, + { + "epoch": 1.2116253302650644, + "grad_norm": 34.53974263647766, + "learning_rate": 7.441662186928897e-06, + "loss": 3.117, + "step": 14216 + }, + { + "epoch": 1.2117105599590898, + "grad_norm": 66.77050119216986, + "learning_rate": 7.441229469508902e-06, + "loss": 3.7103, + "step": 14217 + }, + { + "epoch": 1.211795789653115, + "grad_norm": 56.14761856070844, + "learning_rate": 7.440796728080384e-06, + "loss": 2.6618, + "step": 14218 + }, + { + "epoch": 1.2118810193471405, + "grad_norm": 67.90605029241506, + "learning_rate": 7.4403639626476e-06, + "loss": 2.7775, + "step": 14219 + }, + { + "epoch": 1.211966249041166, + "grad_norm": 61.99535487893303, + "learning_rate": 7.439931173214805e-06, + "loss": 3.1192, + "step": 14220 + }, + { + "epoch": 1.2120514787351913, + "grad_norm": 37.45976147761642, + "learning_rate": 7.439498359786259e-06, + "loss": 2.6623, + "step": 14221 + }, + { + "epoch": 1.2121367084292167, + "grad_norm": 40.988685470113474, + "learning_rate": 7.439065522366214e-06, + "loss": 3.0473, + "step": 14222 + }, + { + "epoch": 1.2122219381232422, + "grad_norm": 45.65440160956837, + "learning_rate": 7.43863266095893e-06, + "loss": 3.3722, + "step": 14223 + }, + { + "epoch": 1.2123071678172674, + "grad_norm": 107.84011755294199, + "learning_rate": 7.438199775568662e-06, + "loss": 2.934, + "step": 14224 + }, + { + "epoch": 1.212392397511293, + "grad_norm": 84.7452248646234, + "learning_rate": 7.437766866199669e-06, + "loss": 3.8577, + "step": 14225 + }, + { + "epoch": 1.2124776272053184, + "grad_norm": 78.32955107315287, + "learning_rate": 7.437333932856207e-06, + "loss": 4.2559, + "step": 14226 + }, + { + "epoch": 1.2125628568993436, + "grad_norm": 70.60548676834073, + "learning_rate": 7.436900975542534e-06, + "loss": 3.3655, + "step": 14227 + }, + { + "epoch": 1.212648086593369, + "grad_norm": 84.69576094085738, + "learning_rate": 7.4364679942629085e-06, + "loss": 3.8095, + "step": 14228 + }, + { + "epoch": 1.2127333162873946, + "grad_norm": 54.253859290849704, + "learning_rate": 7.436034989021588e-06, + "loss": 2.517, + "step": 14229 + }, + { + "epoch": 1.21281854598142, + "grad_norm": 24.41532802181114, + "learning_rate": 7.435601959822831e-06, + "loss": 2.0114, + "step": 14230 + }, + { + "epoch": 1.2129037756754453, + "grad_norm": 73.08957511389819, + "learning_rate": 7.435168906670898e-06, + "loss": 2.9242, + "step": 14231 + }, + { + "epoch": 1.2129890053694707, + "grad_norm": 60.693680432074764, + "learning_rate": 7.434735829570046e-06, + "loss": 3.7322, + "step": 14232 + }, + { + "epoch": 1.2130742350634962, + "grad_norm": 25.49366951084671, + "learning_rate": 7.434302728524533e-06, + "loss": 2.2687, + "step": 14233 + }, + { + "epoch": 1.2131594647575215, + "grad_norm": 38.23602295787501, + "learning_rate": 7.4338696035386215e-06, + "loss": 3.132, + "step": 14234 + }, + { + "epoch": 1.213244694451547, + "grad_norm": 41.10033037156581, + "learning_rate": 7.433436454616569e-06, + "loss": 2.9928, + "step": 14235 + }, + { + "epoch": 1.2133299241455724, + "grad_norm": 122.42517389634678, + "learning_rate": 7.4330032817626365e-06, + "loss": 4.9712, + "step": 14236 + }, + { + "epoch": 1.2134151538395976, + "grad_norm": 40.26581315573088, + "learning_rate": 7.432570084981084e-06, + "loss": 2.6592, + "step": 14237 + }, + { + "epoch": 1.213500383533623, + "grad_norm": 39.50419062287909, + "learning_rate": 7.4321368642761694e-06, + "loss": 2.8323, + "step": 14238 + }, + { + "epoch": 1.2135856132276486, + "grad_norm": 37.24257780842302, + "learning_rate": 7.431703619652156e-06, + "loss": 3.1944, + "step": 14239 + }, + { + "epoch": 1.2136708429216738, + "grad_norm": 39.581897212542295, + "learning_rate": 7.431270351113304e-06, + "loss": 2.8211, + "step": 14240 + }, + { + "epoch": 1.2137560726156993, + "grad_norm": 29.227681123073324, + "learning_rate": 7.430837058663875e-06, + "loss": 2.3999, + "step": 14241 + }, + { + "epoch": 1.2138413023097248, + "grad_norm": 125.59658211651897, + "learning_rate": 7.430403742308129e-06, + "loss": 4.1824, + "step": 14242 + }, + { + "epoch": 1.2139265320037502, + "grad_norm": 62.73650288327322, + "learning_rate": 7.429970402050327e-06, + "loss": 2.2424, + "step": 14243 + }, + { + "epoch": 1.2140117616977755, + "grad_norm": 59.94693127523905, + "learning_rate": 7.429537037894732e-06, + "loss": 4.2545, + "step": 14244 + }, + { + "epoch": 1.214096991391801, + "grad_norm": 40.447098096876886, + "learning_rate": 7.429103649845607e-06, + "loss": 3.0302, + "step": 14245 + }, + { + "epoch": 1.2141822210858262, + "grad_norm": 23.60145441342124, + "learning_rate": 7.428670237907212e-06, + "loss": 1.5511, + "step": 14246 + }, + { + "epoch": 1.2142674507798517, + "grad_norm": 23.549954383486806, + "learning_rate": 7.4282368020838094e-06, + "loss": 2.7117, + "step": 14247 + }, + { + "epoch": 1.2143526804738771, + "grad_norm": 43.560928868430445, + "learning_rate": 7.427803342379662e-06, + "loss": 2.4396, + "step": 14248 + }, + { + "epoch": 1.2144379101679026, + "grad_norm": 45.75464524560748, + "learning_rate": 7.427369858799036e-06, + "loss": 2.8306, + "step": 14249 + }, + { + "epoch": 1.2145231398619278, + "grad_norm": 44.300982468240136, + "learning_rate": 7.426936351346191e-06, + "loss": 2.8314, + "step": 14250 + }, + { + "epoch": 1.2146083695559533, + "grad_norm": 28.392950791356963, + "learning_rate": 7.426502820025391e-06, + "loss": 2.0282, + "step": 14251 + }, + { + "epoch": 1.2146935992499788, + "grad_norm": 56.61400495833487, + "learning_rate": 7.426069264840901e-06, + "loss": 2.8184, + "step": 14252 + }, + { + "epoch": 1.214778828944004, + "grad_norm": 23.809107449207286, + "learning_rate": 7.425635685796982e-06, + "loss": 2.4727, + "step": 14253 + }, + { + "epoch": 1.2148640586380295, + "grad_norm": 33.92874962753122, + "learning_rate": 7.425202082897902e-06, + "loss": 2.9954, + "step": 14254 + }, + { + "epoch": 1.214949288332055, + "grad_norm": 74.70902628334862, + "learning_rate": 7.42476845614792e-06, + "loss": 2.351, + "step": 14255 + }, + { + "epoch": 1.2150345180260802, + "grad_norm": 48.328276901292526, + "learning_rate": 7.424334805551308e-06, + "loss": 2.3402, + "step": 14256 + }, + { + "epoch": 1.2151197477201057, + "grad_norm": 58.32606040259686, + "learning_rate": 7.4239011311123235e-06, + "loss": 2.2846, + "step": 14257 + }, + { + "epoch": 1.2152049774141311, + "grad_norm": 41.833904872765935, + "learning_rate": 7.423467432835236e-06, + "loss": 4.3517, + "step": 14258 + }, + { + "epoch": 1.2152902071081564, + "grad_norm": 44.06223508876245, + "learning_rate": 7.423033710724308e-06, + "loss": 3.4282, + "step": 14259 + }, + { + "epoch": 1.2153754368021819, + "grad_norm": 48.913074521527136, + "learning_rate": 7.422599964783806e-06, + "loss": 1.8535, + "step": 14260 + }, + { + "epoch": 1.2154606664962073, + "grad_norm": 53.9108556336056, + "learning_rate": 7.422166195017997e-06, + "loss": 2.4357, + "step": 14261 + }, + { + "epoch": 1.2155458961902328, + "grad_norm": 70.77695156251156, + "learning_rate": 7.421732401431146e-06, + "loss": 3.5923, + "step": 14262 + }, + { + "epoch": 1.215631125884258, + "grad_norm": 52.25292831277289, + "learning_rate": 7.421298584027519e-06, + "loss": 3.6043, + "step": 14263 + }, + { + "epoch": 1.2157163555782835, + "grad_norm": 26.90561521216034, + "learning_rate": 7.420864742811382e-06, + "loss": 2.203, + "step": 14264 + }, + { + "epoch": 1.2158015852723087, + "grad_norm": 34.4623763993045, + "learning_rate": 7.420430877787004e-06, + "loss": 3.4027, + "step": 14265 + }, + { + "epoch": 1.2158868149663342, + "grad_norm": 53.60788647424527, + "learning_rate": 7.419996988958649e-06, + "loss": 2.9435, + "step": 14266 + }, + { + "epoch": 1.2159720446603597, + "grad_norm": 32.23326329903907, + "learning_rate": 7.419563076330585e-06, + "loss": 2.7711, + "step": 14267 + }, + { + "epoch": 1.2160572743543852, + "grad_norm": 115.43065746779892, + "learning_rate": 7.419129139907079e-06, + "loss": 2.639, + "step": 14268 + }, + { + "epoch": 1.2161425040484104, + "grad_norm": 42.37494034868825, + "learning_rate": 7.418695179692401e-06, + "loss": 3.1073, + "step": 14269 + }, + { + "epoch": 1.2162277337424359, + "grad_norm": 58.05076064872566, + "learning_rate": 7.418261195690814e-06, + "loss": 2.7114, + "step": 14270 + }, + { + "epoch": 1.2163129634364613, + "grad_norm": 36.33292498959683, + "learning_rate": 7.417827187906592e-06, + "loss": 1.7001, + "step": 14271 + }, + { + "epoch": 1.2163981931304866, + "grad_norm": 39.97115652533218, + "learning_rate": 7.417393156343998e-06, + "loss": 3.8044, + "step": 14272 + }, + { + "epoch": 1.216483422824512, + "grad_norm": 50.241352691857564, + "learning_rate": 7.416959101007306e-06, + "loss": 3.1735, + "step": 14273 + }, + { + "epoch": 1.2165686525185375, + "grad_norm": 58.69564764680208, + "learning_rate": 7.416525021900779e-06, + "loss": 3.4823, + "step": 14274 + }, + { + "epoch": 1.2166538822125628, + "grad_norm": 53.11782251454333, + "learning_rate": 7.416090919028689e-06, + "loss": 3.9995, + "step": 14275 + }, + { + "epoch": 1.2167391119065882, + "grad_norm": 67.71001823124487, + "learning_rate": 7.415656792395305e-06, + "loss": 2.7146, + "step": 14276 + }, + { + "epoch": 1.2168243416006137, + "grad_norm": 83.15439865802746, + "learning_rate": 7.415222642004898e-06, + "loss": 3.9313, + "step": 14277 + }, + { + "epoch": 1.216909571294639, + "grad_norm": 30.040316697903503, + "learning_rate": 7.4147884678617335e-06, + "loss": 2.2051, + "step": 14278 + }, + { + "epoch": 1.2169948009886644, + "grad_norm": 35.095816442328434, + "learning_rate": 7.414354269970085e-06, + "loss": 2.7098, + "step": 14279 + }, + { + "epoch": 1.2170800306826899, + "grad_norm": 56.975731660273674, + "learning_rate": 7.4139200483342224e-06, + "loss": 3.3968, + "step": 14280 + }, + { + "epoch": 1.2171652603767154, + "grad_norm": 51.68062451160179, + "learning_rate": 7.413485802958415e-06, + "loss": 2.2561, + "step": 14281 + }, + { + "epoch": 1.2172504900707406, + "grad_norm": 26.135633158822387, + "learning_rate": 7.4130515338469335e-06, + "loss": 2.4739, + "step": 14282 + }, + { + "epoch": 1.217335719764766, + "grad_norm": 34.15813186560128, + "learning_rate": 7.412617241004047e-06, + "loss": 3.3111, + "step": 14283 + }, + { + "epoch": 1.2174209494587915, + "grad_norm": 38.43527692097126, + "learning_rate": 7.412182924434033e-06, + "loss": 2.6121, + "step": 14284 + }, + { + "epoch": 1.2175061791528168, + "grad_norm": 56.622437678897434, + "learning_rate": 7.411748584141155e-06, + "loss": 2.7995, + "step": 14285 + }, + { + "epoch": 1.2175914088468422, + "grad_norm": 40.23389827381485, + "learning_rate": 7.411314220129688e-06, + "loss": 2.9157, + "step": 14286 + }, + { + "epoch": 1.2176766385408677, + "grad_norm": 39.918133606421215, + "learning_rate": 7.410879832403905e-06, + "loss": 2.5582, + "step": 14287 + }, + { + "epoch": 1.217761868234893, + "grad_norm": 89.60603230518444, + "learning_rate": 7.410445420968078e-06, + "loss": 4.0089, + "step": 14288 + }, + { + "epoch": 1.2178470979289184, + "grad_norm": 52.9940452400836, + "learning_rate": 7.410010985826476e-06, + "loss": 2.6167, + "step": 14289 + }, + { + "epoch": 1.217932327622944, + "grad_norm": 86.10299539795261, + "learning_rate": 7.409576526983374e-06, + "loss": 4.7904, + "step": 14290 + }, + { + "epoch": 1.2180175573169691, + "grad_norm": 35.08345419362075, + "learning_rate": 7.409142044443045e-06, + "loss": 2.8428, + "step": 14291 + }, + { + "epoch": 1.2181027870109946, + "grad_norm": 40.794699531173706, + "learning_rate": 7.408707538209761e-06, + "loss": 2.6578, + "step": 14292 + }, + { + "epoch": 1.21818801670502, + "grad_norm": 24.591039165785798, + "learning_rate": 7.408273008287795e-06, + "loss": 2.1823, + "step": 14293 + }, + { + "epoch": 1.2182732463990456, + "grad_norm": 41.44034743540026, + "learning_rate": 7.407838454681422e-06, + "loss": 3.068, + "step": 14294 + }, + { + "epoch": 1.2183584760930708, + "grad_norm": 32.78344651744043, + "learning_rate": 7.407403877394913e-06, + "loss": 3.1121, + "step": 14295 + }, + { + "epoch": 1.2184437057870963, + "grad_norm": 97.81920913986042, + "learning_rate": 7.406969276432545e-06, + "loss": 3.971, + "step": 14296 + }, + { + "epoch": 1.2185289354811215, + "grad_norm": 47.0753355206568, + "learning_rate": 7.406534651798592e-06, + "loss": 3.9407, + "step": 14297 + }, + { + "epoch": 1.218614165175147, + "grad_norm": 72.33908259246104, + "learning_rate": 7.406100003497323e-06, + "loss": 2.9599, + "step": 14298 + }, + { + "epoch": 1.2186993948691724, + "grad_norm": 79.14398300791208, + "learning_rate": 7.405665331533018e-06, + "loss": 3.6905, + "step": 14299 + }, + { + "epoch": 1.218784624563198, + "grad_norm": 32.69424485386245, + "learning_rate": 7.405230635909952e-06, + "loss": 2.4044, + "step": 14300 + }, + { + "epoch": 1.2188698542572232, + "grad_norm": 31.660424639325484, + "learning_rate": 7.404795916632398e-06, + "loss": 2.8878, + "step": 14301 + }, + { + "epoch": 1.2189550839512486, + "grad_norm": 53.2881815282553, + "learning_rate": 7.404361173704629e-06, + "loss": 3.3729, + "step": 14302 + }, + { + "epoch": 1.219040313645274, + "grad_norm": 45.44645607066529, + "learning_rate": 7.403926407130927e-06, + "loss": 3.3657, + "step": 14303 + }, + { + "epoch": 1.2191255433392993, + "grad_norm": 32.59330081296193, + "learning_rate": 7.403491616915561e-06, + "loss": 3.0397, + "step": 14304 + }, + { + "epoch": 1.2192107730333248, + "grad_norm": 90.3053206967761, + "learning_rate": 7.403056803062812e-06, + "loss": 3.5027, + "step": 14305 + }, + { + "epoch": 1.2192960027273503, + "grad_norm": 81.22790312827628, + "learning_rate": 7.402621965576952e-06, + "loss": 3.837, + "step": 14306 + }, + { + "epoch": 1.2193812324213755, + "grad_norm": 67.54583118512562, + "learning_rate": 7.402187104462263e-06, + "loss": 3.9541, + "step": 14307 + }, + { + "epoch": 1.219466462115401, + "grad_norm": 67.47114848914673, + "learning_rate": 7.401752219723016e-06, + "loss": 1.7938, + "step": 14308 + }, + { + "epoch": 1.2195516918094265, + "grad_norm": 36.115695649052796, + "learning_rate": 7.401317311363492e-06, + "loss": 2.6343, + "step": 14309 + }, + { + "epoch": 1.2196369215034517, + "grad_norm": 42.74527130528061, + "learning_rate": 7.400882379387966e-06, + "loss": 2.9384, + "step": 14310 + }, + { + "epoch": 1.2197221511974772, + "grad_norm": 51.494680532510024, + "learning_rate": 7.400447423800714e-06, + "loss": 1.856, + "step": 14311 + }, + { + "epoch": 1.2198073808915026, + "grad_norm": 45.31103946759481, + "learning_rate": 7.4000124446060175e-06, + "loss": 2.6451, + "step": 14312 + }, + { + "epoch": 1.2198926105855281, + "grad_norm": 29.929386291694716, + "learning_rate": 7.399577441808151e-06, + "loss": 2.3017, + "step": 14313 + }, + { + "epoch": 1.2199778402795534, + "grad_norm": 78.49035934244992, + "learning_rate": 7.399142415411395e-06, + "loss": 3.8502, + "step": 14314 + }, + { + "epoch": 1.2200630699735788, + "grad_norm": 66.33473558666633, + "learning_rate": 7.3987073654200245e-06, + "loss": 2.1958, + "step": 14315 + }, + { + "epoch": 1.220148299667604, + "grad_norm": 104.66856271066823, + "learning_rate": 7.398272291838322e-06, + "loss": 3.3867, + "step": 14316 + }, + { + "epoch": 1.2202335293616295, + "grad_norm": 50.63847443135156, + "learning_rate": 7.397837194670564e-06, + "loss": 3.2072, + "step": 14317 + }, + { + "epoch": 1.220318759055655, + "grad_norm": 49.95277370004197, + "learning_rate": 7.3974020739210305e-06, + "loss": 3.1624, + "step": 14318 + }, + { + "epoch": 1.2204039887496805, + "grad_norm": 38.69294375568017, + "learning_rate": 7.396966929593999e-06, + "loss": 3.5709, + "step": 14319 + }, + { + "epoch": 1.2204892184437057, + "grad_norm": 39.26498758321827, + "learning_rate": 7.396531761693753e-06, + "loss": 2.9794, + "step": 14320 + }, + { + "epoch": 1.2205744481377312, + "grad_norm": 119.68382939517032, + "learning_rate": 7.3960965702245665e-06, + "loss": 3.246, + "step": 14321 + }, + { + "epoch": 1.2206596778317567, + "grad_norm": 63.72848330122196, + "learning_rate": 7.395661355190723e-06, + "loss": 4.0396, + "step": 14322 + }, + { + "epoch": 1.220744907525782, + "grad_norm": 45.62402430766116, + "learning_rate": 7.395226116596502e-06, + "loss": 2.9677, + "step": 14323 + }, + { + "epoch": 1.2208301372198074, + "grad_norm": 64.75522688205685, + "learning_rate": 7.394790854446186e-06, + "loss": 2.8877, + "step": 14324 + }, + { + "epoch": 1.2209153669138328, + "grad_norm": 98.51689824908519, + "learning_rate": 7.39435556874405e-06, + "loss": 3.7884, + "step": 14325 + }, + { + "epoch": 1.221000596607858, + "grad_norm": 37.23618049149055, + "learning_rate": 7.3939202594943805e-06, + "loss": 2.9128, + "step": 14326 + }, + { + "epoch": 1.2210858263018836, + "grad_norm": 32.85012895193747, + "learning_rate": 7.393484926701456e-06, + "loss": 1.9639, + "step": 14327 + }, + { + "epoch": 1.221171055995909, + "grad_norm": 43.65048868370106, + "learning_rate": 7.393049570369559e-06, + "loss": 2.9586, + "step": 14328 + }, + { + "epoch": 1.2212562856899343, + "grad_norm": 32.302126853138496, + "learning_rate": 7.392614190502969e-06, + "loss": 2.7989, + "step": 14329 + }, + { + "epoch": 1.2213415153839597, + "grad_norm": 68.96278551620163, + "learning_rate": 7.39217878710597e-06, + "loss": 3.3701, + "step": 14330 + }, + { + "epoch": 1.2214267450779852, + "grad_norm": 51.691410061776736, + "learning_rate": 7.3917433601828435e-06, + "loss": 3.2047, + "step": 14331 + }, + { + "epoch": 1.2215119747720107, + "grad_norm": 60.55691367797447, + "learning_rate": 7.391307909737871e-06, + "loss": 2.7268, + "step": 14332 + }, + { + "epoch": 1.221597204466036, + "grad_norm": 42.968169198779854, + "learning_rate": 7.390872435775336e-06, + "loss": 2.8148, + "step": 14333 + }, + { + "epoch": 1.2216824341600614, + "grad_norm": 86.68229915658729, + "learning_rate": 7.39043693829952e-06, + "loss": 3.3793, + "step": 14334 + }, + { + "epoch": 1.2217676638540869, + "grad_norm": 41.47192471951106, + "learning_rate": 7.390001417314707e-06, + "loss": 3.0082, + "step": 14335 + }, + { + "epoch": 1.221852893548112, + "grad_norm": 52.5800367509242, + "learning_rate": 7.38956587282518e-06, + "loss": 3.1243, + "step": 14336 + }, + { + "epoch": 1.2219381232421376, + "grad_norm": 40.42900781362084, + "learning_rate": 7.389130304835221e-06, + "loss": 3.2532, + "step": 14337 + }, + { + "epoch": 1.222023352936163, + "grad_norm": 58.198780819949164, + "learning_rate": 7.388694713349116e-06, + "loss": 2.8629, + "step": 14338 + }, + { + "epoch": 1.2221085826301883, + "grad_norm": 50.45237904269214, + "learning_rate": 7.3882590983711475e-06, + "loss": 3.4336, + "step": 14339 + }, + { + "epoch": 1.2221938123242138, + "grad_norm": 82.47011843125684, + "learning_rate": 7.3878234599056e-06, + "loss": 3.0151, + "step": 14340 + }, + { + "epoch": 1.2222790420182392, + "grad_norm": 43.112215317491255, + "learning_rate": 7.387387797956755e-06, + "loss": 3.372, + "step": 14341 + }, + { + "epoch": 1.2223642717122645, + "grad_norm": 159.53520669084418, + "learning_rate": 7.386952112528903e-06, + "loss": 4.4326, + "step": 14342 + }, + { + "epoch": 1.22244950140629, + "grad_norm": 31.58437084236041, + "learning_rate": 7.386516403626325e-06, + "loss": 2.4816, + "step": 14343 + }, + { + "epoch": 1.2225347311003154, + "grad_norm": 31.893252189692856, + "learning_rate": 7.386080671253306e-06, + "loss": 2.9573, + "step": 14344 + }, + { + "epoch": 1.2226199607943407, + "grad_norm": 32.28947376245485, + "learning_rate": 7.385644915414129e-06, + "loss": 2.4018, + "step": 14345 + }, + { + "epoch": 1.2227051904883661, + "grad_norm": 39.117631860469075, + "learning_rate": 7.385209136113086e-06, + "loss": 2.9818, + "step": 14346 + }, + { + "epoch": 1.2227904201823916, + "grad_norm": 64.95120074441981, + "learning_rate": 7.384773333354458e-06, + "loss": 3.3462, + "step": 14347 + }, + { + "epoch": 1.2228756498764168, + "grad_norm": 21.738525511184115, + "learning_rate": 7.3843375071425315e-06, + "loss": 2.2262, + "step": 14348 + }, + { + "epoch": 1.2229608795704423, + "grad_norm": 46.07904211620872, + "learning_rate": 7.383901657481592e-06, + "loss": 3.3171, + "step": 14349 + }, + { + "epoch": 1.2230461092644678, + "grad_norm": 48.37513049326967, + "learning_rate": 7.3834657843759286e-06, + "loss": 1.7049, + "step": 14350 + }, + { + "epoch": 1.2231313389584932, + "grad_norm": 72.94206120815026, + "learning_rate": 7.383029887829825e-06, + "loss": 3.0985, + "step": 14351 + }, + { + "epoch": 1.2232165686525185, + "grad_norm": 42.848686048334066, + "learning_rate": 7.382593967847571e-06, + "loss": 2.8239, + "step": 14352 + }, + { + "epoch": 1.223301798346544, + "grad_norm": 37.90941447766763, + "learning_rate": 7.38215802443345e-06, + "loss": 2.2008, + "step": 14353 + }, + { + "epoch": 1.2233870280405694, + "grad_norm": 40.97300556986738, + "learning_rate": 7.381722057591754e-06, + "loss": 3.315, + "step": 14354 + }, + { + "epoch": 1.2234722577345947, + "grad_norm": 43.17465979886328, + "learning_rate": 7.381286067326766e-06, + "loss": 2.8355, + "step": 14355 + }, + { + "epoch": 1.2235574874286201, + "grad_norm": 56.36990024658557, + "learning_rate": 7.380850053642776e-06, + "loss": 3.1237, + "step": 14356 + }, + { + "epoch": 1.2236427171226456, + "grad_norm": 37.28145626280059, + "learning_rate": 7.380414016544072e-06, + "loss": 2.7244, + "step": 14357 + }, + { + "epoch": 1.2237279468166709, + "grad_norm": 28.228294338311812, + "learning_rate": 7.3799779560349415e-06, + "loss": 2.105, + "step": 14358 + }, + { + "epoch": 1.2238131765106963, + "grad_norm": 52.1189196806869, + "learning_rate": 7.379541872119675e-06, + "loss": 3.2376, + "step": 14359 + }, + { + "epoch": 1.2238984062047218, + "grad_norm": 59.36994574147161, + "learning_rate": 7.379105764802559e-06, + "loss": 3.1154, + "step": 14360 + }, + { + "epoch": 1.223983635898747, + "grad_norm": 57.37356565147473, + "learning_rate": 7.378669634087883e-06, + "loss": 3.1561, + "step": 14361 + }, + { + "epoch": 1.2240688655927725, + "grad_norm": 30.491079533920377, + "learning_rate": 7.378233479979935e-06, + "loss": 3.1687, + "step": 14362 + }, + { + "epoch": 1.224154095286798, + "grad_norm": 32.04370019957689, + "learning_rate": 7.3777973024830075e-06, + "loss": 2.9898, + "step": 14363 + }, + { + "epoch": 1.2242393249808234, + "grad_norm": 62.41798462024868, + "learning_rate": 7.377361101601388e-06, + "loss": 3.0589, + "step": 14364 + }, + { + "epoch": 1.2243245546748487, + "grad_norm": 66.20272279682968, + "learning_rate": 7.376924877339366e-06, + "loss": 3.7112, + "step": 14365 + }, + { + "epoch": 1.2244097843688742, + "grad_norm": 42.596765374290655, + "learning_rate": 7.376488629701233e-06, + "loss": 2.6577, + "step": 14366 + }, + { + "epoch": 1.2244950140628994, + "grad_norm": 51.56069624852058, + "learning_rate": 7.3760523586912794e-06, + "loss": 3.4361, + "step": 14367 + }, + { + "epoch": 1.2245802437569249, + "grad_norm": 52.10249920149653, + "learning_rate": 7.375616064313794e-06, + "loss": 2.1568, + "step": 14368 + }, + { + "epoch": 1.2246654734509503, + "grad_norm": 34.12505533227527, + "learning_rate": 7.375179746573069e-06, + "loss": 3.0596, + "step": 14369 + }, + { + "epoch": 1.2247507031449758, + "grad_norm": 56.09564066258295, + "learning_rate": 7.374743405473395e-06, + "loss": 2.5552, + "step": 14370 + }, + { + "epoch": 1.224835932839001, + "grad_norm": 254.97311318998283, + "learning_rate": 7.374307041019063e-06, + "loss": 2.0344, + "step": 14371 + }, + { + "epoch": 1.2249211625330265, + "grad_norm": 54.12761654203763, + "learning_rate": 7.3738706532143645e-06, + "loss": 2.5306, + "step": 14372 + }, + { + "epoch": 1.225006392227052, + "grad_norm": 57.043100796620934, + "learning_rate": 7.373434242063592e-06, + "loss": 3.183, + "step": 14373 + }, + { + "epoch": 1.2250916219210772, + "grad_norm": 31.621386002899552, + "learning_rate": 7.372997807571036e-06, + "loss": 2.2651, + "step": 14374 + }, + { + "epoch": 1.2251768516151027, + "grad_norm": 40.4716253285491, + "learning_rate": 7.37256134974099e-06, + "loss": 2.9904, + "step": 14375 + }, + { + "epoch": 1.2252620813091282, + "grad_norm": 33.97535166779204, + "learning_rate": 7.372124868577746e-06, + "loss": 2.6031, + "step": 14376 + }, + { + "epoch": 1.2253473110031534, + "grad_norm": 61.5287719676898, + "learning_rate": 7.371688364085597e-06, + "loss": 2.3981, + "step": 14377 + }, + { + "epoch": 1.2254325406971789, + "grad_norm": 31.24380935957792, + "learning_rate": 7.371251836268835e-06, + "loss": 2.492, + "step": 14378 + }, + { + "epoch": 1.2255177703912044, + "grad_norm": 101.32448095848176, + "learning_rate": 7.370815285131753e-06, + "loss": 2.9925, + "step": 14379 + }, + { + "epoch": 1.2256030000852296, + "grad_norm": 87.86342967924632, + "learning_rate": 7.370378710678644e-06, + "loss": 4.3805, + "step": 14380 + }, + { + "epoch": 1.225688229779255, + "grad_norm": 57.656123942351584, + "learning_rate": 7.3699421129138035e-06, + "loss": 2.6086, + "step": 14381 + }, + { + "epoch": 1.2257734594732805, + "grad_norm": 40.45222427349223, + "learning_rate": 7.369505491841525e-06, + "loss": 2.6603, + "step": 14382 + }, + { + "epoch": 1.225858689167306, + "grad_norm": 59.012369181957936, + "learning_rate": 7.3690688474661e-06, + "loss": 2.7479, + "step": 14383 + }, + { + "epoch": 1.2259439188613312, + "grad_norm": 170.27156515285466, + "learning_rate": 7.368632179791823e-06, + "loss": 4.0391, + "step": 14384 + }, + { + "epoch": 1.2260291485553567, + "grad_norm": 42.35216563273018, + "learning_rate": 7.368195488822992e-06, + "loss": 3.2293, + "step": 14385 + }, + { + "epoch": 1.226114378249382, + "grad_norm": 39.124597175845686, + "learning_rate": 7.367758774563899e-06, + "loss": 2.7468, + "step": 14386 + }, + { + "epoch": 1.2261996079434074, + "grad_norm": 35.282075945865095, + "learning_rate": 7.3673220370188405e-06, + "loss": 3.062, + "step": 14387 + }, + { + "epoch": 1.226284837637433, + "grad_norm": 27.128908715368883, + "learning_rate": 7.366885276192106e-06, + "loss": 2.313, + "step": 14388 + }, + { + "epoch": 1.2263700673314584, + "grad_norm": 69.77349816668931, + "learning_rate": 7.3664484920879996e-06, + "loss": 4.2324, + "step": 14389 + }, + { + "epoch": 1.2264552970254836, + "grad_norm": 74.07812045278999, + "learning_rate": 7.366011684710811e-06, + "loss": 3.7932, + "step": 14390 + }, + { + "epoch": 1.226540526719509, + "grad_norm": 39.4239464934395, + "learning_rate": 7.3655748540648386e-06, + "loss": 2.8284, + "step": 14391 + }, + { + "epoch": 1.2266257564135346, + "grad_norm": 44.44003201874804, + "learning_rate": 7.365138000154376e-06, + "loss": 3.4953, + "step": 14392 + }, + { + "epoch": 1.2267109861075598, + "grad_norm": 53.85280105496141, + "learning_rate": 7.364701122983722e-06, + "loss": 2.5662, + "step": 14393 + }, + { + "epoch": 1.2267962158015853, + "grad_norm": 65.96517021525673, + "learning_rate": 7.3642642225571715e-06, + "loss": 3.3302, + "step": 14394 + }, + { + "epoch": 1.2268814454956107, + "grad_norm": 53.17100960590328, + "learning_rate": 7.363827298879023e-06, + "loss": 3.6056, + "step": 14395 + }, + { + "epoch": 1.226966675189636, + "grad_norm": 58.824166573942755, + "learning_rate": 7.36339035195357e-06, + "loss": 3.4406, + "step": 14396 + }, + { + "epoch": 1.2270519048836614, + "grad_norm": 55.20922463549291, + "learning_rate": 7.362953381785114e-06, + "loss": 3.9082, + "step": 14397 + }, + { + "epoch": 1.227137134577687, + "grad_norm": 84.10876523101156, + "learning_rate": 7.362516388377951e-06, + "loss": 3.8164, + "step": 14398 + }, + { + "epoch": 1.2272223642717122, + "grad_norm": 40.91906072484806, + "learning_rate": 7.362079371736377e-06, + "loss": 2.9615, + "step": 14399 + }, + { + "epoch": 1.2273075939657376, + "grad_norm": 46.87885520783843, + "learning_rate": 7.361642331864691e-06, + "loss": 2.2293, + "step": 14400 + }, + { + "epoch": 1.227392823659763, + "grad_norm": 37.2385691583306, + "learning_rate": 7.361205268767191e-06, + "loss": 2.5716, + "step": 14401 + }, + { + "epoch": 1.2274780533537886, + "grad_norm": 46.557067567727465, + "learning_rate": 7.360768182448176e-06, + "loss": 2.6666, + "step": 14402 + }, + { + "epoch": 1.2275632830478138, + "grad_norm": 59.072087086693664, + "learning_rate": 7.360331072911943e-06, + "loss": 2.6641, + "step": 14403 + }, + { + "epoch": 1.2276485127418393, + "grad_norm": 79.06693673546185, + "learning_rate": 7.359893940162792e-06, + "loss": 2.9756, + "step": 14404 + }, + { + "epoch": 1.2277337424358647, + "grad_norm": 79.78974565604669, + "learning_rate": 7.359456784205022e-06, + "loss": 2.7114, + "step": 14405 + }, + { + "epoch": 1.22781897212989, + "grad_norm": 35.568278133757794, + "learning_rate": 7.359019605042933e-06, + "loss": 2.8619, + "step": 14406 + }, + { + "epoch": 1.2279042018239155, + "grad_norm": 63.28505284685305, + "learning_rate": 7.358582402680823e-06, + "loss": 2.6408, + "step": 14407 + }, + { + "epoch": 1.227989431517941, + "grad_norm": 35.63813031130417, + "learning_rate": 7.3581451771229915e-06, + "loss": 2.0421, + "step": 14408 + }, + { + "epoch": 1.2280746612119662, + "grad_norm": 53.30545970905928, + "learning_rate": 7.3577079283737396e-06, + "loss": 2.4862, + "step": 14409 + }, + { + "epoch": 1.2281598909059916, + "grad_norm": 57.19559984140501, + "learning_rate": 7.357270656437367e-06, + "loss": 2.0272, + "step": 14410 + }, + { + "epoch": 1.2282451206000171, + "grad_norm": 49.5319676214519, + "learning_rate": 7.356833361318175e-06, + "loss": 3.0919, + "step": 14411 + }, + { + "epoch": 1.2283303502940424, + "grad_norm": 90.31602715977898, + "learning_rate": 7.356396043020463e-06, + "loss": 4.0583, + "step": 14412 + }, + { + "epoch": 1.2284155799880678, + "grad_norm": 51.84090423211044, + "learning_rate": 7.355958701548533e-06, + "loss": 3.6095, + "step": 14413 + }, + { + "epoch": 1.2285008096820933, + "grad_norm": 105.61135987275921, + "learning_rate": 7.355521336906683e-06, + "loss": 3.8338, + "step": 14414 + }, + { + "epoch": 1.2285860393761185, + "grad_norm": 64.73826354688394, + "learning_rate": 7.355083949099219e-06, + "loss": 2.0333, + "step": 14415 + }, + { + "epoch": 1.228671269070144, + "grad_norm": 39.766382709743255, + "learning_rate": 7.354646538130439e-06, + "loss": 2.5416, + "step": 14416 + }, + { + "epoch": 1.2287564987641695, + "grad_norm": 53.10812774678622, + "learning_rate": 7.354209104004647e-06, + "loss": 2.3998, + "step": 14417 + }, + { + "epoch": 1.2288417284581947, + "grad_norm": 36.39723494427539, + "learning_rate": 7.353771646726143e-06, + "loss": 2.9137, + "step": 14418 + }, + { + "epoch": 1.2289269581522202, + "grad_norm": 41.53579769265955, + "learning_rate": 7.35333416629923e-06, + "loss": 2.923, + "step": 14419 + }, + { + "epoch": 1.2290121878462457, + "grad_norm": 51.25103520914761, + "learning_rate": 7.352896662728211e-06, + "loss": 2.2907, + "step": 14420 + }, + { + "epoch": 1.2290974175402711, + "grad_norm": 61.55564402359034, + "learning_rate": 7.352459136017389e-06, + "loss": 2.8607, + "step": 14421 + }, + { + "epoch": 1.2291826472342964, + "grad_norm": 67.36557742012562, + "learning_rate": 7.352021586171065e-06, + "loss": 3.9054, + "step": 14422 + }, + { + "epoch": 1.2292678769283218, + "grad_norm": 78.1186287876881, + "learning_rate": 7.351584013193543e-06, + "loss": 2.9745, + "step": 14423 + }, + { + "epoch": 1.2293531066223473, + "grad_norm": 67.2120688023698, + "learning_rate": 7.351146417089128e-06, + "loss": 2.4297, + "step": 14424 + }, + { + "epoch": 1.2294383363163726, + "grad_norm": 166.25563946063468, + "learning_rate": 7.350708797862121e-06, + "loss": 2.7327, + "step": 14425 + }, + { + "epoch": 1.229523566010398, + "grad_norm": 65.31968597344455, + "learning_rate": 7.350271155516828e-06, + "loss": 3.3457, + "step": 14426 + }, + { + "epoch": 1.2296087957044235, + "grad_norm": 56.050860882831344, + "learning_rate": 7.349833490057552e-06, + "loss": 3.7106, + "step": 14427 + }, + { + "epoch": 1.2296940253984487, + "grad_norm": 34.786422966645695, + "learning_rate": 7.349395801488597e-06, + "loss": 3.3063, + "step": 14428 + }, + { + "epoch": 1.2297792550924742, + "grad_norm": 47.58717458089554, + "learning_rate": 7.348958089814267e-06, + "loss": 3.138, + "step": 14429 + }, + { + "epoch": 1.2298644847864997, + "grad_norm": 45.0056449081475, + "learning_rate": 7.348520355038871e-06, + "loss": 3.0188, + "step": 14430 + }, + { + "epoch": 1.229949714480525, + "grad_norm": 64.40233167620246, + "learning_rate": 7.348082597166706e-06, + "loss": 2.0539, + "step": 14431 + }, + { + "epoch": 1.2300349441745504, + "grad_norm": 95.74838864125446, + "learning_rate": 7.347644816202084e-06, + "loss": 3.0688, + "step": 14432 + }, + { + "epoch": 1.2301201738685759, + "grad_norm": 105.25037802923215, + "learning_rate": 7.3472070121493075e-06, + "loss": 3.8855, + "step": 14433 + }, + { + "epoch": 1.2302054035626013, + "grad_norm": 45.988688456612046, + "learning_rate": 7.346769185012684e-06, + "loss": 4.0242, + "step": 14434 + }, + { + "epoch": 1.2302906332566266, + "grad_norm": 45.272838762283584, + "learning_rate": 7.346331334796516e-06, + "loss": 3.479, + "step": 14435 + }, + { + "epoch": 1.230375862950652, + "grad_norm": 92.13578263987822, + "learning_rate": 7.345893461505114e-06, + "loss": 3.8031, + "step": 14436 + }, + { + "epoch": 1.2304610926446773, + "grad_norm": 146.8942841370608, + "learning_rate": 7.345455565142779e-06, + "loss": 3.076, + "step": 14437 + }, + { + "epoch": 1.2305463223387028, + "grad_norm": 39.33434882763238, + "learning_rate": 7.345017645713823e-06, + "loss": 3.8924, + "step": 14438 + }, + { + "epoch": 1.2306315520327282, + "grad_norm": 49.43994762151696, + "learning_rate": 7.34457970322255e-06, + "loss": 2.7052, + "step": 14439 + }, + { + "epoch": 1.2307167817267537, + "grad_norm": 49.496171532303386, + "learning_rate": 7.344141737673267e-06, + "loss": 3.3525, + "step": 14440 + }, + { + "epoch": 1.230802011420779, + "grad_norm": 58.23560197043336, + "learning_rate": 7.343703749070281e-06, + "loss": 2.9528, + "step": 14441 + }, + { + "epoch": 1.2308872411148044, + "grad_norm": 117.22034800949979, + "learning_rate": 7.343265737417901e-06, + "loss": 2.546, + "step": 14442 + }, + { + "epoch": 1.2309724708088299, + "grad_norm": 37.019504629646015, + "learning_rate": 7.342827702720432e-06, + "loss": 2.2634, + "step": 14443 + }, + { + "epoch": 1.2310577005028551, + "grad_norm": 100.10880951327502, + "learning_rate": 7.342389644982184e-06, + "loss": 3.2777, + "step": 14444 + }, + { + "epoch": 1.2311429301968806, + "grad_norm": 36.78889928882161, + "learning_rate": 7.341951564207466e-06, + "loss": 2.9027, + "step": 14445 + }, + { + "epoch": 1.231228159890906, + "grad_norm": 69.84610617981141, + "learning_rate": 7.341513460400583e-06, + "loss": 3.9905, + "step": 14446 + }, + { + "epoch": 1.2313133895849313, + "grad_norm": 79.5443680709718, + "learning_rate": 7.341075333565847e-06, + "loss": 2.3275, + "step": 14447 + }, + { + "epoch": 1.2313986192789568, + "grad_norm": 35.74576493521951, + "learning_rate": 7.340637183707564e-06, + "loss": 3.0806, + "step": 14448 + }, + { + "epoch": 1.2314838489729822, + "grad_norm": 83.23188488255815, + "learning_rate": 7.340199010830046e-06, + "loss": 3.2035, + "step": 14449 + }, + { + "epoch": 1.2315690786670075, + "grad_norm": 42.699831100819786, + "learning_rate": 7.3397608149376e-06, + "loss": 2.7675, + "step": 14450 + }, + { + "epoch": 1.231654308361033, + "grad_norm": 168.4132004002849, + "learning_rate": 7.339322596034535e-06, + "loss": 3.7147, + "step": 14451 + }, + { + "epoch": 1.2317395380550584, + "grad_norm": 38.372892874429894, + "learning_rate": 7.338884354125162e-06, + "loss": 2.6615, + "step": 14452 + }, + { + "epoch": 1.231824767749084, + "grad_norm": 83.90872759896999, + "learning_rate": 7.338446089213793e-06, + "loss": 3.3306, + "step": 14453 + }, + { + "epoch": 1.2319099974431091, + "grad_norm": 74.08770765326084, + "learning_rate": 7.338007801304734e-06, + "loss": 2.938, + "step": 14454 + }, + { + "epoch": 1.2319952271371346, + "grad_norm": 69.58519246435515, + "learning_rate": 7.337569490402297e-06, + "loss": 4.3823, + "step": 14455 + }, + { + "epoch": 1.2320804568311599, + "grad_norm": 36.689698797880816, + "learning_rate": 7.337131156510795e-06, + "loss": 2.3424, + "step": 14456 + }, + { + "epoch": 1.2321656865251853, + "grad_norm": 87.62832057759164, + "learning_rate": 7.336692799634535e-06, + "loss": 3.1138, + "step": 14457 + }, + { + "epoch": 1.2322509162192108, + "grad_norm": 50.20312556301502, + "learning_rate": 7.33625441977783e-06, + "loss": 3.9584, + "step": 14458 + }, + { + "epoch": 1.2323361459132363, + "grad_norm": 54.58810914819455, + "learning_rate": 7.335816016944991e-06, + "loss": 2.7654, + "step": 14459 + }, + { + "epoch": 1.2324213756072615, + "grad_norm": 93.0110693138632, + "learning_rate": 7.33537759114033e-06, + "loss": 2.6102, + "step": 14460 + }, + { + "epoch": 1.232506605301287, + "grad_norm": 72.8780316181847, + "learning_rate": 7.334939142368158e-06, + "loss": 3.707, + "step": 14461 + }, + { + "epoch": 1.2325918349953124, + "grad_norm": 123.26538763376975, + "learning_rate": 7.334500670632788e-06, + "loss": 2.7933, + "step": 14462 + }, + { + "epoch": 1.2326770646893377, + "grad_norm": 57.29540727528387, + "learning_rate": 7.3340621759385315e-06, + "loss": 3.0093, + "step": 14463 + }, + { + "epoch": 1.2327622943833632, + "grad_norm": 50.807707461162046, + "learning_rate": 7.333623658289701e-06, + "loss": 3.1627, + "step": 14464 + }, + { + "epoch": 1.2328475240773886, + "grad_norm": 80.59979778315203, + "learning_rate": 7.3331851176906085e-06, + "loss": 3.0169, + "step": 14465 + }, + { + "epoch": 1.2329327537714139, + "grad_norm": 34.65464638386717, + "learning_rate": 7.332746554145567e-06, + "loss": 3.5977, + "step": 14466 + }, + { + "epoch": 1.2330179834654393, + "grad_norm": 66.56317370513347, + "learning_rate": 7.3323079676588916e-06, + "loss": 3.8092, + "step": 14467 + }, + { + "epoch": 1.2331032131594648, + "grad_norm": 30.839928337152013, + "learning_rate": 7.331869358234893e-06, + "loss": 2.6152, + "step": 14468 + }, + { + "epoch": 1.23318844285349, + "grad_norm": 82.35830444340313, + "learning_rate": 7.331430725877886e-06, + "loss": 2.1316, + "step": 14469 + }, + { + "epoch": 1.2332736725475155, + "grad_norm": 36.44233728230702, + "learning_rate": 7.3309920705921845e-06, + "loss": 2.6927, + "step": 14470 + }, + { + "epoch": 1.233358902241541, + "grad_norm": 37.75532692312291, + "learning_rate": 7.3305533923821026e-06, + "loss": 1.9973, + "step": 14471 + }, + { + "epoch": 1.2334441319355665, + "grad_norm": 36.629454640963445, + "learning_rate": 7.330114691251954e-06, + "loss": 2.7668, + "step": 14472 + }, + { + "epoch": 1.2335293616295917, + "grad_norm": 63.609336596601636, + "learning_rate": 7.3296759672060535e-06, + "loss": 3.4555, + "step": 14473 + }, + { + "epoch": 1.2336145913236172, + "grad_norm": 67.40260957039695, + "learning_rate": 7.329237220248714e-06, + "loss": 3.2641, + "step": 14474 + }, + { + "epoch": 1.2336998210176426, + "grad_norm": 60.807416946197065, + "learning_rate": 7.328798450384253e-06, + "loss": 3.9321, + "step": 14475 + }, + { + "epoch": 1.2337850507116679, + "grad_norm": 72.91548480557961, + "learning_rate": 7.328359657616986e-06, + "loss": 2.6235, + "step": 14476 + }, + { + "epoch": 1.2338702804056934, + "grad_norm": 32.994147952241576, + "learning_rate": 7.327920841951226e-06, + "loss": 2.9215, + "step": 14477 + }, + { + "epoch": 1.2339555100997188, + "grad_norm": 59.89630435659108, + "learning_rate": 7.3274820033912875e-06, + "loss": 3.5061, + "step": 14478 + }, + { + "epoch": 1.234040739793744, + "grad_norm": 54.223444619363995, + "learning_rate": 7.3270431419414915e-06, + "loss": 2.9897, + "step": 14479 + }, + { + "epoch": 1.2341259694877695, + "grad_norm": 29.10774714001841, + "learning_rate": 7.3266042576061494e-06, + "loss": 2.6219, + "step": 14480 + }, + { + "epoch": 1.234211199181795, + "grad_norm": 62.11825175391307, + "learning_rate": 7.326165350389579e-06, + "loss": 2.4739, + "step": 14481 + }, + { + "epoch": 1.2342964288758203, + "grad_norm": 46.20601541384119, + "learning_rate": 7.325726420296095e-06, + "loss": 3.0411, + "step": 14482 + }, + { + "epoch": 1.2343816585698457, + "grad_norm": 40.32984422745613, + "learning_rate": 7.325287467330019e-06, + "loss": 2.9345, + "step": 14483 + }, + { + "epoch": 1.2344668882638712, + "grad_norm": 49.24597111135078, + "learning_rate": 7.324848491495663e-06, + "loss": 2.6305, + "step": 14484 + }, + { + "epoch": 1.2345521179578967, + "grad_norm": 68.9045800324309, + "learning_rate": 7.324409492797347e-06, + "loss": 3.1254, + "step": 14485 + }, + { + "epoch": 1.234637347651922, + "grad_norm": 29.63404250338708, + "learning_rate": 7.323970471239386e-06, + "loss": 1.9509, + "step": 14486 + }, + { + "epoch": 1.2347225773459474, + "grad_norm": 27.1915484502183, + "learning_rate": 7.323531426826099e-06, + "loss": 2.4595, + "step": 14487 + }, + { + "epoch": 1.2348078070399726, + "grad_norm": 42.71581932512672, + "learning_rate": 7.323092359561804e-06, + "loss": 2.9863, + "step": 14488 + }, + { + "epoch": 1.234893036733998, + "grad_norm": 48.765922450035376, + "learning_rate": 7.322653269450819e-06, + "loss": 2.719, + "step": 14489 + }, + { + "epoch": 1.2349782664280236, + "grad_norm": 54.50777536787745, + "learning_rate": 7.322214156497462e-06, + "loss": 4.0831, + "step": 14490 + }, + { + "epoch": 1.235063496122049, + "grad_norm": 42.83905759736581, + "learning_rate": 7.32177502070605e-06, + "loss": 3.043, + "step": 14491 + }, + { + "epoch": 1.2351487258160743, + "grad_norm": 47.616478865243494, + "learning_rate": 7.3213358620809064e-06, + "loss": 3.0349, + "step": 14492 + }, + { + "epoch": 1.2352339555100997, + "grad_norm": 86.83325890979344, + "learning_rate": 7.320896680626344e-06, + "loss": 3.7663, + "step": 14493 + }, + { + "epoch": 1.2353191852041252, + "grad_norm": 60.29859725362648, + "learning_rate": 7.320457476346686e-06, + "loss": 2.764, + "step": 14494 + }, + { + "epoch": 1.2354044148981504, + "grad_norm": 40.03184179583078, + "learning_rate": 7.320018249246251e-06, + "loss": 2.6968, + "step": 14495 + }, + { + "epoch": 1.235489644592176, + "grad_norm": 94.37328045063711, + "learning_rate": 7.3195789993293585e-06, + "loss": 2.937, + "step": 14496 + }, + { + "epoch": 1.2355748742862014, + "grad_norm": 32.92865347558772, + "learning_rate": 7.319139726600328e-06, + "loss": 3.3759, + "step": 14497 + }, + { + "epoch": 1.2356601039802266, + "grad_norm": 63.512887799369594, + "learning_rate": 7.31870043106348e-06, + "loss": 3.6477, + "step": 14498 + }, + { + "epoch": 1.235745333674252, + "grad_norm": 54.989132858664455, + "learning_rate": 7.318261112723134e-06, + "loss": 3.3498, + "step": 14499 + }, + { + "epoch": 1.2358305633682776, + "grad_norm": 83.16435650939742, + "learning_rate": 7.317821771583613e-06, + "loss": 3.8572, + "step": 14500 + }, + { + "epoch": 1.2359157930623028, + "grad_norm": 39.959901485010256, + "learning_rate": 7.317382407649234e-06, + "loss": 2.9215, + "step": 14501 + }, + { + "epoch": 1.2360010227563283, + "grad_norm": 70.71338391226944, + "learning_rate": 7.31694302092432e-06, + "loss": 2.5394, + "step": 14502 + }, + { + "epoch": 1.2360862524503538, + "grad_norm": 24.764111930645214, + "learning_rate": 7.316503611413193e-06, + "loss": 2.3592, + "step": 14503 + }, + { + "epoch": 1.2361714821443792, + "grad_norm": 69.24522772902657, + "learning_rate": 7.316064179120173e-06, + "loss": 3.2535, + "step": 14504 + }, + { + "epoch": 1.2362567118384045, + "grad_norm": 39.562566522821584, + "learning_rate": 7.315624724049581e-06, + "loss": 2.7776, + "step": 14505 + }, + { + "epoch": 1.23634194153243, + "grad_norm": 63.48756060927834, + "learning_rate": 7.315185246205742e-06, + "loss": 3.5706, + "step": 14506 + }, + { + "epoch": 1.2364271712264552, + "grad_norm": 41.04448557594372, + "learning_rate": 7.314745745592975e-06, + "loss": 2.6063, + "step": 14507 + }, + { + "epoch": 1.2365124009204806, + "grad_norm": 33.617461180513885, + "learning_rate": 7.314306222215603e-06, + "loss": 1.7562, + "step": 14508 + }, + { + "epoch": 1.2365976306145061, + "grad_norm": 96.55144518606654, + "learning_rate": 7.31386667607795e-06, + "loss": 3.4571, + "step": 14509 + }, + { + "epoch": 1.2366828603085316, + "grad_norm": 59.3301835237916, + "learning_rate": 7.313427107184337e-06, + "loss": 2.5048, + "step": 14510 + }, + { + "epoch": 1.2367680900025568, + "grad_norm": 40.41537296446969, + "learning_rate": 7.312987515539087e-06, + "loss": 3.4359, + "step": 14511 + }, + { + "epoch": 1.2368533196965823, + "grad_norm": 32.28096370060833, + "learning_rate": 7.312547901146525e-06, + "loss": 2.0851, + "step": 14512 + }, + { + "epoch": 1.2369385493906078, + "grad_norm": 30.174871030020874, + "learning_rate": 7.3121082640109735e-06, + "loss": 2.9085, + "step": 14513 + }, + { + "epoch": 1.237023779084633, + "grad_norm": 58.30141501448529, + "learning_rate": 7.311668604136755e-06, + "loss": 3.8076, + "step": 14514 + }, + { + "epoch": 1.2371090087786585, + "grad_norm": 87.37110373149669, + "learning_rate": 7.311228921528195e-06, + "loss": 3.0266, + "step": 14515 + }, + { + "epoch": 1.237194238472684, + "grad_norm": 44.847736024187434, + "learning_rate": 7.310789216189618e-06, + "loss": 2.5458, + "step": 14516 + }, + { + "epoch": 1.2372794681667092, + "grad_norm": 66.89200903891931, + "learning_rate": 7.310349488125343e-06, + "loss": 4.3146, + "step": 14517 + }, + { + "epoch": 1.2373646978607347, + "grad_norm": 60.54447957106423, + "learning_rate": 7.3099097373397036e-06, + "loss": 3.1367, + "step": 14518 + }, + { + "epoch": 1.2374499275547601, + "grad_norm": 34.235343201715345, + "learning_rate": 7.309469963837017e-06, + "loss": 2.5671, + "step": 14519 + }, + { + "epoch": 1.2375351572487854, + "grad_norm": 53.645563161422416, + "learning_rate": 7.3090301676216134e-06, + "loss": 2.7143, + "step": 14520 + }, + { + "epoch": 1.2376203869428108, + "grad_norm": 34.988664025619215, + "learning_rate": 7.308590348697813e-06, + "loss": 2.4479, + "step": 14521 + }, + { + "epoch": 1.2377056166368363, + "grad_norm": 64.48539908558456, + "learning_rate": 7.308150507069945e-06, + "loss": 2.5, + "step": 14522 + }, + { + "epoch": 1.2377908463308618, + "grad_norm": 64.72754524986713, + "learning_rate": 7.3077106427423335e-06, + "loss": 2.9702, + "step": 14523 + }, + { + "epoch": 1.237876076024887, + "grad_norm": 32.18475375983351, + "learning_rate": 7.3072707557193065e-06, + "loss": 3.2568, + "step": 14524 + }, + { + "epoch": 1.2379613057189125, + "grad_norm": 42.281420019840304, + "learning_rate": 7.306830846005186e-06, + "loss": 2.1286, + "step": 14525 + }, + { + "epoch": 1.2380465354129377, + "grad_norm": 40.47216524746959, + "learning_rate": 7.306390913604303e-06, + "loss": 2.8427, + "step": 14526 + }, + { + "epoch": 1.2381317651069632, + "grad_norm": 70.92777750822756, + "learning_rate": 7.3059509585209805e-06, + "loss": 2.259, + "step": 14527 + }, + { + "epoch": 1.2382169948009887, + "grad_norm": 50.56054069409744, + "learning_rate": 7.305510980759548e-06, + "loss": 2.3399, + "step": 14528 + }, + { + "epoch": 1.2383022244950141, + "grad_norm": 32.22448929952189, + "learning_rate": 7.305070980324329e-06, + "loss": 2.7868, + "step": 14529 + }, + { + "epoch": 1.2383874541890394, + "grad_norm": 38.11416748190287, + "learning_rate": 7.304630957219654e-06, + "loss": 2.2638, + "step": 14530 + }, + { + "epoch": 1.2384726838830649, + "grad_norm": 31.27633645490824, + "learning_rate": 7.30419091144985e-06, + "loss": 2.9351, + "step": 14531 + }, + { + "epoch": 1.2385579135770903, + "grad_norm": 37.08752997360154, + "learning_rate": 7.303750843019244e-06, + "loss": 2.4481, + "step": 14532 + }, + { + "epoch": 1.2386431432711156, + "grad_norm": 34.93194439398638, + "learning_rate": 7.303310751932162e-06, + "loss": 2.3647, + "step": 14533 + }, + { + "epoch": 1.238728372965141, + "grad_norm": 30.432117070996785, + "learning_rate": 7.302870638192935e-06, + "loss": 2.2077, + "step": 14534 + }, + { + "epoch": 1.2388136026591665, + "grad_norm": 116.13071421136762, + "learning_rate": 7.30243050180589e-06, + "loss": 3.4664, + "step": 14535 + }, + { + "epoch": 1.2388988323531918, + "grad_norm": 33.65756254711682, + "learning_rate": 7.301990342775357e-06, + "loss": 2.9553, + "step": 14536 + }, + { + "epoch": 1.2389840620472172, + "grad_norm": 53.82648697150277, + "learning_rate": 7.301550161105663e-06, + "loss": 2.9476, + "step": 14537 + }, + { + "epoch": 1.2390692917412427, + "grad_norm": 86.31283286692054, + "learning_rate": 7.301109956801138e-06, + "loss": 3.356, + "step": 14538 + }, + { + "epoch": 1.239154521435268, + "grad_norm": 57.989181415706646, + "learning_rate": 7.300669729866111e-06, + "loss": 2.4756, + "step": 14539 + }, + { + "epoch": 1.2392397511292934, + "grad_norm": 62.62661394433004, + "learning_rate": 7.300229480304911e-06, + "loss": 3.0288, + "step": 14540 + }, + { + "epoch": 1.2393249808233189, + "grad_norm": 28.657233795334406, + "learning_rate": 7.299789208121868e-06, + "loss": 2.0156, + "step": 14541 + }, + { + "epoch": 1.2394102105173443, + "grad_norm": 44.09253827575324, + "learning_rate": 7.299348913321311e-06, + "loss": 3.6805, + "step": 14542 + }, + { + "epoch": 1.2394954402113696, + "grad_norm": 73.78655018486283, + "learning_rate": 7.298908595907574e-06, + "loss": 4.1702, + "step": 14543 + }, + { + "epoch": 1.239580669905395, + "grad_norm": 39.544285074019584, + "learning_rate": 7.298468255884983e-06, + "loss": 3.1033, + "step": 14544 + }, + { + "epoch": 1.2396658995994205, + "grad_norm": 119.4805501381306, + "learning_rate": 7.298027893257869e-06, + "loss": 3.8658, + "step": 14545 + }, + { + "epoch": 1.2397511292934458, + "grad_norm": 68.68693601593169, + "learning_rate": 7.297587508030565e-06, + "loss": 3.7264, + "step": 14546 + }, + { + "epoch": 1.2398363589874712, + "grad_norm": 78.68738455023542, + "learning_rate": 7.2971471002074e-06, + "loss": 3.5529, + "step": 14547 + }, + { + "epoch": 1.2399215886814967, + "grad_norm": 21.279475088659957, + "learning_rate": 7.296706669792707e-06, + "loss": 1.3816, + "step": 14548 + }, + { + "epoch": 1.240006818375522, + "grad_norm": 87.51484595775095, + "learning_rate": 7.296266216790815e-06, + "loss": 4.4288, + "step": 14549 + }, + { + "epoch": 1.2400920480695474, + "grad_norm": 72.27441413337993, + "learning_rate": 7.295825741206059e-06, + "loss": 3.6792, + "step": 14550 + }, + { + "epoch": 1.240177277763573, + "grad_norm": 43.718651157421355, + "learning_rate": 7.295385243042769e-06, + "loss": 2.9309, + "step": 14551 + }, + { + "epoch": 1.2402625074575981, + "grad_norm": 47.41514907101834, + "learning_rate": 7.294944722305276e-06, + "loss": 3.4306, + "step": 14552 + }, + { + "epoch": 1.2403477371516236, + "grad_norm": 70.12554483195426, + "learning_rate": 7.294504178997914e-06, + "loss": 4.4278, + "step": 14553 + }, + { + "epoch": 1.240432966845649, + "grad_norm": 90.00171705923373, + "learning_rate": 7.2940636131250155e-06, + "loss": 2.8871, + "step": 14554 + }, + { + "epoch": 1.2405181965396745, + "grad_norm": 39.793683513911176, + "learning_rate": 7.2936230246909125e-06, + "loss": 3.0684, + "step": 14555 + }, + { + "epoch": 1.2406034262336998, + "grad_norm": 39.56489250974149, + "learning_rate": 7.2931824136999384e-06, + "loss": 3.4393, + "step": 14556 + }, + { + "epoch": 1.2406886559277253, + "grad_norm": 41.29714055208704, + "learning_rate": 7.292741780156427e-06, + "loss": 2.2179, + "step": 14557 + }, + { + "epoch": 1.2407738856217505, + "grad_norm": 28.64659300685747, + "learning_rate": 7.2923011240647115e-06, + "loss": 2.5553, + "step": 14558 + }, + { + "epoch": 1.240859115315776, + "grad_norm": 59.15644654871316, + "learning_rate": 7.2918604454291255e-06, + "loss": 2.9514, + "step": 14559 + }, + { + "epoch": 1.2409443450098014, + "grad_norm": 66.22654127467446, + "learning_rate": 7.2914197442540004e-06, + "loss": 3.124, + "step": 14560 + }, + { + "epoch": 1.241029574703827, + "grad_norm": 29.48499611819497, + "learning_rate": 7.290979020543676e-06, + "loss": 1.7361, + "step": 14561 + }, + { + "epoch": 1.2411148043978522, + "grad_norm": 43.741289902911035, + "learning_rate": 7.290538274302481e-06, + "loss": 3.697, + "step": 14562 + }, + { + "epoch": 1.2412000340918776, + "grad_norm": 32.22039830835178, + "learning_rate": 7.2900975055347535e-06, + "loss": 2.5586, + "step": 14563 + }, + { + "epoch": 1.241285263785903, + "grad_norm": 50.88046842117662, + "learning_rate": 7.289656714244825e-06, + "loss": 3.5639, + "step": 14564 + }, + { + "epoch": 1.2413704934799283, + "grad_norm": 51.76063623620823, + "learning_rate": 7.289215900437034e-06, + "loss": 2.3855, + "step": 14565 + }, + { + "epoch": 1.2414557231739538, + "grad_norm": 54.14335587078225, + "learning_rate": 7.288775064115714e-06, + "loss": 2.1175, + "step": 14566 + }, + { + "epoch": 1.2415409528679793, + "grad_norm": 44.95839043792977, + "learning_rate": 7.2883342052852015e-06, + "loss": 3.6024, + "step": 14567 + }, + { + "epoch": 1.2416261825620045, + "grad_norm": 65.4960767258658, + "learning_rate": 7.2878933239498295e-06, + "loss": 4.5325, + "step": 14568 + }, + { + "epoch": 1.24171141225603, + "grad_norm": 52.198491073851145, + "learning_rate": 7.2874524201139385e-06, + "loss": 2.8608, + "step": 14569 + }, + { + "epoch": 1.2417966419500555, + "grad_norm": 59.078926275185104, + "learning_rate": 7.28701149378186e-06, + "loss": 3.7868, + "step": 14570 + }, + { + "epoch": 1.2418818716440807, + "grad_norm": 75.28704606042007, + "learning_rate": 7.286570544957933e-06, + "loss": 2.4399, + "step": 14571 + }, + { + "epoch": 1.2419671013381062, + "grad_norm": 37.34971069677421, + "learning_rate": 7.286129573646492e-06, + "loss": 2.7904, + "step": 14572 + }, + { + "epoch": 1.2420523310321316, + "grad_norm": 28.275043492875383, + "learning_rate": 7.285688579851876e-06, + "loss": 2.1733, + "step": 14573 + }, + { + "epoch": 1.242137560726157, + "grad_norm": 89.90753216986678, + "learning_rate": 7.285247563578423e-06, + "loss": 2.5674, + "step": 14574 + }, + { + "epoch": 1.2422227904201824, + "grad_norm": 68.67481701946801, + "learning_rate": 7.2848065248304655e-06, + "loss": 2.2314, + "step": 14575 + }, + { + "epoch": 1.2423080201142078, + "grad_norm": 63.71237273119864, + "learning_rate": 7.284365463612344e-06, + "loss": 2.9221, + "step": 14576 + }, + { + "epoch": 1.242393249808233, + "grad_norm": 185.30630643863807, + "learning_rate": 7.283924379928397e-06, + "loss": 4.4395, + "step": 14577 + }, + { + "epoch": 1.2424784795022585, + "grad_norm": 55.41338418790388, + "learning_rate": 7.28348327378296e-06, + "loss": 2.7493, + "step": 14578 + }, + { + "epoch": 1.242563709196284, + "grad_norm": 89.42443709317655, + "learning_rate": 7.283042145180373e-06, + "loss": 3.2236, + "step": 14579 + }, + { + "epoch": 1.2426489388903095, + "grad_norm": 18.311147899570397, + "learning_rate": 7.282600994124973e-06, + "loss": 1.5863, + "step": 14580 + }, + { + "epoch": 1.2427341685843347, + "grad_norm": 48.53733155733504, + "learning_rate": 7.282159820621101e-06, + "loss": 3.3867, + "step": 14581 + }, + { + "epoch": 1.2428193982783602, + "grad_norm": 39.265873182419114, + "learning_rate": 7.281718624673093e-06, + "loss": 2.4587, + "step": 14582 + }, + { + "epoch": 1.2429046279723857, + "grad_norm": 62.60655794834283, + "learning_rate": 7.281277406285289e-06, + "loss": 2.2796, + "step": 14583 + }, + { + "epoch": 1.242989857666411, + "grad_norm": 36.637184826207694, + "learning_rate": 7.280836165462028e-06, + "loss": 2.2385, + "step": 14584 + }, + { + "epoch": 1.2430750873604364, + "grad_norm": 82.39841367643464, + "learning_rate": 7.280394902207649e-06, + "loss": 2.8452, + "step": 14585 + }, + { + "epoch": 1.2431603170544618, + "grad_norm": 69.72013499150354, + "learning_rate": 7.279953616526494e-06, + "loss": 3.3791, + "step": 14586 + }, + { + "epoch": 1.243245546748487, + "grad_norm": 32.69860507269192, + "learning_rate": 7.2795123084229e-06, + "loss": 2.7982, + "step": 14587 + }, + { + "epoch": 1.2433307764425126, + "grad_norm": 35.192004322130934, + "learning_rate": 7.279070977901208e-06, + "loss": 3.1717, + "step": 14588 + }, + { + "epoch": 1.243416006136538, + "grad_norm": 50.40632990541017, + "learning_rate": 7.278629624965761e-06, + "loss": 2.559, + "step": 14589 + }, + { + "epoch": 1.2435012358305633, + "grad_norm": 121.83119061547747, + "learning_rate": 7.278188249620895e-06, + "loss": 3.3548, + "step": 14590 + }, + { + "epoch": 1.2435864655245887, + "grad_norm": 42.561853867487535, + "learning_rate": 7.277746851870952e-06, + "loss": 3.2743, + "step": 14591 + }, + { + "epoch": 1.2436716952186142, + "grad_norm": 28.933742730762837, + "learning_rate": 7.277305431720274e-06, + "loss": 2.4395, + "step": 14592 + }, + { + "epoch": 1.2437569249126397, + "grad_norm": 49.416126554225656, + "learning_rate": 7.276863989173204e-06, + "loss": 3.4671, + "step": 14593 + }, + { + "epoch": 1.243842154606665, + "grad_norm": 53.0508317286693, + "learning_rate": 7.2764225242340795e-06, + "loss": 2.342, + "step": 14594 + }, + { + "epoch": 1.2439273843006904, + "grad_norm": 73.6486945641521, + "learning_rate": 7.275981036907245e-06, + "loss": 4.9714, + "step": 14595 + }, + { + "epoch": 1.2440126139947159, + "grad_norm": 82.51155853826488, + "learning_rate": 7.275539527197039e-06, + "loss": 2.2534, + "step": 14596 + }, + { + "epoch": 1.244097843688741, + "grad_norm": 57.9041785889813, + "learning_rate": 7.275097995107809e-06, + "loss": 3.0775, + "step": 14597 + }, + { + "epoch": 1.2441830733827666, + "grad_norm": 28.614778945692198, + "learning_rate": 7.274656440643894e-06, + "loss": 2.6244, + "step": 14598 + }, + { + "epoch": 1.244268303076792, + "grad_norm": 51.726023941683934, + "learning_rate": 7.2742148638096356e-06, + "loss": 2.4122, + "step": 14599 + }, + { + "epoch": 1.2443535327708173, + "grad_norm": 60.514608661250314, + "learning_rate": 7.2737732646093785e-06, + "loss": 3.5647, + "step": 14600 + }, + { + "epoch": 1.2444387624648428, + "grad_norm": 35.39995618315236, + "learning_rate": 7.273331643047464e-06, + "loss": 2.0253, + "step": 14601 + }, + { + "epoch": 1.2445239921588682, + "grad_norm": 34.59500306096257, + "learning_rate": 7.272889999128237e-06, + "loss": 2.9894, + "step": 14602 + }, + { + "epoch": 1.2446092218528935, + "grad_norm": 28.90596710025535, + "learning_rate": 7.272448332856039e-06, + "loss": 2.1522, + "step": 14603 + }, + { + "epoch": 1.244694451546919, + "grad_norm": 41.42359748220768, + "learning_rate": 7.272006644235217e-06, + "loss": 2.4219, + "step": 14604 + }, + { + "epoch": 1.2447796812409444, + "grad_norm": 38.91021671416994, + "learning_rate": 7.27156493327011e-06, + "loss": 3.0715, + "step": 14605 + }, + { + "epoch": 1.2448649109349696, + "grad_norm": 66.8574938424463, + "learning_rate": 7.271123199965064e-06, + "loss": 2.5637, + "step": 14606 + }, + { + "epoch": 1.2449501406289951, + "grad_norm": 110.60287900719489, + "learning_rate": 7.270681444324425e-06, + "loss": 2.8984, + "step": 14607 + }, + { + "epoch": 1.2450353703230206, + "grad_norm": 36.98479370456389, + "learning_rate": 7.270239666352538e-06, + "loss": 3.0092, + "step": 14608 + }, + { + "epoch": 1.2451206000170458, + "grad_norm": 32.097862916093014, + "learning_rate": 7.2697978660537425e-06, + "loss": 2.6955, + "step": 14609 + }, + { + "epoch": 1.2452058297110713, + "grad_norm": 84.93213759766499, + "learning_rate": 7.269356043432389e-06, + "loss": 2.8646, + "step": 14610 + }, + { + "epoch": 1.2452910594050968, + "grad_norm": 40.68902329972477, + "learning_rate": 7.26891419849282e-06, + "loss": 2.9632, + "step": 14611 + }, + { + "epoch": 1.2453762890991222, + "grad_norm": 43.95998836650538, + "learning_rate": 7.268472331239382e-06, + "loss": 3.0696, + "step": 14612 + }, + { + "epoch": 1.2454615187931475, + "grad_norm": 75.77653866075504, + "learning_rate": 7.2680304416764205e-06, + "loss": 3.1971, + "step": 14613 + }, + { + "epoch": 1.245546748487173, + "grad_norm": 44.228239198348675, + "learning_rate": 7.26758852980828e-06, + "loss": 2.9373, + "step": 14614 + }, + { + "epoch": 1.2456319781811984, + "grad_norm": 61.524075203043246, + "learning_rate": 7.267146595639306e-06, + "loss": 2.6839, + "step": 14615 + }, + { + "epoch": 1.2457172078752237, + "grad_norm": 40.63218565548008, + "learning_rate": 7.266704639173848e-06, + "loss": 3.7854, + "step": 14616 + }, + { + "epoch": 1.2458024375692491, + "grad_norm": 57.77987959726307, + "learning_rate": 7.266262660416251e-06, + "loss": 3.4332, + "step": 14617 + }, + { + "epoch": 1.2458876672632746, + "grad_norm": 83.36971840318296, + "learning_rate": 7.26582065937086e-06, + "loss": 3.2123, + "step": 14618 + }, + { + "epoch": 1.2459728969572998, + "grad_norm": 76.29448972192525, + "learning_rate": 7.265378636042024e-06, + "loss": 2.9488, + "step": 14619 + }, + { + "epoch": 1.2460581266513253, + "grad_norm": 65.62810785568216, + "learning_rate": 7.264936590434089e-06, + "loss": 2.2343, + "step": 14620 + }, + { + "epoch": 1.2461433563453508, + "grad_norm": 175.60742328244527, + "learning_rate": 7.264494522551402e-06, + "loss": 2.859, + "step": 14621 + }, + { + "epoch": 1.246228586039376, + "grad_norm": 130.70427034349635, + "learning_rate": 7.264052432398312e-06, + "loss": 1.5837, + "step": 14622 + }, + { + "epoch": 1.2463138157334015, + "grad_norm": 62.01095011041553, + "learning_rate": 7.263610319979165e-06, + "loss": 2.8068, + "step": 14623 + }, + { + "epoch": 1.246399045427427, + "grad_norm": 74.2183240942211, + "learning_rate": 7.26316818529831e-06, + "loss": 3.3547, + "step": 14624 + }, + { + "epoch": 1.2464842751214524, + "grad_norm": 25.571240443791602, + "learning_rate": 7.2627260283600965e-06, + "loss": 2.0317, + "step": 14625 + }, + { + "epoch": 1.2465695048154777, + "grad_norm": 37.60507574242734, + "learning_rate": 7.2622838491688705e-06, + "loss": 2.9142, + "step": 14626 + }, + { + "epoch": 1.2466547345095031, + "grad_norm": 60.804418889109655, + "learning_rate": 7.2618416477289825e-06, + "loss": 1.581, + "step": 14627 + }, + { + "epoch": 1.2467399642035284, + "grad_norm": 32.09617495229266, + "learning_rate": 7.261399424044779e-06, + "loss": 2.1416, + "step": 14628 + }, + { + "epoch": 1.2468251938975539, + "grad_norm": 93.98725154622414, + "learning_rate": 7.260957178120613e-06, + "loss": 3.2791, + "step": 14629 + }, + { + "epoch": 1.2469104235915793, + "grad_norm": 101.32974809059613, + "learning_rate": 7.260514909960831e-06, + "loss": 3.7715, + "step": 14630 + }, + { + "epoch": 1.2469956532856048, + "grad_norm": 44.94682474552956, + "learning_rate": 7.260072619569782e-06, + "loss": 3.1033, + "step": 14631 + }, + { + "epoch": 1.24708088297963, + "grad_norm": 64.59289380096985, + "learning_rate": 7.259630306951819e-06, + "loss": 3.0463, + "step": 14632 + }, + { + "epoch": 1.2471661126736555, + "grad_norm": 45.49801082697077, + "learning_rate": 7.259187972111288e-06, + "loss": 2.9322, + "step": 14633 + }, + { + "epoch": 1.247251342367681, + "grad_norm": 42.263315755425744, + "learning_rate": 7.258745615052542e-06, + "loss": 2.8024, + "step": 14634 + }, + { + "epoch": 1.2473365720617062, + "grad_norm": 70.80557624745978, + "learning_rate": 7.258303235779929e-06, + "loss": 2.626, + "step": 14635 + }, + { + "epoch": 1.2474218017557317, + "grad_norm": 59.09213028677999, + "learning_rate": 7.257860834297803e-06, + "loss": 2.9725, + "step": 14636 + }, + { + "epoch": 1.2475070314497572, + "grad_norm": 33.99904615899573, + "learning_rate": 7.257418410610511e-06, + "loss": 2.7779, + "step": 14637 + }, + { + "epoch": 1.2475922611437824, + "grad_norm": 60.31747893546083, + "learning_rate": 7.256975964722406e-06, + "loss": 2.5928, + "step": 14638 + }, + { + "epoch": 1.2476774908378079, + "grad_norm": 75.89772990361406, + "learning_rate": 7.25653349663784e-06, + "loss": 3.536, + "step": 14639 + }, + { + "epoch": 1.2477627205318333, + "grad_norm": 73.95910054948493, + "learning_rate": 7.256091006361164e-06, + "loss": 3.6108, + "step": 14640 + }, + { + "epoch": 1.2478479502258586, + "grad_norm": 81.82652517446822, + "learning_rate": 7.25564849389673e-06, + "loss": 3.4699, + "step": 14641 + }, + { + "epoch": 1.247933179919884, + "grad_norm": 71.75995693474181, + "learning_rate": 7.255205959248888e-06, + "loss": 2.8271, + "step": 14642 + }, + { + "epoch": 1.2480184096139095, + "grad_norm": 37.07173071090691, + "learning_rate": 7.254763402421991e-06, + "loss": 3.0633, + "step": 14643 + }, + { + "epoch": 1.248103639307935, + "grad_norm": 87.0524056023728, + "learning_rate": 7.2543208234203934e-06, + "loss": 3.0471, + "step": 14644 + }, + { + "epoch": 1.2481888690019602, + "grad_norm": 43.18870849925465, + "learning_rate": 7.253878222248446e-06, + "loss": 3.262, + "step": 14645 + }, + { + "epoch": 1.2482740986959857, + "grad_norm": 97.79904965931863, + "learning_rate": 7.253435598910501e-06, + "loss": 3.4304, + "step": 14646 + }, + { + "epoch": 1.248359328390011, + "grad_norm": 48.60455124535757, + "learning_rate": 7.2529929534109135e-06, + "loss": 2.6337, + "step": 14647 + }, + { + "epoch": 1.2484445580840364, + "grad_norm": 29.315004288100074, + "learning_rate": 7.252550285754035e-06, + "loss": 2.187, + "step": 14648 + }, + { + "epoch": 1.248529787778062, + "grad_norm": 54.246797905178425, + "learning_rate": 7.25210759594422e-06, + "loss": 2.8342, + "step": 14649 + }, + { + "epoch": 1.2486150174720874, + "grad_norm": 25.27408696865675, + "learning_rate": 7.25166488398582e-06, + "loss": 2.1165, + "step": 14650 + }, + { + "epoch": 1.2487002471661126, + "grad_norm": 67.41720156276762, + "learning_rate": 7.251222149883193e-06, + "loss": 3.2204, + "step": 14651 + }, + { + "epoch": 1.248785476860138, + "grad_norm": 41.71099518297631, + "learning_rate": 7.250779393640689e-06, + "loss": 2.5673, + "step": 14652 + }, + { + "epoch": 1.2488707065541635, + "grad_norm": 173.4357092171755, + "learning_rate": 7.250336615262664e-06, + "loss": 3.2139, + "step": 14653 + }, + { + "epoch": 1.2489559362481888, + "grad_norm": 30.112183316608768, + "learning_rate": 7.2498938147534735e-06, + "loss": 2.9271, + "step": 14654 + }, + { + "epoch": 1.2490411659422143, + "grad_norm": 72.13773659315808, + "learning_rate": 7.2494509921174715e-06, + "loss": 3.7413, + "step": 14655 + }, + { + "epoch": 1.2491263956362397, + "grad_norm": 36.91540206309681, + "learning_rate": 7.249008147359013e-06, + "loss": 2.6576, + "step": 14656 + }, + { + "epoch": 1.249211625330265, + "grad_norm": 101.48286262654408, + "learning_rate": 7.248565280482453e-06, + "loss": 4.1554, + "step": 14657 + }, + { + "epoch": 1.2492968550242904, + "grad_norm": 86.21912933209452, + "learning_rate": 7.248122391492144e-06, + "loss": 3.9259, + "step": 14658 + }, + { + "epoch": 1.249382084718316, + "grad_norm": 20.94895963181507, + "learning_rate": 7.2476794803924475e-06, + "loss": 1.718, + "step": 14659 + }, + { + "epoch": 1.2494673144123412, + "grad_norm": 47.02298050331311, + "learning_rate": 7.247236547187716e-06, + "loss": 2.9735, + "step": 14660 + }, + { + "epoch": 1.2495525441063666, + "grad_norm": 68.75869345133854, + "learning_rate": 7.246793591882306e-06, + "loss": 4.202, + "step": 14661 + }, + { + "epoch": 1.249637773800392, + "grad_norm": 319.0511213350244, + "learning_rate": 7.246350614480574e-06, + "loss": 2.8557, + "step": 14662 + }, + { + "epoch": 1.2497230034944176, + "grad_norm": 70.91331897648715, + "learning_rate": 7.245907614986876e-06, + "loss": 3.234, + "step": 14663 + }, + { + "epoch": 1.2498082331884428, + "grad_norm": 69.93764338728103, + "learning_rate": 7.245464593405569e-06, + "loss": 2.9445, + "step": 14664 + }, + { + "epoch": 1.2498934628824683, + "grad_norm": 56.09894967011785, + "learning_rate": 7.24502154974101e-06, + "loss": 2.5989, + "step": 14665 + }, + { + "epoch": 1.2499786925764937, + "grad_norm": 57.90983323618571, + "learning_rate": 7.244578483997557e-06, + "loss": 2.6847, + "step": 14666 + }, + { + "epoch": 1.250063922270519, + "grad_norm": 55.10764734262207, + "learning_rate": 7.244135396179565e-06, + "loss": 2.5617, + "step": 14667 + }, + { + "epoch": 1.2501491519645445, + "grad_norm": 49.63667138680251, + "learning_rate": 7.243692286291394e-06, + "loss": 3.5528, + "step": 14668 + }, + { + "epoch": 1.25023438165857, + "grad_norm": 137.60719134011697, + "learning_rate": 7.243249154337401e-06, + "loss": 2.5397, + "step": 14669 + }, + { + "epoch": 1.2503196113525952, + "grad_norm": 85.53821928922378, + "learning_rate": 7.242806000321942e-06, + "loss": 2.204, + "step": 14670 + }, + { + "epoch": 1.2504048410466206, + "grad_norm": 41.42506099578243, + "learning_rate": 7.242362824249379e-06, + "loss": 3.3661, + "step": 14671 + }, + { + "epoch": 1.250490070740646, + "grad_norm": 37.53185947013306, + "learning_rate": 7.24191962612407e-06, + "loss": 2.3486, + "step": 14672 + }, + { + "epoch": 1.2505753004346714, + "grad_norm": 48.67387078511842, + "learning_rate": 7.241476405950371e-06, + "loss": 2.6345, + "step": 14673 + }, + { + "epoch": 1.2506605301286968, + "grad_norm": 79.00580213530237, + "learning_rate": 7.241033163732641e-06, + "loss": 4.3181, + "step": 14674 + }, + { + "epoch": 1.2507457598227223, + "grad_norm": 42.98830902355157, + "learning_rate": 7.240589899475241e-06, + "loss": 4.0361, + "step": 14675 + }, + { + "epoch": 1.2508309895167478, + "grad_norm": 35.28160835745163, + "learning_rate": 7.240146613182531e-06, + "loss": 3.0306, + "step": 14676 + }, + { + "epoch": 1.250916219210773, + "grad_norm": 35.40760782834221, + "learning_rate": 7.239703304858868e-06, + "loss": 2.7458, + "step": 14677 + }, + { + "epoch": 1.2510014489047985, + "grad_norm": 66.1936840787416, + "learning_rate": 7.239259974508614e-06, + "loss": 2.8332, + "step": 14678 + }, + { + "epoch": 1.2510866785988237, + "grad_norm": 57.94108009956209, + "learning_rate": 7.238816622136127e-06, + "loss": 3.6063, + "step": 14679 + }, + { + "epoch": 1.2511719082928492, + "grad_norm": 28.634787649705697, + "learning_rate": 7.238373247745768e-06, + "loss": 2.7546, + "step": 14680 + }, + { + "epoch": 1.2512571379868747, + "grad_norm": 82.18794231187599, + "learning_rate": 7.237929851341899e-06, + "loss": 3.1706, + "step": 14681 + }, + { + "epoch": 1.2513423676809001, + "grad_norm": 47.912400417146074, + "learning_rate": 7.237486432928876e-06, + "loss": 2.8875, + "step": 14682 + }, + { + "epoch": 1.2514275973749254, + "grad_norm": 37.78509004967692, + "learning_rate": 7.237042992511066e-06, + "loss": 2.8594, + "step": 14683 + }, + { + "epoch": 1.2515128270689508, + "grad_norm": 72.83432291486372, + "learning_rate": 7.2365995300928264e-06, + "loss": 3.0416, + "step": 14684 + }, + { + "epoch": 1.251598056762976, + "grad_norm": 59.72039600549603, + "learning_rate": 7.236156045678518e-06, + "loss": 1.6981, + "step": 14685 + }, + { + "epoch": 1.2516832864570016, + "grad_norm": 58.47649405497391, + "learning_rate": 7.235712539272505e-06, + "loss": 3.041, + "step": 14686 + }, + { + "epoch": 1.251768516151027, + "grad_norm": 51.999887688608894, + "learning_rate": 7.2352690108791465e-06, + "loss": 2.7427, + "step": 14687 + }, + { + "epoch": 1.2518537458450525, + "grad_norm": 37.86003003276612, + "learning_rate": 7.234825460502807e-06, + "loss": 1.9226, + "step": 14688 + }, + { + "epoch": 1.2519389755390777, + "grad_norm": 31.20008876658638, + "learning_rate": 7.2343818881478455e-06, + "loss": 2.7098, + "step": 14689 + }, + { + "epoch": 1.2520242052331032, + "grad_norm": 51.167783874863154, + "learning_rate": 7.233938293818628e-06, + "loss": 2.6156, + "step": 14690 + }, + { + "epoch": 1.2521094349271287, + "grad_norm": 55.55474698781151, + "learning_rate": 7.233494677519513e-06, + "loss": 2.4864, + "step": 14691 + }, + { + "epoch": 1.252194664621154, + "grad_norm": 37.1848588448777, + "learning_rate": 7.233051039254865e-06, + "loss": 3.7237, + "step": 14692 + }, + { + "epoch": 1.2522798943151794, + "grad_norm": 57.05959334993988, + "learning_rate": 7.232607379029049e-06, + "loss": 3.1335, + "step": 14693 + }, + { + "epoch": 1.2523651240092049, + "grad_norm": 155.73958009252766, + "learning_rate": 7.232163696846426e-06, + "loss": 3.071, + "step": 14694 + }, + { + "epoch": 1.2524503537032303, + "grad_norm": 61.487915148711785, + "learning_rate": 7.231719992711359e-06, + "loss": 3.3896, + "step": 14695 + }, + { + "epoch": 1.2525355833972556, + "grad_norm": 61.979581283431735, + "learning_rate": 7.231276266628214e-06, + "loss": 3.7156, + "step": 14696 + }, + { + "epoch": 1.252620813091281, + "grad_norm": 50.973122483241696, + "learning_rate": 7.230832518601352e-06, + "loss": 3.6263, + "step": 14697 + }, + { + "epoch": 1.2527060427853063, + "grad_norm": 35.76945193188796, + "learning_rate": 7.2303887486351406e-06, + "loss": 2.9414, + "step": 14698 + }, + { + "epoch": 1.2527912724793318, + "grad_norm": 55.29094808657171, + "learning_rate": 7.2299449567339405e-06, + "loss": 3.2339, + "step": 14699 + }, + { + "epoch": 1.2528765021733572, + "grad_norm": 53.09107390251716, + "learning_rate": 7.229501142902119e-06, + "loss": 2.1262, + "step": 14700 + }, + { + "epoch": 1.2529617318673827, + "grad_norm": 44.75704190180558, + "learning_rate": 7.229057307144038e-06, + "loss": 3.1094, + "step": 14701 + }, + { + "epoch": 1.253046961561408, + "grad_norm": 47.11212596185279, + "learning_rate": 7.228613449464067e-06, + "loss": 2.883, + "step": 14702 + }, + { + "epoch": 1.2531321912554334, + "grad_norm": 61.944599806313455, + "learning_rate": 7.228169569866567e-06, + "loss": 2.8072, + "step": 14703 + }, + { + "epoch": 1.2532174209494589, + "grad_norm": 32.78348499729414, + "learning_rate": 7.2277256683559035e-06, + "loss": 2.2125, + "step": 14704 + }, + { + "epoch": 1.2533026506434841, + "grad_norm": 39.260673555762416, + "learning_rate": 7.227281744936445e-06, + "loss": 2.4208, + "step": 14705 + }, + { + "epoch": 1.2533878803375096, + "grad_norm": 46.23186798245719, + "learning_rate": 7.2268377996125535e-06, + "loss": 3.0073, + "step": 14706 + }, + { + "epoch": 1.253473110031535, + "grad_norm": 33.20961210753672, + "learning_rate": 7.2263938323886e-06, + "loss": 2.8208, + "step": 14707 + }, + { + "epoch": 1.2535583397255605, + "grad_norm": 54.30206401597151, + "learning_rate": 7.225949843268946e-06, + "loss": 3.0702, + "step": 14708 + }, + { + "epoch": 1.2536435694195858, + "grad_norm": 62.92991428862595, + "learning_rate": 7.225505832257959e-06, + "loss": 3.4618, + "step": 14709 + }, + { + "epoch": 1.2537287991136112, + "grad_norm": 40.41047035213208, + "learning_rate": 7.225061799360007e-06, + "loss": 2.9159, + "step": 14710 + }, + { + "epoch": 1.2538140288076365, + "grad_norm": 57.13233467381072, + "learning_rate": 7.224617744579459e-06, + "loss": 3.0416, + "step": 14711 + }, + { + "epoch": 1.253899258501662, + "grad_norm": 46.530083993710626, + "learning_rate": 7.224173667920676e-06, + "loss": 2.8558, + "step": 14712 + }, + { + "epoch": 1.2539844881956874, + "grad_norm": 34.23234577708345, + "learning_rate": 7.223729569388029e-06, + "loss": 2.5262, + "step": 14713 + }, + { + "epoch": 1.2540697178897129, + "grad_norm": 46.05216388118393, + "learning_rate": 7.223285448985886e-06, + "loss": 2.4461, + "step": 14714 + }, + { + "epoch": 1.2541549475837381, + "grad_norm": 44.53524889470251, + "learning_rate": 7.222841306718615e-06, + "loss": 3.1342, + "step": 14715 + }, + { + "epoch": 1.2542401772777636, + "grad_norm": 59.542101022897405, + "learning_rate": 7.222397142590582e-06, + "loss": 3.0114, + "step": 14716 + }, + { + "epoch": 1.2543254069717888, + "grad_norm": 76.9864438674983, + "learning_rate": 7.221952956606158e-06, + "loss": 4.4673, + "step": 14717 + }, + { + "epoch": 1.2544106366658143, + "grad_norm": 37.18440276578983, + "learning_rate": 7.2215087487697075e-06, + "loss": 3.3086, + "step": 14718 + }, + { + "epoch": 1.2544958663598398, + "grad_norm": 63.87137698080292, + "learning_rate": 7.221064519085602e-06, + "loss": 2.5262, + "step": 14719 + }, + { + "epoch": 1.2545810960538653, + "grad_norm": 46.414622311590456, + "learning_rate": 7.220620267558209e-06, + "loss": 3.2401, + "step": 14720 + }, + { + "epoch": 1.2546663257478905, + "grad_norm": 32.81371415439944, + "learning_rate": 7.220175994191898e-06, + "loss": 2.5596, + "step": 14721 + }, + { + "epoch": 1.254751555441916, + "grad_norm": 114.29550230296603, + "learning_rate": 7.2197316989910405e-06, + "loss": 3.0245, + "step": 14722 + }, + { + "epoch": 1.2548367851359414, + "grad_norm": 49.225342812732215, + "learning_rate": 7.21928738196e-06, + "loss": 2.307, + "step": 14723 + }, + { + "epoch": 1.2549220148299667, + "grad_norm": 59.26318273907295, + "learning_rate": 7.218843043103153e-06, + "loss": 2.7699, + "step": 14724 + }, + { + "epoch": 1.2550072445239921, + "grad_norm": 34.69292154557417, + "learning_rate": 7.218398682424865e-06, + "loss": 2.1451, + "step": 14725 + }, + { + "epoch": 1.2550924742180176, + "grad_norm": 95.91973581065409, + "learning_rate": 7.217954299929508e-06, + "loss": 4.4989, + "step": 14726 + }, + { + "epoch": 1.255177703912043, + "grad_norm": 51.311763899804596, + "learning_rate": 7.217509895621452e-06, + "loss": 3.2611, + "step": 14727 + }, + { + "epoch": 1.2552629336060683, + "grad_norm": 68.68163275611306, + "learning_rate": 7.217065469505067e-06, + "loss": 2.536, + "step": 14728 + }, + { + "epoch": 1.2553481633000938, + "grad_norm": 25.713556091073137, + "learning_rate": 7.216621021584723e-06, + "loss": 1.7836, + "step": 14729 + }, + { + "epoch": 1.255433392994119, + "grad_norm": 139.56940979400846, + "learning_rate": 7.216176551864794e-06, + "loss": 2.4646, + "step": 14730 + }, + { + "epoch": 1.2555186226881445, + "grad_norm": 52.109311377812325, + "learning_rate": 7.2157320603496485e-06, + "loss": 2.5251, + "step": 14731 + }, + { + "epoch": 1.25560385238217, + "grad_norm": 53.114374114643965, + "learning_rate": 7.215287547043658e-06, + "loss": 3.0326, + "step": 14732 + }, + { + "epoch": 1.2556890820761955, + "grad_norm": 77.31064119824748, + "learning_rate": 7.2148430119511945e-06, + "loss": 3.6715, + "step": 14733 + }, + { + "epoch": 1.2557743117702207, + "grad_norm": 48.114244706466806, + "learning_rate": 7.214398455076632e-06, + "loss": 3.3892, + "step": 14734 + }, + { + "epoch": 1.2558595414642462, + "grad_norm": 41.81018828400827, + "learning_rate": 7.213953876424339e-06, + "loss": 2.8276, + "step": 14735 + }, + { + "epoch": 1.2559447711582714, + "grad_norm": 45.679408216046504, + "learning_rate": 7.2135092759986895e-06, + "loss": 3.7727, + "step": 14736 + }, + { + "epoch": 1.2560300008522969, + "grad_norm": 67.46389026756137, + "learning_rate": 7.213064653804056e-06, + "loss": 2.5909, + "step": 14737 + }, + { + "epoch": 1.2561152305463223, + "grad_norm": 69.18590585204025, + "learning_rate": 7.212620009844811e-06, + "loss": 3.1865, + "step": 14738 + }, + { + "epoch": 1.2562004602403478, + "grad_norm": 70.58590768586821, + "learning_rate": 7.2121753441253275e-06, + "loss": 2.9601, + "step": 14739 + }, + { + "epoch": 1.256285689934373, + "grad_norm": 108.77536683641738, + "learning_rate": 7.211730656649978e-06, + "loss": 3.7557, + "step": 14740 + }, + { + "epoch": 1.2563709196283985, + "grad_norm": 33.074045030086765, + "learning_rate": 7.211285947423138e-06, + "loss": 2.6567, + "step": 14741 + }, + { + "epoch": 1.256456149322424, + "grad_norm": 70.48951714398524, + "learning_rate": 7.210841216449177e-06, + "loss": 3.4785, + "step": 14742 + }, + { + "epoch": 1.2565413790164492, + "grad_norm": 51.711811999716005, + "learning_rate": 7.210396463732472e-06, + "loss": 3.8267, + "step": 14743 + }, + { + "epoch": 1.2566266087104747, + "grad_norm": 32.94254260203795, + "learning_rate": 7.2099516892773965e-06, + "loss": 2.3133, + "step": 14744 + }, + { + "epoch": 1.2567118384045002, + "grad_norm": 56.12092246012496, + "learning_rate": 7.209506893088325e-06, + "loss": 2.9211, + "step": 14745 + }, + { + "epoch": 1.2567970680985256, + "grad_norm": 41.026350847657255, + "learning_rate": 7.20906207516963e-06, + "loss": 3.2887, + "step": 14746 + }, + { + "epoch": 1.256882297792551, + "grad_norm": 149.79794407970832, + "learning_rate": 7.208617235525688e-06, + "loss": 3.7479, + "step": 14747 + }, + { + "epoch": 1.2569675274865764, + "grad_norm": 23.883509690803834, + "learning_rate": 7.2081723741608725e-06, + "loss": 2.0777, + "step": 14748 + }, + { + "epoch": 1.2570527571806016, + "grad_norm": 59.22515298046836, + "learning_rate": 7.2077274910795605e-06, + "loss": 2.0746, + "step": 14749 + }, + { + "epoch": 1.257137986874627, + "grad_norm": 87.16972004627773, + "learning_rate": 7.207282586286125e-06, + "loss": 4.0366, + "step": 14750 + }, + { + "epoch": 1.2572232165686525, + "grad_norm": 33.36710371109059, + "learning_rate": 7.206837659784941e-06, + "loss": 2.3682, + "step": 14751 + }, + { + "epoch": 1.257308446262678, + "grad_norm": 52.5125876982376, + "learning_rate": 7.2063927115803875e-06, + "loss": 3.462, + "step": 14752 + }, + { + "epoch": 1.2573936759567033, + "grad_norm": 35.89190433320204, + "learning_rate": 7.205947741676837e-06, + "loss": 2.6748, + "step": 14753 + }, + { + "epoch": 1.2574789056507287, + "grad_norm": 36.32663029507587, + "learning_rate": 7.205502750078669e-06, + "loss": 3.1078, + "step": 14754 + }, + { + "epoch": 1.257564135344754, + "grad_norm": 34.078285621098665, + "learning_rate": 7.205057736790255e-06, + "loss": 2.88, + "step": 14755 + }, + { + "epoch": 1.2576493650387794, + "grad_norm": 26.67431367562847, + "learning_rate": 7.204612701815976e-06, + "loss": 1.7881, + "step": 14756 + }, + { + "epoch": 1.257734594732805, + "grad_norm": 31.251276146546598, + "learning_rate": 7.204167645160207e-06, + "loss": 3.102, + "step": 14757 + }, + { + "epoch": 1.2578198244268304, + "grad_norm": 39.40478501279233, + "learning_rate": 7.203722566827326e-06, + "loss": 3.1222, + "step": 14758 + }, + { + "epoch": 1.2579050541208556, + "grad_norm": 59.83878149678849, + "learning_rate": 7.203277466821708e-06, + "loss": 3.7428, + "step": 14759 + }, + { + "epoch": 1.257990283814881, + "grad_norm": 20.96360658997928, + "learning_rate": 7.202832345147731e-06, + "loss": 1.3451, + "step": 14760 + }, + { + "epoch": 1.2580755135089066, + "grad_norm": 83.50295808784543, + "learning_rate": 7.202387201809774e-06, + "loss": 3.7954, + "step": 14761 + }, + { + "epoch": 1.2581607432029318, + "grad_norm": 41.54455771841385, + "learning_rate": 7.201942036812215e-06, + "loss": 3.0831, + "step": 14762 + }, + { + "epoch": 1.2582459728969573, + "grad_norm": 36.2438611948133, + "learning_rate": 7.201496850159429e-06, + "loss": 2.8454, + "step": 14763 + }, + { + "epoch": 1.2583312025909827, + "grad_norm": 46.6046086541965, + "learning_rate": 7.201051641855799e-06, + "loss": 3.7505, + "step": 14764 + }, + { + "epoch": 1.2584164322850082, + "grad_norm": 35.365475321916506, + "learning_rate": 7.200606411905698e-06, + "loss": 2.5963, + "step": 14765 + }, + { + "epoch": 1.2585016619790335, + "grad_norm": 80.86677890664839, + "learning_rate": 7.200161160313509e-06, + "loss": 2.3091, + "step": 14766 + }, + { + "epoch": 1.258586891673059, + "grad_norm": 89.7853743217754, + "learning_rate": 7.1997158870836085e-06, + "loss": 2.7924, + "step": 14767 + }, + { + "epoch": 1.2586721213670842, + "grad_norm": 38.32084535132285, + "learning_rate": 7.199270592220377e-06, + "loss": 2.3358, + "step": 14768 + }, + { + "epoch": 1.2587573510611096, + "grad_norm": 65.5865180403742, + "learning_rate": 7.198825275728193e-06, + "loss": 3.0827, + "step": 14769 + }, + { + "epoch": 1.258842580755135, + "grad_norm": 39.330975440410036, + "learning_rate": 7.198379937611435e-06, + "loss": 3.0712, + "step": 14770 + }, + { + "epoch": 1.2589278104491606, + "grad_norm": 57.30518554031816, + "learning_rate": 7.197934577874485e-06, + "loss": 1.8396, + "step": 14771 + }, + { + "epoch": 1.2590130401431858, + "grad_norm": 35.606802497406846, + "learning_rate": 7.197489196521721e-06, + "loss": 2.5391, + "step": 14772 + }, + { + "epoch": 1.2590982698372113, + "grad_norm": 40.26633515762842, + "learning_rate": 7.197043793557525e-06, + "loss": 3.1003, + "step": 14773 + }, + { + "epoch": 1.2591834995312368, + "grad_norm": 69.45200452988573, + "learning_rate": 7.196598368986275e-06, + "loss": 3.3413, + "step": 14774 + }, + { + "epoch": 1.259268729225262, + "grad_norm": 39.60809873632846, + "learning_rate": 7.196152922812354e-06, + "loss": 2.6632, + "step": 14775 + }, + { + "epoch": 1.2593539589192875, + "grad_norm": 36.04685934254049, + "learning_rate": 7.195707455040141e-06, + "loss": 2.464, + "step": 14776 + }, + { + "epoch": 1.259439188613313, + "grad_norm": 94.16063900151566, + "learning_rate": 7.195261965674018e-06, + "loss": 2.6262, + "step": 14777 + }, + { + "epoch": 1.2595244183073384, + "grad_norm": 25.14536517952758, + "learning_rate": 7.194816454718366e-06, + "loss": 2.3419, + "step": 14778 + }, + { + "epoch": 1.2596096480013637, + "grad_norm": 43.65602747282345, + "learning_rate": 7.194370922177566e-06, + "loss": 2.4636, + "step": 14779 + }, + { + "epoch": 1.2596948776953891, + "grad_norm": 59.67102219097026, + "learning_rate": 7.193925368056e-06, + "loss": 2.72, + "step": 14780 + }, + { + "epoch": 1.2597801073894144, + "grad_norm": 14.27809019302869, + "learning_rate": 7.19347979235805e-06, + "loss": 1.0049, + "step": 14781 + }, + { + "epoch": 1.2598653370834398, + "grad_norm": 37.85029730475589, + "learning_rate": 7.193034195088097e-06, + "loss": 2.6608, + "step": 14782 + }, + { + "epoch": 1.2599505667774653, + "grad_norm": 49.23853512771979, + "learning_rate": 7.192588576250526e-06, + "loss": 1.928, + "step": 14783 + }, + { + "epoch": 1.2600357964714908, + "grad_norm": 69.53903409549898, + "learning_rate": 7.192142935849716e-06, + "loss": 3.2263, + "step": 14784 + }, + { + "epoch": 1.260121026165516, + "grad_norm": 34.64356471651151, + "learning_rate": 7.191697273890053e-06, + "loss": 2.9523, + "step": 14785 + }, + { + "epoch": 1.2602062558595415, + "grad_norm": 38.27693563983864, + "learning_rate": 7.191251590375916e-06, + "loss": 2.1722, + "step": 14786 + }, + { + "epoch": 1.2602914855535667, + "grad_norm": 47.00960992925715, + "learning_rate": 7.19080588531169e-06, + "loss": 2.2041, + "step": 14787 + }, + { + "epoch": 1.2603767152475922, + "grad_norm": 73.39540516093489, + "learning_rate": 7.190360158701761e-06, + "loss": 3.5714, + "step": 14788 + }, + { + "epoch": 1.2604619449416177, + "grad_norm": 55.53071070188737, + "learning_rate": 7.1899144105505095e-06, + "loss": 2.6783, + "step": 14789 + }, + { + "epoch": 1.2605471746356431, + "grad_norm": 58.45782760614429, + "learning_rate": 7.189468640862319e-06, + "loss": 3.2222, + "step": 14790 + }, + { + "epoch": 1.2606324043296684, + "grad_norm": 66.15977831694157, + "learning_rate": 7.189022849641574e-06, + "loss": 4.2978, + "step": 14791 + }, + { + "epoch": 1.2607176340236939, + "grad_norm": 124.93621469441365, + "learning_rate": 7.188577036892661e-06, + "loss": 3.599, + "step": 14792 + }, + { + "epoch": 1.2608028637177193, + "grad_norm": 43.64661651793038, + "learning_rate": 7.188131202619963e-06, + "loss": 2.913, + "step": 14793 + }, + { + "epoch": 1.2608880934117446, + "grad_norm": 28.675765864152062, + "learning_rate": 7.187685346827862e-06, + "loss": 1.9953, + "step": 14794 + }, + { + "epoch": 1.26097332310577, + "grad_norm": 49.92541922407901, + "learning_rate": 7.187239469520746e-06, + "loss": 3.2506, + "step": 14795 + }, + { + "epoch": 1.2610585527997955, + "grad_norm": 46.39963266100468, + "learning_rate": 7.186793570702998e-06, + "loss": 3.4757, + "step": 14796 + }, + { + "epoch": 1.261143782493821, + "grad_norm": 56.339010709727006, + "learning_rate": 7.186347650379005e-06, + "loss": 2.99, + "step": 14797 + }, + { + "epoch": 1.2612290121878462, + "grad_norm": 40.48901417932874, + "learning_rate": 7.1859017085531506e-06, + "loss": 3.0308, + "step": 14798 + }, + { + "epoch": 1.2613142418818717, + "grad_norm": 182.35135000485994, + "learning_rate": 7.185455745229824e-06, + "loss": 4.2758, + "step": 14799 + }, + { + "epoch": 1.261399471575897, + "grad_norm": 42.05885339022244, + "learning_rate": 7.185009760413405e-06, + "loss": 1.597, + "step": 14800 + }, + { + "epoch": 1.2614847012699224, + "grad_norm": 48.285292619663146, + "learning_rate": 7.184563754108287e-06, + "loss": 2.7058, + "step": 14801 + }, + { + "epoch": 1.2615699309639479, + "grad_norm": 37.244539507840436, + "learning_rate": 7.184117726318849e-06, + "loss": 2.2927, + "step": 14802 + }, + { + "epoch": 1.2616551606579733, + "grad_norm": 49.55151916990236, + "learning_rate": 7.183671677049483e-06, + "loss": 2.582, + "step": 14803 + }, + { + "epoch": 1.2617403903519986, + "grad_norm": 78.09108978267835, + "learning_rate": 7.183225606304574e-06, + "loss": 3.7568, + "step": 14804 + }, + { + "epoch": 1.261825620046024, + "grad_norm": 80.08717289530287, + "learning_rate": 7.182779514088508e-06, + "loss": 2.608, + "step": 14805 + }, + { + "epoch": 1.2619108497400493, + "grad_norm": 43.86040020153073, + "learning_rate": 7.182333400405672e-06, + "loss": 3.0304, + "step": 14806 + }, + { + "epoch": 1.2619960794340748, + "grad_norm": 32.53860412571873, + "learning_rate": 7.1818872652604555e-06, + "loss": 2.643, + "step": 14807 + }, + { + "epoch": 1.2620813091281002, + "grad_norm": 40.59427017521004, + "learning_rate": 7.181441108657246e-06, + "loss": 1.9865, + "step": 14808 + }, + { + "epoch": 1.2621665388221257, + "grad_norm": 124.51992708893819, + "learning_rate": 7.1809949306004265e-06, + "loss": 2.8051, + "step": 14809 + }, + { + "epoch": 1.262251768516151, + "grad_norm": 38.823029731623315, + "learning_rate": 7.180548731094391e-06, + "loss": 2.3084, + "step": 14810 + }, + { + "epoch": 1.2623369982101764, + "grad_norm": 40.66551500018575, + "learning_rate": 7.180102510143524e-06, + "loss": 2.8377, + "step": 14811 + }, + { + "epoch": 1.2624222279042019, + "grad_norm": 64.41718093249325, + "learning_rate": 7.179656267752216e-06, + "loss": 3.2254, + "step": 14812 + }, + { + "epoch": 1.2625074575982271, + "grad_norm": 48.12191375624484, + "learning_rate": 7.179210003924855e-06, + "loss": 3.1476, + "step": 14813 + }, + { + "epoch": 1.2625926872922526, + "grad_norm": 54.19562444186075, + "learning_rate": 7.1787637186658285e-06, + "loss": 2.5301, + "step": 14814 + }, + { + "epoch": 1.262677916986278, + "grad_norm": 30.008492137176756, + "learning_rate": 7.178317411979526e-06, + "loss": 1.914, + "step": 14815 + }, + { + "epoch": 1.2627631466803035, + "grad_norm": 73.73108735098516, + "learning_rate": 7.177871083870339e-06, + "loss": 2.6776, + "step": 14816 + }, + { + "epoch": 1.2628483763743288, + "grad_norm": 41.90022618755066, + "learning_rate": 7.177424734342655e-06, + "loss": 2.7552, + "step": 14817 + }, + { + "epoch": 1.2629336060683543, + "grad_norm": 29.523764822362875, + "learning_rate": 7.176978363400864e-06, + "loss": 3.0874, + "step": 14818 + }, + { + "epoch": 1.2630188357623795, + "grad_norm": 34.675909215395, + "learning_rate": 7.176531971049354e-06, + "loss": 2.886, + "step": 14819 + }, + { + "epoch": 1.263104065456405, + "grad_norm": 60.86443608570843, + "learning_rate": 7.176085557292519e-06, + "loss": 3.5862, + "step": 14820 + }, + { + "epoch": 1.2631892951504304, + "grad_norm": 57.47608955825896, + "learning_rate": 7.175639122134747e-06, + "loss": 1.9222, + "step": 14821 + }, + { + "epoch": 1.263274524844456, + "grad_norm": 64.01934158732709, + "learning_rate": 7.175192665580428e-06, + "loss": 3.4343, + "step": 14822 + }, + { + "epoch": 1.2633597545384812, + "grad_norm": 86.57757373040886, + "learning_rate": 7.174746187633954e-06, + "loss": 5.079, + "step": 14823 + }, + { + "epoch": 1.2634449842325066, + "grad_norm": 125.6534027528765, + "learning_rate": 7.174299688299715e-06, + "loss": 3.4339, + "step": 14824 + }, + { + "epoch": 1.263530213926532, + "grad_norm": 30.99285122281182, + "learning_rate": 7.173853167582103e-06, + "loss": 2.3604, + "step": 14825 + }, + { + "epoch": 1.2636154436205573, + "grad_norm": 72.44282635238335, + "learning_rate": 7.173406625485509e-06, + "loss": 2.3247, + "step": 14826 + }, + { + "epoch": 1.2637006733145828, + "grad_norm": 33.97405539300783, + "learning_rate": 7.172960062014325e-06, + "loss": 3.4153, + "step": 14827 + }, + { + "epoch": 1.2637859030086083, + "grad_norm": 45.37621718777794, + "learning_rate": 7.17251347717294e-06, + "loss": 3.1186, + "step": 14828 + }, + { + "epoch": 1.2638711327026335, + "grad_norm": 63.32570435537754, + "learning_rate": 7.1720668709657495e-06, + "loss": 2.7281, + "step": 14829 + }, + { + "epoch": 1.263956362396659, + "grad_norm": 40.74645927220104, + "learning_rate": 7.171620243397144e-06, + "loss": 2.9871, + "step": 14830 + }, + { + "epoch": 1.2640415920906845, + "grad_norm": 38.72576604351239, + "learning_rate": 7.171173594471518e-06, + "loss": 3.3118, + "step": 14831 + }, + { + "epoch": 1.2641268217847097, + "grad_norm": 32.637902362967054, + "learning_rate": 7.170726924193261e-06, + "loss": 2.2266, + "step": 14832 + }, + { + "epoch": 1.2642120514787352, + "grad_norm": 66.480970067566, + "learning_rate": 7.170280232566768e-06, + "loss": 4.5933, + "step": 14833 + }, + { + "epoch": 1.2642972811727606, + "grad_norm": 57.53111527254845, + "learning_rate": 7.1698335195964294e-06, + "loss": 3.3377, + "step": 14834 + }, + { + "epoch": 1.264382510866786, + "grad_norm": 78.39141882336455, + "learning_rate": 7.169386785286643e-06, + "loss": 2.8672, + "step": 14835 + }, + { + "epoch": 1.2644677405608113, + "grad_norm": 37.681503138863356, + "learning_rate": 7.168940029641799e-06, + "loss": 2.5151, + "step": 14836 + }, + { + "epoch": 1.2645529702548368, + "grad_norm": 55.194885584639, + "learning_rate": 7.168493252666289e-06, + "loss": 2.4609, + "step": 14837 + }, + { + "epoch": 1.264638199948862, + "grad_norm": 39.11372513222958, + "learning_rate": 7.168046454364514e-06, + "loss": 2.936, + "step": 14838 + }, + { + "epoch": 1.2647234296428875, + "grad_norm": 90.4922399516382, + "learning_rate": 7.167599634740861e-06, + "loss": 3.3609, + "step": 14839 + }, + { + "epoch": 1.264808659336913, + "grad_norm": 82.96556316910848, + "learning_rate": 7.167152793799728e-06, + "loss": 4.8596, + "step": 14840 + }, + { + "epoch": 1.2648938890309385, + "grad_norm": 32.5365761236592, + "learning_rate": 7.166705931545505e-06, + "loss": 3.7758, + "step": 14841 + }, + { + "epoch": 1.2649791187249637, + "grad_norm": 57.151686576776676, + "learning_rate": 7.1662590479825935e-06, + "loss": 2.9566, + "step": 14842 + }, + { + "epoch": 1.2650643484189892, + "grad_norm": 61.39089782810671, + "learning_rate": 7.1658121431153834e-06, + "loss": 2.4169, + "step": 14843 + }, + { + "epoch": 1.2651495781130147, + "grad_norm": 53.9843024521975, + "learning_rate": 7.165365216948274e-06, + "loss": 2.4527, + "step": 14844 + }, + { + "epoch": 1.26523480780704, + "grad_norm": 60.342085517533725, + "learning_rate": 7.164918269485655e-06, + "loss": 2.7184, + "step": 14845 + }, + { + "epoch": 1.2653200375010654, + "grad_norm": 141.0743234837198, + "learning_rate": 7.1644713007319265e-06, + "loss": 3.259, + "step": 14846 + }, + { + "epoch": 1.2654052671950908, + "grad_norm": 38.83612111372603, + "learning_rate": 7.164024310691483e-06, + "loss": 2.9577, + "step": 14847 + }, + { + "epoch": 1.2654904968891163, + "grad_norm": 38.31470164608091, + "learning_rate": 7.1635772993687205e-06, + "loss": 3.2177, + "step": 14848 + }, + { + "epoch": 1.2655757265831415, + "grad_norm": 51.29498679526115, + "learning_rate": 7.163130266768034e-06, + "loss": 1.812, + "step": 14849 + }, + { + "epoch": 1.265660956277167, + "grad_norm": 57.07429267033617, + "learning_rate": 7.162683212893822e-06, + "loss": 2.8914, + "step": 14850 + }, + { + "epoch": 1.2657461859711923, + "grad_norm": 53.928456398279, + "learning_rate": 7.16223613775048e-06, + "loss": 3.0497, + "step": 14851 + }, + { + "epoch": 1.2658314156652177, + "grad_norm": 29.719161700021743, + "learning_rate": 7.161789041342404e-06, + "loss": 2.8803, + "step": 14852 + }, + { + "epoch": 1.2659166453592432, + "grad_norm": 49.651262224541824, + "learning_rate": 7.161341923673993e-06, + "loss": 2.9815, + "step": 14853 + }, + { + "epoch": 1.2660018750532687, + "grad_norm": 84.51064912297151, + "learning_rate": 7.16089478474964e-06, + "loss": 3.812, + "step": 14854 + }, + { + "epoch": 1.266087104747294, + "grad_norm": 25.777706898246354, + "learning_rate": 7.160447624573749e-06, + "loss": 2.8119, + "step": 14855 + }, + { + "epoch": 1.2661723344413194, + "grad_norm": 63.91784440796476, + "learning_rate": 7.160000443150712e-06, + "loss": 2.775, + "step": 14856 + }, + { + "epoch": 1.2662575641353446, + "grad_norm": 55.856325934940955, + "learning_rate": 7.159553240484929e-06, + "loss": 2.951, + "step": 14857 + }, + { + "epoch": 1.26634279382937, + "grad_norm": 33.23919636460797, + "learning_rate": 7.159106016580799e-06, + "loss": 2.762, + "step": 14858 + }, + { + "epoch": 1.2664280235233956, + "grad_norm": 57.64159572117776, + "learning_rate": 7.158658771442719e-06, + "loss": 3.1321, + "step": 14859 + }, + { + "epoch": 1.266513253217421, + "grad_norm": 119.09444387944978, + "learning_rate": 7.158211505075088e-06, + "loss": 3.4759, + "step": 14860 + }, + { + "epoch": 1.2665984829114463, + "grad_norm": 64.07641337669122, + "learning_rate": 7.157764217482303e-06, + "loss": 3.4165, + "step": 14861 + }, + { + "epoch": 1.2666837126054717, + "grad_norm": 54.702807957793425, + "learning_rate": 7.1573169086687655e-06, + "loss": 2.6677, + "step": 14862 + }, + { + "epoch": 1.2667689422994972, + "grad_norm": 28.67014647506637, + "learning_rate": 7.156869578638873e-06, + "loss": 3.2053, + "step": 14863 + }, + { + "epoch": 1.2668541719935225, + "grad_norm": 77.43644449320313, + "learning_rate": 7.1564222273970256e-06, + "loss": 3.3619, + "step": 14864 + }, + { + "epoch": 1.266939401687548, + "grad_norm": 72.69882883333445, + "learning_rate": 7.1559748549476226e-06, + "loss": 3.0546, + "step": 14865 + }, + { + "epoch": 1.2670246313815734, + "grad_norm": 80.64116404351489, + "learning_rate": 7.155527461295063e-06, + "loss": 3.2351, + "step": 14866 + }, + { + "epoch": 1.2671098610755989, + "grad_norm": 67.51055162819901, + "learning_rate": 7.155080046443747e-06, + "loss": 3.416, + "step": 14867 + }, + { + "epoch": 1.267195090769624, + "grad_norm": 43.721233003702864, + "learning_rate": 7.154632610398077e-06, + "loss": 3.0975, + "step": 14868 + }, + { + "epoch": 1.2672803204636496, + "grad_norm": 29.819807254153183, + "learning_rate": 7.15418515316245e-06, + "loss": 2.6337, + "step": 14869 + }, + { + "epoch": 1.2673655501576748, + "grad_norm": 70.72101148529269, + "learning_rate": 7.153737674741268e-06, + "loss": 1.9408, + "step": 14870 + }, + { + "epoch": 1.2674507798517003, + "grad_norm": 44.56034159334054, + "learning_rate": 7.153290175138931e-06, + "loss": 3.5372, + "step": 14871 + }, + { + "epoch": 1.2675360095457258, + "grad_norm": 40.07347163207925, + "learning_rate": 7.152842654359843e-06, + "loss": 2.9594, + "step": 14872 + }, + { + "epoch": 1.2676212392397512, + "grad_norm": 35.617191450740904, + "learning_rate": 7.1523951124084016e-06, + "loss": 3.381, + "step": 14873 + }, + { + "epoch": 1.2677064689337765, + "grad_norm": 52.19032942209842, + "learning_rate": 7.15194754928901e-06, + "loss": 3.0348, + "step": 14874 + }, + { + "epoch": 1.267791698627802, + "grad_norm": 84.82564524010358, + "learning_rate": 7.151499965006069e-06, + "loss": 3.7637, + "step": 14875 + }, + { + "epoch": 1.2678769283218272, + "grad_norm": 39.21761899871249, + "learning_rate": 7.1510523595639805e-06, + "loss": 2.3654, + "step": 14876 + }, + { + "epoch": 1.2679621580158527, + "grad_norm": 45.27221906986721, + "learning_rate": 7.150604732967147e-06, + "loss": 3.2812, + "step": 14877 + }, + { + "epoch": 1.2680473877098781, + "grad_norm": 46.6827716914301, + "learning_rate": 7.150157085219971e-06, + "loss": 2.4493, + "step": 14878 + }, + { + "epoch": 1.2681326174039036, + "grad_norm": 66.44447066688393, + "learning_rate": 7.149709416326853e-06, + "loss": 2.9973, + "step": 14879 + }, + { + "epoch": 1.2682178470979288, + "grad_norm": 46.22217229226991, + "learning_rate": 7.149261726292198e-06, + "loss": 3.2788, + "step": 14880 + }, + { + "epoch": 1.2683030767919543, + "grad_norm": 48.706862822479934, + "learning_rate": 7.148814015120409e-06, + "loss": 3.6519, + "step": 14881 + }, + { + "epoch": 1.2683883064859798, + "grad_norm": 148.51782675064513, + "learning_rate": 7.148366282815887e-06, + "loss": 4.4476, + "step": 14882 + }, + { + "epoch": 1.268473536180005, + "grad_norm": 116.83885933470766, + "learning_rate": 7.147918529383038e-06, + "loss": 4.3452, + "step": 14883 + }, + { + "epoch": 1.2685587658740305, + "grad_norm": 26.369127107744198, + "learning_rate": 7.1474707548262605e-06, + "loss": 2.0933, + "step": 14884 + }, + { + "epoch": 1.268643995568056, + "grad_norm": 104.07091559770036, + "learning_rate": 7.147022959149963e-06, + "loss": 4.5871, + "step": 14885 + }, + { + "epoch": 1.2687292252620814, + "grad_norm": 46.40448058022427, + "learning_rate": 7.146575142358548e-06, + "loss": 3.1692, + "step": 14886 + }, + { + "epoch": 1.2688144549561067, + "grad_norm": 184.57036779214783, + "learning_rate": 7.14612730445642e-06, + "loss": 5.3833, + "step": 14887 + }, + { + "epoch": 1.2688996846501321, + "grad_norm": 32.88191799632823, + "learning_rate": 7.145679445447981e-06, + "loss": 1.7743, + "step": 14888 + }, + { + "epoch": 1.2689849143441574, + "grad_norm": 63.91665738912427, + "learning_rate": 7.145231565337639e-06, + "loss": 2.3614, + "step": 14889 + }, + { + "epoch": 1.2690701440381829, + "grad_norm": 37.41906481918639, + "learning_rate": 7.144783664129797e-06, + "loss": 2.5691, + "step": 14890 + }, + { + "epoch": 1.2691553737322083, + "grad_norm": 53.9391768987335, + "learning_rate": 7.144335741828859e-06, + "loss": 2.6689, + "step": 14891 + }, + { + "epoch": 1.2692406034262338, + "grad_norm": 52.95149814059042, + "learning_rate": 7.143887798439231e-06, + "loss": 2.7639, + "step": 14892 + }, + { + "epoch": 1.269325833120259, + "grad_norm": 29.92217665947573, + "learning_rate": 7.143439833965319e-06, + "loss": 2.7267, + "step": 14893 + }, + { + "epoch": 1.2694110628142845, + "grad_norm": 70.24372961366699, + "learning_rate": 7.142991848411528e-06, + "loss": 4.259, + "step": 14894 + }, + { + "epoch": 1.26949629250831, + "grad_norm": 48.653183795129664, + "learning_rate": 7.142543841782264e-06, + "loss": 2.7465, + "step": 14895 + }, + { + "epoch": 1.2695815222023352, + "grad_norm": 61.59742986867392, + "learning_rate": 7.142095814081932e-06, + "loss": 2.4003, + "step": 14896 + }, + { + "epoch": 1.2696667518963607, + "grad_norm": 50.207953344944585, + "learning_rate": 7.141647765314939e-06, + "loss": 2.0223, + "step": 14897 + }, + { + "epoch": 1.2697519815903862, + "grad_norm": 76.12047153756123, + "learning_rate": 7.141199695485691e-06, + "loss": 4.6873, + "step": 14898 + }, + { + "epoch": 1.2698372112844116, + "grad_norm": 52.70171044432532, + "learning_rate": 7.1407516045985945e-06, + "loss": 2.307, + "step": 14899 + }, + { + "epoch": 1.2699224409784369, + "grad_norm": 44.65526242426908, + "learning_rate": 7.140303492658057e-06, + "loss": 2.958, + "step": 14900 + }, + { + "epoch": 1.2700076706724623, + "grad_norm": 76.16240703856015, + "learning_rate": 7.139855359668485e-06, + "loss": 2.7878, + "step": 14901 + }, + { + "epoch": 1.2700929003664876, + "grad_norm": 80.52397233977071, + "learning_rate": 7.139407205634287e-06, + "loss": 3.1455, + "step": 14902 + }, + { + "epoch": 1.270178130060513, + "grad_norm": 52.383557001151786, + "learning_rate": 7.138959030559869e-06, + "loss": 2.8315, + "step": 14903 + }, + { + "epoch": 1.2702633597545385, + "grad_norm": 60.67488574366727, + "learning_rate": 7.138510834449637e-06, + "loss": 3.7996, + "step": 14904 + }, + { + "epoch": 1.270348589448564, + "grad_norm": 42.45495126623033, + "learning_rate": 7.138062617308001e-06, + "loss": 3.213, + "step": 14905 + }, + { + "epoch": 1.2704338191425892, + "grad_norm": 143.06556167265592, + "learning_rate": 7.1376143791393715e-06, + "loss": 3.4685, + "step": 14906 + }, + { + "epoch": 1.2705190488366147, + "grad_norm": 49.130420646143115, + "learning_rate": 7.137166119948151e-06, + "loss": 2.7688, + "step": 14907 + }, + { + "epoch": 1.27060427853064, + "grad_norm": 47.48173930521154, + "learning_rate": 7.136717839738752e-06, + "loss": 3.546, + "step": 14908 + }, + { + "epoch": 1.2706895082246654, + "grad_norm": 177.98856293395647, + "learning_rate": 7.136269538515582e-06, + "loss": 3.0179, + "step": 14909 + }, + { + "epoch": 1.2707747379186909, + "grad_norm": 46.575001143403036, + "learning_rate": 7.135821216283051e-06, + "loss": 3.5121, + "step": 14910 + }, + { + "epoch": 1.2708599676127164, + "grad_norm": 59.80854254834827, + "learning_rate": 7.135372873045567e-06, + "loss": 3.4272, + "step": 14911 + }, + { + "epoch": 1.2709451973067416, + "grad_norm": 51.43512092302721, + "learning_rate": 7.134924508807537e-06, + "loss": 3.3425, + "step": 14912 + }, + { + "epoch": 1.271030427000767, + "grad_norm": 76.21333380195233, + "learning_rate": 7.134476123573375e-06, + "loss": 2.5932, + "step": 14913 + }, + { + "epoch": 1.2711156566947925, + "grad_norm": 40.94429637050056, + "learning_rate": 7.134027717347488e-06, + "loss": 3.0736, + "step": 14914 + }, + { + "epoch": 1.2712008863888178, + "grad_norm": 44.19332201695313, + "learning_rate": 7.133579290134286e-06, + "loss": 2.5349, + "step": 14915 + }, + { + "epoch": 1.2712861160828433, + "grad_norm": 42.9742665360515, + "learning_rate": 7.133130841938179e-06, + "loss": 3.5152, + "step": 14916 + }, + { + "epoch": 1.2713713457768687, + "grad_norm": 41.52867184524063, + "learning_rate": 7.132682372763579e-06, + "loss": 2.9383, + "step": 14917 + }, + { + "epoch": 1.2714565754708942, + "grad_norm": 50.50149278111142, + "learning_rate": 7.132233882614894e-06, + "loss": 2.7772, + "step": 14918 + }, + { + "epoch": 1.2715418051649194, + "grad_norm": 118.91989128024504, + "learning_rate": 7.131785371496537e-06, + "loss": 2.2229, + "step": 14919 + }, + { + "epoch": 1.271627034858945, + "grad_norm": 55.72664853503354, + "learning_rate": 7.131336839412917e-06, + "loss": 3.5546, + "step": 14920 + }, + { + "epoch": 1.2717122645529702, + "grad_norm": 51.36385672049222, + "learning_rate": 7.130888286368448e-06, + "loss": 3.0098, + "step": 14921 + }, + { + "epoch": 1.2717974942469956, + "grad_norm": 67.85012230599705, + "learning_rate": 7.130439712367538e-06, + "loss": 2.9728, + "step": 14922 + }, + { + "epoch": 1.271882723941021, + "grad_norm": 56.278115556759694, + "learning_rate": 7.1299911174146e-06, + "loss": 2.8716, + "step": 14923 + }, + { + "epoch": 1.2719679536350466, + "grad_norm": 43.058025880271686, + "learning_rate": 7.1295425015140465e-06, + "loss": 3.4855, + "step": 14924 + }, + { + "epoch": 1.2720531833290718, + "grad_norm": 33.00783619436112, + "learning_rate": 7.1290938646702875e-06, + "loss": 2.2683, + "step": 14925 + }, + { + "epoch": 1.2721384130230973, + "grad_norm": 34.56152589207174, + "learning_rate": 7.1286452068877385e-06, + "loss": 3.132, + "step": 14926 + }, + { + "epoch": 1.2722236427171225, + "grad_norm": 64.2140526454453, + "learning_rate": 7.128196528170807e-06, + "loss": 3.2186, + "step": 14927 + }, + { + "epoch": 1.272308872411148, + "grad_norm": 39.56584656275058, + "learning_rate": 7.12774782852391e-06, + "loss": 3.0241, + "step": 14928 + }, + { + "epoch": 1.2723941021051735, + "grad_norm": 37.01310287113531, + "learning_rate": 7.1272991079514575e-06, + "loss": 3.0838, + "step": 14929 + }, + { + "epoch": 1.272479331799199, + "grad_norm": 70.05873522915847, + "learning_rate": 7.1268503664578646e-06, + "loss": 3.1028, + "step": 14930 + }, + { + "epoch": 1.2725645614932242, + "grad_norm": 30.76287253584112, + "learning_rate": 7.126401604047543e-06, + "loss": 2.8285, + "step": 14931 + }, + { + "epoch": 1.2726497911872496, + "grad_norm": 27.58037428570295, + "learning_rate": 7.1259528207249065e-06, + "loss": 2.8773, + "step": 14932 + }, + { + "epoch": 1.272735020881275, + "grad_norm": 32.46656498904329, + "learning_rate": 7.125504016494369e-06, + "loss": 2.2988, + "step": 14933 + }, + { + "epoch": 1.2728202505753003, + "grad_norm": 36.98443272256051, + "learning_rate": 7.125055191360345e-06, + "loss": 2.6745, + "step": 14934 + }, + { + "epoch": 1.2729054802693258, + "grad_norm": 39.584244763847764, + "learning_rate": 7.124606345327245e-06, + "loss": 2.7245, + "step": 14935 + }, + { + "epoch": 1.2729907099633513, + "grad_norm": 68.35961851360763, + "learning_rate": 7.124157478399488e-06, + "loss": 3.0672, + "step": 14936 + }, + { + "epoch": 1.2730759396573768, + "grad_norm": 33.3746653286305, + "learning_rate": 7.1237085905814865e-06, + "loss": 2.2984, + "step": 14937 + }, + { + "epoch": 1.273161169351402, + "grad_norm": 33.49524680782169, + "learning_rate": 7.123259681877655e-06, + "loss": 3.3996, + "step": 14938 + }, + { + "epoch": 1.2732463990454275, + "grad_norm": 52.63056853485904, + "learning_rate": 7.122810752292406e-06, + "loss": 2.884, + "step": 14939 + }, + { + "epoch": 1.2733316287394527, + "grad_norm": 62.38196069315299, + "learning_rate": 7.122361801830158e-06, + "loss": 2.7985, + "step": 14940 + }, + { + "epoch": 1.2734168584334782, + "grad_norm": 28.70783722935124, + "learning_rate": 7.121912830495326e-06, + "loss": 1.7703, + "step": 14941 + }, + { + "epoch": 1.2735020881275037, + "grad_norm": 95.04207991019886, + "learning_rate": 7.121463838292324e-06, + "loss": 3.7378, + "step": 14942 + }, + { + "epoch": 1.2735873178215291, + "grad_norm": 53.598755506068436, + "learning_rate": 7.121014825225567e-06, + "loss": 3.746, + "step": 14943 + }, + { + "epoch": 1.2736725475155544, + "grad_norm": 56.93147658775029, + "learning_rate": 7.120565791299472e-06, + "loss": 3.1196, + "step": 14944 + }, + { + "epoch": 1.2737577772095798, + "grad_norm": 28.592973491958166, + "learning_rate": 7.1201167365184565e-06, + "loss": 2.5198, + "step": 14945 + }, + { + "epoch": 1.273843006903605, + "grad_norm": 37.6598074882158, + "learning_rate": 7.119667660886936e-06, + "loss": 2.684, + "step": 14946 + }, + { + "epoch": 1.2739282365976305, + "grad_norm": 79.41727838694884, + "learning_rate": 7.119218564409324e-06, + "loss": 3.7808, + "step": 14947 + }, + { + "epoch": 1.274013466291656, + "grad_norm": 27.013870493928202, + "learning_rate": 7.118769447090039e-06, + "loss": 2.4797, + "step": 14948 + }, + { + "epoch": 1.2740986959856815, + "grad_norm": 39.94644575020431, + "learning_rate": 7.118320308933502e-06, + "loss": 3.026, + "step": 14949 + }, + { + "epoch": 1.2741839256797067, + "grad_norm": 55.8048254552559, + "learning_rate": 7.117871149944124e-06, + "loss": 3.1758, + "step": 14950 + }, + { + "epoch": 1.2742691553737322, + "grad_norm": 34.73321307379439, + "learning_rate": 7.1174219701263245e-06, + "loss": 2.6692, + "step": 14951 + }, + { + "epoch": 1.2743543850677577, + "grad_norm": 35.37005460093355, + "learning_rate": 7.116972769484523e-06, + "loss": 2.4977, + "step": 14952 + }, + { + "epoch": 1.274439614761783, + "grad_norm": 109.43681312915584, + "learning_rate": 7.116523548023134e-06, + "loss": 4.766, + "step": 14953 + }, + { + "epoch": 1.2745248444558084, + "grad_norm": 67.66517581697596, + "learning_rate": 7.116074305746577e-06, + "loss": 4.0364, + "step": 14954 + }, + { + "epoch": 1.2746100741498338, + "grad_norm": 27.903597095905674, + "learning_rate": 7.115625042659271e-06, + "loss": 2.4528, + "step": 14955 + }, + { + "epoch": 1.2746953038438593, + "grad_norm": 58.16117028966113, + "learning_rate": 7.1151757587656335e-06, + "loss": 3.2389, + "step": 14956 + }, + { + "epoch": 1.2747805335378846, + "grad_norm": 45.18883864370507, + "learning_rate": 7.114726454070082e-06, + "loss": 2.3137, + "step": 14957 + }, + { + "epoch": 1.27486576323191, + "grad_norm": 57.13392061795067, + "learning_rate": 7.114277128577036e-06, + "loss": 2.6071, + "step": 14958 + }, + { + "epoch": 1.2749509929259353, + "grad_norm": 50.02609766675599, + "learning_rate": 7.113827782290914e-06, + "loss": 2.9519, + "step": 14959 + }, + { + "epoch": 1.2750362226199607, + "grad_norm": 34.53369191556546, + "learning_rate": 7.113378415216138e-06, + "loss": 2.5906, + "step": 14960 + }, + { + "epoch": 1.2751214523139862, + "grad_norm": 44.25467828417228, + "learning_rate": 7.112929027357124e-06, + "loss": 2.3296, + "step": 14961 + }, + { + "epoch": 1.2752066820080117, + "grad_norm": 62.293310548927536, + "learning_rate": 7.112479618718292e-06, + "loss": 3.3181, + "step": 14962 + }, + { + "epoch": 1.275291911702037, + "grad_norm": 60.56746350370242, + "learning_rate": 7.112030189304062e-06, + "loss": 2.7343, + "step": 14963 + }, + { + "epoch": 1.2753771413960624, + "grad_norm": 47.32555820869447, + "learning_rate": 7.111580739118856e-06, + "loss": 2.703, + "step": 14964 + }, + { + "epoch": 1.2754623710900879, + "grad_norm": 52.380872224590185, + "learning_rate": 7.111131268167091e-06, + "loss": 3.6988, + "step": 14965 + }, + { + "epoch": 1.2755476007841131, + "grad_norm": 44.63949608037036, + "learning_rate": 7.11068177645319e-06, + "loss": 2.6754, + "step": 14966 + }, + { + "epoch": 1.2756328304781386, + "grad_norm": 54.18876593091396, + "learning_rate": 7.110232263981571e-06, + "loss": 2.2092, + "step": 14967 + }, + { + "epoch": 1.275718060172164, + "grad_norm": 48.09050368183057, + "learning_rate": 7.109782730756658e-06, + "loss": 2.099, + "step": 14968 + }, + { + "epoch": 1.2758032898661895, + "grad_norm": 93.06794372297468, + "learning_rate": 7.10933317678287e-06, + "loss": 2.1385, + "step": 14969 + }, + { + "epoch": 1.2758885195602148, + "grad_norm": 41.84146019819652, + "learning_rate": 7.108883602064627e-06, + "loss": 2.9133, + "step": 14970 + }, + { + "epoch": 1.2759737492542402, + "grad_norm": 63.81130062063172, + "learning_rate": 7.108434006606353e-06, + "loss": 2.7501, + "step": 14971 + }, + { + "epoch": 1.2760589789482655, + "grad_norm": 44.33324381562423, + "learning_rate": 7.107984390412467e-06, + "loss": 3.3189, + "step": 14972 + }, + { + "epoch": 1.276144208642291, + "grad_norm": 96.19725990466019, + "learning_rate": 7.107534753487393e-06, + "loss": 4.2329, + "step": 14973 + }, + { + "epoch": 1.2762294383363164, + "grad_norm": 55.191516626010625, + "learning_rate": 7.1070850958355505e-06, + "loss": 3.1181, + "step": 14974 + }, + { + "epoch": 1.2763146680303419, + "grad_norm": 102.20309136916684, + "learning_rate": 7.106635417461366e-06, + "loss": 4.3654, + "step": 14975 + }, + { + "epoch": 1.2763998977243671, + "grad_norm": 33.978832324937194, + "learning_rate": 7.106185718369259e-06, + "loss": 2.0257, + "step": 14976 + }, + { + "epoch": 1.2764851274183926, + "grad_norm": 46.040560852541155, + "learning_rate": 7.105735998563652e-06, + "loss": 3.2073, + "step": 14977 + }, + { + "epoch": 1.2765703571124178, + "grad_norm": 38.05602121456411, + "learning_rate": 7.105286258048966e-06, + "loss": 3.1793, + "step": 14978 + }, + { + "epoch": 1.2766555868064433, + "grad_norm": 36.25699835108254, + "learning_rate": 7.104836496829629e-06, + "loss": 3.4858, + "step": 14979 + }, + { + "epoch": 1.2767408165004688, + "grad_norm": 47.13118722598969, + "learning_rate": 7.10438671491006e-06, + "loss": 3.1907, + "step": 14980 + }, + { + "epoch": 1.2768260461944942, + "grad_norm": 48.3361572976791, + "learning_rate": 7.103936912294684e-06, + "loss": 2.7509, + "step": 14981 + }, + { + "epoch": 1.2769112758885195, + "grad_norm": 37.21367150489538, + "learning_rate": 7.103487088987924e-06, + "loss": 2.8772, + "step": 14982 + }, + { + "epoch": 1.276996505582545, + "grad_norm": 75.56752384542955, + "learning_rate": 7.103037244994206e-06, + "loss": 1.9858, + "step": 14983 + }, + { + "epoch": 1.2770817352765704, + "grad_norm": 157.3586279430723, + "learning_rate": 7.102587380317952e-06, + "loss": 3.5185, + "step": 14984 + }, + { + "epoch": 1.2771669649705957, + "grad_norm": 68.34856206687098, + "learning_rate": 7.1021374949635855e-06, + "loss": 4.0001, + "step": 14985 + }, + { + "epoch": 1.2772521946646211, + "grad_norm": 35.46920766079991, + "learning_rate": 7.101687588935533e-06, + "loss": 2.5999, + "step": 14986 + }, + { + "epoch": 1.2773374243586466, + "grad_norm": 57.2821669963087, + "learning_rate": 7.101237662238217e-06, + "loss": 3.0836, + "step": 14987 + }, + { + "epoch": 1.277422654052672, + "grad_norm": 71.84586220390014, + "learning_rate": 7.100787714876065e-06, + "loss": 3.5231, + "step": 14988 + }, + { + "epoch": 1.2775078837466973, + "grad_norm": 33.386039782035645, + "learning_rate": 7.100337746853499e-06, + "loss": 2.901, + "step": 14989 + }, + { + "epoch": 1.2775931134407228, + "grad_norm": 44.22331583158593, + "learning_rate": 7.099887758174947e-06, + "loss": 2.92, + "step": 14990 + }, + { + "epoch": 1.277678343134748, + "grad_norm": 48.04959734651183, + "learning_rate": 7.099437748844833e-06, + "loss": 2.6628, + "step": 14991 + }, + { + "epoch": 1.2777635728287735, + "grad_norm": 50.43442711749478, + "learning_rate": 7.0989877188675835e-06, + "loss": 3.4269, + "step": 14992 + }, + { + "epoch": 1.277848802522799, + "grad_norm": 51.92943648317447, + "learning_rate": 7.098537668247624e-06, + "loss": 2.7184, + "step": 14993 + }, + { + "epoch": 1.2779340322168244, + "grad_norm": 67.89583067350925, + "learning_rate": 7.0980875969893795e-06, + "loss": 3.2935, + "step": 14994 + }, + { + "epoch": 1.2780192619108497, + "grad_norm": 29.761725190925535, + "learning_rate": 7.097637505097277e-06, + "loss": 2.2427, + "step": 14995 + }, + { + "epoch": 1.2781044916048752, + "grad_norm": 63.11351320808509, + "learning_rate": 7.097187392575744e-06, + "loss": 2.978, + "step": 14996 + }, + { + "epoch": 1.2781897212989004, + "grad_norm": 67.89596374083911, + "learning_rate": 7.096737259429206e-06, + "loss": 4.1609, + "step": 14997 + }, + { + "epoch": 1.2782749509929259, + "grad_norm": 23.98847382853868, + "learning_rate": 7.0962871056620895e-06, + "loss": 2.2095, + "step": 14998 + }, + { + "epoch": 1.2783601806869513, + "grad_norm": 50.53863350749763, + "learning_rate": 7.095836931278823e-06, + "loss": 3.8389, + "step": 14999 + }, + { + "epoch": 1.2784454103809768, + "grad_norm": 71.9463571162836, + "learning_rate": 7.095386736283833e-06, + "loss": 2.9305, + "step": 15000 + }, + { + "epoch": 1.278530640075002, + "grad_norm": 57.34213913578647, + "learning_rate": 7.094936520681548e-06, + "loss": 2.1115, + "step": 15001 + }, + { + "epoch": 1.2786158697690275, + "grad_norm": 41.78121745821782, + "learning_rate": 7.094486284476393e-06, + "loss": 2.829, + "step": 15002 + }, + { + "epoch": 1.278701099463053, + "grad_norm": 43.177297174178555, + "learning_rate": 7.0940360276728e-06, + "loss": 3.2589, + "step": 15003 + }, + { + "epoch": 1.2787863291570782, + "grad_norm": 31.56558646124253, + "learning_rate": 7.093585750275193e-06, + "loss": 2.5826, + "step": 15004 + }, + { + "epoch": 1.2788715588511037, + "grad_norm": 61.40728101374016, + "learning_rate": 7.0931354522880015e-06, + "loss": 3.1906, + "step": 15005 + }, + { + "epoch": 1.2789567885451292, + "grad_norm": 100.52854342303034, + "learning_rate": 7.092685133715656e-06, + "loss": 3.4321, + "step": 15006 + }, + { + "epoch": 1.2790420182391546, + "grad_norm": 27.982050004665076, + "learning_rate": 7.092234794562583e-06, + "loss": 2.2427, + "step": 15007 + }, + { + "epoch": 1.27912724793318, + "grad_norm": 42.129561800220586, + "learning_rate": 7.0917844348332124e-06, + "loss": 2.3747, + "step": 15008 + }, + { + "epoch": 1.2792124776272054, + "grad_norm": 26.421260072776732, + "learning_rate": 7.091334054531973e-06, + "loss": 2.3546, + "step": 15009 + }, + { + "epoch": 1.2792977073212306, + "grad_norm": 64.7405029239078, + "learning_rate": 7.090883653663294e-06, + "loss": 3.6267, + "step": 15010 + }, + { + "epoch": 1.279382937015256, + "grad_norm": 45.495182280103414, + "learning_rate": 7.090433232231606e-06, + "loss": 2.9151, + "step": 15011 + }, + { + "epoch": 1.2794681667092815, + "grad_norm": 31.82256537593211, + "learning_rate": 7.089982790241338e-06, + "loss": 2.4407, + "step": 15012 + }, + { + "epoch": 1.279553396403307, + "grad_norm": 57.22917419400468, + "learning_rate": 7.089532327696917e-06, + "loss": 3.5836, + "step": 15013 + }, + { + "epoch": 1.2796386260973323, + "grad_norm": 86.45698087557552, + "learning_rate": 7.0890818446027785e-06, + "loss": 4.4229, + "step": 15014 + }, + { + "epoch": 1.2797238557913577, + "grad_norm": 34.2994887707831, + "learning_rate": 7.088631340963349e-06, + "loss": 3.7146, + "step": 15015 + }, + { + "epoch": 1.279809085485383, + "grad_norm": 44.93540767490067, + "learning_rate": 7.088180816783062e-06, + "loss": 2.4756, + "step": 15016 + }, + { + "epoch": 1.2798943151794084, + "grad_norm": 109.95913750172517, + "learning_rate": 7.0877302720663435e-06, + "loss": 3.4272, + "step": 15017 + }, + { + "epoch": 1.279979544873434, + "grad_norm": 65.13964218215516, + "learning_rate": 7.087279706817629e-06, + "loss": 2.8798, + "step": 15018 + }, + { + "epoch": 1.2800647745674594, + "grad_norm": 37.25498435977211, + "learning_rate": 7.086829121041348e-06, + "loss": 1.5713, + "step": 15019 + }, + { + "epoch": 1.2801500042614846, + "grad_norm": 103.34457227602559, + "learning_rate": 7.086378514741932e-06, + "loss": 1.9502, + "step": 15020 + }, + { + "epoch": 1.28023523395551, + "grad_norm": 69.25633482237805, + "learning_rate": 7.085927887923812e-06, + "loss": 3.2055, + "step": 15021 + }, + { + "epoch": 1.2803204636495356, + "grad_norm": 41.52437087723367, + "learning_rate": 7.085477240591419e-06, + "loss": 3.8221, + "step": 15022 + }, + { + "epoch": 1.2804056933435608, + "grad_norm": 63.76463279443709, + "learning_rate": 7.085026572749187e-06, + "loss": 2.2144, + "step": 15023 + }, + { + "epoch": 1.2804909230375863, + "grad_norm": 119.14431848034488, + "learning_rate": 7.084575884401549e-06, + "loss": 3.9272, + "step": 15024 + }, + { + "epoch": 1.2805761527316117, + "grad_norm": 72.26734924473617, + "learning_rate": 7.084125175552933e-06, + "loss": 3.8013, + "step": 15025 + }, + { + "epoch": 1.2806613824256372, + "grad_norm": 28.587517652266946, + "learning_rate": 7.083674446207776e-06, + "loss": 2.5459, + "step": 15026 + }, + { + "epoch": 1.2807466121196625, + "grad_norm": 35.15314822324128, + "learning_rate": 7.083223696370507e-06, + "loss": 3.6911, + "step": 15027 + }, + { + "epoch": 1.280831841813688, + "grad_norm": 67.07515540920087, + "learning_rate": 7.0827729260455615e-06, + "loss": 3.4849, + "step": 15028 + }, + { + "epoch": 1.2809170715077132, + "grad_norm": 37.68306935829295, + "learning_rate": 7.082322135237372e-06, + "loss": 2.9441, + "step": 15029 + }, + { + "epoch": 1.2810023012017386, + "grad_norm": 53.10671356294446, + "learning_rate": 7.081871323950371e-06, + "loss": 3.8434, + "step": 15030 + }, + { + "epoch": 1.281087530895764, + "grad_norm": 34.05980859978824, + "learning_rate": 7.081420492188994e-06, + "loss": 2.2638, + "step": 15031 + }, + { + "epoch": 1.2811727605897896, + "grad_norm": 32.50111118831988, + "learning_rate": 7.080969639957673e-06, + "loss": 3.7775, + "step": 15032 + }, + { + "epoch": 1.2812579902838148, + "grad_norm": 56.88898739113582, + "learning_rate": 7.080518767260842e-06, + "loss": 2.4084, + "step": 15033 + }, + { + "epoch": 1.2813432199778403, + "grad_norm": 79.69621921864656, + "learning_rate": 7.080067874102936e-06, + "loss": 3.3298, + "step": 15034 + }, + { + "epoch": 1.2814284496718658, + "grad_norm": 37.01792079235686, + "learning_rate": 7.079616960488389e-06, + "loss": 2.5304, + "step": 15035 + }, + { + "epoch": 1.281513679365891, + "grad_norm": 64.62123283171839, + "learning_rate": 7.079166026421635e-06, + "loss": 3.0689, + "step": 15036 + }, + { + "epoch": 1.2815989090599165, + "grad_norm": 36.11846432567873, + "learning_rate": 7.078715071907109e-06, + "loss": 3.894, + "step": 15037 + }, + { + "epoch": 1.281684138753942, + "grad_norm": 27.825333546307895, + "learning_rate": 7.078264096949248e-06, + "loss": 2.0635, + "step": 15038 + }, + { + "epoch": 1.2817693684479674, + "grad_norm": 37.05274996799678, + "learning_rate": 7.077813101552485e-06, + "loss": 2.9475, + "step": 15039 + }, + { + "epoch": 1.2818545981419927, + "grad_norm": 69.51249423658722, + "learning_rate": 7.077362085721255e-06, + "loss": 3.8506, + "step": 15040 + }, + { + "epoch": 1.2819398278360181, + "grad_norm": 52.103276126729284, + "learning_rate": 7.0769110494599945e-06, + "loss": 2.5701, + "step": 15041 + }, + { + "epoch": 1.2820250575300434, + "grad_norm": 35.968429836751106, + "learning_rate": 7.07645999277314e-06, + "loss": 2.719, + "step": 15042 + }, + { + "epoch": 1.2821102872240688, + "grad_norm": 43.474711178763854, + "learning_rate": 7.076008915665126e-06, + "loss": 2.0543, + "step": 15043 + }, + { + "epoch": 1.2821955169180943, + "grad_norm": 42.657949276523595, + "learning_rate": 7.075557818140389e-06, + "loss": 2.9051, + "step": 15044 + }, + { + "epoch": 1.2822807466121198, + "grad_norm": 59.383143878176035, + "learning_rate": 7.0751067002033645e-06, + "loss": 2.659, + "step": 15045 + }, + { + "epoch": 1.282365976306145, + "grad_norm": 35.1903708920905, + "learning_rate": 7.074655561858492e-06, + "loss": 3.1833, + "step": 15046 + }, + { + "epoch": 1.2824512060001705, + "grad_norm": 101.67711949629316, + "learning_rate": 7.0742044031102066e-06, + "loss": 3.8801, + "step": 15047 + }, + { + "epoch": 1.2825364356941957, + "grad_norm": 38.43665293735073, + "learning_rate": 7.073753223962944e-06, + "loss": 3.1498, + "step": 15048 + }, + { + "epoch": 1.2826216653882212, + "grad_norm": 34.75952396622806, + "learning_rate": 7.0733020244211425e-06, + "loss": 2.6179, + "step": 15049 + }, + { + "epoch": 1.2827068950822467, + "grad_norm": 70.04216188414085, + "learning_rate": 7.07285080448924e-06, + "loss": 3.7892, + "step": 15050 + }, + { + "epoch": 1.2827921247762721, + "grad_norm": 59.831190059602925, + "learning_rate": 7.072399564171672e-06, + "loss": 2.9451, + "step": 15051 + }, + { + "epoch": 1.2828773544702974, + "grad_norm": 71.47936936843001, + "learning_rate": 7.0719483034728794e-06, + "loss": 3.5741, + "step": 15052 + }, + { + "epoch": 1.2829625841643229, + "grad_norm": 42.17091182061905, + "learning_rate": 7.071497022397298e-06, + "loss": 2.6986, + "step": 15053 + }, + { + "epoch": 1.2830478138583483, + "grad_norm": 42.06705864680223, + "learning_rate": 7.071045720949368e-06, + "loss": 2.4203, + "step": 15054 + }, + { + "epoch": 1.2831330435523736, + "grad_norm": 73.14870715905258, + "learning_rate": 7.070594399133525e-06, + "loss": 4.0395, + "step": 15055 + }, + { + "epoch": 1.283218273246399, + "grad_norm": 30.271397100720293, + "learning_rate": 7.070143056954209e-06, + "loss": 1.6385, + "step": 15056 + }, + { + "epoch": 1.2833035029404245, + "grad_norm": 41.006551051791, + "learning_rate": 7.069691694415858e-06, + "loss": 3.291, + "step": 15057 + }, + { + "epoch": 1.28338873263445, + "grad_norm": 174.24451344857525, + "learning_rate": 7.069240311522911e-06, + "loss": 3.1059, + "step": 15058 + }, + { + "epoch": 1.2834739623284752, + "grad_norm": 45.7679411838096, + "learning_rate": 7.06878890827981e-06, + "loss": 2.8336, + "step": 15059 + }, + { + "epoch": 1.2835591920225007, + "grad_norm": 48.99163791178826, + "learning_rate": 7.068337484690991e-06, + "loss": 2.2498, + "step": 15060 + }, + { + "epoch": 1.283644421716526, + "grad_norm": 38.283775917905565, + "learning_rate": 7.067886040760894e-06, + "loss": 3.3774, + "step": 15061 + }, + { + "epoch": 1.2837296514105514, + "grad_norm": 76.33382668306254, + "learning_rate": 7.06743457649396e-06, + "loss": 2.9741, + "step": 15062 + }, + { + "epoch": 1.2838148811045769, + "grad_norm": 66.66463470489239, + "learning_rate": 7.06698309189463e-06, + "loss": 3.3992, + "step": 15063 + }, + { + "epoch": 1.2839001107986023, + "grad_norm": 29.502290418002286, + "learning_rate": 7.0665315869673405e-06, + "loss": 2.5532, + "step": 15064 + }, + { + "epoch": 1.2839853404926276, + "grad_norm": 67.7851691929117, + "learning_rate": 7.066080061716535e-06, + "loss": 2.7711, + "step": 15065 + }, + { + "epoch": 1.284070570186653, + "grad_norm": 35.62325473717989, + "learning_rate": 7.065628516146652e-06, + "loss": 1.7286, + "step": 15066 + }, + { + "epoch": 1.2841557998806783, + "grad_norm": 60.34299632328896, + "learning_rate": 7.065176950262135e-06, + "loss": 2.7949, + "step": 15067 + }, + { + "epoch": 1.2842410295747038, + "grad_norm": 25.915129102813992, + "learning_rate": 7.064725364067422e-06, + "loss": 2.5399, + "step": 15068 + }, + { + "epoch": 1.2843262592687292, + "grad_norm": 65.65357421146584, + "learning_rate": 7.064273757566956e-06, + "loss": 3.7427, + "step": 15069 + }, + { + "epoch": 1.2844114889627547, + "grad_norm": 30.66731336438764, + "learning_rate": 7.063822130765179e-06, + "loss": 2.214, + "step": 15070 + }, + { + "epoch": 1.28449671865678, + "grad_norm": 85.71506301242167, + "learning_rate": 7.063370483666529e-06, + "loss": 3.5079, + "step": 15071 + }, + { + "epoch": 1.2845819483508054, + "grad_norm": 59.04676489123398, + "learning_rate": 7.062918816275451e-06, + "loss": 2.6973, + "step": 15072 + }, + { + "epoch": 1.2846671780448309, + "grad_norm": 48.228695092773265, + "learning_rate": 7.0624671285963865e-06, + "loss": 2.3984, + "step": 15073 + }, + { + "epoch": 1.2847524077388561, + "grad_norm": 54.867582293802165, + "learning_rate": 7.062015420633779e-06, + "loss": 3.221, + "step": 15074 + }, + { + "epoch": 1.2848376374328816, + "grad_norm": 42.101944488893196, + "learning_rate": 7.061563692392067e-06, + "loss": 3.5008, + "step": 15075 + }, + { + "epoch": 1.284922867126907, + "grad_norm": 73.68472183218549, + "learning_rate": 7.0611119438756955e-06, + "loss": 3.7745, + "step": 15076 + }, + { + "epoch": 1.2850080968209325, + "grad_norm": 35.96484314831075, + "learning_rate": 7.060660175089107e-06, + "loss": 2.7978, + "step": 15077 + }, + { + "epoch": 1.2850933265149578, + "grad_norm": 37.08127598081902, + "learning_rate": 7.060208386036746e-06, + "loss": 2.2199, + "step": 15078 + }, + { + "epoch": 1.2851785562089832, + "grad_norm": 56.2147473623034, + "learning_rate": 7.059756576723053e-06, + "loss": 3.4623, + "step": 15079 + }, + { + "epoch": 1.2852637859030085, + "grad_norm": 42.51924116290447, + "learning_rate": 7.0593047471524726e-06, + "loss": 3.4353, + "step": 15080 + }, + { + "epoch": 1.285349015597034, + "grad_norm": 53.39233596097184, + "learning_rate": 7.0588528973294475e-06, + "loss": 3.2167, + "step": 15081 + }, + { + "epoch": 1.2854342452910594, + "grad_norm": 39.990077511608504, + "learning_rate": 7.0584010272584235e-06, + "loss": 2.4827, + "step": 15082 + }, + { + "epoch": 1.285519474985085, + "grad_norm": 33.14097487987757, + "learning_rate": 7.057949136943844e-06, + "loss": 3.3337, + "step": 15083 + }, + { + "epoch": 1.2856047046791101, + "grad_norm": 47.98148438617023, + "learning_rate": 7.057497226390151e-06, + "loss": 3.4417, + "step": 15084 + }, + { + "epoch": 1.2856899343731356, + "grad_norm": 78.48903383432125, + "learning_rate": 7.057045295601791e-06, + "loss": 2.5888, + "step": 15085 + }, + { + "epoch": 1.285775164067161, + "grad_norm": 70.45745619826502, + "learning_rate": 7.056593344583207e-06, + "loss": 2.7428, + "step": 15086 + }, + { + "epoch": 1.2858603937611863, + "grad_norm": 50.75158857261388, + "learning_rate": 7.0561413733388455e-06, + "loss": 1.8181, + "step": 15087 + }, + { + "epoch": 1.2859456234552118, + "grad_norm": 64.6009683792976, + "learning_rate": 7.05568938187315e-06, + "loss": 3.2652, + "step": 15088 + }, + { + "epoch": 1.2860308531492373, + "grad_norm": 39.92297006788678, + "learning_rate": 7.055237370190568e-06, + "loss": 2.9167, + "step": 15089 + }, + { + "epoch": 1.2861160828432627, + "grad_norm": 40.372873297741876, + "learning_rate": 7.054785338295541e-06, + "loss": 2.6537, + "step": 15090 + }, + { + "epoch": 1.286201312537288, + "grad_norm": 34.978476512522214, + "learning_rate": 7.054333286192518e-06, + "loss": 3.3527, + "step": 15091 + }, + { + "epoch": 1.2862865422313134, + "grad_norm": 50.055937441152686, + "learning_rate": 7.053881213885942e-06, + "loss": 3.6018, + "step": 15092 + }, + { + "epoch": 1.2863717719253387, + "grad_norm": 59.402183773994395, + "learning_rate": 7.053429121380263e-06, + "loss": 2.7434, + "step": 15093 + }, + { + "epoch": 1.2864570016193642, + "grad_norm": 55.93212507919891, + "learning_rate": 7.052977008679922e-06, + "loss": 3.6009, + "step": 15094 + }, + { + "epoch": 1.2865422313133896, + "grad_norm": 48.79894352463452, + "learning_rate": 7.052524875789368e-06, + "loss": 2.0316, + "step": 15095 + }, + { + "epoch": 1.286627461007415, + "grad_norm": 68.87781525956491, + "learning_rate": 7.052072722713049e-06, + "loss": 3.8216, + "step": 15096 + }, + { + "epoch": 1.2867126907014403, + "grad_norm": 55.568805003045505, + "learning_rate": 7.05162054945541e-06, + "loss": 2.9881, + "step": 15097 + }, + { + "epoch": 1.2867979203954658, + "grad_norm": 33.152791394605764, + "learning_rate": 7.0511683560208975e-06, + "loss": 2.3052, + "step": 15098 + }, + { + "epoch": 1.286883150089491, + "grad_norm": 32.59761187886366, + "learning_rate": 7.050716142413959e-06, + "loss": 2.1614, + "step": 15099 + }, + { + "epoch": 1.2869683797835165, + "grad_norm": 76.88629703234456, + "learning_rate": 7.050263908639043e-06, + "loss": 3.232, + "step": 15100 + }, + { + "epoch": 1.287053609477542, + "grad_norm": 43.125537937207135, + "learning_rate": 7.049811654700595e-06, + "loss": 3.194, + "step": 15101 + }, + { + "epoch": 1.2871388391715675, + "grad_norm": 57.681123289309824, + "learning_rate": 7.049359380603066e-06, + "loss": 3.772, + "step": 15102 + }, + { + "epoch": 1.2872240688655927, + "grad_norm": 80.80626029319663, + "learning_rate": 7.048907086350899e-06, + "loss": 4.0861, + "step": 15103 + }, + { + "epoch": 1.2873092985596182, + "grad_norm": 52.656752519069094, + "learning_rate": 7.048454771948547e-06, + "loss": 2.6943, + "step": 15104 + }, + { + "epoch": 1.2873945282536436, + "grad_norm": 28.66787035230196, + "learning_rate": 7.048002437400456e-06, + "loss": 2.1598, + "step": 15105 + }, + { + "epoch": 1.287479757947669, + "grad_norm": 64.74232063495876, + "learning_rate": 7.047550082711074e-06, + "loss": 2.8185, + "step": 15106 + }, + { + "epoch": 1.2875649876416944, + "grad_norm": 41.02361796686315, + "learning_rate": 7.047097707884851e-06, + "loss": 3.4723, + "step": 15107 + }, + { + "epoch": 1.2876502173357198, + "grad_norm": 49.765818482315716, + "learning_rate": 7.046645312926236e-06, + "loss": 3.3883, + "step": 15108 + }, + { + "epoch": 1.2877354470297453, + "grad_norm": 35.21915056591349, + "learning_rate": 7.046192897839677e-06, + "loss": 2.1665, + "step": 15109 + }, + { + "epoch": 1.2878206767237705, + "grad_norm": 60.26318128472742, + "learning_rate": 7.045740462629625e-06, + "loss": 2.9942, + "step": 15110 + }, + { + "epoch": 1.287905906417796, + "grad_norm": 72.87461453148119, + "learning_rate": 7.045288007300526e-06, + "loss": 2.9774, + "step": 15111 + }, + { + "epoch": 1.2879911361118213, + "grad_norm": 66.21553940884394, + "learning_rate": 7.044835531856834e-06, + "loss": 2.8807, + "step": 15112 + }, + { + "epoch": 1.2880763658058467, + "grad_norm": 65.18455734326352, + "learning_rate": 7.044383036302996e-06, + "loss": 3.2, + "step": 15113 + }, + { + "epoch": 1.2881615954998722, + "grad_norm": 48.375777775766316, + "learning_rate": 7.043930520643465e-06, + "loss": 2.1783, + "step": 15114 + }, + { + "epoch": 1.2882468251938977, + "grad_norm": 77.97056000613419, + "learning_rate": 7.043477984882688e-06, + "loss": 3.6602, + "step": 15115 + }, + { + "epoch": 1.288332054887923, + "grad_norm": 35.070825827229065, + "learning_rate": 7.043025429025118e-06, + "loss": 2.623, + "step": 15116 + }, + { + "epoch": 1.2884172845819484, + "grad_norm": 31.954997193643077, + "learning_rate": 7.0425728530752045e-06, + "loss": 1.8563, + "step": 15117 + }, + { + "epoch": 1.2885025142759736, + "grad_norm": 39.742654814438644, + "learning_rate": 7.042120257037398e-06, + "loss": 2.1294, + "step": 15118 + }, + { + "epoch": 1.288587743969999, + "grad_norm": 35.570635261627885, + "learning_rate": 7.04166764091615e-06, + "loss": 2.9715, + "step": 15119 + }, + { + "epoch": 1.2886729736640246, + "grad_norm": 60.61367342104524, + "learning_rate": 7.0412150047159125e-06, + "loss": 3.4082, + "step": 15120 + }, + { + "epoch": 1.28875820335805, + "grad_norm": 102.77217198533756, + "learning_rate": 7.040762348441137e-06, + "loss": 4.6542, + "step": 15121 + }, + { + "epoch": 1.2888434330520753, + "grad_norm": 49.16853602761417, + "learning_rate": 7.0403096720962734e-06, + "loss": 3.24, + "step": 15122 + }, + { + "epoch": 1.2889286627461007, + "grad_norm": 35.214381590896025, + "learning_rate": 7.039856975685776e-06, + "loss": 3.0675, + "step": 15123 + }, + { + "epoch": 1.2890138924401262, + "grad_norm": 36.22054472235094, + "learning_rate": 7.039404259214095e-06, + "loss": 2.9511, + "step": 15124 + }, + { + "epoch": 1.2890991221341515, + "grad_norm": 41.939359848062246, + "learning_rate": 7.038951522685685e-06, + "loss": 3.3876, + "step": 15125 + }, + { + "epoch": 1.289184351828177, + "grad_norm": 91.66600844772218, + "learning_rate": 7.038498766104996e-06, + "loss": 3.8139, + "step": 15126 + }, + { + "epoch": 1.2892695815222024, + "grad_norm": 34.829726642163855, + "learning_rate": 7.0380459894764815e-06, + "loss": 2.6612, + "step": 15127 + }, + { + "epoch": 1.2893548112162279, + "grad_norm": 47.197706363010234, + "learning_rate": 7.037593192804595e-06, + "loss": 2.6541, + "step": 15128 + }, + { + "epoch": 1.289440040910253, + "grad_norm": 45.39037707086828, + "learning_rate": 7.037140376093789e-06, + "loss": 2.8848, + "step": 15129 + }, + { + "epoch": 1.2895252706042786, + "grad_norm": 36.45762953483701, + "learning_rate": 7.0366875393485165e-06, + "loss": 2.5241, + "step": 15130 + }, + { + "epoch": 1.2896105002983038, + "grad_norm": 33.604866537373525, + "learning_rate": 7.036234682573232e-06, + "loss": 2.5499, + "step": 15131 + }, + { + "epoch": 1.2896957299923293, + "grad_norm": 50.654846437520206, + "learning_rate": 7.035781805772388e-06, + "loss": 2.2041, + "step": 15132 + }, + { + "epoch": 1.2897809596863548, + "grad_norm": 40.57448712056563, + "learning_rate": 7.035328908950439e-06, + "loss": 2.7422, + "step": 15133 + }, + { + "epoch": 1.2898661893803802, + "grad_norm": 36.36391247545589, + "learning_rate": 7.034875992111839e-06, + "loss": 2.7593, + "step": 15134 + }, + { + "epoch": 1.2899514190744055, + "grad_norm": 36.748006518437734, + "learning_rate": 7.034423055261041e-06, + "loss": 2.1155, + "step": 15135 + }, + { + "epoch": 1.290036648768431, + "grad_norm": 43.5459797563699, + "learning_rate": 7.033970098402503e-06, + "loss": 2.7442, + "step": 15136 + }, + { + "epoch": 1.2901218784624562, + "grad_norm": 102.65329031219149, + "learning_rate": 7.033517121540676e-06, + "loss": 2.5511, + "step": 15137 + }, + { + "epoch": 1.2902071081564817, + "grad_norm": 64.50305559808052, + "learning_rate": 7.033064124680015e-06, + "loss": 3.6921, + "step": 15138 + }, + { + "epoch": 1.2902923378505071, + "grad_norm": 31.974468220222036, + "learning_rate": 7.032611107824978e-06, + "loss": 2.5157, + "step": 15139 + }, + { + "epoch": 1.2903775675445326, + "grad_norm": 57.863433170211245, + "learning_rate": 7.03215807098002e-06, + "loss": 3.7047, + "step": 15140 + }, + { + "epoch": 1.2904627972385578, + "grad_norm": 35.565009006488324, + "learning_rate": 7.031705014149592e-06, + "loss": 2.6815, + "step": 15141 + }, + { + "epoch": 1.2905480269325833, + "grad_norm": 41.81471222740551, + "learning_rate": 7.031251937338153e-06, + "loss": 2.9265, + "step": 15142 + }, + { + "epoch": 1.2906332566266088, + "grad_norm": 77.07575506039774, + "learning_rate": 7.0307988405501595e-06, + "loss": 3.4214, + "step": 15143 + }, + { + "epoch": 1.290718486320634, + "grad_norm": 35.225451166658566, + "learning_rate": 7.030345723790064e-06, + "loss": 2.7697, + "step": 15144 + }, + { + "epoch": 1.2908037160146595, + "grad_norm": 31.08399650477249, + "learning_rate": 7.029892587062328e-06, + "loss": 2.4658, + "step": 15145 + }, + { + "epoch": 1.290888945708685, + "grad_norm": 82.98276128320008, + "learning_rate": 7.029439430371402e-06, + "loss": 3.6374, + "step": 15146 + }, + { + "epoch": 1.2909741754027104, + "grad_norm": 66.5969450183104, + "learning_rate": 7.028986253721748e-06, + "loss": 2.7199, + "step": 15147 + }, + { + "epoch": 1.2910594050967357, + "grad_norm": 48.29969214171804, + "learning_rate": 7.0285330571178186e-06, + "loss": 2.2881, + "step": 15148 + }, + { + "epoch": 1.2911446347907611, + "grad_norm": 24.963973760950694, + "learning_rate": 7.028079840564075e-06, + "loss": 2.301, + "step": 15149 + }, + { + "epoch": 1.2912298644847864, + "grad_norm": 47.971322678987434, + "learning_rate": 7.02762660406497e-06, + "loss": 2.6327, + "step": 15150 + }, + { + "epoch": 1.2913150941788119, + "grad_norm": 29.363976530535304, + "learning_rate": 7.027173347624964e-06, + "loss": 2.3764, + "step": 15151 + }, + { + "epoch": 1.2914003238728373, + "grad_norm": 92.51658216409069, + "learning_rate": 7.026720071248513e-06, + "loss": 2.5549, + "step": 15152 + }, + { + "epoch": 1.2914855535668628, + "grad_norm": 89.27822797320133, + "learning_rate": 7.026266774940077e-06, + "loss": 3.0695, + "step": 15153 + }, + { + "epoch": 1.291570783260888, + "grad_norm": 32.7310144473476, + "learning_rate": 7.02581345870411e-06, + "loss": 2.0881, + "step": 15154 + }, + { + "epoch": 1.2916560129549135, + "grad_norm": 56.09418566311757, + "learning_rate": 7.025360122545074e-06, + "loss": 2.6886, + "step": 15155 + }, + { + "epoch": 1.291741242648939, + "grad_norm": 111.95462944922532, + "learning_rate": 7.024906766467426e-06, + "loss": 3.3614, + "step": 15156 + }, + { + "epoch": 1.2918264723429642, + "grad_norm": 63.35736837370356, + "learning_rate": 7.024453390475625e-06, + "loss": 3.3495, + "step": 15157 + }, + { + "epoch": 1.2919117020369897, + "grad_norm": 40.80379216457919, + "learning_rate": 7.023999994574129e-06, + "loss": 3.107, + "step": 15158 + }, + { + "epoch": 1.2919969317310152, + "grad_norm": 48.83132144102042, + "learning_rate": 7.0235465787673965e-06, + "loss": 2.6073, + "step": 15159 + }, + { + "epoch": 1.2920821614250406, + "grad_norm": 110.91165975506988, + "learning_rate": 7.023093143059889e-06, + "loss": 3.5254, + "step": 15160 + }, + { + "epoch": 1.2921673911190659, + "grad_norm": 60.716903841680065, + "learning_rate": 7.022639687456063e-06, + "loss": 2.5758, + "step": 15161 + }, + { + "epoch": 1.2922526208130913, + "grad_norm": 80.46125704744951, + "learning_rate": 7.02218621196038e-06, + "loss": 2.6493, + "step": 15162 + }, + { + "epoch": 1.2923378505071166, + "grad_norm": 76.56063367685618, + "learning_rate": 7.021732716577298e-06, + "loss": 3.3625, + "step": 15163 + }, + { + "epoch": 1.292423080201142, + "grad_norm": 31.756846894332444, + "learning_rate": 7.02127920131128e-06, + "loss": 2.3562, + "step": 15164 + }, + { + "epoch": 1.2925083098951675, + "grad_norm": 37.932646123841266, + "learning_rate": 7.020825666166783e-06, + "loss": 2.9285, + "step": 15165 + }, + { + "epoch": 1.292593539589193, + "grad_norm": 82.16825086042776, + "learning_rate": 7.020372111148268e-06, + "loss": 3.7862, + "step": 15166 + }, + { + "epoch": 1.2926787692832182, + "grad_norm": 93.55729275293605, + "learning_rate": 7.019918536260197e-06, + "loss": 3.0673, + "step": 15167 + }, + { + "epoch": 1.2927639989772437, + "grad_norm": 30.65970442613457, + "learning_rate": 7.019464941507031e-06, + "loss": 2.257, + "step": 15168 + }, + { + "epoch": 1.292849228671269, + "grad_norm": 58.102272911384645, + "learning_rate": 7.019011326893228e-06, + "loss": 2.3686, + "step": 15169 + }, + { + "epoch": 1.2929344583652944, + "grad_norm": 26.028341568848226, + "learning_rate": 7.018557692423252e-06, + "loss": 2.045, + "step": 15170 + }, + { + "epoch": 1.2930196880593199, + "grad_norm": 22.112708436400005, + "learning_rate": 7.018104038101561e-06, + "loss": 1.7016, + "step": 15171 + }, + { + "epoch": 1.2931049177533454, + "grad_norm": 53.46507074217272, + "learning_rate": 7.0176503639326224e-06, + "loss": 3.5258, + "step": 15172 + }, + { + "epoch": 1.2931901474473706, + "grad_norm": 38.53811642615035, + "learning_rate": 7.017196669920892e-06, + "loss": 2.7868, + "step": 15173 + }, + { + "epoch": 1.293275377141396, + "grad_norm": 46.053931632543126, + "learning_rate": 7.016742956070832e-06, + "loss": 4.2908, + "step": 15174 + }, + { + "epoch": 1.2933606068354215, + "grad_norm": 56.64148095597998, + "learning_rate": 7.0162892223869095e-06, + "loss": 3.6931, + "step": 15175 + }, + { + "epoch": 1.2934458365294468, + "grad_norm": 64.13703322508421, + "learning_rate": 7.015835468873583e-06, + "loss": 2.6356, + "step": 15176 + }, + { + "epoch": 1.2935310662234722, + "grad_norm": 76.12388182656669, + "learning_rate": 7.015381695535315e-06, + "loss": 3.3874, + "step": 15177 + }, + { + "epoch": 1.2936162959174977, + "grad_norm": 95.0112696842801, + "learning_rate": 7.014927902376568e-06, + "loss": 3.4531, + "step": 15178 + }, + { + "epoch": 1.2937015256115232, + "grad_norm": 66.50738530656636, + "learning_rate": 7.0144740894018086e-06, + "loss": 2.9292, + "step": 15179 + }, + { + "epoch": 1.2937867553055484, + "grad_norm": 57.464894997905645, + "learning_rate": 7.014020256615495e-06, + "loss": 2.8417, + "step": 15180 + }, + { + "epoch": 1.293871984999574, + "grad_norm": 71.24717265089495, + "learning_rate": 7.013566404022092e-06, + "loss": 3.9925, + "step": 15181 + }, + { + "epoch": 1.2939572146935991, + "grad_norm": 51.162286607095695, + "learning_rate": 7.013112531626064e-06, + "loss": 3.8081, + "step": 15182 + }, + { + "epoch": 1.2940424443876246, + "grad_norm": 71.48291947361984, + "learning_rate": 7.012658639431875e-06, + "loss": 3.5044, + "step": 15183 + }, + { + "epoch": 1.29412767408165, + "grad_norm": 39.95444508515138, + "learning_rate": 7.012204727443988e-06, + "loss": 2.6961, + "step": 15184 + }, + { + "epoch": 1.2942129037756755, + "grad_norm": 31.242708025546726, + "learning_rate": 7.011750795666866e-06, + "loss": 2.6128, + "step": 15185 + }, + { + "epoch": 1.2942981334697008, + "grad_norm": 39.92878010263522, + "learning_rate": 7.011296844104975e-06, + "loss": 2.4142, + "step": 15186 + }, + { + "epoch": 1.2943833631637263, + "grad_norm": 24.910296439601925, + "learning_rate": 7.010842872762779e-06, + "loss": 2.434, + "step": 15187 + }, + { + "epoch": 1.2944685928577515, + "grad_norm": 55.49780346001371, + "learning_rate": 7.010388881644743e-06, + "loss": 3.2396, + "step": 15188 + }, + { + "epoch": 1.294553822551777, + "grad_norm": 91.50155332341642, + "learning_rate": 7.009934870755328e-06, + "loss": 2.6945, + "step": 15189 + }, + { + "epoch": 1.2946390522458024, + "grad_norm": 36.34654065499224, + "learning_rate": 7.009480840099006e-06, + "loss": 2.734, + "step": 15190 + }, + { + "epoch": 1.294724281939828, + "grad_norm": 69.30119570724042, + "learning_rate": 7.009026789680237e-06, + "loss": 2.8681, + "step": 15191 + }, + { + "epoch": 1.2948095116338532, + "grad_norm": 41.09214048913531, + "learning_rate": 7.008572719503488e-06, + "loss": 3.3841, + "step": 15192 + }, + { + "epoch": 1.2948947413278786, + "grad_norm": 55.511629386622396, + "learning_rate": 7.008118629573223e-06, + "loss": 2.7458, + "step": 15193 + }, + { + "epoch": 1.294979971021904, + "grad_norm": 41.43021182273106, + "learning_rate": 7.007664519893911e-06, + "loss": 2.7918, + "step": 15194 + }, + { + "epoch": 1.2950652007159293, + "grad_norm": 54.47906517790755, + "learning_rate": 7.007210390470015e-06, + "loss": 2.7748, + "step": 15195 + }, + { + "epoch": 1.2951504304099548, + "grad_norm": 37.95207649642223, + "learning_rate": 7.006756241306003e-06, + "loss": 3.3417, + "step": 15196 + }, + { + "epoch": 1.2952356601039803, + "grad_norm": 43.328244766750956, + "learning_rate": 7.00630207240634e-06, + "loss": 2.0165, + "step": 15197 + }, + { + "epoch": 1.2953208897980057, + "grad_norm": 84.99049820063341, + "learning_rate": 7.005847883775495e-06, + "loss": 3.3901, + "step": 15198 + }, + { + "epoch": 1.295406119492031, + "grad_norm": 104.97828149250172, + "learning_rate": 7.005393675417932e-06, + "loss": 2.7116, + "step": 15199 + }, + { + "epoch": 1.2954913491860565, + "grad_norm": 29.123691758239477, + "learning_rate": 7.0049394473381196e-06, + "loss": 2.1651, + "step": 15200 + }, + { + "epoch": 1.2955765788800817, + "grad_norm": 56.063477063266106, + "learning_rate": 7.004485199540523e-06, + "loss": 2.4691, + "step": 15201 + }, + { + "epoch": 1.2956618085741072, + "grad_norm": 35.18915544908052, + "learning_rate": 7.004030932029613e-06, + "loss": 3.1557, + "step": 15202 + }, + { + "epoch": 1.2957470382681326, + "grad_norm": 53.88778680890423, + "learning_rate": 7.0035766448098555e-06, + "loss": 3.0312, + "step": 15203 + }, + { + "epoch": 1.2958322679621581, + "grad_norm": 30.554568444304127, + "learning_rate": 7.003122337885715e-06, + "loss": 2.4127, + "step": 15204 + }, + { + "epoch": 1.2959174976561834, + "grad_norm": 31.675922033224847, + "learning_rate": 7.002668011261664e-06, + "loss": 2.1948, + "step": 15205 + }, + { + "epoch": 1.2960027273502088, + "grad_norm": 52.58615712426656, + "learning_rate": 7.0022136649421675e-06, + "loss": 3.585, + "step": 15206 + }, + { + "epoch": 1.296087957044234, + "grad_norm": 33.44125804854561, + "learning_rate": 7.001759298931698e-06, + "loss": 2.5844, + "step": 15207 + }, + { + "epoch": 1.2961731867382595, + "grad_norm": 31.902825828306284, + "learning_rate": 7.0013049132347185e-06, + "loss": 1.973, + "step": 15208 + }, + { + "epoch": 1.296258416432285, + "grad_norm": 50.086274849192364, + "learning_rate": 7.000850507855702e-06, + "loss": 2.801, + "step": 15209 + }, + { + "epoch": 1.2963436461263105, + "grad_norm": 74.14400989830884, + "learning_rate": 7.0003960827991145e-06, + "loss": 2.4478, + "step": 15210 + }, + { + "epoch": 1.2964288758203357, + "grad_norm": 49.73755449320989, + "learning_rate": 6.999941638069429e-06, + "loss": 2.4999, + "step": 15211 + }, + { + "epoch": 1.2965141055143612, + "grad_norm": 42.62185026378881, + "learning_rate": 6.999487173671109e-06, + "loss": 4.092, + "step": 15212 + }, + { + "epoch": 1.2965993352083867, + "grad_norm": 54.34361112380653, + "learning_rate": 6.999032689608629e-06, + "loss": 4.1675, + "step": 15213 + }, + { + "epoch": 1.296684564902412, + "grad_norm": 68.561156111908, + "learning_rate": 6.998578185886456e-06, + "loss": 4.3426, + "step": 15214 + }, + { + "epoch": 1.2967697945964374, + "grad_norm": 41.50194647480808, + "learning_rate": 6.998123662509062e-06, + "loss": 2.8686, + "step": 15215 + }, + { + "epoch": 1.2968550242904628, + "grad_norm": 30.551087613941192, + "learning_rate": 6.997669119480914e-06, + "loss": 1.7367, + "step": 15216 + }, + { + "epoch": 1.2969402539844883, + "grad_norm": 51.40275090993682, + "learning_rate": 6.997214556806485e-06, + "loss": 2.8679, + "step": 15217 + }, + { + "epoch": 1.2970254836785136, + "grad_norm": 39.15465467334226, + "learning_rate": 6.996759974490244e-06, + "loss": 3.3201, + "step": 15218 + }, + { + "epoch": 1.297110713372539, + "grad_norm": 117.84426546270355, + "learning_rate": 6.996305372536664e-06, + "loss": 3.399, + "step": 15219 + }, + { + "epoch": 1.2971959430665643, + "grad_norm": 31.417225203728886, + "learning_rate": 6.995850750950211e-06, + "loss": 2.4967, + "step": 15220 + }, + { + "epoch": 1.2972811727605897, + "grad_norm": 76.65874275897114, + "learning_rate": 6.99539610973536e-06, + "loss": 4.8355, + "step": 15221 + }, + { + "epoch": 1.2973664024546152, + "grad_norm": 38.690273966348514, + "learning_rate": 6.994941448896583e-06, + "loss": 2.3874, + "step": 15222 + }, + { + "epoch": 1.2974516321486407, + "grad_norm": 53.26169999472289, + "learning_rate": 6.994486768438346e-06, + "loss": 3.0816, + "step": 15223 + }, + { + "epoch": 1.297536861842666, + "grad_norm": 70.57448180420268, + "learning_rate": 6.994032068365126e-06, + "loss": 2.8926, + "step": 15224 + }, + { + "epoch": 1.2976220915366914, + "grad_norm": 33.96488984122847, + "learning_rate": 6.993577348681394e-06, + "loss": 2.9282, + "step": 15225 + }, + { + "epoch": 1.2977073212307169, + "grad_norm": 84.5932101887631, + "learning_rate": 6.993122609391621e-06, + "loss": 4.1917, + "step": 15226 + }, + { + "epoch": 1.297792550924742, + "grad_norm": 43.02859559592499, + "learning_rate": 6.992667850500278e-06, + "loss": 3.0856, + "step": 15227 + }, + { + "epoch": 1.2978777806187676, + "grad_norm": 64.32312107487623, + "learning_rate": 6.992213072011838e-06, + "loss": 3.7722, + "step": 15228 + }, + { + "epoch": 1.297963010312793, + "grad_norm": 58.80181550632268, + "learning_rate": 6.991758273930774e-06, + "loss": 2.6303, + "step": 15229 + }, + { + "epoch": 1.2980482400068185, + "grad_norm": 29.37427805017097, + "learning_rate": 6.99130345626156e-06, + "loss": 2.0253, + "step": 15230 + }, + { + "epoch": 1.2981334697008438, + "grad_norm": 28.73966400360645, + "learning_rate": 6.990848619008669e-06, + "loss": 2.7175, + "step": 15231 + }, + { + "epoch": 1.2982186993948692, + "grad_norm": 59.87848922034431, + "learning_rate": 6.990393762176569e-06, + "loss": 3.637, + "step": 15232 + }, + { + "epoch": 1.2983039290888945, + "grad_norm": 54.306615863107, + "learning_rate": 6.9899388857697405e-06, + "loss": 3.2832, + "step": 15233 + }, + { + "epoch": 1.29838915878292, + "grad_norm": 80.66081388233944, + "learning_rate": 6.989483989792653e-06, + "loss": 3.499, + "step": 15234 + }, + { + "epoch": 1.2984743884769454, + "grad_norm": 36.94854172502473, + "learning_rate": 6.989029074249781e-06, + "loss": 2.7499, + "step": 15235 + }, + { + "epoch": 1.2985596181709709, + "grad_norm": 45.457238380967, + "learning_rate": 6.988574139145597e-06, + "loss": 2.8285, + "step": 15236 + }, + { + "epoch": 1.2986448478649961, + "grad_norm": 46.77297448595095, + "learning_rate": 6.9881191844845784e-06, + "loss": 2.6532, + "step": 15237 + }, + { + "epoch": 1.2987300775590216, + "grad_norm": 83.20144221881081, + "learning_rate": 6.987664210271197e-06, + "loss": 3.6189, + "step": 15238 + }, + { + "epoch": 1.2988153072530468, + "grad_norm": 89.16728532556623, + "learning_rate": 6.987209216509929e-06, + "loss": 2.7628, + "step": 15239 + }, + { + "epoch": 1.2989005369470723, + "grad_norm": 64.0290879093048, + "learning_rate": 6.986754203205245e-06, + "loss": 3.0972, + "step": 15240 + }, + { + "epoch": 1.2989857666410978, + "grad_norm": 45.38021980551863, + "learning_rate": 6.986299170361626e-06, + "loss": 2.3195, + "step": 15241 + }, + { + "epoch": 1.2990709963351232, + "grad_norm": 30.134453865366698, + "learning_rate": 6.985844117983542e-06, + "loss": 2.259, + "step": 15242 + }, + { + "epoch": 1.2991562260291485, + "grad_norm": 67.16270142461592, + "learning_rate": 6.985389046075471e-06, + "loss": 3.4448, + "step": 15243 + }, + { + "epoch": 1.299241455723174, + "grad_norm": 60.97420680434018, + "learning_rate": 6.9849339546418875e-06, + "loss": 3.0684, + "step": 15244 + }, + { + "epoch": 1.2993266854171994, + "grad_norm": 63.27786125899386, + "learning_rate": 6.9844788436872675e-06, + "loss": 3.5573, + "step": 15245 + }, + { + "epoch": 1.2994119151112247, + "grad_norm": 39.670396111936775, + "learning_rate": 6.984023713216087e-06, + "loss": 3.1881, + "step": 15246 + }, + { + "epoch": 1.2994971448052501, + "grad_norm": 37.81780252693914, + "learning_rate": 6.983568563232821e-06, + "loss": 2.8766, + "step": 15247 + }, + { + "epoch": 1.2995823744992756, + "grad_norm": 31.009937977470834, + "learning_rate": 6.983113393741946e-06, + "loss": 2.4154, + "step": 15248 + }, + { + "epoch": 1.299667604193301, + "grad_norm": 35.584855822748224, + "learning_rate": 6.982658204747938e-06, + "loss": 3.384, + "step": 15249 + }, + { + "epoch": 1.2997528338873263, + "grad_norm": 59.33224393100184, + "learning_rate": 6.982202996255277e-06, + "loss": 3.0182, + "step": 15250 + }, + { + "epoch": 1.2998380635813518, + "grad_norm": 132.38163590255604, + "learning_rate": 6.981747768268437e-06, + "loss": 3.5241, + "step": 15251 + }, + { + "epoch": 1.299923293275377, + "grad_norm": 41.026958058468416, + "learning_rate": 6.981292520791892e-06, + "loss": 2.9168, + "step": 15252 + }, + { + "epoch": 1.3000085229694025, + "grad_norm": 39.44737480864929, + "learning_rate": 6.980837253830125e-06, + "loss": 2.9756, + "step": 15253 + }, + { + "epoch": 1.300093752663428, + "grad_norm": 42.5199942665216, + "learning_rate": 6.98038196738761e-06, + "loss": 3.0386, + "step": 15254 + }, + { + "epoch": 1.3001789823574534, + "grad_norm": 36.890374254503485, + "learning_rate": 6.979926661468826e-06, + "loss": 2.7604, + "step": 15255 + }, + { + "epoch": 1.3002642120514787, + "grad_norm": 35.45518312683374, + "learning_rate": 6.979471336078248e-06, + "loss": 3.0302, + "step": 15256 + }, + { + "epoch": 1.3003494417455042, + "grad_norm": 57.221357735561725, + "learning_rate": 6.979015991220357e-06, + "loss": 2.5884, + "step": 15257 + }, + { + "epoch": 1.3004346714395294, + "grad_norm": 74.1366547998703, + "learning_rate": 6.978560626899632e-06, + "loss": 3.1577, + "step": 15258 + }, + { + "epoch": 1.3005199011335549, + "grad_norm": 43.14341671577827, + "learning_rate": 6.978105243120548e-06, + "loss": 2.3821, + "step": 15259 + }, + { + "epoch": 1.3006051308275803, + "grad_norm": 60.711381484669566, + "learning_rate": 6.977649839887584e-06, + "loss": 2.7698, + "step": 15260 + }, + { + "epoch": 1.3006903605216058, + "grad_norm": 33.68253667434569, + "learning_rate": 6.977194417205221e-06, + "loss": 3.0825, + "step": 15261 + }, + { + "epoch": 1.300775590215631, + "grad_norm": 34.505950466350946, + "learning_rate": 6.976738975077936e-06, + "loss": 2.9834, + "step": 15262 + }, + { + "epoch": 1.3008608199096565, + "grad_norm": 25.21166002588299, + "learning_rate": 6.976283513510208e-06, + "loss": 2.2826, + "step": 15263 + }, + { + "epoch": 1.300946049603682, + "grad_norm": 48.02679090529308, + "learning_rate": 6.9758280325065175e-06, + "loss": 3.1123, + "step": 15264 + }, + { + "epoch": 1.3010312792977072, + "grad_norm": 24.927547514840462, + "learning_rate": 6.975372532071345e-06, + "loss": 2.2958, + "step": 15265 + }, + { + "epoch": 1.3011165089917327, + "grad_norm": 80.74377084204244, + "learning_rate": 6.974917012209166e-06, + "loss": 2.0797, + "step": 15266 + }, + { + "epoch": 1.3012017386857582, + "grad_norm": 45.325645197230216, + "learning_rate": 6.974461472924464e-06, + "loss": 3.1558, + "step": 15267 + }, + { + "epoch": 1.3012869683797836, + "grad_norm": 38.41760891860227, + "learning_rate": 6.974005914221717e-06, + "loss": 3.1505, + "step": 15268 + }, + { + "epoch": 1.3013721980738089, + "grad_norm": 269.8770139861392, + "learning_rate": 6.973550336105409e-06, + "loss": 3.0752, + "step": 15269 + }, + { + "epoch": 1.3014574277678344, + "grad_norm": 83.59818003847346, + "learning_rate": 6.9730947385800155e-06, + "loss": 3.068, + "step": 15270 + }, + { + "epoch": 1.3015426574618596, + "grad_norm": 34.685862501064804, + "learning_rate": 6.97263912165002e-06, + "loss": 1.8764, + "step": 15271 + }, + { + "epoch": 1.301627887155885, + "grad_norm": 60.62010981263249, + "learning_rate": 6.972183485319901e-06, + "loss": 3.451, + "step": 15272 + }, + { + "epoch": 1.3017131168499105, + "grad_norm": 41.37928604046892, + "learning_rate": 6.971727829594143e-06, + "loss": 2.5277, + "step": 15273 + }, + { + "epoch": 1.301798346543936, + "grad_norm": 38.939404404778635, + "learning_rate": 6.971272154477225e-06, + "loss": 3.4153, + "step": 15274 + }, + { + "epoch": 1.3018835762379612, + "grad_norm": 62.591189084837, + "learning_rate": 6.970816459973629e-06, + "loss": 3.3067, + "step": 15275 + }, + { + "epoch": 1.3019688059319867, + "grad_norm": 49.534655841012224, + "learning_rate": 6.970360746087836e-06, + "loss": 2.7583, + "step": 15276 + }, + { + "epoch": 1.3020540356260122, + "grad_norm": 73.79269538708867, + "learning_rate": 6.969905012824327e-06, + "loss": 3.1724, + "step": 15277 + }, + { + "epoch": 1.3021392653200374, + "grad_norm": 74.67602174929668, + "learning_rate": 6.9694492601875866e-06, + "loss": 3.1493, + "step": 15278 + }, + { + "epoch": 1.302224495014063, + "grad_norm": 69.9390881831247, + "learning_rate": 6.968993488182093e-06, + "loss": 2.5284, + "step": 15279 + }, + { + "epoch": 1.3023097247080884, + "grad_norm": 44.62316632802584, + "learning_rate": 6.968537696812333e-06, + "loss": 3.4448, + "step": 15280 + }, + { + "epoch": 1.3023949544021136, + "grad_norm": 35.407023461015555, + "learning_rate": 6.968081886082786e-06, + "loss": 3.2354, + "step": 15281 + }, + { + "epoch": 1.302480184096139, + "grad_norm": 66.94804507426637, + "learning_rate": 6.9676260559979354e-06, + "loss": 2.667, + "step": 15282 + }, + { + "epoch": 1.3025654137901646, + "grad_norm": 52.86033556876595, + "learning_rate": 6.9671702065622635e-06, + "loss": 2.0626, + "step": 15283 + }, + { + "epoch": 1.3026506434841898, + "grad_norm": 39.223038798071116, + "learning_rate": 6.9667143377802575e-06, + "loss": 2.6573, + "step": 15284 + }, + { + "epoch": 1.3027358731782153, + "grad_norm": 17.414594447029287, + "learning_rate": 6.966258449656395e-06, + "loss": 1.2457, + "step": 15285 + }, + { + "epoch": 1.3028211028722407, + "grad_norm": 238.08269068068407, + "learning_rate": 6.9658025421951636e-06, + "loss": 6.2167, + "step": 15286 + }, + { + "epoch": 1.3029063325662662, + "grad_norm": 71.29024822496474, + "learning_rate": 6.965346615401043e-06, + "loss": 2.3647, + "step": 15287 + }, + { + "epoch": 1.3029915622602914, + "grad_norm": 43.908056694867604, + "learning_rate": 6.964890669278521e-06, + "loss": 2.9358, + "step": 15288 + }, + { + "epoch": 1.303076791954317, + "grad_norm": 28.37009230327704, + "learning_rate": 6.964434703832079e-06, + "loss": 2.7649, + "step": 15289 + }, + { + "epoch": 1.3031620216483422, + "grad_norm": 36.160158279378805, + "learning_rate": 6.963978719066204e-06, + "loss": 3.4709, + "step": 15290 + }, + { + "epoch": 1.3032472513423676, + "grad_norm": 35.482304344933134, + "learning_rate": 6.9635227149853775e-06, + "loss": 3.211, + "step": 15291 + }, + { + "epoch": 1.303332481036393, + "grad_norm": 34.753653802805076, + "learning_rate": 6.963066691594085e-06, + "loss": 2.6796, + "step": 15292 + }, + { + "epoch": 1.3034177107304186, + "grad_norm": 48.780840464454535, + "learning_rate": 6.962610648896813e-06, + "loss": 3.5091, + "step": 15293 + }, + { + "epoch": 1.3035029404244438, + "grad_norm": 69.16578285935579, + "learning_rate": 6.962154586898044e-06, + "loss": 3.1911, + "step": 15294 + }, + { + "epoch": 1.3035881701184693, + "grad_norm": 49.752731670997115, + "learning_rate": 6.961698505602265e-06, + "loss": 3.1761, + "step": 15295 + }, + { + "epoch": 1.3036733998124947, + "grad_norm": 44.01851469572008, + "learning_rate": 6.961242405013959e-06, + "loss": 3.1802, + "step": 15296 + }, + { + "epoch": 1.30375862950652, + "grad_norm": 35.109455533307035, + "learning_rate": 6.960786285137615e-06, + "loss": 3.0029, + "step": 15297 + }, + { + "epoch": 1.3038438592005455, + "grad_norm": 36.616305667830765, + "learning_rate": 6.960330145977717e-06, + "loss": 3.1549, + "step": 15298 + }, + { + "epoch": 1.303929088894571, + "grad_norm": 49.950081923270965, + "learning_rate": 6.95987398753875e-06, + "loss": 3.0947, + "step": 15299 + }, + { + "epoch": 1.3040143185885964, + "grad_norm": 41.95372248976614, + "learning_rate": 6.959417809825202e-06, + "loss": 2.847, + "step": 15300 + }, + { + "epoch": 1.3040995482826216, + "grad_norm": 56.93112130134997, + "learning_rate": 6.9589616128415584e-06, + "loss": 3.396, + "step": 15301 + }, + { + "epoch": 1.3041847779766471, + "grad_norm": 84.69327797567946, + "learning_rate": 6.958505396592306e-06, + "loss": 4.2591, + "step": 15302 + }, + { + "epoch": 1.3042700076706724, + "grad_norm": 33.994148296083154, + "learning_rate": 6.95804916108193e-06, + "loss": 2.2387, + "step": 15303 + }, + { + "epoch": 1.3043552373646978, + "grad_norm": 54.011885836877255, + "learning_rate": 6.95759290631492e-06, + "loss": 2.8617, + "step": 15304 + }, + { + "epoch": 1.3044404670587233, + "grad_norm": 35.888276034924935, + "learning_rate": 6.95713663229576e-06, + "loss": 2.199, + "step": 15305 + }, + { + "epoch": 1.3045256967527488, + "grad_norm": 36.79843164337457, + "learning_rate": 6.95668033902894e-06, + "loss": 2.4706, + "step": 15306 + }, + { + "epoch": 1.304610926446774, + "grad_norm": 32.72562387983018, + "learning_rate": 6.956224026518946e-06, + "loss": 3.1692, + "step": 15307 + }, + { + "epoch": 1.3046961561407995, + "grad_norm": 107.18661901530868, + "learning_rate": 6.955767694770267e-06, + "loss": 4.564, + "step": 15308 + }, + { + "epoch": 1.3047813858348247, + "grad_norm": 63.234601823282645, + "learning_rate": 6.955311343787389e-06, + "loss": 3.4754, + "step": 15309 + }, + { + "epoch": 1.3048666155288502, + "grad_norm": 57.40519294721612, + "learning_rate": 6.954854973574801e-06, + "loss": 2.1862, + "step": 15310 + }, + { + "epoch": 1.3049518452228757, + "grad_norm": 82.13428687826769, + "learning_rate": 6.95439858413699e-06, + "loss": 3.8423, + "step": 15311 + }, + { + "epoch": 1.3050370749169011, + "grad_norm": 47.92137336542449, + "learning_rate": 6.953942175478447e-06, + "loss": 3.4595, + "step": 15312 + }, + { + "epoch": 1.3051223046109264, + "grad_norm": 47.72335417482128, + "learning_rate": 6.953485747603659e-06, + "loss": 2.3, + "step": 15313 + }, + { + "epoch": 1.3052075343049518, + "grad_norm": 72.61836592440203, + "learning_rate": 6.953029300517115e-06, + "loss": 2.8711, + "step": 15314 + }, + { + "epoch": 1.3052927639989773, + "grad_norm": 34.109146412687245, + "learning_rate": 6.952572834223302e-06, + "loss": 2.5898, + "step": 15315 + }, + { + "epoch": 1.3053779936930026, + "grad_norm": 53.33197555727893, + "learning_rate": 6.952116348726714e-06, + "loss": 2.5718, + "step": 15316 + }, + { + "epoch": 1.305463223387028, + "grad_norm": 47.391677056752336, + "learning_rate": 6.951659844031836e-06, + "loss": 3.313, + "step": 15317 + }, + { + "epoch": 1.3055484530810535, + "grad_norm": 34.17307399243746, + "learning_rate": 6.9512033201431585e-06, + "loss": 2.4781, + "step": 15318 + }, + { + "epoch": 1.305633682775079, + "grad_norm": 58.005196915673984, + "learning_rate": 6.950746777065172e-06, + "loss": 2.808, + "step": 15319 + }, + { + "epoch": 1.3057189124691042, + "grad_norm": 79.98949386938921, + "learning_rate": 6.950290214802367e-06, + "loss": 3.0782, + "step": 15320 + }, + { + "epoch": 1.3058041421631297, + "grad_norm": 33.062381750808136, + "learning_rate": 6.949833633359233e-06, + "loss": 2.8068, + "step": 15321 + }, + { + "epoch": 1.305889371857155, + "grad_norm": 38.75101693188489, + "learning_rate": 6.949377032740256e-06, + "loss": 2.8623, + "step": 15322 + }, + { + "epoch": 1.3059746015511804, + "grad_norm": 67.98405604216411, + "learning_rate": 6.948920412949935e-06, + "loss": 2.8505, + "step": 15323 + }, + { + "epoch": 1.3060598312452059, + "grad_norm": 51.154578988178145, + "learning_rate": 6.948463773992755e-06, + "loss": 3.0419, + "step": 15324 + }, + { + "epoch": 1.3061450609392313, + "grad_norm": 72.33015611771037, + "learning_rate": 6.948007115873208e-06, + "loss": 2.654, + "step": 15325 + }, + { + "epoch": 1.3062302906332566, + "grad_norm": 88.51606801618709, + "learning_rate": 6.947550438595784e-06, + "loss": 2.9182, + "step": 15326 + }, + { + "epoch": 1.306315520327282, + "grad_norm": 53.22186894191279, + "learning_rate": 6.947093742164976e-06, + "loss": 2.0426, + "step": 15327 + }, + { + "epoch": 1.3064007500213073, + "grad_norm": 60.09590274514073, + "learning_rate": 6.946637026585274e-06, + "loss": 3.107, + "step": 15328 + }, + { + "epoch": 1.3064859797153328, + "grad_norm": 62.779461478703986, + "learning_rate": 6.946180291861173e-06, + "loss": 3.3999, + "step": 15329 + }, + { + "epoch": 1.3065712094093582, + "grad_norm": 52.07500657258498, + "learning_rate": 6.945723537997159e-06, + "loss": 2.8291, + "step": 15330 + }, + { + "epoch": 1.3066564391033837, + "grad_norm": 27.406836471351255, + "learning_rate": 6.945266764997729e-06, + "loss": 2.2728, + "step": 15331 + }, + { + "epoch": 1.306741668797409, + "grad_norm": 70.3537457969423, + "learning_rate": 6.944809972867372e-06, + "loss": 3.362, + "step": 15332 + }, + { + "epoch": 1.3068268984914344, + "grad_norm": 28.709225097317514, + "learning_rate": 6.944353161610584e-06, + "loss": 1.9473, + "step": 15333 + }, + { + "epoch": 1.3069121281854599, + "grad_norm": 24.663565426157884, + "learning_rate": 6.943896331231853e-06, + "loss": 1.9697, + "step": 15334 + }, + { + "epoch": 1.3069973578794851, + "grad_norm": 23.057899237884435, + "learning_rate": 6.943439481735674e-06, + "loss": 1.6992, + "step": 15335 + }, + { + "epoch": 1.3070825875735106, + "grad_norm": 58.08836610020634, + "learning_rate": 6.942982613126542e-06, + "loss": 3.2633, + "step": 15336 + }, + { + "epoch": 1.307167817267536, + "grad_norm": 37.5673249586583, + "learning_rate": 6.942525725408946e-06, + "loss": 2.8562, + "step": 15337 + }, + { + "epoch": 1.3072530469615615, + "grad_norm": 39.250313442940005, + "learning_rate": 6.942068818587382e-06, + "loss": 2.9143, + "step": 15338 + }, + { + "epoch": 1.3073382766555868, + "grad_norm": 33.27202417804465, + "learning_rate": 6.941611892666343e-06, + "loss": 2.6788, + "step": 15339 + }, + { + "epoch": 1.3074235063496122, + "grad_norm": 33.313082296522715, + "learning_rate": 6.941154947650324e-06, + "loss": 3.2912, + "step": 15340 + }, + { + "epoch": 1.3075087360436375, + "grad_norm": 66.99716177633128, + "learning_rate": 6.940697983543815e-06, + "loss": 3.4995, + "step": 15341 + }, + { + "epoch": 1.307593965737663, + "grad_norm": 42.86277537828485, + "learning_rate": 6.940241000351313e-06, + "loss": 2.8549, + "step": 15342 + }, + { + "epoch": 1.3076791954316884, + "grad_norm": 44.32609313159101, + "learning_rate": 6.939783998077312e-06, + "loss": 2.3697, + "step": 15343 + }, + { + "epoch": 1.307764425125714, + "grad_norm": 111.93610651073169, + "learning_rate": 6.939326976726307e-06, + "loss": 3.5124, + "step": 15344 + }, + { + "epoch": 1.3078496548197391, + "grad_norm": 33.050461554866445, + "learning_rate": 6.938869936302791e-06, + "loss": 2.4495, + "step": 15345 + }, + { + "epoch": 1.3079348845137646, + "grad_norm": 38.912725513872026, + "learning_rate": 6.93841287681126e-06, + "loss": 2.8669, + "step": 15346 + }, + { + "epoch": 1.30802011420779, + "grad_norm": 190.08858761058977, + "learning_rate": 6.937955798256208e-06, + "loss": 3.7121, + "step": 15347 + }, + { + "epoch": 1.3081053439018153, + "grad_norm": 46.457615566991244, + "learning_rate": 6.937498700642132e-06, + "loss": 2.4902, + "step": 15348 + }, + { + "epoch": 1.3081905735958408, + "grad_norm": 50.927689988008105, + "learning_rate": 6.937041583973525e-06, + "loss": 3.0052, + "step": 15349 + }, + { + "epoch": 1.3082758032898663, + "grad_norm": 60.10831852007876, + "learning_rate": 6.936584448254884e-06, + "loss": 2.9896, + "step": 15350 + }, + { + "epoch": 1.3083610329838917, + "grad_norm": 110.17337118001558, + "learning_rate": 6.936127293490705e-06, + "loss": 3.638, + "step": 15351 + }, + { + "epoch": 1.308446262677917, + "grad_norm": 43.16742652595003, + "learning_rate": 6.935670119685483e-06, + "loss": 1.8376, + "step": 15352 + }, + { + "epoch": 1.3085314923719424, + "grad_norm": 46.07012189602686, + "learning_rate": 6.935212926843714e-06, + "loss": 3.5983, + "step": 15353 + }, + { + "epoch": 1.3086167220659677, + "grad_norm": 41.91267171079189, + "learning_rate": 6.934755714969895e-06, + "loss": 3.0662, + "step": 15354 + }, + { + "epoch": 1.3087019517599932, + "grad_norm": 34.42726919690624, + "learning_rate": 6.934298484068525e-06, + "loss": 2.6626, + "step": 15355 + }, + { + "epoch": 1.3087871814540186, + "grad_norm": 64.42425639145951, + "learning_rate": 6.9338412341440946e-06, + "loss": 3.1982, + "step": 15356 + }, + { + "epoch": 1.308872411148044, + "grad_norm": 25.166484237894828, + "learning_rate": 6.933383965201105e-06, + "loss": 2.1016, + "step": 15357 + }, + { + "epoch": 1.3089576408420693, + "grad_norm": 58.28025353333425, + "learning_rate": 6.932926677244052e-06, + "loss": 2.7415, + "step": 15358 + }, + { + "epoch": 1.3090428705360948, + "grad_norm": 52.66951012589037, + "learning_rate": 6.932469370277435e-06, + "loss": 1.8756, + "step": 15359 + }, + { + "epoch": 1.30912810023012, + "grad_norm": 75.4557081776567, + "learning_rate": 6.932012044305747e-06, + "loss": 2.9767, + "step": 15360 + }, + { + "epoch": 1.3092133299241455, + "grad_norm": 97.59279465900279, + "learning_rate": 6.93155469933349e-06, + "loss": 4.4417, + "step": 15361 + }, + { + "epoch": 1.309298559618171, + "grad_norm": 58.81537926885939, + "learning_rate": 6.93109733536516e-06, + "loss": 2.6253, + "step": 15362 + }, + { + "epoch": 1.3093837893121965, + "grad_norm": 55.00691988352822, + "learning_rate": 6.930639952405255e-06, + "loss": 4.5663, + "step": 15363 + }, + { + "epoch": 1.3094690190062217, + "grad_norm": 85.45000607207047, + "learning_rate": 6.930182550458273e-06, + "loss": 4.0604, + "step": 15364 + }, + { + "epoch": 1.3095542487002472, + "grad_norm": 66.0923550525465, + "learning_rate": 6.929725129528713e-06, + "loss": 3.1714, + "step": 15365 + }, + { + "epoch": 1.3096394783942726, + "grad_norm": 43.92900902538885, + "learning_rate": 6.929267689621073e-06, + "loss": 3.3096, + "step": 15366 + }, + { + "epoch": 1.3097247080882979, + "grad_norm": 80.35322169735208, + "learning_rate": 6.928810230739851e-06, + "loss": 3.0707, + "step": 15367 + }, + { + "epoch": 1.3098099377823234, + "grad_norm": 39.33239647622425, + "learning_rate": 6.928352752889547e-06, + "loss": 3.0464, + "step": 15368 + }, + { + "epoch": 1.3098951674763488, + "grad_norm": 29.587288468598516, + "learning_rate": 6.927895256074661e-06, + "loss": 2.3894, + "step": 15369 + }, + { + "epoch": 1.3099803971703743, + "grad_norm": 41.74307693656256, + "learning_rate": 6.9274377402996905e-06, + "loss": 2.6393, + "step": 15370 + }, + { + "epoch": 1.3100656268643995, + "grad_norm": 33.29991380874706, + "learning_rate": 6.926980205569136e-06, + "loss": 3.2806, + "step": 15371 + }, + { + "epoch": 1.310150856558425, + "grad_norm": 35.02050688099279, + "learning_rate": 6.926522651887497e-06, + "loss": 2.5952, + "step": 15372 + }, + { + "epoch": 1.3102360862524502, + "grad_norm": 71.1269983085241, + "learning_rate": 6.926065079259272e-06, + "loss": 2.4707, + "step": 15373 + }, + { + "epoch": 1.3103213159464757, + "grad_norm": 123.66859323083096, + "learning_rate": 6.925607487688963e-06, + "loss": 3.7928, + "step": 15374 + }, + { + "epoch": 1.3104065456405012, + "grad_norm": 37.2248815170674, + "learning_rate": 6.92514987718107e-06, + "loss": 2.4183, + "step": 15375 + }, + { + "epoch": 1.3104917753345267, + "grad_norm": 40.000757054984476, + "learning_rate": 6.924692247740094e-06, + "loss": 2.406, + "step": 15376 + }, + { + "epoch": 1.310577005028552, + "grad_norm": 50.50155810493294, + "learning_rate": 6.924234599370533e-06, + "loss": 3.108, + "step": 15377 + }, + { + "epoch": 1.3106622347225774, + "grad_norm": 33.220015880906, + "learning_rate": 6.923776932076889e-06, + "loss": 2.4994, + "step": 15378 + }, + { + "epoch": 1.3107474644166026, + "grad_norm": 49.67416582684675, + "learning_rate": 6.923319245863664e-06, + "loss": 3.7715, + "step": 15379 + }, + { + "epoch": 1.310832694110628, + "grad_norm": 71.3938464674142, + "learning_rate": 6.9228615407353585e-06, + "loss": 2.6547, + "step": 15380 + }, + { + "epoch": 1.3109179238046536, + "grad_norm": 50.860142721240756, + "learning_rate": 6.922403816696473e-06, + "loss": 2.8844, + "step": 15381 + }, + { + "epoch": 1.311003153498679, + "grad_norm": 65.04011392115405, + "learning_rate": 6.921946073751511e-06, + "loss": 3.667, + "step": 15382 + }, + { + "epoch": 1.3110883831927043, + "grad_norm": 44.041491429868856, + "learning_rate": 6.921488311904973e-06, + "loss": 3.5455, + "step": 15383 + }, + { + "epoch": 1.3111736128867297, + "grad_norm": 39.59508972701051, + "learning_rate": 6.92103053116136e-06, + "loss": 2.9204, + "step": 15384 + }, + { + "epoch": 1.3112588425807552, + "grad_norm": 57.99614796754619, + "learning_rate": 6.920572731525176e-06, + "loss": 3.9229, + "step": 15385 + }, + { + "epoch": 1.3113440722747804, + "grad_norm": 76.30505391300767, + "learning_rate": 6.920114913000922e-06, + "loss": 3.2194, + "step": 15386 + }, + { + "epoch": 1.311429301968806, + "grad_norm": 66.28052250554494, + "learning_rate": 6.9196570755931e-06, + "loss": 3.1803, + "step": 15387 + }, + { + "epoch": 1.3115145316628314, + "grad_norm": 96.37090672006896, + "learning_rate": 6.919199219306214e-06, + "loss": 3.2872, + "step": 15388 + }, + { + "epoch": 1.3115997613568569, + "grad_norm": 69.92856631999564, + "learning_rate": 6.918741344144767e-06, + "loss": 3.7445, + "step": 15389 + }, + { + "epoch": 1.311684991050882, + "grad_norm": 42.910626772390444, + "learning_rate": 6.918283450113261e-06, + "loss": 3.0583, + "step": 15390 + }, + { + "epoch": 1.3117702207449076, + "grad_norm": 28.99324126839763, + "learning_rate": 6.9178255372162e-06, + "loss": 1.8791, + "step": 15391 + }, + { + "epoch": 1.3118554504389328, + "grad_norm": 45.64980057453927, + "learning_rate": 6.917367605458085e-06, + "loss": 2.8346, + "step": 15392 + }, + { + "epoch": 1.3119406801329583, + "grad_norm": 88.18818688873094, + "learning_rate": 6.916909654843423e-06, + "loss": 3.1423, + "step": 15393 + }, + { + "epoch": 1.3120259098269837, + "grad_norm": 40.203873042571864, + "learning_rate": 6.916451685376717e-06, + "loss": 2.6658, + "step": 15394 + }, + { + "epoch": 1.3121111395210092, + "grad_norm": 29.627383368510188, + "learning_rate": 6.9159936970624685e-06, + "loss": 1.3659, + "step": 15395 + }, + { + "epoch": 1.3121963692150345, + "grad_norm": 54.44342799137681, + "learning_rate": 6.915535689905184e-06, + "loss": 3.1827, + "step": 15396 + }, + { + "epoch": 1.31228159890906, + "grad_norm": 34.802731125469336, + "learning_rate": 6.915077663909366e-06, + "loss": 1.8097, + "step": 15397 + }, + { + "epoch": 1.3123668286030852, + "grad_norm": 62.47735855314882, + "learning_rate": 6.914619619079524e-06, + "loss": 3.3281, + "step": 15398 + }, + { + "epoch": 1.3124520582971106, + "grad_norm": 34.44250379538491, + "learning_rate": 6.914161555420155e-06, + "loss": 1.7546, + "step": 15399 + }, + { + "epoch": 1.3125372879911361, + "grad_norm": 41.11185790884989, + "learning_rate": 6.913703472935768e-06, + "loss": 3.029, + "step": 15400 + }, + { + "epoch": 1.3126225176851616, + "grad_norm": 50.92438255694091, + "learning_rate": 6.913245371630869e-06, + "loss": 2.9603, + "step": 15401 + }, + { + "epoch": 1.3127077473791868, + "grad_norm": 36.802059293826034, + "learning_rate": 6.912787251509963e-06, + "loss": 3.2655, + "step": 15402 + }, + { + "epoch": 1.3127929770732123, + "grad_norm": 45.034230693278886, + "learning_rate": 6.912329112577552e-06, + "loss": 3.4094, + "step": 15403 + }, + { + "epoch": 1.3128782067672378, + "grad_norm": 39.48987352078413, + "learning_rate": 6.911870954838144e-06, + "loss": 2.9125, + "step": 15404 + }, + { + "epoch": 1.312963436461263, + "grad_norm": 75.55616188596771, + "learning_rate": 6.911412778296247e-06, + "loss": 2.8197, + "step": 15405 + }, + { + "epoch": 1.3130486661552885, + "grad_norm": 41.97255970579958, + "learning_rate": 6.910954582956364e-06, + "loss": 3.3264, + "step": 15406 + }, + { + "epoch": 1.313133895849314, + "grad_norm": 36.19766003915241, + "learning_rate": 6.910496368823003e-06, + "loss": 3.7437, + "step": 15407 + }, + { + "epoch": 1.3132191255433394, + "grad_norm": 32.38841699116307, + "learning_rate": 6.910038135900668e-06, + "loss": 3.0019, + "step": 15408 + }, + { + "epoch": 1.3133043552373647, + "grad_norm": 48.27738380789844, + "learning_rate": 6.9095798841938665e-06, + "loss": 2.6707, + "step": 15409 + }, + { + "epoch": 1.3133895849313901, + "grad_norm": 27.524815444096358, + "learning_rate": 6.909121613707107e-06, + "loss": 2.6525, + "step": 15410 + }, + { + "epoch": 1.3134748146254154, + "grad_norm": 16.48125235161562, + "learning_rate": 6.9086633244448956e-06, + "loss": 1.0243, + "step": 15411 + }, + { + "epoch": 1.3135600443194408, + "grad_norm": 74.64293006733126, + "learning_rate": 6.908205016411736e-06, + "loss": 3.1563, + "step": 15412 + }, + { + "epoch": 1.3136452740134663, + "grad_norm": 36.416592104507586, + "learning_rate": 6.90774668961214e-06, + "loss": 2.8189, + "step": 15413 + }, + { + "epoch": 1.3137305037074918, + "grad_norm": 73.24644764055027, + "learning_rate": 6.907288344050614e-06, + "loss": 4.6641, + "step": 15414 + }, + { + "epoch": 1.313815733401517, + "grad_norm": 41.538534572386794, + "learning_rate": 6.906829979731664e-06, + "loss": 2.2433, + "step": 15415 + }, + { + "epoch": 1.3139009630955425, + "grad_norm": 87.39628521983907, + "learning_rate": 6.9063715966597975e-06, + "loss": 2.6726, + "step": 15416 + }, + { + "epoch": 1.313986192789568, + "grad_norm": 48.04856238212366, + "learning_rate": 6.905913194839526e-06, + "loss": 2.6884, + "step": 15417 + }, + { + "epoch": 1.3140714224835932, + "grad_norm": 32.41939289963997, + "learning_rate": 6.905454774275355e-06, + "loss": 3.3254, + "step": 15418 + }, + { + "epoch": 1.3141566521776187, + "grad_norm": 49.37984860081931, + "learning_rate": 6.904996334971795e-06, + "loss": 3.1106, + "step": 15419 + }, + { + "epoch": 1.3142418818716441, + "grad_norm": 77.02091355232689, + "learning_rate": 6.90453787693335e-06, + "loss": 2.3587, + "step": 15420 + }, + { + "epoch": 1.3143271115656696, + "grad_norm": 53.082718895021486, + "learning_rate": 6.904079400164534e-06, + "loss": 2.5974, + "step": 15421 + }, + { + "epoch": 1.3144123412596949, + "grad_norm": 45.2050600666699, + "learning_rate": 6.903620904669855e-06, + "loss": 2.4411, + "step": 15422 + }, + { + "epoch": 1.3144975709537203, + "grad_norm": 43.613075191237755, + "learning_rate": 6.903162390453818e-06, + "loss": 3.8184, + "step": 15423 + }, + { + "epoch": 1.3145828006477456, + "grad_norm": 30.713775778368387, + "learning_rate": 6.902703857520936e-06, + "loss": 2.3636, + "step": 15424 + }, + { + "epoch": 1.314668030341771, + "grad_norm": 55.98283233266675, + "learning_rate": 6.902245305875717e-06, + "loss": 2.9771, + "step": 15425 + }, + { + "epoch": 1.3147532600357965, + "grad_norm": 31.756092032927484, + "learning_rate": 6.901786735522673e-06, + "loss": 2.345, + "step": 15426 + }, + { + "epoch": 1.314838489729822, + "grad_norm": 54.614381767174436, + "learning_rate": 6.901328146466311e-06, + "loss": 2.4226, + "step": 15427 + }, + { + "epoch": 1.3149237194238472, + "grad_norm": 34.2459082623784, + "learning_rate": 6.9008695387111414e-06, + "loss": 2.5655, + "step": 15428 + }, + { + "epoch": 1.3150089491178727, + "grad_norm": 85.56071166848906, + "learning_rate": 6.900410912261675e-06, + "loss": 3.3394, + "step": 15429 + }, + { + "epoch": 1.315094178811898, + "grad_norm": 29.406669496772754, + "learning_rate": 6.899952267122425e-06, + "loss": 2.3516, + "step": 15430 + }, + { + "epoch": 1.3151794085059234, + "grad_norm": 219.58812982113008, + "learning_rate": 6.899493603297898e-06, + "loss": 3.07, + "step": 15431 + }, + { + "epoch": 1.3152646381999489, + "grad_norm": 65.55129664830328, + "learning_rate": 6.899034920792605e-06, + "loss": 3.6393, + "step": 15432 + }, + { + "epoch": 1.3153498678939743, + "grad_norm": 27.77514568289307, + "learning_rate": 6.8985762196110594e-06, + "loss": 2.2605, + "step": 15433 + }, + { + "epoch": 1.3154350975879996, + "grad_norm": 66.87057639379299, + "learning_rate": 6.898117499757772e-06, + "loss": 3.7197, + "step": 15434 + }, + { + "epoch": 1.315520327282025, + "grad_norm": 67.63556458686043, + "learning_rate": 6.897658761237251e-06, + "loss": 3.6904, + "step": 15435 + }, + { + "epoch": 1.3156055569760505, + "grad_norm": 104.10824629622483, + "learning_rate": 6.897200004054012e-06, + "loss": 4.2205, + "step": 15436 + }, + { + "epoch": 1.3156907866700758, + "grad_norm": 56.23712456960796, + "learning_rate": 6.896741228212564e-06, + "loss": 2.9897, + "step": 15437 + }, + { + "epoch": 1.3157760163641012, + "grad_norm": 68.3556800950189, + "learning_rate": 6.8962824337174204e-06, + "loss": 3.1619, + "step": 15438 + }, + { + "epoch": 1.3158612460581267, + "grad_norm": 41.093819383095735, + "learning_rate": 6.895823620573091e-06, + "loss": 3.4303, + "step": 15439 + }, + { + "epoch": 1.3159464757521522, + "grad_norm": 50.52675883981223, + "learning_rate": 6.8953647887840915e-06, + "loss": 2.8891, + "step": 15440 + }, + { + "epoch": 1.3160317054461774, + "grad_norm": 253.14318440756816, + "learning_rate": 6.894905938354932e-06, + "loss": 2.7871, + "step": 15441 + }, + { + "epoch": 1.316116935140203, + "grad_norm": 39.40559120743196, + "learning_rate": 6.894447069290124e-06, + "loss": 2.9857, + "step": 15442 + }, + { + "epoch": 1.3162021648342281, + "grad_norm": 62.134514007108905, + "learning_rate": 6.893988181594183e-06, + "loss": 3.4073, + "step": 15443 + }, + { + "epoch": 1.3162873945282536, + "grad_norm": 60.47626323929623, + "learning_rate": 6.893529275271621e-06, + "loss": 1.7195, + "step": 15444 + }, + { + "epoch": 1.316372624222279, + "grad_norm": 39.916870492381804, + "learning_rate": 6.893070350326952e-06, + "loss": 3.7141, + "step": 15445 + }, + { + "epoch": 1.3164578539163045, + "grad_norm": 41.706595651148255, + "learning_rate": 6.892611406764687e-06, + "loss": 1.9946, + "step": 15446 + }, + { + "epoch": 1.3165430836103298, + "grad_norm": 70.94995429953781, + "learning_rate": 6.89215244458934e-06, + "loss": 2.8762, + "step": 15447 + }, + { + "epoch": 1.3166283133043553, + "grad_norm": 38.95680167589775, + "learning_rate": 6.891693463805427e-06, + "loss": 2.2886, + "step": 15448 + }, + { + "epoch": 1.3167135429983805, + "grad_norm": 72.16459271553474, + "learning_rate": 6.89123446441746e-06, + "loss": 2.7976, + "step": 15449 + }, + { + "epoch": 1.316798772692406, + "grad_norm": 43.2191355723074, + "learning_rate": 6.890775446429955e-06, + "loss": 3.5164, + "step": 15450 + }, + { + "epoch": 1.3168840023864314, + "grad_norm": 27.20558491858977, + "learning_rate": 6.890316409847422e-06, + "loss": 2.6751, + "step": 15451 + }, + { + "epoch": 1.316969232080457, + "grad_norm": 63.50936364820601, + "learning_rate": 6.8898573546743805e-06, + "loss": 2.381, + "step": 15452 + }, + { + "epoch": 1.3170544617744822, + "grad_norm": 32.16710600422766, + "learning_rate": 6.889398280915341e-06, + "loss": 2.7816, + "step": 15453 + }, + { + "epoch": 1.3171396914685076, + "grad_norm": 38.75462666553954, + "learning_rate": 6.8889391885748225e-06, + "loss": 2.5827, + "step": 15454 + }, + { + "epoch": 1.317224921162533, + "grad_norm": 61.53276529159322, + "learning_rate": 6.8884800776573355e-06, + "loss": 2.6105, + "step": 15455 + }, + { + "epoch": 1.3173101508565583, + "grad_norm": 36.80903391441419, + "learning_rate": 6.888020948167398e-06, + "loss": 2.292, + "step": 15456 + }, + { + "epoch": 1.3173953805505838, + "grad_norm": 81.23419956648375, + "learning_rate": 6.887561800109526e-06, + "loss": 3.0941, + "step": 15457 + }, + { + "epoch": 1.3174806102446093, + "grad_norm": 38.01519997180349, + "learning_rate": 6.8871026334882326e-06, + "loss": 2.7536, + "step": 15458 + }, + { + "epoch": 1.3175658399386347, + "grad_norm": 53.001682861413215, + "learning_rate": 6.886643448308033e-06, + "loss": 3.0189, + "step": 15459 + }, + { + "epoch": 1.31765106963266, + "grad_norm": 33.18629878613292, + "learning_rate": 6.886184244573448e-06, + "loss": 2.5073, + "step": 15460 + }, + { + "epoch": 1.3177362993266855, + "grad_norm": 79.8786784567076, + "learning_rate": 6.885725022288989e-06, + "loss": 3.6275, + "step": 15461 + }, + { + "epoch": 1.3178215290207107, + "grad_norm": 43.756903299826035, + "learning_rate": 6.885265781459174e-06, + "loss": 3.1867, + "step": 15462 + }, + { + "epoch": 1.3179067587147362, + "grad_norm": 99.32994390004237, + "learning_rate": 6.884806522088518e-06, + "loss": 4.2103, + "step": 15463 + }, + { + "epoch": 1.3179919884087616, + "grad_norm": 66.13700776699146, + "learning_rate": 6.88434724418154e-06, + "loss": 2.7295, + "step": 15464 + }, + { + "epoch": 1.318077218102787, + "grad_norm": 48.83335707732712, + "learning_rate": 6.8838879477427554e-06, + "loss": 3.4131, + "step": 15465 + }, + { + "epoch": 1.3181624477968124, + "grad_norm": 57.69674334076802, + "learning_rate": 6.883428632776682e-06, + "loss": 3.8878, + "step": 15466 + }, + { + "epoch": 1.3182476774908378, + "grad_norm": 71.79893369408812, + "learning_rate": 6.882969299287835e-06, + "loss": 3.8037, + "step": 15467 + }, + { + "epoch": 1.318332907184863, + "grad_norm": 66.72234130943265, + "learning_rate": 6.882509947280733e-06, + "loss": 3.2287, + "step": 15468 + }, + { + "epoch": 1.3184181368788885, + "grad_norm": 34.456882562555954, + "learning_rate": 6.882050576759895e-06, + "loss": 2.2236, + "step": 15469 + }, + { + "epoch": 1.318503366572914, + "grad_norm": 50.27795329047637, + "learning_rate": 6.881591187729838e-06, + "loss": 3.8295, + "step": 15470 + }, + { + "epoch": 1.3185885962669395, + "grad_norm": 43.18367345085532, + "learning_rate": 6.881131780195078e-06, + "loss": 2.8732, + "step": 15471 + }, + { + "epoch": 1.3186738259609647, + "grad_norm": 54.52130046035351, + "learning_rate": 6.880672354160134e-06, + "loss": 3.5445, + "step": 15472 + }, + { + "epoch": 1.3187590556549902, + "grad_norm": 105.48230608952937, + "learning_rate": 6.880212909629527e-06, + "loss": 3.6693, + "step": 15473 + }, + { + "epoch": 1.3188442853490157, + "grad_norm": 41.23652525740115, + "learning_rate": 6.879753446607771e-06, + "loss": 2.8665, + "step": 15474 + }, + { + "epoch": 1.318929515043041, + "grad_norm": 29.242036062829868, + "learning_rate": 6.879293965099387e-06, + "loss": 2.457, + "step": 15475 + }, + { + "epoch": 1.3190147447370664, + "grad_norm": 59.62297419632032, + "learning_rate": 6.878834465108895e-06, + "loss": 2.8476, + "step": 15476 + }, + { + "epoch": 1.3190999744310918, + "grad_norm": 125.51310816145163, + "learning_rate": 6.878374946640811e-06, + "loss": 3.8465, + "step": 15477 + }, + { + "epoch": 1.3191852041251173, + "grad_norm": 40.88720679367695, + "learning_rate": 6.877915409699656e-06, + "loss": 2.8997, + "step": 15478 + }, + { + "epoch": 1.3192704338191426, + "grad_norm": 59.44357759716442, + "learning_rate": 6.8774558542899495e-06, + "loss": 3.0525, + "step": 15479 + }, + { + "epoch": 1.319355663513168, + "grad_norm": 54.474372512286415, + "learning_rate": 6.876996280416211e-06, + "loss": 3.8253, + "step": 15480 + }, + { + "epoch": 1.3194408932071933, + "grad_norm": 39.42030444863785, + "learning_rate": 6.8765366880829595e-06, + "loss": 3.6019, + "step": 15481 + }, + { + "epoch": 1.3195261229012187, + "grad_norm": 75.43301097395029, + "learning_rate": 6.876077077294716e-06, + "loss": 2.7112, + "step": 15482 + }, + { + "epoch": 1.3196113525952442, + "grad_norm": 44.26909615829017, + "learning_rate": 6.875617448055999e-06, + "loss": 3.1104, + "step": 15483 + }, + { + "epoch": 1.3196965822892697, + "grad_norm": 109.63821558766134, + "learning_rate": 6.875157800371332e-06, + "loss": 3.6147, + "step": 15484 + }, + { + "epoch": 1.319781811983295, + "grad_norm": 32.71721203314056, + "learning_rate": 6.87469813424523e-06, + "loss": 2.8822, + "step": 15485 + }, + { + "epoch": 1.3198670416773204, + "grad_norm": 33.54907745459749, + "learning_rate": 6.8742384496822175e-06, + "loss": 2.8236, + "step": 15486 + }, + { + "epoch": 1.3199522713713459, + "grad_norm": 102.9580607450543, + "learning_rate": 6.873778746686816e-06, + "loss": 4.8429, + "step": 15487 + }, + { + "epoch": 1.320037501065371, + "grad_norm": 43.173626799027666, + "learning_rate": 6.873319025263545e-06, + "loss": 1.9483, + "step": 15488 + }, + { + "epoch": 1.3201227307593966, + "grad_norm": 29.067727276829807, + "learning_rate": 6.872859285416924e-06, + "loss": 2.8001, + "step": 15489 + }, + { + "epoch": 1.320207960453422, + "grad_norm": 39.37286309465384, + "learning_rate": 6.872399527151478e-06, + "loss": 3.02, + "step": 15490 + }, + { + "epoch": 1.3202931901474475, + "grad_norm": 53.459489798849795, + "learning_rate": 6.871939750471724e-06, + "loss": 3.2098, + "step": 15491 + }, + { + "epoch": 1.3203784198414728, + "grad_norm": 83.17744198351285, + "learning_rate": 6.87147995538219e-06, + "loss": 4.8847, + "step": 15492 + }, + { + "epoch": 1.3204636495354982, + "grad_norm": 56.671674551784356, + "learning_rate": 6.8710201418873924e-06, + "loss": 2.9756, + "step": 15493 + }, + { + "epoch": 1.3205488792295235, + "grad_norm": 60.62563943702311, + "learning_rate": 6.870560309991855e-06, + "loss": 2.9033, + "step": 15494 + }, + { + "epoch": 1.320634108923549, + "grad_norm": 33.56385005121343, + "learning_rate": 6.870100459700101e-06, + "loss": 2.3992, + "step": 15495 + }, + { + "epoch": 1.3207193386175744, + "grad_norm": 60.7236706975228, + "learning_rate": 6.869640591016651e-06, + "loss": 2.6445, + "step": 15496 + }, + { + "epoch": 1.3208045683115999, + "grad_norm": 73.30264716347507, + "learning_rate": 6.86918070394603e-06, + "loss": 3.8575, + "step": 15497 + }, + { + "epoch": 1.3208897980056251, + "grad_norm": 47.04142415884409, + "learning_rate": 6.868720798492758e-06, + "loss": 3.4561, + "step": 15498 + }, + { + "epoch": 1.3209750276996506, + "grad_norm": 67.84449938636465, + "learning_rate": 6.868260874661361e-06, + "loss": 2.5933, + "step": 15499 + }, + { + "epoch": 1.3210602573936758, + "grad_norm": 61.811024571263665, + "learning_rate": 6.86780093245636e-06, + "loss": 2.8629, + "step": 15500 + }, + { + "epoch": 1.3211454870877013, + "grad_norm": 61.3804691014901, + "learning_rate": 6.86734097188228e-06, + "loss": 3.4582, + "step": 15501 + }, + { + "epoch": 1.3212307167817268, + "grad_norm": 51.21021743982049, + "learning_rate": 6.866880992943641e-06, + "loss": 1.9959, + "step": 15502 + }, + { + "epoch": 1.3213159464757522, + "grad_norm": 26.046067983476753, + "learning_rate": 6.866420995644972e-06, + "loss": 2.6349, + "step": 15503 + }, + { + "epoch": 1.3214011761697775, + "grad_norm": 64.01036660768388, + "learning_rate": 6.865960979990792e-06, + "loss": 2.819, + "step": 15504 + }, + { + "epoch": 1.321486405863803, + "grad_norm": 46.089593427458574, + "learning_rate": 6.865500945985629e-06, + "loss": 2.2989, + "step": 15505 + }, + { + "epoch": 1.3215716355578284, + "grad_norm": 40.56817073679039, + "learning_rate": 6.865040893634004e-06, + "loss": 2.9452, + "step": 15506 + }, + { + "epoch": 1.3216568652518537, + "grad_norm": 30.66698408747545, + "learning_rate": 6.864580822940444e-06, + "loss": 1.9851, + "step": 15507 + }, + { + "epoch": 1.3217420949458791, + "grad_norm": 72.67647858486347, + "learning_rate": 6.864120733909472e-06, + "loss": 3.5493, + "step": 15508 + }, + { + "epoch": 1.3218273246399046, + "grad_norm": 126.00580313433741, + "learning_rate": 6.863660626545614e-06, + "loss": 3.0607, + "step": 15509 + }, + { + "epoch": 1.32191255433393, + "grad_norm": 40.492758521119846, + "learning_rate": 6.8632005008533945e-06, + "loss": 2.3953, + "step": 15510 + }, + { + "epoch": 1.3219977840279553, + "grad_norm": 79.47803402324158, + "learning_rate": 6.862740356837336e-06, + "loss": 4.7262, + "step": 15511 + }, + { + "epoch": 1.3220830137219808, + "grad_norm": 32.59057405629483, + "learning_rate": 6.862280194501969e-06, + "loss": 2.2158, + "step": 15512 + }, + { + "epoch": 1.322168243416006, + "grad_norm": 68.47058394523114, + "learning_rate": 6.861820013851815e-06, + "loss": 3.0024, + "step": 15513 + }, + { + "epoch": 1.3222534731100315, + "grad_norm": 85.0650547782341, + "learning_rate": 6.861359814891402e-06, + "loss": 2.7436, + "step": 15514 + }, + { + "epoch": 1.322338702804057, + "grad_norm": 60.95746956540886, + "learning_rate": 6.860899597625253e-06, + "loss": 2.7259, + "step": 15515 + }, + { + "epoch": 1.3224239324980824, + "grad_norm": 97.25182312845988, + "learning_rate": 6.860439362057898e-06, + "loss": 2.8361, + "step": 15516 + }, + { + "epoch": 1.3225091621921077, + "grad_norm": 33.87465111361896, + "learning_rate": 6.859979108193859e-06, + "loss": 2.6449, + "step": 15517 + }, + { + "epoch": 1.3225943918861331, + "grad_norm": 39.748507498188864, + "learning_rate": 6.859518836037666e-06, + "loss": 2.8608, + "step": 15518 + }, + { + "epoch": 1.3226796215801584, + "grad_norm": 37.96091959749809, + "learning_rate": 6.859058545593844e-06, + "loss": 2.8469, + "step": 15519 + }, + { + "epoch": 1.3227648512741839, + "grad_norm": 54.42066659573256, + "learning_rate": 6.85859823686692e-06, + "loss": 3.1242, + "step": 15520 + }, + { + "epoch": 1.3228500809682093, + "grad_norm": 42.20364908761172, + "learning_rate": 6.85813790986142e-06, + "loss": 3.2459, + "step": 15521 + }, + { + "epoch": 1.3229353106622348, + "grad_norm": 34.89686349845584, + "learning_rate": 6.857677564581873e-06, + "loss": 2.7355, + "step": 15522 + }, + { + "epoch": 1.32302054035626, + "grad_norm": 29.88922833097484, + "learning_rate": 6.857217201032804e-06, + "loss": 2.4172, + "step": 15523 + }, + { + "epoch": 1.3231057700502855, + "grad_norm": 60.51424680266062, + "learning_rate": 6.856756819218744e-06, + "loss": 3.3286, + "step": 15524 + }, + { + "epoch": 1.323190999744311, + "grad_norm": 45.37728862433163, + "learning_rate": 6.856296419144217e-06, + "loss": 2.9669, + "step": 15525 + }, + { + "epoch": 1.3232762294383362, + "grad_norm": 133.13862638281432, + "learning_rate": 6.855836000813753e-06, + "loss": 4.6823, + "step": 15526 + }, + { + "epoch": 1.3233614591323617, + "grad_norm": 47.28928895307192, + "learning_rate": 6.855375564231879e-06, + "loss": 3.0578, + "step": 15527 + }, + { + "epoch": 1.3234466888263872, + "grad_norm": 36.40414642383586, + "learning_rate": 6.854915109403123e-06, + "loss": 2.9452, + "step": 15528 + }, + { + "epoch": 1.3235319185204126, + "grad_norm": 52.66213283441907, + "learning_rate": 6.854454636332014e-06, + "loss": 3.0135, + "step": 15529 + }, + { + "epoch": 1.3236171482144379, + "grad_norm": 61.50247400887682, + "learning_rate": 6.853994145023082e-06, + "loss": 3.0135, + "step": 15530 + }, + { + "epoch": 1.3237023779084633, + "grad_norm": 73.81762911891404, + "learning_rate": 6.853533635480854e-06, + "loss": 2.9291, + "step": 15531 + }, + { + "epoch": 1.3237876076024886, + "grad_norm": 16.677468941669925, + "learning_rate": 6.853073107709858e-06, + "loss": 1.1925, + "step": 15532 + }, + { + "epoch": 1.323872837296514, + "grad_norm": 43.03295472073115, + "learning_rate": 6.852612561714624e-06, + "loss": 3.4991, + "step": 15533 + }, + { + "epoch": 1.3239580669905395, + "grad_norm": 42.86333850396276, + "learning_rate": 6.852151997499682e-06, + "loss": 2.822, + "step": 15534 + }, + { + "epoch": 1.324043296684565, + "grad_norm": 39.94860504299173, + "learning_rate": 6.8516914150695624e-06, + "loss": 2.2695, + "step": 15535 + }, + { + "epoch": 1.3241285263785902, + "grad_norm": 92.75981565096437, + "learning_rate": 6.851230814428793e-06, + "loss": 4.3728, + "step": 15536 + }, + { + "epoch": 1.3242137560726157, + "grad_norm": 45.65980766791574, + "learning_rate": 6.850770195581902e-06, + "loss": 3.2229, + "step": 15537 + }, + { + "epoch": 1.3242989857666412, + "grad_norm": 44.364273166078995, + "learning_rate": 6.850309558533425e-06, + "loss": 3.414, + "step": 15538 + }, + { + "epoch": 1.3243842154606664, + "grad_norm": 59.08984443287027, + "learning_rate": 6.849848903287887e-06, + "loss": 3.424, + "step": 15539 + }, + { + "epoch": 1.324469445154692, + "grad_norm": 46.24426253556902, + "learning_rate": 6.84938822984982e-06, + "loss": 3.2023, + "step": 15540 + }, + { + "epoch": 1.3245546748487174, + "grad_norm": 48.68590058661383, + "learning_rate": 6.848927538223754e-06, + "loss": 2.7838, + "step": 15541 + }, + { + "epoch": 1.3246399045427428, + "grad_norm": 49.992178231998516, + "learning_rate": 6.848466828414222e-06, + "loss": 3.0244, + "step": 15542 + }, + { + "epoch": 1.324725134236768, + "grad_norm": 63.272504000838815, + "learning_rate": 6.848006100425753e-06, + "loss": 2.2221, + "step": 15543 + }, + { + "epoch": 1.3248103639307935, + "grad_norm": 60.521876378114115, + "learning_rate": 6.847545354262878e-06, + "loss": 3.6208, + "step": 15544 + }, + { + "epoch": 1.3248955936248188, + "grad_norm": 37.16242864158308, + "learning_rate": 6.847084589930128e-06, + "loss": 2.9643, + "step": 15545 + }, + { + "epoch": 1.3249808233188443, + "grad_norm": 23.075861060530045, + "learning_rate": 6.846623807432035e-06, + "loss": 1.7896, + "step": 15546 + }, + { + "epoch": 1.3250660530128697, + "grad_norm": 40.26533434321775, + "learning_rate": 6.846163006773131e-06, + "loss": 1.9245, + "step": 15547 + }, + { + "epoch": 1.3251512827068952, + "grad_norm": 46.84968643600917, + "learning_rate": 6.845702187957948e-06, + "loss": 2.1159, + "step": 15548 + }, + { + "epoch": 1.3252365124009204, + "grad_norm": 38.37287329811378, + "learning_rate": 6.845241350991015e-06, + "loss": 3.2267, + "step": 15549 + }, + { + "epoch": 1.325321742094946, + "grad_norm": 42.58373681512488, + "learning_rate": 6.844780495876871e-06, + "loss": 3.2825, + "step": 15550 + }, + { + "epoch": 1.3254069717889712, + "grad_norm": 37.48269842010587, + "learning_rate": 6.844319622620039e-06, + "loss": 3.0534, + "step": 15551 + }, + { + "epoch": 1.3254922014829966, + "grad_norm": 50.699887082604064, + "learning_rate": 6.84385873122506e-06, + "loss": 2.0633, + "step": 15552 + }, + { + "epoch": 1.325577431177022, + "grad_norm": 263.4468721093401, + "learning_rate": 6.843397821696461e-06, + "loss": 2.7473, + "step": 15553 + }, + { + "epoch": 1.3256626608710476, + "grad_norm": 33.71384185252614, + "learning_rate": 6.8429368940387775e-06, + "loss": 3.0622, + "step": 15554 + }, + { + "epoch": 1.3257478905650728, + "grad_norm": 48.09883562291585, + "learning_rate": 6.842475948256543e-06, + "loss": 3.1824, + "step": 15555 + }, + { + "epoch": 1.3258331202590983, + "grad_norm": 61.11099670153585, + "learning_rate": 6.8420149843542885e-06, + "loss": 3.7941, + "step": 15556 + }, + { + "epoch": 1.3259183499531237, + "grad_norm": 52.34720383506038, + "learning_rate": 6.841554002336548e-06, + "loss": 3.1583, + "step": 15557 + }, + { + "epoch": 1.326003579647149, + "grad_norm": 44.62189170392099, + "learning_rate": 6.841093002207857e-06, + "loss": 3.9334, + "step": 15558 + }, + { + "epoch": 1.3260888093411745, + "grad_norm": 40.73444015044837, + "learning_rate": 6.840631983972748e-06, + "loss": 3.0382, + "step": 15559 + }, + { + "epoch": 1.3261740390352, + "grad_norm": 78.85636866618034, + "learning_rate": 6.840170947635753e-06, + "loss": 3.0265, + "step": 15560 + }, + { + "epoch": 1.3262592687292254, + "grad_norm": 45.03327780844892, + "learning_rate": 6.839709893201409e-06, + "loss": 2.1068, + "step": 15561 + }, + { + "epoch": 1.3263444984232506, + "grad_norm": 49.62841241646865, + "learning_rate": 6.839248820674249e-06, + "loss": 3.7819, + "step": 15562 + }, + { + "epoch": 1.326429728117276, + "grad_norm": 54.38142713798759, + "learning_rate": 6.838787730058809e-06, + "loss": 4.3105, + "step": 15563 + }, + { + "epoch": 1.3265149578113014, + "grad_norm": 55.763201692359196, + "learning_rate": 6.8383266213596215e-06, + "loss": 3.4756, + "step": 15564 + }, + { + "epoch": 1.3266001875053268, + "grad_norm": 43.712179780912486, + "learning_rate": 6.837865494581222e-06, + "loss": 4.0495, + "step": 15565 + }, + { + "epoch": 1.3266854171993523, + "grad_norm": 41.47474200144491, + "learning_rate": 6.837404349728145e-06, + "loss": 3.5565, + "step": 15566 + }, + { + "epoch": 1.3267706468933778, + "grad_norm": 47.880793980944176, + "learning_rate": 6.836943186804927e-06, + "loss": 2.1419, + "step": 15567 + }, + { + "epoch": 1.326855876587403, + "grad_norm": 70.46034319468386, + "learning_rate": 6.836482005816103e-06, + "loss": 3.672, + "step": 15568 + }, + { + "epoch": 1.3269411062814285, + "grad_norm": 34.107813730578066, + "learning_rate": 6.836020806766208e-06, + "loss": 2.1872, + "step": 15569 + }, + { + "epoch": 1.3270263359754537, + "grad_norm": 29.043242939807836, + "learning_rate": 6.835559589659778e-06, + "loss": 2.0077, + "step": 15570 + }, + { + "epoch": 1.3271115656694792, + "grad_norm": 44.49691361190839, + "learning_rate": 6.835098354501348e-06, + "loss": 3.875, + "step": 15571 + }, + { + "epoch": 1.3271967953635047, + "grad_norm": 44.233153777840435, + "learning_rate": 6.834637101295456e-06, + "loss": 2.7199, + "step": 15572 + }, + { + "epoch": 1.3272820250575301, + "grad_norm": 82.39979585657312, + "learning_rate": 6.834175830046636e-06, + "loss": 3.6934, + "step": 15573 + }, + { + "epoch": 1.3273672547515554, + "grad_norm": 55.30506640092584, + "learning_rate": 6.833714540759428e-06, + "loss": 2.9756, + "step": 15574 + }, + { + "epoch": 1.3274524844455808, + "grad_norm": 69.5842945986976, + "learning_rate": 6.833253233438364e-06, + "loss": 3.5974, + "step": 15575 + }, + { + "epoch": 1.3275377141396063, + "grad_norm": 77.9645034213745, + "learning_rate": 6.832791908087982e-06, + "loss": 3.5381, + "step": 15576 + }, + { + "epoch": 1.3276229438336316, + "grad_norm": 35.54946696284759, + "learning_rate": 6.8323305647128215e-06, + "loss": 2.6259, + "step": 15577 + }, + { + "epoch": 1.327708173527657, + "grad_norm": 132.7130673787493, + "learning_rate": 6.831869203317418e-06, + "loss": 2.9801, + "step": 15578 + }, + { + "epoch": 1.3277934032216825, + "grad_norm": 67.90769548903623, + "learning_rate": 6.831407823906309e-06, + "loss": 3.0537, + "step": 15579 + }, + { + "epoch": 1.327878632915708, + "grad_norm": 41.63699397582665, + "learning_rate": 6.83094642648403e-06, + "loss": 2.9926, + "step": 15580 + }, + { + "epoch": 1.3279638626097332, + "grad_norm": 57.20109679235445, + "learning_rate": 6.8304850110551215e-06, + "loss": 3.2538, + "step": 15581 + }, + { + "epoch": 1.3280490923037587, + "grad_norm": 74.74622361853324, + "learning_rate": 6.830023577624121e-06, + "loss": 3.0187, + "step": 15582 + }, + { + "epoch": 1.328134321997784, + "grad_norm": 31.070706893228003, + "learning_rate": 6.8295621261955645e-06, + "loss": 2.5137, + "step": 15583 + }, + { + "epoch": 1.3282195516918094, + "grad_norm": 58.05843447963815, + "learning_rate": 6.829100656773993e-06, + "loss": 3.2049, + "step": 15584 + }, + { + "epoch": 1.3283047813858349, + "grad_norm": 33.27431143522909, + "learning_rate": 6.828639169363943e-06, + "loss": 2.2801, + "step": 15585 + }, + { + "epoch": 1.3283900110798603, + "grad_norm": 47.80230940202344, + "learning_rate": 6.828177663969953e-06, + "loss": 2.8789, + "step": 15586 + }, + { + "epoch": 1.3284752407738856, + "grad_norm": 42.12954029443203, + "learning_rate": 6.827716140596561e-06, + "loss": 3.5172, + "step": 15587 + }, + { + "epoch": 1.328560470467911, + "grad_norm": 45.422701616627386, + "learning_rate": 6.827254599248308e-06, + "loss": 2.8208, + "step": 15588 + }, + { + "epoch": 1.3286457001619363, + "grad_norm": 37.4969167164416, + "learning_rate": 6.826793039929733e-06, + "loss": 2.3568, + "step": 15589 + }, + { + "epoch": 1.3287309298559618, + "grad_norm": 35.43168554485688, + "learning_rate": 6.826331462645373e-06, + "loss": 1.4671, + "step": 15590 + }, + { + "epoch": 1.3288161595499872, + "grad_norm": 77.95901139427983, + "learning_rate": 6.825869867399769e-06, + "loss": 3.3713, + "step": 15591 + }, + { + "epoch": 1.3289013892440127, + "grad_norm": 50.25347808961389, + "learning_rate": 6.825408254197461e-06, + "loss": 2.7633, + "step": 15592 + }, + { + "epoch": 1.328986618938038, + "grad_norm": 53.28069606379814, + "learning_rate": 6.8249466230429885e-06, + "loss": 3.0752, + "step": 15593 + }, + { + "epoch": 1.3290718486320634, + "grad_norm": 59.412656011898875, + "learning_rate": 6.82448497394089e-06, + "loss": 2.5727, + "step": 15594 + }, + { + "epoch": 1.3291570783260889, + "grad_norm": 63.05480405056201, + "learning_rate": 6.824023306895708e-06, + "loss": 4.3371, + "step": 15595 + }, + { + "epoch": 1.3292423080201141, + "grad_norm": 70.47452461992994, + "learning_rate": 6.823561621911979e-06, + "loss": 3.2778, + "step": 15596 + }, + { + "epoch": 1.3293275377141396, + "grad_norm": 37.75137473715521, + "learning_rate": 6.82309991899425e-06, + "loss": 2.9453, + "step": 15597 + }, + { + "epoch": 1.329412767408165, + "grad_norm": 87.23736166269094, + "learning_rate": 6.8226381981470555e-06, + "loss": 2.5622, + "step": 15598 + }, + { + "epoch": 1.3294979971021905, + "grad_norm": 100.16404689312448, + "learning_rate": 6.822176459374938e-06, + "loss": 2.6363, + "step": 15599 + }, + { + "epoch": 1.3295832267962158, + "grad_norm": 21.65681297668491, + "learning_rate": 6.8217147026824405e-06, + "loss": 2.0867, + "step": 15600 + }, + { + "epoch": 1.3296684564902412, + "grad_norm": 39.50767169397976, + "learning_rate": 6.8212529280741016e-06, + "loss": 2.8434, + "step": 15601 + }, + { + "epoch": 1.3297536861842665, + "grad_norm": 113.54473520542085, + "learning_rate": 6.820791135554464e-06, + "loss": 5.3307, + "step": 15602 + }, + { + "epoch": 1.329838915878292, + "grad_norm": 68.22741651066148, + "learning_rate": 6.82032932512807e-06, + "loss": 2.163, + "step": 15603 + }, + { + "epoch": 1.3299241455723174, + "grad_norm": 44.92157929459222, + "learning_rate": 6.81986749679946e-06, + "loss": 2.2973, + "step": 15604 + }, + { + "epoch": 1.3300093752663429, + "grad_norm": 40.435626671642886, + "learning_rate": 6.819405650573174e-06, + "loss": 2.9571, + "step": 15605 + }, + { + "epoch": 1.3300946049603681, + "grad_norm": 93.72733095529884, + "learning_rate": 6.81894378645376e-06, + "loss": 2.9161, + "step": 15606 + }, + { + "epoch": 1.3301798346543936, + "grad_norm": 43.2462621836709, + "learning_rate": 6.818481904445755e-06, + "loss": 3.1592, + "step": 15607 + }, + { + "epoch": 1.330265064348419, + "grad_norm": 38.68239646503992, + "learning_rate": 6.818020004553703e-06, + "loss": 2.5442, + "step": 15608 + }, + { + "epoch": 1.3303502940424443, + "grad_norm": 35.27300810758296, + "learning_rate": 6.817558086782146e-06, + "loss": 2.6701, + "step": 15609 + }, + { + "epoch": 1.3304355237364698, + "grad_norm": 35.69823754338372, + "learning_rate": 6.817096151135629e-06, + "loss": 2.4893, + "step": 15610 + }, + { + "epoch": 1.3305207534304953, + "grad_norm": 35.98472076544733, + "learning_rate": 6.8166341976186936e-06, + "loss": 2.5762, + "step": 15611 + }, + { + "epoch": 1.3306059831245207, + "grad_norm": 31.779985716705244, + "learning_rate": 6.816172226235881e-06, + "loss": 2.5414, + "step": 15612 + }, + { + "epoch": 1.330691212818546, + "grad_norm": 45.47257297911599, + "learning_rate": 6.815710236991737e-06, + "loss": 3.0701, + "step": 15613 + }, + { + "epoch": 1.3307764425125714, + "grad_norm": 31.70097700322869, + "learning_rate": 6.815248229890805e-06, + "loss": 2.4427, + "step": 15614 + }, + { + "epoch": 1.3308616722065967, + "grad_norm": 71.73027594954107, + "learning_rate": 6.814786204937626e-06, + "loss": 3.6748, + "step": 15615 + }, + { + "epoch": 1.3309469019006221, + "grad_norm": 65.50986023811426, + "learning_rate": 6.814324162136747e-06, + "loss": 2.226, + "step": 15616 + }, + { + "epoch": 1.3310321315946476, + "grad_norm": 78.2987858842193, + "learning_rate": 6.813862101492712e-06, + "loss": 3.0979, + "step": 15617 + }, + { + "epoch": 1.331117361288673, + "grad_norm": 41.900718635672916, + "learning_rate": 6.813400023010062e-06, + "loss": 2.4107, + "step": 15618 + }, + { + "epoch": 1.3312025909826983, + "grad_norm": 54.78283405751167, + "learning_rate": 6.812937926693345e-06, + "loss": 2.5617, + "step": 15619 + }, + { + "epoch": 1.3312878206767238, + "grad_norm": 51.03376328020042, + "learning_rate": 6.812475812547103e-06, + "loss": 2.8092, + "step": 15620 + }, + { + "epoch": 1.331373050370749, + "grad_norm": 57.67541935831418, + "learning_rate": 6.812013680575882e-06, + "loss": 2.4024, + "step": 15621 + }, + { + "epoch": 1.3314582800647745, + "grad_norm": 55.81325817707947, + "learning_rate": 6.811551530784226e-06, + "loss": 3.4621, + "step": 15622 + }, + { + "epoch": 1.3315435097588, + "grad_norm": 52.39440324305119, + "learning_rate": 6.8110893631766804e-06, + "loss": 2.3079, + "step": 15623 + }, + { + "epoch": 1.3316287394528254, + "grad_norm": 88.3962693140054, + "learning_rate": 6.8106271777577915e-06, + "loss": 4.5212, + "step": 15624 + }, + { + "epoch": 1.3317139691468507, + "grad_norm": 68.90303673616944, + "learning_rate": 6.810164974532103e-06, + "loss": 3.6, + "step": 15625 + }, + { + "epoch": 1.3317991988408762, + "grad_norm": 27.47031685429221, + "learning_rate": 6.809702753504161e-06, + "loss": 2.2577, + "step": 15626 + }, + { + "epoch": 1.3318844285349016, + "grad_norm": 43.82167972291805, + "learning_rate": 6.8092405146785125e-06, + "loss": 2.4069, + "step": 15627 + }, + { + "epoch": 1.3319696582289269, + "grad_norm": 56.429704948269006, + "learning_rate": 6.808778258059703e-06, + "loss": 3.2457, + "step": 15628 + }, + { + "epoch": 1.3320548879229523, + "grad_norm": 62.011095279077445, + "learning_rate": 6.808315983652276e-06, + "loss": 2.4792, + "step": 15629 + }, + { + "epoch": 1.3321401176169778, + "grad_norm": 37.51662683279249, + "learning_rate": 6.80785369146078e-06, + "loss": 2.9048, + "step": 15630 + }, + { + "epoch": 1.3322253473110033, + "grad_norm": 99.84808204462655, + "learning_rate": 6.807391381489763e-06, + "loss": 2.2797, + "step": 15631 + }, + { + "epoch": 1.3323105770050285, + "grad_norm": 25.23681881709884, + "learning_rate": 6.80692905374377e-06, + "loss": 1.7734, + "step": 15632 + }, + { + "epoch": 1.332395806699054, + "grad_norm": 73.48576013501166, + "learning_rate": 6.806466708227346e-06, + "loss": 3.5632, + "step": 15633 + }, + { + "epoch": 1.3324810363930792, + "grad_norm": 89.07026041443655, + "learning_rate": 6.806004344945041e-06, + "loss": 2.3518, + "step": 15634 + }, + { + "epoch": 1.3325662660871047, + "grad_norm": 29.66899082430586, + "learning_rate": 6.805541963901401e-06, + "loss": 1.9626, + "step": 15635 + }, + { + "epoch": 1.3326514957811302, + "grad_norm": 42.148076291243875, + "learning_rate": 6.805079565100973e-06, + "loss": 2.824, + "step": 15636 + }, + { + "epoch": 1.3327367254751556, + "grad_norm": 44.11901692081266, + "learning_rate": 6.804617148548304e-06, + "loss": 3.2634, + "step": 15637 + }, + { + "epoch": 1.332821955169181, + "grad_norm": 50.654030491638125, + "learning_rate": 6.8041547142479425e-06, + "loss": 2.4337, + "step": 15638 + }, + { + "epoch": 1.3329071848632064, + "grad_norm": 40.357650375166884, + "learning_rate": 6.803692262204436e-06, + "loss": 2.7928, + "step": 15639 + }, + { + "epoch": 1.3329924145572316, + "grad_norm": 93.88593553368715, + "learning_rate": 6.803229792422335e-06, + "loss": 3.6023, + "step": 15640 + }, + { + "epoch": 1.333077644251257, + "grad_norm": 61.523740049393744, + "learning_rate": 6.802767304906185e-06, + "loss": 3.3349, + "step": 15641 + }, + { + "epoch": 1.3331628739452825, + "grad_norm": 65.45859556282835, + "learning_rate": 6.802304799660533e-06, + "loss": 3.9419, + "step": 15642 + }, + { + "epoch": 1.333248103639308, + "grad_norm": 76.43611062722916, + "learning_rate": 6.801842276689931e-06, + "loss": 4.2569, + "step": 15643 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 47.16558854640826, + "learning_rate": 6.801379735998925e-06, + "loss": 2.7184, + "step": 15644 + }, + { + "epoch": 1.3334185630273587, + "grad_norm": 38.70785087328334, + "learning_rate": 6.800917177592067e-06, + "loss": 2.6881, + "step": 15645 + }, + { + "epoch": 1.3335037927213842, + "grad_norm": 53.05940696318226, + "learning_rate": 6.8004546014739026e-06, + "loss": 2.9111, + "step": 15646 + }, + { + "epoch": 1.3335890224154094, + "grad_norm": 25.991010905158426, + "learning_rate": 6.799992007648982e-06, + "loss": 1.724, + "step": 15647 + }, + { + "epoch": 1.333674252109435, + "grad_norm": 61.56957874975927, + "learning_rate": 6.799529396121855e-06, + "loss": 3.1747, + "step": 15648 + }, + { + "epoch": 1.3337594818034604, + "grad_norm": 71.68056894253846, + "learning_rate": 6.799066766897075e-06, + "loss": 2.3622, + "step": 15649 + }, + { + "epoch": 1.3338447114974858, + "grad_norm": 35.447162742898875, + "learning_rate": 6.798604119979185e-06, + "loss": 2.9352, + "step": 15650 + }, + { + "epoch": 1.333929941191511, + "grad_norm": 52.43587778834878, + "learning_rate": 6.798141455372739e-06, + "loss": 2.5326, + "step": 15651 + }, + { + "epoch": 1.3340151708855366, + "grad_norm": 44.67505759854873, + "learning_rate": 6.797678773082286e-06, + "loss": 3.2556, + "step": 15652 + }, + { + "epoch": 1.3341004005795618, + "grad_norm": 21.313258341157706, + "learning_rate": 6.7972160731123785e-06, + "loss": 1.0515, + "step": 15653 + }, + { + "epoch": 1.3341856302735873, + "grad_norm": 23.719166867781677, + "learning_rate": 6.796753355467562e-06, + "loss": 1.9842, + "step": 15654 + }, + { + "epoch": 1.3342708599676127, + "grad_norm": 39.8896332744549, + "learning_rate": 6.796290620152391e-06, + "loss": 2.9705, + "step": 15655 + }, + { + "epoch": 1.3343560896616382, + "grad_norm": 293.5144032049549, + "learning_rate": 6.795827867171418e-06, + "loss": 1.7961, + "step": 15656 + }, + { + "epoch": 1.3344413193556635, + "grad_norm": 39.99959402483009, + "learning_rate": 6.7953650965291896e-06, + "loss": 2.5857, + "step": 15657 + }, + { + "epoch": 1.334526549049689, + "grad_norm": 53.69167778389369, + "learning_rate": 6.794902308230258e-06, + "loss": 2.1348, + "step": 15658 + }, + { + "epoch": 1.3346117787437142, + "grad_norm": 27.48119235542153, + "learning_rate": 6.794439502279176e-06, + "loss": 2.6982, + "step": 15659 + }, + { + "epoch": 1.3346970084377396, + "grad_norm": 38.457665936233326, + "learning_rate": 6.793976678680496e-06, + "loss": 2.7579, + "step": 15660 + }, + { + "epoch": 1.334782238131765, + "grad_norm": 108.48625844583282, + "learning_rate": 6.793513837438768e-06, + "loss": 3.6127, + "step": 15661 + }, + { + "epoch": 1.3348674678257906, + "grad_norm": 35.49412106501894, + "learning_rate": 6.793050978558542e-06, + "loss": 2.4861, + "step": 15662 + }, + { + "epoch": 1.3349526975198158, + "grad_norm": 99.52108335597164, + "learning_rate": 6.792588102044373e-06, + "loss": 2.8521, + "step": 15663 + }, + { + "epoch": 1.3350379272138413, + "grad_norm": 67.08145011458188, + "learning_rate": 6.792125207900814e-06, + "loss": 4.0542, + "step": 15664 + }, + { + "epoch": 1.3351231569078668, + "grad_norm": 62.29651643165454, + "learning_rate": 6.791662296132414e-06, + "loss": 3.3882, + "step": 15665 + }, + { + "epoch": 1.335208386601892, + "grad_norm": 81.03376806802638, + "learning_rate": 6.791199366743728e-06, + "loss": 5.4001, + "step": 15666 + }, + { + "epoch": 1.3352936162959175, + "grad_norm": 53.439558690481, + "learning_rate": 6.790736419739307e-06, + "loss": 4.7308, + "step": 15667 + }, + { + "epoch": 1.335378845989943, + "grad_norm": 68.34676505301846, + "learning_rate": 6.790273455123706e-06, + "loss": 2.3108, + "step": 15668 + }, + { + "epoch": 1.3354640756839684, + "grad_norm": 35.12373594605054, + "learning_rate": 6.7898104729014765e-06, + "loss": 3.2274, + "step": 15669 + }, + { + "epoch": 1.3355493053779937, + "grad_norm": 50.00272330627334, + "learning_rate": 6.7893474730771724e-06, + "loss": 1.8039, + "step": 15670 + }, + { + "epoch": 1.3356345350720191, + "grad_norm": 29.31659439484658, + "learning_rate": 6.788884455655347e-06, + "loss": 2.6273, + "step": 15671 + }, + { + "epoch": 1.3357197647660444, + "grad_norm": 39.08432979657253, + "learning_rate": 6.788421420640553e-06, + "loss": 2.629, + "step": 15672 + }, + { + "epoch": 1.3358049944600698, + "grad_norm": 117.16357264232742, + "learning_rate": 6.787958368037346e-06, + "loss": 3.48, + "step": 15673 + }, + { + "epoch": 1.3358902241540953, + "grad_norm": 87.14694276103965, + "learning_rate": 6.787495297850277e-06, + "loss": 2.5348, + "step": 15674 + }, + { + "epoch": 1.3359754538481208, + "grad_norm": 43.83632197575332, + "learning_rate": 6.787032210083905e-06, + "loss": 2.1696, + "step": 15675 + }, + { + "epoch": 1.336060683542146, + "grad_norm": 62.24097405347989, + "learning_rate": 6.786569104742779e-06, + "loss": 2.1172, + "step": 15676 + }, + { + "epoch": 1.3361459132361715, + "grad_norm": 88.06013834902978, + "learning_rate": 6.7861059818314566e-06, + "loss": 3.0418, + "step": 15677 + }, + { + "epoch": 1.336231142930197, + "grad_norm": 61.357295497148876, + "learning_rate": 6.7856428413544915e-06, + "loss": 3.0567, + "step": 15678 + }, + { + "epoch": 1.3363163726242222, + "grad_norm": 59.49290529222585, + "learning_rate": 6.78517968331644e-06, + "loss": 3.0487, + "step": 15679 + }, + { + "epoch": 1.3364016023182477, + "grad_norm": 37.521065835075774, + "learning_rate": 6.784716507721854e-06, + "loss": 2.517, + "step": 15680 + }, + { + "epoch": 1.3364868320122731, + "grad_norm": 93.44002219672902, + "learning_rate": 6.78425331457529e-06, + "loss": 2.9634, + "step": 15681 + }, + { + "epoch": 1.3365720617062986, + "grad_norm": 42.10815535158952, + "learning_rate": 6.7837901038813034e-06, + "loss": 3.1157, + "step": 15682 + }, + { + "epoch": 1.3366572914003239, + "grad_norm": 54.73742590808502, + "learning_rate": 6.7833268756444516e-06, + "loss": 2.858, + "step": 15683 + }, + { + "epoch": 1.3367425210943493, + "grad_norm": 28.911966379108456, + "learning_rate": 6.782863629869289e-06, + "loss": 2.7717, + "step": 15684 + }, + { + "epoch": 1.3368277507883746, + "grad_norm": 55.89807630828965, + "learning_rate": 6.7824003665603685e-06, + "loss": 2.7368, + "step": 15685 + }, + { + "epoch": 1.3369129804824, + "grad_norm": 33.31940561918553, + "learning_rate": 6.78193708572225e-06, + "loss": 2.5569, + "step": 15686 + }, + { + "epoch": 1.3369982101764255, + "grad_norm": 110.2204978724415, + "learning_rate": 6.7814737873594885e-06, + "loss": 3.0861, + "step": 15687 + }, + { + "epoch": 1.337083439870451, + "grad_norm": 119.04164279815589, + "learning_rate": 6.781010471476641e-06, + "loss": 3.4249, + "step": 15688 + }, + { + "epoch": 1.3371686695644762, + "grad_norm": 45.91686290964688, + "learning_rate": 6.780547138078261e-06, + "loss": 3.495, + "step": 15689 + }, + { + "epoch": 1.3372538992585017, + "grad_norm": 37.02406660228953, + "learning_rate": 6.78008378716891e-06, + "loss": 2.9655, + "step": 15690 + }, + { + "epoch": 1.337339128952527, + "grad_norm": 31.112766329947853, + "learning_rate": 6.77962041875314e-06, + "loss": 2.6267, + "step": 15691 + }, + { + "epoch": 1.3374243586465524, + "grad_norm": 39.37931643511885, + "learning_rate": 6.779157032835512e-06, + "loss": 2.9579, + "step": 15692 + }, + { + "epoch": 1.3375095883405779, + "grad_norm": 47.21932046122284, + "learning_rate": 6.778693629420581e-06, + "loss": 3.677, + "step": 15693 + }, + { + "epoch": 1.3375948180346033, + "grad_norm": 48.72218036679757, + "learning_rate": 6.778230208512904e-06, + "loss": 2.1924, + "step": 15694 + }, + { + "epoch": 1.3376800477286286, + "grad_norm": 63.98575354322718, + "learning_rate": 6.7777667701170406e-06, + "loss": 3.6693, + "step": 15695 + }, + { + "epoch": 1.337765277422654, + "grad_norm": 35.84653287432897, + "learning_rate": 6.777303314237548e-06, + "loss": 3.23, + "step": 15696 + }, + { + "epoch": 1.3378505071166795, + "grad_norm": 80.4414193777861, + "learning_rate": 6.776839840878981e-06, + "loss": 2.4132, + "step": 15697 + }, + { + "epoch": 1.3379357368107048, + "grad_norm": 43.374303638761845, + "learning_rate": 6.776376350045902e-06, + "loss": 2.4108, + "step": 15698 + }, + { + "epoch": 1.3380209665047302, + "grad_norm": 44.811152184049156, + "learning_rate": 6.775912841742869e-06, + "loss": 1.5346, + "step": 15699 + }, + { + "epoch": 1.3381061961987557, + "grad_norm": 95.82997606618594, + "learning_rate": 6.775449315974436e-06, + "loss": 3.2121, + "step": 15700 + }, + { + "epoch": 1.3381914258927812, + "grad_norm": 97.62277591649885, + "learning_rate": 6.774985772745166e-06, + "loss": 4.4666, + "step": 15701 + }, + { + "epoch": 1.3382766555868064, + "grad_norm": 79.06023304471573, + "learning_rate": 6.774522212059616e-06, + "loss": 4.258, + "step": 15702 + }, + { + "epoch": 1.3383618852808319, + "grad_norm": 33.61357843032053, + "learning_rate": 6.774058633922345e-06, + "loss": 2.2824, + "step": 15703 + }, + { + "epoch": 1.3384471149748571, + "grad_norm": 270.4956011969769, + "learning_rate": 6.773595038337913e-06, + "loss": 2.8057, + "step": 15704 + }, + { + "epoch": 1.3385323446688826, + "grad_norm": 111.413051786917, + "learning_rate": 6.773131425310877e-06, + "loss": 4.6946, + "step": 15705 + }, + { + "epoch": 1.338617574362908, + "grad_norm": 46.40529924490707, + "learning_rate": 6.772667794845798e-06, + "loss": 3.6648, + "step": 15706 + }, + { + "epoch": 1.3387028040569335, + "grad_norm": 53.25983461009491, + "learning_rate": 6.7722041469472375e-06, + "loss": 3.1465, + "step": 15707 + }, + { + "epoch": 1.3387880337509588, + "grad_norm": 54.28113316360823, + "learning_rate": 6.771740481619753e-06, + "loss": 2.7276, + "step": 15708 + }, + { + "epoch": 1.3388732634449843, + "grad_norm": 44.49533666423252, + "learning_rate": 6.771276798867903e-06, + "loss": 3.6299, + "step": 15709 + }, + { + "epoch": 1.3389584931390095, + "grad_norm": 62.997923013132976, + "learning_rate": 6.770813098696252e-06, + "loss": 3.1781, + "step": 15710 + }, + { + "epoch": 1.339043722833035, + "grad_norm": 72.1055382158146, + "learning_rate": 6.770349381109357e-06, + "loss": 2.2261, + "step": 15711 + }, + { + "epoch": 1.3391289525270604, + "grad_norm": 62.94211956302621, + "learning_rate": 6.769885646111778e-06, + "loss": 3.3777, + "step": 15712 + }, + { + "epoch": 1.339214182221086, + "grad_norm": 38.571228745844294, + "learning_rate": 6.769421893708078e-06, + "loss": 3.1661, + "step": 15713 + }, + { + "epoch": 1.3392994119151111, + "grad_norm": 32.442531994832486, + "learning_rate": 6.768958123902818e-06, + "loss": 2.9617, + "step": 15714 + }, + { + "epoch": 1.3393846416091366, + "grad_norm": 27.1417200155367, + "learning_rate": 6.768494336700555e-06, + "loss": 2.3623, + "step": 15715 + }, + { + "epoch": 1.339469871303162, + "grad_norm": 56.63998458865266, + "learning_rate": 6.768030532105855e-06, + "loss": 3.5701, + "step": 15716 + }, + { + "epoch": 1.3395551009971873, + "grad_norm": 36.25723678414569, + "learning_rate": 6.767566710123277e-06, + "loss": 2.7611, + "step": 15717 + }, + { + "epoch": 1.3396403306912128, + "grad_norm": 39.020624530697354, + "learning_rate": 6.7671028707573826e-06, + "loss": 2.1645, + "step": 15718 + }, + { + "epoch": 1.3397255603852383, + "grad_norm": 37.4600127298181, + "learning_rate": 6.766639014012733e-06, + "loss": 2.4606, + "step": 15719 + }, + { + "epoch": 1.3398107900792637, + "grad_norm": 36.62441334366306, + "learning_rate": 6.766175139893891e-06, + "loss": 3.6564, + "step": 15720 + }, + { + "epoch": 1.339896019773289, + "grad_norm": 38.523808445670625, + "learning_rate": 6.7657112484054175e-06, + "loss": 3.3178, + "step": 15721 + }, + { + "epoch": 1.3399812494673145, + "grad_norm": 40.24671082761361, + "learning_rate": 6.765247339551878e-06, + "loss": 2.3913, + "step": 15722 + }, + { + "epoch": 1.3400664791613397, + "grad_norm": 32.17331237278325, + "learning_rate": 6.76478341333783e-06, + "loss": 3.5197, + "step": 15723 + }, + { + "epoch": 1.3401517088553652, + "grad_norm": 38.55113670607752, + "learning_rate": 6.76431946976784e-06, + "loss": 3.317, + "step": 15724 + }, + { + "epoch": 1.3402369385493906, + "grad_norm": 68.15884241755218, + "learning_rate": 6.763855508846468e-06, + "loss": 2.7279, + "step": 15725 + }, + { + "epoch": 1.340322168243416, + "grad_norm": 66.36204381029692, + "learning_rate": 6.763391530578279e-06, + "loss": 3.4222, + "step": 15726 + }, + { + "epoch": 1.3404073979374413, + "grad_norm": 33.02145455817104, + "learning_rate": 6.762927534967834e-06, + "loss": 2.3743, + "step": 15727 + }, + { + "epoch": 1.3404926276314668, + "grad_norm": 78.30529279160508, + "learning_rate": 6.7624635220196966e-06, + "loss": 3.9957, + "step": 15728 + }, + { + "epoch": 1.3405778573254923, + "grad_norm": 63.393043632719035, + "learning_rate": 6.761999491738432e-06, + "loss": 2.9803, + "step": 15729 + }, + { + "epoch": 1.3406630870195175, + "grad_norm": 57.403070286442414, + "learning_rate": 6.761535444128602e-06, + "loss": 3.4591, + "step": 15730 + }, + { + "epoch": 1.340748316713543, + "grad_norm": 61.96848567224592, + "learning_rate": 6.761071379194773e-06, + "loss": 2.3741, + "step": 15731 + }, + { + "epoch": 1.3408335464075685, + "grad_norm": 33.81263588008792, + "learning_rate": 6.760607296941503e-06, + "loss": 2.5925, + "step": 15732 + }, + { + "epoch": 1.3409187761015937, + "grad_norm": 62.43457949208044, + "learning_rate": 6.7601431973733626e-06, + "loss": 3.8427, + "step": 15733 + }, + { + "epoch": 1.3410040057956192, + "grad_norm": 82.4957579752244, + "learning_rate": 6.759679080494912e-06, + "loss": 4.1302, + "step": 15734 + }, + { + "epoch": 1.3410892354896446, + "grad_norm": 74.8731165901752, + "learning_rate": 6.759214946310717e-06, + "loss": 2.7899, + "step": 15735 + }, + { + "epoch": 1.34117446518367, + "grad_norm": 51.813794684642005, + "learning_rate": 6.758750794825342e-06, + "loss": 3.1459, + "step": 15736 + }, + { + "epoch": 1.3412596948776954, + "grad_norm": 62.36518298559305, + "learning_rate": 6.7582866260433526e-06, + "loss": 2.6798, + "step": 15737 + }, + { + "epoch": 1.3413449245717208, + "grad_norm": 52.46093837495476, + "learning_rate": 6.757822439969311e-06, + "loss": 2.93, + "step": 15738 + }, + { + "epoch": 1.3414301542657463, + "grad_norm": 33.1586408282521, + "learning_rate": 6.757358236607787e-06, + "loss": 2.6302, + "step": 15739 + }, + { + "epoch": 1.3415153839597715, + "grad_norm": 46.42014549411572, + "learning_rate": 6.756894015963342e-06, + "loss": 4.0492, + "step": 15740 + }, + { + "epoch": 1.341600613653797, + "grad_norm": 42.48817708189302, + "learning_rate": 6.756429778040541e-06, + "loss": 2.2971, + "step": 15741 + }, + { + "epoch": 1.3416858433478223, + "grad_norm": 35.12368285079719, + "learning_rate": 6.7559655228439526e-06, + "loss": 2.9101, + "step": 15742 + }, + { + "epoch": 1.3417710730418477, + "grad_norm": 48.073438062054784, + "learning_rate": 6.755501250378142e-06, + "loss": 3.4442, + "step": 15743 + }, + { + "epoch": 1.3418563027358732, + "grad_norm": 29.05142815028232, + "learning_rate": 6.755036960647672e-06, + "loss": 2.5427, + "step": 15744 + }, + { + "epoch": 1.3419415324298987, + "grad_norm": 82.12820299991974, + "learning_rate": 6.754572653657111e-06, + "loss": 2.9841, + "step": 15745 + }, + { + "epoch": 1.342026762123924, + "grad_norm": 53.64589177535471, + "learning_rate": 6.7541083294110264e-06, + "loss": 1.9311, + "step": 15746 + }, + { + "epoch": 1.3421119918179494, + "grad_norm": 63.196498159381854, + "learning_rate": 6.753643987913983e-06, + "loss": 1.9385, + "step": 15747 + }, + { + "epoch": 1.3421972215119748, + "grad_norm": 698.6444180456917, + "learning_rate": 6.753179629170546e-06, + "loss": 4.0374, + "step": 15748 + }, + { + "epoch": 1.342282451206, + "grad_norm": 28.701038103182846, + "learning_rate": 6.752715253185287e-06, + "loss": 2.6742, + "step": 15749 + }, + { + "epoch": 1.3423676809000256, + "grad_norm": 74.10548702253575, + "learning_rate": 6.7522508599627676e-06, + "loss": 4.5597, + "step": 15750 + }, + { + "epoch": 1.342452910594051, + "grad_norm": 33.59811224634017, + "learning_rate": 6.751786449507559e-06, + "loss": 3.3607, + "step": 15751 + }, + { + "epoch": 1.3425381402880765, + "grad_norm": 58.24463334749899, + "learning_rate": 6.751322021824226e-06, + "loss": 2.3011, + "step": 15752 + }, + { + "epoch": 1.3426233699821017, + "grad_norm": 67.23133730648799, + "learning_rate": 6.750857576917335e-06, + "loss": 2.9955, + "step": 15753 + }, + { + "epoch": 1.3427085996761272, + "grad_norm": 46.95625417240942, + "learning_rate": 6.750393114791459e-06, + "loss": 3.4067, + "step": 15754 + }, + { + "epoch": 1.3427938293701525, + "grad_norm": 51.49647587862923, + "learning_rate": 6.74992863545116e-06, + "loss": 2.3247, + "step": 15755 + }, + { + "epoch": 1.342879059064178, + "grad_norm": 43.56666730987543, + "learning_rate": 6.749464138901008e-06, + "loss": 3.0659, + "step": 15756 + }, + { + "epoch": 1.3429642887582034, + "grad_norm": 102.11130956535362, + "learning_rate": 6.7489996251455735e-06, + "loss": 2.975, + "step": 15757 + }, + { + "epoch": 1.3430495184522289, + "grad_norm": 46.77233031418864, + "learning_rate": 6.7485350941894214e-06, + "loss": 2.7013, + "step": 15758 + }, + { + "epoch": 1.343134748146254, + "grad_norm": 37.92237427533755, + "learning_rate": 6.748070546037121e-06, + "loss": 2.5355, + "step": 15759 + }, + { + "epoch": 1.3432199778402796, + "grad_norm": 56.41574485543357, + "learning_rate": 6.747605980693241e-06, + "loss": 3.0528, + "step": 15760 + }, + { + "epoch": 1.3433052075343048, + "grad_norm": 43.75580417264793, + "learning_rate": 6.747141398162352e-06, + "loss": 2.9887, + "step": 15761 + }, + { + "epoch": 1.3433904372283303, + "grad_norm": 38.32125397057043, + "learning_rate": 6.7466767984490215e-06, + "loss": 2.9548, + "step": 15762 + }, + { + "epoch": 1.3434756669223558, + "grad_norm": 94.46422677219483, + "learning_rate": 6.746212181557818e-06, + "loss": 3.4508, + "step": 15763 + }, + { + "epoch": 1.3435608966163812, + "grad_norm": 49.88434355493161, + "learning_rate": 6.7457475474933106e-06, + "loss": 3.211, + "step": 15764 + }, + { + "epoch": 1.3436461263104065, + "grad_norm": 43.15714888742384, + "learning_rate": 6.7452828962600715e-06, + "loss": 2.4126, + "step": 15765 + }, + { + "epoch": 1.343731356004432, + "grad_norm": 92.1606728008104, + "learning_rate": 6.744818227862669e-06, + "loss": 3.3704, + "step": 15766 + }, + { + "epoch": 1.3438165856984574, + "grad_norm": 50.01922282283898, + "learning_rate": 6.744353542305671e-06, + "loss": 2.3432, + "step": 15767 + }, + { + "epoch": 1.3439018153924827, + "grad_norm": 28.670356701873818, + "learning_rate": 6.743888839593649e-06, + "loss": 3.0438, + "step": 15768 + }, + { + "epoch": 1.3439870450865081, + "grad_norm": 47.23596560602153, + "learning_rate": 6.7434241197311745e-06, + "loss": 3.3006, + "step": 15769 + }, + { + "epoch": 1.3440722747805336, + "grad_norm": 166.87028862082323, + "learning_rate": 6.7429593827228155e-06, + "loss": 4.9074, + "step": 15770 + }, + { + "epoch": 1.344157504474559, + "grad_norm": 83.62656916487617, + "learning_rate": 6.742494628573143e-06, + "loss": 4.0593, + "step": 15771 + }, + { + "epoch": 1.3442427341685843, + "grad_norm": 30.880813186665968, + "learning_rate": 6.74202985728673e-06, + "loss": 2.2608, + "step": 15772 + }, + { + "epoch": 1.3443279638626098, + "grad_norm": 85.85802837403277, + "learning_rate": 6.741565068868144e-06, + "loss": 3.3239, + "step": 15773 + }, + { + "epoch": 1.344413193556635, + "grad_norm": 56.64388506462628, + "learning_rate": 6.74110026332196e-06, + "loss": 3.1387, + "step": 15774 + }, + { + "epoch": 1.3444984232506605, + "grad_norm": 29.054149493465705, + "learning_rate": 6.740635440652743e-06, + "loss": 2.0102, + "step": 15775 + }, + { + "epoch": 1.344583652944686, + "grad_norm": 120.73861446277411, + "learning_rate": 6.740170600865072e-06, + "loss": 3.8, + "step": 15776 + }, + { + "epoch": 1.3446688826387114, + "grad_norm": 54.015613231790866, + "learning_rate": 6.739705743963511e-06, + "loss": 2.0497, + "step": 15777 + }, + { + "epoch": 1.3447541123327367, + "grad_norm": 55.229081740418756, + "learning_rate": 6.739240869952639e-06, + "loss": 3.6111, + "step": 15778 + }, + { + "epoch": 1.3448393420267621, + "grad_norm": 49.4822023640744, + "learning_rate": 6.73877597883702e-06, + "loss": 3.3562, + "step": 15779 + }, + { + "epoch": 1.3449245717207874, + "grad_norm": 49.86023428129334, + "learning_rate": 6.738311070621234e-06, + "loss": 2.608, + "step": 15780 + }, + { + "epoch": 1.3450098014148129, + "grad_norm": 64.93256449756853, + "learning_rate": 6.737846145309847e-06, + "loss": 3.0745, + "step": 15781 + }, + { + "epoch": 1.3450950311088383, + "grad_norm": 62.31149530947394, + "learning_rate": 6.737381202907436e-06, + "loss": 2.7191, + "step": 15782 + }, + { + "epoch": 1.3451802608028638, + "grad_norm": 53.07379278930126, + "learning_rate": 6.736916243418568e-06, + "loss": 2.8767, + "step": 15783 + }, + { + "epoch": 1.345265490496889, + "grad_norm": 61.46232970340639, + "learning_rate": 6.736451266847821e-06, + "loss": 4.0923, + "step": 15784 + }, + { + "epoch": 1.3453507201909145, + "grad_norm": 86.34900143923552, + "learning_rate": 6.735986273199766e-06, + "loss": 4.3689, + "step": 15785 + }, + { + "epoch": 1.34543594988494, + "grad_norm": 38.27055087964663, + "learning_rate": 6.7355212624789755e-06, + "loss": 2.5714, + "step": 15786 + }, + { + "epoch": 1.3455211795789652, + "grad_norm": 74.06338439834053, + "learning_rate": 6.735056234690023e-06, + "loss": 3.7297, + "step": 15787 + }, + { + "epoch": 1.3456064092729907, + "grad_norm": 101.96710591291202, + "learning_rate": 6.73459118983748e-06, + "loss": 2.9283, + "step": 15788 + }, + { + "epoch": 1.3456916389670162, + "grad_norm": 40.10780913122484, + "learning_rate": 6.734126127925925e-06, + "loss": 3.2325, + "step": 15789 + }, + { + "epoch": 1.3457768686610416, + "grad_norm": 55.48159534653799, + "learning_rate": 6.733661048959928e-06, + "loss": 2.8334, + "step": 15790 + }, + { + "epoch": 1.3458620983550669, + "grad_norm": 70.72618440663156, + "learning_rate": 6.733195952944061e-06, + "loss": 2.6459, + "step": 15791 + }, + { + "epoch": 1.3459473280490923, + "grad_norm": 70.70551437328791, + "learning_rate": 6.732730839882902e-06, + "loss": 3.1484, + "step": 15792 + }, + { + "epoch": 1.3460325577431176, + "grad_norm": 38.985388205456964, + "learning_rate": 6.732265709781025e-06, + "loss": 3.0774, + "step": 15793 + }, + { + "epoch": 1.346117787437143, + "grad_norm": 30.67324118242816, + "learning_rate": 6.731800562643002e-06, + "loss": 2.7561, + "step": 15794 + }, + { + "epoch": 1.3462030171311685, + "grad_norm": 60.97205687038408, + "learning_rate": 6.731335398473408e-06, + "loss": 2.7028, + "step": 15795 + }, + { + "epoch": 1.346288246825194, + "grad_norm": 39.0038543209003, + "learning_rate": 6.730870217276818e-06, + "loss": 2.8405, + "step": 15796 + }, + { + "epoch": 1.3463734765192192, + "grad_norm": 28.451179030730984, + "learning_rate": 6.730405019057809e-06, + "loss": 1.8858, + "step": 15797 + }, + { + "epoch": 1.3464587062132447, + "grad_norm": 44.49359116643996, + "learning_rate": 6.729939803820953e-06, + "loss": 3.0244, + "step": 15798 + }, + { + "epoch": 1.3465439359072702, + "grad_norm": 76.369404087237, + "learning_rate": 6.729474571570826e-06, + "loss": 3.7239, + "step": 15799 + }, + { + "epoch": 1.3466291656012954, + "grad_norm": 28.665494410762395, + "learning_rate": 6.729009322312005e-06, + "loss": 2.7871, + "step": 15800 + }, + { + "epoch": 1.3467143952953209, + "grad_norm": 35.810056221031935, + "learning_rate": 6.728544056049064e-06, + "loss": 2.9125, + "step": 15801 + }, + { + "epoch": 1.3467996249893464, + "grad_norm": 59.2332815494355, + "learning_rate": 6.728078772786579e-06, + "loss": 3.2235, + "step": 15802 + }, + { + "epoch": 1.3468848546833718, + "grad_norm": 45.42919532670481, + "learning_rate": 6.727613472529126e-06, + "loss": 3.0521, + "step": 15803 + }, + { + "epoch": 1.346970084377397, + "grad_norm": 33.53683016364243, + "learning_rate": 6.727148155281281e-06, + "loss": 2.8327, + "step": 15804 + }, + { + "epoch": 1.3470553140714225, + "grad_norm": 55.465713225879185, + "learning_rate": 6.72668282104762e-06, + "loss": 2.3477, + "step": 15805 + }, + { + "epoch": 1.3471405437654478, + "grad_norm": 33.14655584970225, + "learning_rate": 6.726217469832719e-06, + "loss": 3.4067, + "step": 15806 + }, + { + "epoch": 1.3472257734594733, + "grad_norm": 43.54023205459606, + "learning_rate": 6.7257521016411555e-06, + "loss": 2.8555, + "step": 15807 + }, + { + "epoch": 1.3473110031534987, + "grad_norm": 35.114168760993174, + "learning_rate": 6.725286716477507e-06, + "loss": 2.1133, + "step": 15808 + }, + { + "epoch": 1.3473962328475242, + "grad_norm": 55.14374259874006, + "learning_rate": 6.724821314346348e-06, + "loss": 2.3276, + "step": 15809 + }, + { + "epoch": 1.3474814625415494, + "grad_norm": 49.62131121926818, + "learning_rate": 6.724355895252257e-06, + "loss": 2.8547, + "step": 15810 + }, + { + "epoch": 1.347566692235575, + "grad_norm": 37.33019239726599, + "learning_rate": 6.723890459199811e-06, + "loss": 2.5202, + "step": 15811 + }, + { + "epoch": 1.3476519219296001, + "grad_norm": 77.29891074323106, + "learning_rate": 6.723425006193588e-06, + "loss": 3.596, + "step": 15812 + }, + { + "epoch": 1.3477371516236256, + "grad_norm": 91.06851626037685, + "learning_rate": 6.722959536238164e-06, + "loss": 2.4076, + "step": 15813 + }, + { + "epoch": 1.347822381317651, + "grad_norm": 64.5230007841836, + "learning_rate": 6.722494049338117e-06, + "loss": 2.4788, + "step": 15814 + }, + { + "epoch": 1.3479076110116766, + "grad_norm": 59.82580172893072, + "learning_rate": 6.7220285454980264e-06, + "loss": 3.5386, + "step": 15815 + }, + { + "epoch": 1.3479928407057018, + "grad_norm": 42.04871706213433, + "learning_rate": 6.72156302472247e-06, + "loss": 3.052, + "step": 15816 + }, + { + "epoch": 1.3480780703997273, + "grad_norm": 29.25390138976943, + "learning_rate": 6.721097487016025e-06, + "loss": 2.7358, + "step": 15817 + }, + { + "epoch": 1.3481633000937527, + "grad_norm": 88.98826762933194, + "learning_rate": 6.720631932383268e-06, + "loss": 4.1458, + "step": 15818 + }, + { + "epoch": 1.348248529787778, + "grad_norm": 47.00825057673601, + "learning_rate": 6.720166360828782e-06, + "loss": 3.4215, + "step": 15819 + }, + { + "epoch": 1.3483337594818035, + "grad_norm": 65.50493681827552, + "learning_rate": 6.719700772357143e-06, + "loss": 3.0766, + "step": 15820 + }, + { + "epoch": 1.348418989175829, + "grad_norm": 59.73915635180398, + "learning_rate": 6.719235166972931e-06, + "loss": 2.7944, + "step": 15821 + }, + { + "epoch": 1.3485042188698544, + "grad_norm": 37.96214487215985, + "learning_rate": 6.718769544680721e-06, + "loss": 2.762, + "step": 15822 + }, + { + "epoch": 1.3485894485638796, + "grad_norm": 42.53275801547572, + "learning_rate": 6.7183039054851e-06, + "loss": 3.3773, + "step": 15823 + }, + { + "epoch": 1.348674678257905, + "grad_norm": 39.171941502789934, + "learning_rate": 6.717838249390639e-06, + "loss": 3.2631, + "step": 15824 + }, + { + "epoch": 1.3487599079519303, + "grad_norm": 58.63201743641996, + "learning_rate": 6.717372576401924e-06, + "loss": 2.5701, + "step": 15825 + }, + { + "epoch": 1.3488451376459558, + "grad_norm": 72.27676369609986, + "learning_rate": 6.71690688652353e-06, + "loss": 3.274, + "step": 15826 + }, + { + "epoch": 1.3489303673399813, + "grad_norm": 75.01456794644471, + "learning_rate": 6.71644117976004e-06, + "loss": 3.1702, + "step": 15827 + }, + { + "epoch": 1.3490155970340068, + "grad_norm": 87.79511507288687, + "learning_rate": 6.715975456116034e-06, + "loss": 2.8522, + "step": 15828 + }, + { + "epoch": 1.349100826728032, + "grad_norm": 65.59897348674488, + "learning_rate": 6.715509715596091e-06, + "loss": 2.6558, + "step": 15829 + }, + { + "epoch": 1.3491860564220575, + "grad_norm": 64.55218658817083, + "learning_rate": 6.71504395820479e-06, + "loss": 2.9724, + "step": 15830 + }, + { + "epoch": 1.3492712861160827, + "grad_norm": 41.28253380479091, + "learning_rate": 6.714578183946715e-06, + "loss": 2.3023, + "step": 15831 + }, + { + "epoch": 1.3493565158101082, + "grad_norm": 40.08067493120523, + "learning_rate": 6.714112392826444e-06, + "loss": 3.5616, + "step": 15832 + }, + { + "epoch": 1.3494417455041336, + "grad_norm": 38.906980760743544, + "learning_rate": 6.7136465848485584e-06, + "loss": 3.4784, + "step": 15833 + }, + { + "epoch": 1.3495269751981591, + "grad_norm": 45.766209371394545, + "learning_rate": 6.71318076001764e-06, + "loss": 3.0355, + "step": 15834 + }, + { + "epoch": 1.3496122048921844, + "grad_norm": 412.45500754221143, + "learning_rate": 6.7127149183382705e-06, + "loss": 2.1466, + "step": 15835 + }, + { + "epoch": 1.3496974345862098, + "grad_norm": 47.47790177697618, + "learning_rate": 6.7122490598150296e-06, + "loss": 2.9362, + "step": 15836 + }, + { + "epoch": 1.3497826642802353, + "grad_norm": 31.17839468589018, + "learning_rate": 6.711783184452498e-06, + "loss": 2.4752, + "step": 15837 + }, + { + "epoch": 1.3498678939742605, + "grad_norm": 45.48569879120949, + "learning_rate": 6.711317292255262e-06, + "loss": 3.0269, + "step": 15838 + }, + { + "epoch": 1.349953123668286, + "grad_norm": 103.90254765203424, + "learning_rate": 6.7108513832278975e-06, + "loss": 2.9425, + "step": 15839 + }, + { + "epoch": 1.3500383533623115, + "grad_norm": 46.31497360909943, + "learning_rate": 6.710385457374992e-06, + "loss": 2.6055, + "step": 15840 + }, + { + "epoch": 1.350123583056337, + "grad_norm": 102.59628641505977, + "learning_rate": 6.709919514701123e-06, + "loss": 3.4966, + "step": 15841 + }, + { + "epoch": 1.3502088127503622, + "grad_norm": 96.5264848480897, + "learning_rate": 6.7094535552108765e-06, + "loss": 3.1569, + "step": 15842 + }, + { + "epoch": 1.3502940424443877, + "grad_norm": 38.725306963559866, + "learning_rate": 6.708987578908832e-06, + "loss": 2.9927, + "step": 15843 + }, + { + "epoch": 1.350379272138413, + "grad_norm": 85.14350290746651, + "learning_rate": 6.7085215857995755e-06, + "loss": 2.6598, + "step": 15844 + }, + { + "epoch": 1.3504645018324384, + "grad_norm": 40.02591014722279, + "learning_rate": 6.708055575887688e-06, + "loss": 3.1311, + "step": 15845 + }, + { + "epoch": 1.3505497315264638, + "grad_norm": 49.400020756786354, + "learning_rate": 6.707589549177751e-06, + "loss": 3.4729, + "step": 15846 + }, + { + "epoch": 1.3506349612204893, + "grad_norm": 53.94410137710636, + "learning_rate": 6.7071235056743515e-06, + "loss": 3.0125, + "step": 15847 + }, + { + "epoch": 1.3507201909145146, + "grad_norm": 48.45654540528194, + "learning_rate": 6.706657445382069e-06, + "loss": 3.3419, + "step": 15848 + }, + { + "epoch": 1.35080542060854, + "grad_norm": 61.534729976598605, + "learning_rate": 6.7061913683054895e-06, + "loss": 3.0694, + "step": 15849 + }, + { + "epoch": 1.3508906503025653, + "grad_norm": 41.84227202120694, + "learning_rate": 6.705725274449195e-06, + "loss": 2.8741, + "step": 15850 + }, + { + "epoch": 1.3509758799965907, + "grad_norm": 54.3629067714763, + "learning_rate": 6.705259163817771e-06, + "loss": 4.9105, + "step": 15851 + }, + { + "epoch": 1.3510611096906162, + "grad_norm": 38.51457753511409, + "learning_rate": 6.704793036415801e-06, + "loss": 2.7949, + "step": 15852 + }, + { + "epoch": 1.3511463393846417, + "grad_norm": 65.6833457107274, + "learning_rate": 6.704326892247867e-06, + "loss": 3.5669, + "step": 15853 + }, + { + "epoch": 1.351231569078667, + "grad_norm": 236.41790052285293, + "learning_rate": 6.703860731318557e-06, + "loss": 2.2487, + "step": 15854 + }, + { + "epoch": 1.3513167987726924, + "grad_norm": 68.45229053518838, + "learning_rate": 6.703394553632454e-06, + "loss": 2.9461, + "step": 15855 + }, + { + "epoch": 1.3514020284667179, + "grad_norm": 74.59709674100756, + "learning_rate": 6.7029283591941415e-06, + "loss": 3.0435, + "step": 15856 + }, + { + "epoch": 1.351487258160743, + "grad_norm": 106.44089112008355, + "learning_rate": 6.702462148008205e-06, + "loss": 3.7307, + "step": 15857 + }, + { + "epoch": 1.3515724878547686, + "grad_norm": 51.615322408607476, + "learning_rate": 6.701995920079231e-06, + "loss": 3.0229, + "step": 15858 + }, + { + "epoch": 1.351657717548794, + "grad_norm": 55.245132168486606, + "learning_rate": 6.701529675411803e-06, + "loss": 3.9462, + "step": 15859 + }, + { + "epoch": 1.3517429472428195, + "grad_norm": 39.79760722142979, + "learning_rate": 6.701063414010508e-06, + "loss": 3.0093, + "step": 15860 + }, + { + "epoch": 1.3518281769368448, + "grad_norm": 74.53786320621032, + "learning_rate": 6.700597135879927e-06, + "loss": 3.1492, + "step": 15861 + }, + { + "epoch": 1.3519134066308702, + "grad_norm": 59.94074579618316, + "learning_rate": 6.700130841024651e-06, + "loss": 3.1636, + "step": 15862 + }, + { + "epoch": 1.3519986363248955, + "grad_norm": 34.6471617916972, + "learning_rate": 6.699664529449264e-06, + "loss": 2.3659, + "step": 15863 + }, + { + "epoch": 1.352083866018921, + "grad_norm": 46.55269519745144, + "learning_rate": 6.699198201158353e-06, + "loss": 2.7984, + "step": 15864 + }, + { + "epoch": 1.3521690957129464, + "grad_norm": 55.45039791452332, + "learning_rate": 6.698731856156498e-06, + "loss": 3.0049, + "step": 15865 + }, + { + "epoch": 1.3522543254069719, + "grad_norm": 84.4145583978885, + "learning_rate": 6.698265494448295e-06, + "loss": 3.0273, + "step": 15866 + }, + { + "epoch": 1.3523395551009971, + "grad_norm": 41.07203244855886, + "learning_rate": 6.697799116038323e-06, + "loss": 2.8997, + "step": 15867 + }, + { + "epoch": 1.3524247847950226, + "grad_norm": 37.60574350480708, + "learning_rate": 6.697332720931173e-06, + "loss": 3.0616, + "step": 15868 + }, + { + "epoch": 1.352510014489048, + "grad_norm": 63.494856254268115, + "learning_rate": 6.696866309131429e-06, + "loss": 4.0545, + "step": 15869 + }, + { + "epoch": 1.3525952441830733, + "grad_norm": 36.129542276431515, + "learning_rate": 6.69639988064368e-06, + "loss": 3.0317, + "step": 15870 + }, + { + "epoch": 1.3526804738770988, + "grad_norm": 78.98888931579242, + "learning_rate": 6.695933435472511e-06, + "loss": 2.7691, + "step": 15871 + }, + { + "epoch": 1.3527657035711242, + "grad_norm": 48.29152241360802, + "learning_rate": 6.695466973622513e-06, + "loss": 3.3481, + "step": 15872 + }, + { + "epoch": 1.3528509332651497, + "grad_norm": 112.19612214837622, + "learning_rate": 6.695000495098267e-06, + "loss": 3.9224, + "step": 15873 + }, + { + "epoch": 1.352936162959175, + "grad_norm": 38.19617722854426, + "learning_rate": 6.694533999904369e-06, + "loss": 2.3349, + "step": 15874 + }, + { + "epoch": 1.3530213926532004, + "grad_norm": 89.10362076452496, + "learning_rate": 6.694067488045402e-06, + "loss": 3.0376, + "step": 15875 + }, + { + "epoch": 1.3531066223472257, + "grad_norm": 72.15276335302939, + "learning_rate": 6.693600959525952e-06, + "loss": 3.4908, + "step": 15876 + }, + { + "epoch": 1.3531918520412511, + "grad_norm": 116.67111202900443, + "learning_rate": 6.693134414350611e-06, + "loss": 4.0077, + "step": 15877 + }, + { + "epoch": 1.3532770817352766, + "grad_norm": 51.9134215187052, + "learning_rate": 6.692667852523964e-06, + "loss": 3.1215, + "step": 15878 + }, + { + "epoch": 1.353362311429302, + "grad_norm": 36.40996517012984, + "learning_rate": 6.692201274050604e-06, + "loss": 3.0876, + "step": 15879 + }, + { + "epoch": 1.3534475411233273, + "grad_norm": 126.29446857889911, + "learning_rate": 6.691734678935115e-06, + "loss": 3.7627, + "step": 15880 + }, + { + "epoch": 1.3535327708173528, + "grad_norm": 32.98557879335621, + "learning_rate": 6.691268067182088e-06, + "loss": 3.1368, + "step": 15881 + }, + { + "epoch": 1.353618000511378, + "grad_norm": 75.50778939204135, + "learning_rate": 6.690801438796111e-06, + "loss": 3.8737, + "step": 15882 + }, + { + "epoch": 1.3537032302054035, + "grad_norm": 55.12986793742899, + "learning_rate": 6.6903347937817754e-06, + "loss": 3.949, + "step": 15883 + }, + { + "epoch": 1.353788459899429, + "grad_norm": 75.22012178066787, + "learning_rate": 6.689868132143666e-06, + "loss": 3.0456, + "step": 15884 + }, + { + "epoch": 1.3538736895934544, + "grad_norm": 36.49575506943243, + "learning_rate": 6.6894014538863776e-06, + "loss": 2.6795, + "step": 15885 + }, + { + "epoch": 1.3539589192874797, + "grad_norm": 34.22001862503854, + "learning_rate": 6.688934759014496e-06, + "loss": 3.1958, + "step": 15886 + }, + { + "epoch": 1.3540441489815052, + "grad_norm": 69.45786023009411, + "learning_rate": 6.688468047532614e-06, + "loss": 2.8665, + "step": 15887 + }, + { + "epoch": 1.3541293786755306, + "grad_norm": 40.86427468061761, + "learning_rate": 6.688001319445318e-06, + "loss": 2.2557, + "step": 15888 + }, + { + "epoch": 1.3542146083695559, + "grad_norm": 91.57706000786239, + "learning_rate": 6.687534574757199e-06, + "loss": 2.609, + "step": 15889 + }, + { + "epoch": 1.3542998380635813, + "grad_norm": 47.89184930029859, + "learning_rate": 6.687067813472849e-06, + "loss": 2.9687, + "step": 15890 + }, + { + "epoch": 1.3543850677576068, + "grad_norm": 38.9396913309736, + "learning_rate": 6.686601035596858e-06, + "loss": 2.9166, + "step": 15891 + }, + { + "epoch": 1.3544702974516323, + "grad_norm": 24.51319979452031, + "learning_rate": 6.686134241133816e-06, + "loss": 2.0862, + "step": 15892 + }, + { + "epoch": 1.3545555271456575, + "grad_norm": 33.028065209565845, + "learning_rate": 6.6856674300883125e-06, + "loss": 2.6433, + "step": 15893 + }, + { + "epoch": 1.354640756839683, + "grad_norm": 60.878858669729176, + "learning_rate": 6.685200602464941e-06, + "loss": 3.0761, + "step": 15894 + }, + { + "epoch": 1.3547259865337082, + "grad_norm": 53.49694194378709, + "learning_rate": 6.6847337582682905e-06, + "loss": 2.7004, + "step": 15895 + }, + { + "epoch": 1.3548112162277337, + "grad_norm": 79.02345906901431, + "learning_rate": 6.684266897502953e-06, + "loss": 3.3902, + "step": 15896 + }, + { + "epoch": 1.3548964459217592, + "grad_norm": 40.3438895735931, + "learning_rate": 6.683800020173519e-06, + "loss": 2.876, + "step": 15897 + }, + { + "epoch": 1.3549816756157846, + "grad_norm": 103.67795099281167, + "learning_rate": 6.683333126284584e-06, + "loss": 4.6119, + "step": 15898 + }, + { + "epoch": 1.3550669053098099, + "grad_norm": 35.33424165494004, + "learning_rate": 6.6828662158407335e-06, + "loss": 2.7056, + "step": 15899 + }, + { + "epoch": 1.3551521350038354, + "grad_norm": 52.23688612281034, + "learning_rate": 6.6823992888465626e-06, + "loss": 2.955, + "step": 15900 + }, + { + "epoch": 1.3552373646978606, + "grad_norm": 48.11785560993732, + "learning_rate": 6.6819323453066635e-06, + "loss": 3.4516, + "step": 15901 + }, + { + "epoch": 1.355322594391886, + "grad_norm": 65.94140547158203, + "learning_rate": 6.68146538522563e-06, + "loss": 2.3442, + "step": 15902 + }, + { + "epoch": 1.3554078240859115, + "grad_norm": 35.06404549310084, + "learning_rate": 6.6809984086080504e-06, + "loss": 2.7084, + "step": 15903 + }, + { + "epoch": 1.355493053779937, + "grad_norm": 56.67715284142606, + "learning_rate": 6.68053141545852e-06, + "loss": 2.9382, + "step": 15904 + }, + { + "epoch": 1.3555782834739623, + "grad_norm": 31.876255455457777, + "learning_rate": 6.680064405781631e-06, + "loss": 2.8927, + "step": 15905 + }, + { + "epoch": 1.3556635131679877, + "grad_norm": 72.16730635958642, + "learning_rate": 6.679597379581977e-06, + "loss": 3.0944, + "step": 15906 + }, + { + "epoch": 1.3557487428620132, + "grad_norm": 39.21263391747257, + "learning_rate": 6.679130336864149e-06, + "loss": 2.4422, + "step": 15907 + }, + { + "epoch": 1.3558339725560384, + "grad_norm": 57.66792501895965, + "learning_rate": 6.678663277632741e-06, + "loss": 2.6892, + "step": 15908 + }, + { + "epoch": 1.355919202250064, + "grad_norm": 45.54368344023701, + "learning_rate": 6.678196201892348e-06, + "loss": 2.0424, + "step": 15909 + }, + { + "epoch": 1.3560044319440894, + "grad_norm": 35.65100422067462, + "learning_rate": 6.677729109647562e-06, + "loss": 2.9231, + "step": 15910 + }, + { + "epoch": 1.3560896616381148, + "grad_norm": 59.934830194148326, + "learning_rate": 6.677262000902977e-06, + "loss": 3.0053, + "step": 15911 + }, + { + "epoch": 1.35617489133214, + "grad_norm": 62.93281945752633, + "learning_rate": 6.676794875663184e-06, + "loss": 2.5572, + "step": 15912 + }, + { + "epoch": 1.3562601210261656, + "grad_norm": 60.437329807364684, + "learning_rate": 6.676327733932784e-06, + "loss": 3.2607, + "step": 15913 + }, + { + "epoch": 1.3563453507201908, + "grad_norm": 50.29767244285044, + "learning_rate": 6.675860575716364e-06, + "loss": 2.4627, + "step": 15914 + }, + { + "epoch": 1.3564305804142163, + "grad_norm": 46.22946233573321, + "learning_rate": 6.675393401018522e-06, + "loss": 1.7233, + "step": 15915 + }, + { + "epoch": 1.3565158101082417, + "grad_norm": 92.63832415946084, + "learning_rate": 6.6749262098438506e-06, + "loss": 3.1964, + "step": 15916 + }, + { + "epoch": 1.3566010398022672, + "grad_norm": 76.19692573821996, + "learning_rate": 6.6744590021969465e-06, + "loss": 4.3481, + "step": 15917 + }, + { + "epoch": 1.3566862694962925, + "grad_norm": 72.05874198434044, + "learning_rate": 6.673991778082405e-06, + "loss": 4.1068, + "step": 15918 + }, + { + "epoch": 1.356771499190318, + "grad_norm": 79.41151220394768, + "learning_rate": 6.673524537504817e-06, + "loss": 2.9287, + "step": 15919 + }, + { + "epoch": 1.3568567288843434, + "grad_norm": 68.26573656331924, + "learning_rate": 6.673057280468781e-06, + "loss": 2.7008, + "step": 15920 + }, + { + "epoch": 1.3569419585783686, + "grad_norm": 57.02064472027365, + "learning_rate": 6.672590006978892e-06, + "loss": 3.3463, + "step": 15921 + }, + { + "epoch": 1.357027188272394, + "grad_norm": 87.02916960797884, + "learning_rate": 6.672122717039744e-06, + "loss": 3.6226, + "step": 15922 + }, + { + "epoch": 1.3571124179664196, + "grad_norm": 33.511389828198055, + "learning_rate": 6.671655410655935e-06, + "loss": 2.7273, + "step": 15923 + }, + { + "epoch": 1.3571976476604448, + "grad_norm": 77.24544794901998, + "learning_rate": 6.671188087832057e-06, + "loss": 3.2893, + "step": 15924 + }, + { + "epoch": 1.3572828773544703, + "grad_norm": 51.37296071397371, + "learning_rate": 6.67072074857271e-06, + "loss": 2.9802, + "step": 15925 + }, + { + "epoch": 1.3573681070484958, + "grad_norm": 52.136093383354314, + "learning_rate": 6.670253392882489e-06, + "loss": 3.4203, + "step": 15926 + }, + { + "epoch": 1.357453336742521, + "grad_norm": 78.14978021888444, + "learning_rate": 6.669786020765987e-06, + "loss": 3.1408, + "step": 15927 + }, + { + "epoch": 1.3575385664365465, + "grad_norm": 37.05056420812244, + "learning_rate": 6.669318632227803e-06, + "loss": 3.0794, + "step": 15928 + }, + { + "epoch": 1.357623796130572, + "grad_norm": 105.30448739500714, + "learning_rate": 6.668851227272536e-06, + "loss": 4.6063, + "step": 15929 + }, + { + "epoch": 1.3577090258245974, + "grad_norm": 85.97614995361802, + "learning_rate": 6.66838380590478e-06, + "loss": 2.8533, + "step": 15930 + }, + { + "epoch": 1.3577942555186227, + "grad_norm": 56.76792773144177, + "learning_rate": 6.667916368129131e-06, + "loss": 2.3802, + "step": 15931 + }, + { + "epoch": 1.3578794852126481, + "grad_norm": 51.51075718889938, + "learning_rate": 6.667448913950188e-06, + "loss": 3.0936, + "step": 15932 + }, + { + "epoch": 1.3579647149066734, + "grad_norm": 59.94971124294494, + "learning_rate": 6.6669814433725475e-06, + "loss": 2.4729, + "step": 15933 + }, + { + "epoch": 1.3580499446006988, + "grad_norm": 31.469840072893216, + "learning_rate": 6.6665139564008065e-06, + "loss": 2.2359, + "step": 15934 + }, + { + "epoch": 1.3581351742947243, + "grad_norm": 63.62568901011192, + "learning_rate": 6.666046453039563e-06, + "loss": 2.5581, + "step": 15935 + }, + { + "epoch": 1.3582204039887498, + "grad_norm": 53.395947300220804, + "learning_rate": 6.665578933293414e-06, + "loss": 2.7953, + "step": 15936 + }, + { + "epoch": 1.358305633682775, + "grad_norm": 58.43928098999771, + "learning_rate": 6.66511139716696e-06, + "loss": 2.7304, + "step": 15937 + }, + { + "epoch": 1.3583908633768005, + "grad_norm": 69.2556238306825, + "learning_rate": 6.664643844664795e-06, + "loss": 3.3458, + "step": 15938 + }, + { + "epoch": 1.358476093070826, + "grad_norm": 35.26812120804835, + "learning_rate": 6.664176275791521e-06, + "loss": 3.1385, + "step": 15939 + }, + { + "epoch": 1.3585613227648512, + "grad_norm": 40.48995090533746, + "learning_rate": 6.663708690551734e-06, + "loss": 2.6778, + "step": 15940 + }, + { + "epoch": 1.3586465524588767, + "grad_norm": 42.651345816012615, + "learning_rate": 6.663241088950032e-06, + "loss": 2.6369, + "step": 15941 + }, + { + "epoch": 1.3587317821529021, + "grad_norm": 42.52029086832984, + "learning_rate": 6.6627734709910155e-06, + "loss": 3.1332, + "step": 15942 + }, + { + "epoch": 1.3588170118469276, + "grad_norm": 54.95064107978863, + "learning_rate": 6.6623058366792824e-06, + "loss": 3.7094, + "step": 15943 + }, + { + "epoch": 1.3589022415409528, + "grad_norm": 51.90811845031321, + "learning_rate": 6.661838186019432e-06, + "loss": 3.4753, + "step": 15944 + }, + { + "epoch": 1.3589874712349783, + "grad_norm": 71.42831578480495, + "learning_rate": 6.661370519016065e-06, + "loss": 3.2152, + "step": 15945 + }, + { + "epoch": 1.3590727009290036, + "grad_norm": 60.605380972140466, + "learning_rate": 6.660902835673777e-06, + "loss": 2.8463, + "step": 15946 + }, + { + "epoch": 1.359157930623029, + "grad_norm": 42.84347084042681, + "learning_rate": 6.660435135997169e-06, + "loss": 2.1026, + "step": 15947 + }, + { + "epoch": 1.3592431603170545, + "grad_norm": 34.614322560895836, + "learning_rate": 6.659967419990845e-06, + "loss": 2.9581, + "step": 15948 + }, + { + "epoch": 1.35932839001108, + "grad_norm": 29.348079450212563, + "learning_rate": 6.659499687659399e-06, + "loss": 2.1914, + "step": 15949 + }, + { + "epoch": 1.3594136197051052, + "grad_norm": 83.02489707886137, + "learning_rate": 6.659031939007432e-06, + "loss": 3.2179, + "step": 15950 + }, + { + "epoch": 1.3594988493991307, + "grad_norm": 54.79403475606626, + "learning_rate": 6.6585641740395445e-06, + "loss": 3.8382, + "step": 15951 + }, + { + "epoch": 1.359584079093156, + "grad_norm": 85.75896055011373, + "learning_rate": 6.65809639276034e-06, + "loss": 3.5631, + "step": 15952 + }, + { + "epoch": 1.3596693087871814, + "grad_norm": 40.65984586071841, + "learning_rate": 6.657628595174414e-06, + "loss": 2.3306, + "step": 15953 + }, + { + "epoch": 1.3597545384812069, + "grad_norm": 37.24100295563924, + "learning_rate": 6.657160781286371e-06, + "loss": 2.938, + "step": 15954 + }, + { + "epoch": 1.3598397681752323, + "grad_norm": 30.745074457894916, + "learning_rate": 6.656692951100809e-06, + "loss": 2.3443, + "step": 15955 + }, + { + "epoch": 1.3599249978692576, + "grad_norm": 35.710501289994475, + "learning_rate": 6.6562251046223315e-06, + "loss": 3.2633, + "step": 15956 + }, + { + "epoch": 1.360010227563283, + "grad_norm": 26.88086974345571, + "learning_rate": 6.655757241855537e-06, + "loss": 2.26, + "step": 15957 + }, + { + "epoch": 1.3600954572573085, + "grad_norm": 39.92674557590225, + "learning_rate": 6.65528936280503e-06, + "loss": 2.6724, + "step": 15958 + }, + { + "epoch": 1.3601806869513338, + "grad_norm": 88.01304746541591, + "learning_rate": 6.654821467475406e-06, + "loss": 3.707, + "step": 15959 + }, + { + "epoch": 1.3602659166453592, + "grad_norm": 47.11802004241243, + "learning_rate": 6.654353555871274e-06, + "loss": 2.277, + "step": 15960 + }, + { + "epoch": 1.3603511463393847, + "grad_norm": 59.591736033464585, + "learning_rate": 6.653885627997231e-06, + "loss": 3.1986, + "step": 15961 + }, + { + "epoch": 1.3604363760334102, + "grad_norm": 147.0053289912082, + "learning_rate": 6.6534176838578805e-06, + "loss": 3.6001, + "step": 15962 + }, + { + "epoch": 1.3605216057274354, + "grad_norm": 29.62676202566129, + "learning_rate": 6.652949723457823e-06, + "loss": 2.2551, + "step": 15963 + }, + { + "epoch": 1.3606068354214609, + "grad_norm": 80.79815617522631, + "learning_rate": 6.652481746801662e-06, + "loss": 1.6987, + "step": 15964 + }, + { + "epoch": 1.3606920651154861, + "grad_norm": 40.07636478120509, + "learning_rate": 6.652013753894001e-06, + "loss": 4.365, + "step": 15965 + }, + { + "epoch": 1.3607772948095116, + "grad_norm": 88.26229722675657, + "learning_rate": 6.651545744739439e-06, + "loss": 4.3289, + "step": 15966 + }, + { + "epoch": 1.360862524503537, + "grad_norm": 66.73103556886664, + "learning_rate": 6.651077719342582e-06, + "loss": 3.5382, + "step": 15967 + }, + { + "epoch": 1.3609477541975625, + "grad_norm": 48.84274085797591, + "learning_rate": 6.650609677708032e-06, + "loss": 2.438, + "step": 15968 + }, + { + "epoch": 1.3610329838915878, + "grad_norm": 33.17924668346855, + "learning_rate": 6.650141619840393e-06, + "loss": 2.4548, + "step": 15969 + }, + { + "epoch": 1.3611182135856132, + "grad_norm": 28.44549689464424, + "learning_rate": 6.649673545744264e-06, + "loss": 3.1931, + "step": 15970 + }, + { + "epoch": 1.3612034432796385, + "grad_norm": 20.635099002262606, + "learning_rate": 6.649205455424252e-06, + "loss": 1.7781, + "step": 15971 + }, + { + "epoch": 1.361288672973664, + "grad_norm": 101.64802861907802, + "learning_rate": 6.6487373488849605e-06, + "loss": 4.4475, + "step": 15972 + }, + { + "epoch": 1.3613739026676894, + "grad_norm": 97.53761399982437, + "learning_rate": 6.648269226130992e-06, + "loss": 3.211, + "step": 15973 + }, + { + "epoch": 1.361459132361715, + "grad_norm": 92.4070199562341, + "learning_rate": 6.6478010871669505e-06, + "loss": 3.8853, + "step": 15974 + }, + { + "epoch": 1.3615443620557401, + "grad_norm": 56.01467276983873, + "learning_rate": 6.64733293199744e-06, + "loss": 4.2876, + "step": 15975 + }, + { + "epoch": 1.3616295917497656, + "grad_norm": 97.45792877662969, + "learning_rate": 6.646864760627064e-06, + "loss": 2.5059, + "step": 15976 + }, + { + "epoch": 1.361714821443791, + "grad_norm": 56.33235052165729, + "learning_rate": 6.64639657306043e-06, + "loss": 2.8926, + "step": 15977 + }, + { + "epoch": 1.3618000511378163, + "grad_norm": 32.482818797630095, + "learning_rate": 6.645928369302137e-06, + "loss": 2.9162, + "step": 15978 + }, + { + "epoch": 1.3618852808318418, + "grad_norm": 30.983612377176428, + "learning_rate": 6.6454601493567935e-06, + "loss": 1.8628, + "step": 15979 + }, + { + "epoch": 1.3619705105258673, + "grad_norm": 33.01273002189737, + "learning_rate": 6.644991913229005e-06, + "loss": 2.5944, + "step": 15980 + }, + { + "epoch": 1.3620557402198927, + "grad_norm": 68.79121326652586, + "learning_rate": 6.644523660923373e-06, + "loss": 2.8865, + "step": 15981 + }, + { + "epoch": 1.362140969913918, + "grad_norm": 34.55417168094699, + "learning_rate": 6.644055392444504e-06, + "loss": 2.2846, + "step": 15982 + }, + { + "epoch": 1.3622261996079434, + "grad_norm": 72.22873992424603, + "learning_rate": 6.643587107797004e-06, + "loss": 3.2073, + "step": 15983 + }, + { + "epoch": 1.3623114293019687, + "grad_norm": 91.95485950202219, + "learning_rate": 6.643118806985478e-06, + "loss": 4.3991, + "step": 15984 + }, + { + "epoch": 1.3623966589959942, + "grad_norm": 63.51660418729578, + "learning_rate": 6.642650490014531e-06, + "loss": 3.6504, + "step": 15985 + }, + { + "epoch": 1.3624818886900196, + "grad_norm": 31.819399184637657, + "learning_rate": 6.64218215688877e-06, + "loss": 2.1192, + "step": 15986 + }, + { + "epoch": 1.362567118384045, + "grad_norm": 144.64887146948192, + "learning_rate": 6.6417138076128e-06, + "loss": 4.232, + "step": 15987 + }, + { + "epoch": 1.3626523480780703, + "grad_norm": 31.185669533671074, + "learning_rate": 6.6412454421912265e-06, + "loss": 2.2204, + "step": 15988 + }, + { + "epoch": 1.3627375777720958, + "grad_norm": 93.81710006429488, + "learning_rate": 6.640777060628656e-06, + "loss": 4.4221, + "step": 15989 + }, + { + "epoch": 1.3628228074661213, + "grad_norm": 29.570742669335285, + "learning_rate": 6.640308662929695e-06, + "loss": 2.025, + "step": 15990 + }, + { + "epoch": 1.3629080371601465, + "grad_norm": 44.55257655177416, + "learning_rate": 6.639840249098951e-06, + "loss": 4.045, + "step": 15991 + }, + { + "epoch": 1.362993266854172, + "grad_norm": 39.73286976268518, + "learning_rate": 6.639371819141029e-06, + "loss": 2.7156, + "step": 15992 + }, + { + "epoch": 1.3630784965481975, + "grad_norm": 105.2327456938333, + "learning_rate": 6.638903373060538e-06, + "loss": 4.0999, + "step": 15993 + }, + { + "epoch": 1.363163726242223, + "grad_norm": 24.355467468395062, + "learning_rate": 6.638434910862082e-06, + "loss": 2.5779, + "step": 15994 + }, + { + "epoch": 1.3632489559362482, + "grad_norm": 57.06880765566659, + "learning_rate": 6.63796643255027e-06, + "loss": 3.1414, + "step": 15995 + }, + { + "epoch": 1.3633341856302736, + "grad_norm": 64.67849797261451, + "learning_rate": 6.637497938129709e-06, + "loss": 3.1727, + "step": 15996 + }, + { + "epoch": 1.3634194153242989, + "grad_norm": 68.1782850374515, + "learning_rate": 6.637029427605007e-06, + "loss": 3.3836, + "step": 15997 + }, + { + "epoch": 1.3635046450183244, + "grad_norm": 38.733409208195496, + "learning_rate": 6.6365609009807694e-06, + "loss": 2.6731, + "step": 15998 + }, + { + "epoch": 1.3635898747123498, + "grad_norm": 37.0733287146263, + "learning_rate": 6.636092358261607e-06, + "loss": 2.9551, + "step": 15999 + }, + { + "epoch": 1.3636751044063753, + "grad_norm": 41.247563441429136, + "learning_rate": 6.635623799452127e-06, + "loss": 2.3018, + "step": 16000 + }, + { + "epoch": 1.3637603341004005, + "grad_norm": 32.90156923080884, + "learning_rate": 6.635155224556936e-06, + "loss": 3.119, + "step": 16001 + }, + { + "epoch": 1.363845563794426, + "grad_norm": 42.45934742127213, + "learning_rate": 6.634686633580642e-06, + "loss": 2.555, + "step": 16002 + }, + { + "epoch": 1.3639307934884513, + "grad_norm": 100.7958427394308, + "learning_rate": 6.6342180265278564e-06, + "loss": 4.4363, + "step": 16003 + }, + { + "epoch": 1.3640160231824767, + "grad_norm": 57.99750930640502, + "learning_rate": 6.6337494034031854e-06, + "loss": 2.634, + "step": 16004 + }, + { + "epoch": 1.3641012528765022, + "grad_norm": 84.54277741859781, + "learning_rate": 6.633280764211239e-06, + "loss": 3.0252, + "step": 16005 + }, + { + "epoch": 1.3641864825705277, + "grad_norm": 81.52974515283985, + "learning_rate": 6.6328121089566235e-06, + "loss": 2.6223, + "step": 16006 + }, + { + "epoch": 1.364271712264553, + "grad_norm": 58.65704540329262, + "learning_rate": 6.632343437643951e-06, + "loss": 2.5174, + "step": 16007 + }, + { + "epoch": 1.3643569419585784, + "grad_norm": 92.76850892015864, + "learning_rate": 6.6318747502778294e-06, + "loss": 3.2391, + "step": 16008 + }, + { + "epoch": 1.3644421716526038, + "grad_norm": 29.4458260970646, + "learning_rate": 6.631406046862867e-06, + "loss": 2.8904, + "step": 16009 + }, + { + "epoch": 1.364527401346629, + "grad_norm": 36.474123285907176, + "learning_rate": 6.630937327403675e-06, + "loss": 3.0024, + "step": 16010 + }, + { + "epoch": 1.3646126310406546, + "grad_norm": 34.009162933738764, + "learning_rate": 6.630468591904862e-06, + "loss": 2.8987, + "step": 16011 + }, + { + "epoch": 1.36469786073468, + "grad_norm": 39.11779138243242, + "learning_rate": 6.62999984037104e-06, + "loss": 3.4028, + "step": 16012 + }, + { + "epoch": 1.3647830904287055, + "grad_norm": 29.146243692191398, + "learning_rate": 6.629531072806815e-06, + "loss": 2.2286, + "step": 16013 + }, + { + "epoch": 1.3648683201227307, + "grad_norm": 44.3983729182196, + "learning_rate": 6.629062289216801e-06, + "loss": 2.6292, + "step": 16014 + }, + { + "epoch": 1.3649535498167562, + "grad_norm": 70.9374631167431, + "learning_rate": 6.628593489605606e-06, + "loss": 2.853, + "step": 16015 + }, + { + "epoch": 1.3650387795107815, + "grad_norm": 25.632549167079432, + "learning_rate": 6.628124673977841e-06, + "loss": 1.9414, + "step": 16016 + }, + { + "epoch": 1.365124009204807, + "grad_norm": 43.801218859522386, + "learning_rate": 6.6276558423381175e-06, + "loss": 3.1757, + "step": 16017 + }, + { + "epoch": 1.3652092388988324, + "grad_norm": 61.11474718451714, + "learning_rate": 6.627186994691045e-06, + "loss": 3.1648, + "step": 16018 + }, + { + "epoch": 1.3652944685928579, + "grad_norm": 63.56092580141525, + "learning_rate": 6.626718131041235e-06, + "loss": 2.9351, + "step": 16019 + }, + { + "epoch": 1.365379698286883, + "grad_norm": 41.44856012190528, + "learning_rate": 6.6262492513933e-06, + "loss": 2.5762, + "step": 16020 + }, + { + "epoch": 1.3654649279809086, + "grad_norm": 66.37191816865808, + "learning_rate": 6.625780355751848e-06, + "loss": 2.8269, + "step": 16021 + }, + { + "epoch": 1.3655501576749338, + "grad_norm": 49.6244342186447, + "learning_rate": 6.625311444121492e-06, + "loss": 2.9287, + "step": 16022 + }, + { + "epoch": 1.3656353873689593, + "grad_norm": 60.2753356140828, + "learning_rate": 6.624842516506845e-06, + "loss": 2.8595, + "step": 16023 + }, + { + "epoch": 1.3657206170629848, + "grad_norm": 74.75430066484466, + "learning_rate": 6.624373572912517e-06, + "loss": 3.2966, + "step": 16024 + }, + { + "epoch": 1.3658058467570102, + "grad_norm": 44.13014295533509, + "learning_rate": 6.62390461334312e-06, + "loss": 2.885, + "step": 16025 + }, + { + "epoch": 1.3658910764510355, + "grad_norm": 96.05177295048216, + "learning_rate": 6.623435637803267e-06, + "loss": 4.4134, + "step": 16026 + }, + { + "epoch": 1.365976306145061, + "grad_norm": 82.52669302518342, + "learning_rate": 6.622966646297571e-06, + "loss": 2.6924, + "step": 16027 + }, + { + "epoch": 1.3660615358390864, + "grad_norm": 20.29101650343206, + "learning_rate": 6.622497638830641e-06, + "loss": 1.4357, + "step": 16028 + }, + { + "epoch": 1.3661467655331117, + "grad_norm": 33.63944733356298, + "learning_rate": 6.62202861540709e-06, + "loss": 2.4972, + "step": 16029 + }, + { + "epoch": 1.3662319952271371, + "grad_norm": 54.63441281745279, + "learning_rate": 6.621559576031534e-06, + "loss": 3.4858, + "step": 16030 + }, + { + "epoch": 1.3663172249211626, + "grad_norm": 35.295153450390764, + "learning_rate": 6.621090520708585e-06, + "loss": 2.9534, + "step": 16031 + }, + { + "epoch": 1.366402454615188, + "grad_norm": 56.69309747851257, + "learning_rate": 6.620621449442853e-06, + "loss": 3.1829, + "step": 16032 + }, + { + "epoch": 1.3664876843092133, + "grad_norm": 58.948816376528185, + "learning_rate": 6.620152362238954e-06, + "loss": 3.7853, + "step": 16033 + }, + { + "epoch": 1.3665729140032388, + "grad_norm": 29.53007951339612, + "learning_rate": 6.619683259101499e-06, + "loss": 1.7649, + "step": 16034 + }, + { + "epoch": 1.366658143697264, + "grad_norm": 35.01498302506649, + "learning_rate": 6.619214140035105e-06, + "loss": 3.534, + "step": 16035 + }, + { + "epoch": 1.3667433733912895, + "grad_norm": 36.62661614559853, + "learning_rate": 6.618745005044382e-06, + "loss": 3.097, + "step": 16036 + }, + { + "epoch": 1.366828603085315, + "grad_norm": 31.68451559046457, + "learning_rate": 6.618275854133944e-06, + "loss": 2.7723, + "step": 16037 + }, + { + "epoch": 1.3669138327793404, + "grad_norm": 52.049798314655085, + "learning_rate": 6.617806687308409e-06, + "loss": 2.9145, + "step": 16038 + }, + { + "epoch": 1.3669990624733657, + "grad_norm": 87.53693316108307, + "learning_rate": 6.617337504572386e-06, + "loss": 3.1482, + "step": 16039 + }, + { + "epoch": 1.3670842921673911, + "grad_norm": 70.11723000631842, + "learning_rate": 6.616868305930493e-06, + "loss": 3.3004, + "step": 16040 + }, + { + "epoch": 1.3671695218614164, + "grad_norm": 80.68354602711717, + "learning_rate": 6.61639909138734e-06, + "loss": 2.8205, + "step": 16041 + }, + { + "epoch": 1.3672547515554418, + "grad_norm": 103.71785859830018, + "learning_rate": 6.615929860947545e-06, + "loss": 2.1455, + "step": 16042 + }, + { + "epoch": 1.3673399812494673, + "grad_norm": 71.6611672967056, + "learning_rate": 6.615460614615723e-06, + "loss": 2.4927, + "step": 16043 + }, + { + "epoch": 1.3674252109434928, + "grad_norm": 44.92045221072844, + "learning_rate": 6.614991352396488e-06, + "loss": 2.629, + "step": 16044 + }, + { + "epoch": 1.367510440637518, + "grad_norm": 30.155010343367824, + "learning_rate": 6.6145220742944526e-06, + "loss": 2.1244, + "step": 16045 + }, + { + "epoch": 1.3675956703315435, + "grad_norm": 41.13601700877465, + "learning_rate": 6.614052780314237e-06, + "loss": 2.6912, + "step": 16046 + }, + { + "epoch": 1.367680900025569, + "grad_norm": 30.889962627946346, + "learning_rate": 6.613583470460451e-06, + "loss": 2.1786, + "step": 16047 + }, + { + "epoch": 1.3677661297195942, + "grad_norm": 49.514682086704816, + "learning_rate": 6.613114144737716e-06, + "loss": 3.2308, + "step": 16048 + }, + { + "epoch": 1.3678513594136197, + "grad_norm": 36.990168983770204, + "learning_rate": 6.612644803150641e-06, + "loss": 2.9816, + "step": 16049 + }, + { + "epoch": 1.3679365891076452, + "grad_norm": 72.53750288605505, + "learning_rate": 6.612175445703847e-06, + "loss": 3.4097, + "step": 16050 + }, + { + "epoch": 1.3680218188016706, + "grad_norm": 57.30881880760603, + "learning_rate": 6.611706072401947e-06, + "loss": 3.4114, + "step": 16051 + }, + { + "epoch": 1.3681070484956959, + "grad_norm": 77.21935049296027, + "learning_rate": 6.611236683249559e-06, + "loss": 3.58, + "step": 16052 + }, + { + "epoch": 1.3681922781897213, + "grad_norm": 61.233256895612854, + "learning_rate": 6.610767278251298e-06, + "loss": 1.8856, + "step": 16053 + }, + { + "epoch": 1.3682775078837466, + "grad_norm": 85.77482790049534, + "learning_rate": 6.610297857411782e-06, + "loss": 3.1576, + "step": 16054 + }, + { + "epoch": 1.368362737577772, + "grad_norm": 73.67585971449479, + "learning_rate": 6.609828420735625e-06, + "loss": 3.2773, + "step": 16055 + }, + { + "epoch": 1.3684479672717975, + "grad_norm": 47.3864281921581, + "learning_rate": 6.609358968227446e-06, + "loss": 3.2997, + "step": 16056 + }, + { + "epoch": 1.368533196965823, + "grad_norm": 53.28038562998054, + "learning_rate": 6.608889499891861e-06, + "loss": 2.11, + "step": 16057 + }, + { + "epoch": 1.3686184266598482, + "grad_norm": 40.52736735056909, + "learning_rate": 6.608420015733487e-06, + "loss": 2.0983, + "step": 16058 + }, + { + "epoch": 1.3687036563538737, + "grad_norm": 22.088389877016557, + "learning_rate": 6.607950515756941e-06, + "loss": 1.4761, + "step": 16059 + }, + { + "epoch": 1.3687888860478992, + "grad_norm": 58.32287054812488, + "learning_rate": 6.607480999966841e-06, + "loss": 3.2317, + "step": 16060 + }, + { + "epoch": 1.3688741157419244, + "grad_norm": 33.575752167789496, + "learning_rate": 6.607011468367803e-06, + "loss": 3.1853, + "step": 16061 + }, + { + "epoch": 1.3689593454359499, + "grad_norm": 53.77275457322099, + "learning_rate": 6.606541920964447e-06, + "loss": 3.0293, + "step": 16062 + }, + { + "epoch": 1.3690445751299753, + "grad_norm": 41.24998157553209, + "learning_rate": 6.606072357761391e-06, + "loss": 2.6476, + "step": 16063 + }, + { + "epoch": 1.3691298048240008, + "grad_norm": 51.36210985766429, + "learning_rate": 6.605602778763248e-06, + "loss": 3.2249, + "step": 16064 + }, + { + "epoch": 1.369215034518026, + "grad_norm": 75.00970357435513, + "learning_rate": 6.605133183974641e-06, + "loss": 2.7034, + "step": 16065 + }, + { + "epoch": 1.3693002642120515, + "grad_norm": 120.83976826448053, + "learning_rate": 6.604663573400189e-06, + "loss": 1.9884, + "step": 16066 + }, + { + "epoch": 1.3693854939060768, + "grad_norm": 65.47918359666411, + "learning_rate": 6.604193947044506e-06, + "loss": 3.7058, + "step": 16067 + }, + { + "epoch": 1.3694707236001022, + "grad_norm": 56.88158861891455, + "learning_rate": 6.603724304912214e-06, + "loss": 2.4078, + "step": 16068 + }, + { + "epoch": 1.3695559532941277, + "grad_norm": 53.82826095813469, + "learning_rate": 6.60325464700793e-06, + "loss": 3.4277, + "step": 16069 + }, + { + "epoch": 1.3696411829881532, + "grad_norm": 47.53070509686254, + "learning_rate": 6.602784973336275e-06, + "loss": 2.5841, + "step": 16070 + }, + { + "epoch": 1.3697264126821784, + "grad_norm": 73.75380741283143, + "learning_rate": 6.6023152839018656e-06, + "loss": 3.0612, + "step": 16071 + }, + { + "epoch": 1.369811642376204, + "grad_norm": 25.64590268029255, + "learning_rate": 6.6018455787093215e-06, + "loss": 2.1544, + "step": 16072 + }, + { + "epoch": 1.3698968720702291, + "grad_norm": 79.38509715608643, + "learning_rate": 6.601375857763264e-06, + "loss": 2.7784, + "step": 16073 + }, + { + "epoch": 1.3699821017642546, + "grad_norm": 81.86870902508856, + "learning_rate": 6.600906121068311e-06, + "loss": 2.1217, + "step": 16074 + }, + { + "epoch": 1.37006733145828, + "grad_norm": 41.02391834533624, + "learning_rate": 6.600436368629081e-06, + "loss": 3.0084, + "step": 16075 + }, + { + "epoch": 1.3701525611523055, + "grad_norm": 59.70418612151222, + "learning_rate": 6.5999666004501975e-06, + "loss": 3.4471, + "step": 16076 + }, + { + "epoch": 1.3702377908463308, + "grad_norm": 42.09297690543341, + "learning_rate": 6.599496816536277e-06, + "loss": 4.1035, + "step": 16077 + }, + { + "epoch": 1.3703230205403563, + "grad_norm": 43.421680878092054, + "learning_rate": 6.599027016891941e-06, + "loss": 2.2346, + "step": 16078 + }, + { + "epoch": 1.3704082502343817, + "grad_norm": 39.76780950010123, + "learning_rate": 6.598557201521811e-06, + "loss": 3.0892, + "step": 16079 + }, + { + "epoch": 1.370493479928407, + "grad_norm": 51.23348279939833, + "learning_rate": 6.598087370430504e-06, + "loss": 3.623, + "step": 16080 + }, + { + "epoch": 1.3705787096224324, + "grad_norm": 58.02779355884733, + "learning_rate": 6.597617523622643e-06, + "loss": 2.3722, + "step": 16081 + }, + { + "epoch": 1.370663939316458, + "grad_norm": 121.94404917917102, + "learning_rate": 6.597147661102849e-06, + "loss": 3.3565, + "step": 16082 + }, + { + "epoch": 1.3707491690104834, + "grad_norm": 37.13690402617336, + "learning_rate": 6.596677782875743e-06, + "loss": 3.4046, + "step": 16083 + }, + { + "epoch": 1.3708343987045086, + "grad_norm": 49.72990694958297, + "learning_rate": 6.596207888945945e-06, + "loss": 3.2239, + "step": 16084 + }, + { + "epoch": 1.370919628398534, + "grad_norm": 171.14561218556483, + "learning_rate": 6.595737979318076e-06, + "loss": 2.4736, + "step": 16085 + }, + { + "epoch": 1.3710048580925593, + "grad_norm": 38.081311846354, + "learning_rate": 6.59526805399676e-06, + "loss": 2.6942, + "step": 16086 + }, + { + "epoch": 1.3710900877865848, + "grad_norm": 42.41831048745392, + "learning_rate": 6.594798112986615e-06, + "loss": 3.7998, + "step": 16087 + }, + { + "epoch": 1.3711753174806103, + "grad_norm": 40.506751278067625, + "learning_rate": 6.594328156292263e-06, + "loss": 2.9518, + "step": 16088 + }, + { + "epoch": 1.3712605471746357, + "grad_norm": 65.86627954172756, + "learning_rate": 6.593858183918329e-06, + "loss": 3.3167, + "step": 16089 + }, + { + "epoch": 1.371345776868661, + "grad_norm": 60.87176314050362, + "learning_rate": 6.5933881958694335e-06, + "loss": 3.6148, + "step": 16090 + }, + { + "epoch": 1.3714310065626865, + "grad_norm": 50.03037672644647, + "learning_rate": 6.5929181921501975e-06, + "loss": 2.8416, + "step": 16091 + }, + { + "epoch": 1.3715162362567117, + "grad_norm": 38.6198623459814, + "learning_rate": 6.592448172765243e-06, + "loss": 3.3455, + "step": 16092 + }, + { + "epoch": 1.3716014659507372, + "grad_norm": 89.12662532571183, + "learning_rate": 6.591978137719195e-06, + "loss": 3.0855, + "step": 16093 + }, + { + "epoch": 1.3716866956447626, + "grad_norm": 35.05688209738479, + "learning_rate": 6.591508087016674e-06, + "loss": 2.2448, + "step": 16094 + }, + { + "epoch": 1.3717719253387881, + "grad_norm": 49.33159635828909, + "learning_rate": 6.591038020662304e-06, + "loss": 2.4303, + "step": 16095 + }, + { + "epoch": 1.3718571550328134, + "grad_norm": 35.57159796176849, + "learning_rate": 6.5905679386607065e-06, + "loss": 2.477, + "step": 16096 + }, + { + "epoch": 1.3719423847268388, + "grad_norm": 78.50387073519325, + "learning_rate": 6.590097841016505e-06, + "loss": 3.4692, + "step": 16097 + }, + { + "epoch": 1.3720276144208643, + "grad_norm": 100.28019756283467, + "learning_rate": 6.589627727734325e-06, + "loss": 3.1127, + "step": 16098 + }, + { + "epoch": 1.3721128441148895, + "grad_norm": 19.133304593392445, + "learning_rate": 6.589157598818786e-06, + "loss": 1.826, + "step": 16099 + }, + { + "epoch": 1.372198073808915, + "grad_norm": 42.12644911177758, + "learning_rate": 6.588687454274514e-06, + "loss": 2.9025, + "step": 16100 + }, + { + "epoch": 1.3722833035029405, + "grad_norm": 48.40327616807008, + "learning_rate": 6.588217294106132e-06, + "loss": 2.684, + "step": 16101 + }, + { + "epoch": 1.372368533196966, + "grad_norm": 72.17801992019642, + "learning_rate": 6.587747118318265e-06, + "loss": 3.2276, + "step": 16102 + }, + { + "epoch": 1.3724537628909912, + "grad_norm": 53.96642475464153, + "learning_rate": 6.587276926915536e-06, + "loss": 3.1712, + "step": 16103 + }, + { + "epoch": 1.3725389925850167, + "grad_norm": 58.411922929999335, + "learning_rate": 6.5868067199025675e-06, + "loss": 2.5562, + "step": 16104 + }, + { + "epoch": 1.372624222279042, + "grad_norm": 110.09783750165244, + "learning_rate": 6.586336497283986e-06, + "loss": 4.1478, + "step": 16105 + }, + { + "epoch": 1.3727094519730674, + "grad_norm": 141.52383464790321, + "learning_rate": 6.585866259064417e-06, + "loss": 4.6868, + "step": 16106 + }, + { + "epoch": 1.3727946816670928, + "grad_norm": 38.2932649976005, + "learning_rate": 6.5853960052484825e-06, + "loss": 3.1605, + "step": 16107 + }, + { + "epoch": 1.3728799113611183, + "grad_norm": 43.951323404452715, + "learning_rate": 6.584925735840807e-06, + "loss": 2.9667, + "step": 16108 + }, + { + "epoch": 1.3729651410551436, + "grad_norm": 47.51260399623843, + "learning_rate": 6.584455450846019e-06, + "loss": 2.7138, + "step": 16109 + }, + { + "epoch": 1.373050370749169, + "grad_norm": 51.020930436380524, + "learning_rate": 6.583985150268739e-06, + "loss": 3.3639, + "step": 16110 + }, + { + "epoch": 1.3731356004431943, + "grad_norm": 48.76161887749791, + "learning_rate": 6.5835148341135956e-06, + "loss": 3.2857, + "step": 16111 + }, + { + "epoch": 1.3732208301372197, + "grad_norm": 32.31122673957512, + "learning_rate": 6.583044502385213e-06, + "loss": 2.6396, + "step": 16112 + }, + { + "epoch": 1.3733060598312452, + "grad_norm": 56.6027376587133, + "learning_rate": 6.5825741550882175e-06, + "loss": 2.9, + "step": 16113 + }, + { + "epoch": 1.3733912895252707, + "grad_norm": 59.95630330664357, + "learning_rate": 6.582103792227232e-06, + "loss": 3.04, + "step": 16114 + }, + { + "epoch": 1.373476519219296, + "grad_norm": 38.27682530306998, + "learning_rate": 6.581633413806886e-06, + "loss": 2.9273, + "step": 16115 + }, + { + "epoch": 1.3735617489133214, + "grad_norm": 64.67615211426097, + "learning_rate": 6.581163019831802e-06, + "loss": 3.1111, + "step": 16116 + }, + { + "epoch": 1.3736469786073469, + "grad_norm": 33.26914682166123, + "learning_rate": 6.580692610306611e-06, + "loss": 3.0285, + "step": 16117 + }, + { + "epoch": 1.373732208301372, + "grad_norm": 14.681508166197586, + "learning_rate": 6.580222185235934e-06, + "loss": 0.9434, + "step": 16118 + }, + { + "epoch": 1.3738174379953976, + "grad_norm": 54.16826622053759, + "learning_rate": 6.579751744624401e-06, + "loss": 3.018, + "step": 16119 + }, + { + "epoch": 1.373902667689423, + "grad_norm": 42.00431403843613, + "learning_rate": 6.579281288476635e-06, + "loss": 2.4676, + "step": 16120 + }, + { + "epoch": 1.3739878973834485, + "grad_norm": 89.89951692151256, + "learning_rate": 6.578810816797269e-06, + "loss": 3.7809, + "step": 16121 + }, + { + "epoch": 1.3740731270774738, + "grad_norm": 38.38634843317379, + "learning_rate": 6.5783403295909235e-06, + "loss": 3.0498, + "step": 16122 + }, + { + "epoch": 1.3741583567714992, + "grad_norm": 49.87656011530803, + "learning_rate": 6.577869826862228e-06, + "loss": 2.8505, + "step": 16123 + }, + { + "epoch": 1.3742435864655245, + "grad_norm": 48.1790552734288, + "learning_rate": 6.57739930861581e-06, + "loss": 2.6052, + "step": 16124 + }, + { + "epoch": 1.37432881615955, + "grad_norm": 88.67121002729601, + "learning_rate": 6.576928774856297e-06, + "loss": 3.963, + "step": 16125 + }, + { + "epoch": 1.3744140458535754, + "grad_norm": 40.4141335107344, + "learning_rate": 6.576458225588316e-06, + "loss": 3.1161, + "step": 16126 + }, + { + "epoch": 1.3744992755476009, + "grad_norm": 73.06554773428141, + "learning_rate": 6.575987660816493e-06, + "loss": 4.4409, + "step": 16127 + }, + { + "epoch": 1.3745845052416261, + "grad_norm": 72.57199200620569, + "learning_rate": 6.5755170805454605e-06, + "loss": 3.6547, + "step": 16128 + }, + { + "epoch": 1.3746697349356516, + "grad_norm": 50.77187141123533, + "learning_rate": 6.5750464847798414e-06, + "loss": 3.028, + "step": 16129 + }, + { + "epoch": 1.374754964629677, + "grad_norm": 37.378604057895245, + "learning_rate": 6.574575873524268e-06, + "loss": 3.0604, + "step": 16130 + }, + { + "epoch": 1.3748401943237023, + "grad_norm": 79.50872627177795, + "learning_rate": 6.574105246783364e-06, + "loss": 2.9159, + "step": 16131 + }, + { + "epoch": 1.3749254240177278, + "grad_norm": 47.43785245530919, + "learning_rate": 6.573634604561763e-06, + "loss": 1.6112, + "step": 16132 + }, + { + "epoch": 1.3750106537117532, + "grad_norm": 32.00848938966999, + "learning_rate": 6.573163946864088e-06, + "loss": 2.9603, + "step": 16133 + }, + { + "epoch": 1.3750958834057787, + "grad_norm": 24.882455945890346, + "learning_rate": 6.572693273694973e-06, + "loss": 1.826, + "step": 16134 + }, + { + "epoch": 1.375181113099804, + "grad_norm": 48.84561771721168, + "learning_rate": 6.572222585059044e-06, + "loss": 2.9135, + "step": 16135 + }, + { + "epoch": 1.3752663427938294, + "grad_norm": 54.14236003916956, + "learning_rate": 6.57175188096093e-06, + "loss": 2.885, + "step": 16136 + }, + { + "epoch": 1.3753515724878547, + "grad_norm": 30.553764129337832, + "learning_rate": 6.571281161405262e-06, + "loss": 2.7328, + "step": 16137 + }, + { + "epoch": 1.3754368021818801, + "grad_norm": 67.56647441541347, + "learning_rate": 6.570810426396667e-06, + "loss": 3.2304, + "step": 16138 + }, + { + "epoch": 1.3755220318759056, + "grad_norm": 43.99729269270976, + "learning_rate": 6.570339675939777e-06, + "loss": 2.7088, + "step": 16139 + }, + { + "epoch": 1.375607261569931, + "grad_norm": 31.95467349248192, + "learning_rate": 6.569868910039219e-06, + "loss": 2.8129, + "step": 16140 + }, + { + "epoch": 1.3756924912639563, + "grad_norm": 70.83781310620189, + "learning_rate": 6.569398128699626e-06, + "loss": 3.0148, + "step": 16141 + }, + { + "epoch": 1.3757777209579818, + "grad_norm": 112.97986062636633, + "learning_rate": 6.568927331925623e-06, + "loss": 4.1859, + "step": 16142 + }, + { + "epoch": 1.375862950652007, + "grad_norm": 113.41521273401365, + "learning_rate": 6.568456519721845e-06, + "loss": 3.3325, + "step": 16143 + }, + { + "epoch": 1.3759481803460325, + "grad_norm": 44.69146160195834, + "learning_rate": 6.567985692092919e-06, + "loss": 2.7938, + "step": 16144 + }, + { + "epoch": 1.376033410040058, + "grad_norm": 32.69651571768265, + "learning_rate": 6.567514849043479e-06, + "loss": 2.977, + "step": 16145 + }, + { + "epoch": 1.3761186397340834, + "grad_norm": 32.870070895779826, + "learning_rate": 6.567043990578152e-06, + "loss": 2.2951, + "step": 16146 + }, + { + "epoch": 1.3762038694281087, + "grad_norm": 34.92044866682338, + "learning_rate": 6.56657311670157e-06, + "loss": 2.9377, + "step": 16147 + }, + { + "epoch": 1.3762890991221342, + "grad_norm": 60.64654030176592, + "learning_rate": 6.566102227418364e-06, + "loss": 3.6144, + "step": 16148 + }, + { + "epoch": 1.3763743288161596, + "grad_norm": 47.28957374042004, + "learning_rate": 6.565631322733166e-06, + "loss": 3.15, + "step": 16149 + }, + { + "epoch": 1.3764595585101849, + "grad_norm": 37.8855499078556, + "learning_rate": 6.565160402650605e-06, + "loss": 3.1182, + "step": 16150 + }, + { + "epoch": 1.3765447882042103, + "grad_norm": 54.35077317395909, + "learning_rate": 6.564689467175312e-06, + "loss": 3.0206, + "step": 16151 + }, + { + "epoch": 1.3766300178982358, + "grad_norm": 59.098076109643195, + "learning_rate": 6.564218516311922e-06, + "loss": 2.808, + "step": 16152 + }, + { + "epoch": 1.3767152475922613, + "grad_norm": 41.09103305937521, + "learning_rate": 6.563747550065063e-06, + "loss": 3.0142, + "step": 16153 + }, + { + "epoch": 1.3768004772862865, + "grad_norm": 28.94183807128237, + "learning_rate": 6.56327656843937e-06, + "loss": 1.9176, + "step": 16154 + }, + { + "epoch": 1.376885706980312, + "grad_norm": 38.49730893885322, + "learning_rate": 6.562805571439471e-06, + "loss": 2.8158, + "step": 16155 + }, + { + "epoch": 1.3769709366743372, + "grad_norm": 83.54727968314886, + "learning_rate": 6.562334559070002e-06, + "loss": 3.54, + "step": 16156 + }, + { + "epoch": 1.3770561663683627, + "grad_norm": 50.75856657311974, + "learning_rate": 6.5618635313355925e-06, + "loss": 2.4558, + "step": 16157 + }, + { + "epoch": 1.3771413960623882, + "grad_norm": 108.17094334732766, + "learning_rate": 6.561392488240875e-06, + "loss": 3.2676, + "step": 16158 + }, + { + "epoch": 1.3772266257564136, + "grad_norm": 49.62730077851543, + "learning_rate": 6.560921429790483e-06, + "loss": 2.5532, + "step": 16159 + }, + { + "epoch": 1.3773118554504389, + "grad_norm": 23.644234816418425, + "learning_rate": 6.560450355989052e-06, + "loss": 2.0595, + "step": 16160 + }, + { + "epoch": 1.3773970851444644, + "grad_norm": 53.59851200763357, + "learning_rate": 6.559979266841209e-06, + "loss": 2.1578, + "step": 16161 + }, + { + "epoch": 1.3774823148384896, + "grad_norm": 47.66804861212253, + "learning_rate": 6.55950816235159e-06, + "loss": 2.1733, + "step": 16162 + }, + { + "epoch": 1.377567544532515, + "grad_norm": 48.64327418684325, + "learning_rate": 6.559037042524827e-06, + "loss": 3.0619, + "step": 16163 + }, + { + "epoch": 1.3776527742265405, + "grad_norm": 45.067293327320456, + "learning_rate": 6.558565907365557e-06, + "loss": 2.5048, + "step": 16164 + }, + { + "epoch": 1.377738003920566, + "grad_norm": 40.531587254073706, + "learning_rate": 6.558094756878408e-06, + "loss": 3.9648, + "step": 16165 + }, + { + "epoch": 1.3778232336145912, + "grad_norm": 55.809309559116585, + "learning_rate": 6.557623591068018e-06, + "loss": 2.9407, + "step": 16166 + }, + { + "epoch": 1.3779084633086167, + "grad_norm": 39.437908913683216, + "learning_rate": 6.557152409939018e-06, + "loss": 2.1833, + "step": 16167 + }, + { + "epoch": 1.3779936930026422, + "grad_norm": 68.47936713657279, + "learning_rate": 6.5566812134960434e-06, + "loss": 3.2662, + "step": 16168 + }, + { + "epoch": 1.3780789226966674, + "grad_norm": 73.0884756265775, + "learning_rate": 6.556210001743728e-06, + "loss": 3.1851, + "step": 16169 + }, + { + "epoch": 1.378164152390693, + "grad_norm": 62.84531623511135, + "learning_rate": 6.555738774686703e-06, + "loss": 2.2543, + "step": 16170 + }, + { + "epoch": 1.3782493820847184, + "grad_norm": 22.283335319092043, + "learning_rate": 6.555267532329608e-06, + "loss": 2.9473, + "step": 16171 + }, + { + "epoch": 1.3783346117787438, + "grad_norm": 40.57825882255234, + "learning_rate": 6.554796274677073e-06, + "loss": 2.9938, + "step": 16172 + }, + { + "epoch": 1.378419841472769, + "grad_norm": 115.55431575510143, + "learning_rate": 6.554325001733737e-06, + "loss": 3.919, + "step": 16173 + }, + { + "epoch": 1.3785050711667945, + "grad_norm": 73.94593651931282, + "learning_rate": 6.553853713504229e-06, + "loss": 2.4308, + "step": 16174 + }, + { + "epoch": 1.3785903008608198, + "grad_norm": 43.75784745463168, + "learning_rate": 6.55338240999319e-06, + "loss": 2.8101, + "step": 16175 + }, + { + "epoch": 1.3786755305548453, + "grad_norm": 36.85868440929556, + "learning_rate": 6.5529110912052506e-06, + "loss": 2.3375, + "step": 16176 + }, + { + "epoch": 1.3787607602488707, + "grad_norm": 33.3913495703801, + "learning_rate": 6.5524397571450496e-06, + "loss": 1.8688, + "step": 16177 + }, + { + "epoch": 1.3788459899428962, + "grad_norm": 34.93368905482238, + "learning_rate": 6.551968407817218e-06, + "loss": 3.2974, + "step": 16178 + }, + { + "epoch": 1.3789312196369214, + "grad_norm": 32.27801417706256, + "learning_rate": 6.551497043226394e-06, + "loss": 2.8313, + "step": 16179 + }, + { + "epoch": 1.379016449330947, + "grad_norm": 37.656764206120165, + "learning_rate": 6.551025663377215e-06, + "loss": 3.2403, + "step": 16180 + }, + { + "epoch": 1.3791016790249724, + "grad_norm": 93.73389224375651, + "learning_rate": 6.550554268274314e-06, + "loss": 2.9292, + "step": 16181 + }, + { + "epoch": 1.3791869087189976, + "grad_norm": 50.29691307325749, + "learning_rate": 6.5500828579223276e-06, + "loss": 2.3938, + "step": 16182 + }, + { + "epoch": 1.379272138413023, + "grad_norm": 69.09078893869372, + "learning_rate": 6.549611432325892e-06, + "loss": 3.1805, + "step": 16183 + }, + { + "epoch": 1.3793573681070486, + "grad_norm": 58.32637711953988, + "learning_rate": 6.549139991489645e-06, + "loss": 2.9286, + "step": 16184 + }, + { + "epoch": 1.3794425978010738, + "grad_norm": 70.23810709569244, + "learning_rate": 6.548668535418221e-06, + "loss": 3.3851, + "step": 16185 + }, + { + "epoch": 1.3795278274950993, + "grad_norm": 39.19620319461923, + "learning_rate": 6.548197064116256e-06, + "loss": 2.9072, + "step": 16186 + }, + { + "epoch": 1.3796130571891247, + "grad_norm": 56.823638823243556, + "learning_rate": 6.54772557758839e-06, + "loss": 2.2382, + "step": 16187 + }, + { + "epoch": 1.37969828688315, + "grad_norm": 106.36320795217438, + "learning_rate": 6.5472540758392575e-06, + "loss": 2.7731, + "step": 16188 + }, + { + "epoch": 1.3797835165771755, + "grad_norm": 82.55870046261306, + "learning_rate": 6.5467825588734945e-06, + "loss": 3.0411, + "step": 16189 + }, + { + "epoch": 1.379868746271201, + "grad_norm": 58.05813409173165, + "learning_rate": 6.5463110266957405e-06, + "loss": 2.8417, + "step": 16190 + }, + { + "epoch": 1.3799539759652264, + "grad_norm": 40.65829945445075, + "learning_rate": 6.545839479310633e-06, + "loss": 2.8589, + "step": 16191 + }, + { + "epoch": 1.3800392056592516, + "grad_norm": 35.195422646587815, + "learning_rate": 6.545367916722808e-06, + "loss": 2.7678, + "step": 16192 + }, + { + "epoch": 1.3801244353532771, + "grad_norm": 28.874924885616373, + "learning_rate": 6.544896338936904e-06, + "loss": 2.3994, + "step": 16193 + }, + { + "epoch": 1.3802096650473024, + "grad_norm": 66.95601960962415, + "learning_rate": 6.544424745957557e-06, + "loss": 3.2853, + "step": 16194 + }, + { + "epoch": 1.3802948947413278, + "grad_norm": 68.95391616747189, + "learning_rate": 6.543953137789407e-06, + "loss": 2.756, + "step": 16195 + }, + { + "epoch": 1.3803801244353533, + "grad_norm": 46.1882143846708, + "learning_rate": 6.5434815144370925e-06, + "loss": 2.4201, + "step": 16196 + }, + { + "epoch": 1.3804653541293788, + "grad_norm": 22.24910694292952, + "learning_rate": 6.543009875905249e-06, + "loss": 1.9519, + "step": 16197 + }, + { + "epoch": 1.380550583823404, + "grad_norm": 41.28030179435977, + "learning_rate": 6.542538222198517e-06, + "loss": 3.5096, + "step": 16198 + }, + { + "epoch": 1.3806358135174295, + "grad_norm": 49.35732057786169, + "learning_rate": 6.542066553321535e-06, + "loss": 2.6258, + "step": 16199 + }, + { + "epoch": 1.380721043211455, + "grad_norm": 97.65415657272573, + "learning_rate": 6.541594869278941e-06, + "loss": 2.0663, + "step": 16200 + }, + { + "epoch": 1.3808062729054802, + "grad_norm": 29.115339356759247, + "learning_rate": 6.541123170075374e-06, + "loss": 2.1863, + "step": 16201 + }, + { + "epoch": 1.3808915025995057, + "grad_norm": 61.65337196549751, + "learning_rate": 6.540651455715472e-06, + "loss": 2.6726, + "step": 16202 + }, + { + "epoch": 1.3809767322935311, + "grad_norm": 40.397432076831045, + "learning_rate": 6.540179726203877e-06, + "loss": 3.4898, + "step": 16203 + }, + { + "epoch": 1.3810619619875566, + "grad_norm": 97.80277695831467, + "learning_rate": 6.539707981545224e-06, + "loss": 3.0684, + "step": 16204 + }, + { + "epoch": 1.3811471916815818, + "grad_norm": 49.79678649620575, + "learning_rate": 6.539236221744157e-06, + "loss": 3.1146, + "step": 16205 + }, + { + "epoch": 1.3812324213756073, + "grad_norm": 32.602484049711514, + "learning_rate": 6.538764446805312e-06, + "loss": 2.5048, + "step": 16206 + }, + { + "epoch": 1.3813176510696326, + "grad_norm": 47.999090268666286, + "learning_rate": 6.538292656733332e-06, + "loss": 3.2083, + "step": 16207 + }, + { + "epoch": 1.381402880763658, + "grad_norm": 30.650431198469242, + "learning_rate": 6.537820851532853e-06, + "loss": 2.9251, + "step": 16208 + }, + { + "epoch": 1.3814881104576835, + "grad_norm": 121.74859441440705, + "learning_rate": 6.537349031208517e-06, + "loss": 3.308, + "step": 16209 + }, + { + "epoch": 1.381573340151709, + "grad_norm": 70.54134078008586, + "learning_rate": 6.536877195764966e-06, + "loss": 2.8826, + "step": 16210 + }, + { + "epoch": 1.3816585698457342, + "grad_norm": 44.92394574577706, + "learning_rate": 6.536405345206838e-06, + "loss": 3.2576, + "step": 16211 + }, + { + "epoch": 1.3817437995397597, + "grad_norm": 38.874592371628765, + "learning_rate": 6.535933479538774e-06, + "loss": 3.0247, + "step": 16212 + }, + { + "epoch": 1.381829029233785, + "grad_norm": 61.91573565680185, + "learning_rate": 6.535461598765412e-06, + "loss": 3.3368, + "step": 16213 + }, + { + "epoch": 1.3819142589278104, + "grad_norm": 101.90652207637581, + "learning_rate": 6.534989702891399e-06, + "loss": 3.5103, + "step": 16214 + }, + { + "epoch": 1.3819994886218359, + "grad_norm": 36.13482562990627, + "learning_rate": 6.534517791921369e-06, + "loss": 2.827, + "step": 16215 + }, + { + "epoch": 1.3820847183158613, + "grad_norm": 44.70059961754622, + "learning_rate": 6.534045865859968e-06, + "loss": 2.8019, + "step": 16216 + }, + { + "epoch": 1.3821699480098866, + "grad_norm": 19.179841831743897, + "learning_rate": 6.533573924711835e-06, + "loss": 1.6141, + "step": 16217 + }, + { + "epoch": 1.382255177703912, + "grad_norm": 44.76456907049273, + "learning_rate": 6.5331019684816125e-06, + "loss": 2.4784, + "step": 16218 + }, + { + "epoch": 1.3823404073979375, + "grad_norm": 58.91464884492131, + "learning_rate": 6.53262999717394e-06, + "loss": 3.694, + "step": 16219 + }, + { + "epoch": 1.3824256370919628, + "grad_norm": 64.05996716517208, + "learning_rate": 6.532158010793462e-06, + "loss": 3.801, + "step": 16220 + }, + { + "epoch": 1.3825108667859882, + "grad_norm": 141.48884697900243, + "learning_rate": 6.531686009344818e-06, + "loss": 4.3609, + "step": 16221 + }, + { + "epoch": 1.3825960964800137, + "grad_norm": 38.987454992527695, + "learning_rate": 6.5312139928326505e-06, + "loss": 2.5347, + "step": 16222 + }, + { + "epoch": 1.3826813261740392, + "grad_norm": 270.0445064212132, + "learning_rate": 6.530741961261602e-06, + "loss": 3.6134, + "step": 16223 + }, + { + "epoch": 1.3827665558680644, + "grad_norm": 37.04278521634789, + "learning_rate": 6.530269914636316e-06, + "loss": 4.3424, + "step": 16224 + }, + { + "epoch": 1.3828517855620899, + "grad_norm": 35.99511219133433, + "learning_rate": 6.5297978529614315e-06, + "loss": 2.8214, + "step": 16225 + }, + { + "epoch": 1.3829370152561151, + "grad_norm": 45.11762701738709, + "learning_rate": 6.529325776241593e-06, + "loss": 2.4938, + "step": 16226 + }, + { + "epoch": 1.3830222449501406, + "grad_norm": 51.647260245464, + "learning_rate": 6.5288536844814455e-06, + "loss": 3.1196, + "step": 16227 + }, + { + "epoch": 1.383107474644166, + "grad_norm": 64.72509320578808, + "learning_rate": 6.528381577685627e-06, + "loss": 2.9099, + "step": 16228 + }, + { + "epoch": 1.3831927043381915, + "grad_norm": 30.998498795804824, + "learning_rate": 6.527909455858783e-06, + "loss": 2.9605, + "step": 16229 + }, + { + "epoch": 1.3832779340322168, + "grad_norm": 55.66086337984206, + "learning_rate": 6.527437319005558e-06, + "loss": 3.3698, + "step": 16230 + }, + { + "epoch": 1.3833631637262422, + "grad_norm": 89.14907078076003, + "learning_rate": 6.526965167130595e-06, + "loss": 4.0192, + "step": 16231 + }, + { + "epoch": 1.3834483934202675, + "grad_norm": 57.33563561659886, + "learning_rate": 6.526493000238534e-06, + "loss": 2.3412, + "step": 16232 + }, + { + "epoch": 1.383533623114293, + "grad_norm": 37.02688435304257, + "learning_rate": 6.526020818334021e-06, + "loss": 1.6776, + "step": 16233 + }, + { + "epoch": 1.3836188528083184, + "grad_norm": 39.293732736302765, + "learning_rate": 6.5255486214217e-06, + "loss": 2.9529, + "step": 16234 + }, + { + "epoch": 1.383704082502344, + "grad_norm": 86.42423999018614, + "learning_rate": 6.5250764095062156e-06, + "loss": 2.5697, + "step": 16235 + }, + { + "epoch": 1.3837893121963691, + "grad_norm": 86.69820112294266, + "learning_rate": 6.524604182592209e-06, + "loss": 3.6938, + "step": 16236 + }, + { + "epoch": 1.3838745418903946, + "grad_norm": 35.704711161724845, + "learning_rate": 6.524131940684327e-06, + "loss": 2.7836, + "step": 16237 + }, + { + "epoch": 1.38395977158442, + "grad_norm": 120.34886490840157, + "learning_rate": 6.523659683787213e-06, + "loss": 3.8949, + "step": 16238 + }, + { + "epoch": 1.3840450012784453, + "grad_norm": 89.47937818950574, + "learning_rate": 6.523187411905512e-06, + "loss": 3.0322, + "step": 16239 + }, + { + "epoch": 1.3841302309724708, + "grad_norm": 34.692588350266995, + "learning_rate": 6.522715125043868e-06, + "loss": 2.8319, + "step": 16240 + }, + { + "epoch": 1.3842154606664963, + "grad_norm": 29.029245872890936, + "learning_rate": 6.522242823206924e-06, + "loss": 1.8654, + "step": 16241 + }, + { + "epoch": 1.3843006903605217, + "grad_norm": 56.29134508115891, + "learning_rate": 6.52177050639933e-06, + "loss": 4.0011, + "step": 16242 + }, + { + "epoch": 1.384385920054547, + "grad_norm": 43.067519900194036, + "learning_rate": 6.521298174625725e-06, + "loss": 3.1306, + "step": 16243 + }, + { + "epoch": 1.3844711497485724, + "grad_norm": 72.53759054492113, + "learning_rate": 6.520825827890758e-06, + "loss": 2.5645, + "step": 16244 + }, + { + "epoch": 1.3845563794425977, + "grad_norm": 41.99188895522721, + "learning_rate": 6.520353466199072e-06, + "loss": 2.9818, + "step": 16245 + }, + { + "epoch": 1.3846416091366232, + "grad_norm": 121.03296154500678, + "learning_rate": 6.519881089555315e-06, + "loss": 2.9139, + "step": 16246 + }, + { + "epoch": 1.3847268388306486, + "grad_norm": 39.52600862549644, + "learning_rate": 6.519408697964131e-06, + "loss": 3.0129, + "step": 16247 + }, + { + "epoch": 1.384812068524674, + "grad_norm": 43.85679834355727, + "learning_rate": 6.5189362914301665e-06, + "loss": 2.999, + "step": 16248 + }, + { + "epoch": 1.3848972982186993, + "grad_norm": 47.15627748982903, + "learning_rate": 6.518463869958067e-06, + "loss": 3.2883, + "step": 16249 + }, + { + "epoch": 1.3849825279127248, + "grad_norm": 51.31282424814132, + "learning_rate": 6.51799143355248e-06, + "loss": 2.8582, + "step": 16250 + }, + { + "epoch": 1.3850677576067503, + "grad_norm": 80.81194257240534, + "learning_rate": 6.517518982218049e-06, + "loss": 2.0907, + "step": 16251 + }, + { + "epoch": 1.3851529873007755, + "grad_norm": 66.53615948957217, + "learning_rate": 6.517046515959422e-06, + "loss": 2.5467, + "step": 16252 + }, + { + "epoch": 1.385238216994801, + "grad_norm": 27.462539096883344, + "learning_rate": 6.516574034781245e-06, + "loss": 2.1609, + "step": 16253 + }, + { + "epoch": 1.3853234466888265, + "grad_norm": 40.541045288750965, + "learning_rate": 6.516101538688166e-06, + "loss": 1.172, + "step": 16254 + }, + { + "epoch": 1.385408676382852, + "grad_norm": 72.60292128673031, + "learning_rate": 6.515629027684831e-06, + "loss": 2.8067, + "step": 16255 + }, + { + "epoch": 1.3854939060768772, + "grad_norm": 66.65348954146917, + "learning_rate": 6.515156501775884e-06, + "loss": 2.68, + "step": 16256 + }, + { + "epoch": 1.3855791357709026, + "grad_norm": 124.56056658997812, + "learning_rate": 6.514683960965978e-06, + "loss": 4.4232, + "step": 16257 + }, + { + "epoch": 1.3856643654649279, + "grad_norm": 33.84641870566141, + "learning_rate": 6.514211405259756e-06, + "loss": 2.565, + "step": 16258 + }, + { + "epoch": 1.3857495951589534, + "grad_norm": 70.40593723476776, + "learning_rate": 6.5137388346618665e-06, + "loss": 2.57, + "step": 16259 + }, + { + "epoch": 1.3858348248529788, + "grad_norm": 47.33810018061455, + "learning_rate": 6.513266249176956e-06, + "loss": 3.2809, + "step": 16260 + }, + { + "epoch": 1.3859200545470043, + "grad_norm": 49.97935340426471, + "learning_rate": 6.512793648809676e-06, + "loss": 3.4441, + "step": 16261 + }, + { + "epoch": 1.3860052842410295, + "grad_norm": 98.10338760719222, + "learning_rate": 6.512321033564669e-06, + "loss": 2.9816, + "step": 16262 + }, + { + "epoch": 1.386090513935055, + "grad_norm": 35.228374763741606, + "learning_rate": 6.511848403446587e-06, + "loss": 2.9889, + "step": 16263 + }, + { + "epoch": 1.3861757436290802, + "grad_norm": 28.098206817515283, + "learning_rate": 6.511375758460075e-06, + "loss": 2.2224, + "step": 16264 + }, + { + "epoch": 1.3862609733231057, + "grad_norm": 66.39512230437941, + "learning_rate": 6.510903098609784e-06, + "loss": 3.3072, + "step": 16265 + }, + { + "epoch": 1.3863462030171312, + "grad_norm": 92.67663538677347, + "learning_rate": 6.5104304239003625e-06, + "loss": 3.9078, + "step": 16266 + }, + { + "epoch": 1.3864314327111567, + "grad_norm": 44.97361298329144, + "learning_rate": 6.509957734336457e-06, + "loss": 3.3125, + "step": 16267 + }, + { + "epoch": 1.386516662405182, + "grad_norm": 38.60894414596093, + "learning_rate": 6.509485029922716e-06, + "loss": 2.9098, + "step": 16268 + }, + { + "epoch": 1.3866018920992074, + "grad_norm": 44.59223405349591, + "learning_rate": 6.509012310663792e-06, + "loss": 2.442, + "step": 16269 + }, + { + "epoch": 1.3866871217932328, + "grad_norm": 74.52551599976128, + "learning_rate": 6.5085395765643315e-06, + "loss": 2.436, + "step": 16270 + }, + { + "epoch": 1.386772351487258, + "grad_norm": 120.40734026887604, + "learning_rate": 6.508066827628982e-06, + "loss": 3.6552, + "step": 16271 + }, + { + "epoch": 1.3868575811812835, + "grad_norm": 57.34151807783266, + "learning_rate": 6.507594063862395e-06, + "loss": 2.6516, + "step": 16272 + }, + { + "epoch": 1.386942810875309, + "grad_norm": 42.90349516281836, + "learning_rate": 6.5071212852692175e-06, + "loss": 3.0828, + "step": 16273 + }, + { + "epoch": 1.3870280405693345, + "grad_norm": 40.92464970865654, + "learning_rate": 6.506648491854104e-06, + "loss": 3.7962, + "step": 16274 + }, + { + "epoch": 1.3871132702633597, + "grad_norm": 23.036948458931647, + "learning_rate": 6.506175683621701e-06, + "loss": 1.7215, + "step": 16275 + }, + { + "epoch": 1.3871984999573852, + "grad_norm": 86.98146712910138, + "learning_rate": 6.505702860576657e-06, + "loss": 3.765, + "step": 16276 + }, + { + "epoch": 1.3872837296514104, + "grad_norm": 44.45130184775772, + "learning_rate": 6.505230022723624e-06, + "loss": 2.5855, + "step": 16277 + }, + { + "epoch": 1.387368959345436, + "grad_norm": 39.617662252884365, + "learning_rate": 6.504757170067253e-06, + "loss": 2.929, + "step": 16278 + }, + { + "epoch": 1.3874541890394614, + "grad_norm": 17.045236726172615, + "learning_rate": 6.504284302612193e-06, + "loss": 1.0338, + "step": 16279 + }, + { + "epoch": 1.3875394187334869, + "grad_norm": 45.44480381480185, + "learning_rate": 6.503811420363093e-06, + "loss": 2.5248, + "step": 16280 + }, + { + "epoch": 1.387624648427512, + "grad_norm": 43.29176257077944, + "learning_rate": 6.503338523324607e-06, + "loss": 2.6563, + "step": 16281 + }, + { + "epoch": 1.3877098781215376, + "grad_norm": 44.156154232253165, + "learning_rate": 6.502865611501383e-06, + "loss": 1.8616, + "step": 16282 + }, + { + "epoch": 1.3877951078155628, + "grad_norm": 34.45486714989008, + "learning_rate": 6.502392684898073e-06, + "loss": 3.0627, + "step": 16283 + }, + { + "epoch": 1.3878803375095883, + "grad_norm": 61.32734897642358, + "learning_rate": 6.501919743519329e-06, + "loss": 2.6956, + "step": 16284 + }, + { + "epoch": 1.3879655672036137, + "grad_norm": 65.83274263297228, + "learning_rate": 6.501446787369801e-06, + "loss": 3.6005, + "step": 16285 + }, + { + "epoch": 1.3880507968976392, + "grad_norm": 45.35249331813566, + "learning_rate": 6.500973816454139e-06, + "loss": 2.7217, + "step": 16286 + }, + { + "epoch": 1.3881360265916645, + "grad_norm": 46.89785511171488, + "learning_rate": 6.5005008307769966e-06, + "loss": 1.4554, + "step": 16287 + }, + { + "epoch": 1.38822125628569, + "grad_norm": 80.8801331430845, + "learning_rate": 6.5000278303430254e-06, + "loss": 3.4889, + "step": 16288 + }, + { + "epoch": 1.3883064859797154, + "grad_norm": 48.526578752570884, + "learning_rate": 6.499554815156877e-06, + "loss": 4.3888, + "step": 16289 + }, + { + "epoch": 1.3883917156737406, + "grad_norm": 133.51162874059327, + "learning_rate": 6.499081785223201e-06, + "loss": 4.089, + "step": 16290 + }, + { + "epoch": 1.3884769453677661, + "grad_norm": 81.58118403906789, + "learning_rate": 6.498608740546652e-06, + "loss": 3.4509, + "step": 16291 + }, + { + "epoch": 1.3885621750617916, + "grad_norm": 119.01051704246494, + "learning_rate": 6.498135681131882e-06, + "loss": 3.6883, + "step": 16292 + }, + { + "epoch": 1.388647404755817, + "grad_norm": 75.32538580749247, + "learning_rate": 6.497662606983544e-06, + "loss": 2.5953, + "step": 16293 + }, + { + "epoch": 1.3887326344498423, + "grad_norm": 62.42637278771574, + "learning_rate": 6.497189518106287e-06, + "loss": 2.6755, + "step": 16294 + }, + { + "epoch": 1.3888178641438678, + "grad_norm": 68.60541842447383, + "learning_rate": 6.496716414504767e-06, + "loss": 3.4176, + "step": 16295 + }, + { + "epoch": 1.388903093837893, + "grad_norm": 47.426207465663396, + "learning_rate": 6.496243296183635e-06, + "loss": 3.5211, + "step": 16296 + }, + { + "epoch": 1.3889883235319185, + "grad_norm": 37.73086579277447, + "learning_rate": 6.495770163147546e-06, + "loss": 2.2213, + "step": 16297 + }, + { + "epoch": 1.389073553225944, + "grad_norm": 61.31296277775801, + "learning_rate": 6.495297015401153e-06, + "loss": 2.9136, + "step": 16298 + }, + { + "epoch": 1.3891587829199694, + "grad_norm": 34.13057176779795, + "learning_rate": 6.494823852949104e-06, + "loss": 2.6166, + "step": 16299 + }, + { + "epoch": 1.3892440126139947, + "grad_norm": 54.29694626545112, + "learning_rate": 6.494350675796061e-06, + "loss": 3.806, + "step": 16300 + }, + { + "epoch": 1.3893292423080201, + "grad_norm": 30.641952291594293, + "learning_rate": 6.49387748394667e-06, + "loss": 2.3072, + "step": 16301 + }, + { + "epoch": 1.3894144720020454, + "grad_norm": 53.85933437535618, + "learning_rate": 6.493404277405589e-06, + "loss": 3.1229, + "step": 16302 + }, + { + "epoch": 1.3894997016960708, + "grad_norm": 86.33684135317793, + "learning_rate": 6.492931056177468e-06, + "loss": 3.1797, + "step": 16303 + }, + { + "epoch": 1.3895849313900963, + "grad_norm": 37.71281315404606, + "learning_rate": 6.492457820266965e-06, + "loss": 3.4197, + "step": 16304 + }, + { + "epoch": 1.3896701610841218, + "grad_norm": 45.457912872969025, + "learning_rate": 6.4919845696787335e-06, + "loss": 3.2036, + "step": 16305 + }, + { + "epoch": 1.389755390778147, + "grad_norm": 35.982944549962895, + "learning_rate": 6.4915113044174246e-06, + "loss": 2.8828, + "step": 16306 + }, + { + "epoch": 1.3898406204721725, + "grad_norm": 108.43259032032502, + "learning_rate": 6.4910380244876945e-06, + "loss": 3.0887, + "step": 16307 + }, + { + "epoch": 1.389925850166198, + "grad_norm": 121.13079895784384, + "learning_rate": 6.4905647298941996e-06, + "loss": 3.7659, + "step": 16308 + }, + { + "epoch": 1.3900110798602232, + "grad_norm": 67.08437452373315, + "learning_rate": 6.4900914206415924e-06, + "loss": 3.5498, + "step": 16309 + }, + { + "epoch": 1.3900963095542487, + "grad_norm": 42.84340982323203, + "learning_rate": 6.489618096734529e-06, + "loss": 3.0554, + "step": 16310 + }, + { + "epoch": 1.3901815392482741, + "grad_norm": 52.92165415244748, + "learning_rate": 6.489144758177661e-06, + "loss": 3.1881, + "step": 16311 + }, + { + "epoch": 1.3902667689422996, + "grad_norm": 40.36953959926189, + "learning_rate": 6.488671404975648e-06, + "loss": 3.386, + "step": 16312 + }, + { + "epoch": 1.3903519986363249, + "grad_norm": 40.916256570400144, + "learning_rate": 6.488198037133143e-06, + "loss": 3.0237, + "step": 16313 + }, + { + "epoch": 1.3904372283303503, + "grad_norm": 64.96001574691657, + "learning_rate": 6.487724654654801e-06, + "loss": 3.3288, + "step": 16314 + }, + { + "epoch": 1.3905224580243756, + "grad_norm": 50.641006287910066, + "learning_rate": 6.487251257545279e-06, + "loss": 4.1006, + "step": 16315 + }, + { + "epoch": 1.390607687718401, + "grad_norm": 71.41412500491073, + "learning_rate": 6.486777845809231e-06, + "loss": 4.093, + "step": 16316 + }, + { + "epoch": 1.3906929174124265, + "grad_norm": 62.945954593185256, + "learning_rate": 6.4863044194513145e-06, + "loss": 3.2491, + "step": 16317 + }, + { + "epoch": 1.390778147106452, + "grad_norm": 31.545374403447084, + "learning_rate": 6.485830978476184e-06, + "loss": 2.5565, + "step": 16318 + }, + { + "epoch": 1.3908633768004772, + "grad_norm": 49.64317921335858, + "learning_rate": 6.4853575228884965e-06, + "loss": 2.3613, + "step": 16319 + }, + { + "epoch": 1.3909486064945027, + "grad_norm": 84.42230166591224, + "learning_rate": 6.484884052692908e-06, + "loss": 4.2599, + "step": 16320 + }, + { + "epoch": 1.3910338361885282, + "grad_norm": 40.72317929895575, + "learning_rate": 6.484410567894076e-06, + "loss": 2.7666, + "step": 16321 + }, + { + "epoch": 1.3911190658825534, + "grad_norm": 63.502987391178834, + "learning_rate": 6.4839370684966545e-06, + "loss": 2.504, + "step": 16322 + }, + { + "epoch": 1.3912042955765789, + "grad_norm": 51.13684969873187, + "learning_rate": 6.483463554505301e-06, + "loss": 3.3191, + "step": 16323 + }, + { + "epoch": 1.3912895252706043, + "grad_norm": 41.534937699213266, + "learning_rate": 6.482990025924674e-06, + "loss": 3.2389, + "step": 16324 + }, + { + "epoch": 1.3913747549646298, + "grad_norm": 36.988078815529235, + "learning_rate": 6.48251648275943e-06, + "loss": 2.7365, + "step": 16325 + }, + { + "epoch": 1.391459984658655, + "grad_norm": 45.42823017606665, + "learning_rate": 6.482042925014225e-06, + "loss": 2.4316, + "step": 16326 + }, + { + "epoch": 1.3915452143526805, + "grad_norm": 22.948460535237633, + "learning_rate": 6.4815693526937154e-06, + "loss": 1.7768, + "step": 16327 + }, + { + "epoch": 1.3916304440467058, + "grad_norm": 87.60021361658941, + "learning_rate": 6.481095765802562e-06, + "loss": 2.8766, + "step": 16328 + }, + { + "epoch": 1.3917156737407312, + "grad_norm": 42.8907875412111, + "learning_rate": 6.480622164345418e-06, + "loss": 2.8271, + "step": 16329 + }, + { + "epoch": 1.3918009034347567, + "grad_norm": 65.92295864022262, + "learning_rate": 6.480148548326946e-06, + "loss": 3.635, + "step": 16330 + }, + { + "epoch": 1.3918861331287822, + "grad_norm": 51.53020825990587, + "learning_rate": 6.479674917751798e-06, + "loss": 2.8511, + "step": 16331 + }, + { + "epoch": 1.3919713628228074, + "grad_norm": 50.268369725255106, + "learning_rate": 6.479201272624638e-06, + "loss": 2.5748, + "step": 16332 + }, + { + "epoch": 1.392056592516833, + "grad_norm": 27.99187665512261, + "learning_rate": 6.4787276129501195e-06, + "loss": 2.09, + "step": 16333 + }, + { + "epoch": 1.3921418222108581, + "grad_norm": 49.11993918848457, + "learning_rate": 6.478253938732904e-06, + "loss": 3.2511, + "step": 16334 + }, + { + "epoch": 1.3922270519048836, + "grad_norm": 103.91245765177385, + "learning_rate": 6.477780249977646e-06, + "loss": 3.752, + "step": 16335 + }, + { + "epoch": 1.392312281598909, + "grad_norm": 25.234101235763667, + "learning_rate": 6.477306546689009e-06, + "loss": 2.276, + "step": 16336 + }, + { + "epoch": 1.3923975112929345, + "grad_norm": 41.9853145034523, + "learning_rate": 6.476832828871649e-06, + "loss": 2.7096, + "step": 16337 + }, + { + "epoch": 1.3924827409869598, + "grad_norm": 80.53831639802078, + "learning_rate": 6.476359096530223e-06, + "loss": 2.6869, + "step": 16338 + }, + { + "epoch": 1.3925679706809853, + "grad_norm": 41.20285435293065, + "learning_rate": 6.475885349669393e-06, + "loss": 3.251, + "step": 16339 + }, + { + "epoch": 1.3926532003750107, + "grad_norm": 60.58227477929928, + "learning_rate": 6.475411588293819e-06, + "loss": 2.7748, + "step": 16340 + }, + { + "epoch": 1.392738430069036, + "grad_norm": 29.659439233145278, + "learning_rate": 6.4749378124081555e-06, + "loss": 2.2741, + "step": 16341 + }, + { + "epoch": 1.3928236597630614, + "grad_norm": 53.73735701132193, + "learning_rate": 6.474464022017065e-06, + "loss": 2.0943, + "step": 16342 + }, + { + "epoch": 1.392908889457087, + "grad_norm": 25.481440410827467, + "learning_rate": 6.473990217125208e-06, + "loss": 1.4852, + "step": 16343 + }, + { + "epoch": 1.3929941191511124, + "grad_norm": 35.48460954333368, + "learning_rate": 6.473516397737241e-06, + "loss": 3.9142, + "step": 16344 + }, + { + "epoch": 1.3930793488451376, + "grad_norm": 118.08307969040207, + "learning_rate": 6.473042563857828e-06, + "loss": 3.8769, + "step": 16345 + }, + { + "epoch": 1.393164578539163, + "grad_norm": 40.02447794243982, + "learning_rate": 6.472568715491625e-06, + "loss": 3.6838, + "step": 16346 + }, + { + "epoch": 1.3932498082331883, + "grad_norm": 34.83360034609825, + "learning_rate": 6.472094852643296e-06, + "loss": 3.0874, + "step": 16347 + }, + { + "epoch": 1.3933350379272138, + "grad_norm": 52.15530209131733, + "learning_rate": 6.471620975317496e-06, + "loss": 2.6593, + "step": 16348 + }, + { + "epoch": 1.3934202676212393, + "grad_norm": 65.96368904013964, + "learning_rate": 6.471147083518892e-06, + "loss": 3.121, + "step": 16349 + }, + { + "epoch": 1.3935054973152647, + "grad_norm": 25.308406453067203, + "learning_rate": 6.470673177252138e-06, + "loss": 2.1803, + "step": 16350 + }, + { + "epoch": 1.39359072700929, + "grad_norm": 33.104345050093634, + "learning_rate": 6.4701992565218995e-06, + "loss": 2.07, + "step": 16351 + }, + { + "epoch": 1.3936759567033155, + "grad_norm": 42.94973744873004, + "learning_rate": 6.469725321332834e-06, + "loss": 2.779, + "step": 16352 + }, + { + "epoch": 1.3937611863973407, + "grad_norm": 64.41253603475512, + "learning_rate": 6.469251371689606e-06, + "loss": 2.2711, + "step": 16353 + }, + { + "epoch": 1.3938464160913662, + "grad_norm": 135.62250948030265, + "learning_rate": 6.4687774075968725e-06, + "loss": 4.6688, + "step": 16354 + }, + { + "epoch": 1.3939316457853916, + "grad_norm": 36.70534591729598, + "learning_rate": 6.468303429059298e-06, + "loss": 2.6829, + "step": 16355 + }, + { + "epoch": 1.394016875479417, + "grad_norm": 36.77542930973834, + "learning_rate": 6.467829436081542e-06, + "loss": 3.2278, + "step": 16356 + }, + { + "epoch": 1.3941021051734424, + "grad_norm": 35.10522413381999, + "learning_rate": 6.467355428668268e-06, + "loss": 2.5267, + "step": 16357 + }, + { + "epoch": 1.3941873348674678, + "grad_norm": 33.52708676084742, + "learning_rate": 6.466881406824135e-06, + "loss": 2.364, + "step": 16358 + }, + { + "epoch": 1.3942725645614933, + "grad_norm": 57.33975208162663, + "learning_rate": 6.466407370553807e-06, + "loss": 3.3933, + "step": 16359 + }, + { + "epoch": 1.3943577942555185, + "grad_norm": 43.405269955888116, + "learning_rate": 6.4659333198619454e-06, + "loss": 3.503, + "step": 16360 + }, + { + "epoch": 1.394443023949544, + "grad_norm": 66.69651997943791, + "learning_rate": 6.465459254753212e-06, + "loss": 3.8397, + "step": 16361 + }, + { + "epoch": 1.3945282536435695, + "grad_norm": 56.15513639056642, + "learning_rate": 6.4649851752322675e-06, + "loss": 3.3182, + "step": 16362 + }, + { + "epoch": 1.394613483337595, + "grad_norm": 57.50588741122221, + "learning_rate": 6.464511081303778e-06, + "loss": 3.054, + "step": 16363 + }, + { + "epoch": 1.3946987130316202, + "grad_norm": 50.97947201410711, + "learning_rate": 6.464036972972404e-06, + "loss": 2.7634, + "step": 16364 + }, + { + "epoch": 1.3947839427256457, + "grad_norm": 38.373196654326755, + "learning_rate": 6.463562850242807e-06, + "loss": 2.37, + "step": 16365 + }, + { + "epoch": 1.394869172419671, + "grad_norm": 90.34888178062876, + "learning_rate": 6.463088713119652e-06, + "loss": 3.1357, + "step": 16366 + }, + { + "epoch": 1.3949544021136964, + "grad_norm": 59.231522821474286, + "learning_rate": 6.4626145616076e-06, + "loss": 2.6137, + "step": 16367 + }, + { + "epoch": 1.3950396318077218, + "grad_norm": 52.85224291599377, + "learning_rate": 6.462140395711316e-06, + "loss": 3.0995, + "step": 16368 + }, + { + "epoch": 1.3951248615017473, + "grad_norm": 33.388996628442506, + "learning_rate": 6.461666215435461e-06, + "loss": 1.3339, + "step": 16369 + }, + { + "epoch": 1.3952100911957726, + "grad_norm": 94.36064824779838, + "learning_rate": 6.461192020784699e-06, + "loss": 4.0315, + "step": 16370 + }, + { + "epoch": 1.395295320889798, + "grad_norm": 33.457132318029764, + "learning_rate": 6.460717811763696e-06, + "loss": 2.7017, + "step": 16371 + }, + { + "epoch": 1.3953805505838235, + "grad_norm": 60.76567867166087, + "learning_rate": 6.460243588377113e-06, + "loss": 3.2879, + "step": 16372 + }, + { + "epoch": 1.3954657802778487, + "grad_norm": 52.77308398395807, + "learning_rate": 6.459769350629614e-06, + "loss": 2.5353, + "step": 16373 + }, + { + "epoch": 1.3955510099718742, + "grad_norm": 37.2697208980296, + "learning_rate": 6.459295098525864e-06, + "loss": 0.9458, + "step": 16374 + }, + { + "epoch": 1.3956362396658997, + "grad_norm": 53.427456020889004, + "learning_rate": 6.4588208320705274e-06, + "loss": 2.6797, + "step": 16375 + }, + { + "epoch": 1.395721469359925, + "grad_norm": 25.291609364719775, + "learning_rate": 6.458346551268265e-06, + "loss": 1.902, + "step": 16376 + }, + { + "epoch": 1.3958066990539504, + "grad_norm": 42.93389485229201, + "learning_rate": 6.457872256123745e-06, + "loss": 2.7832, + "step": 16377 + }, + { + "epoch": 1.3958919287479759, + "grad_norm": 58.361108522014334, + "learning_rate": 6.457397946641632e-06, + "loss": 2.3425, + "step": 16378 + }, + { + "epoch": 1.395977158442001, + "grad_norm": 50.28969730322687, + "learning_rate": 6.4569236228265884e-06, + "loss": 2.6363, + "step": 16379 + }, + { + "epoch": 1.3960623881360266, + "grad_norm": 45.35730709023291, + "learning_rate": 6.45644928468328e-06, + "loss": 2.7197, + "step": 16380 + }, + { + "epoch": 1.396147617830052, + "grad_norm": 37.12507453034379, + "learning_rate": 6.455974932216371e-06, + "loss": 3.2289, + "step": 16381 + }, + { + "epoch": 1.3962328475240775, + "grad_norm": 51.7264354313743, + "learning_rate": 6.455500565430528e-06, + "loss": 2.2078, + "step": 16382 + }, + { + "epoch": 1.3963180772181027, + "grad_norm": 45.636122504105145, + "learning_rate": 6.455026184330415e-06, + "loss": 2.7083, + "step": 16383 + }, + { + "epoch": 1.3964033069121282, + "grad_norm": 87.19839010885522, + "learning_rate": 6.454551788920698e-06, + "loss": 3.8212, + "step": 16384 + }, + { + "epoch": 1.3964885366061535, + "grad_norm": 126.58670914228718, + "learning_rate": 6.454077379206041e-06, + "loss": 4.94, + "step": 16385 + }, + { + "epoch": 1.396573766300179, + "grad_norm": 31.57608960937955, + "learning_rate": 6.453602955191112e-06, + "loss": 2.5289, + "step": 16386 + }, + { + "epoch": 1.3966589959942044, + "grad_norm": 43.67827031442784, + "learning_rate": 6.453128516880574e-06, + "loss": 3.0057, + "step": 16387 + }, + { + "epoch": 1.3967442256882299, + "grad_norm": 78.95189925083139, + "learning_rate": 6.452654064279097e-06, + "loss": 3.8139, + "step": 16388 + }, + { + "epoch": 1.3968294553822551, + "grad_norm": 33.21328595309689, + "learning_rate": 6.452179597391341e-06, + "loss": 2.3309, + "step": 16389 + }, + { + "epoch": 1.3969146850762806, + "grad_norm": 49.831924368834116, + "learning_rate": 6.451705116221979e-06, + "loss": 3.8327, + "step": 16390 + }, + { + "epoch": 1.396999914770306, + "grad_norm": 58.166242220198136, + "learning_rate": 6.451230620775673e-06, + "loss": 2.1752, + "step": 16391 + }, + { + "epoch": 1.3970851444643313, + "grad_norm": 36.75377010100628, + "learning_rate": 6.450756111057091e-06, + "loss": 2.9259, + "step": 16392 + }, + { + "epoch": 1.3971703741583568, + "grad_norm": 63.737206706953415, + "learning_rate": 6.450281587070897e-06, + "loss": 3.361, + "step": 16393 + }, + { + "epoch": 1.3972556038523822, + "grad_norm": 40.027605162906504, + "learning_rate": 6.4498070488217615e-06, + "loss": 2.772, + "step": 16394 + }, + { + "epoch": 1.3973408335464077, + "grad_norm": 75.84024910770833, + "learning_rate": 6.449332496314349e-06, + "loss": 2.4316, + "step": 16395 + }, + { + "epoch": 1.397426063240433, + "grad_norm": 29.615417576433508, + "learning_rate": 6.448857929553328e-06, + "loss": 2.0572, + "step": 16396 + }, + { + "epoch": 1.3975112929344584, + "grad_norm": 72.20983912697425, + "learning_rate": 6.448383348543363e-06, + "loss": 3.6593, + "step": 16397 + }, + { + "epoch": 1.3975965226284837, + "grad_norm": 55.07560535600476, + "learning_rate": 6.4479087532891275e-06, + "loss": 2.2562, + "step": 16398 + }, + { + "epoch": 1.3976817523225091, + "grad_norm": 78.51838100221917, + "learning_rate": 6.447434143795281e-06, + "loss": 2.937, + "step": 16399 + }, + { + "epoch": 1.3977669820165346, + "grad_norm": 46.96974660419701, + "learning_rate": 6.446959520066497e-06, + "loss": 3.529, + "step": 16400 + }, + { + "epoch": 1.39785221171056, + "grad_norm": 43.02817503790972, + "learning_rate": 6.44648488210744e-06, + "loss": 3.6593, + "step": 16401 + }, + { + "epoch": 1.3979374414045853, + "grad_norm": 92.46857778916568, + "learning_rate": 6.446010229922779e-06, + "loss": 3.0018, + "step": 16402 + }, + { + "epoch": 1.3980226710986108, + "grad_norm": 59.56228889381766, + "learning_rate": 6.445535563517182e-06, + "loss": 3.2411, + "step": 16403 + }, + { + "epoch": 1.398107900792636, + "grad_norm": 44.97876608438383, + "learning_rate": 6.445060882895318e-06, + "loss": 3.3471, + "step": 16404 + }, + { + "epoch": 1.3981931304866615, + "grad_norm": 34.521807835466355, + "learning_rate": 6.444586188061853e-06, + "loss": 2.6556, + "step": 16405 + }, + { + "epoch": 1.398278360180687, + "grad_norm": 38.08721769196506, + "learning_rate": 6.4441114790214576e-06, + "loss": 2.0679, + "step": 16406 + }, + { + "epoch": 1.3983635898747124, + "grad_norm": 87.38292470683334, + "learning_rate": 6.4436367557788e-06, + "loss": 3.9118, + "step": 16407 + }, + { + "epoch": 1.3984488195687377, + "grad_norm": 25.40435222943651, + "learning_rate": 6.443162018338548e-06, + "loss": 2.4249, + "step": 16408 + }, + { + "epoch": 1.3985340492627631, + "grad_norm": 96.84007915221096, + "learning_rate": 6.44268726670537e-06, + "loss": 3.7349, + "step": 16409 + }, + { + "epoch": 1.3986192789567886, + "grad_norm": 34.90271070042206, + "learning_rate": 6.442212500883937e-06, + "loss": 3.0788, + "step": 16410 + }, + { + "epoch": 1.3987045086508139, + "grad_norm": 49.32844220222571, + "learning_rate": 6.441737720878919e-06, + "loss": 3.6183, + "step": 16411 + }, + { + "epoch": 1.3987897383448393, + "grad_norm": 100.57188360356454, + "learning_rate": 6.441262926694982e-06, + "loss": 4.6757, + "step": 16412 + }, + { + "epoch": 1.3988749680388648, + "grad_norm": 26.886089686339613, + "learning_rate": 6.440788118336796e-06, + "loss": 1.6867, + "step": 16413 + }, + { + "epoch": 1.3989601977328903, + "grad_norm": 55.6259351660075, + "learning_rate": 6.440313295809031e-06, + "loss": 3.0653, + "step": 16414 + }, + { + "epoch": 1.3990454274269155, + "grad_norm": 27.96159129685782, + "learning_rate": 6.439838459116359e-06, + "loss": 2.3959, + "step": 16415 + }, + { + "epoch": 1.399130657120941, + "grad_norm": 44.21050614693441, + "learning_rate": 6.4393636082634466e-06, + "loss": 2.0538, + "step": 16416 + }, + { + "epoch": 1.3992158868149662, + "grad_norm": 39.21376963679369, + "learning_rate": 6.438888743254965e-06, + "loss": 2.7634, + "step": 16417 + }, + { + "epoch": 1.3993011165089917, + "grad_norm": 83.97887891241857, + "learning_rate": 6.4384138640955855e-06, + "loss": 4.4772, + "step": 16418 + }, + { + "epoch": 1.3993863462030172, + "grad_norm": 63.902129176734704, + "learning_rate": 6.437938970789976e-06, + "loss": 2.8156, + "step": 16419 + }, + { + "epoch": 1.3994715758970426, + "grad_norm": 24.212923106307525, + "learning_rate": 6.437464063342809e-06, + "loss": 1.3477, + "step": 16420 + }, + { + "epoch": 1.3995568055910679, + "grad_norm": 89.20569910256155, + "learning_rate": 6.4369891417587525e-06, + "loss": 3.3038, + "step": 16421 + }, + { + "epoch": 1.3996420352850933, + "grad_norm": 58.17880610006276, + "learning_rate": 6.4365142060424814e-06, + "loss": 1.7957, + "step": 16422 + }, + { + "epoch": 1.3997272649791186, + "grad_norm": 47.96978307932897, + "learning_rate": 6.436039256198661e-06, + "loss": 2.3942, + "step": 16423 + }, + { + "epoch": 1.399812494673144, + "grad_norm": 79.64649296614282, + "learning_rate": 6.435564292231967e-06, + "loss": 3.4627, + "step": 16424 + }, + { + "epoch": 1.3998977243671695, + "grad_norm": 81.71634052768334, + "learning_rate": 6.435089314147068e-06, + "loss": 3.439, + "step": 16425 + }, + { + "epoch": 1.399982954061195, + "grad_norm": 44.44675448591801, + "learning_rate": 6.434614321948637e-06, + "loss": 2.1883, + "step": 16426 + }, + { + "epoch": 1.4000681837552202, + "grad_norm": 17.12774581177018, + "learning_rate": 6.434139315641343e-06, + "loss": 1.6143, + "step": 16427 + }, + { + "epoch": 1.4001534134492457, + "grad_norm": 61.32409360601047, + "learning_rate": 6.433664295229858e-06, + "loss": 2.7572, + "step": 16428 + }, + { + "epoch": 1.4002386431432712, + "grad_norm": 45.19276977377315, + "learning_rate": 6.433189260718855e-06, + "loss": 3.2958, + "step": 16429 + }, + { + "epoch": 1.4003238728372964, + "grad_norm": 43.06540395706565, + "learning_rate": 6.4327142121130075e-06, + "loss": 3.128, + "step": 16430 + }, + { + "epoch": 1.400409102531322, + "grad_norm": 44.60450472621073, + "learning_rate": 6.432239149416984e-06, + "loss": 2.0844, + "step": 16431 + }, + { + "epoch": 1.4004943322253474, + "grad_norm": 59.28644113676613, + "learning_rate": 6.4317640726354535e-06, + "loss": 3.2839, + "step": 16432 + }, + { + "epoch": 1.4005795619193728, + "grad_norm": 61.89397596514953, + "learning_rate": 6.431288981773096e-06, + "loss": 3.3882, + "step": 16433 + }, + { + "epoch": 1.400664791613398, + "grad_norm": 63.90037059927167, + "learning_rate": 6.430813876834579e-06, + "loss": 2.5211, + "step": 16434 + }, + { + "epoch": 1.4007500213074235, + "grad_norm": 132.04915808597565, + "learning_rate": 6.430338757824578e-06, + "loss": 3.6541, + "step": 16435 + }, + { + "epoch": 1.4008352510014488, + "grad_norm": 182.70116744412064, + "learning_rate": 6.429863624747761e-06, + "loss": 2.8827, + "step": 16436 + }, + { + "epoch": 1.4009204806954743, + "grad_norm": 37.75942841314914, + "learning_rate": 6.429388477608804e-06, + "loss": 2.7544, + "step": 16437 + }, + { + "epoch": 1.4010057103894997, + "grad_norm": 46.18626592601726, + "learning_rate": 6.4289133164123805e-06, + "loss": 2.2231, + "step": 16438 + }, + { + "epoch": 1.4010909400835252, + "grad_norm": 39.34235845837581, + "learning_rate": 6.428438141163163e-06, + "loss": 2.9078, + "step": 16439 + }, + { + "epoch": 1.4011761697775504, + "grad_norm": 38.398969101979226, + "learning_rate": 6.42796295186582e-06, + "loss": 2.3887, + "step": 16440 + }, + { + "epoch": 1.401261399471576, + "grad_norm": 52.135971423849355, + "learning_rate": 6.427487748525033e-06, + "loss": 3.008, + "step": 16441 + }, + { + "epoch": 1.4013466291656014, + "grad_norm": 36.46195291358771, + "learning_rate": 6.427012531145469e-06, + "loss": 1.8697, + "step": 16442 + }, + { + "epoch": 1.4014318588596266, + "grad_norm": 63.75818571588095, + "learning_rate": 6.426537299731805e-06, + "loss": 3.5257, + "step": 16443 + }, + { + "epoch": 1.401517088553652, + "grad_norm": 36.42153725118875, + "learning_rate": 6.426062054288713e-06, + "loss": 2.0848, + "step": 16444 + }, + { + "epoch": 1.4016023182476776, + "grad_norm": 33.15073155314986, + "learning_rate": 6.425586794820867e-06, + "loss": 2.8011, + "step": 16445 + }, + { + "epoch": 1.401687547941703, + "grad_norm": 49.78723887492366, + "learning_rate": 6.425111521332943e-06, + "loss": 2.7898, + "step": 16446 + }, + { + "epoch": 1.4017727776357283, + "grad_norm": 42.8885406345428, + "learning_rate": 6.424636233829612e-06, + "loss": 3.0016, + "step": 16447 + }, + { + "epoch": 1.4018580073297537, + "grad_norm": 16.642015094842893, + "learning_rate": 6.42416093231555e-06, + "loss": 1.7652, + "step": 16448 + }, + { + "epoch": 1.401943237023779, + "grad_norm": 35.97630115035271, + "learning_rate": 6.4236856167954295e-06, + "loss": 3.5454, + "step": 16449 + }, + { + "epoch": 1.4020284667178045, + "grad_norm": 64.61278567082299, + "learning_rate": 6.42321028727393e-06, + "loss": 2.4083, + "step": 16450 + }, + { + "epoch": 1.40211369641183, + "grad_norm": 38.373234634757, + "learning_rate": 6.422734943755721e-06, + "loss": 2.6519, + "step": 16451 + }, + { + "epoch": 1.4021989261058554, + "grad_norm": 61.911721067718624, + "learning_rate": 6.422259586245479e-06, + "loss": 3.0213, + "step": 16452 + }, + { + "epoch": 1.4022841557998806, + "grad_norm": 40.7824958738013, + "learning_rate": 6.421784214747878e-06, + "loss": 3.1425, + "step": 16453 + }, + { + "epoch": 1.402369385493906, + "grad_norm": 92.64207363222012, + "learning_rate": 6.421308829267596e-06, + "loss": 3.7727, + "step": 16454 + }, + { + "epoch": 1.4024546151879314, + "grad_norm": 31.98159142042456, + "learning_rate": 6.4208334298093066e-06, + "loss": 2.6141, + "step": 16455 + }, + { + "epoch": 1.4025398448819568, + "grad_norm": 62.999776311809285, + "learning_rate": 6.420358016377683e-06, + "loss": 2.5462, + "step": 16456 + }, + { + "epoch": 1.4026250745759823, + "grad_norm": 55.24572545483807, + "learning_rate": 6.419882588977405e-06, + "loss": 3.3354, + "step": 16457 + }, + { + "epoch": 1.4027103042700078, + "grad_norm": 164.27717112090724, + "learning_rate": 6.419407147613146e-06, + "loss": 4.9954, + "step": 16458 + }, + { + "epoch": 1.402795533964033, + "grad_norm": 76.35914381195637, + "learning_rate": 6.418931692289581e-06, + "loss": 3.5009, + "step": 16459 + }, + { + "epoch": 1.4028807636580585, + "grad_norm": 36.19770936231555, + "learning_rate": 6.4184562230113856e-06, + "loss": 3.0076, + "step": 16460 + }, + { + "epoch": 1.402965993352084, + "grad_norm": 59.989329612859066, + "learning_rate": 6.4179807397832386e-06, + "loss": 3.1761, + "step": 16461 + }, + { + "epoch": 1.4030512230461092, + "grad_norm": 93.07818044120647, + "learning_rate": 6.417505242609813e-06, + "loss": 3.0376, + "step": 16462 + }, + { + "epoch": 1.4031364527401347, + "grad_norm": 68.10822247721744, + "learning_rate": 6.417029731495786e-06, + "loss": 2.6445, + "step": 16463 + }, + { + "epoch": 1.4032216824341601, + "grad_norm": 40.08389998276399, + "learning_rate": 6.4165542064458364e-06, + "loss": 2.8807, + "step": 16464 + }, + { + "epoch": 1.4033069121281856, + "grad_norm": 40.11752802103563, + "learning_rate": 6.416078667464639e-06, + "loss": 3.0005, + "step": 16465 + }, + { + "epoch": 1.4033921418222108, + "grad_norm": 34.80482510874383, + "learning_rate": 6.41560311455687e-06, + "loss": 2.3814, + "step": 16466 + }, + { + "epoch": 1.4034773715162363, + "grad_norm": 49.7382701579634, + "learning_rate": 6.4151275477272065e-06, + "loss": 2.326, + "step": 16467 + }, + { + "epoch": 1.4035626012102616, + "grad_norm": 199.21877493563383, + "learning_rate": 6.4146519669803265e-06, + "loss": 3.0291, + "step": 16468 + }, + { + "epoch": 1.403647830904287, + "grad_norm": 45.416267061003, + "learning_rate": 6.4141763723209075e-06, + "loss": 3.26, + "step": 16469 + }, + { + "epoch": 1.4037330605983125, + "grad_norm": 54.14758871054243, + "learning_rate": 6.4137007637536234e-06, + "loss": 2.1778, + "step": 16470 + }, + { + "epoch": 1.403818290292338, + "grad_norm": 41.8551912411126, + "learning_rate": 6.413225141283155e-06, + "loss": 2.2345, + "step": 16471 + }, + { + "epoch": 1.4039035199863632, + "grad_norm": 66.0912741387234, + "learning_rate": 6.412749504914179e-06, + "loss": 3.5044, + "step": 16472 + }, + { + "epoch": 1.4039887496803887, + "grad_norm": 65.20263995773252, + "learning_rate": 6.412273854651374e-06, + "loss": 3.6829, + "step": 16473 + }, + { + "epoch": 1.404073979374414, + "grad_norm": 64.2844995070314, + "learning_rate": 6.411798190499416e-06, + "loss": 3.6463, + "step": 16474 + }, + { + "epoch": 1.4041592090684394, + "grad_norm": 147.8050457613672, + "learning_rate": 6.411322512462983e-06, + "loss": 3.2715, + "step": 16475 + }, + { + "epoch": 1.4042444387624649, + "grad_norm": 29.31160740417232, + "learning_rate": 6.410846820546757e-06, + "loss": 2.5743, + "step": 16476 + }, + { + "epoch": 1.4043296684564903, + "grad_norm": 50.276366547841064, + "learning_rate": 6.410371114755409e-06, + "loss": 3.2756, + "step": 16477 + }, + { + "epoch": 1.4044148981505156, + "grad_norm": 39.55881315737994, + "learning_rate": 6.409895395093624e-06, + "loss": 2.5993, + "step": 16478 + }, + { + "epoch": 1.404500127844541, + "grad_norm": 44.603107070389434, + "learning_rate": 6.409419661566076e-06, + "loss": 3.463, + "step": 16479 + }, + { + "epoch": 1.4045853575385665, + "grad_norm": 32.56942564988504, + "learning_rate": 6.408943914177449e-06, + "loss": 2.3127, + "step": 16480 + }, + { + "epoch": 1.4046705872325917, + "grad_norm": 74.97887140379032, + "learning_rate": 6.408468152932417e-06, + "loss": 2.3403, + "step": 16481 + }, + { + "epoch": 1.4047558169266172, + "grad_norm": 42.91353896065943, + "learning_rate": 6.407992377835659e-06, + "loss": 2.844, + "step": 16482 + }, + { + "epoch": 1.4048410466206427, + "grad_norm": 66.00473999649563, + "learning_rate": 6.407516588891856e-06, + "loss": 3.1375, + "step": 16483 + }, + { + "epoch": 1.4049262763146682, + "grad_norm": 66.39583504288915, + "learning_rate": 6.407040786105687e-06, + "loss": 2.3709, + "step": 16484 + }, + { + "epoch": 1.4050115060086934, + "grad_norm": 36.63713199451196, + "learning_rate": 6.406564969481831e-06, + "loss": 2.6208, + "step": 16485 + }, + { + "epoch": 1.4050967357027189, + "grad_norm": 259.7298305016573, + "learning_rate": 6.406089139024968e-06, + "loss": 4.4956, + "step": 16486 + }, + { + "epoch": 1.4051819653967441, + "grad_norm": 40.214158601136994, + "learning_rate": 6.405613294739775e-06, + "loss": 2.7711, + "step": 16487 + }, + { + "epoch": 1.4052671950907696, + "grad_norm": 47.68659462010073, + "learning_rate": 6.405137436630937e-06, + "loss": 3.4131, + "step": 16488 + }, + { + "epoch": 1.405352424784795, + "grad_norm": 54.40136245687294, + "learning_rate": 6.4046615647031294e-06, + "loss": 3.0204, + "step": 16489 + }, + { + "epoch": 1.4054376544788205, + "grad_norm": 117.19633172835945, + "learning_rate": 6.404185678961032e-06, + "loss": 3.995, + "step": 16490 + }, + { + "epoch": 1.4055228841728458, + "grad_norm": 57.04473589482011, + "learning_rate": 6.403709779409327e-06, + "loss": 3.0317, + "step": 16491 + }, + { + "epoch": 1.4056081138668712, + "grad_norm": 34.37076804462288, + "learning_rate": 6.4032338660526945e-06, + "loss": 3.3484, + "step": 16492 + }, + { + "epoch": 1.4056933435608965, + "grad_norm": 27.3072676437831, + "learning_rate": 6.402757938895816e-06, + "loss": 2.1841, + "step": 16493 + }, + { + "epoch": 1.405778573254922, + "grad_norm": 31.886533910907918, + "learning_rate": 6.402281997943368e-06, + "loss": 3.3729, + "step": 16494 + }, + { + "epoch": 1.4058638029489474, + "grad_norm": 109.77653694710993, + "learning_rate": 6.401806043200035e-06, + "loss": 3.544, + "step": 16495 + }, + { + "epoch": 1.4059490326429729, + "grad_norm": 38.10475176239757, + "learning_rate": 6.401330074670496e-06, + "loss": 2.9652, + "step": 16496 + }, + { + "epoch": 1.4060342623369981, + "grad_norm": 83.47221728688478, + "learning_rate": 6.400854092359434e-06, + "loss": 2.957, + "step": 16497 + }, + { + "epoch": 1.4061194920310236, + "grad_norm": 97.68263671958012, + "learning_rate": 6.4003780962715265e-06, + "loss": 2.4952, + "step": 16498 + }, + { + "epoch": 1.406204721725049, + "grad_norm": 42.34695182454933, + "learning_rate": 6.399902086411458e-06, + "loss": 2.3186, + "step": 16499 + }, + { + "epoch": 1.4062899514190743, + "grad_norm": 57.554729862269646, + "learning_rate": 6.399426062783909e-06, + "loss": 2.972, + "step": 16500 + }, + { + "epoch": 1.4063751811130998, + "grad_norm": 38.03303487214474, + "learning_rate": 6.398950025393561e-06, + "loss": 3.2629, + "step": 16501 + }, + { + "epoch": 1.4064604108071252, + "grad_norm": 42.339444301863864, + "learning_rate": 6.398473974245094e-06, + "loss": 3.1477, + "step": 16502 + }, + { + "epoch": 1.4065456405011507, + "grad_norm": 94.69771655954642, + "learning_rate": 6.397997909343192e-06, + "loss": 4.3242, + "step": 16503 + }, + { + "epoch": 1.406630870195176, + "grad_norm": 49.343934891553694, + "learning_rate": 6.397521830692537e-06, + "loss": 2.8505, + "step": 16504 + }, + { + "epoch": 1.4067160998892014, + "grad_norm": 36.638585002672706, + "learning_rate": 6.397045738297809e-06, + "loss": 2.5755, + "step": 16505 + }, + { + "epoch": 1.4068013295832267, + "grad_norm": 82.7834026306429, + "learning_rate": 6.3965696321636915e-06, + "loss": 3.0172, + "step": 16506 + }, + { + "epoch": 1.4068865592772521, + "grad_norm": 46.64542022839852, + "learning_rate": 6.396093512294866e-06, + "loss": 3.5765, + "step": 16507 + }, + { + "epoch": 1.4069717889712776, + "grad_norm": 87.34751455243644, + "learning_rate": 6.3956173786960175e-06, + "loss": 3.7924, + "step": 16508 + }, + { + "epoch": 1.407057018665303, + "grad_norm": 53.43341600564291, + "learning_rate": 6.395141231371826e-06, + "loss": 2.7925, + "step": 16509 + }, + { + "epoch": 1.4071422483593283, + "grad_norm": 31.11639855587019, + "learning_rate": 6.394665070326973e-06, + "loss": 2.853, + "step": 16510 + }, + { + "epoch": 1.4072274780533538, + "grad_norm": 58.346491925928305, + "learning_rate": 6.394188895566145e-06, + "loss": 2.9606, + "step": 16511 + }, + { + "epoch": 1.4073127077473793, + "grad_norm": 42.26903578063492, + "learning_rate": 6.393712707094022e-06, + "loss": 2.8065, + "step": 16512 + }, + { + "epoch": 1.4073979374414045, + "grad_norm": 61.76651766443832, + "learning_rate": 6.393236504915289e-06, + "loss": 2.9294, + "step": 16513 + }, + { + "epoch": 1.40748316713543, + "grad_norm": 62.218666386972814, + "learning_rate": 6.392760289034629e-06, + "loss": 2.8325, + "step": 16514 + }, + { + "epoch": 1.4075683968294554, + "grad_norm": 35.52001868703966, + "learning_rate": 6.392284059456724e-06, + "loss": 2.8982, + "step": 16515 + }, + { + "epoch": 1.407653626523481, + "grad_norm": 30.171390795637322, + "learning_rate": 6.39180781618626e-06, + "loss": 2.3002, + "step": 16516 + }, + { + "epoch": 1.4077388562175062, + "grad_norm": 22.847868622568328, + "learning_rate": 6.391331559227918e-06, + "loss": 1.9843, + "step": 16517 + }, + { + "epoch": 1.4078240859115316, + "grad_norm": 50.80692424642982, + "learning_rate": 6.390855288586383e-06, + "loss": 3.4343, + "step": 16518 + }, + { + "epoch": 1.4079093156055569, + "grad_norm": 39.81014078392047, + "learning_rate": 6.39037900426634e-06, + "loss": 3.2859, + "step": 16519 + }, + { + "epoch": 1.4079945452995823, + "grad_norm": 33.39940625037768, + "learning_rate": 6.389902706272471e-06, + "loss": 2.818, + "step": 16520 + }, + { + "epoch": 1.4080797749936078, + "grad_norm": 31.080300918774373, + "learning_rate": 6.3894263946094605e-06, + "loss": 2.2593, + "step": 16521 + }, + { + "epoch": 1.4081650046876333, + "grad_norm": 34.90052479359897, + "learning_rate": 6.3889500692819936e-06, + "loss": 2.9355, + "step": 16522 + }, + { + "epoch": 1.4082502343816585, + "grad_norm": 37.222666739285835, + "learning_rate": 6.3884737302947566e-06, + "loss": 2.9812, + "step": 16523 + }, + { + "epoch": 1.408335464075684, + "grad_norm": 27.146227539992836, + "learning_rate": 6.38799737765243e-06, + "loss": 1.9737, + "step": 16524 + }, + { + "epoch": 1.4084206937697092, + "grad_norm": 46.355837195003446, + "learning_rate": 6.387521011359702e-06, + "loss": 2.5056, + "step": 16525 + }, + { + "epoch": 1.4085059234637347, + "grad_norm": 27.97206943015032, + "learning_rate": 6.3870446314212555e-06, + "loss": 2.5007, + "step": 16526 + }, + { + "epoch": 1.4085911531577602, + "grad_norm": 52.206227602160425, + "learning_rate": 6.386568237841777e-06, + "loss": 2.4668, + "step": 16527 + }, + { + "epoch": 1.4086763828517856, + "grad_norm": 30.34156149251754, + "learning_rate": 6.38609183062595e-06, + "loss": 2.9386, + "step": 16528 + }, + { + "epoch": 1.408761612545811, + "grad_norm": 86.77962877988814, + "learning_rate": 6.385615409778463e-06, + "loss": 3.3628, + "step": 16529 + }, + { + "epoch": 1.4088468422398364, + "grad_norm": 65.08236802720592, + "learning_rate": 6.385138975303995e-06, + "loss": 3.5376, + "step": 16530 + }, + { + "epoch": 1.4089320719338618, + "grad_norm": 55.776466768617745, + "learning_rate": 6.384662527207239e-06, + "loss": 3.0946, + "step": 16531 + }, + { + "epoch": 1.409017301627887, + "grad_norm": 66.65651210175325, + "learning_rate": 6.384186065492877e-06, + "loss": 3.3128, + "step": 16532 + }, + { + "epoch": 1.4091025313219125, + "grad_norm": 143.19594529035993, + "learning_rate": 6.383709590165593e-06, + "loss": 2.4136, + "step": 16533 + }, + { + "epoch": 1.409187761015938, + "grad_norm": 36.05528583163119, + "learning_rate": 6.383233101230075e-06, + "loss": 3.0309, + "step": 16534 + }, + { + "epoch": 1.4092729907099635, + "grad_norm": 66.64772865138924, + "learning_rate": 6.38275659869101e-06, + "loss": 3.0699, + "step": 16535 + }, + { + "epoch": 1.4093582204039887, + "grad_norm": 31.214555616948424, + "learning_rate": 6.382280082553085e-06, + "loss": 2.1978, + "step": 16536 + }, + { + "epoch": 1.4094434500980142, + "grad_norm": 97.45802535858442, + "learning_rate": 6.381803552820982e-06, + "loss": 4.2014, + "step": 16537 + }, + { + "epoch": 1.4095286797920394, + "grad_norm": 59.29648238171147, + "learning_rate": 6.381327009499391e-06, + "loss": 3.1503, + "step": 16538 + }, + { + "epoch": 1.409613909486065, + "grad_norm": 64.44232124372654, + "learning_rate": 6.380850452592998e-06, + "loss": 3.6623, + "step": 16539 + }, + { + "epoch": 1.4096991391800904, + "grad_norm": 103.61620904048503, + "learning_rate": 6.38037388210649e-06, + "loss": 3.7013, + "step": 16540 + }, + { + "epoch": 1.4097843688741158, + "grad_norm": 72.62318955670958, + "learning_rate": 6.379897298044553e-06, + "loss": 3.0487, + "step": 16541 + }, + { + "epoch": 1.409869598568141, + "grad_norm": 56.82603268612912, + "learning_rate": 6.3794207004118736e-06, + "loss": 2.6086, + "step": 16542 + }, + { + "epoch": 1.4099548282621666, + "grad_norm": 40.07557857172375, + "learning_rate": 6.378944089213141e-06, + "loss": 3.0586, + "step": 16543 + }, + { + "epoch": 1.4100400579561918, + "grad_norm": 97.99339554143118, + "learning_rate": 6.378467464453041e-06, + "loss": 2.7457, + "step": 16544 + }, + { + "epoch": 1.4101252876502173, + "grad_norm": 51.58729792813279, + "learning_rate": 6.37799082613626e-06, + "loss": 2.4903, + "step": 16545 + }, + { + "epoch": 1.4102105173442427, + "grad_norm": 58.92843137711517, + "learning_rate": 6.377514174267487e-06, + "loss": 3.6262, + "step": 16546 + }, + { + "epoch": 1.4102957470382682, + "grad_norm": 58.87524759725349, + "learning_rate": 6.3770375088514115e-06, + "loss": 2.9385, + "step": 16547 + }, + { + "epoch": 1.4103809767322935, + "grad_norm": 47.88143369933382, + "learning_rate": 6.376560829892717e-06, + "loss": 2.5976, + "step": 16548 + }, + { + "epoch": 1.410466206426319, + "grad_norm": 53.5279865120367, + "learning_rate": 6.376084137396095e-06, + "loss": 2.5128, + "step": 16549 + }, + { + "epoch": 1.4105514361203444, + "grad_norm": 45.874716786143466, + "learning_rate": 6.375607431366231e-06, + "loss": 2.1007, + "step": 16550 + }, + { + "epoch": 1.4106366658143696, + "grad_norm": 124.0131663188864, + "learning_rate": 6.3751307118078165e-06, + "loss": 3.4557, + "step": 16551 + }, + { + "epoch": 1.410721895508395, + "grad_norm": 30.897490701557352, + "learning_rate": 6.374653978725536e-06, + "loss": 2.744, + "step": 16552 + }, + { + "epoch": 1.4108071252024206, + "grad_norm": 27.812497387484637, + "learning_rate": 6.3741772321240805e-06, + "loss": 1.701, + "step": 16553 + }, + { + "epoch": 1.410892354896446, + "grad_norm": 42.5836794158215, + "learning_rate": 6.373700472008137e-06, + "loss": 3.8538, + "step": 16554 + }, + { + "epoch": 1.4109775845904713, + "grad_norm": 49.671235025030526, + "learning_rate": 6.373223698382398e-06, + "loss": 4.1291, + "step": 16555 + }, + { + "epoch": 1.4110628142844968, + "grad_norm": 22.40964536804974, + "learning_rate": 6.372746911251548e-06, + "loss": 2.4531, + "step": 16556 + }, + { + "epoch": 1.411148043978522, + "grad_norm": 40.641016353241966, + "learning_rate": 6.3722701106202765e-06, + "loss": 2.7888, + "step": 16557 + }, + { + "epoch": 1.4112332736725475, + "grad_norm": 59.57547543558218, + "learning_rate": 6.371793296493275e-06, + "loss": 2.6126, + "step": 16558 + }, + { + "epoch": 1.411318503366573, + "grad_norm": 57.05332673458831, + "learning_rate": 6.371316468875232e-06, + "loss": 3.6039, + "step": 16559 + }, + { + "epoch": 1.4114037330605984, + "grad_norm": 43.04552741561164, + "learning_rate": 6.3708396277708355e-06, + "loss": 3.3928, + "step": 16560 + }, + { + "epoch": 1.4114889627546237, + "grad_norm": 35.95888882630749, + "learning_rate": 6.370362773184776e-06, + "loss": 3.4305, + "step": 16561 + }, + { + "epoch": 1.4115741924486491, + "grad_norm": 42.742258403466394, + "learning_rate": 6.369885905121745e-06, + "loss": 2.7597, + "step": 16562 + }, + { + "epoch": 1.4116594221426744, + "grad_norm": 51.279240392819496, + "learning_rate": 6.369409023586427e-06, + "loss": 2.6723, + "step": 16563 + }, + { + "epoch": 1.4117446518366998, + "grad_norm": 17.61814283704885, + "learning_rate": 6.368932128583517e-06, + "loss": 1.3566, + "step": 16564 + }, + { + "epoch": 1.4118298815307253, + "grad_norm": 39.88707257900656, + "learning_rate": 6.368455220117704e-06, + "loss": 2.7579, + "step": 16565 + }, + { + "epoch": 1.4119151112247508, + "grad_norm": 20.404024260104414, + "learning_rate": 6.367978298193678e-06, + "loss": 1.5825, + "step": 16566 + }, + { + "epoch": 1.412000340918776, + "grad_norm": 47.78851969544283, + "learning_rate": 6.367501362816129e-06, + "loss": 3.8663, + "step": 16567 + }, + { + "epoch": 1.4120855706128015, + "grad_norm": 31.33206008085057, + "learning_rate": 6.367024413989745e-06, + "loss": 2.4824, + "step": 16568 + }, + { + "epoch": 1.412170800306827, + "grad_norm": 69.55172087376732, + "learning_rate": 6.366547451719222e-06, + "loss": 2.6296, + "step": 16569 + }, + { + "epoch": 1.4122560300008522, + "grad_norm": 36.89511996055006, + "learning_rate": 6.366070476009247e-06, + "loss": 2.6597, + "step": 16570 + }, + { + "epoch": 1.4123412596948777, + "grad_norm": 58.631979401519416, + "learning_rate": 6.365593486864511e-06, + "loss": 3.1095, + "step": 16571 + }, + { + "epoch": 1.4124264893889031, + "grad_norm": 98.47807426610784, + "learning_rate": 6.365116484289706e-06, + "loss": 3.3152, + "step": 16572 + }, + { + "epoch": 1.4125117190829286, + "grad_norm": 69.9061061966105, + "learning_rate": 6.364639468289523e-06, + "loss": 4.4922, + "step": 16573 + }, + { + "epoch": 1.4125969487769539, + "grad_norm": 36.85745687719522, + "learning_rate": 6.364162438868653e-06, + "loss": 2.8063, + "step": 16574 + }, + { + "epoch": 1.4126821784709793, + "grad_norm": 34.94436504730912, + "learning_rate": 6.363685396031788e-06, + "loss": 2.6147, + "step": 16575 + }, + { + "epoch": 1.4127674081650046, + "grad_norm": 32.060546762698934, + "learning_rate": 6.3632083397836175e-06, + "loss": 2.6102, + "step": 16576 + }, + { + "epoch": 1.41285263785903, + "grad_norm": 120.05653783910284, + "learning_rate": 6.3627312701288365e-06, + "loss": 4.2533, + "step": 16577 + }, + { + "epoch": 1.4129378675530555, + "grad_norm": 48.09361670233807, + "learning_rate": 6.362254187072134e-06, + "loss": 2.0538, + "step": 16578 + }, + { + "epoch": 1.413023097247081, + "grad_norm": 32.77300733496143, + "learning_rate": 6.361777090618204e-06, + "loss": 2.4359, + "step": 16579 + }, + { + "epoch": 1.4131083269411062, + "grad_norm": 66.42693996086122, + "learning_rate": 6.361299980771737e-06, + "loss": 3.7846, + "step": 16580 + }, + { + "epoch": 1.4131935566351317, + "grad_norm": 75.81499868214209, + "learning_rate": 6.360822857537424e-06, + "loss": 2.5416, + "step": 16581 + }, + { + "epoch": 1.4132787863291572, + "grad_norm": 59.77400946542594, + "learning_rate": 6.3603457209199605e-06, + "loss": 2.5873, + "step": 16582 + }, + { + "epoch": 1.4133640160231824, + "grad_norm": 41.57459851157694, + "learning_rate": 6.359868570924037e-06, + "loss": 3.079, + "step": 16583 + }, + { + "epoch": 1.4134492457172079, + "grad_norm": 31.508802576818738, + "learning_rate": 6.359391407554346e-06, + "loss": 2.7639, + "step": 16584 + }, + { + "epoch": 1.4135344754112333, + "grad_norm": 38.432291977081206, + "learning_rate": 6.358914230815581e-06, + "loss": 2.3092, + "step": 16585 + }, + { + "epoch": 1.4136197051052588, + "grad_norm": 75.66281039470586, + "learning_rate": 6.358437040712435e-06, + "loss": 2.9928, + "step": 16586 + }, + { + "epoch": 1.413704934799284, + "grad_norm": 57.09586430260403, + "learning_rate": 6.357959837249601e-06, + "loss": 2.685, + "step": 16587 + }, + { + "epoch": 1.4137901644933095, + "grad_norm": 25.639727353464426, + "learning_rate": 6.35748262043177e-06, + "loss": 2.1313, + "step": 16588 + }, + { + "epoch": 1.4138753941873348, + "grad_norm": 36.082739674131794, + "learning_rate": 6.357005390263638e-06, + "loss": 2.6106, + "step": 16589 + }, + { + "epoch": 1.4139606238813602, + "grad_norm": 33.743151343361774, + "learning_rate": 6.3565281467498955e-06, + "loss": 2.4798, + "step": 16590 + }, + { + "epoch": 1.4140458535753857, + "grad_norm": 119.05464787033063, + "learning_rate": 6.35605088989524e-06, + "loss": 2.6717, + "step": 16591 + }, + { + "epoch": 1.4141310832694112, + "grad_norm": 84.12690847988864, + "learning_rate": 6.355573619704362e-06, + "loss": 2.4677, + "step": 16592 + }, + { + "epoch": 1.4142163129634364, + "grad_norm": 39.77567077610512, + "learning_rate": 6.355096336181955e-06, + "loss": 2.5013, + "step": 16593 + }, + { + "epoch": 1.4143015426574619, + "grad_norm": 28.44972923957208, + "learning_rate": 6.354619039332716e-06, + "loss": 3.0102, + "step": 16594 + }, + { + "epoch": 1.4143867723514871, + "grad_norm": 34.935318141973845, + "learning_rate": 6.354141729161335e-06, + "loss": 2.2825, + "step": 16595 + }, + { + "epoch": 1.4144720020455126, + "grad_norm": 46.638323892076954, + "learning_rate": 6.353664405672508e-06, + "loss": 2.4331, + "step": 16596 + }, + { + "epoch": 1.414557231739538, + "grad_norm": 11.610446680208854, + "learning_rate": 6.353187068870931e-06, + "loss": 0.9489, + "step": 16597 + }, + { + "epoch": 1.4146424614335635, + "grad_norm": 41.853115494776674, + "learning_rate": 6.352709718761297e-06, + "loss": 3.0132, + "step": 16598 + }, + { + "epoch": 1.4147276911275888, + "grad_norm": 51.10838756630708, + "learning_rate": 6.3522323553483005e-06, + "loss": 3.72, + "step": 16599 + }, + { + "epoch": 1.4148129208216143, + "grad_norm": 84.20081404136405, + "learning_rate": 6.351754978636635e-06, + "loss": 3.2095, + "step": 16600 + }, + { + "epoch": 1.4148981505156397, + "grad_norm": 85.61605293495256, + "learning_rate": 6.3512775886309964e-06, + "loss": 3.4468, + "step": 16601 + }, + { + "epoch": 1.414983380209665, + "grad_norm": 60.82341301585101, + "learning_rate": 6.350800185336081e-06, + "loss": 3.2027, + "step": 16602 + }, + { + "epoch": 1.4150686099036904, + "grad_norm": 51.109092303683404, + "learning_rate": 6.35032276875658e-06, + "loss": 2.5734, + "step": 16603 + }, + { + "epoch": 1.415153839597716, + "grad_norm": 25.945325438323685, + "learning_rate": 6.349845338897194e-06, + "loss": 1.9318, + "step": 16604 + }, + { + "epoch": 1.4152390692917414, + "grad_norm": 49.20561694029797, + "learning_rate": 6.349367895762615e-06, + "loss": 2.5949, + "step": 16605 + }, + { + "epoch": 1.4153242989857666, + "grad_norm": 55.3095084030256, + "learning_rate": 6.348890439357538e-06, + "loss": 2.6438, + "step": 16606 + }, + { + "epoch": 1.415409528679792, + "grad_norm": 32.6788192806751, + "learning_rate": 6.348412969686659e-06, + "loss": 2.2757, + "step": 16607 + }, + { + "epoch": 1.4154947583738173, + "grad_norm": 39.844873967754985, + "learning_rate": 6.347935486754675e-06, + "loss": 3.1022, + "step": 16608 + }, + { + "epoch": 1.4155799880678428, + "grad_norm": 41.163349650077464, + "learning_rate": 6.347457990566281e-06, + "loss": 2.8073, + "step": 16609 + }, + { + "epoch": 1.4156652177618683, + "grad_norm": 56.76044515985304, + "learning_rate": 6.346980481126173e-06, + "loss": 3.1693, + "step": 16610 + }, + { + "epoch": 1.4157504474558937, + "grad_norm": 42.48334327911404, + "learning_rate": 6.346502958439047e-06, + "loss": 3.536, + "step": 16611 + }, + { + "epoch": 1.415835677149919, + "grad_norm": 62.87756029364372, + "learning_rate": 6.3460254225096004e-06, + "loss": 3.0592, + "step": 16612 + }, + { + "epoch": 1.4159209068439444, + "grad_norm": 40.20659222589828, + "learning_rate": 6.345547873342528e-06, + "loss": 2.8116, + "step": 16613 + }, + { + "epoch": 1.4160061365379697, + "grad_norm": 28.915101108046986, + "learning_rate": 6.345070310942527e-06, + "loss": 2.4611, + "step": 16614 + }, + { + "epoch": 1.4160913662319952, + "grad_norm": 43.99343756122935, + "learning_rate": 6.344592735314295e-06, + "loss": 3.2103, + "step": 16615 + }, + { + "epoch": 1.4161765959260206, + "grad_norm": 35.30032193722633, + "learning_rate": 6.344115146462525e-06, + "loss": 2.5833, + "step": 16616 + }, + { + "epoch": 1.416261825620046, + "grad_norm": 31.173274297132792, + "learning_rate": 6.34363754439192e-06, + "loss": 1.6664, + "step": 16617 + }, + { + "epoch": 1.4163470553140713, + "grad_norm": 105.83023031539315, + "learning_rate": 6.343159929107171e-06, + "loss": 3.5535, + "step": 16618 + }, + { + "epoch": 1.4164322850080968, + "grad_norm": 43.277457584965376, + "learning_rate": 6.342682300612979e-06, + "loss": 3.1034, + "step": 16619 + }, + { + "epoch": 1.4165175147021223, + "grad_norm": 107.46226853469238, + "learning_rate": 6.342204658914041e-06, + "loss": 3.359, + "step": 16620 + }, + { + "epoch": 1.4166027443961475, + "grad_norm": 65.88392396360182, + "learning_rate": 6.341727004015052e-06, + "loss": 3.185, + "step": 16621 + }, + { + "epoch": 1.416687974090173, + "grad_norm": 79.71889539411448, + "learning_rate": 6.3412493359207116e-06, + "loss": 3.0147, + "step": 16622 + }, + { + "epoch": 1.4167732037841985, + "grad_norm": 32.92915439409074, + "learning_rate": 6.340771654635715e-06, + "loss": 2.8487, + "step": 16623 + }, + { + "epoch": 1.416858433478224, + "grad_norm": 33.10022021864698, + "learning_rate": 6.340293960164765e-06, + "loss": 3.3755, + "step": 16624 + }, + { + "epoch": 1.4169436631722492, + "grad_norm": 231.57832176745015, + "learning_rate": 6.339816252512555e-06, + "loss": 3.6122, + "step": 16625 + }, + { + "epoch": 1.4170288928662746, + "grad_norm": 89.82522776105968, + "learning_rate": 6.339338531683785e-06, + "loss": 3.8084, + "step": 16626 + }, + { + "epoch": 1.4171141225603, + "grad_norm": 52.593547244731916, + "learning_rate": 6.338860797683153e-06, + "loss": 2.7513, + "step": 16627 + }, + { + "epoch": 1.4171993522543254, + "grad_norm": 34.732290988424495, + "learning_rate": 6.338383050515356e-06, + "loss": 2.7607, + "step": 16628 + }, + { + "epoch": 1.4172845819483508, + "grad_norm": 31.483465620938194, + "learning_rate": 6.337905290185095e-06, + "loss": 2.7829, + "step": 16629 + }, + { + "epoch": 1.4173698116423763, + "grad_norm": 90.18600986059955, + "learning_rate": 6.3374275166970666e-06, + "loss": 3.5291, + "step": 16630 + }, + { + "epoch": 1.4174550413364015, + "grad_norm": 89.38137680691881, + "learning_rate": 6.3369497300559694e-06, + "loss": 4.0713, + "step": 16631 + }, + { + "epoch": 1.417540271030427, + "grad_norm": 31.73434604090551, + "learning_rate": 6.336471930266503e-06, + "loss": 2.3419, + "step": 16632 + }, + { + "epoch": 1.4176255007244525, + "grad_norm": 64.78335915196764, + "learning_rate": 6.3359941173333665e-06, + "loss": 2.5427, + "step": 16633 + }, + { + "epoch": 1.4177107304184777, + "grad_norm": 74.25191860521555, + "learning_rate": 6.3355162912612585e-06, + "loss": 4.2929, + "step": 16634 + }, + { + "epoch": 1.4177959601125032, + "grad_norm": 45.26585029434244, + "learning_rate": 6.335038452054878e-06, + "loss": 3.3754, + "step": 16635 + }, + { + "epoch": 1.4178811898065287, + "grad_norm": 49.070882509566054, + "learning_rate": 6.334560599718925e-06, + "loss": 1.3934, + "step": 16636 + }, + { + "epoch": 1.4179664195005541, + "grad_norm": 63.39041885592111, + "learning_rate": 6.3340827342581006e-06, + "loss": 2.6165, + "step": 16637 + }, + { + "epoch": 1.4180516491945794, + "grad_norm": 51.55560294382391, + "learning_rate": 6.3336048556771005e-06, + "loss": 3.4146, + "step": 16638 + }, + { + "epoch": 1.4181368788886048, + "grad_norm": 41.25437103659032, + "learning_rate": 6.333126963980627e-06, + "loss": 2.2801, + "step": 16639 + }, + { + "epoch": 1.41822210858263, + "grad_norm": 39.37680017569546, + "learning_rate": 6.3326490591733795e-06, + "loss": 1.8929, + "step": 16640 + }, + { + "epoch": 1.4183073382766556, + "grad_norm": 112.88423276437749, + "learning_rate": 6.33217114126006e-06, + "loss": 3.298, + "step": 16641 + }, + { + "epoch": 1.418392567970681, + "grad_norm": 73.49653949582861, + "learning_rate": 6.331693210245365e-06, + "loss": 3.8609, + "step": 16642 + }, + { + "epoch": 1.4184777976647065, + "grad_norm": 70.92420768311665, + "learning_rate": 6.331215266133994e-06, + "loss": 3.3239, + "step": 16643 + }, + { + "epoch": 1.4185630273587317, + "grad_norm": 52.52774789753526, + "learning_rate": 6.3307373089306525e-06, + "loss": 2.7821, + "step": 16644 + }, + { + "epoch": 1.4186482570527572, + "grad_norm": 59.098726042916915, + "learning_rate": 6.330259338640039e-06, + "loss": 2.1077, + "step": 16645 + }, + { + "epoch": 1.4187334867467825, + "grad_norm": 66.58245567869747, + "learning_rate": 6.329781355266853e-06, + "loss": 3.9752, + "step": 16646 + }, + { + "epoch": 1.418818716440808, + "grad_norm": 38.15541008432497, + "learning_rate": 6.329303358815795e-06, + "loss": 2.2669, + "step": 16647 + }, + { + "epoch": 1.4189039461348334, + "grad_norm": 72.4883288602315, + "learning_rate": 6.328825349291566e-06, + "loss": 2.9436, + "step": 16648 + }, + { + "epoch": 1.4189891758288589, + "grad_norm": 49.92777033720992, + "learning_rate": 6.328347326698871e-06, + "loss": 2.6647, + "step": 16649 + }, + { + "epoch": 1.419074405522884, + "grad_norm": 43.515529225366734, + "learning_rate": 6.327869291042405e-06, + "loss": 2.9101, + "step": 16650 + }, + { + "epoch": 1.4191596352169096, + "grad_norm": 171.56315898567172, + "learning_rate": 6.3273912423268725e-06, + "loss": 5.1821, + "step": 16651 + }, + { + "epoch": 1.419244864910935, + "grad_norm": 38.42246292515152, + "learning_rate": 6.326913180556975e-06, + "loss": 2.6908, + "step": 16652 + }, + { + "epoch": 1.4193300946049603, + "grad_norm": 96.7305369470927, + "learning_rate": 6.326435105737414e-06, + "loss": 2.335, + "step": 16653 + }, + { + "epoch": 1.4194153242989858, + "grad_norm": 50.74393139596663, + "learning_rate": 6.32595701787289e-06, + "loss": 3.076, + "step": 16654 + }, + { + "epoch": 1.4195005539930112, + "grad_norm": 32.581543714252476, + "learning_rate": 6.325478916968105e-06, + "loss": 2.9225, + "step": 16655 + }, + { + "epoch": 1.4195857836870367, + "grad_norm": 84.28093424361137, + "learning_rate": 6.325000803027763e-06, + "loss": 3.2408, + "step": 16656 + }, + { + "epoch": 1.419671013381062, + "grad_norm": 53.65848679316358, + "learning_rate": 6.324522676056565e-06, + "loss": 2.7948, + "step": 16657 + }, + { + "epoch": 1.4197562430750874, + "grad_norm": 54.026677266109616, + "learning_rate": 6.324044536059211e-06, + "loss": 2.0196, + "step": 16658 + }, + { + "epoch": 1.4198414727691127, + "grad_norm": 53.775415667429286, + "learning_rate": 6.323566383040406e-06, + "loss": 3.2322, + "step": 16659 + }, + { + "epoch": 1.4199267024631381, + "grad_norm": 42.38620441267948, + "learning_rate": 6.323088217004852e-06, + "loss": 3.1893, + "step": 16660 + }, + { + "epoch": 1.4200119321571636, + "grad_norm": 34.854103741476415, + "learning_rate": 6.322610037957251e-06, + "loss": 2.9041, + "step": 16661 + }, + { + "epoch": 1.420097161851189, + "grad_norm": 80.01898255345391, + "learning_rate": 6.3221318459023054e-06, + "loss": 2.3345, + "step": 16662 + }, + { + "epoch": 1.4201823915452143, + "grad_norm": 96.78133191253792, + "learning_rate": 6.321653640844718e-06, + "loss": 3.6435, + "step": 16663 + }, + { + "epoch": 1.4202676212392398, + "grad_norm": 56.96805899771763, + "learning_rate": 6.321175422789193e-06, + "loss": 2.8708, + "step": 16664 + }, + { + "epoch": 1.420352850933265, + "grad_norm": 87.65371833776095, + "learning_rate": 6.320697191740434e-06, + "loss": 4.1388, + "step": 16665 + }, + { + "epoch": 1.4204380806272905, + "grad_norm": 47.75522558227826, + "learning_rate": 6.320218947703139e-06, + "loss": 3.5108, + "step": 16666 + }, + { + "epoch": 1.420523310321316, + "grad_norm": 60.409135169303525, + "learning_rate": 6.319740690682019e-06, + "loss": 2.1374, + "step": 16667 + }, + { + "epoch": 1.4206085400153414, + "grad_norm": 29.216940691337413, + "learning_rate": 6.319262420681772e-06, + "loss": 2.5007, + "step": 16668 + }, + { + "epoch": 1.4206937697093667, + "grad_norm": 34.22413264011589, + "learning_rate": 6.318784137707105e-06, + "loss": 2.6012, + "step": 16669 + }, + { + "epoch": 1.4207789994033921, + "grad_norm": 82.5038511431867, + "learning_rate": 6.318305841762717e-06, + "loss": 2.1197, + "step": 16670 + }, + { + "epoch": 1.4208642290974176, + "grad_norm": 65.3426810498582, + "learning_rate": 6.317827532853318e-06, + "loss": 1.8168, + "step": 16671 + }, + { + "epoch": 1.4209494587914429, + "grad_norm": 75.62633490481483, + "learning_rate": 6.317349210983607e-06, + "loss": 2.6846, + "step": 16672 + }, + { + "epoch": 1.4210346884854683, + "grad_norm": 47.88369731375369, + "learning_rate": 6.316870876158291e-06, + "loss": 3.1175, + "step": 16673 + }, + { + "epoch": 1.4211199181794938, + "grad_norm": 38.05362940558224, + "learning_rate": 6.316392528382071e-06, + "loss": 3.0143, + "step": 16674 + }, + { + "epoch": 1.4212051478735193, + "grad_norm": 38.63681103508198, + "learning_rate": 6.315914167659657e-06, + "loss": 2.309, + "step": 16675 + }, + { + "epoch": 1.4212903775675445, + "grad_norm": 36.35275377117562, + "learning_rate": 6.315435793995748e-06, + "loss": 3.1746, + "step": 16676 + }, + { + "epoch": 1.42137560726157, + "grad_norm": 51.19735450027028, + "learning_rate": 6.314957407395052e-06, + "loss": 3.7609, + "step": 16677 + }, + { + "epoch": 1.4214608369555952, + "grad_norm": 51.25703606693127, + "learning_rate": 6.314479007862271e-06, + "loss": 3.5528, + "step": 16678 + }, + { + "epoch": 1.4215460666496207, + "grad_norm": 50.93808964723351, + "learning_rate": 6.314000595402111e-06, + "loss": 2.3425, + "step": 16679 + }, + { + "epoch": 1.4216312963436462, + "grad_norm": 46.99655735821308, + "learning_rate": 6.313522170019278e-06, + "loss": 1.9945, + "step": 16680 + }, + { + "epoch": 1.4217165260376716, + "grad_norm": 27.49405318500542, + "learning_rate": 6.313043731718477e-06, + "loss": 2.1021, + "step": 16681 + }, + { + "epoch": 1.4218017557316969, + "grad_norm": 34.49482692876691, + "learning_rate": 6.312565280504411e-06, + "loss": 2.5578, + "step": 16682 + }, + { + "epoch": 1.4218869854257223, + "grad_norm": 71.76797889780362, + "learning_rate": 6.312086816381788e-06, + "loss": 2.9474, + "step": 16683 + }, + { + "epoch": 1.4219722151197476, + "grad_norm": 31.684494664669792, + "learning_rate": 6.311608339355313e-06, + "loss": 2.7688, + "step": 16684 + }, + { + "epoch": 1.422057444813773, + "grad_norm": 29.867086793728415, + "learning_rate": 6.311129849429691e-06, + "loss": 2.4609, + "step": 16685 + }, + { + "epoch": 1.4221426745077985, + "grad_norm": 81.18692768662568, + "learning_rate": 6.310651346609627e-06, + "loss": 2.8556, + "step": 16686 + }, + { + "epoch": 1.422227904201824, + "grad_norm": 73.13186910683518, + "learning_rate": 6.310172830899829e-06, + "loss": 2.6661, + "step": 16687 + }, + { + "epoch": 1.4223131338958492, + "grad_norm": 75.36096434496042, + "learning_rate": 6.309694302305002e-06, + "loss": 2.6383, + "step": 16688 + }, + { + "epoch": 1.4223983635898747, + "grad_norm": 34.614616877881836, + "learning_rate": 6.30921576082985e-06, + "loss": 2.7082, + "step": 16689 + }, + { + "epoch": 1.4224835932839002, + "grad_norm": 28.807654614296887, + "learning_rate": 6.3087372064790834e-06, + "loss": 2.0348, + "step": 16690 + }, + { + "epoch": 1.4225688229779254, + "grad_norm": 42.57134011027199, + "learning_rate": 6.3082586392574055e-06, + "loss": 2.868, + "step": 16691 + }, + { + "epoch": 1.4226540526719509, + "grad_norm": 56.13139031141763, + "learning_rate": 6.307780059169524e-06, + "loss": 2.1353, + "step": 16692 + }, + { + "epoch": 1.4227392823659764, + "grad_norm": 45.63309055222545, + "learning_rate": 6.307301466220144e-06, + "loss": 2.597, + "step": 16693 + }, + { + "epoch": 1.4228245120600018, + "grad_norm": 33.8067303158199, + "learning_rate": 6.3068228604139745e-06, + "loss": 2.1642, + "step": 16694 + }, + { + "epoch": 1.422909741754027, + "grad_norm": 44.68908523990983, + "learning_rate": 6.306344241755722e-06, + "loss": 3.005, + "step": 16695 + }, + { + "epoch": 1.4229949714480525, + "grad_norm": 48.42937181874176, + "learning_rate": 6.305865610250093e-06, + "loss": 3.0564, + "step": 16696 + }, + { + "epoch": 1.4230802011420778, + "grad_norm": 66.98377092367784, + "learning_rate": 6.305386965901793e-06, + "loss": 3.0081, + "step": 16697 + }, + { + "epoch": 1.4231654308361033, + "grad_norm": 28.34167439438591, + "learning_rate": 6.3049083087155316e-06, + "loss": 2.5326, + "step": 16698 + }, + { + "epoch": 1.4232506605301287, + "grad_norm": 32.31097733747077, + "learning_rate": 6.304429638696017e-06, + "loss": 2.4293, + "step": 16699 + }, + { + "epoch": 1.4233358902241542, + "grad_norm": 27.89220974298502, + "learning_rate": 6.303950955847954e-06, + "loss": 2.3975, + "step": 16700 + }, + { + "epoch": 1.4234211199181794, + "grad_norm": 33.477529964041445, + "learning_rate": 6.303472260176051e-06, + "loss": 2.7364, + "step": 16701 + }, + { + "epoch": 1.423506349612205, + "grad_norm": 43.35318093576576, + "learning_rate": 6.302993551685017e-06, + "loss": 3.3028, + "step": 16702 + }, + { + "epoch": 1.4235915793062304, + "grad_norm": 47.12787157001147, + "learning_rate": 6.3025148303795606e-06, + "loss": 3.4898, + "step": 16703 + }, + { + "epoch": 1.4236768090002556, + "grad_norm": 104.7086726478853, + "learning_rate": 6.302036096264386e-06, + "loss": 3.17, + "step": 16704 + }, + { + "epoch": 1.423762038694281, + "grad_norm": 30.567052307141708, + "learning_rate": 6.301557349344204e-06, + "loss": 2.3796, + "step": 16705 + }, + { + "epoch": 1.4238472683883066, + "grad_norm": 62.69516834349367, + "learning_rate": 6.3010785896237236e-06, + "loss": 3.6998, + "step": 16706 + }, + { + "epoch": 1.423932498082332, + "grad_norm": 48.23376161506506, + "learning_rate": 6.300599817107652e-06, + "loss": 2.9838, + "step": 16707 + }, + { + "epoch": 1.4240177277763573, + "grad_norm": 65.8014426087474, + "learning_rate": 6.300121031800699e-06, + "loss": 2.8627, + "step": 16708 + }, + { + "epoch": 1.4241029574703827, + "grad_norm": 55.43455122954363, + "learning_rate": 6.29964223370757e-06, + "loss": 2.8373, + "step": 16709 + }, + { + "epoch": 1.424188187164408, + "grad_norm": 46.527473802468805, + "learning_rate": 6.2991634228329776e-06, + "loss": 2.9016, + "step": 16710 + }, + { + "epoch": 1.4242734168584334, + "grad_norm": 42.594467924277986, + "learning_rate": 6.298684599181629e-06, + "loss": 3.7911, + "step": 16711 + }, + { + "epoch": 1.424358646552459, + "grad_norm": 53.84585329204252, + "learning_rate": 6.298205762758234e-06, + "loss": 3.869, + "step": 16712 + }, + { + "epoch": 1.4244438762464844, + "grad_norm": 42.185605115178255, + "learning_rate": 6.297726913567499e-06, + "loss": 2.5594, + "step": 16713 + }, + { + "epoch": 1.4245291059405096, + "grad_norm": 33.820224987890306, + "learning_rate": 6.297248051614137e-06, + "loss": 2.3957, + "step": 16714 + }, + { + "epoch": 1.424614335634535, + "grad_norm": 64.54614713459084, + "learning_rate": 6.296769176902856e-06, + "loss": 3.333, + "step": 16715 + }, + { + "epoch": 1.4246995653285603, + "grad_norm": 34.48945681780614, + "learning_rate": 6.296290289438367e-06, + "loss": 2.9504, + "step": 16716 + }, + { + "epoch": 1.4247847950225858, + "grad_norm": 107.5017298465137, + "learning_rate": 6.295811389225375e-06, + "loss": 3.3178, + "step": 16717 + }, + { + "epoch": 1.4248700247166113, + "grad_norm": 58.214102614177456, + "learning_rate": 6.295332476268594e-06, + "loss": 2.9505, + "step": 16718 + }, + { + "epoch": 1.4249552544106368, + "grad_norm": 79.59235435812761, + "learning_rate": 6.294853550572732e-06, + "loss": 2.9137, + "step": 16719 + }, + { + "epoch": 1.425040484104662, + "grad_norm": 27.76602579784029, + "learning_rate": 6.294374612142503e-06, + "loss": 2.5284, + "step": 16720 + }, + { + "epoch": 1.4251257137986875, + "grad_norm": 41.98516344675915, + "learning_rate": 6.293895660982611e-06, + "loss": 2.9888, + "step": 16721 + }, + { + "epoch": 1.425210943492713, + "grad_norm": 60.37628940054593, + "learning_rate": 6.293416697097771e-06, + "loss": 4.0842, + "step": 16722 + }, + { + "epoch": 1.4252961731867382, + "grad_norm": 39.84080631718047, + "learning_rate": 6.292937720492691e-06, + "loss": 2.9622, + "step": 16723 + }, + { + "epoch": 1.4253814028807636, + "grad_norm": 55.3675151241091, + "learning_rate": 6.292458731172082e-06, + "loss": 3.143, + "step": 16724 + }, + { + "epoch": 1.4254666325747891, + "grad_norm": 79.11078315762389, + "learning_rate": 6.291979729140656e-06, + "loss": 3.2585, + "step": 16725 + }, + { + "epoch": 1.4255518622688146, + "grad_norm": 57.219574059152976, + "learning_rate": 6.29150071440312e-06, + "loss": 3.1818, + "step": 16726 + }, + { + "epoch": 1.4256370919628398, + "grad_norm": 64.45029226093128, + "learning_rate": 6.291021686964192e-06, + "loss": 3.4906, + "step": 16727 + }, + { + "epoch": 1.4257223216568653, + "grad_norm": 81.51144300989962, + "learning_rate": 6.290542646828575e-06, + "loss": 2.5594, + "step": 16728 + }, + { + "epoch": 1.4258075513508905, + "grad_norm": 34.442091941574304, + "learning_rate": 6.290063594000986e-06, + "loss": 2.4943, + "step": 16729 + }, + { + "epoch": 1.425892781044916, + "grad_norm": 50.4054363007254, + "learning_rate": 6.289584528486132e-06, + "loss": 3.8728, + "step": 16730 + }, + { + "epoch": 1.4259780107389415, + "grad_norm": 43.20968948789862, + "learning_rate": 6.2891054502887295e-06, + "loss": 3.308, + "step": 16731 + }, + { + "epoch": 1.426063240432967, + "grad_norm": 48.139984452637385, + "learning_rate": 6.288626359413486e-06, + "loss": 3.6815, + "step": 16732 + }, + { + "epoch": 1.4261484701269922, + "grad_norm": 25.638865231193027, + "learning_rate": 6.288147255865113e-06, + "loss": 2.6677, + "step": 16733 + }, + { + "epoch": 1.4262336998210177, + "grad_norm": 43.14716640704928, + "learning_rate": 6.287668139648324e-06, + "loss": 2.7376, + "step": 16734 + }, + { + "epoch": 1.426318929515043, + "grad_norm": 60.45604783839855, + "learning_rate": 6.287189010767831e-06, + "loss": 3.2991, + "step": 16735 + }, + { + "epoch": 1.4264041592090684, + "grad_norm": 82.0565343373048, + "learning_rate": 6.286709869228347e-06, + "loss": 3.0954, + "step": 16736 + }, + { + "epoch": 1.4264893889030938, + "grad_norm": 28.414414134264014, + "learning_rate": 6.28623071503458e-06, + "loss": 2.7924, + "step": 16737 + }, + { + "epoch": 1.4265746185971193, + "grad_norm": 39.92074725731088, + "learning_rate": 6.285751548191248e-06, + "loss": 2.9757, + "step": 16738 + }, + { + "epoch": 1.4266598482911446, + "grad_norm": 41.0481562218385, + "learning_rate": 6.285272368703058e-06, + "loss": 3.7971, + "step": 16739 + }, + { + "epoch": 1.42674507798517, + "grad_norm": 32.36858660177062, + "learning_rate": 6.2847931765747264e-06, + "loss": 2.9961, + "step": 16740 + }, + { + "epoch": 1.4268303076791955, + "grad_norm": 83.57755446261604, + "learning_rate": 6.284313971810962e-06, + "loss": 2.439, + "step": 16741 + }, + { + "epoch": 1.4269155373732207, + "grad_norm": 56.77588770299977, + "learning_rate": 6.283834754416483e-06, + "loss": 2.0373, + "step": 16742 + }, + { + "epoch": 1.4270007670672462, + "grad_norm": 44.95338296691395, + "learning_rate": 6.283355524395998e-06, + "loss": 2.8616, + "step": 16743 + }, + { + "epoch": 1.4270859967612717, + "grad_norm": 22.30379488161209, + "learning_rate": 6.282876281754221e-06, + "loss": 2.2275, + "step": 16744 + }, + { + "epoch": 1.4271712264552971, + "grad_norm": 88.2263208908331, + "learning_rate": 6.282397026495866e-06, + "loss": 2.9366, + "step": 16745 + }, + { + "epoch": 1.4272564561493224, + "grad_norm": 33.210820719935164, + "learning_rate": 6.281917758625647e-06, + "loss": 2.2685, + "step": 16746 + }, + { + "epoch": 1.4273416858433479, + "grad_norm": 84.28326510766344, + "learning_rate": 6.281438478148274e-06, + "loss": 3.9468, + "step": 16747 + }, + { + "epoch": 1.427426915537373, + "grad_norm": 33.055902024418366, + "learning_rate": 6.280959185068463e-06, + "loss": 2.8843, + "step": 16748 + }, + { + "epoch": 1.4275121452313986, + "grad_norm": 63.69108263334505, + "learning_rate": 6.280479879390929e-06, + "loss": 3.2475, + "step": 16749 + }, + { + "epoch": 1.427597374925424, + "grad_norm": 34.013707600956074, + "learning_rate": 6.280000561120384e-06, + "loss": 2.8764, + "step": 16750 + }, + { + "epoch": 1.4276826046194495, + "grad_norm": 44.25304985979512, + "learning_rate": 6.279521230261542e-06, + "loss": 1.8824, + "step": 16751 + }, + { + "epoch": 1.4277678343134748, + "grad_norm": 84.67533818551836, + "learning_rate": 6.2790418868191165e-06, + "loss": 3.6281, + "step": 16752 + }, + { + "epoch": 1.4278530640075002, + "grad_norm": 47.37778505173999, + "learning_rate": 6.278562530797823e-06, + "loss": 2.3837, + "step": 16753 + }, + { + "epoch": 1.4279382937015255, + "grad_norm": 28.468192261417883, + "learning_rate": 6.278083162202374e-06, + "loss": 2.1035, + "step": 16754 + }, + { + "epoch": 1.428023523395551, + "grad_norm": 39.80730114586354, + "learning_rate": 6.277603781037487e-06, + "loss": 2.9568, + "step": 16755 + }, + { + "epoch": 1.4281087530895764, + "grad_norm": 79.53883502920264, + "learning_rate": 6.277124387307871e-06, + "loss": 4.2502, + "step": 16756 + }, + { + "epoch": 1.4281939827836019, + "grad_norm": 60.02451203943002, + "learning_rate": 6.276644981018247e-06, + "loss": 2.7881, + "step": 16757 + }, + { + "epoch": 1.4282792124776271, + "grad_norm": 62.76944657187323, + "learning_rate": 6.276165562173326e-06, + "loss": 2.6379, + "step": 16758 + }, + { + "epoch": 1.4283644421716526, + "grad_norm": 38.72951763850625, + "learning_rate": 6.275686130777825e-06, + "loss": 2.7353, + "step": 16759 + }, + { + "epoch": 1.428449671865678, + "grad_norm": 43.2839433292975, + "learning_rate": 6.275206686836455e-06, + "loss": 2.8943, + "step": 16760 + }, + { + "epoch": 1.4285349015597033, + "grad_norm": 67.89760398850711, + "learning_rate": 6.274727230353936e-06, + "loss": 3.0208, + "step": 16761 + }, + { + "epoch": 1.4286201312537288, + "grad_norm": 38.4424292317772, + "learning_rate": 6.27424776133498e-06, + "loss": 2.6821, + "step": 16762 + }, + { + "epoch": 1.4287053609477542, + "grad_norm": 79.90710052003752, + "learning_rate": 6.273768279784306e-06, + "loss": 3.4646, + "step": 16763 + }, + { + "epoch": 1.4287905906417797, + "grad_norm": 66.86153712194383, + "learning_rate": 6.273288785706624e-06, + "loss": 2.4471, + "step": 16764 + }, + { + "epoch": 1.428875820335805, + "grad_norm": 71.17115463822428, + "learning_rate": 6.272809279106656e-06, + "loss": 3.8623, + "step": 16765 + }, + { + "epoch": 1.4289610500298304, + "grad_norm": 81.88189928950622, + "learning_rate": 6.272329759989113e-06, + "loss": 3.1721, + "step": 16766 + }, + { + "epoch": 1.4290462797238557, + "grad_norm": 94.88192407509486, + "learning_rate": 6.271850228358711e-06, + "loss": 4.0925, + "step": 16767 + }, + { + "epoch": 1.4291315094178811, + "grad_norm": 26.007502174671057, + "learning_rate": 6.271370684220168e-06, + "loss": 2.1574, + "step": 16768 + }, + { + "epoch": 1.4292167391119066, + "grad_norm": 55.203185751604614, + "learning_rate": 6.2708911275782e-06, + "loss": 3.5979, + "step": 16769 + }, + { + "epoch": 1.429301968805932, + "grad_norm": 43.32680559445202, + "learning_rate": 6.270411558437524e-06, + "loss": 1.9921, + "step": 16770 + }, + { + "epoch": 1.4293871984999573, + "grad_norm": 42.018954570027766, + "learning_rate": 6.269931976802854e-06, + "loss": 3.6046, + "step": 16771 + }, + { + "epoch": 1.4294724281939828, + "grad_norm": 34.95983002413336, + "learning_rate": 6.269452382678906e-06, + "loss": 2.9676, + "step": 16772 + }, + { + "epoch": 1.4295576578880083, + "grad_norm": 31.467044197897785, + "learning_rate": 6.268972776070401e-06, + "loss": 2.3958, + "step": 16773 + }, + { + "epoch": 1.4296428875820335, + "grad_norm": 16.507952557881573, + "learning_rate": 6.268493156982052e-06, + "loss": 0.7977, + "step": 16774 + }, + { + "epoch": 1.429728117276059, + "grad_norm": 35.28329964598968, + "learning_rate": 6.268013525418577e-06, + "loss": 2.9057, + "step": 16775 + }, + { + "epoch": 1.4298133469700844, + "grad_norm": 47.26413793794163, + "learning_rate": 6.267533881384692e-06, + "loss": 1.6274, + "step": 16776 + }, + { + "epoch": 1.42989857666411, + "grad_norm": 55.67349079693399, + "learning_rate": 6.267054224885115e-06, + "loss": 2.8456, + "step": 16777 + }, + { + "epoch": 1.4299838063581352, + "grad_norm": 50.38176048821443, + "learning_rate": 6.266574555924565e-06, + "loss": 2.2127, + "step": 16778 + }, + { + "epoch": 1.4300690360521606, + "grad_norm": 72.0767968704568, + "learning_rate": 6.2660948745077554e-06, + "loss": 3.8944, + "step": 16779 + }, + { + "epoch": 1.4301542657461859, + "grad_norm": 47.318024545636405, + "learning_rate": 6.265615180639406e-06, + "loss": 3.362, + "step": 16780 + }, + { + "epoch": 1.4302394954402113, + "grad_norm": 71.40834737640213, + "learning_rate": 6.265135474324235e-06, + "loss": 2.8496, + "step": 16781 + }, + { + "epoch": 1.4303247251342368, + "grad_norm": 43.30863044358565, + "learning_rate": 6.26465575556696e-06, + "loss": 3.4977, + "step": 16782 + }, + { + "epoch": 1.4304099548282623, + "grad_norm": 55.161729190109014, + "learning_rate": 6.264176024372297e-06, + "loss": 2.0649, + "step": 16783 + }, + { + "epoch": 1.4304951845222875, + "grad_norm": 62.781971524954805, + "learning_rate": 6.2636962807449645e-06, + "loss": 4.0388, + "step": 16784 + }, + { + "epoch": 1.430580414216313, + "grad_norm": 38.319353992721084, + "learning_rate": 6.263216524689684e-06, + "loss": 2.4746, + "step": 16785 + }, + { + "epoch": 1.4306656439103382, + "grad_norm": 34.2496169090435, + "learning_rate": 6.262736756211167e-06, + "loss": 2.2596, + "step": 16786 + }, + { + "epoch": 1.4307508736043637, + "grad_norm": 22.14921501783182, + "learning_rate": 6.262256975314138e-06, + "loss": 2.1403, + "step": 16787 + }, + { + "epoch": 1.4308361032983892, + "grad_norm": 62.998598234867735, + "learning_rate": 6.261777182003313e-06, + "loss": 3.0681, + "step": 16788 + }, + { + "epoch": 1.4309213329924146, + "grad_norm": 42.003135061342164, + "learning_rate": 6.261297376283412e-06, + "loss": 4.0449, + "step": 16789 + }, + { + "epoch": 1.4310065626864399, + "grad_norm": 23.472856864777206, + "learning_rate": 6.260817558159151e-06, + "loss": 1.8963, + "step": 16790 + }, + { + "epoch": 1.4310917923804654, + "grad_norm": 55.976860218270915, + "learning_rate": 6.26033772763525e-06, + "loss": 3.6529, + "step": 16791 + }, + { + "epoch": 1.4311770220744908, + "grad_norm": 67.26569959784868, + "learning_rate": 6.259857884716429e-06, + "loss": 3.2766, + "step": 16792 + }, + { + "epoch": 1.431262251768516, + "grad_norm": 57.06283046603963, + "learning_rate": 6.259378029407407e-06, + "loss": 3.6752, + "step": 16793 + }, + { + "epoch": 1.4313474814625415, + "grad_norm": 56.47362981731434, + "learning_rate": 6.258898161712901e-06, + "loss": 3.5578, + "step": 16794 + }, + { + "epoch": 1.431432711156567, + "grad_norm": 67.624847829708, + "learning_rate": 6.258418281637633e-06, + "loss": 2.9143, + "step": 16795 + }, + { + "epoch": 1.4315179408505925, + "grad_norm": 55.697941029425806, + "learning_rate": 6.257938389186321e-06, + "loss": 3.0189, + "step": 16796 + }, + { + "epoch": 1.4316031705446177, + "grad_norm": 34.584866902365675, + "learning_rate": 6.257458484363684e-06, + "loss": 2.7047, + "step": 16797 + }, + { + "epoch": 1.4316884002386432, + "grad_norm": 50.842184075622754, + "learning_rate": 6.256978567174444e-06, + "loss": 2.7352, + "step": 16798 + }, + { + "epoch": 1.4317736299326684, + "grad_norm": 48.60708790949056, + "learning_rate": 6.256498637623317e-06, + "loss": 3.2308, + "step": 16799 + }, + { + "epoch": 1.431858859626694, + "grad_norm": 41.83232860662998, + "learning_rate": 6.2560186957150284e-06, + "loss": 3.4978, + "step": 16800 + }, + { + "epoch": 1.4319440893207194, + "grad_norm": 28.164932273664185, + "learning_rate": 6.255538741454292e-06, + "loss": 1.955, + "step": 16801 + }, + { + "epoch": 1.4320293190147448, + "grad_norm": 41.48587101855049, + "learning_rate": 6.255058774845834e-06, + "loss": 2.4055, + "step": 16802 + }, + { + "epoch": 1.43211454870877, + "grad_norm": 93.43143269997681, + "learning_rate": 6.254578795894368e-06, + "loss": 3.1536, + "step": 16803 + }, + { + "epoch": 1.4321997784027956, + "grad_norm": 55.456018975912684, + "learning_rate": 6.25409880460462e-06, + "loss": 2.4602, + "step": 16804 + }, + { + "epoch": 1.4322850080968208, + "grad_norm": 48.72595448777002, + "learning_rate": 6.253618800981308e-06, + "loss": 2.8468, + "step": 16805 + }, + { + "epoch": 1.4323702377908463, + "grad_norm": 48.849792472913, + "learning_rate": 6.253138785029155e-06, + "loss": 2.4788, + "step": 16806 + }, + { + "epoch": 1.4324554674848717, + "grad_norm": 34.69396808912614, + "learning_rate": 6.252658756752877e-06, + "loss": 1.1627, + "step": 16807 + }, + { + "epoch": 1.4325406971788972, + "grad_norm": 86.15624415598352, + "learning_rate": 6.2521787161572e-06, + "loss": 3.0634, + "step": 16808 + }, + { + "epoch": 1.4326259268729225, + "grad_norm": 51.43514777498786, + "learning_rate": 6.251698663246842e-06, + "loss": 2.7432, + "step": 16809 + }, + { + "epoch": 1.432711156566948, + "grad_norm": 40.70346963682812, + "learning_rate": 6.251218598026527e-06, + "loss": 2.7339, + "step": 16810 + }, + { + "epoch": 1.4327963862609734, + "grad_norm": 48.96829762788931, + "learning_rate": 6.250738520500972e-06, + "loss": 2.9707, + "step": 16811 + }, + { + "epoch": 1.4328816159549986, + "grad_norm": 48.14750749155112, + "learning_rate": 6.250258430674902e-06, + "loss": 3.7613, + "step": 16812 + }, + { + "epoch": 1.432966845649024, + "grad_norm": 63.48547479082455, + "learning_rate": 6.249778328553036e-06, + "loss": 3.1656, + "step": 16813 + }, + { + "epoch": 1.4330520753430496, + "grad_norm": 78.75624564285442, + "learning_rate": 6.2492982141400985e-06, + "loss": 3.2366, + "step": 16814 + }, + { + "epoch": 1.433137305037075, + "grad_norm": 122.96301508754384, + "learning_rate": 6.248818087440806e-06, + "loss": 2.7666, + "step": 16815 + }, + { + "epoch": 1.4332225347311003, + "grad_norm": 68.14781376240566, + "learning_rate": 6.248337948459887e-06, + "loss": 3.1543, + "step": 16816 + }, + { + "epoch": 1.4333077644251258, + "grad_norm": 84.64924026572643, + "learning_rate": 6.247857797202061e-06, + "loss": 2.2426, + "step": 16817 + }, + { + "epoch": 1.433392994119151, + "grad_norm": 85.34823829492312, + "learning_rate": 6.247377633672047e-06, + "loss": 3.1994, + "step": 16818 + }, + { + "epoch": 1.4334782238131765, + "grad_norm": 36.31216848799294, + "learning_rate": 6.24689745787457e-06, + "loss": 2.8457, + "step": 16819 + }, + { + "epoch": 1.433563453507202, + "grad_norm": 73.65869763194128, + "learning_rate": 6.246417269814352e-06, + "loss": 3.1468, + "step": 16820 + }, + { + "epoch": 1.4336486832012274, + "grad_norm": 76.29121680822792, + "learning_rate": 6.245937069496116e-06, + "loss": 2.2714, + "step": 16821 + }, + { + "epoch": 1.4337339128952526, + "grad_norm": 44.726520802779945, + "learning_rate": 6.245456856924585e-06, + "loss": 2.9262, + "step": 16822 + }, + { + "epoch": 1.4338191425892781, + "grad_norm": 49.79329910217466, + "learning_rate": 6.244976632104479e-06, + "loss": 2.6329, + "step": 16823 + }, + { + "epoch": 1.4339043722833036, + "grad_norm": 49.61791696367921, + "learning_rate": 6.244496395040523e-06, + "loss": 4.652, + "step": 16824 + }, + { + "epoch": 1.4339896019773288, + "grad_norm": 52.097168650966815, + "learning_rate": 6.244016145737439e-06, + "loss": 2.1293, + "step": 16825 + }, + { + "epoch": 1.4340748316713543, + "grad_norm": 89.26546892426025, + "learning_rate": 6.243535884199951e-06, + "loss": 3.8343, + "step": 16826 + }, + { + "epoch": 1.4341600613653798, + "grad_norm": 37.27080219087441, + "learning_rate": 6.243055610432782e-06, + "loss": 3.2571, + "step": 16827 + }, + { + "epoch": 1.434245291059405, + "grad_norm": 106.64171311600431, + "learning_rate": 6.242575324440656e-06, + "loss": 3.3464, + "step": 16828 + }, + { + "epoch": 1.4343305207534305, + "grad_norm": 109.45769192401706, + "learning_rate": 6.242095026228294e-06, + "loss": 3.3617, + "step": 16829 + }, + { + "epoch": 1.434415750447456, + "grad_norm": 45.0955187373782, + "learning_rate": 6.2416147158004205e-06, + "loss": 3.6683, + "step": 16830 + }, + { + "epoch": 1.4345009801414812, + "grad_norm": 41.1047003275944, + "learning_rate": 6.24113439316176e-06, + "loss": 3.1079, + "step": 16831 + }, + { + "epoch": 1.4345862098355067, + "grad_norm": 63.31408667856119, + "learning_rate": 6.240654058317037e-06, + "loss": 3.2702, + "step": 16832 + }, + { + "epoch": 1.4346714395295321, + "grad_norm": 67.69407144383098, + "learning_rate": 6.2401737112709735e-06, + "loss": 2.7114, + "step": 16833 + }, + { + "epoch": 1.4347566692235576, + "grad_norm": 38.64653203935067, + "learning_rate": 6.2396933520282945e-06, + "loss": 3.0477, + "step": 16834 + }, + { + "epoch": 1.4348418989175828, + "grad_norm": 60.84905948204921, + "learning_rate": 6.239212980593724e-06, + "loss": 2.7314, + "step": 16835 + }, + { + "epoch": 1.4349271286116083, + "grad_norm": 44.6498843501833, + "learning_rate": 6.2387325969719866e-06, + "loss": 3.0475, + "step": 16836 + }, + { + "epoch": 1.4350123583056336, + "grad_norm": 52.73889837835951, + "learning_rate": 6.238252201167806e-06, + "loss": 2.9412, + "step": 16837 + }, + { + "epoch": 1.435097587999659, + "grad_norm": 32.23071320888429, + "learning_rate": 6.237771793185906e-06, + "loss": 2.6573, + "step": 16838 + }, + { + "epoch": 1.4351828176936845, + "grad_norm": 41.63252627469551, + "learning_rate": 6.2372913730310145e-06, + "loss": 3.5156, + "step": 16839 + }, + { + "epoch": 1.43526804738771, + "grad_norm": 92.63073400822032, + "learning_rate": 6.236810940707852e-06, + "loss": 3.2328, + "step": 16840 + }, + { + "epoch": 1.4353532770817352, + "grad_norm": 71.72522970268825, + "learning_rate": 6.236330496221148e-06, + "loss": 3.8449, + "step": 16841 + }, + { + "epoch": 1.4354385067757607, + "grad_norm": 73.7463481865889, + "learning_rate": 6.23585003957562e-06, + "loss": 2.9603, + "step": 16842 + }, + { + "epoch": 1.4355237364697861, + "grad_norm": 24.211940889153688, + "learning_rate": 6.235369570776002e-06, + "loss": 2.1739, + "step": 16843 + }, + { + "epoch": 1.4356089661638114, + "grad_norm": 37.13465421173134, + "learning_rate": 6.234889089827014e-06, + "loss": 3.3642, + "step": 16844 + }, + { + "epoch": 1.4356941958578369, + "grad_norm": 33.98659693038976, + "learning_rate": 6.234408596733383e-06, + "loss": 3.627, + "step": 16845 + }, + { + "epoch": 1.4357794255518623, + "grad_norm": 52.87532958422097, + "learning_rate": 6.2339280914998325e-06, + "loss": 2.0833, + "step": 16846 + }, + { + "epoch": 1.4358646552458878, + "grad_norm": 42.70558274942726, + "learning_rate": 6.233447574131092e-06, + "loss": 3.2268, + "step": 16847 + }, + { + "epoch": 1.435949884939913, + "grad_norm": 63.557137473668135, + "learning_rate": 6.232967044631882e-06, + "loss": 2.5735, + "step": 16848 + }, + { + "epoch": 1.4360351146339385, + "grad_norm": 92.75750317835126, + "learning_rate": 6.232486503006934e-06, + "loss": 4.263, + "step": 16849 + }, + { + "epoch": 1.4361203443279638, + "grad_norm": 30.578354905767227, + "learning_rate": 6.2320059492609676e-06, + "loss": 2.0568, + "step": 16850 + }, + { + "epoch": 1.4362055740219892, + "grad_norm": 50.11065711139017, + "learning_rate": 6.231525383398714e-06, + "loss": 2.7033, + "step": 16851 + }, + { + "epoch": 1.4362908037160147, + "grad_norm": 33.55404978424146, + "learning_rate": 6.231044805424899e-06, + "loss": 2.0457, + "step": 16852 + }, + { + "epoch": 1.4363760334100402, + "grad_norm": 106.88459744111633, + "learning_rate": 6.230564215344246e-06, + "loss": 3.0951, + "step": 16853 + }, + { + "epoch": 1.4364612631040654, + "grad_norm": 32.56339502798012, + "learning_rate": 6.230083613161483e-06, + "loss": 3.0196, + "step": 16854 + }, + { + "epoch": 1.4365464927980909, + "grad_norm": 79.38542575350438, + "learning_rate": 6.229602998881336e-06, + "loss": 3.0149, + "step": 16855 + }, + { + "epoch": 1.4366317224921161, + "grad_norm": 40.057856037748834, + "learning_rate": 6.229122372508533e-06, + "loss": 2.8674, + "step": 16856 + }, + { + "epoch": 1.4367169521861416, + "grad_norm": 39.45304713356286, + "learning_rate": 6.228641734047799e-06, + "loss": 3.2477, + "step": 16857 + }, + { + "epoch": 1.436802181880167, + "grad_norm": 65.61925928139814, + "learning_rate": 6.228161083503862e-06, + "loss": 2.7552, + "step": 16858 + }, + { + "epoch": 1.4368874115741925, + "grad_norm": 50.957038117206125, + "learning_rate": 6.227680420881449e-06, + "loss": 2.5564, + "step": 16859 + }, + { + "epoch": 1.4369726412682178, + "grad_norm": 55.133671798909, + "learning_rate": 6.227199746185288e-06, + "loss": 2.5878, + "step": 16860 + }, + { + "epoch": 1.4370578709622432, + "grad_norm": 38.15931454769573, + "learning_rate": 6.226719059420104e-06, + "loss": 2.8671, + "step": 16861 + }, + { + "epoch": 1.4371431006562687, + "grad_norm": 105.82185146971919, + "learning_rate": 6.226238360590624e-06, + "loss": 3.2921, + "step": 16862 + }, + { + "epoch": 1.437228330350294, + "grad_norm": 26.051385245354805, + "learning_rate": 6.2257576497015785e-06, + "loss": 3.2061, + "step": 16863 + }, + { + "epoch": 1.4373135600443194, + "grad_norm": 117.92150169390575, + "learning_rate": 6.225276926757694e-06, + "loss": 2.404, + "step": 16864 + }, + { + "epoch": 1.437398789738345, + "grad_norm": 68.8569063476683, + "learning_rate": 6.224796191763697e-06, + "loss": 3.5559, + "step": 16865 + }, + { + "epoch": 1.4374840194323704, + "grad_norm": 34.51154437499092, + "learning_rate": 6.224315444724316e-06, + "loss": 2.4382, + "step": 16866 + }, + { + "epoch": 1.4375692491263956, + "grad_norm": 57.67599736983767, + "learning_rate": 6.22383468564428e-06, + "loss": 2.3781, + "step": 16867 + }, + { + "epoch": 1.437654478820421, + "grad_norm": 35.72432806361294, + "learning_rate": 6.223353914528316e-06, + "loss": 2.7694, + "step": 16868 + }, + { + "epoch": 1.4377397085144463, + "grad_norm": 43.42945450035872, + "learning_rate": 6.2228731313811515e-06, + "loss": 2.5934, + "step": 16869 + }, + { + "epoch": 1.4378249382084718, + "grad_norm": 41.07066068228624, + "learning_rate": 6.222392336207516e-06, + "loss": 3.3464, + "step": 16870 + }, + { + "epoch": 1.4379101679024973, + "grad_norm": 38.840852297764926, + "learning_rate": 6.221911529012139e-06, + "loss": 2.9488, + "step": 16871 + }, + { + "epoch": 1.4379953975965227, + "grad_norm": 56.55277897272379, + "learning_rate": 6.221430709799746e-06, + "loss": 2.8192, + "step": 16872 + }, + { + "epoch": 1.438080627290548, + "grad_norm": 83.19894671677794, + "learning_rate": 6.2209498785750675e-06, + "loss": 3.6471, + "step": 16873 + }, + { + "epoch": 1.4381658569845734, + "grad_norm": 51.88767343611245, + "learning_rate": 6.220469035342832e-06, + "loss": 3.6635, + "step": 16874 + }, + { + "epoch": 1.4382510866785987, + "grad_norm": 90.26951474628447, + "learning_rate": 6.21998818010777e-06, + "loss": 3.6099, + "step": 16875 + }, + { + "epoch": 1.4383363163726242, + "grad_norm": 34.66634693114601, + "learning_rate": 6.219507312874609e-06, + "loss": 3.0428, + "step": 16876 + }, + { + "epoch": 1.4384215460666496, + "grad_norm": 35.16859882367794, + "learning_rate": 6.219026433648076e-06, + "loss": 2.998, + "step": 16877 + }, + { + "epoch": 1.438506775760675, + "grad_norm": 104.9134313644002, + "learning_rate": 6.218545542432903e-06, + "loss": 4.5219, + "step": 16878 + }, + { + "epoch": 1.4385920054547003, + "grad_norm": 178.37685723862214, + "learning_rate": 6.218064639233821e-06, + "loss": 3.2105, + "step": 16879 + }, + { + "epoch": 1.4386772351487258, + "grad_norm": 73.46428658747855, + "learning_rate": 6.217583724055555e-06, + "loss": 3.2935, + "step": 16880 + }, + { + "epoch": 1.4387624648427513, + "grad_norm": 44.8817339600074, + "learning_rate": 6.217102796902836e-06, + "loss": 2.6245, + "step": 16881 + }, + { + "epoch": 1.4388476945367765, + "grad_norm": 58.95695654882931, + "learning_rate": 6.216621857780397e-06, + "loss": 2.0396, + "step": 16882 + }, + { + "epoch": 1.438932924230802, + "grad_norm": 69.57308513750537, + "learning_rate": 6.216140906692965e-06, + "loss": 3.884, + "step": 16883 + }, + { + "epoch": 1.4390181539248275, + "grad_norm": 76.67284472209334, + "learning_rate": 6.215659943645271e-06, + "loss": 2.8453, + "step": 16884 + }, + { + "epoch": 1.439103383618853, + "grad_norm": 66.21222959960905, + "learning_rate": 6.215178968642042e-06, + "loss": 2.8982, + "step": 16885 + }, + { + "epoch": 1.4391886133128782, + "grad_norm": 73.02387261637006, + "learning_rate": 6.214697981688014e-06, + "loss": 3.4997, + "step": 16886 + }, + { + "epoch": 1.4392738430069036, + "grad_norm": 41.97569237134998, + "learning_rate": 6.2142169827879114e-06, + "loss": 2.8101, + "step": 16887 + }, + { + "epoch": 1.4393590727009289, + "grad_norm": 44.13182257600091, + "learning_rate": 6.213735971946468e-06, + "loss": 2.5652, + "step": 16888 + }, + { + "epoch": 1.4394443023949544, + "grad_norm": 97.58852315363308, + "learning_rate": 6.213254949168413e-06, + "loss": 4.0164, + "step": 16889 + }, + { + "epoch": 1.4395295320889798, + "grad_norm": 15.291087216296319, + "learning_rate": 6.2127739144584795e-06, + "loss": 0.7705, + "step": 16890 + }, + { + "epoch": 1.4396147617830053, + "grad_norm": 34.24759464103533, + "learning_rate": 6.212292867821394e-06, + "loss": 3.256, + "step": 16891 + }, + { + "epoch": 1.4396999914770305, + "grad_norm": 30.3302209177735, + "learning_rate": 6.211811809261893e-06, + "loss": 2.3173, + "step": 16892 + }, + { + "epoch": 1.439785221171056, + "grad_norm": 107.83213909097621, + "learning_rate": 6.211330738784701e-06, + "loss": 3.6466, + "step": 16893 + }, + { + "epoch": 1.4398704508650815, + "grad_norm": 49.49944697340146, + "learning_rate": 6.210849656394555e-06, + "loss": 2.6732, + "step": 16894 + }, + { + "epoch": 1.4399556805591067, + "grad_norm": 26.97082485992622, + "learning_rate": 6.210368562096183e-06, + "loss": 1.9485, + "step": 16895 + }, + { + "epoch": 1.4400409102531322, + "grad_norm": 55.14882707677225, + "learning_rate": 6.209887455894318e-06, + "loss": 2.9936, + "step": 16896 + }, + { + "epoch": 1.4401261399471577, + "grad_norm": 68.24334564291175, + "learning_rate": 6.209406337793688e-06, + "loss": 3.6233, + "step": 16897 + }, + { + "epoch": 1.4402113696411831, + "grad_norm": 36.27921967191168, + "learning_rate": 6.20892520779903e-06, + "loss": 2.5062, + "step": 16898 + }, + { + "epoch": 1.4402965993352084, + "grad_norm": 47.82486757101329, + "learning_rate": 6.208444065915072e-06, + "loss": 1.4529, + "step": 16899 + }, + { + "epoch": 1.4403818290292338, + "grad_norm": 74.70445828599235, + "learning_rate": 6.207962912146547e-06, + "loss": 3.1162, + "step": 16900 + }, + { + "epoch": 1.440467058723259, + "grad_norm": 40.88451926313115, + "learning_rate": 6.207481746498187e-06, + "loss": 2.8459, + "step": 16901 + }, + { + "epoch": 1.4405522884172846, + "grad_norm": 76.53692039580578, + "learning_rate": 6.207000568974723e-06, + "loss": 3.8664, + "step": 16902 + }, + { + "epoch": 1.44063751811131, + "grad_norm": 66.56639234096366, + "learning_rate": 6.206519379580888e-06, + "loss": 2.3084, + "step": 16903 + }, + { + "epoch": 1.4407227478053355, + "grad_norm": 42.48797113393303, + "learning_rate": 6.206038178321414e-06, + "loss": 3.7545, + "step": 16904 + }, + { + "epoch": 1.4408079774993607, + "grad_norm": 56.55254123753611, + "learning_rate": 6.205556965201035e-06, + "loss": 2.9323, + "step": 16905 + }, + { + "epoch": 1.4408932071933862, + "grad_norm": 57.23185735852575, + "learning_rate": 6.205075740224481e-06, + "loss": 2.8957, + "step": 16906 + }, + { + "epoch": 1.4409784368874115, + "grad_norm": 43.78351591672219, + "learning_rate": 6.2045945033964876e-06, + "loss": 2.9801, + "step": 16907 + }, + { + "epoch": 1.441063666581437, + "grad_norm": 111.72233003297887, + "learning_rate": 6.204113254721784e-06, + "loss": 2.9049, + "step": 16908 + }, + { + "epoch": 1.4411488962754624, + "grad_norm": 39.89513004744771, + "learning_rate": 6.203631994205106e-06, + "loss": 3.2083, + "step": 16909 + }, + { + "epoch": 1.4412341259694879, + "grad_norm": 40.21963569248002, + "learning_rate": 6.2031507218511856e-06, + "loss": 3.359, + "step": 16910 + }, + { + "epoch": 1.441319355663513, + "grad_norm": 84.31461722135015, + "learning_rate": 6.2026694376647564e-06, + "loss": 3.0974, + "step": 16911 + }, + { + "epoch": 1.4414045853575386, + "grad_norm": 52.90634665640585, + "learning_rate": 6.202188141650551e-06, + "loss": 2.4667, + "step": 16912 + }, + { + "epoch": 1.441489815051564, + "grad_norm": 35.899656809323126, + "learning_rate": 6.201706833813302e-06, + "loss": 3.3032, + "step": 16913 + }, + { + "epoch": 1.4415750447455893, + "grad_norm": 84.97485315769475, + "learning_rate": 6.201225514157745e-06, + "loss": 3.5636, + "step": 16914 + }, + { + "epoch": 1.4416602744396148, + "grad_norm": 49.346050098668506, + "learning_rate": 6.200744182688612e-06, + "loss": 3.5174, + "step": 16915 + }, + { + "epoch": 1.4417455041336402, + "grad_norm": 67.6667421837416, + "learning_rate": 6.200262839410636e-06, + "loss": 2.6158, + "step": 16916 + }, + { + "epoch": 1.4418307338276657, + "grad_norm": 32.096449742649426, + "learning_rate": 6.199781484328553e-06, + "loss": 2.5059, + "step": 16917 + }, + { + "epoch": 1.441915963521691, + "grad_norm": 42.92955609962722, + "learning_rate": 6.199300117447096e-06, + "loss": 3.8387, + "step": 16918 + }, + { + "epoch": 1.4420011932157164, + "grad_norm": 68.84922708526636, + "learning_rate": 6.198818738770999e-06, + "loss": 2.3087, + "step": 16919 + }, + { + "epoch": 1.4420864229097416, + "grad_norm": 32.404582383874136, + "learning_rate": 6.1983373483049945e-06, + "loss": 2.2853, + "step": 16920 + }, + { + "epoch": 1.4421716526037671, + "grad_norm": 57.77230342835278, + "learning_rate": 6.197855946053819e-06, + "loss": 3.6438, + "step": 16921 + }, + { + "epoch": 1.4422568822977926, + "grad_norm": 31.633634202702734, + "learning_rate": 6.197374532022208e-06, + "loss": 2.7572, + "step": 16922 + }, + { + "epoch": 1.442342111991818, + "grad_norm": 43.304726906991654, + "learning_rate": 6.196893106214893e-06, + "loss": 2.9404, + "step": 16923 + }, + { + "epoch": 1.4424273416858433, + "grad_norm": 78.47885158894763, + "learning_rate": 6.19641166863661e-06, + "loss": 3.0791, + "step": 16924 + }, + { + "epoch": 1.4425125713798688, + "grad_norm": 35.300225963218956, + "learning_rate": 6.195930219292092e-06, + "loss": 2.854, + "step": 16925 + }, + { + "epoch": 1.442597801073894, + "grad_norm": 43.82691827604088, + "learning_rate": 6.195448758186078e-06, + "loss": 2.8011, + "step": 16926 + }, + { + "epoch": 1.4426830307679195, + "grad_norm": 62.38997621284492, + "learning_rate": 6.194967285323301e-06, + "loss": 3.0584, + "step": 16927 + }, + { + "epoch": 1.442768260461945, + "grad_norm": 83.14201846209492, + "learning_rate": 6.194485800708492e-06, + "loss": 4.29, + "step": 16928 + }, + { + "epoch": 1.4428534901559704, + "grad_norm": 53.457023411755074, + "learning_rate": 6.194004304346392e-06, + "loss": 3.2236, + "step": 16929 + }, + { + "epoch": 1.4429387198499957, + "grad_norm": 50.33534480591445, + "learning_rate": 6.193522796241734e-06, + "loss": 2.7756, + "step": 16930 + }, + { + "epoch": 1.4430239495440211, + "grad_norm": 48.812635899227395, + "learning_rate": 6.193041276399255e-06, + "loss": 3.2004, + "step": 16931 + }, + { + "epoch": 1.4431091792380466, + "grad_norm": 42.70316373656436, + "learning_rate": 6.1925597448236865e-06, + "loss": 3.104, + "step": 16932 + }, + { + "epoch": 1.4431944089320718, + "grad_norm": 39.50177435481228, + "learning_rate": 6.192078201519769e-06, + "loss": 2.4611, + "step": 16933 + }, + { + "epoch": 1.4432796386260973, + "grad_norm": 34.97899402447731, + "learning_rate": 6.191596646492235e-06, + "loss": 2.4888, + "step": 16934 + }, + { + "epoch": 1.4433648683201228, + "grad_norm": 32.249699171301316, + "learning_rate": 6.191115079745822e-06, + "loss": 2.7925, + "step": 16935 + }, + { + "epoch": 1.4434500980141483, + "grad_norm": 31.04301508857291, + "learning_rate": 6.190633501285264e-06, + "loss": 2.5891, + "step": 16936 + }, + { + "epoch": 1.4435353277081735, + "grad_norm": 94.44977742853521, + "learning_rate": 6.1901519111153e-06, + "loss": 3.4795, + "step": 16937 + }, + { + "epoch": 1.443620557402199, + "grad_norm": 37.89453427367705, + "learning_rate": 6.189670309240664e-06, + "loss": 2.5259, + "step": 16938 + }, + { + "epoch": 1.4437057870962242, + "grad_norm": 65.96358496181564, + "learning_rate": 6.189188695666095e-06, + "loss": 3.7858, + "step": 16939 + }, + { + "epoch": 1.4437910167902497, + "grad_norm": 79.97018924521991, + "learning_rate": 6.188707070396325e-06, + "loss": 2.7671, + "step": 16940 + }, + { + "epoch": 1.4438762464842751, + "grad_norm": 40.90246097079055, + "learning_rate": 6.1882254334360955e-06, + "loss": 3.2688, + "step": 16941 + }, + { + "epoch": 1.4439614761783006, + "grad_norm": 59.89330801329519, + "learning_rate": 6.187743784790141e-06, + "loss": 3.4033, + "step": 16942 + }, + { + "epoch": 1.4440467058723259, + "grad_norm": 47.57791128169496, + "learning_rate": 6.187262124463197e-06, + "loss": 2.4601, + "step": 16943 + }, + { + "epoch": 1.4441319355663513, + "grad_norm": 63.90383161201303, + "learning_rate": 6.186780452460003e-06, + "loss": 2.4071, + "step": 16944 + }, + { + "epoch": 1.4442171652603766, + "grad_norm": 51.77296022014432, + "learning_rate": 6.186298768785294e-06, + "loss": 2.7057, + "step": 16945 + }, + { + "epoch": 1.444302394954402, + "grad_norm": 50.55038346434858, + "learning_rate": 6.1858170734438085e-06, + "loss": 2.7259, + "step": 16946 + }, + { + "epoch": 1.4443876246484275, + "grad_norm": 41.030019504701876, + "learning_rate": 6.185335366440282e-06, + "loss": 2.451, + "step": 16947 + }, + { + "epoch": 1.444472854342453, + "grad_norm": 49.912346057333956, + "learning_rate": 6.184853647779454e-06, + "loss": 2.8139, + "step": 16948 + }, + { + "epoch": 1.4445580840364782, + "grad_norm": 67.51258271981187, + "learning_rate": 6.184371917466062e-06, + "loss": 3.1923, + "step": 16949 + }, + { + "epoch": 1.4446433137305037, + "grad_norm": 36.597863272000666, + "learning_rate": 6.183890175504843e-06, + "loss": 3.1159, + "step": 16950 + }, + { + "epoch": 1.4447285434245292, + "grad_norm": 81.105448963462, + "learning_rate": 6.183408421900533e-06, + "loss": 3.2691, + "step": 16951 + }, + { + "epoch": 1.4448137731185544, + "grad_norm": 30.307436299434098, + "learning_rate": 6.182926656657872e-06, + "loss": 2.5965, + "step": 16952 + }, + { + "epoch": 1.4448990028125799, + "grad_norm": 229.07700229241507, + "learning_rate": 6.182444879781597e-06, + "loss": 3.2987, + "step": 16953 + }, + { + "epoch": 1.4449842325066053, + "grad_norm": 68.55855381703856, + "learning_rate": 6.181963091276448e-06, + "loss": 2.8909, + "step": 16954 + }, + { + "epoch": 1.4450694622006308, + "grad_norm": 29.660695522913247, + "learning_rate": 6.18148129114716e-06, + "loss": 1.6153, + "step": 16955 + }, + { + "epoch": 1.445154691894656, + "grad_norm": 64.85409362553376, + "learning_rate": 6.180999479398474e-06, + "loss": 2.7866, + "step": 16956 + }, + { + "epoch": 1.4452399215886815, + "grad_norm": 69.69362803401498, + "learning_rate": 6.180517656035127e-06, + "loss": 3.0202, + "step": 16957 + }, + { + "epoch": 1.4453251512827068, + "grad_norm": 45.81881088754627, + "learning_rate": 6.180035821061858e-06, + "loss": 3.1256, + "step": 16958 + }, + { + "epoch": 1.4454103809767322, + "grad_norm": 53.74597601502564, + "learning_rate": 6.179553974483405e-06, + "loss": 3.0408, + "step": 16959 + }, + { + "epoch": 1.4454956106707577, + "grad_norm": 37.74172399757328, + "learning_rate": 6.179072116304508e-06, + "loss": 3.1911, + "step": 16960 + }, + { + "epoch": 1.4455808403647832, + "grad_norm": 39.15391897580741, + "learning_rate": 6.178590246529906e-06, + "loss": 3.1862, + "step": 16961 + }, + { + "epoch": 1.4456660700588084, + "grad_norm": 188.06780801992818, + "learning_rate": 6.178108365164337e-06, + "loss": 3.4047, + "step": 16962 + }, + { + "epoch": 1.445751299752834, + "grad_norm": 84.50604350728383, + "learning_rate": 6.177626472212539e-06, + "loss": 3.1484, + "step": 16963 + }, + { + "epoch": 1.4458365294468594, + "grad_norm": 60.9331334827073, + "learning_rate": 6.177144567679253e-06, + "loss": 2.5143, + "step": 16964 + }, + { + "epoch": 1.4459217591408846, + "grad_norm": 42.78452551456915, + "learning_rate": 6.1766626515692195e-06, + "loss": 2.9718, + "step": 16965 + }, + { + "epoch": 1.44600698883491, + "grad_norm": 29.677673213019826, + "learning_rate": 6.1761807238871754e-06, + "loss": 1.432, + "step": 16966 + }, + { + "epoch": 1.4460922185289355, + "grad_norm": 49.171656973734564, + "learning_rate": 6.1756987846378605e-06, + "loss": 3.508, + "step": 16967 + }, + { + "epoch": 1.446177448222961, + "grad_norm": 43.36932056243566, + "learning_rate": 6.175216833826016e-06, + "loss": 3.4009, + "step": 16968 + }, + { + "epoch": 1.4462626779169863, + "grad_norm": 54.68222334978198, + "learning_rate": 6.17473487145638e-06, + "loss": 1.9648, + "step": 16969 + }, + { + "epoch": 1.4463479076110117, + "grad_norm": 30.83615779910628, + "learning_rate": 6.174252897533694e-06, + "loss": 2.3635, + "step": 16970 + }, + { + "epoch": 1.446433137305037, + "grad_norm": 35.8770910647249, + "learning_rate": 6.173770912062698e-06, + "loss": 2.5483, + "step": 16971 + }, + { + "epoch": 1.4465183669990624, + "grad_norm": 236.46061533708263, + "learning_rate": 6.173288915048131e-06, + "loss": 2.9799, + "step": 16972 + }, + { + "epoch": 1.446603596693088, + "grad_norm": 31.504785769575896, + "learning_rate": 6.172806906494734e-06, + "loss": 2.9982, + "step": 16973 + }, + { + "epoch": 1.4466888263871134, + "grad_norm": 33.601270083669476, + "learning_rate": 6.1723248864072475e-06, + "loss": 2.9307, + "step": 16974 + }, + { + "epoch": 1.4467740560811386, + "grad_norm": 61.14330082382724, + "learning_rate": 6.171842854790409e-06, + "loss": 3.5225, + "step": 16975 + }, + { + "epoch": 1.446859285775164, + "grad_norm": 84.87271160656081, + "learning_rate": 6.1713608116489655e-06, + "loss": 2.9547, + "step": 16976 + }, + { + "epoch": 1.4469445154691893, + "grad_norm": 65.06290531942254, + "learning_rate": 6.170878756987651e-06, + "loss": 3.8728, + "step": 16977 + }, + { + "epoch": 1.4470297451632148, + "grad_norm": 80.49943887016114, + "learning_rate": 6.170396690811211e-06, + "loss": 3.7479, + "step": 16978 + }, + { + "epoch": 1.4471149748572403, + "grad_norm": 28.266069024056858, + "learning_rate": 6.169914613124383e-06, + "loss": 2.4084, + "step": 16979 + }, + { + "epoch": 1.4472002045512657, + "grad_norm": 73.56197506693037, + "learning_rate": 6.169432523931911e-06, + "loss": 3.7249, + "step": 16980 + }, + { + "epoch": 1.447285434245291, + "grad_norm": 86.49076862603367, + "learning_rate": 6.168950423238533e-06, + "loss": 3.6938, + "step": 16981 + }, + { + "epoch": 1.4473706639393165, + "grad_norm": 73.7247559092894, + "learning_rate": 6.168468311048994e-06, + "loss": 2.7461, + "step": 16982 + }, + { + "epoch": 1.447455893633342, + "grad_norm": 66.93511913936543, + "learning_rate": 6.167986187368032e-06, + "loss": 3.1558, + "step": 16983 + }, + { + "epoch": 1.4475411233273672, + "grad_norm": 60.05590159775887, + "learning_rate": 6.167504052200391e-06, + "loss": 2.5601, + "step": 16984 + }, + { + "epoch": 1.4476263530213926, + "grad_norm": 93.51088540911658, + "learning_rate": 6.167021905550812e-06, + "loss": 3.1187, + "step": 16985 + }, + { + "epoch": 1.447711582715418, + "grad_norm": 58.675906050873415, + "learning_rate": 6.166539747424035e-06, + "loss": 2.4584, + "step": 16986 + }, + { + "epoch": 1.4477968124094436, + "grad_norm": 57.75792111532432, + "learning_rate": 6.166057577824803e-06, + "loss": 2.4524, + "step": 16987 + }, + { + "epoch": 1.4478820421034688, + "grad_norm": 92.38787783261482, + "learning_rate": 6.165575396757859e-06, + "loss": 3.9689, + "step": 16988 + }, + { + "epoch": 1.4479672717974943, + "grad_norm": 37.59691980338821, + "learning_rate": 6.165093204227944e-06, + "loss": 3.3728, + "step": 16989 + }, + { + "epoch": 1.4480525014915195, + "grad_norm": 62.81419695252793, + "learning_rate": 6.1646110002398004e-06, + "loss": 4.0925, + "step": 16990 + }, + { + "epoch": 1.448137731185545, + "grad_norm": 79.67006321151189, + "learning_rate": 6.16412878479817e-06, + "loss": 2.8233, + "step": 16991 + }, + { + "epoch": 1.4482229608795705, + "grad_norm": 37.98756783858275, + "learning_rate": 6.163646557907796e-06, + "loss": 3.3687, + "step": 16992 + }, + { + "epoch": 1.448308190573596, + "grad_norm": 70.24537108898461, + "learning_rate": 6.1631643195734205e-06, + "loss": 2.9565, + "step": 16993 + }, + { + "epoch": 1.4483934202676212, + "grad_norm": 31.433076940590965, + "learning_rate": 6.162682069799786e-06, + "loss": 2.0483, + "step": 16994 + }, + { + "epoch": 1.4484786499616467, + "grad_norm": 62.8316938440277, + "learning_rate": 6.162199808591635e-06, + "loss": 3.4197, + "step": 16995 + }, + { + "epoch": 1.448563879655672, + "grad_norm": 30.298814345388962, + "learning_rate": 6.1617175359537095e-06, + "loss": 2.635, + "step": 16996 + }, + { + "epoch": 1.4486491093496974, + "grad_norm": 35.50579922134259, + "learning_rate": 6.1612352518907556e-06, + "loss": 2.8768, + "step": 16997 + }, + { + "epoch": 1.4487343390437228, + "grad_norm": 38.836688261634166, + "learning_rate": 6.160752956407514e-06, + "loss": 2.9393, + "step": 16998 + }, + { + "epoch": 1.4488195687377483, + "grad_norm": 43.81623010609813, + "learning_rate": 6.160270649508729e-06, + "loss": 2.2486, + "step": 16999 + }, + { + "epoch": 1.4489047984317736, + "grad_norm": 34.1232318435944, + "learning_rate": 6.159788331199142e-06, + "loss": 2.9874, + "step": 17000 + }, + { + "epoch": 1.448990028125799, + "grad_norm": 49.09677712575885, + "learning_rate": 6.159306001483497e-06, + "loss": 1.8643, + "step": 17001 + }, + { + "epoch": 1.4490752578198245, + "grad_norm": 112.10105050432814, + "learning_rate": 6.15882366036654e-06, + "loss": 4.8764, + "step": 17002 + }, + { + "epoch": 1.4491604875138497, + "grad_norm": 76.20573890892203, + "learning_rate": 6.1583413078530105e-06, + "loss": 3.4798, + "step": 17003 + }, + { + "epoch": 1.4492457172078752, + "grad_norm": 62.15169525822898, + "learning_rate": 6.157858943947657e-06, + "loss": 3.1141, + "step": 17004 + }, + { + "epoch": 1.4493309469019007, + "grad_norm": 50.818157875997784, + "learning_rate": 6.15737656865522e-06, + "loss": 2.4502, + "step": 17005 + }, + { + "epoch": 1.4494161765959261, + "grad_norm": 48.45373360559312, + "learning_rate": 6.156894181980444e-06, + "loss": 3.6185, + "step": 17006 + }, + { + "epoch": 1.4495014062899514, + "grad_norm": 31.398215419741767, + "learning_rate": 6.156411783928072e-06, + "loss": 2.5059, + "step": 17007 + }, + { + "epoch": 1.4495866359839769, + "grad_norm": 38.2126335184549, + "learning_rate": 6.155929374502853e-06, + "loss": 3.4879, + "step": 17008 + }, + { + "epoch": 1.449671865678002, + "grad_norm": 47.99420934786799, + "learning_rate": 6.1554469537095255e-06, + "loss": 2.7206, + "step": 17009 + }, + { + "epoch": 1.4497570953720276, + "grad_norm": 31.94407404919794, + "learning_rate": 6.154964521552836e-06, + "loss": 2.5207, + "step": 17010 + }, + { + "epoch": 1.449842325066053, + "grad_norm": 33.955846389503044, + "learning_rate": 6.15448207803753e-06, + "loss": 1.9752, + "step": 17011 + }, + { + "epoch": 1.4499275547600785, + "grad_norm": 65.32455462377715, + "learning_rate": 6.153999623168353e-06, + "loss": 2.917, + "step": 17012 + }, + { + "epoch": 1.4500127844541038, + "grad_norm": 49.860082644340444, + "learning_rate": 6.153517156950047e-06, + "loss": 3.4483, + "step": 17013 + }, + { + "epoch": 1.4500980141481292, + "grad_norm": 38.11917379372686, + "learning_rate": 6.153034679387358e-06, + "loss": 3.1591, + "step": 17014 + }, + { + "epoch": 1.4501832438421545, + "grad_norm": 85.7100401703328, + "learning_rate": 6.152552190485032e-06, + "loss": 3.5967, + "step": 17015 + }, + { + "epoch": 1.45026847353618, + "grad_norm": 31.220390973388962, + "learning_rate": 6.152069690247811e-06, + "loss": 2.5691, + "step": 17016 + }, + { + "epoch": 1.4503537032302054, + "grad_norm": 29.462655581195207, + "learning_rate": 6.151587178680446e-06, + "loss": 2.5535, + "step": 17017 + }, + { + "epoch": 1.4504389329242309, + "grad_norm": 41.86679812200924, + "learning_rate": 6.151104655787675e-06, + "loss": 3.868, + "step": 17018 + }, + { + "epoch": 1.4505241626182561, + "grad_norm": 46.981442830756166, + "learning_rate": 6.150622121574249e-06, + "loss": 2.296, + "step": 17019 + }, + { + "epoch": 1.4506093923122816, + "grad_norm": 65.88827217724602, + "learning_rate": 6.1501395760449116e-06, + "loss": 2.2767, + "step": 17020 + }, + { + "epoch": 1.450694622006307, + "grad_norm": 86.61244311329234, + "learning_rate": 6.149657019204409e-06, + "loss": 3.5891, + "step": 17021 + }, + { + "epoch": 1.4507798517003323, + "grad_norm": 55.56856761617198, + "learning_rate": 6.149174451057483e-06, + "loss": 2.5955, + "step": 17022 + }, + { + "epoch": 1.4508650813943578, + "grad_norm": 156.4713917489656, + "learning_rate": 6.148691871608887e-06, + "loss": 3.2051, + "step": 17023 + }, + { + "epoch": 1.4509503110883832, + "grad_norm": 75.96859780363954, + "learning_rate": 6.148209280863362e-06, + "loss": 3.2637, + "step": 17024 + }, + { + "epoch": 1.4510355407824087, + "grad_norm": 84.85373243477967, + "learning_rate": 6.147726678825655e-06, + "loss": 2.4769, + "step": 17025 + }, + { + "epoch": 1.451120770476434, + "grad_norm": 37.82068959132845, + "learning_rate": 6.1472440655005095e-06, + "loss": 3.0488, + "step": 17026 + }, + { + "epoch": 1.4512060001704594, + "grad_norm": 87.1525205473531, + "learning_rate": 6.146761440892678e-06, + "loss": 3.3254, + "step": 17027 + }, + { + "epoch": 1.4512912298644847, + "grad_norm": 42.36709252732024, + "learning_rate": 6.146278805006901e-06, + "loss": 3.103, + "step": 17028 + }, + { + "epoch": 1.4513764595585101, + "grad_norm": 84.98291665195451, + "learning_rate": 6.14579615784793e-06, + "loss": 3.4531, + "step": 17029 + }, + { + "epoch": 1.4514616892525356, + "grad_norm": 45.23003955161745, + "learning_rate": 6.145313499420506e-06, + "loss": 3.915, + "step": 17030 + }, + { + "epoch": 1.451546918946561, + "grad_norm": 51.495816063707174, + "learning_rate": 6.144830829729381e-06, + "loss": 2.4176, + "step": 17031 + }, + { + "epoch": 1.4516321486405863, + "grad_norm": 89.97471821765852, + "learning_rate": 6.1443481487793e-06, + "loss": 4.0162, + "step": 17032 + }, + { + "epoch": 1.4517173783346118, + "grad_norm": 98.11323204847739, + "learning_rate": 6.143865456575008e-06, + "loss": 3.3767, + "step": 17033 + }, + { + "epoch": 1.4518026080286373, + "grad_norm": 83.12079015393174, + "learning_rate": 6.143382753121254e-06, + "loss": 3.3649, + "step": 17034 + }, + { + "epoch": 1.4518878377226625, + "grad_norm": 38.50018863387362, + "learning_rate": 6.142900038422785e-06, + "loss": 2.4112, + "step": 17035 + }, + { + "epoch": 1.451973067416688, + "grad_norm": 45.30943767615639, + "learning_rate": 6.14241731248435e-06, + "loss": 3.4265, + "step": 17036 + }, + { + "epoch": 1.4520582971107134, + "grad_norm": 40.3659187043166, + "learning_rate": 6.141934575310693e-06, + "loss": 2.7814, + "step": 17037 + }, + { + "epoch": 1.452143526804739, + "grad_norm": 27.671093250928607, + "learning_rate": 6.141451826906564e-06, + "loss": 2.5416, + "step": 17038 + }, + { + "epoch": 1.4522287564987642, + "grad_norm": 41.824087982360105, + "learning_rate": 6.140969067276709e-06, + "loss": 2.6458, + "step": 17039 + }, + { + "epoch": 1.4523139861927896, + "grad_norm": 53.156814001244044, + "learning_rate": 6.140486296425877e-06, + "loss": 2.8801, + "step": 17040 + }, + { + "epoch": 1.4523992158868149, + "grad_norm": 75.2466368976393, + "learning_rate": 6.140003514358815e-06, + "loss": 3.2225, + "step": 17041 + }, + { + "epoch": 1.4524844455808403, + "grad_norm": 42.33522792141151, + "learning_rate": 6.139520721080272e-06, + "loss": 3.2644, + "step": 17042 + }, + { + "epoch": 1.4525696752748658, + "grad_norm": 49.333865853000184, + "learning_rate": 6.139037916594996e-06, + "loss": 3.1223, + "step": 17043 + }, + { + "epoch": 1.4526549049688913, + "grad_norm": 30.60521711554455, + "learning_rate": 6.138555100907735e-06, + "loss": 2.0429, + "step": 17044 + }, + { + "epoch": 1.4527401346629165, + "grad_norm": 36.362956710769254, + "learning_rate": 6.1380722740232355e-06, + "loss": 1.2538, + "step": 17045 + }, + { + "epoch": 1.452825364356942, + "grad_norm": 67.42802068342185, + "learning_rate": 6.137589435946247e-06, + "loss": 3.2621, + "step": 17046 + }, + { + "epoch": 1.4529105940509672, + "grad_norm": 43.18411687502754, + "learning_rate": 6.13710658668152e-06, + "loss": 2.2423, + "step": 17047 + }, + { + "epoch": 1.4529958237449927, + "grad_norm": 51.37623415358434, + "learning_rate": 6.136623726233801e-06, + "loss": 2.7113, + "step": 17048 + }, + { + "epoch": 1.4530810534390182, + "grad_norm": 65.42604965314598, + "learning_rate": 6.136140854607839e-06, + "loss": 3.8133, + "step": 17049 + }, + { + "epoch": 1.4531662831330436, + "grad_norm": 35.98367856348613, + "learning_rate": 6.135657971808384e-06, + "loss": 2.0513, + "step": 17050 + }, + { + "epoch": 1.4532515128270689, + "grad_norm": 54.241351813304945, + "learning_rate": 6.135175077840184e-06, + "loss": 2.9457, + "step": 17051 + }, + { + "epoch": 1.4533367425210943, + "grad_norm": 48.83193852814955, + "learning_rate": 6.134692172707988e-06, + "loss": 3.5958, + "step": 17052 + }, + { + "epoch": 1.4534219722151198, + "grad_norm": 29.436764082593992, + "learning_rate": 6.134209256416545e-06, + "loss": 1.8603, + "step": 17053 + }, + { + "epoch": 1.453507201909145, + "grad_norm": 42.782819964094834, + "learning_rate": 6.133726328970604e-06, + "loss": 3.1323, + "step": 17054 + }, + { + "epoch": 1.4535924316031705, + "grad_norm": 61.247028981677616, + "learning_rate": 6.133243390374916e-06, + "loss": 2.8501, + "step": 17055 + }, + { + "epoch": 1.453677661297196, + "grad_norm": 60.67044789887078, + "learning_rate": 6.13276044063423e-06, + "loss": 3.8619, + "step": 17056 + }, + { + "epoch": 1.4537628909912215, + "grad_norm": 65.0645892581472, + "learning_rate": 6.132277479753295e-06, + "loss": 3.9705, + "step": 17057 + }, + { + "epoch": 1.4538481206852467, + "grad_norm": 33.90873738579358, + "learning_rate": 6.131794507736859e-06, + "loss": 2.9166, + "step": 17058 + }, + { + "epoch": 1.4539333503792722, + "grad_norm": 53.28642786010419, + "learning_rate": 6.131311524589677e-06, + "loss": 2.1752, + "step": 17059 + }, + { + "epoch": 1.4540185800732974, + "grad_norm": 44.30484650060168, + "learning_rate": 6.130828530316495e-06, + "loss": 2.8768, + "step": 17060 + }, + { + "epoch": 1.454103809767323, + "grad_norm": 37.02860605448084, + "learning_rate": 6.130345524922061e-06, + "loss": 2.3615, + "step": 17061 + }, + { + "epoch": 1.4541890394613484, + "grad_norm": 39.625640587007496, + "learning_rate": 6.129862508411131e-06, + "loss": 1.5561, + "step": 17062 + }, + { + "epoch": 1.4542742691553738, + "grad_norm": 25.5396812308352, + "learning_rate": 6.12937948078845e-06, + "loss": 2.4922, + "step": 17063 + }, + { + "epoch": 1.454359498849399, + "grad_norm": 45.52347483717855, + "learning_rate": 6.128896442058772e-06, + "loss": 3.4042, + "step": 17064 + }, + { + "epoch": 1.4544447285434245, + "grad_norm": 67.01246802502378, + "learning_rate": 6.128413392226844e-06, + "loss": 2.6757, + "step": 17065 + }, + { + "epoch": 1.4545299582374498, + "grad_norm": 147.08672955684122, + "learning_rate": 6.1279303312974215e-06, + "loss": 3.8158, + "step": 17066 + }, + { + "epoch": 1.4546151879314753, + "grad_norm": 40.11019095933345, + "learning_rate": 6.12744725927525e-06, + "loss": 2.9308, + "step": 17067 + }, + { + "epoch": 1.4547004176255007, + "grad_norm": 36.077476643631634, + "learning_rate": 6.126964176165085e-06, + "loss": 2.9119, + "step": 17068 + }, + { + "epoch": 1.4547856473195262, + "grad_norm": 39.61293146946804, + "learning_rate": 6.126481081971672e-06, + "loss": 3.096, + "step": 17069 + }, + { + "epoch": 1.4548708770135514, + "grad_norm": 48.82447978447637, + "learning_rate": 6.1259979766997675e-06, + "loss": 2.2009, + "step": 17070 + }, + { + "epoch": 1.454956106707577, + "grad_norm": 37.54691902378459, + "learning_rate": 6.12551486035412e-06, + "loss": 2.542, + "step": 17071 + }, + { + "epoch": 1.4550413364016024, + "grad_norm": 52.771077904477266, + "learning_rate": 6.125031732939482e-06, + "loss": 3.3353, + "step": 17072 + }, + { + "epoch": 1.4551265660956276, + "grad_norm": 37.91945626374303, + "learning_rate": 6.124548594460603e-06, + "loss": 2.8455, + "step": 17073 + }, + { + "epoch": 1.455211795789653, + "grad_norm": 52.505068295489295, + "learning_rate": 6.124065444922235e-06, + "loss": 2.9213, + "step": 17074 + }, + { + "epoch": 1.4552970254836786, + "grad_norm": 91.867664185717, + "learning_rate": 6.1235822843291306e-06, + "loss": 2.5933, + "step": 17075 + }, + { + "epoch": 1.455382255177704, + "grad_norm": 90.66505301222315, + "learning_rate": 6.123099112686041e-06, + "loss": 2.9558, + "step": 17076 + }, + { + "epoch": 1.4554674848717293, + "grad_norm": 29.60705259299591, + "learning_rate": 6.122615929997717e-06, + "loss": 2.2308, + "step": 17077 + }, + { + "epoch": 1.4555527145657547, + "grad_norm": 65.41636390560916, + "learning_rate": 6.122132736268912e-06, + "loss": 2.4038, + "step": 17078 + }, + { + "epoch": 1.45563794425978, + "grad_norm": 106.66464274361424, + "learning_rate": 6.121649531504378e-06, + "loss": 2.9628, + "step": 17079 + }, + { + "epoch": 1.4557231739538055, + "grad_norm": 41.02758382934445, + "learning_rate": 6.121166315708866e-06, + "loss": 2.9156, + "step": 17080 + }, + { + "epoch": 1.455808403647831, + "grad_norm": 93.00053956985532, + "learning_rate": 6.120683088887128e-06, + "loss": 4.9494, + "step": 17081 + }, + { + "epoch": 1.4558936333418564, + "grad_norm": 37.10894859399829, + "learning_rate": 6.120199851043918e-06, + "loss": 2.4265, + "step": 17082 + }, + { + "epoch": 1.4559788630358816, + "grad_norm": 45.78629792280799, + "learning_rate": 6.119716602183988e-06, + "loss": 3.1071, + "step": 17083 + }, + { + "epoch": 1.4560640927299071, + "grad_norm": 31.719048321993505, + "learning_rate": 6.11923334231209e-06, + "loss": 2.2979, + "step": 17084 + }, + { + "epoch": 1.4561493224239326, + "grad_norm": 59.18496236184685, + "learning_rate": 6.118750071432975e-06, + "loss": 2.6798, + "step": 17085 + }, + { + "epoch": 1.4562345521179578, + "grad_norm": 50.6516457221676, + "learning_rate": 6.118266789551399e-06, + "loss": 3.578, + "step": 17086 + }, + { + "epoch": 1.4563197818119833, + "grad_norm": 32.21995814264073, + "learning_rate": 6.117783496672114e-06, + "loss": 2.5899, + "step": 17087 + }, + { + "epoch": 1.4564050115060088, + "grad_norm": 162.61177300328566, + "learning_rate": 6.117300192799872e-06, + "loss": 2.8416, + "step": 17088 + }, + { + "epoch": 1.4564902412000342, + "grad_norm": 42.54892885054176, + "learning_rate": 6.116816877939426e-06, + "loss": 2.5714, + "step": 17089 + }, + { + "epoch": 1.4565754708940595, + "grad_norm": 66.80432924618944, + "learning_rate": 6.116333552095532e-06, + "loss": 2.6718, + "step": 17090 + }, + { + "epoch": 1.456660700588085, + "grad_norm": 93.98143924108278, + "learning_rate": 6.115850215272939e-06, + "loss": 3.7554, + "step": 17091 + }, + { + "epoch": 1.4567459302821102, + "grad_norm": 42.36972234575073, + "learning_rate": 6.115366867476402e-06, + "loss": 3.8917, + "step": 17092 + }, + { + "epoch": 1.4568311599761357, + "grad_norm": 80.66231456033003, + "learning_rate": 6.114883508710676e-06, + "loss": 3.1572, + "step": 17093 + }, + { + "epoch": 1.4569163896701611, + "grad_norm": 42.24138352408639, + "learning_rate": 6.114400138980515e-06, + "loss": 3.148, + "step": 17094 + }, + { + "epoch": 1.4570016193641866, + "grad_norm": 73.97222890230461, + "learning_rate": 6.11391675829067e-06, + "loss": 3.5477, + "step": 17095 + }, + { + "epoch": 1.4570868490582118, + "grad_norm": 49.74166968648265, + "learning_rate": 6.1134333666458956e-06, + "loss": 2.4635, + "step": 17096 + }, + { + "epoch": 1.4571720787522373, + "grad_norm": 65.5904977648326, + "learning_rate": 6.112949964050947e-06, + "loss": 3.2254, + "step": 17097 + }, + { + "epoch": 1.4572573084462626, + "grad_norm": 57.84285131030968, + "learning_rate": 6.11246655051058e-06, + "loss": 2.8922, + "step": 17098 + }, + { + "epoch": 1.457342538140288, + "grad_norm": 69.55630717786516, + "learning_rate": 6.111983126029544e-06, + "loss": 2.6645, + "step": 17099 + }, + { + "epoch": 1.4574277678343135, + "grad_norm": 33.83769452267937, + "learning_rate": 6.111499690612596e-06, + "loss": 3.595, + "step": 17100 + }, + { + "epoch": 1.457512997528339, + "grad_norm": 47.02615624137873, + "learning_rate": 6.111016244264491e-06, + "loss": 3.0571, + "step": 17101 + }, + { + "epoch": 1.4575982272223642, + "grad_norm": 55.88758758706606, + "learning_rate": 6.110532786989982e-06, + "loss": 2.2227, + "step": 17102 + }, + { + "epoch": 1.4576834569163897, + "grad_norm": 28.169503401732232, + "learning_rate": 6.110049318793826e-06, + "loss": 2.3643, + "step": 17103 + }, + { + "epoch": 1.4577686866104151, + "grad_norm": 71.63799568155152, + "learning_rate": 6.109565839680773e-06, + "loss": 3.514, + "step": 17104 + }, + { + "epoch": 1.4578539163044404, + "grad_norm": 41.045002612117614, + "learning_rate": 6.109082349655584e-06, + "loss": 3.0311, + "step": 17105 + }, + { + "epoch": 1.4579391459984659, + "grad_norm": 23.09095770543914, + "learning_rate": 6.108598848723008e-06, + "loss": 1.5002, + "step": 17106 + }, + { + "epoch": 1.4580243756924913, + "grad_norm": 31.972656612550356, + "learning_rate": 6.108115336887806e-06, + "loss": 3.0397, + "step": 17107 + }, + { + "epoch": 1.4581096053865168, + "grad_norm": 28.088157303356155, + "learning_rate": 6.107631814154728e-06, + "loss": 2.1773, + "step": 17108 + }, + { + "epoch": 1.458194835080542, + "grad_norm": 40.333927258698715, + "learning_rate": 6.107148280528531e-06, + "loss": 3.305, + "step": 17109 + }, + { + "epoch": 1.4582800647745675, + "grad_norm": 49.70024091346952, + "learning_rate": 6.1066647360139704e-06, + "loss": 2.5422, + "step": 17110 + }, + { + "epoch": 1.4583652944685928, + "grad_norm": 108.7586754409426, + "learning_rate": 6.106181180615805e-06, + "loss": 3.1468, + "step": 17111 + }, + { + "epoch": 1.4584505241626182, + "grad_norm": 29.054472565237045, + "learning_rate": 6.1056976143387825e-06, + "loss": 2.3777, + "step": 17112 + }, + { + "epoch": 1.4585357538566437, + "grad_norm": 88.83895503103626, + "learning_rate": 6.1052140371876665e-06, + "loss": 3.2852, + "step": 17113 + }, + { + "epoch": 1.4586209835506692, + "grad_norm": 30.292978975491536, + "learning_rate": 6.1047304491672085e-06, + "loss": 2.0408, + "step": 17114 + }, + { + "epoch": 1.4587062132446944, + "grad_norm": 19.37021878586954, + "learning_rate": 6.104246850282166e-06, + "loss": 1.4291, + "step": 17115 + }, + { + "epoch": 1.4587914429387199, + "grad_norm": 67.04330007448937, + "learning_rate": 6.103763240537293e-06, + "loss": 3.1142, + "step": 17116 + }, + { + "epoch": 1.4588766726327451, + "grad_norm": 70.33736475003883, + "learning_rate": 6.103279619937349e-06, + "loss": 2.952, + "step": 17117 + }, + { + "epoch": 1.4589619023267706, + "grad_norm": 96.60696074613342, + "learning_rate": 6.102795988487089e-06, + "loss": 4.4198, + "step": 17118 + }, + { + "epoch": 1.459047132020796, + "grad_norm": 49.17198842898979, + "learning_rate": 6.102312346191267e-06, + "loss": 2.5436, + "step": 17119 + }, + { + "epoch": 1.4591323617148215, + "grad_norm": 44.05880552140979, + "learning_rate": 6.101828693054641e-06, + "loss": 2.1905, + "step": 17120 + }, + { + "epoch": 1.4592175914088468, + "grad_norm": 72.61049304822392, + "learning_rate": 6.1013450290819675e-06, + "loss": 4.3633, + "step": 17121 + }, + { + "epoch": 1.4593028211028722, + "grad_norm": 36.21719232546748, + "learning_rate": 6.100861354278005e-06, + "loss": 2.9112, + "step": 17122 + }, + { + "epoch": 1.4593880507968977, + "grad_norm": 56.80628463556677, + "learning_rate": 6.100377668647507e-06, + "loss": 2.9135, + "step": 17123 + }, + { + "epoch": 1.459473280490923, + "grad_norm": 40.01452029210108, + "learning_rate": 6.099893972195232e-06, + "loss": 3.0136, + "step": 17124 + }, + { + "epoch": 1.4595585101849484, + "grad_norm": 41.62702013379009, + "learning_rate": 6.099410264925937e-06, + "loss": 3.0605, + "step": 17125 + }, + { + "epoch": 1.459643739878974, + "grad_norm": 51.65336311358688, + "learning_rate": 6.098926546844379e-06, + "loss": 3.2416, + "step": 17126 + }, + { + "epoch": 1.4597289695729994, + "grad_norm": 62.16363228934893, + "learning_rate": 6.098442817955316e-06, + "loss": 3.5749, + "step": 17127 + }, + { + "epoch": 1.4598141992670246, + "grad_norm": 24.62516628122788, + "learning_rate": 6.0979590782635025e-06, + "loss": 1.997, + "step": 17128 + }, + { + "epoch": 1.45989942896105, + "grad_norm": 91.57386264823904, + "learning_rate": 6.097475327773699e-06, + "loss": 3.6769, + "step": 17129 + }, + { + "epoch": 1.4599846586550753, + "grad_norm": 19.965454605553603, + "learning_rate": 6.096991566490662e-06, + "loss": 1.4924, + "step": 17130 + }, + { + "epoch": 1.4600698883491008, + "grad_norm": 38.33686523587678, + "learning_rate": 6.096507794419148e-06, + "loss": 2.7659, + "step": 17131 + }, + { + "epoch": 1.4601551180431263, + "grad_norm": 41.27311357313403, + "learning_rate": 6.096024011563916e-06, + "loss": 3.2738, + "step": 17132 + }, + { + "epoch": 1.4602403477371517, + "grad_norm": 34.02297851937469, + "learning_rate": 6.095540217929723e-06, + "loss": 2.8593, + "step": 17133 + }, + { + "epoch": 1.460325577431177, + "grad_norm": 43.523021918522474, + "learning_rate": 6.095056413521327e-06, + "loss": 2.3376, + "step": 17134 + }, + { + "epoch": 1.4604108071252024, + "grad_norm": 62.18197478694496, + "learning_rate": 6.094572598343487e-06, + "loss": 2.9125, + "step": 17135 + }, + { + "epoch": 1.4604960368192277, + "grad_norm": 65.05254879734264, + "learning_rate": 6.09408877240096e-06, + "loss": 3.5514, + "step": 17136 + }, + { + "epoch": 1.4605812665132532, + "grad_norm": 75.67806677416469, + "learning_rate": 6.093604935698506e-06, + "loss": 2.9706, + "step": 17137 + }, + { + "epoch": 1.4606664962072786, + "grad_norm": 36.53783926008209, + "learning_rate": 6.0931210882408805e-06, + "loss": 2.5104, + "step": 17138 + }, + { + "epoch": 1.460751725901304, + "grad_norm": 53.73112321779111, + "learning_rate": 6.092637230032844e-06, + "loss": 2.6124, + "step": 17139 + }, + { + "epoch": 1.4608369555953293, + "grad_norm": 45.08935043306643, + "learning_rate": 6.092153361079154e-06, + "loss": 2.1243, + "step": 17140 + }, + { + "epoch": 1.4609221852893548, + "grad_norm": 83.85206915968749, + "learning_rate": 6.09166948138457e-06, + "loss": 2.0699, + "step": 17141 + }, + { + "epoch": 1.4610074149833803, + "grad_norm": 34.37307282780051, + "learning_rate": 6.091185590953851e-06, + "loss": 2.5745, + "step": 17142 + }, + { + "epoch": 1.4610926446774055, + "grad_norm": 32.0642292839831, + "learning_rate": 6.0907016897917535e-06, + "loss": 2.0709, + "step": 17143 + }, + { + "epoch": 1.461177874371431, + "grad_norm": 41.00633171643952, + "learning_rate": 6.090217777903041e-06, + "loss": 2.245, + "step": 17144 + }, + { + "epoch": 1.4612631040654565, + "grad_norm": 42.63685755881294, + "learning_rate": 6.089733855292468e-06, + "loss": 2.516, + "step": 17145 + }, + { + "epoch": 1.461348333759482, + "grad_norm": 35.21238639404802, + "learning_rate": 6.089249921964797e-06, + "loss": 2.7712, + "step": 17146 + }, + { + "epoch": 1.4614335634535072, + "grad_norm": 53.98202958827193, + "learning_rate": 6.0887659779247836e-06, + "loss": 3.3282, + "step": 17147 + }, + { + "epoch": 1.4615187931475326, + "grad_norm": 32.066392486807686, + "learning_rate": 6.088282023177191e-06, + "loss": 2.2656, + "step": 17148 + }, + { + "epoch": 1.4616040228415579, + "grad_norm": 44.51452572097076, + "learning_rate": 6.087798057726777e-06, + "loss": 2.6336, + "step": 17149 + }, + { + "epoch": 1.4616892525355833, + "grad_norm": 70.72841989182622, + "learning_rate": 6.0873140815783015e-06, + "loss": 1.6147, + "step": 17150 + }, + { + "epoch": 1.4617744822296088, + "grad_norm": 36.89008768695936, + "learning_rate": 6.086830094736523e-06, + "loss": 2.0016, + "step": 17151 + }, + { + "epoch": 1.4618597119236343, + "grad_norm": 69.95884012148237, + "learning_rate": 6.086346097206204e-06, + "loss": 3.0603, + "step": 17152 + }, + { + "epoch": 1.4619449416176595, + "grad_norm": 39.76002878141899, + "learning_rate": 6.085862088992102e-06, + "loss": 3.4736, + "step": 17153 + }, + { + "epoch": 1.462030171311685, + "grad_norm": 34.33026046609816, + "learning_rate": 6.085378070098978e-06, + "loss": 2.8304, + "step": 17154 + }, + { + "epoch": 1.4621154010057105, + "grad_norm": 25.475346431249093, + "learning_rate": 6.084894040531591e-06, + "loss": 2.3799, + "step": 17155 + }, + { + "epoch": 1.4622006306997357, + "grad_norm": 62.56199012099606, + "learning_rate": 6.084410000294703e-06, + "loss": 2.8, + "step": 17156 + }, + { + "epoch": 1.4622858603937612, + "grad_norm": 91.67081559026236, + "learning_rate": 6.083925949393074e-06, + "loss": 2.8964, + "step": 17157 + }, + { + "epoch": 1.4623710900877867, + "grad_norm": 54.28732705473192, + "learning_rate": 6.083441887831464e-06, + "loss": 2.4598, + "step": 17158 + }, + { + "epoch": 1.4624563197818121, + "grad_norm": 42.80558683852939, + "learning_rate": 6.082957815614632e-06, + "loss": 2.3969, + "step": 17159 + }, + { + "epoch": 1.4625415494758374, + "grad_norm": 55.819867973829496, + "learning_rate": 6.082473732747341e-06, + "loss": 3.1624, + "step": 17160 + }, + { + "epoch": 1.4626267791698628, + "grad_norm": 43.0427967518557, + "learning_rate": 6.081989639234353e-06, + "loss": 2.1424, + "step": 17161 + }, + { + "epoch": 1.462712008863888, + "grad_norm": 30.837575160849116, + "learning_rate": 6.0815055350804245e-06, + "loss": 2.5432, + "step": 17162 + }, + { + "epoch": 1.4627972385579135, + "grad_norm": 28.702868289159948, + "learning_rate": 6.081021420290318e-06, + "loss": 2.92, + "step": 17163 + }, + { + "epoch": 1.462882468251939, + "grad_norm": 43.75305436113321, + "learning_rate": 6.080537294868797e-06, + "loss": 2.419, + "step": 17164 + }, + { + "epoch": 1.4629676979459645, + "grad_norm": 30.471886405917807, + "learning_rate": 6.080053158820621e-06, + "loss": 2.9045, + "step": 17165 + }, + { + "epoch": 1.4630529276399897, + "grad_norm": 91.30936308481816, + "learning_rate": 6.079569012150552e-06, + "loss": 4.1299, + "step": 17166 + }, + { + "epoch": 1.4631381573340152, + "grad_norm": 54.618039732337095, + "learning_rate": 6.07908485486335e-06, + "loss": 2.7932, + "step": 17167 + }, + { + "epoch": 1.4632233870280404, + "grad_norm": 47.929104504473486, + "learning_rate": 6.078600686963776e-06, + "loss": 2.7557, + "step": 17168 + }, + { + "epoch": 1.463308616722066, + "grad_norm": 72.14270536573127, + "learning_rate": 6.078116508456595e-06, + "loss": 3.2834, + "step": 17169 + }, + { + "epoch": 1.4633938464160914, + "grad_norm": 26.65203014000792, + "learning_rate": 6.0776323193465645e-06, + "loss": 2.0103, + "step": 17170 + }, + { + "epoch": 1.4634790761101168, + "grad_norm": 32.16172578360066, + "learning_rate": 6.07714811963845e-06, + "loss": 2.218, + "step": 17171 + }, + { + "epoch": 1.463564305804142, + "grad_norm": 42.167741983016946, + "learning_rate": 6.076663909337011e-06, + "loss": 3.3873, + "step": 17172 + }, + { + "epoch": 1.4636495354981676, + "grad_norm": 34.9312417653249, + "learning_rate": 6.07617968844701e-06, + "loss": 2.3112, + "step": 17173 + }, + { + "epoch": 1.463734765192193, + "grad_norm": 64.90859854936454, + "learning_rate": 6.075695456973211e-06, + "loss": 3.4127, + "step": 17174 + }, + { + "epoch": 1.4638199948862183, + "grad_norm": 75.79906595192533, + "learning_rate": 6.0752112149203726e-06, + "loss": 3.917, + "step": 17175 + }, + { + "epoch": 1.4639052245802437, + "grad_norm": 47.55678063236392, + "learning_rate": 6.07472696229326e-06, + "loss": 3.1762, + "step": 17176 + }, + { + "epoch": 1.4639904542742692, + "grad_norm": 45.864877954541406, + "learning_rate": 6.074242699096635e-06, + "loss": 2.4779, + "step": 17177 + }, + { + "epoch": 1.4640756839682947, + "grad_norm": 63.15881217916389, + "learning_rate": 6.07375842533526e-06, + "loss": 3.3785, + "step": 17178 + }, + { + "epoch": 1.46416091366232, + "grad_norm": 91.83603911613379, + "learning_rate": 6.0732741410138965e-06, + "loss": 4.103, + "step": 17179 + }, + { + "epoch": 1.4642461433563454, + "grad_norm": 114.8228176197593, + "learning_rate": 6.07278984613731e-06, + "loss": 3.1771, + "step": 17180 + }, + { + "epoch": 1.4643313730503706, + "grad_norm": 36.356981067808285, + "learning_rate": 6.0723055407102605e-06, + "loss": 2.359, + "step": 17181 + }, + { + "epoch": 1.4644166027443961, + "grad_norm": 53.82505129023763, + "learning_rate": 6.0718212247375115e-06, + "loss": 3.3311, + "step": 17182 + }, + { + "epoch": 1.4645018324384216, + "grad_norm": 30.675806764036537, + "learning_rate": 6.071336898223827e-06, + "loss": 2.045, + "step": 17183 + }, + { + "epoch": 1.464587062132447, + "grad_norm": 43.0570901838596, + "learning_rate": 6.070852561173971e-06, + "loss": 3.1542, + "step": 17184 + }, + { + "epoch": 1.4646722918264723, + "grad_norm": 33.6755018527534, + "learning_rate": 6.0703682135927055e-06, + "loss": 2.1315, + "step": 17185 + }, + { + "epoch": 1.4647575215204978, + "grad_norm": 42.492884065486216, + "learning_rate": 6.069883855484793e-06, + "loss": 3.3957, + "step": 17186 + }, + { + "epoch": 1.464842751214523, + "grad_norm": 65.11834930679431, + "learning_rate": 6.0693994868549975e-06, + "loss": 3.5364, + "step": 17187 + }, + { + "epoch": 1.4649279809085485, + "grad_norm": 87.97653288245932, + "learning_rate": 6.068915107708084e-06, + "loss": 4.8461, + "step": 17188 + }, + { + "epoch": 1.465013210602574, + "grad_norm": 21.46975135785851, + "learning_rate": 6.0684307180488146e-06, + "loss": 1.2819, + "step": 17189 + }, + { + "epoch": 1.4650984402965994, + "grad_norm": 70.9604417118204, + "learning_rate": 6.067946317881955e-06, + "loss": 2.8076, + "step": 17190 + }, + { + "epoch": 1.4651836699906247, + "grad_norm": 45.09756326256479, + "learning_rate": 6.067461907212266e-06, + "loss": 2.5958, + "step": 17191 + }, + { + "epoch": 1.4652688996846501, + "grad_norm": 41.6271954360576, + "learning_rate": 6.0669774860445145e-06, + "loss": 1.8256, + "step": 17192 + }, + { + "epoch": 1.4653541293786756, + "grad_norm": 60.718064339753994, + "learning_rate": 6.0664930543834635e-06, + "loss": 2.0549, + "step": 17193 + }, + { + "epoch": 1.4654393590727008, + "grad_norm": 30.469124667070027, + "learning_rate": 6.066008612233875e-06, + "loss": 1.7628, + "step": 17194 + }, + { + "epoch": 1.4655245887667263, + "grad_norm": 33.88725116647969, + "learning_rate": 6.065524159600517e-06, + "loss": 2.1077, + "step": 17195 + }, + { + "epoch": 1.4656098184607518, + "grad_norm": 53.960089872697765, + "learning_rate": 6.065039696488152e-06, + "loss": 3.0452, + "step": 17196 + }, + { + "epoch": 1.4656950481547772, + "grad_norm": 31.60835217349276, + "learning_rate": 6.064555222901546e-06, + "loss": 2.2787, + "step": 17197 + }, + { + "epoch": 1.4657802778488025, + "grad_norm": 47.06707867286369, + "learning_rate": 6.0640707388454605e-06, + "loss": 2.9587, + "step": 17198 + }, + { + "epoch": 1.465865507542828, + "grad_norm": 88.58008677843705, + "learning_rate": 6.063586244324663e-06, + "loss": 3.8999, + "step": 17199 + }, + { + "epoch": 1.4659507372368532, + "grad_norm": 53.13420743600678, + "learning_rate": 6.063101739343917e-06, + "loss": 2.4302, + "step": 17200 + }, + { + "epoch": 1.4660359669308787, + "grad_norm": 37.562365569963625, + "learning_rate": 6.0626172239079885e-06, + "loss": 2.1224, + "step": 17201 + }, + { + "epoch": 1.4661211966249041, + "grad_norm": 98.6530159633779, + "learning_rate": 6.06213269802164e-06, + "loss": 3.4033, + "step": 17202 + }, + { + "epoch": 1.4662064263189296, + "grad_norm": 32.16280751043296, + "learning_rate": 6.061648161689641e-06, + "loss": 2.7798, + "step": 17203 + }, + { + "epoch": 1.4662916560129549, + "grad_norm": 72.35493833005509, + "learning_rate": 6.061163614916753e-06, + "loss": 4.1872, + "step": 17204 + }, + { + "epoch": 1.4663768857069803, + "grad_norm": 44.344680235102174, + "learning_rate": 6.0606790577077435e-06, + "loss": 2.3354, + "step": 17205 + }, + { + "epoch": 1.4664621154010056, + "grad_norm": 48.55462002126583, + "learning_rate": 6.060194490067375e-06, + "loss": 3.4377, + "step": 17206 + }, + { + "epoch": 1.466547345095031, + "grad_norm": 57.35569401805212, + "learning_rate": 6.059709912000415e-06, + "loss": 3.0176, + "step": 17207 + }, + { + "epoch": 1.4666325747890565, + "grad_norm": 29.810623179488143, + "learning_rate": 6.059225323511632e-06, + "loss": 2.1436, + "step": 17208 + }, + { + "epoch": 1.466717804483082, + "grad_norm": 56.91136866131407, + "learning_rate": 6.058740724605787e-06, + "loss": 3.262, + "step": 17209 + }, + { + "epoch": 1.4668030341771072, + "grad_norm": 34.902840084459434, + "learning_rate": 6.0582561152876475e-06, + "loss": 2.1157, + "step": 17210 + }, + { + "epoch": 1.4668882638711327, + "grad_norm": 24.505333665847413, + "learning_rate": 6.0577714955619796e-06, + "loss": 2.3261, + "step": 17211 + }, + { + "epoch": 1.4669734935651582, + "grad_norm": 63.94885275509795, + "learning_rate": 6.057286865433549e-06, + "loss": 3.8029, + "step": 17212 + }, + { + "epoch": 1.4670587232591834, + "grad_norm": 69.97361067891948, + "learning_rate": 6.056802224907124e-06, + "loss": 2.8521, + "step": 17213 + }, + { + "epoch": 1.4671439529532089, + "grad_norm": 76.03502065989123, + "learning_rate": 6.056317573987467e-06, + "loss": 3.4013, + "step": 17214 + }, + { + "epoch": 1.4672291826472343, + "grad_norm": 44.801584958466044, + "learning_rate": 6.055832912679347e-06, + "loss": 2.2962, + "step": 17215 + }, + { + "epoch": 1.4673144123412598, + "grad_norm": 49.65913159211757, + "learning_rate": 6.055348240987531e-06, + "loss": 3.3662, + "step": 17216 + }, + { + "epoch": 1.467399642035285, + "grad_norm": 23.76026107808454, + "learning_rate": 6.054863558916782e-06, + "loss": 1.6661, + "step": 17217 + }, + { + "epoch": 1.4674848717293105, + "grad_norm": 50.51844946639223, + "learning_rate": 6.05437886647187e-06, + "loss": 2.8104, + "step": 17218 + }, + { + "epoch": 1.4675701014233358, + "grad_norm": 34.51827402695981, + "learning_rate": 6.053894163657563e-06, + "loss": 2.7126, + "step": 17219 + }, + { + "epoch": 1.4676553311173612, + "grad_norm": 46.37457055887736, + "learning_rate": 6.053409450478623e-06, + "loss": 1.9023, + "step": 17220 + }, + { + "epoch": 1.4677405608113867, + "grad_norm": 53.08568158865017, + "learning_rate": 6.0529247269398205e-06, + "loss": 2.8498, + "step": 17221 + }, + { + "epoch": 1.4678257905054122, + "grad_norm": 38.71814192573184, + "learning_rate": 6.052439993045922e-06, + "loss": 2.7122, + "step": 17222 + }, + { + "epoch": 1.4679110201994374, + "grad_norm": 63.23222668179963, + "learning_rate": 6.051955248801696e-06, + "loss": 2.6127, + "step": 17223 + }, + { + "epoch": 1.467996249893463, + "grad_norm": 41.556533189524494, + "learning_rate": 6.051470494211905e-06, + "loss": 2.3251, + "step": 17224 + }, + { + "epoch": 1.4680814795874884, + "grad_norm": 34.13922782112193, + "learning_rate": 6.05098572928132e-06, + "loss": 2.6344, + "step": 17225 + }, + { + "epoch": 1.4681667092815136, + "grad_norm": 30.50317545264929, + "learning_rate": 6.0505009540147085e-06, + "loss": 2.4159, + "step": 17226 + }, + { + "epoch": 1.468251938975539, + "grad_norm": 55.81625691992711, + "learning_rate": 6.050016168416839e-06, + "loss": 2.9969, + "step": 17227 + }, + { + "epoch": 1.4683371686695645, + "grad_norm": 33.670512994460466, + "learning_rate": 6.049531372492476e-06, + "loss": 3.1291, + "step": 17228 + }, + { + "epoch": 1.46842239836359, + "grad_norm": 81.19789987845162, + "learning_rate": 6.049046566246389e-06, + "loss": 2.777, + "step": 17229 + }, + { + "epoch": 1.4685076280576153, + "grad_norm": 64.7230803072956, + "learning_rate": 6.048561749683347e-06, + "loss": 2.8427, + "step": 17230 + }, + { + "epoch": 1.4685928577516407, + "grad_norm": 52.308671570923835, + "learning_rate": 6.048076922808117e-06, + "loss": 2.912, + "step": 17231 + }, + { + "epoch": 1.468678087445666, + "grad_norm": 26.470187456490223, + "learning_rate": 6.0475920856254655e-06, + "loss": 2.276, + "step": 17232 + }, + { + "epoch": 1.4687633171396914, + "grad_norm": 84.62239224305576, + "learning_rate": 6.047107238140163e-06, + "loss": 3.5936, + "step": 17233 + }, + { + "epoch": 1.468848546833717, + "grad_norm": 74.27127018492116, + "learning_rate": 6.046622380356977e-06, + "loss": 3.5034, + "step": 17234 + }, + { + "epoch": 1.4689337765277424, + "grad_norm": 112.38694834418794, + "learning_rate": 6.0461375122806755e-06, + "loss": 4.4512, + "step": 17235 + }, + { + "epoch": 1.4690190062217676, + "grad_norm": 50.26479348422416, + "learning_rate": 6.045652633916028e-06, + "loss": 2.3138, + "step": 17236 + }, + { + "epoch": 1.469104235915793, + "grad_norm": 61.10126430668165, + "learning_rate": 6.0451677452678e-06, + "loss": 2.4873, + "step": 17237 + }, + { + "epoch": 1.4691894656098183, + "grad_norm": 61.660168300606614, + "learning_rate": 6.0446828463407645e-06, + "loss": 2.8939, + "step": 17238 + }, + { + "epoch": 1.4692746953038438, + "grad_norm": 24.082331379385252, + "learning_rate": 6.0441979371396886e-06, + "loss": 1.8248, + "step": 17239 + }, + { + "epoch": 1.4693599249978693, + "grad_norm": 65.7320329027516, + "learning_rate": 6.04371301766934e-06, + "loss": 3.4729, + "step": 17240 + }, + { + "epoch": 1.4694451546918947, + "grad_norm": 41.808725125085864, + "learning_rate": 6.043228087934488e-06, + "loss": 3.7139, + "step": 17241 + }, + { + "epoch": 1.46953038438592, + "grad_norm": 36.430090900021504, + "learning_rate": 6.042743147939905e-06, + "loss": 3.1288, + "step": 17242 + }, + { + "epoch": 1.4696156140799455, + "grad_norm": 121.60598784994531, + "learning_rate": 6.042258197690355e-06, + "loss": 3.2195, + "step": 17243 + }, + { + "epoch": 1.469700843773971, + "grad_norm": 104.5017617669318, + "learning_rate": 6.041773237190611e-06, + "loss": 3.5539, + "step": 17244 + }, + { + "epoch": 1.4697860734679962, + "grad_norm": 26.456984621261736, + "learning_rate": 6.041288266445439e-06, + "loss": 2.1083, + "step": 17245 + }, + { + "epoch": 1.4698713031620216, + "grad_norm": 69.56427393938249, + "learning_rate": 6.040803285459613e-06, + "loss": 3.0907, + "step": 17246 + }, + { + "epoch": 1.469956532856047, + "grad_norm": 35.769911749125235, + "learning_rate": 6.0403182942378995e-06, + "loss": 2.5561, + "step": 17247 + }, + { + "epoch": 1.4700417625500726, + "grad_norm": 65.76783603810749, + "learning_rate": 6.03983329278507e-06, + "loss": 3.0993, + "step": 17248 + }, + { + "epoch": 1.4701269922440978, + "grad_norm": 72.92156049304465, + "learning_rate": 6.039348281105891e-06, + "loss": 3.2334, + "step": 17249 + }, + { + "epoch": 1.4702122219381233, + "grad_norm": 49.21046115416181, + "learning_rate": 6.038863259205137e-06, + "loss": 1.661, + "step": 17250 + }, + { + "epoch": 1.4702974516321485, + "grad_norm": 117.08955709559993, + "learning_rate": 6.038378227087574e-06, + "loss": 4.5371, + "step": 17251 + }, + { + "epoch": 1.470382681326174, + "grad_norm": 376.45386097693915, + "learning_rate": 6.037893184757974e-06, + "loss": 3.2193, + "step": 17252 + }, + { + "epoch": 1.4704679110201995, + "grad_norm": 47.63889625128946, + "learning_rate": 6.037408132221107e-06, + "loss": 2.594, + "step": 17253 + }, + { + "epoch": 1.470553140714225, + "grad_norm": 117.12141612055389, + "learning_rate": 6.0369230694817435e-06, + "loss": 3.2239, + "step": 17254 + }, + { + "epoch": 1.4706383704082502, + "grad_norm": 41.88102601741904, + "learning_rate": 6.0364379965446554e-06, + "loss": 3.2766, + "step": 17255 + }, + { + "epoch": 1.4707236001022757, + "grad_norm": 42.96695651373092, + "learning_rate": 6.035952913414609e-06, + "loss": 2.5184, + "step": 17256 + }, + { + "epoch": 1.470808829796301, + "grad_norm": 31.605095656219333, + "learning_rate": 6.035467820096377e-06, + "loss": 3.1571, + "step": 17257 + }, + { + "epoch": 1.4708940594903264, + "grad_norm": 67.6020984033575, + "learning_rate": 6.034982716594732e-06, + "loss": 2.8708, + "step": 17258 + }, + { + "epoch": 1.4709792891843518, + "grad_norm": 31.359929452966032, + "learning_rate": 6.034497602914443e-06, + "loss": 2.0202, + "step": 17259 + }, + { + "epoch": 1.4710645188783773, + "grad_norm": 61.76532704951265, + "learning_rate": 6.03401247906028e-06, + "loss": 2.5088, + "step": 17260 + }, + { + "epoch": 1.4711497485724025, + "grad_norm": 31.248783870505022, + "learning_rate": 6.033527345037016e-06, + "loss": 1.882, + "step": 17261 + }, + { + "epoch": 1.471234978266428, + "grad_norm": 46.97636454921073, + "learning_rate": 6.033042200849422e-06, + "loss": 2.417, + "step": 17262 + }, + { + "epoch": 1.4713202079604535, + "grad_norm": 33.18037674717259, + "learning_rate": 6.0325570465022676e-06, + "loss": 2.9389, + "step": 17263 + }, + { + "epoch": 1.4714054376544787, + "grad_norm": 71.81395397529727, + "learning_rate": 6.032071882000324e-06, + "loss": 2.2265, + "step": 17264 + }, + { + "epoch": 1.4714906673485042, + "grad_norm": 31.588447844965792, + "learning_rate": 6.0315867073483645e-06, + "loss": 2.1725, + "step": 17265 + }, + { + "epoch": 1.4715758970425297, + "grad_norm": 45.32851087717235, + "learning_rate": 6.031101522551162e-06, + "loss": 2.5036, + "step": 17266 + }, + { + "epoch": 1.4716611267365551, + "grad_norm": 47.085482781758955, + "learning_rate": 6.030616327613482e-06, + "loss": 2.9286, + "step": 17267 + }, + { + "epoch": 1.4717463564305804, + "grad_norm": 87.83378862977875, + "learning_rate": 6.030131122540101e-06, + "loss": 3.0817, + "step": 17268 + }, + { + "epoch": 1.4718315861246059, + "grad_norm": 43.705016771417085, + "learning_rate": 6.0296459073357906e-06, + "loss": 3.1706, + "step": 17269 + }, + { + "epoch": 1.471916815818631, + "grad_norm": 32.87734076682546, + "learning_rate": 6.029160682005322e-06, + "loss": 2.529, + "step": 17270 + }, + { + "epoch": 1.4720020455126566, + "grad_norm": 42.402511065945966, + "learning_rate": 6.028675446553466e-06, + "loss": 1.3531, + "step": 17271 + }, + { + "epoch": 1.472087275206682, + "grad_norm": 47.45177722411986, + "learning_rate": 6.028190200984996e-06, + "loss": 3.1269, + "step": 17272 + }, + { + "epoch": 1.4721725049007075, + "grad_norm": 39.33277430735383, + "learning_rate": 6.027704945304683e-06, + "loss": 2.1791, + "step": 17273 + }, + { + "epoch": 1.4722577345947327, + "grad_norm": 82.29828621550213, + "learning_rate": 6.027219679517302e-06, + "loss": 3.1498, + "step": 17274 + }, + { + "epoch": 1.4723429642887582, + "grad_norm": 31.32803348339804, + "learning_rate": 6.026734403627623e-06, + "loss": 2.7203, + "step": 17275 + }, + { + "epoch": 1.4724281939827837, + "grad_norm": 98.74924973016813, + "learning_rate": 6.026249117640419e-06, + "loss": 3.6796, + "step": 17276 + }, + { + "epoch": 1.472513423676809, + "grad_norm": 47.42977448708705, + "learning_rate": 6.025763821560463e-06, + "loss": 2.4397, + "step": 17277 + }, + { + "epoch": 1.4725986533708344, + "grad_norm": 81.22627085918592, + "learning_rate": 6.025278515392528e-06, + "loss": 3.7518, + "step": 17278 + }, + { + "epoch": 1.4726838830648599, + "grad_norm": 68.22795220692315, + "learning_rate": 6.024793199141386e-06, + "loss": 3.5511, + "step": 17279 + }, + { + "epoch": 1.4727691127588851, + "grad_norm": 46.590081246590984, + "learning_rate": 6.024307872811807e-06, + "loss": 2.8272, + "step": 17280 + }, + { + "epoch": 1.4728543424529106, + "grad_norm": 49.5029965718856, + "learning_rate": 6.023822536408571e-06, + "loss": 2.619, + "step": 17281 + }, + { + "epoch": 1.472939572146936, + "grad_norm": 80.20109994927769, + "learning_rate": 6.0233371899364454e-06, + "loss": 4.0879, + "step": 17282 + }, + { + "epoch": 1.4730248018409613, + "grad_norm": 39.84403235630421, + "learning_rate": 6.0228518334002074e-06, + "loss": 2.3714, + "step": 17283 + }, + { + "epoch": 1.4731100315349868, + "grad_norm": 39.27326827380854, + "learning_rate": 6.022366466804625e-06, + "loss": 1.9813, + "step": 17284 + }, + { + "epoch": 1.4731952612290122, + "grad_norm": 90.9996739469635, + "learning_rate": 6.0218810901544765e-06, + "loss": 2.9785, + "step": 17285 + }, + { + "epoch": 1.4732804909230377, + "grad_norm": 31.045649943057448, + "learning_rate": 6.021395703454532e-06, + "loss": 2.4738, + "step": 17286 + }, + { + "epoch": 1.473365720617063, + "grad_norm": 37.93043615936959, + "learning_rate": 6.020910306709569e-06, + "loss": 2.7702, + "step": 17287 + }, + { + "epoch": 1.4734509503110884, + "grad_norm": 47.433905379462246, + "learning_rate": 6.020424899924355e-06, + "loss": 3.208, + "step": 17288 + }, + { + "epoch": 1.4735361800051137, + "grad_norm": 29.971906523560992, + "learning_rate": 6.019939483103672e-06, + "loss": 2.2871, + "step": 17289 + }, + { + "epoch": 1.4736214096991391, + "grad_norm": 40.56667422912764, + "learning_rate": 6.019454056252286e-06, + "loss": 2.668, + "step": 17290 + }, + { + "epoch": 1.4737066393931646, + "grad_norm": 44.189263585047634, + "learning_rate": 6.018968619374977e-06, + "loss": 3.0778, + "step": 17291 + }, + { + "epoch": 1.47379186908719, + "grad_norm": 42.93533963016517, + "learning_rate": 6.018483172476515e-06, + "loss": 2.3755, + "step": 17292 + }, + { + "epoch": 1.4738770987812153, + "grad_norm": 71.26614315790106, + "learning_rate": 6.017997715561676e-06, + "loss": 3.106, + "step": 17293 + }, + { + "epoch": 1.4739623284752408, + "grad_norm": 23.995554898628384, + "learning_rate": 6.017512248635235e-06, + "loss": 2.2208, + "step": 17294 + }, + { + "epoch": 1.4740475581692662, + "grad_norm": 60.65648123943848, + "learning_rate": 6.017026771701963e-06, + "loss": 2.6347, + "step": 17295 + }, + { + "epoch": 1.4741327878632915, + "grad_norm": 27.048594096275693, + "learning_rate": 6.0165412847666395e-06, + "loss": 2.6757, + "step": 17296 + }, + { + "epoch": 1.474218017557317, + "grad_norm": 39.07976254306626, + "learning_rate": 6.016055787834034e-06, + "loss": 2.9593, + "step": 17297 + }, + { + "epoch": 1.4743032472513424, + "grad_norm": 84.33632722161437, + "learning_rate": 6.015570280908926e-06, + "loss": 4.0632, + "step": 17298 + }, + { + "epoch": 1.474388476945368, + "grad_norm": 82.9749150510759, + "learning_rate": 6.015084763996086e-06, + "loss": 2.978, + "step": 17299 + }, + { + "epoch": 1.4744737066393931, + "grad_norm": 51.8513088202061, + "learning_rate": 6.014599237100292e-06, + "loss": 2.2696, + "step": 17300 + }, + { + "epoch": 1.4745589363334186, + "grad_norm": 25.73914250069452, + "learning_rate": 6.014113700226315e-06, + "loss": 2.114, + "step": 17301 + }, + { + "epoch": 1.4746441660274439, + "grad_norm": 78.70622980520564, + "learning_rate": 6.0136281533789366e-06, + "loss": 3.8, + "step": 17302 + }, + { + "epoch": 1.4747293957214693, + "grad_norm": 49.660488329019124, + "learning_rate": 6.013142596562925e-06, + "loss": 3.6427, + "step": 17303 + }, + { + "epoch": 1.4748146254154948, + "grad_norm": 74.08422329372081, + "learning_rate": 6.01265702978306e-06, + "loss": 3.3092, + "step": 17304 + }, + { + "epoch": 1.4748998551095203, + "grad_norm": 37.78546326060783, + "learning_rate": 6.012171453044115e-06, + "loss": 2.9243, + "step": 17305 + }, + { + "epoch": 1.4749850848035455, + "grad_norm": 51.76632936808026, + "learning_rate": 6.011685866350866e-06, + "loss": 1.8905, + "step": 17306 + }, + { + "epoch": 1.475070314497571, + "grad_norm": 26.447845887420783, + "learning_rate": 6.011200269708088e-06, + "loss": 1.9499, + "step": 17307 + }, + { + "epoch": 1.4751555441915962, + "grad_norm": 30.15624879906975, + "learning_rate": 6.0107146631205585e-06, + "loss": 2.1029, + "step": 17308 + }, + { + "epoch": 1.4752407738856217, + "grad_norm": 40.27078505981137, + "learning_rate": 6.010229046593051e-06, + "loss": 2.4469, + "step": 17309 + }, + { + "epoch": 1.4753260035796472, + "grad_norm": 68.9486388255938, + "learning_rate": 6.009743420130343e-06, + "loss": 2.3106, + "step": 17310 + }, + { + "epoch": 1.4754112332736726, + "grad_norm": 36.834375704555896, + "learning_rate": 6.009257783737209e-06, + "loss": 3.5844, + "step": 17311 + }, + { + "epoch": 1.4754964629676979, + "grad_norm": 76.82188724875921, + "learning_rate": 6.008772137418425e-06, + "loss": 2.4847, + "step": 17312 + }, + { + "epoch": 1.4755816926617233, + "grad_norm": 52.80224148131509, + "learning_rate": 6.00828648117877e-06, + "loss": 2.9913, + "step": 17313 + }, + { + "epoch": 1.4756669223557488, + "grad_norm": 66.75221919129575, + "learning_rate": 6.007800815023016e-06, + "loss": 3.2417, + "step": 17314 + }, + { + "epoch": 1.475752152049774, + "grad_norm": 59.68703695304229, + "learning_rate": 6.0073151389559425e-06, + "loss": 3.5362, + "step": 17315 + }, + { + "epoch": 1.4758373817437995, + "grad_norm": 30.628796388244876, + "learning_rate": 6.006829452982325e-06, + "loss": 1.961, + "step": 17316 + }, + { + "epoch": 1.475922611437825, + "grad_norm": 39.02165598659621, + "learning_rate": 6.006343757106939e-06, + "loss": 2.4072, + "step": 17317 + }, + { + "epoch": 1.4760078411318505, + "grad_norm": 35.836172166006584, + "learning_rate": 6.0058580513345636e-06, + "loss": 2.7903, + "step": 17318 + }, + { + "epoch": 1.4760930708258757, + "grad_norm": 35.339725503028774, + "learning_rate": 6.005372335669972e-06, + "loss": 2.1932, + "step": 17319 + }, + { + "epoch": 1.4761783005199012, + "grad_norm": 93.61867129849388, + "learning_rate": 6.004886610117944e-06, + "loss": 3.2261, + "step": 17320 + }, + { + "epoch": 1.4762635302139264, + "grad_norm": 45.94245981627091, + "learning_rate": 6.004400874683256e-06, + "loss": 3.3659, + "step": 17321 + }, + { + "epoch": 1.476348759907952, + "grad_norm": 52.45236538033408, + "learning_rate": 6.003915129370684e-06, + "loss": 3.0255, + "step": 17322 + }, + { + "epoch": 1.4764339896019774, + "grad_norm": 40.02441546825346, + "learning_rate": 6.003429374185005e-06, + "loss": 2.5673, + "step": 17323 + }, + { + "epoch": 1.4765192192960028, + "grad_norm": 54.825482872584736, + "learning_rate": 6.002943609130998e-06, + "loss": 3.6118, + "step": 17324 + }, + { + "epoch": 1.476604448990028, + "grad_norm": 41.15152268260825, + "learning_rate": 6.002457834213439e-06, + "loss": 2.7729, + "step": 17325 + }, + { + "epoch": 1.4766896786840535, + "grad_norm": 75.29697913822146, + "learning_rate": 6.001972049437106e-06, + "loss": 4.2095, + "step": 17326 + }, + { + "epoch": 1.4767749083780788, + "grad_norm": 38.19341052454658, + "learning_rate": 6.001486254806773e-06, + "loss": 4.5734, + "step": 17327 + }, + { + "epoch": 1.4768601380721043, + "grad_norm": 30.00652680172717, + "learning_rate": 6.001000450327224e-06, + "loss": 2.0172, + "step": 17328 + }, + { + "epoch": 1.4769453677661297, + "grad_norm": 93.6288692505636, + "learning_rate": 6.000514636003232e-06, + "loss": 2.6988, + "step": 17329 + }, + { + "epoch": 1.4770305974601552, + "grad_norm": 59.71249791664938, + "learning_rate": 6.000028811839576e-06, + "loss": 3.5506, + "step": 17330 + }, + { + "epoch": 1.4771158271541804, + "grad_norm": 93.62416287472549, + "learning_rate": 5.999542977841033e-06, + "loss": 2.8489, + "step": 17331 + }, + { + "epoch": 1.477201056848206, + "grad_norm": 35.68319816760873, + "learning_rate": 5.999057134012384e-06, + "loss": 2.3241, + "step": 17332 + }, + { + "epoch": 1.4772862865422314, + "grad_norm": 73.7214579945052, + "learning_rate": 5.998571280358404e-06, + "loss": 3.5476, + "step": 17333 + }, + { + "epoch": 1.4773715162362566, + "grad_norm": 22.966308363697543, + "learning_rate": 5.998085416883873e-06, + "loss": 2.0214, + "step": 17334 + }, + { + "epoch": 1.477456745930282, + "grad_norm": 61.84919134386365, + "learning_rate": 5.997599543593567e-06, + "loss": 2.8893, + "step": 17335 + }, + { + "epoch": 1.4775419756243076, + "grad_norm": 125.0034644985276, + "learning_rate": 5.997113660492267e-06, + "loss": 3.0488, + "step": 17336 + }, + { + "epoch": 1.477627205318333, + "grad_norm": 35.986512557553276, + "learning_rate": 5.996627767584752e-06, + "loss": 3.1251, + "step": 17337 + }, + { + "epoch": 1.4777124350123583, + "grad_norm": 26.996588243583574, + "learning_rate": 5.996141864875798e-06, + "loss": 1.5875, + "step": 17338 + }, + { + "epoch": 1.4777976647063837, + "grad_norm": 35.23990518493116, + "learning_rate": 5.9956559523701825e-06, + "loss": 2.9852, + "step": 17339 + }, + { + "epoch": 1.477882894400409, + "grad_norm": 83.92648904087204, + "learning_rate": 5.995170030072688e-06, + "loss": 3.5537, + "step": 17340 + }, + { + "epoch": 1.4779681240944345, + "grad_norm": 70.13488865085516, + "learning_rate": 5.994684097988092e-06, + "loss": 3.207, + "step": 17341 + }, + { + "epoch": 1.47805335378846, + "grad_norm": 42.21109387012239, + "learning_rate": 5.994198156121171e-06, + "loss": 2.3507, + "step": 17342 + }, + { + "epoch": 1.4781385834824854, + "grad_norm": 36.128013779533795, + "learning_rate": 5.993712204476708e-06, + "loss": 2.3826, + "step": 17343 + }, + { + "epoch": 1.4782238131765106, + "grad_norm": 38.503851133155486, + "learning_rate": 5.99322624305948e-06, + "loss": 3.6013, + "step": 17344 + }, + { + "epoch": 1.478309042870536, + "grad_norm": 56.294254403941714, + "learning_rate": 5.992740271874267e-06, + "loss": 2.5833, + "step": 17345 + }, + { + "epoch": 1.4783942725645616, + "grad_norm": 34.88139988296689, + "learning_rate": 5.992254290925847e-06, + "loss": 2.9291, + "step": 17346 + }, + { + "epoch": 1.4784795022585868, + "grad_norm": 37.476966823146455, + "learning_rate": 5.991768300219001e-06, + "loss": 2.5272, + "step": 17347 + }, + { + "epoch": 1.4785647319526123, + "grad_norm": 33.86503250798022, + "learning_rate": 5.991282299758507e-06, + "loss": 2.8356, + "step": 17348 + }, + { + "epoch": 1.4786499616466378, + "grad_norm": 52.850675049915864, + "learning_rate": 5.990796289549147e-06, + "loss": 2.4582, + "step": 17349 + }, + { + "epoch": 1.4787351913406632, + "grad_norm": 27.48013461317715, + "learning_rate": 5.990310269595698e-06, + "loss": 2.9495, + "step": 17350 + }, + { + "epoch": 1.4788204210346885, + "grad_norm": 45.62914373769708, + "learning_rate": 5.9898242399029415e-06, + "loss": 2.5254, + "step": 17351 + }, + { + "epoch": 1.478905650728714, + "grad_norm": 41.97823393835416, + "learning_rate": 5.989338200475657e-06, + "loss": 2.5686, + "step": 17352 + }, + { + "epoch": 1.4789908804227392, + "grad_norm": 95.51759683833373, + "learning_rate": 5.988852151318625e-06, + "loss": 3.0507, + "step": 17353 + }, + { + "epoch": 1.4790761101167647, + "grad_norm": 34.420163923102876, + "learning_rate": 5.988366092436624e-06, + "loss": 2.8464, + "step": 17354 + }, + { + "epoch": 1.4791613398107901, + "grad_norm": 84.63495717008794, + "learning_rate": 5.987880023834435e-06, + "loss": 3.7502, + "step": 17355 + }, + { + "epoch": 1.4792465695048156, + "grad_norm": 27.11588060066762, + "learning_rate": 5.987393945516839e-06, + "loss": 1.4966, + "step": 17356 + }, + { + "epoch": 1.4793317991988408, + "grad_norm": 29.016014500749172, + "learning_rate": 5.986907857488616e-06, + "loss": 2.6039, + "step": 17357 + }, + { + "epoch": 1.4794170288928663, + "grad_norm": 133.9067909645413, + "learning_rate": 5.986421759754546e-06, + "loss": 3.115, + "step": 17358 + }, + { + "epoch": 1.4795022585868915, + "grad_norm": 38.57332622429907, + "learning_rate": 5.98593565231941e-06, + "loss": 2.8045, + "step": 17359 + }, + { + "epoch": 1.479587488280917, + "grad_norm": 32.462699569850066, + "learning_rate": 5.985449535187989e-06, + "loss": 2.0843, + "step": 17360 + }, + { + "epoch": 1.4796727179749425, + "grad_norm": 38.42158191942642, + "learning_rate": 5.984963408365063e-06, + "loss": 2.9524, + "step": 17361 + }, + { + "epoch": 1.479757947668968, + "grad_norm": 47.96985505086603, + "learning_rate": 5.984477271855414e-06, + "loss": 2.2122, + "step": 17362 + }, + { + "epoch": 1.4798431773629932, + "grad_norm": 34.93137205582201, + "learning_rate": 5.98399112566382e-06, + "loss": 3.3749, + "step": 17363 + }, + { + "epoch": 1.4799284070570187, + "grad_norm": 53.939601781063274, + "learning_rate": 5.983504969795068e-06, + "loss": 3.3677, + "step": 17364 + }, + { + "epoch": 1.4800136367510441, + "grad_norm": 42.049646617876824, + "learning_rate": 5.983018804253933e-06, + "loss": 2.6855, + "step": 17365 + }, + { + "epoch": 1.4800988664450694, + "grad_norm": 110.84455312377668, + "learning_rate": 5.982532629045199e-06, + "loss": 3.5172, + "step": 17366 + }, + { + "epoch": 1.4801840961390949, + "grad_norm": 37.97845329427254, + "learning_rate": 5.9820464441736485e-06, + "loss": 2.9221, + "step": 17367 + }, + { + "epoch": 1.4802693258331203, + "grad_norm": 56.83770118618072, + "learning_rate": 5.981560249644058e-06, + "loss": 1.9563, + "step": 17368 + }, + { + "epoch": 1.4803545555271458, + "grad_norm": 61.48123049778817, + "learning_rate": 5.981074045461216e-06, + "loss": 2.573, + "step": 17369 + }, + { + "epoch": 1.480439785221171, + "grad_norm": 44.6770394713518, + "learning_rate": 5.980587831629898e-06, + "loss": 2.814, + "step": 17370 + }, + { + "epoch": 1.4805250149151965, + "grad_norm": 67.83548721658902, + "learning_rate": 5.98010160815489e-06, + "loss": 2.1618, + "step": 17371 + }, + { + "epoch": 1.4806102446092217, + "grad_norm": 67.2200019541846, + "learning_rate": 5.979615375040972e-06, + "loss": 3.3526, + "step": 17372 + }, + { + "epoch": 1.4806954743032472, + "grad_norm": 79.30734870770226, + "learning_rate": 5.9791291322929255e-06, + "loss": 1.9729, + "step": 17373 + }, + { + "epoch": 1.4807807039972727, + "grad_norm": 39.135213526621015, + "learning_rate": 5.9786428799155325e-06, + "loss": 3.3121, + "step": 17374 + }, + { + "epoch": 1.4808659336912982, + "grad_norm": 44.49789388318243, + "learning_rate": 5.978156617913578e-06, + "loss": 2.1155, + "step": 17375 + }, + { + "epoch": 1.4809511633853234, + "grad_norm": 40.967857486470955, + "learning_rate": 5.9776703462918394e-06, + "loss": 3.0638, + "step": 17376 + }, + { + "epoch": 1.4810363930793489, + "grad_norm": 48.83389736654981, + "learning_rate": 5.977184065055103e-06, + "loss": 3.3343, + "step": 17377 + }, + { + "epoch": 1.4811216227733741, + "grad_norm": 72.65921118774725, + "learning_rate": 5.976697774208147e-06, + "loss": 2.7529, + "step": 17378 + }, + { + "epoch": 1.4812068524673996, + "grad_norm": 41.66074759888505, + "learning_rate": 5.97621147375576e-06, + "loss": 2.5058, + "step": 17379 + }, + { + "epoch": 1.481292082161425, + "grad_norm": 30.737748550920248, + "learning_rate": 5.975725163702719e-06, + "loss": 2.5599, + "step": 17380 + }, + { + "epoch": 1.4813773118554505, + "grad_norm": 53.05665590085114, + "learning_rate": 5.975238844053809e-06, + "loss": 3.1762, + "step": 17381 + }, + { + "epoch": 1.4814625415494758, + "grad_norm": 37.75620679836945, + "learning_rate": 5.9747525148138134e-06, + "loss": 2.7293, + "step": 17382 + }, + { + "epoch": 1.4815477712435012, + "grad_norm": 68.35742643472086, + "learning_rate": 5.974266175987512e-06, + "loss": 2.306, + "step": 17383 + }, + { + "epoch": 1.4816330009375267, + "grad_norm": 44.34607986975565, + "learning_rate": 5.9737798275796925e-06, + "loss": 2.9086, + "step": 17384 + }, + { + "epoch": 1.481718230631552, + "grad_norm": 52.02623038251715, + "learning_rate": 5.973293469595134e-06, + "loss": 2.6611, + "step": 17385 + }, + { + "epoch": 1.4818034603255774, + "grad_norm": 66.33032305164632, + "learning_rate": 5.97280710203862e-06, + "loss": 2.8218, + "step": 17386 + }, + { + "epoch": 1.4818886900196029, + "grad_norm": 85.67912006612227, + "learning_rate": 5.972320724914936e-06, + "loss": 2.8933, + "step": 17387 + }, + { + "epoch": 1.4819739197136284, + "grad_norm": 50.827542367103874, + "learning_rate": 5.971834338228865e-06, + "loss": 2.9991, + "step": 17388 + }, + { + "epoch": 1.4820591494076536, + "grad_norm": 35.48449790336724, + "learning_rate": 5.9713479419851875e-06, + "loss": 2.6123, + "step": 17389 + }, + { + "epoch": 1.482144379101679, + "grad_norm": 21.45717970819832, + "learning_rate": 5.97086153618869e-06, + "loss": 1.8162, + "step": 17390 + }, + { + "epoch": 1.4822296087957043, + "grad_norm": 64.13337285554978, + "learning_rate": 5.970375120844154e-06, + "loss": 2.5276, + "step": 17391 + }, + { + "epoch": 1.4823148384897298, + "grad_norm": 76.53141625757, + "learning_rate": 5.969888695956366e-06, + "loss": 3.2614, + "step": 17392 + }, + { + "epoch": 1.4824000681837552, + "grad_norm": 63.978634059730005, + "learning_rate": 5.9694022615301075e-06, + "loss": 2.5684, + "step": 17393 + }, + { + "epoch": 1.4824852978777807, + "grad_norm": 41.88815144117552, + "learning_rate": 5.968915817570162e-06, + "loss": 2.0729, + "step": 17394 + }, + { + "epoch": 1.482570527571806, + "grad_norm": 36.05767912094665, + "learning_rate": 5.968429364081315e-06, + "loss": 2.6781, + "step": 17395 + }, + { + "epoch": 1.4826557572658314, + "grad_norm": 42.258954444624614, + "learning_rate": 5.96794290106835e-06, + "loss": 3.1922, + "step": 17396 + }, + { + "epoch": 1.4827409869598567, + "grad_norm": 102.8667487910418, + "learning_rate": 5.9674564285360515e-06, + "loss": 4.2098, + "step": 17397 + }, + { + "epoch": 1.4828262166538821, + "grad_norm": 88.81830701981042, + "learning_rate": 5.966969946489203e-06, + "loss": 4.5127, + "step": 17398 + }, + { + "epoch": 1.4829114463479076, + "grad_norm": 32.63301412800631, + "learning_rate": 5.96648345493259e-06, + "loss": 2.1203, + "step": 17399 + }, + { + "epoch": 1.482996676041933, + "grad_norm": 36.276217529419085, + "learning_rate": 5.965996953870996e-06, + "loss": 3.0921, + "step": 17400 + }, + { + "epoch": 1.4830819057359583, + "grad_norm": 66.19099918308572, + "learning_rate": 5.965510443309205e-06, + "loss": 3.4429, + "step": 17401 + }, + { + "epoch": 1.4831671354299838, + "grad_norm": 78.03750285226423, + "learning_rate": 5.965023923252003e-06, + "loss": 2.1041, + "step": 17402 + }, + { + "epoch": 1.4832523651240093, + "grad_norm": 41.21240844291993, + "learning_rate": 5.964537393704175e-06, + "loss": 3.2969, + "step": 17403 + }, + { + "epoch": 1.4833375948180345, + "grad_norm": 46.54146005260806, + "learning_rate": 5.964050854670504e-06, + "loss": 3.3325, + "step": 17404 + }, + { + "epoch": 1.48342282451206, + "grad_norm": 46.67738414485059, + "learning_rate": 5.963564306155776e-06, + "loss": 3.3269, + "step": 17405 + }, + { + "epoch": 1.4835080542060854, + "grad_norm": 47.000468656702026, + "learning_rate": 5.963077748164775e-06, + "loss": 2.9481, + "step": 17406 + }, + { + "epoch": 1.483593283900111, + "grad_norm": 55.05004032450127, + "learning_rate": 5.9625911807022895e-06, + "loss": 4.1157, + "step": 17407 + }, + { + "epoch": 1.4836785135941362, + "grad_norm": 43.02692591734344, + "learning_rate": 5.9621046037731004e-06, + "loss": 2.5364, + "step": 17408 + }, + { + "epoch": 1.4837637432881616, + "grad_norm": 27.76102590407356, + "learning_rate": 5.961618017381995e-06, + "loss": 1.6708, + "step": 17409 + }, + { + "epoch": 1.4838489729821869, + "grad_norm": 119.85861410476264, + "learning_rate": 5.961131421533759e-06, + "loss": 4.6198, + "step": 17410 + }, + { + "epoch": 1.4839342026762123, + "grad_norm": 30.607694490057554, + "learning_rate": 5.960644816233177e-06, + "loss": 2.2003, + "step": 17411 + }, + { + "epoch": 1.4840194323702378, + "grad_norm": 69.92505006120422, + "learning_rate": 5.960158201485036e-06, + "loss": 2.3757, + "step": 17412 + }, + { + "epoch": 1.4841046620642633, + "grad_norm": 59.28396384504312, + "learning_rate": 5.959671577294118e-06, + "loss": 3.0824, + "step": 17413 + }, + { + "epoch": 1.4841898917582885, + "grad_norm": 27.237871447807397, + "learning_rate": 5.9591849436652145e-06, + "loss": 2.3832, + "step": 17414 + }, + { + "epoch": 1.484275121452314, + "grad_norm": 44.7151018900605, + "learning_rate": 5.9586983006031065e-06, + "loss": 3.5144, + "step": 17415 + }, + { + "epoch": 1.4843603511463395, + "grad_norm": 21.85908434821125, + "learning_rate": 5.958211648112583e-06, + "loss": 1.6236, + "step": 17416 + }, + { + "epoch": 1.4844455808403647, + "grad_norm": 102.48933402074971, + "learning_rate": 5.957724986198426e-06, + "loss": 3.5147, + "step": 17417 + }, + { + "epoch": 1.4845308105343902, + "grad_norm": 82.28228954439268, + "learning_rate": 5.9572383148654265e-06, + "loss": 3.3161, + "step": 17418 + }, + { + "epoch": 1.4846160402284156, + "grad_norm": 98.33371824598501, + "learning_rate": 5.956751634118369e-06, + "loss": 3.2755, + "step": 17419 + }, + { + "epoch": 1.4847012699224411, + "grad_norm": 48.2249376543411, + "learning_rate": 5.956264943962039e-06, + "loss": 2.917, + "step": 17420 + }, + { + "epoch": 1.4847864996164664, + "grad_norm": 66.81421869455004, + "learning_rate": 5.955778244401221e-06, + "loss": 2.5404, + "step": 17421 + }, + { + "epoch": 1.4848717293104918, + "grad_norm": 40.057484959018396, + "learning_rate": 5.955291535440706e-06, + "loss": 2.9111, + "step": 17422 + }, + { + "epoch": 1.484956959004517, + "grad_norm": 47.13266927546386, + "learning_rate": 5.954804817085278e-06, + "loss": 2.0531, + "step": 17423 + }, + { + "epoch": 1.4850421886985425, + "grad_norm": 73.27111420181329, + "learning_rate": 5.954318089339724e-06, + "loss": 3.5795, + "step": 17424 + }, + { + "epoch": 1.485127418392568, + "grad_norm": 56.805182978500596, + "learning_rate": 5.953831352208831e-06, + "loss": 3.7369, + "step": 17425 + }, + { + "epoch": 1.4852126480865935, + "grad_norm": 59.27794515208408, + "learning_rate": 5.953344605697384e-06, + "loss": 2.8421, + "step": 17426 + }, + { + "epoch": 1.4852978777806187, + "grad_norm": 52.26821182634854, + "learning_rate": 5.952857849810174e-06, + "loss": 3.4058, + "step": 17427 + }, + { + "epoch": 1.4853831074746442, + "grad_norm": 76.79654553395387, + "learning_rate": 5.952371084551985e-06, + "loss": 3.2589, + "step": 17428 + }, + { + "epoch": 1.4854683371686694, + "grad_norm": 121.76633793531104, + "learning_rate": 5.951884309927604e-06, + "loss": 3.0409, + "step": 17429 + }, + { + "epoch": 1.485553566862695, + "grad_norm": 38.4327229554603, + "learning_rate": 5.951397525941819e-06, + "loss": 2.7071, + "step": 17430 + }, + { + "epoch": 1.4856387965567204, + "grad_norm": 41.430682151277, + "learning_rate": 5.9509107325994186e-06, + "loss": 2.3848, + "step": 17431 + }, + { + "epoch": 1.4857240262507458, + "grad_norm": 57.797085832880384, + "learning_rate": 5.950423929905188e-06, + "loss": 2.8255, + "step": 17432 + }, + { + "epoch": 1.485809255944771, + "grad_norm": 65.49840723379668, + "learning_rate": 5.9499371178639154e-06, + "loss": 3.9695, + "step": 17433 + }, + { + "epoch": 1.4858944856387966, + "grad_norm": 107.48346043629516, + "learning_rate": 5.949450296480389e-06, + "loss": 4.2092, + "step": 17434 + }, + { + "epoch": 1.485979715332822, + "grad_norm": 61.26768061784722, + "learning_rate": 5.948963465759397e-06, + "loss": 2.934, + "step": 17435 + }, + { + "epoch": 1.4860649450268473, + "grad_norm": 58.282671960638574, + "learning_rate": 5.948476625705726e-06, + "loss": 3.1756, + "step": 17436 + }, + { + "epoch": 1.4861501747208727, + "grad_norm": 49.34407930712058, + "learning_rate": 5.9479897763241636e-06, + "loss": 3.011, + "step": 17437 + }, + { + "epoch": 1.4862354044148982, + "grad_norm": 40.06579545755905, + "learning_rate": 5.947502917619498e-06, + "loss": 2.2039, + "step": 17438 + }, + { + "epoch": 1.4863206341089237, + "grad_norm": 72.174697291441, + "learning_rate": 5.9470160495965205e-06, + "loss": 4.3639, + "step": 17439 + }, + { + "epoch": 1.486405863802949, + "grad_norm": 24.659688860889613, + "learning_rate": 5.946529172260014e-06, + "loss": 2.3293, + "step": 17440 + }, + { + "epoch": 1.4864910934969744, + "grad_norm": 66.51697907296872, + "learning_rate": 5.94604228561477e-06, + "loss": 3.3574, + "step": 17441 + }, + { + "epoch": 1.4865763231909996, + "grad_norm": 58.295522995367726, + "learning_rate": 5.945555389665576e-06, + "loss": 2.9319, + "step": 17442 + }, + { + "epoch": 1.486661552885025, + "grad_norm": 40.38511887593816, + "learning_rate": 5.94506848441722e-06, + "loss": 3.0544, + "step": 17443 + }, + { + "epoch": 1.4867467825790506, + "grad_norm": 43.36009345987038, + "learning_rate": 5.944581569874491e-06, + "loss": 3.3705, + "step": 17444 + }, + { + "epoch": 1.486832012273076, + "grad_norm": 50.8555252891912, + "learning_rate": 5.944094646042178e-06, + "loss": 2.9274, + "step": 17445 + }, + { + "epoch": 1.4869172419671013, + "grad_norm": 35.29864412593335, + "learning_rate": 5.94360771292507e-06, + "loss": 2.7834, + "step": 17446 + }, + { + "epoch": 1.4870024716611268, + "grad_norm": 53.36523652568944, + "learning_rate": 5.9431207705279544e-06, + "loss": 3.7171, + "step": 17447 + }, + { + "epoch": 1.487087701355152, + "grad_norm": 48.86245024680376, + "learning_rate": 5.942633818855621e-06, + "loss": 2.2399, + "step": 17448 + }, + { + "epoch": 1.4871729310491775, + "grad_norm": 56.04361546093925, + "learning_rate": 5.942146857912857e-06, + "loss": 2.9887, + "step": 17449 + }, + { + "epoch": 1.487258160743203, + "grad_norm": 77.0787338696405, + "learning_rate": 5.941659887704455e-06, + "loss": 2.7345, + "step": 17450 + }, + { + "epoch": 1.4873433904372284, + "grad_norm": 76.89174526130563, + "learning_rate": 5.941172908235202e-06, + "loss": 4.5883, + "step": 17451 + }, + { + "epoch": 1.4874286201312537, + "grad_norm": 61.778015654762726, + "learning_rate": 5.940685919509887e-06, + "loss": 3.0571, + "step": 17452 + }, + { + "epoch": 1.4875138498252791, + "grad_norm": 38.0201024078667, + "learning_rate": 5.9401989215333e-06, + "loss": 2.9986, + "step": 17453 + }, + { + "epoch": 1.4875990795193046, + "grad_norm": 58.397480742016505, + "learning_rate": 5.93971191431023e-06, + "loss": 2.6647, + "step": 17454 + }, + { + "epoch": 1.4876843092133298, + "grad_norm": 72.11074417697176, + "learning_rate": 5.939224897845466e-06, + "loss": 2.704, + "step": 17455 + }, + { + "epoch": 1.4877695389073553, + "grad_norm": 95.67120683373406, + "learning_rate": 5.938737872143798e-06, + "loss": 1.2858, + "step": 17456 + }, + { + "epoch": 1.4878547686013808, + "grad_norm": 77.8480788453422, + "learning_rate": 5.938250837210017e-06, + "loss": 3.0725, + "step": 17457 + }, + { + "epoch": 1.4879399982954062, + "grad_norm": 78.88343314554845, + "learning_rate": 5.937763793048913e-06, + "loss": 2.1471, + "step": 17458 + }, + { + "epoch": 1.4880252279894315, + "grad_norm": 54.55720643884738, + "learning_rate": 5.937276739665273e-06, + "loss": 2.6509, + "step": 17459 + }, + { + "epoch": 1.488110457683457, + "grad_norm": 49.15063710056716, + "learning_rate": 5.9367896770638885e-06, + "loss": 3.216, + "step": 17460 + }, + { + "epoch": 1.4881956873774822, + "grad_norm": 29.84591542319528, + "learning_rate": 5.936302605249552e-06, + "loss": 2.3573, + "step": 17461 + }, + { + "epoch": 1.4882809170715077, + "grad_norm": 39.11724209502227, + "learning_rate": 5.93581552422705e-06, + "loss": 3.2999, + "step": 17462 + }, + { + "epoch": 1.4883661467655331, + "grad_norm": 66.4687430628022, + "learning_rate": 5.935328434001174e-06, + "loss": 2.7033, + "step": 17463 + }, + { + "epoch": 1.4884513764595586, + "grad_norm": 23.40627139469965, + "learning_rate": 5.934841334576714e-06, + "loss": 1.9747, + "step": 17464 + }, + { + "epoch": 1.4885366061535839, + "grad_norm": 80.64548397044592, + "learning_rate": 5.934354225958463e-06, + "loss": 3.3039, + "step": 17465 + }, + { + "epoch": 1.4886218358476093, + "grad_norm": 126.8170312178917, + "learning_rate": 5.933867108151208e-06, + "loss": 2.7804, + "step": 17466 + }, + { + "epoch": 1.4887070655416348, + "grad_norm": 55.67459312598686, + "learning_rate": 5.933379981159742e-06, + "loss": 2.9343, + "step": 17467 + }, + { + "epoch": 1.48879229523566, + "grad_norm": 90.4639825651215, + "learning_rate": 5.932892844988854e-06, + "loss": 2.4738, + "step": 17468 + }, + { + "epoch": 1.4888775249296855, + "grad_norm": 43.8311845867541, + "learning_rate": 5.932405699643334e-06, + "loss": 2.3081, + "step": 17469 + }, + { + "epoch": 1.488962754623711, + "grad_norm": 27.08772994601449, + "learning_rate": 5.931918545127977e-06, + "loss": 2.4926, + "step": 17470 + }, + { + "epoch": 1.4890479843177362, + "grad_norm": 45.26631274016045, + "learning_rate": 5.9314313814475696e-06, + "loss": 1.979, + "step": 17471 + }, + { + "epoch": 1.4891332140117617, + "grad_norm": 64.35231966260669, + "learning_rate": 5.9309442086069056e-06, + "loss": 3.272, + "step": 17472 + }, + { + "epoch": 1.4892184437057872, + "grad_norm": 43.40423931818442, + "learning_rate": 5.930457026610774e-06, + "loss": 3.5293, + "step": 17473 + }, + { + "epoch": 1.4893036733998124, + "grad_norm": 38.33488644762974, + "learning_rate": 5.92996983546397e-06, + "loss": 3.2732, + "step": 17474 + }, + { + "epoch": 1.4893889030938379, + "grad_norm": 48.89613295961768, + "learning_rate": 5.92948263517128e-06, + "loss": 2.407, + "step": 17475 + }, + { + "epoch": 1.4894741327878633, + "grad_norm": 54.075880824232506, + "learning_rate": 5.928995425737498e-06, + "loss": 2.9679, + "step": 17476 + }, + { + "epoch": 1.4895593624818888, + "grad_norm": 41.68984095111757, + "learning_rate": 5.928508207167415e-06, + "loss": 3.6822, + "step": 17477 + }, + { + "epoch": 1.489644592175914, + "grad_norm": 24.93463320407844, + "learning_rate": 5.928020979465824e-06, + "loss": 1.7344, + "step": 17478 + }, + { + "epoch": 1.4897298218699395, + "grad_norm": 32.26888051843338, + "learning_rate": 5.927533742637514e-06, + "loss": 2.1887, + "step": 17479 + }, + { + "epoch": 1.4898150515639648, + "grad_norm": 39.96681965248556, + "learning_rate": 5.9270464966872786e-06, + "loss": 2.5708, + "step": 17480 + }, + { + "epoch": 1.4899002812579902, + "grad_norm": 34.06231251258315, + "learning_rate": 5.9265592416199095e-06, + "loss": 2.9865, + "step": 17481 + }, + { + "epoch": 1.4899855109520157, + "grad_norm": 78.16525387362195, + "learning_rate": 5.926071977440199e-06, + "loss": 2.385, + "step": 17482 + }, + { + "epoch": 1.4900707406460412, + "grad_norm": 76.53592565754568, + "learning_rate": 5.925584704152937e-06, + "loss": 2.9586, + "step": 17483 + }, + { + "epoch": 1.4901559703400664, + "grad_norm": 37.9519017240758, + "learning_rate": 5.925097421762918e-06, + "loss": 2.4686, + "step": 17484 + }, + { + "epoch": 1.4902412000340919, + "grad_norm": 57.92782839805497, + "learning_rate": 5.9246101302749334e-06, + "loss": 2.7926, + "step": 17485 + }, + { + "epoch": 1.4903264297281174, + "grad_norm": 93.2227636428562, + "learning_rate": 5.924122829693776e-06, + "loss": 4.0369, + "step": 17486 + }, + { + "epoch": 1.4904116594221426, + "grad_norm": 70.21085878433567, + "learning_rate": 5.923635520024237e-06, + "loss": 3.2607, + "step": 17487 + }, + { + "epoch": 1.490496889116168, + "grad_norm": 85.5154078944881, + "learning_rate": 5.923148201271111e-06, + "loss": 3.4662, + "step": 17488 + }, + { + "epoch": 1.4905821188101935, + "grad_norm": 65.83028794108496, + "learning_rate": 5.922660873439189e-06, + "loss": 2.5019, + "step": 17489 + }, + { + "epoch": 1.490667348504219, + "grad_norm": 25.351270125116514, + "learning_rate": 5.922173536533262e-06, + "loss": 1.6099, + "step": 17490 + }, + { + "epoch": 1.4907525781982442, + "grad_norm": 38.643664393455175, + "learning_rate": 5.921686190558127e-06, + "loss": 3.2555, + "step": 17491 + }, + { + "epoch": 1.4908378078922697, + "grad_norm": 34.62615269217456, + "learning_rate": 5.9211988355185725e-06, + "loss": 3.2678, + "step": 17492 + }, + { + "epoch": 1.490923037586295, + "grad_norm": 77.01323063724918, + "learning_rate": 5.920711471419396e-06, + "loss": 3.7526, + "step": 17493 + }, + { + "epoch": 1.4910082672803204, + "grad_norm": 82.4502398864318, + "learning_rate": 5.920224098265386e-06, + "loss": 2.8305, + "step": 17494 + }, + { + "epoch": 1.491093496974346, + "grad_norm": 54.14126145210598, + "learning_rate": 5.919736716061339e-06, + "loss": 2.4068, + "step": 17495 + }, + { + "epoch": 1.4911787266683714, + "grad_norm": 48.848723492956815, + "learning_rate": 5.919249324812045e-06, + "loss": 3.3238, + "step": 17496 + }, + { + "epoch": 1.4912639563623966, + "grad_norm": 58.96138075528842, + "learning_rate": 5.9187619245223e-06, + "loss": 3.0156, + "step": 17497 + }, + { + "epoch": 1.491349186056422, + "grad_norm": 24.640642974367715, + "learning_rate": 5.918274515196898e-06, + "loss": 2.2851, + "step": 17498 + }, + { + "epoch": 1.4914344157504473, + "grad_norm": 46.0620932277916, + "learning_rate": 5.9177870968406295e-06, + "loss": 2.5556, + "step": 17499 + }, + { + "epoch": 1.4915196454444728, + "grad_norm": 33.02940275337448, + "learning_rate": 5.917299669458291e-06, + "loss": 2.4921, + "step": 17500 + }, + { + "epoch": 1.4916048751384983, + "grad_norm": 35.08637728844219, + "learning_rate": 5.916812233054673e-06, + "loss": 2.636, + "step": 17501 + }, + { + "epoch": 1.4916901048325237, + "grad_norm": 45.53228235178075, + "learning_rate": 5.916324787634572e-06, + "loss": 2.4524, + "step": 17502 + }, + { + "epoch": 1.491775334526549, + "grad_norm": 52.51839376073544, + "learning_rate": 5.91583733320278e-06, + "loss": 2.2876, + "step": 17503 + }, + { + "epoch": 1.4918605642205744, + "grad_norm": 32.0032291357646, + "learning_rate": 5.9153498697640935e-06, + "loss": 2.6938, + "step": 17504 + }, + { + "epoch": 1.4919457939146, + "grad_norm": 36.28311279576907, + "learning_rate": 5.914862397323303e-06, + "loss": 2.8507, + "step": 17505 + }, + { + "epoch": 1.4920310236086252, + "grad_norm": 186.2355807837421, + "learning_rate": 5.914374915885205e-06, + "loss": 2.7706, + "step": 17506 + }, + { + "epoch": 1.4921162533026506, + "grad_norm": 56.61292809321062, + "learning_rate": 5.913887425454592e-06, + "loss": 2.6837, + "step": 17507 + }, + { + "epoch": 1.492201482996676, + "grad_norm": 37.23021612599573, + "learning_rate": 5.913399926036261e-06, + "loss": 1.9282, + "step": 17508 + }, + { + "epoch": 1.4922867126907016, + "grad_norm": 27.239974401830175, + "learning_rate": 5.912912417635004e-06, + "loss": 2.2417, + "step": 17509 + }, + { + "epoch": 1.4923719423847268, + "grad_norm": 42.929659177967146, + "learning_rate": 5.9124249002556155e-06, + "loss": 2.6342, + "step": 17510 + }, + { + "epoch": 1.4924571720787523, + "grad_norm": 64.91257815644731, + "learning_rate": 5.911937373902891e-06, + "loss": 2.1123, + "step": 17511 + }, + { + "epoch": 1.4925424017727775, + "grad_norm": 66.70176984124369, + "learning_rate": 5.911449838581627e-06, + "loss": 3.531, + "step": 17512 + }, + { + "epoch": 1.492627631466803, + "grad_norm": 43.24976817830372, + "learning_rate": 5.910962294296614e-06, + "loss": 3.1955, + "step": 17513 + }, + { + "epoch": 1.4927128611608285, + "grad_norm": 49.918021262624784, + "learning_rate": 5.910474741052648e-06, + "loss": 2.9829, + "step": 17514 + }, + { + "epoch": 1.492798090854854, + "grad_norm": 54.37731264905799, + "learning_rate": 5.909987178854526e-06, + "loss": 1.8873, + "step": 17515 + }, + { + "epoch": 1.4928833205488792, + "grad_norm": 220.34328286618535, + "learning_rate": 5.909499607707041e-06, + "loss": 4.7224, + "step": 17516 + }, + { + "epoch": 1.4929685502429046, + "grad_norm": 46.583653981247465, + "learning_rate": 5.90901202761499e-06, + "loss": 3.7505, + "step": 17517 + }, + { + "epoch": 1.49305377993693, + "grad_norm": 76.1343147963098, + "learning_rate": 5.9085244385831654e-06, + "loss": 4.3388, + "step": 17518 + }, + { + "epoch": 1.4931390096309554, + "grad_norm": 80.14353643567547, + "learning_rate": 5.9080368406163645e-06, + "loss": 4.1432, + "step": 17519 + }, + { + "epoch": 1.4932242393249808, + "grad_norm": 40.368480004094096, + "learning_rate": 5.907549233719382e-06, + "loss": 2.5769, + "step": 17520 + }, + { + "epoch": 1.4933094690190063, + "grad_norm": 58.206581884815286, + "learning_rate": 5.907061617897014e-06, + "loss": 3.404, + "step": 17521 + }, + { + "epoch": 1.4933946987130315, + "grad_norm": 65.88675417464009, + "learning_rate": 5.9065739931540545e-06, + "loss": 3.4154, + "step": 17522 + }, + { + "epoch": 1.493479928407057, + "grad_norm": 41.282409510481244, + "learning_rate": 5.906086359495301e-06, + "loss": 2.6628, + "step": 17523 + }, + { + "epoch": 1.4935651581010825, + "grad_norm": 33.39802225243394, + "learning_rate": 5.905598716925547e-06, + "loss": 2.5321, + "step": 17524 + }, + { + "epoch": 1.4936503877951077, + "grad_norm": 86.00443605708863, + "learning_rate": 5.905111065449591e-06, + "loss": 2.4394, + "step": 17525 + }, + { + "epoch": 1.4937356174891332, + "grad_norm": 76.05928475162794, + "learning_rate": 5.904623405072226e-06, + "loss": 2.6668, + "step": 17526 + }, + { + "epoch": 1.4938208471831587, + "grad_norm": 51.36693154856812, + "learning_rate": 5.9041357357982494e-06, + "loss": 2.4669, + "step": 17527 + }, + { + "epoch": 1.4939060768771841, + "grad_norm": 61.32713146882288, + "learning_rate": 5.903648057632458e-06, + "loss": 3.0326, + "step": 17528 + }, + { + "epoch": 1.4939913065712094, + "grad_norm": 31.69936755993911, + "learning_rate": 5.903160370579647e-06, + "loss": 2.6065, + "step": 17529 + }, + { + "epoch": 1.4940765362652348, + "grad_norm": 37.51146719337605, + "learning_rate": 5.9026726746446116e-06, + "loss": 2.5743, + "step": 17530 + }, + { + "epoch": 1.49416176595926, + "grad_norm": 50.98816800694943, + "learning_rate": 5.90218496983215e-06, + "loss": 3.6083, + "step": 17531 + }, + { + "epoch": 1.4942469956532856, + "grad_norm": 46.155723656584875, + "learning_rate": 5.901697256147058e-06, + "loss": 2.102, + "step": 17532 + }, + { + "epoch": 1.494332225347311, + "grad_norm": 54.95116710314688, + "learning_rate": 5.90120953359413e-06, + "loss": 3.7603, + "step": 17533 + }, + { + "epoch": 1.4944174550413365, + "grad_norm": 64.83993768761167, + "learning_rate": 5.900721802178165e-06, + "loss": 3.0527, + "step": 17534 + }, + { + "epoch": 1.4945026847353617, + "grad_norm": 32.9361430647467, + "learning_rate": 5.90023406190396e-06, + "loss": 2.4306, + "step": 17535 + }, + { + "epoch": 1.4945879144293872, + "grad_norm": 48.36812945194781, + "learning_rate": 5.899746312776311e-06, + "loss": 3.1436, + "step": 17536 + }, + { + "epoch": 1.4946731441234127, + "grad_norm": 142.78135734070437, + "learning_rate": 5.899258554800014e-06, + "loss": 3.3635, + "step": 17537 + }, + { + "epoch": 1.494758373817438, + "grad_norm": 59.914822851975984, + "learning_rate": 5.898770787979865e-06, + "loss": 2.0429, + "step": 17538 + }, + { + "epoch": 1.4948436035114634, + "grad_norm": 40.92950300825803, + "learning_rate": 5.8982830123206635e-06, + "loss": 2.1382, + "step": 17539 + }, + { + "epoch": 1.4949288332054889, + "grad_norm": 79.86963084794526, + "learning_rate": 5.897795227827207e-06, + "loss": 3.3641, + "step": 17540 + }, + { + "epoch": 1.4950140628995143, + "grad_norm": 37.89951446540413, + "learning_rate": 5.8973074345042895e-06, + "loss": 3.3312, + "step": 17541 + }, + { + "epoch": 1.4950992925935396, + "grad_norm": 39.73404734972104, + "learning_rate": 5.896819632356709e-06, + "loss": 2.7294, + "step": 17542 + }, + { + "epoch": 1.495184522287565, + "grad_norm": 47.90040761888788, + "learning_rate": 5.896331821389267e-06, + "loss": 3.2567, + "step": 17543 + }, + { + "epoch": 1.4952697519815903, + "grad_norm": 68.82193965579629, + "learning_rate": 5.8958440016067545e-06, + "loss": 2.5729, + "step": 17544 + }, + { + "epoch": 1.4953549816756158, + "grad_norm": 81.6556486044049, + "learning_rate": 5.895356173013973e-06, + "loss": 3.0892, + "step": 17545 + }, + { + "epoch": 1.4954402113696412, + "grad_norm": 60.15483463575717, + "learning_rate": 5.89486833561572e-06, + "loss": 2.7516, + "step": 17546 + }, + { + "epoch": 1.4955254410636667, + "grad_norm": 30.70932313792124, + "learning_rate": 5.894380489416793e-06, + "loss": 2.3569, + "step": 17547 + }, + { + "epoch": 1.495610670757692, + "grad_norm": 65.5348537844942, + "learning_rate": 5.893892634421988e-06, + "loss": 2.9257, + "step": 17548 + }, + { + "epoch": 1.4956959004517174, + "grad_norm": 21.185133522654805, + "learning_rate": 5.893404770636104e-06, + "loss": 1.7912, + "step": 17549 + }, + { + "epoch": 1.4957811301457427, + "grad_norm": 57.475921340810224, + "learning_rate": 5.892916898063939e-06, + "loss": 1.8509, + "step": 17550 + }, + { + "epoch": 1.4958663598397681, + "grad_norm": 33.326376433855906, + "learning_rate": 5.892429016710294e-06, + "loss": 2.808, + "step": 17551 + }, + { + "epoch": 1.4959515895337936, + "grad_norm": 66.52103686866064, + "learning_rate": 5.891941126579961e-06, + "loss": 4.67, + "step": 17552 + }, + { + "epoch": 1.496036819227819, + "grad_norm": 30.96429461856438, + "learning_rate": 5.8914532276777425e-06, + "loss": 2.7286, + "step": 17553 + }, + { + "epoch": 1.4961220489218443, + "grad_norm": 54.74117342187689, + "learning_rate": 5.890965320008436e-06, + "loss": 2.9775, + "step": 17554 + }, + { + "epoch": 1.4962072786158698, + "grad_norm": 37.36148752765017, + "learning_rate": 5.89047740357684e-06, + "loss": 3.7015, + "step": 17555 + }, + { + "epoch": 1.4962925083098952, + "grad_norm": 49.29460072896694, + "learning_rate": 5.8899894783877536e-06, + "loss": 2.5402, + "step": 17556 + }, + { + "epoch": 1.4963777380039205, + "grad_norm": 77.58937207754302, + "learning_rate": 5.88950154444597e-06, + "loss": 3.5559, + "step": 17557 + }, + { + "epoch": 1.496462967697946, + "grad_norm": 26.87837402908184, + "learning_rate": 5.889013601756296e-06, + "loss": 2.1049, + "step": 17558 + }, + { + "epoch": 1.4965481973919714, + "grad_norm": 60.372976646741215, + "learning_rate": 5.888525650323525e-06, + "loss": 3.3184, + "step": 17559 + }, + { + "epoch": 1.496633427085997, + "grad_norm": 41.39206375198787, + "learning_rate": 5.8880376901524595e-06, + "loss": 3.3216, + "step": 17560 + }, + { + "epoch": 1.4967186567800221, + "grad_norm": 100.53138317540542, + "learning_rate": 5.887549721247893e-06, + "loss": 3.776, + "step": 17561 + }, + { + "epoch": 1.4968038864740476, + "grad_norm": 42.46867720081291, + "learning_rate": 5.88706174361463e-06, + "loss": 1.7987, + "step": 17562 + }, + { + "epoch": 1.4968891161680729, + "grad_norm": 65.87227825537927, + "learning_rate": 5.886573757257465e-06, + "loss": 3.1829, + "step": 17563 + }, + { + "epoch": 1.4969743458620983, + "grad_norm": 34.55560008241902, + "learning_rate": 5.886085762181202e-06, + "loss": 2.3759, + "step": 17564 + }, + { + "epoch": 1.4970595755561238, + "grad_norm": 68.79065975923542, + "learning_rate": 5.885597758390635e-06, + "loss": 2.1047, + "step": 17565 + }, + { + "epoch": 1.4971448052501493, + "grad_norm": 33.63397314240069, + "learning_rate": 5.885109745890567e-06, + "loss": 2.5332, + "step": 17566 + }, + { + "epoch": 1.4972300349441745, + "grad_norm": 117.47434880837277, + "learning_rate": 5.884621724685798e-06, + "loss": 3.1034, + "step": 17567 + }, + { + "epoch": 1.4973152646382, + "grad_norm": 104.1269134758573, + "learning_rate": 5.884133694781124e-06, + "loss": 4.4654, + "step": 17568 + }, + { + "epoch": 1.4974004943322252, + "grad_norm": 76.14991702919154, + "learning_rate": 5.883645656181347e-06, + "loss": 2.3832, + "step": 17569 + }, + { + "epoch": 1.4974857240262507, + "grad_norm": 52.552550068222565, + "learning_rate": 5.883157608891265e-06, + "loss": 2.6449, + "step": 17570 + }, + { + "epoch": 1.4975709537202762, + "grad_norm": 35.261214465723526, + "learning_rate": 5.882669552915681e-06, + "loss": 2.7612, + "step": 17571 + }, + { + "epoch": 1.4976561834143016, + "grad_norm": 73.88369435457757, + "learning_rate": 5.882181488259391e-06, + "loss": 3.2389, + "step": 17572 + }, + { + "epoch": 1.4977414131083269, + "grad_norm": 43.85466497026573, + "learning_rate": 5.881693414927196e-06, + "loss": 2.7888, + "step": 17573 + }, + { + "epoch": 1.4978266428023523, + "grad_norm": 45.71333845203478, + "learning_rate": 5.881205332923899e-06, + "loss": 2.3015, + "step": 17574 + }, + { + "epoch": 1.4979118724963778, + "grad_norm": 77.55820027121347, + "learning_rate": 5.880717242254297e-06, + "loss": 3.9917, + "step": 17575 + }, + { + "epoch": 1.497997102190403, + "grad_norm": 49.421923854952695, + "learning_rate": 5.88022914292319e-06, + "loss": 3.7949, + "step": 17576 + }, + { + "epoch": 1.4980823318844285, + "grad_norm": 46.00277479027093, + "learning_rate": 5.879741034935379e-06, + "loss": 2.999, + "step": 17577 + }, + { + "epoch": 1.498167561578454, + "grad_norm": 85.14059812383262, + "learning_rate": 5.879252918295664e-06, + "loss": 3.4826, + "step": 17578 + }, + { + "epoch": 1.4982527912724795, + "grad_norm": 59.76879419007832, + "learning_rate": 5.878764793008847e-06, + "loss": 3.6582, + "step": 17579 + }, + { + "epoch": 1.4983380209665047, + "grad_norm": 50.78775701905015, + "learning_rate": 5.878276659079728e-06, + "loss": 3.3129, + "step": 17580 + }, + { + "epoch": 1.4984232506605302, + "grad_norm": 37.49428663381502, + "learning_rate": 5.8777885165131064e-06, + "loss": 3.3838, + "step": 17581 + }, + { + "epoch": 1.4985084803545554, + "grad_norm": 36.57876472007719, + "learning_rate": 5.8773003653137825e-06, + "loss": 3.2712, + "step": 17582 + }, + { + "epoch": 1.4985937100485809, + "grad_norm": 27.49878240534198, + "learning_rate": 5.876812205486561e-06, + "loss": 2.0287, + "step": 17583 + }, + { + "epoch": 1.4986789397426064, + "grad_norm": 73.59417376127858, + "learning_rate": 5.8763240370362376e-06, + "loss": 3.0718, + "step": 17584 + }, + { + "epoch": 1.4987641694366318, + "grad_norm": 93.600128076643, + "learning_rate": 5.875835859967616e-06, + "loss": 2.9352, + "step": 17585 + }, + { + "epoch": 1.498849399130657, + "grad_norm": 63.29004157002175, + "learning_rate": 5.875347674285496e-06, + "loss": 3.2903, + "step": 17586 + }, + { + "epoch": 1.4989346288246825, + "grad_norm": 56.70257853864908, + "learning_rate": 5.8748594799946805e-06, + "loss": 2.6975, + "step": 17587 + }, + { + "epoch": 1.4990198585187078, + "grad_norm": 37.26143666872925, + "learning_rate": 5.8743712770999684e-06, + "loss": 2.5514, + "step": 17588 + }, + { + "epoch": 1.4991050882127332, + "grad_norm": 44.873207114691105, + "learning_rate": 5.873883065606163e-06, + "loss": 3.4272, + "step": 17589 + }, + { + "epoch": 1.4991903179067587, + "grad_norm": 36.33528770448125, + "learning_rate": 5.873394845518066e-06, + "loss": 3.101, + "step": 17590 + }, + { + "epoch": 1.4992755476007842, + "grad_norm": 111.85631710231117, + "learning_rate": 5.872906616840477e-06, + "loss": 3.9376, + "step": 17591 + }, + { + "epoch": 1.4993607772948094, + "grad_norm": 40.49666966440772, + "learning_rate": 5.872418379578197e-06, + "loss": 3.179, + "step": 17592 + }, + { + "epoch": 1.499446006988835, + "grad_norm": 61.33633329242239, + "learning_rate": 5.8719301337360305e-06, + "loss": 2.3229, + "step": 17593 + }, + { + "epoch": 1.4995312366828604, + "grad_norm": 43.607402910069645, + "learning_rate": 5.871441879318778e-06, + "loss": 2.6832, + "step": 17594 + }, + { + "epoch": 1.4996164663768856, + "grad_norm": 38.135225416606715, + "learning_rate": 5.870953616331239e-06, + "loss": 3.4768, + "step": 17595 + }, + { + "epoch": 1.499701696070911, + "grad_norm": 49.815446309904985, + "learning_rate": 5.870465344778218e-06, + "loss": 2.4577, + "step": 17596 + }, + { + "epoch": 1.4997869257649366, + "grad_norm": 56.72686591138049, + "learning_rate": 5.869977064664516e-06, + "loss": 2.4697, + "step": 17597 + }, + { + "epoch": 1.499872155458962, + "grad_norm": 58.036778309095986, + "learning_rate": 5.869488775994936e-06, + "loss": 3.2149, + "step": 17598 + }, + { + "epoch": 1.4999573851529873, + "grad_norm": 64.23565285991711, + "learning_rate": 5.869000478774281e-06, + "loss": 2.4408, + "step": 17599 + }, + { + "epoch": 1.5000426148470127, + "grad_norm": 59.435453326781094, + "learning_rate": 5.868512173007348e-06, + "loss": 2.9082, + "step": 17600 + }, + { + "epoch": 1.500127844541038, + "grad_norm": 63.22160179173047, + "learning_rate": 5.868023858698945e-06, + "loss": 3.3902, + "step": 17601 + }, + { + "epoch": 1.5002130742350634, + "grad_norm": 33.50336351827465, + "learning_rate": 5.867535535853871e-06, + "loss": 2.7016, + "step": 17602 + }, + { + "epoch": 1.500298303929089, + "grad_norm": 58.97846610315762, + "learning_rate": 5.8670472044769325e-06, + "loss": 2.5786, + "step": 17603 + }, + { + "epoch": 1.5003835336231144, + "grad_norm": 34.345248007295794, + "learning_rate": 5.866558864572925e-06, + "loss": 2.9162, + "step": 17604 + }, + { + "epoch": 1.5004687633171399, + "grad_norm": 67.5534180593918, + "learning_rate": 5.866070516146658e-06, + "loss": 3.2885, + "step": 17605 + }, + { + "epoch": 1.500553993011165, + "grad_norm": 74.42941279290943, + "learning_rate": 5.865582159202932e-06, + "loss": 2.7283, + "step": 17606 + }, + { + "epoch": 1.5006392227051903, + "grad_norm": 107.65539395430423, + "learning_rate": 5.865093793746549e-06, + "loss": 2.2034, + "step": 17607 + }, + { + "epoch": 1.5007244523992158, + "grad_norm": 38.620194614407545, + "learning_rate": 5.864605419782311e-06, + "loss": 3.1765, + "step": 17608 + }, + { + "epoch": 1.5008096820932413, + "grad_norm": 77.03000952513105, + "learning_rate": 5.864117037315024e-06, + "loss": 2.6355, + "step": 17609 + }, + { + "epoch": 1.5008949117872668, + "grad_norm": 67.63654739359114, + "learning_rate": 5.863628646349488e-06, + "loss": 3.2143, + "step": 17610 + }, + { + "epoch": 1.5009801414812922, + "grad_norm": 38.31169992544039, + "learning_rate": 5.863140246890509e-06, + "loss": 2.5962, + "step": 17611 + }, + { + "epoch": 1.5010653711753175, + "grad_norm": 86.44702936938407, + "learning_rate": 5.862651838942888e-06, + "loss": 3.5583, + "step": 17612 + }, + { + "epoch": 1.501150600869343, + "grad_norm": 48.23316110675533, + "learning_rate": 5.8621634225114275e-06, + "loss": 2.5733, + "step": 17613 + }, + { + "epoch": 1.5012358305633682, + "grad_norm": 47.34769126230771, + "learning_rate": 5.861674997600934e-06, + "loss": 3.3714, + "step": 17614 + }, + { + "epoch": 1.5013210602573936, + "grad_norm": 53.80161174102406, + "learning_rate": 5.861186564216209e-06, + "loss": 3.429, + "step": 17615 + }, + { + "epoch": 1.5014062899514191, + "grad_norm": 45.39461740523805, + "learning_rate": 5.860698122362056e-06, + "loss": 2.4961, + "step": 17616 + }, + { + "epoch": 1.5014915196454446, + "grad_norm": 31.336098414443338, + "learning_rate": 5.860209672043279e-06, + "loss": 3.0963, + "step": 17617 + }, + { + "epoch": 1.5015767493394698, + "grad_norm": 36.815287594180134, + "learning_rate": 5.859721213264682e-06, + "loss": 2.6947, + "step": 17618 + }, + { + "epoch": 1.5016619790334953, + "grad_norm": 39.598078911689875, + "learning_rate": 5.859232746031068e-06, + "loss": 3.1351, + "step": 17619 + }, + { + "epoch": 1.5017472087275205, + "grad_norm": 58.47524142575999, + "learning_rate": 5.858744270347242e-06, + "loss": 3.1717, + "step": 17620 + }, + { + "epoch": 1.501832438421546, + "grad_norm": 51.7173362118257, + "learning_rate": 5.858255786218007e-06, + "loss": 2.2503, + "step": 17621 + }, + { + "epoch": 1.5019176681155715, + "grad_norm": 36.18765375953423, + "learning_rate": 5.857767293648168e-06, + "loss": 2.4524, + "step": 17622 + }, + { + "epoch": 1.502002897809597, + "grad_norm": 43.28740914332892, + "learning_rate": 5.857278792642527e-06, + "loss": 3.029, + "step": 17623 + }, + { + "epoch": 1.5020881275036224, + "grad_norm": 29.75062132479073, + "learning_rate": 5.856790283205891e-06, + "loss": 2.5431, + "step": 17624 + }, + { + "epoch": 1.5021733571976477, + "grad_norm": 39.631748966957566, + "learning_rate": 5.856301765343062e-06, + "loss": 2.2162, + "step": 17625 + }, + { + "epoch": 1.502258586891673, + "grad_norm": 56.43619912167601, + "learning_rate": 5.855813239058846e-06, + "loss": 2.7225, + "step": 17626 + }, + { + "epoch": 1.5023438165856984, + "grad_norm": 72.69659734881033, + "learning_rate": 5.855324704358047e-06, + "loss": 2.9069, + "step": 17627 + }, + { + "epoch": 1.5024290462797238, + "grad_norm": 34.07426286185966, + "learning_rate": 5.854836161245469e-06, + "loss": 2.2725, + "step": 17628 + }, + { + "epoch": 1.5025142759737493, + "grad_norm": 37.36635838000849, + "learning_rate": 5.854347609725918e-06, + "loss": 2.8575, + "step": 17629 + }, + { + "epoch": 1.5025995056677748, + "grad_norm": 49.98087048558117, + "learning_rate": 5.853859049804197e-06, + "loss": 2.9421, + "step": 17630 + }, + { + "epoch": 1.5026847353618, + "grad_norm": 39.09936538023195, + "learning_rate": 5.853370481485111e-06, + "loss": 3.1571, + "step": 17631 + }, + { + "epoch": 1.5027699650558255, + "grad_norm": 58.5256513519099, + "learning_rate": 5.8528819047734655e-06, + "loss": 2.5986, + "step": 17632 + }, + { + "epoch": 1.5028551947498507, + "grad_norm": 43.35626954252462, + "learning_rate": 5.852393319674066e-06, + "loss": 3.4148, + "step": 17633 + }, + { + "epoch": 1.5029404244438762, + "grad_norm": 53.06234865689803, + "learning_rate": 5.851904726191717e-06, + "loss": 2.5131, + "step": 17634 + }, + { + "epoch": 1.5030256541379017, + "grad_norm": 38.069836599148715, + "learning_rate": 5.851416124331224e-06, + "loss": 3.4024, + "step": 17635 + }, + { + "epoch": 1.5031108838319271, + "grad_norm": 42.71322678625792, + "learning_rate": 5.85092751409739e-06, + "loss": 2.7802, + "step": 17636 + }, + { + "epoch": 1.5031961135259524, + "grad_norm": 33.6750388656925, + "learning_rate": 5.850438895495024e-06, + "loss": 3.159, + "step": 17637 + }, + { + "epoch": 1.5032813432199779, + "grad_norm": 101.48304402741073, + "learning_rate": 5.8499502685289275e-06, + "loss": 3.1614, + "step": 17638 + }, + { + "epoch": 1.503366572914003, + "grad_norm": 96.3761700210672, + "learning_rate": 5.849461633203909e-06, + "loss": 2.7134, + "step": 17639 + }, + { + "epoch": 1.5034518026080286, + "grad_norm": 48.71375075240178, + "learning_rate": 5.848972989524773e-06, + "loss": 3.3284, + "step": 17640 + }, + { + "epoch": 1.503537032302054, + "grad_norm": 73.1929007407435, + "learning_rate": 5.848484337496325e-06, + "loss": 3.3188, + "step": 17641 + }, + { + "epoch": 1.5036222619960795, + "grad_norm": 33.39920271197388, + "learning_rate": 5.84799567712337e-06, + "loss": 3.2254, + "step": 17642 + }, + { + "epoch": 1.503707491690105, + "grad_norm": 37.783128153730075, + "learning_rate": 5.847507008410715e-06, + "loss": 3.3941, + "step": 17643 + }, + { + "epoch": 1.5037927213841302, + "grad_norm": 26.320883065459007, + "learning_rate": 5.847018331363166e-06, + "loss": 1.8966, + "step": 17644 + }, + { + "epoch": 1.5038779510781555, + "grad_norm": 54.919537839276764, + "learning_rate": 5.846529645985527e-06, + "loss": 2.2703, + "step": 17645 + }, + { + "epoch": 1.503963180772181, + "grad_norm": 50.39040134922565, + "learning_rate": 5.8460409522826065e-06, + "loss": 4.052, + "step": 17646 + }, + { + "epoch": 1.5040484104662064, + "grad_norm": 38.55494374408028, + "learning_rate": 5.845552250259208e-06, + "loss": 3.0411, + "step": 17647 + }, + { + "epoch": 1.5041336401602319, + "grad_norm": 58.59521815799605, + "learning_rate": 5.84506353992014e-06, + "loss": 2.5804, + "step": 17648 + }, + { + "epoch": 1.5042188698542573, + "grad_norm": 74.82973758285353, + "learning_rate": 5.844574821270207e-06, + "loss": 2.7144, + "step": 17649 + }, + { + "epoch": 1.5043040995482826, + "grad_norm": 663.5015278329741, + "learning_rate": 5.844086094314217e-06, + "loss": 4.4392, + "step": 17650 + }, + { + "epoch": 1.504389329242308, + "grad_norm": 103.60083364691499, + "learning_rate": 5.843597359056975e-06, + "loss": 5.5895, + "step": 17651 + }, + { + "epoch": 1.5044745589363333, + "grad_norm": 29.54039923598698, + "learning_rate": 5.8431086155032905e-06, + "loss": 1.8265, + "step": 17652 + }, + { + "epoch": 1.5045597886303588, + "grad_norm": 75.1170201771359, + "learning_rate": 5.8426198636579645e-06, + "loss": 3.9931, + "step": 17653 + }, + { + "epoch": 1.5046450183243842, + "grad_norm": 39.506366686656754, + "learning_rate": 5.84213110352581e-06, + "loss": 2.4555, + "step": 17654 + }, + { + "epoch": 1.5047302480184097, + "grad_norm": 41.94504786498012, + "learning_rate": 5.841642335111628e-06, + "loss": 2.5941, + "step": 17655 + }, + { + "epoch": 1.5048154777124352, + "grad_norm": 38.66373849542672, + "learning_rate": 5.8411535584202295e-06, + "loss": 2.5909, + "step": 17656 + }, + { + "epoch": 1.5049007074064604, + "grad_norm": 32.97349767386621, + "learning_rate": 5.840664773456419e-06, + "loss": 2.619, + "step": 17657 + }, + { + "epoch": 1.5049859371004857, + "grad_norm": 50.031350507159146, + "learning_rate": 5.840175980225007e-06, + "loss": 3.1933, + "step": 17658 + }, + { + "epoch": 1.5050711667945111, + "grad_norm": 82.13730711138507, + "learning_rate": 5.839687178730796e-06, + "loss": 1.7905, + "step": 17659 + }, + { + "epoch": 1.5051563964885366, + "grad_norm": 17.026163148263375, + "learning_rate": 5.839198368978596e-06, + "loss": 1.1842, + "step": 17660 + }, + { + "epoch": 1.505241626182562, + "grad_norm": 63.33189267422088, + "learning_rate": 5.838709550973213e-06, + "loss": 3.7753, + "step": 17661 + }, + { + "epoch": 1.5053268558765875, + "grad_norm": 32.06025506931219, + "learning_rate": 5.838220724719454e-06, + "loss": 2.573, + "step": 17662 + }, + { + "epoch": 1.5054120855706128, + "grad_norm": 98.22617186841043, + "learning_rate": 5.837731890222129e-06, + "loss": 4.0849, + "step": 17663 + }, + { + "epoch": 1.505497315264638, + "grad_norm": 66.4506217387095, + "learning_rate": 5.837243047486043e-06, + "loss": 2.0429, + "step": 17664 + }, + { + "epoch": 1.5055825449586635, + "grad_norm": 66.6186111116602, + "learning_rate": 5.836754196516005e-06, + "loss": 3.34, + "step": 17665 + }, + { + "epoch": 1.505667774652689, + "grad_norm": 120.91198250185231, + "learning_rate": 5.836265337316821e-06, + "loss": 3.3287, + "step": 17666 + }, + { + "epoch": 1.5057530043467144, + "grad_norm": 26.409862538857226, + "learning_rate": 5.8357764698933e-06, + "loss": 2.2165, + "step": 17667 + }, + { + "epoch": 1.50583823404074, + "grad_norm": 54.032794309499394, + "learning_rate": 5.835287594250249e-06, + "loss": 1.1782, + "step": 17668 + }, + { + "epoch": 1.5059234637347652, + "grad_norm": 27.238367310736812, + "learning_rate": 5.834798710392477e-06, + "loss": 1.8767, + "step": 17669 + }, + { + "epoch": 1.5060086934287906, + "grad_norm": 40.107327066502606, + "learning_rate": 5.834309818324791e-06, + "loss": 2.9017, + "step": 17670 + }, + { + "epoch": 1.5060939231228159, + "grad_norm": 64.78903164309087, + "learning_rate": 5.833820918052e-06, + "loss": 2.9689, + "step": 17671 + }, + { + "epoch": 1.5061791528168413, + "grad_norm": 24.13970071371754, + "learning_rate": 5.8333320095789115e-06, + "loss": 2.7602, + "step": 17672 + }, + { + "epoch": 1.5062643825108668, + "grad_norm": 104.6271979883332, + "learning_rate": 5.832843092910334e-06, + "loss": 4.0753, + "step": 17673 + }, + { + "epoch": 1.5063496122048923, + "grad_norm": 45.06365714342938, + "learning_rate": 5.832354168051075e-06, + "loss": 2.5622, + "step": 17674 + }, + { + "epoch": 1.5064348418989177, + "grad_norm": 31.90860004822942, + "learning_rate": 5.831865235005944e-06, + "loss": 2.279, + "step": 17675 + }, + { + "epoch": 1.506520071592943, + "grad_norm": 54.392704245771114, + "learning_rate": 5.831376293779749e-06, + "loss": 2.7563, + "step": 17676 + }, + { + "epoch": 1.5066053012869682, + "grad_norm": 40.12694831257165, + "learning_rate": 5.830887344377298e-06, + "loss": 2.6737, + "step": 17677 + }, + { + "epoch": 1.5066905309809937, + "grad_norm": 37.505746014816, + "learning_rate": 5.8303983868034e-06, + "loss": 3.3041, + "step": 17678 + }, + { + "epoch": 1.5067757606750192, + "grad_norm": 33.54319353814169, + "learning_rate": 5.8299094210628635e-06, + "loss": 2.7415, + "step": 17679 + }, + { + "epoch": 1.5068609903690446, + "grad_norm": 47.60169221971968, + "learning_rate": 5.8294204471605e-06, + "loss": 3.4908, + "step": 17680 + }, + { + "epoch": 1.50694622006307, + "grad_norm": 56.326113914448264, + "learning_rate": 5.828931465101114e-06, + "loss": 1.4823, + "step": 17681 + }, + { + "epoch": 1.5070314497570954, + "grad_norm": 77.38577530145419, + "learning_rate": 5.828442474889516e-06, + "loss": 3.947, + "step": 17682 + }, + { + "epoch": 1.5071166794511208, + "grad_norm": 54.15641269971026, + "learning_rate": 5.827953476530517e-06, + "loss": 2.6562, + "step": 17683 + }, + { + "epoch": 1.507201909145146, + "grad_norm": 44.673235455732666, + "learning_rate": 5.827464470028924e-06, + "loss": 3.0234, + "step": 17684 + }, + { + "epoch": 1.5072871388391715, + "grad_norm": 55.36179641534269, + "learning_rate": 5.826975455389546e-06, + "loss": 2.4722, + "step": 17685 + }, + { + "epoch": 1.507372368533197, + "grad_norm": 27.70803010401833, + "learning_rate": 5.826486432617193e-06, + "loss": 2.6664, + "step": 17686 + }, + { + "epoch": 1.5074575982272225, + "grad_norm": 32.84055405176003, + "learning_rate": 5.825997401716676e-06, + "loss": 2.9826, + "step": 17687 + }, + { + "epoch": 1.5075428279212477, + "grad_norm": 18.399736112021852, + "learning_rate": 5.825508362692801e-06, + "loss": 1.008, + "step": 17688 + }, + { + "epoch": 1.5076280576152732, + "grad_norm": 73.88845680309842, + "learning_rate": 5.825019315550381e-06, + "loss": 3.5744, + "step": 17689 + }, + { + "epoch": 1.5077132873092984, + "grad_norm": 39.15529735490888, + "learning_rate": 5.824530260294221e-06, + "loss": 2.8607, + "step": 17690 + }, + { + "epoch": 1.507798517003324, + "grad_norm": 56.49109107389166, + "learning_rate": 5.824041196929136e-06, + "loss": 3.8303, + "step": 17691 + }, + { + "epoch": 1.5078837466973494, + "grad_norm": 62.62366827260975, + "learning_rate": 5.8235521254599324e-06, + "loss": 2.6101, + "step": 17692 + }, + { + "epoch": 1.5079689763913748, + "grad_norm": 64.0217730979767, + "learning_rate": 5.8230630458914224e-06, + "loss": 2.1423, + "step": 17693 + }, + { + "epoch": 1.5080542060854003, + "grad_norm": 47.39667652296372, + "learning_rate": 5.822573958228412e-06, + "loss": 3.0419, + "step": 17694 + }, + { + "epoch": 1.5081394357794256, + "grad_norm": 44.579077075190426, + "learning_rate": 5.822084862475715e-06, + "loss": 3.1919, + "step": 17695 + }, + { + "epoch": 1.5082246654734508, + "grad_norm": 46.93617144045177, + "learning_rate": 5.82159575863814e-06, + "loss": 2.6468, + "step": 17696 + }, + { + "epoch": 1.5083098951674763, + "grad_norm": 70.37667438486297, + "learning_rate": 5.821106646720498e-06, + "loss": 2.2041, + "step": 17697 + }, + { + "epoch": 1.5083951248615017, + "grad_norm": 54.506289422372156, + "learning_rate": 5.8206175267275954e-06, + "loss": 3.2861, + "step": 17698 + }, + { + "epoch": 1.5084803545555272, + "grad_norm": 65.51438761440652, + "learning_rate": 5.820128398664249e-06, + "loss": 3.1246, + "step": 17699 + }, + { + "epoch": 1.5085655842495527, + "grad_norm": 59.44585517033757, + "learning_rate": 5.819639262535263e-06, + "loss": 3.4903, + "step": 17700 + }, + { + "epoch": 1.508650813943578, + "grad_norm": 36.99547714075736, + "learning_rate": 5.819150118345452e-06, + "loss": 2.7717, + "step": 17701 + }, + { + "epoch": 1.5087360436376034, + "grad_norm": 76.23189103441067, + "learning_rate": 5.818660966099625e-06, + "loss": 3.1754, + "step": 17702 + }, + { + "epoch": 1.5088212733316286, + "grad_norm": 33.41178012397343, + "learning_rate": 5.818171805802591e-06, + "loss": 2.7079, + "step": 17703 + }, + { + "epoch": 1.508906503025654, + "grad_norm": 50.31318897074978, + "learning_rate": 5.817682637459165e-06, + "loss": 3.2229, + "step": 17704 + }, + { + "epoch": 1.5089917327196796, + "grad_norm": 70.80191488047497, + "learning_rate": 5.817193461074154e-06, + "loss": 3.906, + "step": 17705 + }, + { + "epoch": 1.509076962413705, + "grad_norm": 34.564463958539065, + "learning_rate": 5.81670427665237e-06, + "loss": 1.498, + "step": 17706 + }, + { + "epoch": 1.5091621921077303, + "grad_norm": 47.47337926894287, + "learning_rate": 5.8162150841986245e-06, + "loss": 2.825, + "step": 17707 + }, + { + "epoch": 1.5092474218017558, + "grad_norm": 37.378908989133826, + "learning_rate": 5.8157258837177275e-06, + "loss": 1.8711, + "step": 17708 + }, + { + "epoch": 1.509332651495781, + "grad_norm": 65.64660309241584, + "learning_rate": 5.815236675214491e-06, + "loss": 3.2749, + "step": 17709 + }, + { + "epoch": 1.5094178811898065, + "grad_norm": 54.716944534487865, + "learning_rate": 5.814747458693724e-06, + "loss": 3.1742, + "step": 17710 + }, + { + "epoch": 1.509503110883832, + "grad_norm": 35.85799843197121, + "learning_rate": 5.81425823416024e-06, + "loss": 3.0043, + "step": 17711 + }, + { + "epoch": 1.5095883405778574, + "grad_norm": 28.09676458881807, + "learning_rate": 5.813769001618852e-06, + "loss": 2.2209, + "step": 17712 + }, + { + "epoch": 1.5096735702718829, + "grad_norm": 64.63071252629894, + "learning_rate": 5.8132797610743674e-06, + "loss": 2.4103, + "step": 17713 + }, + { + "epoch": 1.5097587999659081, + "grad_norm": 125.72968766962936, + "learning_rate": 5.8127905125315995e-06, + "loss": 3.331, + "step": 17714 + }, + { + "epoch": 1.5098440296599334, + "grad_norm": 53.571959574706874, + "learning_rate": 5.8123012559953605e-06, + "loss": 2.1081, + "step": 17715 + }, + { + "epoch": 1.5099292593539588, + "grad_norm": 26.690494077459658, + "learning_rate": 5.8118119914704616e-06, + "loss": 2.0778, + "step": 17716 + }, + { + "epoch": 1.5100144890479843, + "grad_norm": 36.96996105446694, + "learning_rate": 5.811322718961714e-06, + "loss": 3.1937, + "step": 17717 + }, + { + "epoch": 1.5100997187420098, + "grad_norm": 46.951827057092, + "learning_rate": 5.810833438473929e-06, + "loss": 3.6451, + "step": 17718 + }, + { + "epoch": 1.5101849484360352, + "grad_norm": 57.26378119516898, + "learning_rate": 5.8103441500119215e-06, + "loss": 3.1439, + "step": 17719 + }, + { + "epoch": 1.5102701781300605, + "grad_norm": 86.6174274260791, + "learning_rate": 5.8098548535805e-06, + "loss": 3.1142, + "step": 17720 + }, + { + "epoch": 1.510355407824086, + "grad_norm": 26.891224736904388, + "learning_rate": 5.809365549184477e-06, + "loss": 2.8376, + "step": 17721 + }, + { + "epoch": 1.5104406375181112, + "grad_norm": 37.96243432284094, + "learning_rate": 5.808876236828667e-06, + "loss": 2.6966, + "step": 17722 + }, + { + "epoch": 1.5105258672121367, + "grad_norm": 83.525165800258, + "learning_rate": 5.80838691651788e-06, + "loss": 4.0812, + "step": 17723 + }, + { + "epoch": 1.5106110969061621, + "grad_norm": 34.4561677093943, + "learning_rate": 5.807897588256928e-06, + "loss": 2.4257, + "step": 17724 + }, + { + "epoch": 1.5106963266001876, + "grad_norm": 62.684666553394685, + "learning_rate": 5.807408252050625e-06, + "loss": 3.8303, + "step": 17725 + }, + { + "epoch": 1.510781556294213, + "grad_norm": 70.84959829323411, + "learning_rate": 5.806918907903783e-06, + "loss": 2.7649, + "step": 17726 + }, + { + "epoch": 1.5108667859882383, + "grad_norm": 34.62058241438726, + "learning_rate": 5.8064295558212134e-06, + "loss": 3.562, + "step": 17727 + }, + { + "epoch": 1.5109520156822636, + "grad_norm": 37.722260460239795, + "learning_rate": 5.80594019580773e-06, + "loss": 3.4013, + "step": 17728 + }, + { + "epoch": 1.511037245376289, + "grad_norm": 105.72579977935104, + "learning_rate": 5.805450827868145e-06, + "loss": 4.1353, + "step": 17729 + }, + { + "epoch": 1.5111224750703145, + "grad_norm": 44.82373544958447, + "learning_rate": 5.804961452007271e-06, + "loss": 2.0502, + "step": 17730 + }, + { + "epoch": 1.51120770476434, + "grad_norm": 40.03640401900373, + "learning_rate": 5.804472068229922e-06, + "loss": 2.7741, + "step": 17731 + }, + { + "epoch": 1.5112929344583654, + "grad_norm": 77.13659271308401, + "learning_rate": 5.803982676540909e-06, + "loss": 2.8153, + "step": 17732 + }, + { + "epoch": 1.5113781641523907, + "grad_norm": 32.31313270884647, + "learning_rate": 5.803493276945045e-06, + "loss": 2.4843, + "step": 17733 + }, + { + "epoch": 1.511463393846416, + "grad_norm": 94.5022799264844, + "learning_rate": 5.803003869447146e-06, + "loss": 3.6378, + "step": 17734 + }, + { + "epoch": 1.5115486235404414, + "grad_norm": 76.14493841282429, + "learning_rate": 5.802514454052021e-06, + "loss": 2.3046, + "step": 17735 + }, + { + "epoch": 1.5116338532344669, + "grad_norm": 38.17250698390465, + "learning_rate": 5.802025030764486e-06, + "loss": 3.0798, + "step": 17736 + }, + { + "epoch": 1.5117190829284923, + "grad_norm": 27.130752038238988, + "learning_rate": 5.801535599589353e-06, + "loss": 2.0468, + "step": 17737 + }, + { + "epoch": 1.5118043126225178, + "grad_norm": 44.152389977172795, + "learning_rate": 5.8010461605314375e-06, + "loss": 2.9775, + "step": 17738 + }, + { + "epoch": 1.511889542316543, + "grad_norm": 63.88489713555578, + "learning_rate": 5.80055671359555e-06, + "loss": 3.3134, + "step": 17739 + }, + { + "epoch": 1.5119747720105685, + "grad_norm": 38.99083966409685, + "learning_rate": 5.800067258786506e-06, + "loss": 3.0092, + "step": 17740 + }, + { + "epoch": 1.5120600017045938, + "grad_norm": 73.25612902233885, + "learning_rate": 5.799577796109117e-06, + "loss": 3.088, + "step": 17741 + }, + { + "epoch": 1.5121452313986192, + "grad_norm": 48.70941755045139, + "learning_rate": 5.7990883255682e-06, + "loss": 3.2421, + "step": 17742 + }, + { + "epoch": 1.5122304610926447, + "grad_norm": 36.486816293959954, + "learning_rate": 5.798598847168565e-06, + "loss": 2.708, + "step": 17743 + }, + { + "epoch": 1.5123156907866702, + "grad_norm": 51.09413585655302, + "learning_rate": 5.79810936091503e-06, + "loss": 2.3548, + "step": 17744 + }, + { + "epoch": 1.5124009204806956, + "grad_norm": 27.840800774157458, + "learning_rate": 5.797619866812404e-06, + "loss": 1.5272, + "step": 17745 + }, + { + "epoch": 1.5124861501747209, + "grad_norm": 47.903924572385414, + "learning_rate": 5.7971303648655054e-06, + "loss": 3.0924, + "step": 17746 + }, + { + "epoch": 1.5125713798687461, + "grad_norm": 48.56741584857848, + "learning_rate": 5.796640855079146e-06, + "loss": 3.3557, + "step": 17747 + }, + { + "epoch": 1.5126566095627716, + "grad_norm": 36.47707722568707, + "learning_rate": 5.79615133745814e-06, + "loss": 3.8254, + "step": 17748 + }, + { + "epoch": 1.512741839256797, + "grad_norm": 63.57691449658498, + "learning_rate": 5.795661812007302e-06, + "loss": 2.7667, + "step": 17749 + }, + { + "epoch": 1.5128270689508225, + "grad_norm": 55.03417159593629, + "learning_rate": 5.795172278731446e-06, + "loss": 3.0766, + "step": 17750 + }, + { + "epoch": 1.512912298644848, + "grad_norm": 49.55904452789845, + "learning_rate": 5.7946827376353875e-06, + "loss": 3.0554, + "step": 17751 + }, + { + "epoch": 1.5129975283388732, + "grad_norm": 64.7049103638863, + "learning_rate": 5.7941931887239385e-06, + "loss": 4.1592, + "step": 17752 + }, + { + "epoch": 1.5130827580328987, + "grad_norm": 40.70813402990657, + "learning_rate": 5.7937036320019166e-06, + "loss": 2.9328, + "step": 17753 + }, + { + "epoch": 1.513167987726924, + "grad_norm": 118.95384069602468, + "learning_rate": 5.793214067474134e-06, + "loss": 3.2294, + "step": 17754 + }, + { + "epoch": 1.5132532174209494, + "grad_norm": 47.57453165690809, + "learning_rate": 5.792724495145408e-06, + "loss": 3.1091, + "step": 17755 + }, + { + "epoch": 1.513338447114975, + "grad_norm": 73.19390303419883, + "learning_rate": 5.792234915020549e-06, + "loss": 3.5869, + "step": 17756 + }, + { + "epoch": 1.5134236768090004, + "grad_norm": 51.828840600556006, + "learning_rate": 5.791745327104375e-06, + "loss": 2.6181, + "step": 17757 + }, + { + "epoch": 1.5135089065030256, + "grad_norm": 30.880282423041567, + "learning_rate": 5.791255731401701e-06, + "loss": 2.2578, + "step": 17758 + }, + { + "epoch": 1.513594136197051, + "grad_norm": 58.75621240269332, + "learning_rate": 5.790766127917342e-06, + "loss": 2.562, + "step": 17759 + }, + { + "epoch": 1.5136793658910763, + "grad_norm": 45.89269294905399, + "learning_rate": 5.79027651665611e-06, + "loss": 2.7497, + "step": 17760 + }, + { + "epoch": 1.5137645955851018, + "grad_norm": 35.88066079313912, + "learning_rate": 5.7897868976228244e-06, + "loss": 2.8334, + "step": 17761 + }, + { + "epoch": 1.5138498252791273, + "grad_norm": 40.87966064075177, + "learning_rate": 5.7892972708223e-06, + "loss": 3.0915, + "step": 17762 + }, + { + "epoch": 1.5139350549731527, + "grad_norm": 33.764497400911104, + "learning_rate": 5.788807636259347e-06, + "loss": 2.5416, + "step": 17763 + }, + { + "epoch": 1.5140202846671782, + "grad_norm": 29.522546579681023, + "learning_rate": 5.788317993938786e-06, + "loss": 2.0425, + "step": 17764 + }, + { + "epoch": 1.5141055143612034, + "grad_norm": 30.933282075326364, + "learning_rate": 5.78782834386543e-06, + "loss": 3.1862, + "step": 17765 + }, + { + "epoch": 1.5141907440552287, + "grad_norm": 70.61323261322663, + "learning_rate": 5.787338686044097e-06, + "loss": 3.336, + "step": 17766 + }, + { + "epoch": 1.5142759737492542, + "grad_norm": 62.26795698883613, + "learning_rate": 5.7868490204796e-06, + "loss": 2.1416, + "step": 17767 + }, + { + "epoch": 1.5143612034432796, + "grad_norm": 55.668671953566616, + "learning_rate": 5.786359347176756e-06, + "loss": 3.4762, + "step": 17768 + }, + { + "epoch": 1.514446433137305, + "grad_norm": 35.14833320327673, + "learning_rate": 5.7858696661403794e-06, + "loss": 2.2383, + "step": 17769 + }, + { + "epoch": 1.5145316628313306, + "grad_norm": 75.97101675873266, + "learning_rate": 5.7853799773752875e-06, + "loss": 2.863, + "step": 17770 + }, + { + "epoch": 1.5146168925253558, + "grad_norm": 38.061470988274905, + "learning_rate": 5.784890280886295e-06, + "loss": 3.2144, + "step": 17771 + }, + { + "epoch": 1.5147021222193813, + "grad_norm": 47.73419944328662, + "learning_rate": 5.784400576678219e-06, + "loss": 2.6621, + "step": 17772 + }, + { + "epoch": 1.5147873519134065, + "grad_norm": 46.85373379884907, + "learning_rate": 5.783910864755874e-06, + "loss": 3.0047, + "step": 17773 + }, + { + "epoch": 1.514872581607432, + "grad_norm": 71.24743751980681, + "learning_rate": 5.783421145124079e-06, + "loss": 3.1282, + "step": 17774 + }, + { + "epoch": 1.5149578113014575, + "grad_norm": 43.34919029443029, + "learning_rate": 5.782931417787648e-06, + "loss": 3.1003, + "step": 17775 + }, + { + "epoch": 1.515043040995483, + "grad_norm": 43.83929606132133, + "learning_rate": 5.782441682751395e-06, + "loss": 2.7143, + "step": 17776 + }, + { + "epoch": 1.5151282706895082, + "grad_norm": 38.51199617870092, + "learning_rate": 5.781951940020143e-06, + "loss": 3.0055, + "step": 17777 + }, + { + "epoch": 1.5152135003835336, + "grad_norm": 96.95969504134546, + "learning_rate": 5.781462189598701e-06, + "loss": 2.5801, + "step": 17778 + }, + { + "epoch": 1.5152987300775589, + "grad_norm": 65.6289428921162, + "learning_rate": 5.780972431491891e-06, + "loss": 2.9419, + "step": 17779 + }, + { + "epoch": 1.5153839597715844, + "grad_norm": 56.1126382175306, + "learning_rate": 5.780482665704526e-06, + "loss": 2.1494, + "step": 17780 + }, + { + "epoch": 1.5154691894656098, + "grad_norm": 32.76754204472774, + "learning_rate": 5.779992892241425e-06, + "loss": 2.2481, + "step": 17781 + }, + { + "epoch": 1.5155544191596353, + "grad_norm": 61.88635033949503, + "learning_rate": 5.779503111107403e-06, + "loss": 2.829, + "step": 17782 + }, + { + "epoch": 1.5156396488536608, + "grad_norm": 62.977727025621014, + "learning_rate": 5.77901332230728e-06, + "loss": 3.2476, + "step": 17783 + }, + { + "epoch": 1.515724878547686, + "grad_norm": 57.89882539683792, + "learning_rate": 5.778523525845867e-06, + "loss": 3.2224, + "step": 17784 + }, + { + "epoch": 1.5158101082417113, + "grad_norm": 40.60072121294378, + "learning_rate": 5.778033721727987e-06, + "loss": 3.0133, + "step": 17785 + }, + { + "epoch": 1.5158953379357367, + "grad_norm": 44.473587678286535, + "learning_rate": 5.777543909958454e-06, + "loss": 2.7177, + "step": 17786 + }, + { + "epoch": 1.5159805676297622, + "grad_norm": 49.87519584049357, + "learning_rate": 5.777054090542086e-06, + "loss": 3.2743, + "step": 17787 + }, + { + "epoch": 1.5160657973237877, + "grad_norm": 52.237746838357154, + "learning_rate": 5.776564263483697e-06, + "loss": 3.1887, + "step": 17788 + }, + { + "epoch": 1.5161510270178131, + "grad_norm": 86.60057589581767, + "learning_rate": 5.776074428788111e-06, + "loss": 3.2308, + "step": 17789 + }, + { + "epoch": 1.5162362567118384, + "grad_norm": 71.97434747448395, + "learning_rate": 5.7755845864601405e-06, + "loss": 2.5863, + "step": 17790 + }, + { + "epoch": 1.5163214864058638, + "grad_norm": 36.912381142545996, + "learning_rate": 5.775094736504602e-06, + "loss": 2.7947, + "step": 17791 + }, + { + "epoch": 1.516406716099889, + "grad_norm": 45.09372145300389, + "learning_rate": 5.774604878926315e-06, + "loss": 3.7262, + "step": 17792 + }, + { + "epoch": 1.5164919457939146, + "grad_norm": 29.82857112918705, + "learning_rate": 5.774115013730097e-06, + "loss": 1.9271, + "step": 17793 + }, + { + "epoch": 1.51657717548794, + "grad_norm": 42.76099450508211, + "learning_rate": 5.7736251409207665e-06, + "loss": 3.1207, + "step": 17794 + }, + { + "epoch": 1.5166624051819655, + "grad_norm": 40.15463482640171, + "learning_rate": 5.7731352605031395e-06, + "loss": 3.2573, + "step": 17795 + }, + { + "epoch": 1.516747634875991, + "grad_norm": 53.235785063935005, + "learning_rate": 5.772645372482035e-06, + "loss": 3.0017, + "step": 17796 + }, + { + "epoch": 1.5168328645700162, + "grad_norm": 68.00013856888583, + "learning_rate": 5.772155476862269e-06, + "loss": 2.9652, + "step": 17797 + }, + { + "epoch": 1.5169180942640415, + "grad_norm": 51.54227656848683, + "learning_rate": 5.771665573648663e-06, + "loss": 2.4283, + "step": 17798 + }, + { + "epoch": 1.517003323958067, + "grad_norm": 41.92084407278805, + "learning_rate": 5.77117566284603e-06, + "loss": 2.155, + "step": 17799 + }, + { + "epoch": 1.5170885536520924, + "grad_norm": 51.40898849203817, + "learning_rate": 5.770685744459192e-06, + "loss": 3.1347, + "step": 17800 + }, + { + "epoch": 1.5171737833461179, + "grad_norm": 28.45730876650235, + "learning_rate": 5.770195818492965e-06, + "loss": 2.1976, + "step": 17801 + }, + { + "epoch": 1.5172590130401433, + "grad_norm": 32.24148836468293, + "learning_rate": 5.769705884952171e-06, + "loss": 2.652, + "step": 17802 + }, + { + "epoch": 1.5173442427341686, + "grad_norm": 49.30251293839204, + "learning_rate": 5.769215943841623e-06, + "loss": 3.6904, + "step": 17803 + }, + { + "epoch": 1.517429472428194, + "grad_norm": 60.455445737567366, + "learning_rate": 5.768725995166142e-06, + "loss": 3.1197, + "step": 17804 + }, + { + "epoch": 1.5175147021222193, + "grad_norm": 92.73884860501155, + "learning_rate": 5.768236038930547e-06, + "loss": 2.3074, + "step": 17805 + }, + { + "epoch": 1.5175999318162448, + "grad_norm": 41.92436435481487, + "learning_rate": 5.767746075139656e-06, + "loss": 2.7492, + "step": 17806 + }, + { + "epoch": 1.5176851615102702, + "grad_norm": 47.37718249303505, + "learning_rate": 5.7672561037982866e-06, + "loss": 2.2536, + "step": 17807 + }, + { + "epoch": 1.5177703912042957, + "grad_norm": 66.0043430364171, + "learning_rate": 5.766766124911258e-06, + "loss": 2.2652, + "step": 17808 + }, + { + "epoch": 1.517855620898321, + "grad_norm": 143.7231419347882, + "learning_rate": 5.766276138483391e-06, + "loss": 2.9273, + "step": 17809 + }, + { + "epoch": 1.5179408505923464, + "grad_norm": 47.01401495736114, + "learning_rate": 5.765786144519501e-06, + "loss": 2.5361, + "step": 17810 + }, + { + "epoch": 1.5180260802863716, + "grad_norm": 73.91044324543228, + "learning_rate": 5.765296143024408e-06, + "loss": 4.3154, + "step": 17811 + }, + { + "epoch": 1.5181113099803971, + "grad_norm": 99.70266097244703, + "learning_rate": 5.764806134002932e-06, + "loss": 5.13, + "step": 17812 + }, + { + "epoch": 1.5181965396744226, + "grad_norm": 45.13347252385602, + "learning_rate": 5.764316117459894e-06, + "loss": 3.4982, + "step": 17813 + }, + { + "epoch": 1.518281769368448, + "grad_norm": 45.49528967432734, + "learning_rate": 5.763826093400109e-06, + "loss": 1.9731, + "step": 17814 + }, + { + "epoch": 1.5183669990624735, + "grad_norm": 89.82838830051152, + "learning_rate": 5.763336061828397e-06, + "loss": 2.6018, + "step": 17815 + }, + { + "epoch": 1.5184522287564988, + "grad_norm": 43.501403697786145, + "learning_rate": 5.762846022749579e-06, + "loss": 2.6847, + "step": 17816 + }, + { + "epoch": 1.518537458450524, + "grad_norm": 29.518326298833077, + "learning_rate": 5.7623559761684745e-06, + "loss": 2.0256, + "step": 17817 + }, + { + "epoch": 1.5186226881445495, + "grad_norm": 71.27936103567595, + "learning_rate": 5.7618659220898995e-06, + "loss": 3.2108, + "step": 17818 + }, + { + "epoch": 1.518707917838575, + "grad_norm": 43.905443049962734, + "learning_rate": 5.761375860518678e-06, + "loss": 2.9652, + "step": 17819 + }, + { + "epoch": 1.5187931475326004, + "grad_norm": 55.38587036522502, + "learning_rate": 5.760885791459627e-06, + "loss": 1.9957, + "step": 17820 + }, + { + "epoch": 1.5188783772266259, + "grad_norm": 39.1611500208896, + "learning_rate": 5.760395714917566e-06, + "loss": 2.4429, + "step": 17821 + }, + { + "epoch": 1.5189636069206511, + "grad_norm": 44.26832487672887, + "learning_rate": 5.759905630897317e-06, + "loss": 2.6893, + "step": 17822 + }, + { + "epoch": 1.5190488366146766, + "grad_norm": 56.095616145416734, + "learning_rate": 5.759415539403694e-06, + "loss": 4.5246, + "step": 17823 + }, + { + "epoch": 1.5191340663087018, + "grad_norm": 58.45499864464703, + "learning_rate": 5.758925440441525e-06, + "loss": 2.7158, + "step": 17824 + }, + { + "epoch": 1.5192192960027273, + "grad_norm": 41.06729185536887, + "learning_rate": 5.758435334015624e-06, + "loss": 2.9659, + "step": 17825 + }, + { + "epoch": 1.5193045256967528, + "grad_norm": 62.989602031730634, + "learning_rate": 5.757945220130814e-06, + "loss": 2.1584, + "step": 17826 + }, + { + "epoch": 1.5193897553907783, + "grad_norm": 29.164698463765223, + "learning_rate": 5.757455098791912e-06, + "loss": 2.2524, + "step": 17827 + }, + { + "epoch": 1.5194749850848035, + "grad_norm": 37.55498776343481, + "learning_rate": 5.756964970003742e-06, + "loss": 1.9093, + "step": 17828 + }, + { + "epoch": 1.519560214778829, + "grad_norm": 86.82539540361014, + "learning_rate": 5.756474833771122e-06, + "loss": 3.5389, + "step": 17829 + }, + { + "epoch": 1.5196454444728542, + "grad_norm": 67.16484888722565, + "learning_rate": 5.755984690098872e-06, + "loss": 3.2329, + "step": 17830 + }, + { + "epoch": 1.5197306741668797, + "grad_norm": 56.73007021073044, + "learning_rate": 5.7554945389918126e-06, + "loss": 3.1375, + "step": 17831 + }, + { + "epoch": 1.5198159038609051, + "grad_norm": 40.98689058052503, + "learning_rate": 5.755004380454766e-06, + "loss": 3.117, + "step": 17832 + }, + { + "epoch": 1.5199011335549306, + "grad_norm": 52.54025439104979, + "learning_rate": 5.754514214492551e-06, + "loss": 3.1049, + "step": 17833 + }, + { + "epoch": 1.519986363248956, + "grad_norm": 40.55797334482275, + "learning_rate": 5.754024041109988e-06, + "loss": 2.2865, + "step": 17834 + }, + { + "epoch": 1.5200715929429813, + "grad_norm": 149.87838129172096, + "learning_rate": 5.753533860311898e-06, + "loss": 2.9478, + "step": 17835 + }, + { + "epoch": 1.5201568226370066, + "grad_norm": 27.527727461838133, + "learning_rate": 5.753043672103102e-06, + "loss": 2.4661, + "step": 17836 + }, + { + "epoch": 1.520242052331032, + "grad_norm": 61.023374004489845, + "learning_rate": 5.752553476488422e-06, + "loss": 2.9574, + "step": 17837 + }, + { + "epoch": 1.5203272820250575, + "grad_norm": 32.4578655937509, + "learning_rate": 5.752063273472675e-06, + "loss": 2.8464, + "step": 17838 + }, + { + "epoch": 1.520412511719083, + "grad_norm": 31.920043706818415, + "learning_rate": 5.751573063060687e-06, + "loss": 2.5169, + "step": 17839 + }, + { + "epoch": 1.5204977414131085, + "grad_norm": 61.02670920516038, + "learning_rate": 5.7510828452572755e-06, + "loss": 3.4677, + "step": 17840 + }, + { + "epoch": 1.5205829711071337, + "grad_norm": 31.552101885225742, + "learning_rate": 5.750592620067263e-06, + "loss": 2.6124, + "step": 17841 + }, + { + "epoch": 1.5206682008011592, + "grad_norm": 30.74869722140846, + "learning_rate": 5.75010238749547e-06, + "loss": 2.1871, + "step": 17842 + }, + { + "epoch": 1.5207534304951844, + "grad_norm": 42.79298847614179, + "learning_rate": 5.749612147546718e-06, + "loss": 3.2133, + "step": 17843 + }, + { + "epoch": 1.5208386601892099, + "grad_norm": 35.20546966437111, + "learning_rate": 5.749121900225828e-06, + "loss": 2.5163, + "step": 17844 + }, + { + "epoch": 1.5209238898832353, + "grad_norm": 35.012230692212796, + "learning_rate": 5.7486316455376235e-06, + "loss": 2.5805, + "step": 17845 + }, + { + "epoch": 1.5210091195772608, + "grad_norm": 73.34035952824368, + "learning_rate": 5.748141383486923e-06, + "loss": 4.6408, + "step": 17846 + }, + { + "epoch": 1.521094349271286, + "grad_norm": 43.34653817008123, + "learning_rate": 5.747651114078548e-06, + "loss": 2.9226, + "step": 17847 + }, + { + "epoch": 1.5211795789653115, + "grad_norm": 85.7438014640422, + "learning_rate": 5.747160837317324e-06, + "loss": 2.2473, + "step": 17848 + }, + { + "epoch": 1.5212648086593368, + "grad_norm": 30.319147804199652, + "learning_rate": 5.746670553208069e-06, + "loss": 2.6827, + "step": 17849 + }, + { + "epoch": 1.5213500383533622, + "grad_norm": 52.224929178593285, + "learning_rate": 5.746180261755605e-06, + "loss": 2.2931, + "step": 17850 + }, + { + "epoch": 1.5214352680473877, + "grad_norm": 68.69226572231224, + "learning_rate": 5.745689962964754e-06, + "loss": 3.8331, + "step": 17851 + }, + { + "epoch": 1.5215204977414132, + "grad_norm": 51.03368642838074, + "learning_rate": 5.7451996568403415e-06, + "loss": 1.7452, + "step": 17852 + }, + { + "epoch": 1.5216057274354386, + "grad_norm": 118.89406178192692, + "learning_rate": 5.744709343387185e-06, + "loss": 3.1215, + "step": 17853 + }, + { + "epoch": 1.521690957129464, + "grad_norm": 76.23171940882453, + "learning_rate": 5.7442190226101066e-06, + "loss": 2.4978, + "step": 17854 + }, + { + "epoch": 1.5217761868234891, + "grad_norm": 35.297857503449, + "learning_rate": 5.743728694513931e-06, + "loss": 2.4227, + "step": 17855 + }, + { + "epoch": 1.5218614165175146, + "grad_norm": 46.12985123440568, + "learning_rate": 5.74323835910348e-06, + "loss": 3.3545, + "step": 17856 + }, + { + "epoch": 1.52194664621154, + "grad_norm": 28.446234811777668, + "learning_rate": 5.7427480163835736e-06, + "loss": 2.3111, + "step": 17857 + }, + { + "epoch": 1.5220318759055655, + "grad_norm": 94.459265810232, + "learning_rate": 5.7422576663590365e-06, + "loss": 2.3585, + "step": 17858 + }, + { + "epoch": 1.522117105599591, + "grad_norm": 36.9834723579521, + "learning_rate": 5.741767309034689e-06, + "loss": 2.2263, + "step": 17859 + }, + { + "epoch": 1.5222023352936163, + "grad_norm": 54.3481578824262, + "learning_rate": 5.741276944415357e-06, + "loss": 3.0063, + "step": 17860 + }, + { + "epoch": 1.5222875649876417, + "grad_norm": 35.4980300036794, + "learning_rate": 5.74078657250586e-06, + "loss": 2.565, + "step": 17861 + }, + { + "epoch": 1.522372794681667, + "grad_norm": 30.686724133146374, + "learning_rate": 5.74029619331102e-06, + "loss": 2.8185, + "step": 17862 + }, + { + "epoch": 1.5224580243756924, + "grad_norm": 31.57231110594275, + "learning_rate": 5.739805806835663e-06, + "loss": 2.5238, + "step": 17863 + }, + { + "epoch": 1.522543254069718, + "grad_norm": 34.934601750440955, + "learning_rate": 5.739315413084609e-06, + "loss": 2.3364, + "step": 17864 + }, + { + "epoch": 1.5226284837637434, + "grad_norm": 218.40686996819858, + "learning_rate": 5.738825012062682e-06, + "loss": 2.9706, + "step": 17865 + }, + { + "epoch": 1.5227137134577688, + "grad_norm": 52.92883788455646, + "learning_rate": 5.738334603774703e-06, + "loss": 2.2077, + "step": 17866 + }, + { + "epoch": 1.522798943151794, + "grad_norm": 64.62625949414087, + "learning_rate": 5.737844188225499e-06, + "loss": 2.9293, + "step": 17867 + }, + { + "epoch": 1.5228841728458193, + "grad_norm": 35.99649558500892, + "learning_rate": 5.737353765419889e-06, + "loss": 2.5239, + "step": 17868 + }, + { + "epoch": 1.5229694025398448, + "grad_norm": 37.62774316623036, + "learning_rate": 5.736863335362699e-06, + "loss": 2.899, + "step": 17869 + }, + { + "epoch": 1.5230546322338703, + "grad_norm": 51.58787886344603, + "learning_rate": 5.736372898058749e-06, + "loss": 2.8693, + "step": 17870 + }, + { + "epoch": 1.5231398619278957, + "grad_norm": 54.14100410470821, + "learning_rate": 5.735882453512865e-06, + "loss": 2.8201, + "step": 17871 + }, + { + "epoch": 1.5232250916219212, + "grad_norm": 68.62460500273536, + "learning_rate": 5.735392001729869e-06, + "loss": 3.2018, + "step": 17872 + }, + { + "epoch": 1.5233103213159465, + "grad_norm": 85.70741552191689, + "learning_rate": 5.734901542714587e-06, + "loss": 2.2979, + "step": 17873 + }, + { + "epoch": 1.523395551009972, + "grad_norm": 34.208008628061364, + "learning_rate": 5.734411076471838e-06, + "loss": 3.1804, + "step": 17874 + }, + { + "epoch": 1.5234807807039972, + "grad_norm": 58.11792063203629, + "learning_rate": 5.733920603006449e-06, + "loss": 2.4535, + "step": 17875 + }, + { + "epoch": 1.5235660103980226, + "grad_norm": 69.62079289082368, + "learning_rate": 5.733430122323241e-06, + "loss": 2.5793, + "step": 17876 + }, + { + "epoch": 1.523651240092048, + "grad_norm": 39.77486928470187, + "learning_rate": 5.732939634427042e-06, + "loss": 3.9392, + "step": 17877 + }, + { + "epoch": 1.5237364697860736, + "grad_norm": 34.02420788147267, + "learning_rate": 5.732449139322671e-06, + "loss": 2.3236, + "step": 17878 + }, + { + "epoch": 1.5238216994800988, + "grad_norm": 68.35051033965598, + "learning_rate": 5.731958637014954e-06, + "loss": 2.5022, + "step": 17879 + }, + { + "epoch": 1.5239069291741243, + "grad_norm": 114.7171440019933, + "learning_rate": 5.7314681275087155e-06, + "loss": 4.152, + "step": 17880 + }, + { + "epoch": 1.5239921588681495, + "grad_norm": 120.71923304381252, + "learning_rate": 5.730977610808778e-06, + "loss": 3.0431, + "step": 17881 + }, + { + "epoch": 1.524077388562175, + "grad_norm": 54.03488446919612, + "learning_rate": 5.7304870869199654e-06, + "loss": 3.0114, + "step": 17882 + }, + { + "epoch": 1.5241626182562005, + "grad_norm": 43.4094405825257, + "learning_rate": 5.729996555847103e-06, + "loss": 3.4976, + "step": 17883 + }, + { + "epoch": 1.524247847950226, + "grad_norm": 40.610647874572024, + "learning_rate": 5.729506017595015e-06, + "loss": 2.0006, + "step": 17884 + }, + { + "epoch": 1.5243330776442514, + "grad_norm": 81.48006997425614, + "learning_rate": 5.729015472168526e-06, + "loss": 3.6984, + "step": 17885 + }, + { + "epoch": 1.5244183073382767, + "grad_norm": 57.24276897720733, + "learning_rate": 5.728524919572458e-06, + "loss": 3.3027, + "step": 17886 + }, + { + "epoch": 1.524503537032302, + "grad_norm": 101.61251441606751, + "learning_rate": 5.728034359811636e-06, + "loss": 3.529, + "step": 17887 + }, + { + "epoch": 1.5245887667263274, + "grad_norm": 51.4766048419813, + "learning_rate": 5.727543792890888e-06, + "loss": 4.6431, + "step": 17888 + }, + { + "epoch": 1.5246739964203528, + "grad_norm": 37.50390193379974, + "learning_rate": 5.727053218815034e-06, + "loss": 3.0111, + "step": 17889 + }, + { + "epoch": 1.5247592261143783, + "grad_norm": 97.11294006747164, + "learning_rate": 5.726562637588902e-06, + "loss": 3.034, + "step": 17890 + }, + { + "epoch": 1.5248444558084038, + "grad_norm": 36.9205971911719, + "learning_rate": 5.726072049217313e-06, + "loss": 2.4312, + "step": 17891 + }, + { + "epoch": 1.524929685502429, + "grad_norm": 42.24450194111603, + "learning_rate": 5.725581453705096e-06, + "loss": 3.5696, + "step": 17892 + }, + { + "epoch": 1.5250149151964545, + "grad_norm": 61.59448114874535, + "learning_rate": 5.7250908510570735e-06, + "loss": 3.0578, + "step": 17893 + }, + { + "epoch": 1.5251001448904797, + "grad_norm": 17.056255146066576, + "learning_rate": 5.724600241278069e-06, + "loss": 1.2776, + "step": 17894 + }, + { + "epoch": 1.5251853745845052, + "grad_norm": 46.62676725266012, + "learning_rate": 5.7241096243729115e-06, + "loss": 2.5738, + "step": 17895 + }, + { + "epoch": 1.5252706042785307, + "grad_norm": 36.893007422247834, + "learning_rate": 5.723619000346421e-06, + "loss": 3.3, + "step": 17896 + }, + { + "epoch": 1.5253558339725561, + "grad_norm": 35.8781171003847, + "learning_rate": 5.723128369203427e-06, + "loss": 2.2847, + "step": 17897 + }, + { + "epoch": 1.5254410636665814, + "grad_norm": 229.88060121924534, + "learning_rate": 5.722637730948751e-06, + "loss": 4.1449, + "step": 17898 + }, + { + "epoch": 1.5255262933606069, + "grad_norm": 115.226803540174, + "learning_rate": 5.722147085587223e-06, + "loss": 2.3511, + "step": 17899 + }, + { + "epoch": 1.525611523054632, + "grad_norm": 43.72875486438243, + "learning_rate": 5.721656433123662e-06, + "loss": 1.9023, + "step": 17900 + }, + { + "epoch": 1.5256967527486576, + "grad_norm": 88.43327715820804, + "learning_rate": 5.721165773562899e-06, + "loss": 4.207, + "step": 17901 + }, + { + "epoch": 1.525781982442683, + "grad_norm": 76.38573445560367, + "learning_rate": 5.720675106909757e-06, + "loss": 2.9414, + "step": 17902 + }, + { + "epoch": 1.5258672121367085, + "grad_norm": 38.6502418577372, + "learning_rate": 5.7201844331690615e-06, + "loss": 3.1226, + "step": 17903 + }, + { + "epoch": 1.525952441830734, + "grad_norm": 84.1594976938657, + "learning_rate": 5.7196937523456376e-06, + "loss": 3.4998, + "step": 17904 + }, + { + "epoch": 1.5260376715247592, + "grad_norm": 54.894273821478905, + "learning_rate": 5.719203064444312e-06, + "loss": 2.3335, + "step": 17905 + }, + { + "epoch": 1.5261229012187845, + "grad_norm": 49.7758698428412, + "learning_rate": 5.71871236946991e-06, + "loss": 3.8575, + "step": 17906 + }, + { + "epoch": 1.52620813091281, + "grad_norm": 300.78540576132366, + "learning_rate": 5.7182216674272575e-06, + "loss": 2.2534, + "step": 17907 + }, + { + "epoch": 1.5262933606068354, + "grad_norm": 54.25471473350689, + "learning_rate": 5.7177309583211815e-06, + "loss": 2.7503, + "step": 17908 + }, + { + "epoch": 1.5263785903008609, + "grad_norm": 70.46000406433777, + "learning_rate": 5.717240242156503e-06, + "loss": 2.5437, + "step": 17909 + }, + { + "epoch": 1.5264638199948863, + "grad_norm": 48.209126905885746, + "learning_rate": 5.716749518938056e-06, + "loss": 2.4917, + "step": 17910 + }, + { + "epoch": 1.5265490496889116, + "grad_norm": 70.27077124800826, + "learning_rate": 5.71625878867066e-06, + "loss": 2.9187, + "step": 17911 + }, + { + "epoch": 1.526634279382937, + "grad_norm": 39.75491257003327, + "learning_rate": 5.715768051359145e-06, + "loss": 3.1672, + "step": 17912 + }, + { + "epoch": 1.5267195090769623, + "grad_norm": 27.347033224777253, + "learning_rate": 5.715277307008333e-06, + "loss": 2.5545, + "step": 17913 + }, + { + "epoch": 1.5268047387709878, + "grad_norm": 74.14783939352272, + "learning_rate": 5.714786555623055e-06, + "loss": 3.0765, + "step": 17914 + }, + { + "epoch": 1.5268899684650132, + "grad_norm": 39.20729572712179, + "learning_rate": 5.714295797208135e-06, + "loss": 2.4965, + "step": 17915 + }, + { + "epoch": 1.5269751981590387, + "grad_norm": 34.45022581977896, + "learning_rate": 5.7138050317684e-06, + "loss": 3.2587, + "step": 17916 + }, + { + "epoch": 1.5270604278530642, + "grad_norm": 34.519849967551075, + "learning_rate": 5.713314259308674e-06, + "loss": 2.4093, + "step": 17917 + }, + { + "epoch": 1.5271456575470894, + "grad_norm": 33.59402343544266, + "learning_rate": 5.712823479833788e-06, + "loss": 2.7766, + "step": 17918 + }, + { + "epoch": 1.5272308872411147, + "grad_norm": 46.114921321706305, + "learning_rate": 5.712332693348565e-06, + "loss": 2.7312, + "step": 17919 + }, + { + "epoch": 1.5273161169351401, + "grad_norm": 40.34630390531983, + "learning_rate": 5.711841899857835e-06, + "loss": 3.2358, + "step": 17920 + }, + { + "epoch": 1.5274013466291656, + "grad_norm": 45.175614844078275, + "learning_rate": 5.711351099366421e-06, + "loss": 2.7954, + "step": 17921 + }, + { + "epoch": 1.527486576323191, + "grad_norm": 127.4714072937717, + "learning_rate": 5.710860291879152e-06, + "loss": 4.4432, + "step": 17922 + }, + { + "epoch": 1.5275718060172165, + "grad_norm": 52.396747600294866, + "learning_rate": 5.710369477400855e-06, + "loss": 2.2935, + "step": 17923 + }, + { + "epoch": 1.5276570357112418, + "grad_norm": 129.87229557274233, + "learning_rate": 5.709878655936356e-06, + "loss": 4.1036, + "step": 17924 + }, + { + "epoch": 1.527742265405267, + "grad_norm": 48.56292314778296, + "learning_rate": 5.709387827490481e-06, + "loss": 3.5085, + "step": 17925 + }, + { + "epoch": 1.5278274950992925, + "grad_norm": 96.00151249991573, + "learning_rate": 5.7088969920680606e-06, + "loss": 3.0332, + "step": 17926 + }, + { + "epoch": 1.527912724793318, + "grad_norm": 29.210886186313253, + "learning_rate": 5.708406149673919e-06, + "loss": 2.1977, + "step": 17927 + }, + { + "epoch": 1.5279979544873434, + "grad_norm": 62.91135763804882, + "learning_rate": 5.707915300312885e-06, + "loss": 2.9848, + "step": 17928 + }, + { + "epoch": 1.528083184181369, + "grad_norm": 62.86687397841341, + "learning_rate": 5.7074244439897856e-06, + "loss": 2.9281, + "step": 17929 + }, + { + "epoch": 1.5281684138753941, + "grad_norm": 79.21959170664505, + "learning_rate": 5.7069335807094475e-06, + "loss": 2.8679, + "step": 17930 + }, + { + "epoch": 1.5282536435694196, + "grad_norm": 42.94566036704096, + "learning_rate": 5.7064427104766985e-06, + "loss": 2.711, + "step": 17931 + }, + { + "epoch": 1.5283388732634449, + "grad_norm": 75.05385385022093, + "learning_rate": 5.705951833296366e-06, + "loss": 3.0839, + "step": 17932 + }, + { + "epoch": 1.5284241029574703, + "grad_norm": 53.9310188011515, + "learning_rate": 5.705460949173278e-06, + "loss": 2.788, + "step": 17933 + }, + { + "epoch": 1.5285093326514958, + "grad_norm": 37.30897236121185, + "learning_rate": 5.7049700581122615e-06, + "loss": 1.9188, + "step": 17934 + }, + { + "epoch": 1.5285945623455213, + "grad_norm": 74.9230351524949, + "learning_rate": 5.704479160118145e-06, + "loss": 3.3727, + "step": 17935 + }, + { + "epoch": 1.5286797920395467, + "grad_norm": 42.053219828678166, + "learning_rate": 5.703988255195756e-06, + "loss": 3.3872, + "step": 17936 + }, + { + "epoch": 1.528765021733572, + "grad_norm": 62.46047506193173, + "learning_rate": 5.703497343349922e-06, + "loss": 2.5439, + "step": 17937 + }, + { + "epoch": 1.5288502514275972, + "grad_norm": 176.89983959557927, + "learning_rate": 5.703006424585471e-06, + "loss": 3.2515, + "step": 17938 + }, + { + "epoch": 1.5289354811216227, + "grad_norm": 61.02075596860639, + "learning_rate": 5.702515498907232e-06, + "loss": 2.8931, + "step": 17939 + }, + { + "epoch": 1.5290207108156482, + "grad_norm": 33.53636781717916, + "learning_rate": 5.70202456632003e-06, + "loss": 1.6399, + "step": 17940 + }, + { + "epoch": 1.5291059405096736, + "grad_norm": 24.3324686733045, + "learning_rate": 5.7015336268286966e-06, + "loss": 1.7751, + "step": 17941 + }, + { + "epoch": 1.529191170203699, + "grad_norm": 54.92455130626822, + "learning_rate": 5.70104268043806e-06, + "loss": 2.0104, + "step": 17942 + }, + { + "epoch": 1.5292763998977243, + "grad_norm": 35.48221519533309, + "learning_rate": 5.700551727152946e-06, + "loss": 2.9088, + "step": 17943 + }, + { + "epoch": 1.5293616295917498, + "grad_norm": 101.7040147399848, + "learning_rate": 5.700060766978185e-06, + "loss": 3.6505, + "step": 17944 + }, + { + "epoch": 1.529446859285775, + "grad_norm": 42.015042334655625, + "learning_rate": 5.699569799918603e-06, + "loss": 3.1654, + "step": 17945 + }, + { + "epoch": 1.5295320889798005, + "grad_norm": 34.13181763924806, + "learning_rate": 5.699078825979032e-06, + "loss": 2.5132, + "step": 17946 + }, + { + "epoch": 1.529617318673826, + "grad_norm": 67.56266342149242, + "learning_rate": 5.698587845164297e-06, + "loss": 2.7187, + "step": 17947 + }, + { + "epoch": 1.5297025483678515, + "grad_norm": 65.41709379043459, + "learning_rate": 5.698096857479228e-06, + "loss": 3.2999, + "step": 17948 + }, + { + "epoch": 1.5297877780618767, + "grad_norm": 32.03805485841304, + "learning_rate": 5.6976058629286545e-06, + "loss": 2.5161, + "step": 17949 + }, + { + "epoch": 1.5298730077559022, + "grad_norm": 84.2668118954873, + "learning_rate": 5.6971148615174055e-06, + "loss": 2.836, + "step": 17950 + }, + { + "epoch": 1.5299582374499274, + "grad_norm": 44.7330722649115, + "learning_rate": 5.696623853250308e-06, + "loss": 2.5709, + "step": 17951 + }, + { + "epoch": 1.530043467143953, + "grad_norm": 51.85604792763758, + "learning_rate": 5.69613283813219e-06, + "loss": 3.6555, + "step": 17952 + }, + { + "epoch": 1.5301286968379784, + "grad_norm": 46.03268279765219, + "learning_rate": 5.695641816167885e-06, + "loss": 3.2512, + "step": 17953 + }, + { + "epoch": 1.5302139265320038, + "grad_norm": 63.447819316448275, + "learning_rate": 5.695150787362217e-06, + "loss": 2.6956, + "step": 17954 + }, + { + "epoch": 1.5302991562260293, + "grad_norm": 50.524013300828535, + "learning_rate": 5.69465975172002e-06, + "loss": 2.5221, + "step": 17955 + }, + { + "epoch": 1.5303843859200545, + "grad_norm": 42.44416026423383, + "learning_rate": 5.694168709246116e-06, + "loss": 3.1994, + "step": 17956 + }, + { + "epoch": 1.5304696156140798, + "grad_norm": 32.28711209472897, + "learning_rate": 5.693677659945343e-06, + "loss": 2.976, + "step": 17957 + }, + { + "epoch": 1.5305548453081053, + "grad_norm": 39.30021702622697, + "learning_rate": 5.693186603822522e-06, + "loss": 2.1863, + "step": 17958 + }, + { + "epoch": 1.5306400750021307, + "grad_norm": 146.82337644892826, + "learning_rate": 5.692695540882489e-06, + "loss": 2.7586, + "step": 17959 + }, + { + "epoch": 1.5307253046961562, + "grad_norm": 24.133571792567018, + "learning_rate": 5.692204471130068e-06, + "loss": 1.4885, + "step": 17960 + }, + { + "epoch": 1.5308105343901817, + "grad_norm": 129.2292621273383, + "learning_rate": 5.691713394570094e-06, + "loss": 2.8041, + "step": 17961 + }, + { + "epoch": 1.530895764084207, + "grad_norm": 34.619098003698895, + "learning_rate": 5.691222311207392e-06, + "loss": 2.6456, + "step": 17962 + }, + { + "epoch": 1.5309809937782324, + "grad_norm": 64.72974105605991, + "learning_rate": 5.690731221046794e-06, + "loss": 3.457, + "step": 17963 + }, + { + "epoch": 1.5310662234722576, + "grad_norm": 46.21868432672726, + "learning_rate": 5.690240124093128e-06, + "loss": 2.0203, + "step": 17964 + }, + { + "epoch": 1.531151453166283, + "grad_norm": 92.02462644375437, + "learning_rate": 5.689749020351225e-06, + "loss": 4.4709, + "step": 17965 + }, + { + "epoch": 1.5312366828603086, + "grad_norm": 45.96879359419054, + "learning_rate": 5.689257909825916e-06, + "loss": 3.7023, + "step": 17966 + }, + { + "epoch": 1.531321912554334, + "grad_norm": 64.17001625517244, + "learning_rate": 5.688766792522028e-06, + "loss": 3.2061, + "step": 17967 + }, + { + "epoch": 1.5314071422483593, + "grad_norm": 103.72505319868125, + "learning_rate": 5.688275668444392e-06, + "loss": 3.1935, + "step": 17968 + }, + { + "epoch": 1.5314923719423847, + "grad_norm": 125.15349422144155, + "learning_rate": 5.687784537597838e-06, + "loss": 4.3281, + "step": 17969 + }, + { + "epoch": 1.53157760163641, + "grad_norm": 44.53249202125551, + "learning_rate": 5.6872933999871975e-06, + "loss": 3.2801, + "step": 17970 + }, + { + "epoch": 1.5316628313304355, + "grad_norm": 38.35102603693253, + "learning_rate": 5.686802255617299e-06, + "loss": 3.0006, + "step": 17971 + }, + { + "epoch": 1.531748061024461, + "grad_norm": 30.85746021872812, + "learning_rate": 5.686311104492973e-06, + "loss": 2.1932, + "step": 17972 + }, + { + "epoch": 1.5318332907184864, + "grad_norm": 59.43374717213506, + "learning_rate": 5.685819946619049e-06, + "loss": 2.7154, + "step": 17973 + }, + { + "epoch": 1.5319185204125119, + "grad_norm": 70.03507501377595, + "learning_rate": 5.685328782000361e-06, + "loss": 2.7503, + "step": 17974 + }, + { + "epoch": 1.532003750106537, + "grad_norm": 35.189082868378186, + "learning_rate": 5.684837610641735e-06, + "loss": 1.8833, + "step": 17975 + }, + { + "epoch": 1.5320889798005624, + "grad_norm": 41.002947859787916, + "learning_rate": 5.684346432548002e-06, + "loss": 3.1319, + "step": 17976 + }, + { + "epoch": 1.5321742094945878, + "grad_norm": 67.01598409281571, + "learning_rate": 5.683855247723996e-06, + "loss": 2.9818, + "step": 17977 + }, + { + "epoch": 1.5322594391886133, + "grad_norm": 40.23073826961318, + "learning_rate": 5.683364056174545e-06, + "loss": 2.8808, + "step": 17978 + }, + { + "epoch": 1.5323446688826388, + "grad_norm": 21.385866146132855, + "learning_rate": 5.68287285790448e-06, + "loss": 1.6238, + "step": 17979 + }, + { + "epoch": 1.5324298985766642, + "grad_norm": 69.27372490418028, + "learning_rate": 5.68238165291863e-06, + "loss": 2.9873, + "step": 17980 + }, + { + "epoch": 1.5325151282706895, + "grad_norm": 37.07922653378007, + "learning_rate": 5.6818904412218304e-06, + "loss": 2.4282, + "step": 17981 + }, + { + "epoch": 1.532600357964715, + "grad_norm": 39.806151524364466, + "learning_rate": 5.6813992228189075e-06, + "loss": 4.0878, + "step": 17982 + }, + { + "epoch": 1.5326855876587402, + "grad_norm": 88.41957082655551, + "learning_rate": 5.6809079977146944e-06, + "loss": 3.2675, + "step": 17983 + }, + { + "epoch": 1.5327708173527657, + "grad_norm": 64.58072829615544, + "learning_rate": 5.680416765914022e-06, + "loss": 2.7308, + "step": 17984 + }, + { + "epoch": 1.5328560470467911, + "grad_norm": 52.24664387290594, + "learning_rate": 5.679925527421721e-06, + "loss": 3.1152, + "step": 17985 + }, + { + "epoch": 1.5329412767408166, + "grad_norm": 56.75893280901205, + "learning_rate": 5.679434282242621e-06, + "loss": 3.473, + "step": 17986 + }, + { + "epoch": 1.533026506434842, + "grad_norm": 30.322360628716044, + "learning_rate": 5.678943030381557e-06, + "loss": 2.2407, + "step": 17987 + }, + { + "epoch": 1.5331117361288673, + "grad_norm": 54.96001539232283, + "learning_rate": 5.678451771843357e-06, + "loss": 3.3101, + "step": 17988 + }, + { + "epoch": 1.5331969658228926, + "grad_norm": 58.63089710056539, + "learning_rate": 5.677960506632855e-06, + "loss": 2.6106, + "step": 17989 + }, + { + "epoch": 1.533282195516918, + "grad_norm": 56.34150752609017, + "learning_rate": 5.6774692347548786e-06, + "loss": 3.5925, + "step": 17990 + }, + { + "epoch": 1.5333674252109435, + "grad_norm": 47.92282010541021, + "learning_rate": 5.676977956214262e-06, + "loss": 2.6471, + "step": 17991 + }, + { + "epoch": 1.533452654904969, + "grad_norm": 36.17000319597676, + "learning_rate": 5.676486671015837e-06, + "loss": 3.1021, + "step": 17992 + }, + { + "epoch": 1.5335378845989944, + "grad_norm": 41.14461200341267, + "learning_rate": 5.675995379164434e-06, + "loss": 2.4746, + "step": 17993 + }, + { + "epoch": 1.5336231142930197, + "grad_norm": 45.84728803214394, + "learning_rate": 5.675504080664887e-06, + "loss": 3.1859, + "step": 17994 + }, + { + "epoch": 1.5337083439870451, + "grad_norm": 34.0089296120261, + "learning_rate": 5.6750127755220215e-06, + "loss": 2.6473, + "step": 17995 + }, + { + "epoch": 1.5337935736810704, + "grad_norm": 45.64848042728889, + "learning_rate": 5.674521463740678e-06, + "loss": 2.8887, + "step": 17996 + }, + { + "epoch": 1.5338788033750959, + "grad_norm": 52.185523905158405, + "learning_rate": 5.6740301453256806e-06, + "loss": 2.626, + "step": 17997 + }, + { + "epoch": 1.5339640330691213, + "grad_norm": 58.94325143764384, + "learning_rate": 5.673538820281866e-06, + "loss": 3.4021, + "step": 17998 + }, + { + "epoch": 1.5340492627631468, + "grad_norm": 49.080509511271174, + "learning_rate": 5.673047488614064e-06, + "loss": 2.6233, + "step": 17999 + }, + { + "epoch": 1.534134492457172, + "grad_norm": 70.71663367225813, + "learning_rate": 5.672556150327109e-06, + "loss": 4.1106, + "step": 18000 + }, + { + "epoch": 1.5342197221511975, + "grad_norm": 50.59733771619933, + "learning_rate": 5.67206480542583e-06, + "loss": 2.2669, + "step": 18001 + }, + { + "epoch": 1.5343049518452228, + "grad_norm": 41.820503945156574, + "learning_rate": 5.671573453915063e-06, + "loss": 2.9524, + "step": 18002 + }, + { + "epoch": 1.5343901815392482, + "grad_norm": 86.99753322804328, + "learning_rate": 5.671082095799634e-06, + "loss": 4.0804, + "step": 18003 + }, + { + "epoch": 1.5344754112332737, + "grad_norm": 63.232252236468995, + "learning_rate": 5.670590731084382e-06, + "loss": 2.8173, + "step": 18004 + }, + { + "epoch": 1.5345606409272992, + "grad_norm": 83.20587552830462, + "learning_rate": 5.670099359774135e-06, + "loss": 2.215, + "step": 18005 + }, + { + "epoch": 1.5346458706213246, + "grad_norm": 42.615398395168064, + "learning_rate": 5.66960798187373e-06, + "loss": 2.8949, + "step": 18006 + }, + { + "epoch": 1.5347311003153499, + "grad_norm": 64.65313959679408, + "learning_rate": 5.669116597387992e-06, + "loss": 2.4873, + "step": 18007 + }, + { + "epoch": 1.5348163300093751, + "grad_norm": 66.98704209375347, + "learning_rate": 5.668625206321762e-06, + "loss": 2.9958, + "step": 18008 + }, + { + "epoch": 1.5349015597034006, + "grad_norm": 54.6823822962156, + "learning_rate": 5.668133808679869e-06, + "loss": 3.1026, + "step": 18009 + }, + { + "epoch": 1.534986789397426, + "grad_norm": 234.61594022610845, + "learning_rate": 5.667642404467143e-06, + "loss": 1.6145, + "step": 18010 + }, + { + "epoch": 1.5350720190914515, + "grad_norm": 115.32295600076819, + "learning_rate": 5.66715099368842e-06, + "loss": 3.8573, + "step": 18011 + }, + { + "epoch": 1.535157248785477, + "grad_norm": 33.238204620750004, + "learning_rate": 5.666659576348532e-06, + "loss": 3.3313, + "step": 18012 + }, + { + "epoch": 1.5352424784795022, + "grad_norm": 34.879786025055445, + "learning_rate": 5.666168152452314e-06, + "loss": 2.9573, + "step": 18013 + }, + { + "epoch": 1.5353277081735277, + "grad_norm": 34.526091099031426, + "learning_rate": 5.665676722004594e-06, + "loss": 2.1305, + "step": 18014 + }, + { + "epoch": 1.535412937867553, + "grad_norm": 53.54322410659335, + "learning_rate": 5.66518528501021e-06, + "loss": 2.8063, + "step": 18015 + }, + { + "epoch": 1.5354981675615784, + "grad_norm": 186.18339191529662, + "learning_rate": 5.664693841473991e-06, + "loss": 3.584, + "step": 18016 + }, + { + "epoch": 1.5355833972556039, + "grad_norm": 40.19811319403824, + "learning_rate": 5.664202391400775e-06, + "loss": 3.0002, + "step": 18017 + }, + { + "epoch": 1.5356686269496294, + "grad_norm": 55.96437258528935, + "learning_rate": 5.66371093479539e-06, + "loss": 2.6902, + "step": 18018 + }, + { + "epoch": 1.5357538566436546, + "grad_norm": 35.89453293687945, + "learning_rate": 5.663219471662672e-06, + "loss": 2.8175, + "step": 18019 + }, + { + "epoch": 1.53583908633768, + "grad_norm": 40.37976860923743, + "learning_rate": 5.6627280020074545e-06, + "loss": 3.0823, + "step": 18020 + }, + { + "epoch": 1.5359243160317053, + "grad_norm": 40.984440771556834, + "learning_rate": 5.662236525834571e-06, + "loss": 2.9746, + "step": 18021 + }, + { + "epoch": 1.5360095457257308, + "grad_norm": 34.20483478966517, + "learning_rate": 5.661745043148854e-06, + "loss": 2.857, + "step": 18022 + }, + { + "epoch": 1.5360947754197563, + "grad_norm": 37.73017288561122, + "learning_rate": 5.661253553955137e-06, + "loss": 3.4096, + "step": 18023 + }, + { + "epoch": 1.5361800051137817, + "grad_norm": 81.79768803303982, + "learning_rate": 5.6607620582582556e-06, + "loss": 4.0052, + "step": 18024 + }, + { + "epoch": 1.5362652348078072, + "grad_norm": 73.13347158556643, + "learning_rate": 5.66027055606304e-06, + "loss": 3.0561, + "step": 18025 + }, + { + "epoch": 1.5363504645018324, + "grad_norm": 22.865206972509174, + "learning_rate": 5.659779047374326e-06, + "loss": 1.8371, + "step": 18026 + }, + { + "epoch": 1.5364356941958577, + "grad_norm": 42.92603636009044, + "learning_rate": 5.659287532196947e-06, + "loss": 2.8973, + "step": 18027 + }, + { + "epoch": 1.5365209238898832, + "grad_norm": 31.804600153216683, + "learning_rate": 5.6587960105357395e-06, + "loss": 1.9657, + "step": 18028 + }, + { + "epoch": 1.5366061535839086, + "grad_norm": 48.32721252277537, + "learning_rate": 5.6583044823955335e-06, + "loss": 2.2949, + "step": 18029 + }, + { + "epoch": 1.536691383277934, + "grad_norm": 36.16249056127425, + "learning_rate": 5.657812947781164e-06, + "loss": 2.7047, + "step": 18030 + }, + { + "epoch": 1.5367766129719596, + "grad_norm": 20.22030960456077, + "learning_rate": 5.6573214066974655e-06, + "loss": 1.7357, + "step": 18031 + }, + { + "epoch": 1.5368618426659848, + "grad_norm": 84.75716974892323, + "learning_rate": 5.656829859149273e-06, + "loss": 3.5406, + "step": 18032 + }, + { + "epoch": 1.5369470723600103, + "grad_norm": 102.16898951167605, + "learning_rate": 5.65633830514142e-06, + "loss": 4.3409, + "step": 18033 + }, + { + "epoch": 1.5370323020540355, + "grad_norm": 40.64290897141058, + "learning_rate": 5.655846744678739e-06, + "loss": 3.2648, + "step": 18034 + }, + { + "epoch": 1.537117531748061, + "grad_norm": 48.039910913441375, + "learning_rate": 5.655355177766067e-06, + "loss": 3.6464, + "step": 18035 + }, + { + "epoch": 1.5372027614420865, + "grad_norm": 44.33423780822999, + "learning_rate": 5.654863604408238e-06, + "loss": 2.6892, + "step": 18036 + }, + { + "epoch": 1.537287991136112, + "grad_norm": 80.04146599502073, + "learning_rate": 5.6543720246100846e-06, + "loss": 3.0319, + "step": 18037 + }, + { + "epoch": 1.5373732208301372, + "grad_norm": 41.28390636833381, + "learning_rate": 5.653880438376442e-06, + "loss": 3.3608, + "step": 18038 + }, + { + "epoch": 1.5374584505241626, + "grad_norm": 30.05071185100663, + "learning_rate": 5.653388845712146e-06, + "loss": 1.885, + "step": 18039 + }, + { + "epoch": 1.5375436802181879, + "grad_norm": 54.1038963726048, + "learning_rate": 5.6528972466220314e-06, + "loss": 2.8938, + "step": 18040 + }, + { + "epoch": 1.5376289099122133, + "grad_norm": 27.905868888112995, + "learning_rate": 5.652405641110931e-06, + "loss": 1.5625, + "step": 18041 + }, + { + "epoch": 1.5377141396062388, + "grad_norm": 70.0623549150343, + "learning_rate": 5.651914029183678e-06, + "loss": 2.9272, + "step": 18042 + }, + { + "epoch": 1.5377993693002643, + "grad_norm": 46.28379205217858, + "learning_rate": 5.651422410845112e-06, + "loss": 3.1031, + "step": 18043 + }, + { + "epoch": 1.5378845989942898, + "grad_norm": 25.400897370992027, + "learning_rate": 5.650930786100065e-06, + "loss": 2.1274, + "step": 18044 + }, + { + "epoch": 1.537969828688315, + "grad_norm": 40.073113574906834, + "learning_rate": 5.650439154953373e-06, + "loss": 2.6979, + "step": 18045 + }, + { + "epoch": 1.5380550583823402, + "grad_norm": 58.756832333657414, + "learning_rate": 5.6499475174098684e-06, + "loss": 3.3559, + "step": 18046 + }, + { + "epoch": 1.5381402880763657, + "grad_norm": 48.31996781704469, + "learning_rate": 5.6494558734743895e-06, + "loss": 2.614, + "step": 18047 + }, + { + "epoch": 1.5382255177703912, + "grad_norm": 40.313943656243794, + "learning_rate": 5.6489642231517705e-06, + "loss": 3.4026, + "step": 18048 + }, + { + "epoch": 1.5383107474644167, + "grad_norm": 32.88263173259308, + "learning_rate": 5.648472566446846e-06, + "loss": 2.5972, + "step": 18049 + }, + { + "epoch": 1.5383959771584421, + "grad_norm": 37.02463172163628, + "learning_rate": 5.647980903364451e-06, + "loss": 2.9838, + "step": 18050 + }, + { + "epoch": 1.5384812068524674, + "grad_norm": 70.72650495543158, + "learning_rate": 5.647489233909421e-06, + "loss": 2.4241, + "step": 18051 + }, + { + "epoch": 1.5385664365464928, + "grad_norm": 66.8301862788496, + "learning_rate": 5.646997558086592e-06, + "loss": 3.0511, + "step": 18052 + }, + { + "epoch": 1.538651666240518, + "grad_norm": 98.76113068733638, + "learning_rate": 5.6465058759008e-06, + "loss": 2.5816, + "step": 18053 + }, + { + "epoch": 1.5387368959345435, + "grad_norm": 28.267389111831356, + "learning_rate": 5.646014187356879e-06, + "loss": 2.3565, + "step": 18054 + }, + { + "epoch": 1.538822125628569, + "grad_norm": 36.590060883055195, + "learning_rate": 5.6455224924596656e-06, + "loss": 3.0959, + "step": 18055 + }, + { + "epoch": 1.5389073553225945, + "grad_norm": 87.3137473434945, + "learning_rate": 5.645030791213994e-06, + "loss": 3.5695, + "step": 18056 + }, + { + "epoch": 1.53899258501662, + "grad_norm": 46.91283140357657, + "learning_rate": 5.6445390836247015e-06, + "loss": 3.298, + "step": 18057 + }, + { + "epoch": 1.5390778147106452, + "grad_norm": 30.000910274950133, + "learning_rate": 5.6440473696966235e-06, + "loss": 2.1152, + "step": 18058 + }, + { + "epoch": 1.5391630444046704, + "grad_norm": 57.12505554494945, + "learning_rate": 5.643555649434595e-06, + "loss": 2.7685, + "step": 18059 + }, + { + "epoch": 1.539248274098696, + "grad_norm": 45.33915763451283, + "learning_rate": 5.643063922843454e-06, + "loss": 3.0394, + "step": 18060 + }, + { + "epoch": 1.5393335037927214, + "grad_norm": 34.467817203402724, + "learning_rate": 5.642572189928033e-06, + "loss": 2.5187, + "step": 18061 + }, + { + "epoch": 1.5394187334867468, + "grad_norm": 63.034111962299875, + "learning_rate": 5.642080450693169e-06, + "loss": 2.5222, + "step": 18062 + }, + { + "epoch": 1.5395039631807723, + "grad_norm": 61.9165378230111, + "learning_rate": 5.6415887051437e-06, + "loss": 2.8912, + "step": 18063 + }, + { + "epoch": 1.5395891928747976, + "grad_norm": 57.343482285569785, + "learning_rate": 5.641096953284462e-06, + "loss": 2.1346, + "step": 18064 + }, + { + "epoch": 1.539674422568823, + "grad_norm": 51.404140644993355, + "learning_rate": 5.640605195120289e-06, + "loss": 2.0664, + "step": 18065 + }, + { + "epoch": 1.5397596522628483, + "grad_norm": 52.63417801893476, + "learning_rate": 5.640113430656019e-06, + "loss": 2.4827, + "step": 18066 + }, + { + "epoch": 1.5398448819568737, + "grad_norm": 39.152038673170786, + "learning_rate": 5.639621659896488e-06, + "loss": 2.7947, + "step": 18067 + }, + { + "epoch": 1.5399301116508992, + "grad_norm": 78.52558642357421, + "learning_rate": 5.639129882846532e-06, + "loss": 4.4603, + "step": 18068 + }, + { + "epoch": 1.5400153413449247, + "grad_norm": 36.029261333871716, + "learning_rate": 5.638638099510987e-06, + "loss": 2.6184, + "step": 18069 + }, + { + "epoch": 1.54010057103895, + "grad_norm": 37.429852787509745, + "learning_rate": 5.63814630989469e-06, + "loss": 3.4345, + "step": 18070 + }, + { + "epoch": 1.5401858007329754, + "grad_norm": 38.90791128842072, + "learning_rate": 5.637654514002479e-06, + "loss": 3.4575, + "step": 18071 + }, + { + "epoch": 1.5402710304270006, + "grad_norm": 42.17609025699291, + "learning_rate": 5.637162711839188e-06, + "loss": 3.0932, + "step": 18072 + }, + { + "epoch": 1.540356260121026, + "grad_norm": 216.1663210119373, + "learning_rate": 5.6366709034096545e-06, + "loss": 2.6559, + "step": 18073 + }, + { + "epoch": 1.5404414898150516, + "grad_norm": 85.27907600300527, + "learning_rate": 5.636179088718716e-06, + "loss": 2.4675, + "step": 18074 + }, + { + "epoch": 1.540526719509077, + "grad_norm": 74.83687078783267, + "learning_rate": 5.6356872677712105e-06, + "loss": 2.5511, + "step": 18075 + }, + { + "epoch": 1.5406119492031025, + "grad_norm": 44.3249691240483, + "learning_rate": 5.635195440571972e-06, + "loss": 3.0278, + "step": 18076 + }, + { + "epoch": 1.5406971788971278, + "grad_norm": 54.65241296495137, + "learning_rate": 5.634703607125837e-06, + "loss": 3.0942, + "step": 18077 + }, + { + "epoch": 1.540782408591153, + "grad_norm": 78.70312286879287, + "learning_rate": 5.634211767437647e-06, + "loss": 2.0263, + "step": 18078 + }, + { + "epoch": 1.5408676382851785, + "grad_norm": 39.47141481955436, + "learning_rate": 5.633719921512236e-06, + "loss": 2.9883, + "step": 18079 + }, + { + "epoch": 1.540952867979204, + "grad_norm": 48.056031573657194, + "learning_rate": 5.633228069354441e-06, + "loss": 2.4995, + "step": 18080 + }, + { + "epoch": 1.5410380976732294, + "grad_norm": 34.828657260124665, + "learning_rate": 5.632736210969099e-06, + "loss": 1.9872, + "step": 18081 + }, + { + "epoch": 1.5411233273672549, + "grad_norm": 40.77143993424705, + "learning_rate": 5.632244346361048e-06, + "loss": 2.7684, + "step": 18082 + }, + { + "epoch": 1.5412085570612801, + "grad_norm": 33.63413643883385, + "learning_rate": 5.631752475535125e-06, + "loss": 2.5098, + "step": 18083 + }, + { + "epoch": 1.5412937867553056, + "grad_norm": 100.45847944575827, + "learning_rate": 5.631260598496168e-06, + "loss": 4.1518, + "step": 18084 + }, + { + "epoch": 1.5413790164493308, + "grad_norm": 30.71343346755279, + "learning_rate": 5.630768715249013e-06, + "loss": 2.2477, + "step": 18085 + }, + { + "epoch": 1.5414642461433563, + "grad_norm": 33.26683273658662, + "learning_rate": 5.6302768257985e-06, + "loss": 3.0085, + "step": 18086 + }, + { + "epoch": 1.5415494758373818, + "grad_norm": 42.5418854459694, + "learning_rate": 5.629784930149462e-06, + "loss": 2.1645, + "step": 18087 + }, + { + "epoch": 1.5416347055314072, + "grad_norm": 69.63516033576538, + "learning_rate": 5.629293028306743e-06, + "loss": 3.5275, + "step": 18088 + }, + { + "epoch": 1.5417199352254325, + "grad_norm": 41.856266188347625, + "learning_rate": 5.628801120275174e-06, + "loss": 3.6182, + "step": 18089 + }, + { + "epoch": 1.541805164919458, + "grad_norm": 136.96203164269957, + "learning_rate": 5.628309206059598e-06, + "loss": 3.051, + "step": 18090 + }, + { + "epoch": 1.5418903946134832, + "grad_norm": 32.80988385031096, + "learning_rate": 5.627817285664849e-06, + "loss": 2.5542, + "step": 18091 + }, + { + "epoch": 1.5419756243075087, + "grad_norm": 92.41177289987878, + "learning_rate": 5.627325359095768e-06, + "loss": 2.8293, + "step": 18092 + }, + { + "epoch": 1.5420608540015341, + "grad_norm": 92.31260720238161, + "learning_rate": 5.626833426357189e-06, + "loss": 3.5269, + "step": 18093 + }, + { + "epoch": 1.5421460836955596, + "grad_norm": 50.898640635279925, + "learning_rate": 5.626341487453955e-06, + "loss": 3.3824, + "step": 18094 + }, + { + "epoch": 1.542231313389585, + "grad_norm": 27.038712650440015, + "learning_rate": 5.6258495423909e-06, + "loss": 2.1011, + "step": 18095 + }, + { + "epoch": 1.5423165430836103, + "grad_norm": 56.66439747384996, + "learning_rate": 5.6253575911728646e-06, + "loss": 2.5021, + "step": 18096 + }, + { + "epoch": 1.5424017727776356, + "grad_norm": 67.26142641056784, + "learning_rate": 5.624865633804683e-06, + "loss": 2.2719, + "step": 18097 + }, + { + "epoch": 1.542487002471661, + "grad_norm": 99.77958923943221, + "learning_rate": 5.624373670291199e-06, + "loss": 3.8728, + "step": 18098 + }, + { + "epoch": 1.5425722321656865, + "grad_norm": 26.250574918479433, + "learning_rate": 5.623881700637248e-06, + "loss": 2.6131, + "step": 18099 + }, + { + "epoch": 1.542657461859712, + "grad_norm": 32.3721278618976, + "learning_rate": 5.623389724847666e-06, + "loss": 2.7155, + "step": 18100 + }, + { + "epoch": 1.5427426915537374, + "grad_norm": 33.385501023122444, + "learning_rate": 5.622897742927296e-06, + "loss": 2.4028, + "step": 18101 + }, + { + "epoch": 1.5428279212477627, + "grad_norm": 43.730428042671214, + "learning_rate": 5.622405754880973e-06, + "loss": 2.998, + "step": 18102 + }, + { + "epoch": 1.5429131509417882, + "grad_norm": 57.823743407346576, + "learning_rate": 5.6219137607135374e-06, + "loss": 2.2472, + "step": 18103 + }, + { + "epoch": 1.5429983806358134, + "grad_norm": 23.846842893177875, + "learning_rate": 5.621421760429827e-06, + "loss": 1.9954, + "step": 18104 + }, + { + "epoch": 1.5430836103298389, + "grad_norm": 35.666566900604174, + "learning_rate": 5.620929754034681e-06, + "loss": 2.7998, + "step": 18105 + }, + { + "epoch": 1.5431688400238643, + "grad_norm": 42.48688680573786, + "learning_rate": 5.620437741532936e-06, + "loss": 3.0902, + "step": 18106 + }, + { + "epoch": 1.5432540697178898, + "grad_norm": 48.97881197113256, + "learning_rate": 5.619945722929434e-06, + "loss": 2.0391, + "step": 18107 + }, + { + "epoch": 1.5433392994119153, + "grad_norm": 32.41852709361769, + "learning_rate": 5.619453698229011e-06, + "loss": 2.3975, + "step": 18108 + }, + { + "epoch": 1.5434245291059405, + "grad_norm": 24.67936811438023, + "learning_rate": 5.618961667436508e-06, + "loss": 2.1643, + "step": 18109 + }, + { + "epoch": 1.5435097587999658, + "grad_norm": 42.690999410530374, + "learning_rate": 5.618469630556762e-06, + "loss": 2.2588, + "step": 18110 + }, + { + "epoch": 1.5435949884939912, + "grad_norm": 89.72358421514927, + "learning_rate": 5.617977587594614e-06, + "loss": 3.6698, + "step": 18111 + }, + { + "epoch": 1.5436802181880167, + "grad_norm": 117.51909206248874, + "learning_rate": 5.617485538554901e-06, + "loss": 3.2791, + "step": 18112 + }, + { + "epoch": 1.5437654478820422, + "grad_norm": 93.78158957762888, + "learning_rate": 5.616993483442464e-06, + "loss": 3.7201, + "step": 18113 + }, + { + "epoch": 1.5438506775760676, + "grad_norm": 43.79424860480831, + "learning_rate": 5.61650142226214e-06, + "loss": 2.8854, + "step": 18114 + }, + { + "epoch": 1.5439359072700929, + "grad_norm": 74.89474260518402, + "learning_rate": 5.616009355018771e-06, + "loss": 2.4614, + "step": 18115 + }, + { + "epoch": 1.5440211369641181, + "grad_norm": 43.81110005732891, + "learning_rate": 5.615517281717193e-06, + "loss": 2.8836, + "step": 18116 + }, + { + "epoch": 1.5441063666581436, + "grad_norm": 40.6240721847552, + "learning_rate": 5.615025202362247e-06, + "loss": 3.1224, + "step": 18117 + }, + { + "epoch": 1.544191596352169, + "grad_norm": 64.57350686054998, + "learning_rate": 5.614533116958776e-06, + "loss": 2.9074, + "step": 18118 + }, + { + "epoch": 1.5442768260461945, + "grad_norm": 75.41083688997476, + "learning_rate": 5.614041025511613e-06, + "loss": 3.0054, + "step": 18119 + }, + { + "epoch": 1.54436205574022, + "grad_norm": 42.148222792639004, + "learning_rate": 5.613548928025601e-06, + "loss": 2.8675, + "step": 18120 + }, + { + "epoch": 1.5444472854342453, + "grad_norm": 49.99527090854444, + "learning_rate": 5.6130568245055785e-06, + "loss": 1.4047, + "step": 18121 + }, + { + "epoch": 1.5445325151282707, + "grad_norm": 49.98498415840487, + "learning_rate": 5.612564714956387e-06, + "loss": 3.0571, + "step": 18122 + }, + { + "epoch": 1.544617744822296, + "grad_norm": 50.93695518122067, + "learning_rate": 5.612072599382865e-06, + "loss": 3.3043, + "step": 18123 + }, + { + "epoch": 1.5447029745163214, + "grad_norm": 48.56116857172243, + "learning_rate": 5.611580477789851e-06, + "loss": 3.2368, + "step": 18124 + }, + { + "epoch": 1.544788204210347, + "grad_norm": 54.541952569606714, + "learning_rate": 5.6110883501821864e-06, + "loss": 3.4805, + "step": 18125 + }, + { + "epoch": 1.5448734339043724, + "grad_norm": 38.88725392611849, + "learning_rate": 5.610596216564711e-06, + "loss": 3.2129, + "step": 18126 + }, + { + "epoch": 1.5449586635983978, + "grad_norm": 56.310310228134924, + "learning_rate": 5.6101040769422655e-06, + "loss": 2.6087, + "step": 18127 + }, + { + "epoch": 1.545043893292423, + "grad_norm": 61.50950624730424, + "learning_rate": 5.609611931319686e-06, + "loss": 2.9336, + "step": 18128 + }, + { + "epoch": 1.5451291229864483, + "grad_norm": 30.323780879065744, + "learning_rate": 5.6091197797018185e-06, + "loss": 2.4213, + "step": 18129 + }, + { + "epoch": 1.5452143526804738, + "grad_norm": 43.265990489521926, + "learning_rate": 5.608627622093498e-06, + "loss": 3.3374, + "step": 18130 + }, + { + "epoch": 1.5452995823744993, + "grad_norm": 34.956762575243076, + "learning_rate": 5.608135458499568e-06, + "loss": 3.1113, + "step": 18131 + }, + { + "epoch": 1.5453848120685247, + "grad_norm": 77.2244819033768, + "learning_rate": 5.607643288924866e-06, + "loss": 2.8424, + "step": 18132 + }, + { + "epoch": 1.5454700417625502, + "grad_norm": 26.856045443524465, + "learning_rate": 5.607151113374235e-06, + "loss": 1.5589, + "step": 18133 + }, + { + "epoch": 1.5455552714565755, + "grad_norm": 99.24837768141997, + "learning_rate": 5.6066589318525135e-06, + "loss": 3.4916, + "step": 18134 + }, + { + "epoch": 1.545640501150601, + "grad_norm": 26.575093012234184, + "learning_rate": 5.606166744364543e-06, + "loss": 2.3601, + "step": 18135 + }, + { + "epoch": 1.5457257308446262, + "grad_norm": 33.00979769227196, + "learning_rate": 5.605674550915161e-06, + "loss": 2.7849, + "step": 18136 + }, + { + "epoch": 1.5458109605386516, + "grad_norm": 42.066645037479795, + "learning_rate": 5.605182351509213e-06, + "loss": 2.2735, + "step": 18137 + }, + { + "epoch": 1.545896190232677, + "grad_norm": 67.75164666879697, + "learning_rate": 5.604690146151535e-06, + "loss": 3.6205, + "step": 18138 + }, + { + "epoch": 1.5459814199267026, + "grad_norm": 60.958344702547734, + "learning_rate": 5.604197934846974e-06, + "loss": 2.8013, + "step": 18139 + }, + { + "epoch": 1.5460666496207278, + "grad_norm": 51.42690821215882, + "learning_rate": 5.603705717600361e-06, + "loss": 2.5618, + "step": 18140 + }, + { + "epoch": 1.5461518793147533, + "grad_norm": 35.368488532481784, + "learning_rate": 5.603213494416545e-06, + "loss": 2.3365, + "step": 18141 + }, + { + "epoch": 1.5462371090087785, + "grad_norm": 91.4299295456672, + "learning_rate": 5.602721265300365e-06, + "loss": 3.5105, + "step": 18142 + }, + { + "epoch": 1.546322338702804, + "grad_norm": 50.13916542594189, + "learning_rate": 5.6022290302566594e-06, + "loss": 2.5263, + "step": 18143 + }, + { + "epoch": 1.5464075683968295, + "grad_norm": 49.72266340809994, + "learning_rate": 5.601736789290269e-06, + "loss": 2.5461, + "step": 18144 + }, + { + "epoch": 1.546492798090855, + "grad_norm": 37.57005828210282, + "learning_rate": 5.6012445424060395e-06, + "loss": 3.4322, + "step": 18145 + }, + { + "epoch": 1.5465780277848804, + "grad_norm": 37.68975515987467, + "learning_rate": 5.600752289608807e-06, + "loss": 2.7795, + "step": 18146 + }, + { + "epoch": 1.5466632574789057, + "grad_norm": 38.10383229199186, + "learning_rate": 5.600260030903415e-06, + "loss": 3.0127, + "step": 18147 + }, + { + "epoch": 1.546748487172931, + "grad_norm": 38.10744928951762, + "learning_rate": 5.599767766294703e-06, + "loss": 2.5007, + "step": 18148 + }, + { + "epoch": 1.5468337168669564, + "grad_norm": 41.92864001153835, + "learning_rate": 5.599275495787515e-06, + "loss": 2.9845, + "step": 18149 + }, + { + "epoch": 1.5469189465609818, + "grad_norm": 55.11440681346016, + "learning_rate": 5.59878321938669e-06, + "loss": 2.5207, + "step": 18150 + }, + { + "epoch": 1.5470041762550073, + "grad_norm": 65.86863285616108, + "learning_rate": 5.59829093709707e-06, + "loss": 3.0325, + "step": 18151 + }, + { + "epoch": 1.5470894059490328, + "grad_norm": 60.93133419008024, + "learning_rate": 5.597798648923496e-06, + "loss": 1.9845, + "step": 18152 + }, + { + "epoch": 1.547174635643058, + "grad_norm": 31.828765284521072, + "learning_rate": 5.59730635487081e-06, + "loss": 2.1252, + "step": 18153 + }, + { + "epoch": 1.5472598653370835, + "grad_norm": 64.04021393969583, + "learning_rate": 5.596814054943853e-06, + "loss": 2.4338, + "step": 18154 + }, + { + "epoch": 1.5473450950311087, + "grad_norm": 38.604434535300385, + "learning_rate": 5.596321749147468e-06, + "loss": 3.0541, + "step": 18155 + }, + { + "epoch": 1.5474303247251342, + "grad_norm": 56.336368801938576, + "learning_rate": 5.595829437486494e-06, + "loss": 3.2451, + "step": 18156 + }, + { + "epoch": 1.5475155544191597, + "grad_norm": 42.20247483339012, + "learning_rate": 5.595337119965776e-06, + "loss": 3.2427, + "step": 18157 + }, + { + "epoch": 1.5476007841131851, + "grad_norm": 40.05808286934352, + "learning_rate": 5.594844796590153e-06, + "loss": 2.4533, + "step": 18158 + }, + { + "epoch": 1.5476860138072104, + "grad_norm": 48.22873182112453, + "learning_rate": 5.594352467364468e-06, + "loss": 3.7376, + "step": 18159 + }, + { + "epoch": 1.5477712435012358, + "grad_norm": 50.85942512069784, + "learning_rate": 5.593860132293562e-06, + "loss": 3.3268, + "step": 18160 + }, + { + "epoch": 1.547856473195261, + "grad_norm": 17.810694772614923, + "learning_rate": 5.593367791382279e-06, + "loss": 1.2666, + "step": 18161 + }, + { + "epoch": 1.5479417028892866, + "grad_norm": 34.844375665575654, + "learning_rate": 5.592875444635458e-06, + "loss": 2.6824, + "step": 18162 + }, + { + "epoch": 1.548026932583312, + "grad_norm": 138.51873747860677, + "learning_rate": 5.592383092057944e-06, + "loss": 2.6626, + "step": 18163 + }, + { + "epoch": 1.5481121622773375, + "grad_norm": 58.82757700948189, + "learning_rate": 5.591890733654576e-06, + "loss": 3.3086, + "step": 18164 + }, + { + "epoch": 1.548197391971363, + "grad_norm": 55.43250326887191, + "learning_rate": 5.5913983694302e-06, + "loss": 2.4043, + "step": 18165 + }, + { + "epoch": 1.5482826216653882, + "grad_norm": 32.570929396008914, + "learning_rate": 5.590905999389654e-06, + "loss": 2.9902, + "step": 18166 + }, + { + "epoch": 1.5483678513594135, + "grad_norm": 146.88260046097542, + "learning_rate": 5.590413623537783e-06, + "loss": 3.0072, + "step": 18167 + }, + { + "epoch": 1.548453081053439, + "grad_norm": 44.97258778348803, + "learning_rate": 5.589921241879428e-06, + "loss": 2.8598, + "step": 18168 + }, + { + "epoch": 1.5485383107474644, + "grad_norm": 61.381808198628576, + "learning_rate": 5.589428854419434e-06, + "loss": 3.292, + "step": 18169 + }, + { + "epoch": 1.5486235404414899, + "grad_norm": 52.01419500036774, + "learning_rate": 5.5889364611626405e-06, + "loss": 2.9844, + "step": 18170 + }, + { + "epoch": 1.5487087701355153, + "grad_norm": 36.138717786529064, + "learning_rate": 5.588444062113889e-06, + "loss": 2.42, + "step": 18171 + }, + { + "epoch": 1.5487939998295406, + "grad_norm": 44.32389529438798, + "learning_rate": 5.587951657278027e-06, + "loss": 3.2121, + "step": 18172 + }, + { + "epoch": 1.548879229523566, + "grad_norm": 83.75694219314447, + "learning_rate": 5.587459246659892e-06, + "loss": 2.3718, + "step": 18173 + }, + { + "epoch": 1.5489644592175913, + "grad_norm": 33.38138404212532, + "learning_rate": 5.5869668302643295e-06, + "loss": 2.3035, + "step": 18174 + }, + { + "epoch": 1.5490496889116168, + "grad_norm": 69.4775045005218, + "learning_rate": 5.586474408096181e-06, + "loss": 3.7663, + "step": 18175 + }, + { + "epoch": 1.5491349186056422, + "grad_norm": 31.07698638366556, + "learning_rate": 5.585981980160291e-06, + "loss": 3.0098, + "step": 18176 + }, + { + "epoch": 1.5492201482996677, + "grad_norm": 77.12666675795327, + "learning_rate": 5.5854895464615e-06, + "loss": 2.9398, + "step": 18177 + }, + { + "epoch": 1.5493053779936932, + "grad_norm": 42.21954350639841, + "learning_rate": 5.584997107004652e-06, + "loss": 4.0914, + "step": 18178 + }, + { + "epoch": 1.5493906076877184, + "grad_norm": 55.074550346106534, + "learning_rate": 5.584504661794588e-06, + "loss": 3.7577, + "step": 18179 + }, + { + "epoch": 1.5494758373817437, + "grad_norm": 42.564335336841005, + "learning_rate": 5.584012210836157e-06, + "loss": 2.2101, + "step": 18180 + }, + { + "epoch": 1.5495610670757691, + "grad_norm": 46.99843263001696, + "learning_rate": 5.583519754134196e-06, + "loss": 2.6332, + "step": 18181 + }, + { + "epoch": 1.5496462967697946, + "grad_norm": 22.980694962636207, + "learning_rate": 5.58302729169355e-06, + "loss": 1.7676, + "step": 18182 + }, + { + "epoch": 1.54973152646382, + "grad_norm": 63.32387679690594, + "learning_rate": 5.582534823519061e-06, + "loss": 3.6725, + "step": 18183 + }, + { + "epoch": 1.5498167561578455, + "grad_norm": 90.48328245237155, + "learning_rate": 5.582042349615575e-06, + "loss": 2.8651, + "step": 18184 + }, + { + "epoch": 1.5499019858518708, + "grad_norm": 43.16707483797226, + "learning_rate": 5.5815498699879346e-06, + "loss": 2.8856, + "step": 18185 + }, + { + "epoch": 1.549987215545896, + "grad_norm": 29.9113748685888, + "learning_rate": 5.581057384640981e-06, + "loss": 2.0265, + "step": 18186 + }, + { + "epoch": 1.5500724452399215, + "grad_norm": 54.19553052629164, + "learning_rate": 5.580564893579559e-06, + "loss": 2.7089, + "step": 18187 + }, + { + "epoch": 1.550157674933947, + "grad_norm": 72.50849438085139, + "learning_rate": 5.580072396808513e-06, + "loss": 2.4273, + "step": 18188 + }, + { + "epoch": 1.5502429046279724, + "grad_norm": 47.83657625244467, + "learning_rate": 5.579579894332684e-06, + "loss": 2.9261, + "step": 18189 + }, + { + "epoch": 1.550328134321998, + "grad_norm": 65.88618914637993, + "learning_rate": 5.579087386156917e-06, + "loss": 4.1349, + "step": 18190 + }, + { + "epoch": 1.5504133640160231, + "grad_norm": 46.12786168746538, + "learning_rate": 5.578594872286056e-06, + "loss": 2.7664, + "step": 18191 + }, + { + "epoch": 1.5504985937100486, + "grad_norm": 36.87031622181266, + "learning_rate": 5.578102352724944e-06, + "loss": 2.6545, + "step": 18192 + }, + { + "epoch": 1.5505838234040739, + "grad_norm": 89.48732355450747, + "learning_rate": 5.577609827478427e-06, + "loss": 2.6094, + "step": 18193 + }, + { + "epoch": 1.5506690530980993, + "grad_norm": 49.48827011693748, + "learning_rate": 5.577117296551345e-06, + "loss": 2.5072, + "step": 18194 + }, + { + "epoch": 1.5507542827921248, + "grad_norm": 32.76506740648924, + "learning_rate": 5.576624759948545e-06, + "loss": 2.4464, + "step": 18195 + }, + { + "epoch": 1.5508395124861503, + "grad_norm": 107.89283120831057, + "learning_rate": 5.576132217674868e-06, + "loss": 4.0308, + "step": 18196 + }, + { + "epoch": 1.5509247421801757, + "grad_norm": 100.28294217609833, + "learning_rate": 5.575639669735161e-06, + "loss": 4.4763, + "step": 18197 + }, + { + "epoch": 1.551009971874201, + "grad_norm": 72.64271841735312, + "learning_rate": 5.575147116134266e-06, + "loss": 2.8718, + "step": 18198 + }, + { + "epoch": 1.5510952015682262, + "grad_norm": 35.48671935047116, + "learning_rate": 5.574654556877028e-06, + "loss": 2.7362, + "step": 18199 + }, + { + "epoch": 1.5511804312622517, + "grad_norm": 82.11807988780318, + "learning_rate": 5.574161991968291e-06, + "loss": 3.1317, + "step": 18200 + }, + { + "epoch": 1.5512656609562772, + "grad_norm": 37.57080725705311, + "learning_rate": 5.5736694214128985e-06, + "loss": 2.499, + "step": 18201 + }, + { + "epoch": 1.5513508906503026, + "grad_norm": 32.71310602539737, + "learning_rate": 5.5731768452156944e-06, + "loss": 2.5579, + "step": 18202 + }, + { + "epoch": 1.551436120344328, + "grad_norm": 19.520176465708015, + "learning_rate": 5.572684263381525e-06, + "loss": 1.3269, + "step": 18203 + }, + { + "epoch": 1.5515213500383533, + "grad_norm": 40.653505100561645, + "learning_rate": 5.572191675915233e-06, + "loss": 2.4977, + "step": 18204 + }, + { + "epoch": 1.5516065797323788, + "grad_norm": 98.54751744949651, + "learning_rate": 5.571699082821664e-06, + "loss": 4.8998, + "step": 18205 + }, + { + "epoch": 1.551691809426404, + "grad_norm": 70.97106141804872, + "learning_rate": 5.57120648410566e-06, + "loss": 3.8366, + "step": 18206 + }, + { + "epoch": 1.5517770391204295, + "grad_norm": 57.67459422619611, + "learning_rate": 5.570713879772067e-06, + "loss": 2.7136, + "step": 18207 + }, + { + "epoch": 1.551862268814455, + "grad_norm": 84.53820007688363, + "learning_rate": 5.570221269825732e-06, + "loss": 3.4046, + "step": 18208 + }, + { + "epoch": 1.5519474985084805, + "grad_norm": 35.15826227802113, + "learning_rate": 5.5697286542714954e-06, + "loss": 2.5196, + "step": 18209 + }, + { + "epoch": 1.5520327282025057, + "grad_norm": 52.51136521290571, + "learning_rate": 5.569236033114205e-06, + "loss": 2.8089, + "step": 18210 + }, + { + "epoch": 1.5521179578965312, + "grad_norm": 25.704152756237452, + "learning_rate": 5.568743406358704e-06, + "loss": 1.7252, + "step": 18211 + }, + { + "epoch": 1.5522031875905564, + "grad_norm": 41.71226140057522, + "learning_rate": 5.568250774009838e-06, + "loss": 3.3644, + "step": 18212 + }, + { + "epoch": 1.552288417284582, + "grad_norm": 45.388440283092145, + "learning_rate": 5.567758136072452e-06, + "loss": 3.2682, + "step": 18213 + }, + { + "epoch": 1.5523736469786074, + "grad_norm": 53.9194558038147, + "learning_rate": 5.567265492551388e-06, + "loss": 2.7669, + "step": 18214 + }, + { + "epoch": 1.5524588766726328, + "grad_norm": 81.74485071053014, + "learning_rate": 5.566772843451495e-06, + "loss": 4.01, + "step": 18215 + }, + { + "epoch": 1.5525441063666583, + "grad_norm": 26.444956388558207, + "learning_rate": 5.566280188777618e-06, + "loss": 1.6485, + "step": 18216 + }, + { + "epoch": 1.5526293360606835, + "grad_norm": 33.494170399439525, + "learning_rate": 5.565787528534599e-06, + "loss": 2.0825, + "step": 18217 + }, + { + "epoch": 1.5527145657547088, + "grad_norm": 34.42506978378905, + "learning_rate": 5.565294862727282e-06, + "loss": 3.1734, + "step": 18218 + }, + { + "epoch": 1.5527997954487343, + "grad_norm": 58.17626387150256, + "learning_rate": 5.564802191360516e-06, + "loss": 3.4487, + "step": 18219 + }, + { + "epoch": 1.5528850251427597, + "grad_norm": 89.16605513759299, + "learning_rate": 5.564309514439146e-06, + "loss": 3.4239, + "step": 18220 + }, + { + "epoch": 1.5529702548367852, + "grad_norm": 84.17959419109894, + "learning_rate": 5.563816831968016e-06, + "loss": 4.1753, + "step": 18221 + }, + { + "epoch": 1.5530554845308107, + "grad_norm": 73.41469431294695, + "learning_rate": 5.56332414395197e-06, + "loss": 3.5683, + "step": 18222 + }, + { + "epoch": 1.553140714224836, + "grad_norm": 65.9263208854735, + "learning_rate": 5.562831450395857e-06, + "loss": 2.8032, + "step": 18223 + }, + { + "epoch": 1.5532259439188614, + "grad_norm": 42.5616008269279, + "learning_rate": 5.562338751304519e-06, + "loss": 3.5531, + "step": 18224 + }, + { + "epoch": 1.5533111736128866, + "grad_norm": 40.660913233200795, + "learning_rate": 5.561846046682804e-06, + "loss": 2.9936, + "step": 18225 + }, + { + "epoch": 1.553396403306912, + "grad_norm": 41.87086443236373, + "learning_rate": 5.561353336535554e-06, + "loss": 2.8444, + "step": 18226 + }, + { + "epoch": 1.5534816330009376, + "grad_norm": 34.550197267801614, + "learning_rate": 5.5608606208676195e-06, + "loss": 2.998, + "step": 18227 + }, + { + "epoch": 1.553566862694963, + "grad_norm": 40.565793021444684, + "learning_rate": 5.560367899683844e-06, + "loss": 2.7234, + "step": 18228 + }, + { + "epoch": 1.5536520923889883, + "grad_norm": 43.67477671228685, + "learning_rate": 5.559875172989071e-06, + "loss": 2.9889, + "step": 18229 + }, + { + "epoch": 1.5537373220830137, + "grad_norm": 87.38413750573284, + "learning_rate": 5.559382440788148e-06, + "loss": 2.8034, + "step": 18230 + }, + { + "epoch": 1.553822551777039, + "grad_norm": 48.96787159598533, + "learning_rate": 5.558889703085922e-06, + "loss": 2.595, + "step": 18231 + }, + { + "epoch": 1.5539077814710645, + "grad_norm": 37.40016595848496, + "learning_rate": 5.558396959887238e-06, + "loss": 2.6262, + "step": 18232 + }, + { + "epoch": 1.55399301116509, + "grad_norm": 47.22365409805452, + "learning_rate": 5.557904211196942e-06, + "loss": 2.4089, + "step": 18233 + }, + { + "epoch": 1.5540782408591154, + "grad_norm": 39.15383851328138, + "learning_rate": 5.557411457019878e-06, + "loss": 2.9109, + "step": 18234 + }, + { + "epoch": 1.5541634705531409, + "grad_norm": 57.80123757437536, + "learning_rate": 5.556918697360895e-06, + "loss": 2.7377, + "step": 18235 + }, + { + "epoch": 1.554248700247166, + "grad_norm": 39.45804541277245, + "learning_rate": 5.5564259322248384e-06, + "loss": 3.3368, + "step": 18236 + }, + { + "epoch": 1.5543339299411914, + "grad_norm": 27.139610987880776, + "learning_rate": 5.555933161616553e-06, + "loss": 2.4459, + "step": 18237 + }, + { + "epoch": 1.5544191596352168, + "grad_norm": 83.12838218393526, + "learning_rate": 5.555440385540885e-06, + "loss": 3.2621, + "step": 18238 + }, + { + "epoch": 1.5545043893292423, + "grad_norm": 29.787678562634913, + "learning_rate": 5.554947604002683e-06, + "loss": 2.0526, + "step": 18239 + }, + { + "epoch": 1.5545896190232678, + "grad_norm": 44.416537887700954, + "learning_rate": 5.554454817006792e-06, + "loss": 2.3322, + "step": 18240 + }, + { + "epoch": 1.5546748487172932, + "grad_norm": 36.50567937085356, + "learning_rate": 5.553962024558058e-06, + "loss": 2.9836, + "step": 18241 + }, + { + "epoch": 1.5547600784113185, + "grad_norm": 41.60065194135983, + "learning_rate": 5.553469226661326e-06, + "loss": 2.8998, + "step": 18242 + }, + { + "epoch": 1.554845308105344, + "grad_norm": 163.0012560611664, + "learning_rate": 5.552976423321447e-06, + "loss": 2.9105, + "step": 18243 + }, + { + "epoch": 1.5549305377993692, + "grad_norm": 34.1785574827673, + "learning_rate": 5.552483614543262e-06, + "loss": 2.9996, + "step": 18244 + }, + { + "epoch": 1.5550157674933947, + "grad_norm": 68.13242699449088, + "learning_rate": 5.551990800331621e-06, + "loss": 3.6251, + "step": 18245 + }, + { + "epoch": 1.5551009971874201, + "grad_norm": 74.77711587590667, + "learning_rate": 5.55149798069137e-06, + "loss": 3.5562, + "step": 18246 + }, + { + "epoch": 1.5551862268814456, + "grad_norm": 16.2276794616014, + "learning_rate": 5.551005155627357e-06, + "loss": 1.4981, + "step": 18247 + }, + { + "epoch": 1.555271456575471, + "grad_norm": 77.69380937253075, + "learning_rate": 5.550512325144425e-06, + "loss": 4.1937, + "step": 18248 + }, + { + "epoch": 1.5553566862694963, + "grad_norm": 38.15306430946607, + "learning_rate": 5.5500194892474235e-06, + "loss": 3.2437, + "step": 18249 + }, + { + "epoch": 1.5554419159635215, + "grad_norm": 54.96551412451042, + "learning_rate": 5.549526647941199e-06, + "loss": 3.3242, + "step": 18250 + }, + { + "epoch": 1.555527145657547, + "grad_norm": 42.4618137693942, + "learning_rate": 5.549033801230599e-06, + "loss": 3.0998, + "step": 18251 + }, + { + "epoch": 1.5556123753515725, + "grad_norm": 116.51123131883026, + "learning_rate": 5.548540949120469e-06, + "loss": 3.9473, + "step": 18252 + }, + { + "epoch": 1.555697605045598, + "grad_norm": 33.60082348866802, + "learning_rate": 5.548048091615657e-06, + "loss": 2.6824, + "step": 18253 + }, + { + "epoch": 1.5557828347396234, + "grad_norm": 70.69026266621198, + "learning_rate": 5.547555228721009e-06, + "loss": 4.192, + "step": 18254 + }, + { + "epoch": 1.5558680644336487, + "grad_norm": 43.33151215899181, + "learning_rate": 5.547062360441374e-06, + "loss": 2.9516, + "step": 18255 + }, + { + "epoch": 1.5559532941276741, + "grad_norm": 67.79616073960798, + "learning_rate": 5.546569486781598e-06, + "loss": 2.4416, + "step": 18256 + }, + { + "epoch": 1.5560385238216994, + "grad_norm": 64.16073108115097, + "learning_rate": 5.546076607746528e-06, + "loss": 2.9687, + "step": 18257 + }, + { + "epoch": 1.5561237535157249, + "grad_norm": 31.811604471147795, + "learning_rate": 5.5455837233410115e-06, + "loss": 2.646, + "step": 18258 + }, + { + "epoch": 1.5562089832097503, + "grad_norm": 46.07571122772015, + "learning_rate": 5.545090833569896e-06, + "loss": 3.0834, + "step": 18259 + }, + { + "epoch": 1.5562942129037758, + "grad_norm": 45.63617747191829, + "learning_rate": 5.544597938438029e-06, + "loss": 3.2957, + "step": 18260 + }, + { + "epoch": 1.556379442597801, + "grad_norm": 19.09318866289891, + "learning_rate": 5.544105037950256e-06, + "loss": 1.1541, + "step": 18261 + }, + { + "epoch": 1.5564646722918265, + "grad_norm": 40.90021621119436, + "learning_rate": 5.543612132111429e-06, + "loss": 2.8734, + "step": 18262 + }, + { + "epoch": 1.5565499019858517, + "grad_norm": 37.9734078798367, + "learning_rate": 5.543119220926391e-06, + "loss": 2.5098, + "step": 18263 + }, + { + "epoch": 1.5566351316798772, + "grad_norm": 55.798110502022894, + "learning_rate": 5.542626304399992e-06, + "loss": 4.1012, + "step": 18264 + }, + { + "epoch": 1.5567203613739027, + "grad_norm": 46.54792771385106, + "learning_rate": 5.542133382537076e-06, + "loss": 3.3178, + "step": 18265 + }, + { + "epoch": 1.5568055910679282, + "grad_norm": 69.49295404265344, + "learning_rate": 5.541640455342497e-06, + "loss": 3.4355, + "step": 18266 + }, + { + "epoch": 1.5568908207619536, + "grad_norm": 53.12373829913681, + "learning_rate": 5.541147522821099e-06, + "loss": 3.0232, + "step": 18267 + }, + { + "epoch": 1.5569760504559789, + "grad_norm": 52.4359934830738, + "learning_rate": 5.540654584977729e-06, + "loss": 2.0968, + "step": 18268 + }, + { + "epoch": 1.5570612801500041, + "grad_norm": 55.34100803626891, + "learning_rate": 5.540161641817235e-06, + "loss": 2.6816, + "step": 18269 + }, + { + "epoch": 1.5571465098440296, + "grad_norm": 52.95296651355686, + "learning_rate": 5.539668693344469e-06, + "loss": 2.8511, + "step": 18270 + }, + { + "epoch": 1.557231739538055, + "grad_norm": 39.45355585093643, + "learning_rate": 5.5391757395642735e-06, + "loss": 2.9789, + "step": 18271 + }, + { + "epoch": 1.5573169692320805, + "grad_norm": 114.72061686626918, + "learning_rate": 5.5386827804815e-06, + "loss": 3.6259, + "step": 18272 + }, + { + "epoch": 1.557402198926106, + "grad_norm": 36.402715469142095, + "learning_rate": 5.538189816100995e-06, + "loss": 2.9646, + "step": 18273 + }, + { + "epoch": 1.5574874286201312, + "grad_norm": 41.836725074584855, + "learning_rate": 5.537696846427607e-06, + "loss": 2.9168, + "step": 18274 + }, + { + "epoch": 1.5575726583141567, + "grad_norm": 43.365063354998966, + "learning_rate": 5.537203871466184e-06, + "loss": 3.3767, + "step": 18275 + }, + { + "epoch": 1.557657888008182, + "grad_norm": 49.61347002562971, + "learning_rate": 5.536710891221575e-06, + "loss": 2.8968, + "step": 18276 + }, + { + "epoch": 1.5577431177022074, + "grad_norm": 90.17897272900802, + "learning_rate": 5.5362179056986265e-06, + "loss": 2.8315, + "step": 18277 + }, + { + "epoch": 1.5578283473962329, + "grad_norm": 53.531054127442886, + "learning_rate": 5.535724914902189e-06, + "loss": 3.1996, + "step": 18278 + }, + { + "epoch": 1.5579135770902584, + "grad_norm": 37.79980704743193, + "learning_rate": 5.53523191883711e-06, + "loss": 2.6726, + "step": 18279 + }, + { + "epoch": 1.5579988067842836, + "grad_norm": 58.854374780917276, + "learning_rate": 5.534738917508237e-06, + "loss": 4.0432, + "step": 18280 + }, + { + "epoch": 1.558084036478309, + "grad_norm": 37.99333140574082, + "learning_rate": 5.534245910920418e-06, + "loss": 1.9714, + "step": 18281 + }, + { + "epoch": 1.5581692661723343, + "grad_norm": 32.482046737049224, + "learning_rate": 5.533752899078505e-06, + "loss": 2.9179, + "step": 18282 + }, + { + "epoch": 1.5582544958663598, + "grad_norm": 41.4944674610871, + "learning_rate": 5.533259881987344e-06, + "loss": 2.8091, + "step": 18283 + }, + { + "epoch": 1.5583397255603852, + "grad_norm": 43.21468639174106, + "learning_rate": 5.532766859651783e-06, + "loss": 3.495, + "step": 18284 + }, + { + "epoch": 1.5584249552544107, + "grad_norm": 52.45210864590118, + "learning_rate": 5.532273832076672e-06, + "loss": 1.9755, + "step": 18285 + }, + { + "epoch": 1.5585101849484362, + "grad_norm": 50.11431384440643, + "learning_rate": 5.531780799266859e-06, + "loss": 3.8173, + "step": 18286 + }, + { + "epoch": 1.5585954146424614, + "grad_norm": 30.18348539860769, + "learning_rate": 5.531287761227194e-06, + "loss": 2.0716, + "step": 18287 + }, + { + "epoch": 1.5586806443364867, + "grad_norm": 35.726381737477915, + "learning_rate": 5.530794717962524e-06, + "loss": 2.8598, + "step": 18288 + }, + { + "epoch": 1.5587658740305121, + "grad_norm": 64.80679382955871, + "learning_rate": 5.5303016694777e-06, + "loss": 2.259, + "step": 18289 + }, + { + "epoch": 1.5588511037245376, + "grad_norm": 73.37583404489322, + "learning_rate": 5.52980861577757e-06, + "loss": 3.4837, + "step": 18290 + }, + { + "epoch": 1.558936333418563, + "grad_norm": 40.71067564174292, + "learning_rate": 5.529315556866982e-06, + "loss": 2.9677, + "step": 18291 + }, + { + "epoch": 1.5590215631125885, + "grad_norm": 37.39900577628053, + "learning_rate": 5.5288224927507865e-06, + "loss": 2.951, + "step": 18292 + }, + { + "epoch": 1.5591067928066138, + "grad_norm": 52.197390704402835, + "learning_rate": 5.5283294234338306e-06, + "loss": 2.8683, + "step": 18293 + }, + { + "epoch": 1.5591920225006393, + "grad_norm": 84.32860181715525, + "learning_rate": 5.527836348920965e-06, + "loss": 4.2432, + "step": 18294 + }, + { + "epoch": 1.5592772521946645, + "grad_norm": 66.75827252078975, + "learning_rate": 5.527343269217039e-06, + "loss": 3.6774, + "step": 18295 + }, + { + "epoch": 1.55936248188869, + "grad_norm": 46.46788193301844, + "learning_rate": 5.5268501843269015e-06, + "loss": 2.4198, + "step": 18296 + }, + { + "epoch": 1.5594477115827154, + "grad_norm": 52.461010873088895, + "learning_rate": 5.526357094255402e-06, + "loss": 2.4932, + "step": 18297 + }, + { + "epoch": 1.559532941276741, + "grad_norm": 112.50698024094015, + "learning_rate": 5.525863999007391e-06, + "loss": 4.8048, + "step": 18298 + }, + { + "epoch": 1.5596181709707662, + "grad_norm": 33.95670521151947, + "learning_rate": 5.525370898587714e-06, + "loss": 2.8651, + "step": 18299 + }, + { + "epoch": 1.5597034006647916, + "grad_norm": 98.65307536603814, + "learning_rate": 5.5248777930012244e-06, + "loss": 2.7366, + "step": 18300 + }, + { + "epoch": 1.5597886303588169, + "grad_norm": 47.48575688656525, + "learning_rate": 5.52438468225277e-06, + "loss": 2.4839, + "step": 18301 + }, + { + "epoch": 1.5598738600528423, + "grad_norm": 71.23389462798522, + "learning_rate": 5.5238915663472e-06, + "loss": 2.4227, + "step": 18302 + }, + { + "epoch": 1.5599590897468678, + "grad_norm": 41.71391199053138, + "learning_rate": 5.5233984452893665e-06, + "loss": 3.1981, + "step": 18303 + }, + { + "epoch": 1.5600443194408933, + "grad_norm": 44.817574209553534, + "learning_rate": 5.522905319084114e-06, + "loss": 2.0712, + "step": 18304 + }, + { + "epoch": 1.5601295491349187, + "grad_norm": 46.99033482406874, + "learning_rate": 5.522412187736299e-06, + "loss": 3.3728, + "step": 18305 + }, + { + "epoch": 1.560214778828944, + "grad_norm": 80.88654831856388, + "learning_rate": 5.521919051250766e-06, + "loss": 2.2489, + "step": 18306 + }, + { + "epoch": 1.5603000085229692, + "grad_norm": 91.45719655307194, + "learning_rate": 5.521425909632368e-06, + "loss": 3.3625, + "step": 18307 + }, + { + "epoch": 1.5603852382169947, + "grad_norm": 40.338769494876274, + "learning_rate": 5.52093276288595e-06, + "loss": 2.2995, + "step": 18308 + }, + { + "epoch": 1.5604704679110202, + "grad_norm": 36.011401056431076, + "learning_rate": 5.5204396110163676e-06, + "loss": 2.999, + "step": 18309 + }, + { + "epoch": 1.5605556976050456, + "grad_norm": 37.533103099265055, + "learning_rate": 5.5199464540284675e-06, + "loss": 2.7801, + "step": 18310 + }, + { + "epoch": 1.5606409272990711, + "grad_norm": 47.703845345830665, + "learning_rate": 5.5194532919271025e-06, + "loss": 3.4175, + "step": 18311 + }, + { + "epoch": 1.5607261569930964, + "grad_norm": 19.785911811236865, + "learning_rate": 5.518960124717117e-06, + "loss": 1.6213, + "step": 18312 + }, + { + "epoch": 1.5608113866871218, + "grad_norm": 145.9396851604314, + "learning_rate": 5.51846695240337e-06, + "loss": 3.9669, + "step": 18313 + }, + { + "epoch": 1.560896616381147, + "grad_norm": 38.90039973030705, + "learning_rate": 5.517973774990703e-06, + "loss": 3.3345, + "step": 18314 + }, + { + "epoch": 1.5609818460751725, + "grad_norm": 73.22411195848973, + "learning_rate": 5.517480592483972e-06, + "loss": 3.0705, + "step": 18315 + }, + { + "epoch": 1.561067075769198, + "grad_norm": 44.0996398381767, + "learning_rate": 5.516987404888022e-06, + "loss": 2.3875, + "step": 18316 + }, + { + "epoch": 1.5611523054632235, + "grad_norm": 50.829786091613414, + "learning_rate": 5.516494212207708e-06, + "loss": 2.5656, + "step": 18317 + }, + { + "epoch": 1.561237535157249, + "grad_norm": 51.0519804382215, + "learning_rate": 5.51600101444788e-06, + "loss": 3.2264, + "step": 18318 + }, + { + "epoch": 1.5613227648512742, + "grad_norm": 35.41906805492404, + "learning_rate": 5.515507811613385e-06, + "loss": 2.8818, + "step": 18319 + }, + { + "epoch": 1.5614079945452994, + "grad_norm": 36.175670985997584, + "learning_rate": 5.515014603709077e-06, + "loss": 2.5245, + "step": 18320 + }, + { + "epoch": 1.561493224239325, + "grad_norm": 24.67018068879361, + "learning_rate": 5.514521390739803e-06, + "loss": 2.1414, + "step": 18321 + }, + { + "epoch": 1.5615784539333504, + "grad_norm": 38.10619468610157, + "learning_rate": 5.514028172710419e-06, + "loss": 1.331, + "step": 18322 + }, + { + "epoch": 1.5616636836273758, + "grad_norm": 42.688461259627175, + "learning_rate": 5.513534949625769e-06, + "loss": 2.4765, + "step": 18323 + }, + { + "epoch": 1.5617489133214013, + "grad_norm": 35.10117340234959, + "learning_rate": 5.513041721490707e-06, + "loss": 1.9685, + "step": 18324 + }, + { + "epoch": 1.5618341430154266, + "grad_norm": 33.55865481316497, + "learning_rate": 5.512548488310084e-06, + "loss": 2.6484, + "step": 18325 + }, + { + "epoch": 1.561919372709452, + "grad_norm": 41.87566547029967, + "learning_rate": 5.5120552500887515e-06, + "loss": 2.0097, + "step": 18326 + }, + { + "epoch": 1.5620046024034773, + "grad_norm": 75.83408911877773, + "learning_rate": 5.511562006831558e-06, + "loss": 3.3365, + "step": 18327 + }, + { + "epoch": 1.5620898320975027, + "grad_norm": 36.150120803292616, + "learning_rate": 5.5110687585433556e-06, + "loss": 3.2017, + "step": 18328 + }, + { + "epoch": 1.5621750617915282, + "grad_norm": 42.30204718698227, + "learning_rate": 5.510575505228994e-06, + "loss": 2.8584, + "step": 18329 + }, + { + "epoch": 1.5622602914855537, + "grad_norm": 65.41490670386617, + "learning_rate": 5.510082246893328e-06, + "loss": 3.7985, + "step": 18330 + }, + { + "epoch": 1.562345521179579, + "grad_norm": 29.580458897486494, + "learning_rate": 5.509588983541203e-06, + "loss": 1.6663, + "step": 18331 + }, + { + "epoch": 1.5624307508736044, + "grad_norm": 97.86604215874749, + "learning_rate": 5.509095715177475e-06, + "loss": 2.8694, + "step": 18332 + }, + { + "epoch": 1.5625159805676296, + "grad_norm": 47.65158219522198, + "learning_rate": 5.508602441806992e-06, + "loss": 2.5995, + "step": 18333 + }, + { + "epoch": 1.562601210261655, + "grad_norm": 58.357648437376334, + "learning_rate": 5.508109163434605e-06, + "loss": 3.1497, + "step": 18334 + }, + { + "epoch": 1.5626864399556806, + "grad_norm": 103.74037171872274, + "learning_rate": 5.507615880065168e-06, + "loss": 3.8174, + "step": 18335 + }, + { + "epoch": 1.562771669649706, + "grad_norm": 81.83491300468745, + "learning_rate": 5.507122591703529e-06, + "loss": 2.9636, + "step": 18336 + }, + { + "epoch": 1.5628568993437315, + "grad_norm": 44.2554962557995, + "learning_rate": 5.506629298354542e-06, + "loss": 2.2437, + "step": 18337 + }, + { + "epoch": 1.5629421290377568, + "grad_norm": 70.96468978619993, + "learning_rate": 5.506136000023057e-06, + "loss": 3.1681, + "step": 18338 + }, + { + "epoch": 1.563027358731782, + "grad_norm": 36.09083694664791, + "learning_rate": 5.505642696713925e-06, + "loss": 3.1391, + "step": 18339 + }, + { + "epoch": 1.5631125884258075, + "grad_norm": 130.6719048400823, + "learning_rate": 5.505149388431999e-06, + "loss": 2.7682, + "step": 18340 + }, + { + "epoch": 1.563197818119833, + "grad_norm": 37.54161706829668, + "learning_rate": 5.504656075182129e-06, + "loss": 3.027, + "step": 18341 + }, + { + "epoch": 1.5632830478138584, + "grad_norm": 38.75071091870531, + "learning_rate": 5.5041627569691655e-06, + "loss": 3.643, + "step": 18342 + }, + { + "epoch": 1.5633682775078839, + "grad_norm": 64.46555165302458, + "learning_rate": 5.503669433797963e-06, + "loss": 3.2586, + "step": 18343 + }, + { + "epoch": 1.5634535072019091, + "grad_norm": 35.282593163555894, + "learning_rate": 5.503176105673371e-06, + "loss": 2.6576, + "step": 18344 + }, + { + "epoch": 1.5635387368959346, + "grad_norm": 41.93876108213661, + "learning_rate": 5.5026827726002444e-06, + "loss": 2.8886, + "step": 18345 + }, + { + "epoch": 1.5636239665899598, + "grad_norm": 98.63202643190476, + "learning_rate": 5.50218943458343e-06, + "loss": 3.4363, + "step": 18346 + }, + { + "epoch": 1.5637091962839853, + "grad_norm": 72.95882865468948, + "learning_rate": 5.501696091627783e-06, + "loss": 3.2914, + "step": 18347 + }, + { + "epoch": 1.5637944259780108, + "grad_norm": 27.307509321708377, + "learning_rate": 5.501202743738155e-06, + "loss": 1.4522, + "step": 18348 + }, + { + "epoch": 1.5638796556720362, + "grad_norm": 30.705959406471678, + "learning_rate": 5.500709390919395e-06, + "loss": 1.9819, + "step": 18349 + }, + { + "epoch": 1.5639648853660615, + "grad_norm": 57.67991937764346, + "learning_rate": 5.5002160331763585e-06, + "loss": 3.528, + "step": 18350 + }, + { + "epoch": 1.564050115060087, + "grad_norm": 43.32288457000552, + "learning_rate": 5.499722670513895e-06, + "loss": 2.8768, + "step": 18351 + }, + { + "epoch": 1.5641353447541122, + "grad_norm": 62.128901044011364, + "learning_rate": 5.4992293029368595e-06, + "loss": 2.9172, + "step": 18352 + }, + { + "epoch": 1.5642205744481377, + "grad_norm": 34.69754234463885, + "learning_rate": 5.498735930450101e-06, + "loss": 2.8865, + "step": 18353 + }, + { + "epoch": 1.5643058041421631, + "grad_norm": 46.79592219192554, + "learning_rate": 5.498242553058474e-06, + "loss": 2.7402, + "step": 18354 + }, + { + "epoch": 1.5643910338361886, + "grad_norm": 75.84247733191114, + "learning_rate": 5.4977491707668276e-06, + "loss": 3.7296, + "step": 18355 + }, + { + "epoch": 1.564476263530214, + "grad_norm": 32.84555855385356, + "learning_rate": 5.497255783580018e-06, + "loss": 2.6209, + "step": 18356 + }, + { + "epoch": 1.5645614932242393, + "grad_norm": 47.11218777055328, + "learning_rate": 5.496762391502894e-06, + "loss": 3.1366, + "step": 18357 + }, + { + "epoch": 1.5646467229182646, + "grad_norm": 79.05644891617553, + "learning_rate": 5.496268994540309e-06, + "loss": 4.0991, + "step": 18358 + }, + { + "epoch": 1.56473195261229, + "grad_norm": 34.6978280662886, + "learning_rate": 5.495775592697114e-06, + "loss": 3.0473, + "step": 18359 + }, + { + "epoch": 1.5648171823063155, + "grad_norm": 41.1473207352679, + "learning_rate": 5.495282185978167e-06, + "loss": 2.7367, + "step": 18360 + }, + { + "epoch": 1.564902412000341, + "grad_norm": 32.673179555898166, + "learning_rate": 5.494788774388315e-06, + "loss": 2.3235, + "step": 18361 + }, + { + "epoch": 1.5649876416943664, + "grad_norm": 53.04659272220216, + "learning_rate": 5.494295357932412e-06, + "loss": 3.657, + "step": 18362 + }, + { + "epoch": 1.5650728713883917, + "grad_norm": 41.74569271743695, + "learning_rate": 5.493801936615309e-06, + "loss": 2.5988, + "step": 18363 + }, + { + "epoch": 1.5651581010824172, + "grad_norm": 96.48707670278624, + "learning_rate": 5.493308510441862e-06, + "loss": 3.4889, + "step": 18364 + }, + { + "epoch": 1.5652433307764424, + "grad_norm": 38.06345492960028, + "learning_rate": 5.492815079416921e-06, + "loss": 2.8418, + "step": 18365 + }, + { + "epoch": 1.5653285604704679, + "grad_norm": 82.56620562994081, + "learning_rate": 5.49232164354534e-06, + "loss": 3.9968, + "step": 18366 + }, + { + "epoch": 1.5654137901644933, + "grad_norm": 30.26139771832894, + "learning_rate": 5.49182820283197e-06, + "loss": 2.184, + "step": 18367 + }, + { + "epoch": 1.5654990198585188, + "grad_norm": 113.11117824441352, + "learning_rate": 5.4913347572816664e-06, + "loss": 2.6375, + "step": 18368 + }, + { + "epoch": 1.5655842495525443, + "grad_norm": 24.520188885123172, + "learning_rate": 5.490841306899282e-06, + "loss": 1.4114, + "step": 18369 + }, + { + "epoch": 1.5656694792465695, + "grad_norm": 66.9303799314546, + "learning_rate": 5.490347851689666e-06, + "loss": 2.2126, + "step": 18370 + }, + { + "epoch": 1.5657547089405948, + "grad_norm": 68.90354192535341, + "learning_rate": 5.4898543916576745e-06, + "loss": 2.611, + "step": 18371 + }, + { + "epoch": 1.5658399386346202, + "grad_norm": 41.524917571695966, + "learning_rate": 5.489360926808159e-06, + "loss": 3.2154, + "step": 18372 + }, + { + "epoch": 1.5659251683286457, + "grad_norm": 35.98975669725305, + "learning_rate": 5.488867457145974e-06, + "loss": 2.6666, + "step": 18373 + }, + { + "epoch": 1.5660103980226712, + "grad_norm": 38.978240458487264, + "learning_rate": 5.488373982675972e-06, + "loss": 2.856, + "step": 18374 + }, + { + "epoch": 1.5660956277166966, + "grad_norm": 54.856590531189816, + "learning_rate": 5.487880503403004e-06, + "loss": 3.2863, + "step": 18375 + }, + { + "epoch": 1.5661808574107219, + "grad_norm": 27.082248845059432, + "learning_rate": 5.487387019331929e-06, + "loss": 3.038, + "step": 18376 + }, + { + "epoch": 1.5662660871047471, + "grad_norm": 111.72762102441986, + "learning_rate": 5.486893530467593e-06, + "loss": 3.7399, + "step": 18377 + }, + { + "epoch": 1.5663513167987726, + "grad_norm": 24.545996249201867, + "learning_rate": 5.486400036814854e-06, + "loss": 1.9224, + "step": 18378 + }, + { + "epoch": 1.566436546492798, + "grad_norm": 31.482116189879243, + "learning_rate": 5.485906538378563e-06, + "loss": 2.5951, + "step": 18379 + }, + { + "epoch": 1.5665217761868235, + "grad_norm": 63.586582179255835, + "learning_rate": 5.485413035163576e-06, + "loss": 2.9646, + "step": 18380 + }, + { + "epoch": 1.566607005880849, + "grad_norm": 40.19891882221824, + "learning_rate": 5.484919527174744e-06, + "loss": 2.6835, + "step": 18381 + }, + { + "epoch": 1.5666922355748742, + "grad_norm": 65.15321365811732, + "learning_rate": 5.484426014416919e-06, + "loss": 3.0304, + "step": 18382 + }, + { + "epoch": 1.5667774652688997, + "grad_norm": 36.779783406477904, + "learning_rate": 5.483932496894958e-06, + "loss": 2.6982, + "step": 18383 + }, + { + "epoch": 1.566862694962925, + "grad_norm": 40.55050566194029, + "learning_rate": 5.483438974613715e-06, + "loss": 2.727, + "step": 18384 + }, + { + "epoch": 1.5669479246569504, + "grad_norm": 46.343184966492544, + "learning_rate": 5.48294544757804e-06, + "loss": 2.9145, + "step": 18385 + }, + { + "epoch": 1.567033154350976, + "grad_norm": 89.22138087807507, + "learning_rate": 5.482451915792788e-06, + "loss": 3.5696, + "step": 18386 + }, + { + "epoch": 1.5671183840450014, + "grad_norm": 33.18228102880647, + "learning_rate": 5.481958379262814e-06, + "loss": 1.8855, + "step": 18387 + }, + { + "epoch": 1.5672036137390268, + "grad_norm": 68.92482479099739, + "learning_rate": 5.481464837992971e-06, + "loss": 3.3702, + "step": 18388 + }, + { + "epoch": 1.567288843433052, + "grad_norm": 43.67140854343811, + "learning_rate": 5.480971291988112e-06, + "loss": 2.5693, + "step": 18389 + }, + { + "epoch": 1.5673740731270773, + "grad_norm": 47.986266152502566, + "learning_rate": 5.480477741253092e-06, + "loss": 2.2639, + "step": 18390 + }, + { + "epoch": 1.5674593028211028, + "grad_norm": 104.9249687404464, + "learning_rate": 5.479984185792765e-06, + "loss": 4.0221, + "step": 18391 + }, + { + "epoch": 1.5675445325151283, + "grad_norm": 35.87229206833829, + "learning_rate": 5.479490625611983e-06, + "loss": 3.0541, + "step": 18392 + }, + { + "epoch": 1.5676297622091537, + "grad_norm": 31.373808150552193, + "learning_rate": 5.478997060715601e-06, + "loss": 2.1351, + "step": 18393 + }, + { + "epoch": 1.5677149919031792, + "grad_norm": 70.30618065946643, + "learning_rate": 5.478503491108473e-06, + "loss": 2.5622, + "step": 18394 + }, + { + "epoch": 1.5678002215972044, + "grad_norm": 56.99111802397554, + "learning_rate": 5.478009916795456e-06, + "loss": 2.7101, + "step": 18395 + }, + { + "epoch": 1.56788545129123, + "grad_norm": 33.45853954170165, + "learning_rate": 5.477516337781399e-06, + "loss": 2.5868, + "step": 18396 + }, + { + "epoch": 1.5679706809852552, + "grad_norm": 35.78839486710374, + "learning_rate": 5.477022754071159e-06, + "loss": 3.4064, + "step": 18397 + }, + { + "epoch": 1.5680559106792806, + "grad_norm": 41.19704935887193, + "learning_rate": 5.4765291656695895e-06, + "loss": 2.7924, + "step": 18398 + }, + { + "epoch": 1.568141140373306, + "grad_norm": 31.007467396351856, + "learning_rate": 5.476035572581546e-06, + "loss": 2.6069, + "step": 18399 + }, + { + "epoch": 1.5682263700673316, + "grad_norm": 31.12653773670747, + "learning_rate": 5.475541974811881e-06, + "loss": 2.3229, + "step": 18400 + }, + { + "epoch": 1.5683115997613568, + "grad_norm": 44.18897160771315, + "learning_rate": 5.475048372365451e-06, + "loss": 2.2751, + "step": 18401 + }, + { + "epoch": 1.5683968294553823, + "grad_norm": 54.05087094798, + "learning_rate": 5.474554765247107e-06, + "loss": 3.0148, + "step": 18402 + }, + { + "epoch": 1.5684820591494075, + "grad_norm": 43.45853400910801, + "learning_rate": 5.4740611534617065e-06, + "loss": 2.4206, + "step": 18403 + }, + { + "epoch": 1.568567288843433, + "grad_norm": 127.80830528923967, + "learning_rate": 5.473567537014104e-06, + "loss": 3.2013, + "step": 18404 + }, + { + "epoch": 1.5686525185374585, + "grad_norm": 87.63656705564682, + "learning_rate": 5.473073915909151e-06, + "loss": 3.1637, + "step": 18405 + }, + { + "epoch": 1.568737748231484, + "grad_norm": 40.53861431079327, + "learning_rate": 5.472580290151705e-06, + "loss": 2.7327, + "step": 18406 + }, + { + "epoch": 1.5688229779255094, + "grad_norm": 29.71924120394537, + "learning_rate": 5.4720866597466185e-06, + "loss": 2.5848, + "step": 18407 + }, + { + "epoch": 1.5689082076195346, + "grad_norm": 94.72875245802065, + "learning_rate": 5.471593024698749e-06, + "loss": 3.794, + "step": 18408 + }, + { + "epoch": 1.56899343731356, + "grad_norm": 50.23263144398061, + "learning_rate": 5.4710993850129475e-06, + "loss": 3.1753, + "step": 18409 + }, + { + "epoch": 1.5690786670075854, + "grad_norm": 49.77367413496988, + "learning_rate": 5.470605740694073e-06, + "loss": 2.1534, + "step": 18410 + }, + { + "epoch": 1.5691638967016108, + "grad_norm": 33.39315161120185, + "learning_rate": 5.470112091746977e-06, + "loss": 2.6639, + "step": 18411 + }, + { + "epoch": 1.5692491263956363, + "grad_norm": 23.21280340566586, + "learning_rate": 5.469618438176515e-06, + "loss": 1.7864, + "step": 18412 + }, + { + "epoch": 1.5693343560896618, + "grad_norm": 24.376735533641163, + "learning_rate": 5.469124779987542e-06, + "loss": 1.8822, + "step": 18413 + }, + { + "epoch": 1.569419585783687, + "grad_norm": 37.14144840405293, + "learning_rate": 5.468631117184914e-06, + "loss": 3.2212, + "step": 18414 + }, + { + "epoch": 1.5695048154777125, + "grad_norm": 65.62850291772467, + "learning_rate": 5.468137449773485e-06, + "loss": 3.0506, + "step": 18415 + }, + { + "epoch": 1.5695900451717377, + "grad_norm": 41.92413882270536, + "learning_rate": 5.46764377775811e-06, + "loss": 3.1233, + "step": 18416 + }, + { + "epoch": 1.5696752748657632, + "grad_norm": 50.03541066020432, + "learning_rate": 5.467150101143644e-06, + "loss": 2.7736, + "step": 18417 + }, + { + "epoch": 1.5697605045597887, + "grad_norm": 71.60481640828881, + "learning_rate": 5.466656419934944e-06, + "loss": 4.5568, + "step": 18418 + }, + { + "epoch": 1.5698457342538141, + "grad_norm": 34.99547753491991, + "learning_rate": 5.466162734136863e-06, + "loss": 2.3912, + "step": 18419 + }, + { + "epoch": 1.5699309639478394, + "grad_norm": 41.44800999878653, + "learning_rate": 5.4656690437542545e-06, + "loss": 2.68, + "step": 18420 + }, + { + "epoch": 1.5700161936418648, + "grad_norm": 60.00631080040735, + "learning_rate": 5.465175348791978e-06, + "loss": 3.1172, + "step": 18421 + }, + { + "epoch": 1.57010142333589, + "grad_norm": 70.22202113350151, + "learning_rate": 5.464681649254886e-06, + "loss": 2.1692, + "step": 18422 + }, + { + "epoch": 1.5701866530299156, + "grad_norm": 33.876217454398066, + "learning_rate": 5.464187945147836e-06, + "loss": 2.3178, + "step": 18423 + }, + { + "epoch": 1.570271882723941, + "grad_norm": 65.5064085756877, + "learning_rate": 5.463694236475681e-06, + "loss": 3.271, + "step": 18424 + }, + { + "epoch": 1.5703571124179665, + "grad_norm": 67.78180445290528, + "learning_rate": 5.463200523243277e-06, + "loss": 3.9917, + "step": 18425 + }, + { + "epoch": 1.570442342111992, + "grad_norm": 41.82566095093508, + "learning_rate": 5.46270680545548e-06, + "loss": 2.3748, + "step": 18426 + }, + { + "epoch": 1.5705275718060172, + "grad_norm": 37.83951560777436, + "learning_rate": 5.462213083117147e-06, + "loss": 3.2784, + "step": 18427 + }, + { + "epoch": 1.5706128015000425, + "grad_norm": 31.85099778452967, + "learning_rate": 5.461719356233131e-06, + "loss": 2.7067, + "step": 18428 + }, + { + "epoch": 1.570698031194068, + "grad_norm": 30.81691630878232, + "learning_rate": 5.461225624808287e-06, + "loss": 3.0824, + "step": 18429 + }, + { + "epoch": 1.5707832608880934, + "grad_norm": 33.88175531492464, + "learning_rate": 5.460731888847474e-06, + "loss": 2.5801, + "step": 18430 + }, + { + "epoch": 1.5708684905821189, + "grad_norm": 44.59770828749628, + "learning_rate": 5.460238148355545e-06, + "loss": 2.7693, + "step": 18431 + }, + { + "epoch": 1.5709537202761443, + "grad_norm": 37.39044252324704, + "learning_rate": 5.459744403337358e-06, + "loss": 2.237, + "step": 18432 + }, + { + "epoch": 1.5710389499701696, + "grad_norm": 39.166559425002866, + "learning_rate": 5.459250653797766e-06, + "loss": 2.9167, + "step": 18433 + }, + { + "epoch": 1.571124179664195, + "grad_norm": 30.429085711148545, + "learning_rate": 5.458756899741629e-06, + "loss": 1.9078, + "step": 18434 + }, + { + "epoch": 1.5712094093582203, + "grad_norm": 103.86740556343152, + "learning_rate": 5.4582631411737975e-06, + "loss": 3.2898, + "step": 18435 + }, + { + "epoch": 1.5712946390522458, + "grad_norm": 83.49055480846596, + "learning_rate": 5.45776937809913e-06, + "loss": 2.8973, + "step": 18436 + }, + { + "epoch": 1.5713798687462712, + "grad_norm": 33.0806443680459, + "learning_rate": 5.4572756105224835e-06, + "loss": 2.709, + "step": 18437 + }, + { + "epoch": 1.5714650984402967, + "grad_norm": 64.85159247491139, + "learning_rate": 5.4567818384487136e-06, + "loss": 2.7336, + "step": 18438 + }, + { + "epoch": 1.5715503281343222, + "grad_norm": 20.725146899203047, + "learning_rate": 5.456288061882676e-06, + "loss": 1.727, + "step": 18439 + }, + { + "epoch": 1.5716355578283474, + "grad_norm": 46.879694733486026, + "learning_rate": 5.455794280829225e-06, + "loss": 1.8349, + "step": 18440 + }, + { + "epoch": 1.5717207875223727, + "grad_norm": 52.6496877524898, + "learning_rate": 5.455300495293219e-06, + "loss": 3.2197, + "step": 18441 + }, + { + "epoch": 1.5718060172163981, + "grad_norm": 66.76907104689083, + "learning_rate": 5.454806705279514e-06, + "loss": 3.1489, + "step": 18442 + }, + { + "epoch": 1.5718912469104236, + "grad_norm": 53.65298123761903, + "learning_rate": 5.454312910792965e-06, + "loss": 4.1665, + "step": 18443 + }, + { + "epoch": 1.571976476604449, + "grad_norm": 81.13586411048453, + "learning_rate": 5.45381911183843e-06, + "loss": 2.5138, + "step": 18444 + }, + { + "epoch": 1.5720617062984745, + "grad_norm": 76.86519915586119, + "learning_rate": 5.4533253084207635e-06, + "loss": 2.8714, + "step": 18445 + }, + { + "epoch": 1.5721469359924998, + "grad_norm": 40.54480566899892, + "learning_rate": 5.452831500544824e-06, + "loss": 3.0081, + "step": 18446 + }, + { + "epoch": 1.5722321656865252, + "grad_norm": 132.24477161936045, + "learning_rate": 5.452337688215466e-06, + "loss": 2.4039, + "step": 18447 + }, + { + "epoch": 1.5723173953805505, + "grad_norm": 40.0896430362384, + "learning_rate": 5.4518438714375455e-06, + "loss": 2.5902, + "step": 18448 + }, + { + "epoch": 1.572402625074576, + "grad_norm": 40.75823140284814, + "learning_rate": 5.451350050215922e-06, + "loss": 3.4877, + "step": 18449 + }, + { + "epoch": 1.5724878547686014, + "grad_norm": 71.57924234796451, + "learning_rate": 5.450856224555449e-06, + "loss": 2.9046, + "step": 18450 + }, + { + "epoch": 1.572573084462627, + "grad_norm": 42.04174048028969, + "learning_rate": 5.450362394460984e-06, + "loss": 3.1099, + "step": 18451 + }, + { + "epoch": 1.5726583141566521, + "grad_norm": 36.814432612978536, + "learning_rate": 5.449868559937384e-06, + "loss": 3.0416, + "step": 18452 + }, + { + "epoch": 1.5727435438506776, + "grad_norm": 30.270583668230337, + "learning_rate": 5.449374720989505e-06, + "loss": 2.103, + "step": 18453 + }, + { + "epoch": 1.5728287735447029, + "grad_norm": 40.61879980112411, + "learning_rate": 5.448880877622205e-06, + "loss": 3.1021, + "step": 18454 + }, + { + "epoch": 1.5729140032387283, + "grad_norm": 46.0575580255885, + "learning_rate": 5.448387029840341e-06, + "loss": 3.1219, + "step": 18455 + }, + { + "epoch": 1.5729992329327538, + "grad_norm": 59.74446220906995, + "learning_rate": 5.447893177648766e-06, + "loss": 3.493, + "step": 18456 + }, + { + "epoch": 1.5730844626267793, + "grad_norm": 36.40900882188733, + "learning_rate": 5.447399321052342e-06, + "loss": 2.2873, + "step": 18457 + }, + { + "epoch": 1.5731696923208047, + "grad_norm": 41.21771752684247, + "learning_rate": 5.446905460055921e-06, + "loss": 2.7643, + "step": 18458 + }, + { + "epoch": 1.57325492201483, + "grad_norm": 59.36839282613634, + "learning_rate": 5.446411594664365e-06, + "loss": 3.1236, + "step": 18459 + }, + { + "epoch": 1.5733401517088552, + "grad_norm": 53.81248966040172, + "learning_rate": 5.445917724882527e-06, + "loss": 2.7439, + "step": 18460 + }, + { + "epoch": 1.5734253814028807, + "grad_norm": 43.51343909829624, + "learning_rate": 5.445423850715265e-06, + "loss": 2.1997, + "step": 18461 + }, + { + "epoch": 1.5735106110969062, + "grad_norm": 54.59167186481834, + "learning_rate": 5.444929972167437e-06, + "loss": 2.7995, + "step": 18462 + }, + { + "epoch": 1.5735958407909316, + "grad_norm": 66.5116482871651, + "learning_rate": 5.4444360892439e-06, + "loss": 3.2141, + "step": 18463 + }, + { + "epoch": 1.573681070484957, + "grad_norm": 42.319762161886736, + "learning_rate": 5.443942201949509e-06, + "loss": 3.1508, + "step": 18464 + }, + { + "epoch": 1.5737663001789823, + "grad_norm": 57.59534152126014, + "learning_rate": 5.443448310289124e-06, + "loss": 1.8421, + "step": 18465 + }, + { + "epoch": 1.5738515298730078, + "grad_norm": 42.19483258127229, + "learning_rate": 5.442954414267602e-06, + "loss": 3.7511, + "step": 18466 + }, + { + "epoch": 1.573936759567033, + "grad_norm": 49.00773578696755, + "learning_rate": 5.442460513889798e-06, + "loss": 2.397, + "step": 18467 + }, + { + "epoch": 1.5740219892610585, + "grad_norm": 30.276980161189563, + "learning_rate": 5.44196660916057e-06, + "loss": 2.5144, + "step": 18468 + }, + { + "epoch": 1.574107218955084, + "grad_norm": 48.650645120316355, + "learning_rate": 5.441472700084778e-06, + "loss": 1.6138, + "step": 18469 + }, + { + "epoch": 1.5741924486491095, + "grad_norm": 29.479065567809727, + "learning_rate": 5.440978786667277e-06, + "loss": 2.4321, + "step": 18470 + }, + { + "epoch": 1.5742776783431347, + "grad_norm": 157.6880789363716, + "learning_rate": 5.440484868912924e-06, + "loss": 2.906, + "step": 18471 + }, + { + "epoch": 1.5743629080371602, + "grad_norm": 49.90476163026912, + "learning_rate": 5.4399909468265775e-06, + "loss": 3.7949, + "step": 18472 + }, + { + "epoch": 1.5744481377311854, + "grad_norm": 53.71469814655875, + "learning_rate": 5.4394970204130945e-06, + "loss": 4.0074, + "step": 18473 + }, + { + "epoch": 1.5745333674252109, + "grad_norm": 24.617602650607143, + "learning_rate": 5.4390030896773346e-06, + "loss": 2.6037, + "step": 18474 + }, + { + "epoch": 1.5746185971192364, + "grad_norm": 20.243115407880097, + "learning_rate": 5.438509154624152e-06, + "loss": 1.3204, + "step": 18475 + }, + { + "epoch": 1.5747038268132618, + "grad_norm": 61.34036756528149, + "learning_rate": 5.438015215258406e-06, + "loss": 2.1313, + "step": 18476 + }, + { + "epoch": 1.5747890565072873, + "grad_norm": 66.32242943282483, + "learning_rate": 5.4375212715849565e-06, + "loss": 2.2461, + "step": 18477 + }, + { + "epoch": 1.5748742862013125, + "grad_norm": 34.19185455001922, + "learning_rate": 5.437027323608658e-06, + "loss": 2.4634, + "step": 18478 + }, + { + "epoch": 1.5749595158953378, + "grad_norm": 57.222868583254396, + "learning_rate": 5.436533371334369e-06, + "loss": 2.736, + "step": 18479 + }, + { + "epoch": 1.5750447455893632, + "grad_norm": 35.843489548645614, + "learning_rate": 5.436039414766947e-06, + "loss": 2.6713, + "step": 18480 + }, + { + "epoch": 1.5751299752833887, + "grad_norm": 26.371554982296406, + "learning_rate": 5.435545453911253e-06, + "loss": 2.4407, + "step": 18481 + }, + { + "epoch": 1.5752152049774142, + "grad_norm": 58.94896486624907, + "learning_rate": 5.435051488772141e-06, + "loss": 2.3334, + "step": 18482 + }, + { + "epoch": 1.5753004346714397, + "grad_norm": 81.64198147604385, + "learning_rate": 5.434557519354471e-06, + "loss": 2.3987, + "step": 18483 + }, + { + "epoch": 1.575385664365465, + "grad_norm": 99.41980158993029, + "learning_rate": 5.434063545663099e-06, + "loss": 3.1267, + "step": 18484 + }, + { + "epoch": 1.5754708940594904, + "grad_norm": 38.79828245294068, + "learning_rate": 5.433569567702887e-06, + "loss": 3.2008, + "step": 18485 + }, + { + "epoch": 1.5755561237535156, + "grad_norm": 30.782835374692763, + "learning_rate": 5.433075585478688e-06, + "loss": 1.9857, + "step": 18486 + }, + { + "epoch": 1.575641353447541, + "grad_norm": 31.128711693011823, + "learning_rate": 5.432581598995364e-06, + "loss": 2.75, + "step": 18487 + }, + { + "epoch": 1.5757265831415666, + "grad_norm": 34.79249992746804, + "learning_rate": 5.432087608257771e-06, + "loss": 2.9921, + "step": 18488 + }, + { + "epoch": 1.575811812835592, + "grad_norm": 47.04790030734237, + "learning_rate": 5.43159361327077e-06, + "loss": 2.4477, + "step": 18489 + }, + { + "epoch": 1.5758970425296173, + "grad_norm": 53.75712715022081, + "learning_rate": 5.431099614039216e-06, + "loss": 1.9881, + "step": 18490 + }, + { + "epoch": 1.5759822722236427, + "grad_norm": 52.770495332479555, + "learning_rate": 5.430605610567969e-06, + "loss": 3.2029, + "step": 18491 + }, + { + "epoch": 1.576067501917668, + "grad_norm": 52.00041609426455, + "learning_rate": 5.430111602861888e-06, + "loss": 2.8849, + "step": 18492 + }, + { + "epoch": 1.5761527316116934, + "grad_norm": 43.247334606904694, + "learning_rate": 5.4296175909258275e-06, + "loss": 2.8559, + "step": 18493 + }, + { + "epoch": 1.576237961305719, + "grad_norm": 88.293096575782, + "learning_rate": 5.429123574764651e-06, + "loss": 3.3386, + "step": 18494 + }, + { + "epoch": 1.5763231909997444, + "grad_norm": 42.71160219541294, + "learning_rate": 5.428629554383212e-06, + "loss": 2.8785, + "step": 18495 + }, + { + "epoch": 1.5764084206937699, + "grad_norm": 132.09622158934965, + "learning_rate": 5.4281355297863746e-06, + "loss": 3.0146, + "step": 18496 + }, + { + "epoch": 1.576493650387795, + "grad_norm": 28.21199047024404, + "learning_rate": 5.427641500978994e-06, + "loss": 1.4771, + "step": 18497 + }, + { + "epoch": 1.5765788800818203, + "grad_norm": 46.764339083222296, + "learning_rate": 5.427147467965929e-06, + "loss": 2.8271, + "step": 18498 + }, + { + "epoch": 1.5766641097758458, + "grad_norm": 84.54876265475494, + "learning_rate": 5.426653430752037e-06, + "loss": 3.1402, + "step": 18499 + }, + { + "epoch": 1.5767493394698713, + "grad_norm": 39.1554820090618, + "learning_rate": 5.426159389342179e-06, + "loss": 2.5812, + "step": 18500 + }, + { + "epoch": 1.5768345691638967, + "grad_norm": 58.3927421090677, + "learning_rate": 5.425665343741212e-06, + "loss": 3.187, + "step": 18501 + }, + { + "epoch": 1.5769197988579222, + "grad_norm": 56.7104245403336, + "learning_rate": 5.425171293953997e-06, + "loss": 3.0534, + "step": 18502 + }, + { + "epoch": 1.5770050285519475, + "grad_norm": 56.620294051708164, + "learning_rate": 5.42467723998539e-06, + "loss": 3.3758, + "step": 18503 + }, + { + "epoch": 1.577090258245973, + "grad_norm": 144.34079792844707, + "learning_rate": 5.424183181840251e-06, + "loss": 2.5828, + "step": 18504 + }, + { + "epoch": 1.5771754879399982, + "grad_norm": 35.678167468641725, + "learning_rate": 5.4236891195234395e-06, + "loss": 2.6021, + "step": 18505 + }, + { + "epoch": 1.5772607176340236, + "grad_norm": 37.82394936018337, + "learning_rate": 5.4231950530398145e-06, + "loss": 3.1645, + "step": 18506 + }, + { + "epoch": 1.5773459473280491, + "grad_norm": 39.45575923141845, + "learning_rate": 5.422700982394233e-06, + "loss": 3.546, + "step": 18507 + }, + { + "epoch": 1.5774311770220746, + "grad_norm": 48.07561092124668, + "learning_rate": 5.422206907591556e-06, + "loss": 2.1248, + "step": 18508 + }, + { + "epoch": 1.5775164067161, + "grad_norm": 28.87415666536112, + "learning_rate": 5.421712828636642e-06, + "loss": 1.7962, + "step": 18509 + }, + { + "epoch": 1.5776016364101253, + "grad_norm": 30.57938590756911, + "learning_rate": 5.42121874553435e-06, + "loss": 2.8061, + "step": 18510 + }, + { + "epoch": 1.5776868661041505, + "grad_norm": 69.24714350483175, + "learning_rate": 5.420724658289539e-06, + "loss": 4.0355, + "step": 18511 + }, + { + "epoch": 1.577772095798176, + "grad_norm": 73.94452660441553, + "learning_rate": 5.420230566907068e-06, + "loss": 2.3932, + "step": 18512 + }, + { + "epoch": 1.5778573254922015, + "grad_norm": 49.29366384138952, + "learning_rate": 5.419736471391797e-06, + "loss": 2.5965, + "step": 18513 + }, + { + "epoch": 1.577942555186227, + "grad_norm": 105.94394730714518, + "learning_rate": 5.419242371748583e-06, + "loss": 3.6592, + "step": 18514 + }, + { + "epoch": 1.5780277848802524, + "grad_norm": 79.1934359140731, + "learning_rate": 5.4187482679822865e-06, + "loss": 3.6533, + "step": 18515 + }, + { + "epoch": 1.5781130145742777, + "grad_norm": 43.93800671175934, + "learning_rate": 5.418254160097769e-06, + "loss": 3.6353, + "step": 18516 + }, + { + "epoch": 1.5781982442683031, + "grad_norm": 66.80582492563043, + "learning_rate": 5.417760048099888e-06, + "loss": 2.5227, + "step": 18517 + }, + { + "epoch": 1.5782834739623284, + "grad_norm": 66.85577156817007, + "learning_rate": 5.417265931993502e-06, + "loss": 2.3777, + "step": 18518 + }, + { + "epoch": 1.5783687036563538, + "grad_norm": 86.27139425683622, + "learning_rate": 5.4167718117834715e-06, + "loss": 3.4856, + "step": 18519 + }, + { + "epoch": 1.5784539333503793, + "grad_norm": 39.39600497777184, + "learning_rate": 5.416277687474656e-06, + "loss": 3.2125, + "step": 18520 + }, + { + "epoch": 1.5785391630444048, + "grad_norm": 105.69820964136751, + "learning_rate": 5.415783559071916e-06, + "loss": 3.0116, + "step": 18521 + }, + { + "epoch": 1.57862439273843, + "grad_norm": 45.87287948047719, + "learning_rate": 5.415289426580107e-06, + "loss": 2.6387, + "step": 18522 + }, + { + "epoch": 1.5787096224324555, + "grad_norm": 54.93996158259685, + "learning_rate": 5.414795290004094e-06, + "loss": 2.4053, + "step": 18523 + }, + { + "epoch": 1.5787948521264807, + "grad_norm": 40.55697687910436, + "learning_rate": 5.414301149348734e-06, + "loss": 3.6141, + "step": 18524 + }, + { + "epoch": 1.5788800818205062, + "grad_norm": 101.70880563619322, + "learning_rate": 5.413807004618885e-06, + "loss": 3.6032, + "step": 18525 + }, + { + "epoch": 1.5789653115145317, + "grad_norm": 35.463343964029775, + "learning_rate": 5.41331285581941e-06, + "loss": 2.799, + "step": 18526 + }, + { + "epoch": 1.5790505412085571, + "grad_norm": 43.209936311752216, + "learning_rate": 5.412818702955167e-06, + "loss": 3.0075, + "step": 18527 + }, + { + "epoch": 1.5791357709025826, + "grad_norm": 94.93605671688879, + "learning_rate": 5.412324546031017e-06, + "loss": 3.577, + "step": 18528 + }, + { + "epoch": 1.5792210005966079, + "grad_norm": 29.99204244500157, + "learning_rate": 5.411830385051817e-06, + "loss": 1.5712, + "step": 18529 + }, + { + "epoch": 1.579306230290633, + "grad_norm": 46.19548133650988, + "learning_rate": 5.411336220022431e-06, + "loss": 2.6348, + "step": 18530 + }, + { + "epoch": 1.5793914599846586, + "grad_norm": 54.471332697689064, + "learning_rate": 5.4108420509477145e-06, + "loss": 1.7046, + "step": 18531 + }, + { + "epoch": 1.579476689678684, + "grad_norm": 25.647799625245227, + "learning_rate": 5.410347877832531e-06, + "loss": 2.6414, + "step": 18532 + }, + { + "epoch": 1.5795619193727095, + "grad_norm": 28.695122176305116, + "learning_rate": 5.409853700681739e-06, + "loss": 2.4185, + "step": 18533 + }, + { + "epoch": 1.579647149066735, + "grad_norm": 34.37194511720538, + "learning_rate": 5.409359519500199e-06, + "loss": 3.2967, + "step": 18534 + }, + { + "epoch": 1.5797323787607602, + "grad_norm": 62.25243574327382, + "learning_rate": 5.408865334292771e-06, + "loss": 2.8607, + "step": 18535 + }, + { + "epoch": 1.5798176084547857, + "grad_norm": 49.94472559106223, + "learning_rate": 5.408371145064314e-06, + "loss": 3.1025, + "step": 18536 + }, + { + "epoch": 1.579902838148811, + "grad_norm": 34.68997232671227, + "learning_rate": 5.40787695181969e-06, + "loss": 2.4921, + "step": 18537 + }, + { + "epoch": 1.5799880678428364, + "grad_norm": 140.55279994317596, + "learning_rate": 5.407382754563757e-06, + "loss": 3.0644, + "step": 18538 + }, + { + "epoch": 1.5800732975368619, + "grad_norm": 61.92835585195011, + "learning_rate": 5.406888553301378e-06, + "loss": 2.9476, + "step": 18539 + }, + { + "epoch": 1.5801585272308873, + "grad_norm": 27.27689233744661, + "learning_rate": 5.406394348037411e-06, + "loss": 2.6588, + "step": 18540 + }, + { + "epoch": 1.5802437569249126, + "grad_norm": 44.49221865657045, + "learning_rate": 5.405900138776717e-06, + "loss": 3.1382, + "step": 18541 + }, + { + "epoch": 1.580328986618938, + "grad_norm": 101.68778608872982, + "learning_rate": 5.405405925524155e-06, + "loss": 2.7875, + "step": 18542 + }, + { + "epoch": 1.5804142163129633, + "grad_norm": 59.19167249707107, + "learning_rate": 5.404911708284591e-06, + "loss": 2.9864, + "step": 18543 + }, + { + "epoch": 1.5804994460069888, + "grad_norm": 46.14199693670779, + "learning_rate": 5.404417487062878e-06, + "loss": 2.5009, + "step": 18544 + }, + { + "epoch": 1.5805846757010142, + "grad_norm": 133.89666755828696, + "learning_rate": 5.40392326186388e-06, + "loss": 5.09, + "step": 18545 + }, + { + "epoch": 1.5806699053950397, + "grad_norm": 42.841864387648044, + "learning_rate": 5.403429032692456e-06, + "loss": 2.7255, + "step": 18546 + }, + { + "epoch": 1.5807551350890652, + "grad_norm": 50.825151841451024, + "learning_rate": 5.40293479955347e-06, + "loss": 3.1139, + "step": 18547 + }, + { + "epoch": 1.5808403647830904, + "grad_norm": 31.581513149556844, + "learning_rate": 5.402440562451779e-06, + "loss": 2.3903, + "step": 18548 + }, + { + "epoch": 1.5809255944771157, + "grad_norm": 48.18127182898227, + "learning_rate": 5.401946321392246e-06, + "loss": 2.683, + "step": 18549 + }, + { + "epoch": 1.5810108241711411, + "grad_norm": 104.31792734832301, + "learning_rate": 5.401452076379731e-06, + "loss": 3.7929, + "step": 18550 + }, + { + "epoch": 1.5810960538651666, + "grad_norm": 82.6243650482732, + "learning_rate": 5.400957827419093e-06, + "loss": 3.6623, + "step": 18551 + }, + { + "epoch": 1.581181283559192, + "grad_norm": 66.42193017270044, + "learning_rate": 5.4004635745151955e-06, + "loss": 2.5139, + "step": 18552 + }, + { + "epoch": 1.5812665132532175, + "grad_norm": 34.626292906345604, + "learning_rate": 5.399969317672898e-06, + "loss": 2.9345, + "step": 18553 + }, + { + "epoch": 1.5813517429472428, + "grad_norm": 62.377324762402075, + "learning_rate": 5.399475056897059e-06, + "loss": 2.9973, + "step": 18554 + }, + { + "epoch": 1.5814369726412683, + "grad_norm": 32.25951934871416, + "learning_rate": 5.398980792192542e-06, + "loss": 2.5475, + "step": 18555 + }, + { + "epoch": 1.5815222023352935, + "grad_norm": 36.861338298504194, + "learning_rate": 5.398486523564209e-06, + "loss": 2.5186, + "step": 18556 + }, + { + "epoch": 1.581607432029319, + "grad_norm": 43.037107638981944, + "learning_rate": 5.397992251016918e-06, + "loss": 3.0546, + "step": 18557 + }, + { + "epoch": 1.5816926617233444, + "grad_norm": 77.45913190852083, + "learning_rate": 5.3974979745555325e-06, + "loss": 3.0726, + "step": 18558 + }, + { + "epoch": 1.58177789141737, + "grad_norm": 31.84812858515153, + "learning_rate": 5.397003694184911e-06, + "loss": 3.1469, + "step": 18559 + }, + { + "epoch": 1.5818631211113954, + "grad_norm": 43.383265705783245, + "learning_rate": 5.396509409909918e-06, + "loss": 3.4789, + "step": 18560 + }, + { + "epoch": 1.5819483508054206, + "grad_norm": 67.74012436082883, + "learning_rate": 5.396015121735411e-06, + "loss": 2.8838, + "step": 18561 + }, + { + "epoch": 1.5820335804994459, + "grad_norm": 48.28399121283681, + "learning_rate": 5.395520829666252e-06, + "loss": 3.5771, + "step": 18562 + }, + { + "epoch": 1.5821188101934713, + "grad_norm": 41.96287383800005, + "learning_rate": 5.3950265337073034e-06, + "loss": 2.7201, + "step": 18563 + }, + { + "epoch": 1.5822040398874968, + "grad_norm": 81.32697710688628, + "learning_rate": 5.394532233863427e-06, + "loss": 3.5243, + "step": 18564 + }, + { + "epoch": 1.5822892695815223, + "grad_norm": 52.34562909631395, + "learning_rate": 5.3940379301394814e-06, + "loss": 2.4223, + "step": 18565 + }, + { + "epoch": 1.5823744992755477, + "grad_norm": 52.33786159558792, + "learning_rate": 5.39354362254033e-06, + "loss": 2.4158, + "step": 18566 + }, + { + "epoch": 1.582459728969573, + "grad_norm": 33.4487453089894, + "learning_rate": 5.3930493110708345e-06, + "loss": 2.4422, + "step": 18567 + }, + { + "epoch": 1.5825449586635982, + "grad_norm": 89.54944496759703, + "learning_rate": 5.392554995735853e-06, + "loss": 2.0827, + "step": 18568 + }, + { + "epoch": 1.5826301883576237, + "grad_norm": 91.80580102035427, + "learning_rate": 5.392060676540251e-06, + "loss": 3.3562, + "step": 18569 + }, + { + "epoch": 1.5827154180516492, + "grad_norm": 121.40137036916538, + "learning_rate": 5.391566353488887e-06, + "loss": 3.9771, + "step": 18570 + }, + { + "epoch": 1.5828006477456746, + "grad_norm": 72.68177436429883, + "learning_rate": 5.391072026586624e-06, + "loss": 2.9657, + "step": 18571 + }, + { + "epoch": 1.5828858774397, + "grad_norm": 43.293311509512144, + "learning_rate": 5.390577695838323e-06, + "loss": 3.5872, + "step": 18572 + }, + { + "epoch": 1.5829711071337254, + "grad_norm": 75.25178705898955, + "learning_rate": 5.390083361248844e-06, + "loss": 3.0701, + "step": 18573 + }, + { + "epoch": 1.5830563368277508, + "grad_norm": 26.45868139812912, + "learning_rate": 5.389589022823052e-06, + "loss": 2.439, + "step": 18574 + }, + { + "epoch": 1.583141566521776, + "grad_norm": 139.73464713527022, + "learning_rate": 5.389094680565806e-06, + "loss": 3.4273, + "step": 18575 + }, + { + "epoch": 1.5832267962158015, + "grad_norm": 81.7958127763046, + "learning_rate": 5.3886003344819695e-06, + "loss": 3.5273, + "step": 18576 + }, + { + "epoch": 1.583312025909827, + "grad_norm": 56.27495638179293, + "learning_rate": 5.3881059845764015e-06, + "loss": 3.2941, + "step": 18577 + }, + { + "epoch": 1.5833972556038525, + "grad_norm": 58.371771160124425, + "learning_rate": 5.387611630853966e-06, + "loss": 2.674, + "step": 18578 + }, + { + "epoch": 1.583482485297878, + "grad_norm": 33.113750677407126, + "learning_rate": 5.387117273319525e-06, + "loss": 2.3926, + "step": 18579 + }, + { + "epoch": 1.5835677149919032, + "grad_norm": 84.45607801474036, + "learning_rate": 5.386622911977939e-06, + "loss": 2.211, + "step": 18580 + }, + { + "epoch": 1.5836529446859284, + "grad_norm": 76.254909526179, + "learning_rate": 5.386128546834068e-06, + "loss": 2.7101, + "step": 18581 + }, + { + "epoch": 1.583738174379954, + "grad_norm": 44.23339093032325, + "learning_rate": 5.38563417789278e-06, + "loss": 2.7482, + "step": 18582 + }, + { + "epoch": 1.5838234040739794, + "grad_norm": 65.71544389101446, + "learning_rate": 5.385139805158931e-06, + "loss": 3.0535, + "step": 18583 + }, + { + "epoch": 1.5839086337680048, + "grad_norm": 68.96783454348957, + "learning_rate": 5.3846454286373865e-06, + "loss": 3.2946, + "step": 18584 + }, + { + "epoch": 1.5839938634620303, + "grad_norm": 58.63594218134052, + "learning_rate": 5.384151048333004e-06, + "loss": 2.6905, + "step": 18585 + }, + { + "epoch": 1.5840790931560556, + "grad_norm": 95.36249675053581, + "learning_rate": 5.383656664250652e-06, + "loss": 3.8526, + "step": 18586 + }, + { + "epoch": 1.584164322850081, + "grad_norm": 57.08898543178292, + "learning_rate": 5.383162276395186e-06, + "loss": 2.5499, + "step": 18587 + }, + { + "epoch": 1.5842495525441063, + "grad_norm": 95.45446037022778, + "learning_rate": 5.382667884771474e-06, + "loss": 3.9541, + "step": 18588 + }, + { + "epoch": 1.5843347822381317, + "grad_norm": 38.85844278767445, + "learning_rate": 5.382173489384373e-06, + "loss": 2.6836, + "step": 18589 + }, + { + "epoch": 1.5844200119321572, + "grad_norm": 54.736953951796735, + "learning_rate": 5.38167909023875e-06, + "loss": 4.3516, + "step": 18590 + }, + { + "epoch": 1.5845052416261827, + "grad_norm": 48.54096359776983, + "learning_rate": 5.381184687339463e-06, + "loss": 3.0084, + "step": 18591 + }, + { + "epoch": 1.584590471320208, + "grad_norm": 25.061376463315163, + "learning_rate": 5.380690280691377e-06, + "loss": 2.2148, + "step": 18592 + }, + { + "epoch": 1.5846757010142334, + "grad_norm": 42.74163931972843, + "learning_rate": 5.380195870299353e-06, + "loss": 4.1122, + "step": 18593 + }, + { + "epoch": 1.5847609307082586, + "grad_norm": 60.36075957687664, + "learning_rate": 5.3797014561682535e-06, + "loss": 2.8738, + "step": 18594 + }, + { + "epoch": 1.584846160402284, + "grad_norm": 42.81147324972374, + "learning_rate": 5.379207038302943e-06, + "loss": 2.8387, + "step": 18595 + }, + { + "epoch": 1.5849313900963096, + "grad_norm": 102.99321440193418, + "learning_rate": 5.37871261670828e-06, + "loss": 2.814, + "step": 18596 + }, + { + "epoch": 1.585016619790335, + "grad_norm": 109.12211272730104, + "learning_rate": 5.378218191389129e-06, + "loss": 2.7031, + "step": 18597 + }, + { + "epoch": 1.5851018494843605, + "grad_norm": 33.823276446887014, + "learning_rate": 5.377723762350352e-06, + "loss": 2.4698, + "step": 18598 + }, + { + "epoch": 1.5851870791783857, + "grad_norm": 65.14946002573922, + "learning_rate": 5.377229329596813e-06, + "loss": 3.5494, + "step": 18599 + }, + { + "epoch": 1.585272308872411, + "grad_norm": 43.698953521926676, + "learning_rate": 5.376734893133373e-06, + "loss": 2.6859, + "step": 18600 + }, + { + "epoch": 1.5853575385664365, + "grad_norm": 52.84535153913935, + "learning_rate": 5.376240452964895e-06, + "loss": 2.6343, + "step": 18601 + }, + { + "epoch": 1.585442768260462, + "grad_norm": 100.04756415928993, + "learning_rate": 5.375746009096241e-06, + "loss": 3.9218, + "step": 18602 + }, + { + "epoch": 1.5855279979544874, + "grad_norm": 58.49145372804391, + "learning_rate": 5.3752515615322755e-06, + "loss": 2.9199, + "step": 18603 + }, + { + "epoch": 1.5856132276485129, + "grad_norm": 95.14688620716349, + "learning_rate": 5.374757110277859e-06, + "loss": 4.108, + "step": 18604 + }, + { + "epoch": 1.5856984573425381, + "grad_norm": 38.95685354733545, + "learning_rate": 5.374262655337856e-06, + "loss": 3.1275, + "step": 18605 + }, + { + "epoch": 1.5857836870365636, + "grad_norm": 61.2267492845562, + "learning_rate": 5.373768196717127e-06, + "loss": 2.692, + "step": 18606 + }, + { + "epoch": 1.5858689167305888, + "grad_norm": 94.60771182780093, + "learning_rate": 5.373273734420539e-06, + "loss": 3.0144, + "step": 18607 + }, + { + "epoch": 1.5859541464246143, + "grad_norm": 43.2741274941361, + "learning_rate": 5.37277926845295e-06, + "loss": 3.1068, + "step": 18608 + }, + { + "epoch": 1.5860393761186398, + "grad_norm": 35.27622277356473, + "learning_rate": 5.372284798819225e-06, + "loss": 2.9578, + "step": 18609 + }, + { + "epoch": 1.5861246058126652, + "grad_norm": 23.53789801670354, + "learning_rate": 5.371790325524229e-06, + "loss": 2.0426, + "step": 18610 + }, + { + "epoch": 1.5862098355066905, + "grad_norm": 259.49599290633404, + "learning_rate": 5.37129584857282e-06, + "loss": 3.7853, + "step": 18611 + }, + { + "epoch": 1.586295065200716, + "grad_norm": 29.413137854463063, + "learning_rate": 5.370801367969865e-06, + "loss": 2.1449, + "step": 18612 + }, + { + "epoch": 1.5863802948947412, + "grad_norm": 39.06504116235088, + "learning_rate": 5.370306883720227e-06, + "loss": 2.4244, + "step": 18613 + }, + { + "epoch": 1.5864655245887667, + "grad_norm": 86.8581534918018, + "learning_rate": 5.369812395828768e-06, + "loss": 3.6532, + "step": 18614 + }, + { + "epoch": 1.5865507542827921, + "grad_norm": 32.35740884216144, + "learning_rate": 5.3693179043003496e-06, + "loss": 2.0224, + "step": 18615 + }, + { + "epoch": 1.5866359839768176, + "grad_norm": 48.89203089221131, + "learning_rate": 5.368823409139837e-06, + "loss": 2.8748, + "step": 18616 + }, + { + "epoch": 1.586721213670843, + "grad_norm": 72.94919638002249, + "learning_rate": 5.368328910352092e-06, + "loss": 4.9876, + "step": 18617 + }, + { + "epoch": 1.5868064433648683, + "grad_norm": 49.779301039435715, + "learning_rate": 5.36783440794198e-06, + "loss": 3.1569, + "step": 18618 + }, + { + "epoch": 1.5868916730588936, + "grad_norm": 29.783418412682337, + "learning_rate": 5.367339901914363e-06, + "loss": 2.0761, + "step": 18619 + }, + { + "epoch": 1.586976902752919, + "grad_norm": 59.1398564347237, + "learning_rate": 5.366845392274102e-06, + "loss": 3.4919, + "step": 18620 + }, + { + "epoch": 1.5870621324469445, + "grad_norm": 46.15730203644739, + "learning_rate": 5.366350879026063e-06, + "loss": 2.2723, + "step": 18621 + }, + { + "epoch": 1.58714736214097, + "grad_norm": 44.76964162149301, + "learning_rate": 5.365856362175109e-06, + "loss": 2.4363, + "step": 18622 + }, + { + "epoch": 1.5872325918349954, + "grad_norm": 52.61241610754705, + "learning_rate": 5.365361841726104e-06, + "loss": 2.7255, + "step": 18623 + }, + { + "epoch": 1.5873178215290207, + "grad_norm": 81.97392393860937, + "learning_rate": 5.364867317683908e-06, + "loss": 3.4671, + "step": 18624 + }, + { + "epoch": 1.5874030512230461, + "grad_norm": 23.436191229720126, + "learning_rate": 5.364372790053389e-06, + "loss": 1.9456, + "step": 18625 + }, + { + "epoch": 1.5874882809170714, + "grad_norm": 108.93993131046386, + "learning_rate": 5.363878258839408e-06, + "loss": 3.9468, + "step": 18626 + }, + { + "epoch": 1.5875735106110969, + "grad_norm": 44.39030691246077, + "learning_rate": 5.36338372404683e-06, + "loss": 3.1857, + "step": 18627 + }, + { + "epoch": 1.5876587403051223, + "grad_norm": 36.572694008687854, + "learning_rate": 5.362889185680515e-06, + "loss": 2.6436, + "step": 18628 + }, + { + "epoch": 1.5877439699991478, + "grad_norm": 41.85080065912966, + "learning_rate": 5.362394643745331e-06, + "loss": 2.7586, + "step": 18629 + }, + { + "epoch": 1.5878291996931733, + "grad_norm": 41.197486356522596, + "learning_rate": 5.361900098246139e-06, + "loss": 3.3721, + "step": 18630 + }, + { + "epoch": 1.5879144293871985, + "grad_norm": 65.09659939492953, + "learning_rate": 5.361405549187804e-06, + "loss": 2.6482, + "step": 18631 + }, + { + "epoch": 1.5879996590812238, + "grad_norm": 92.52246128507156, + "learning_rate": 5.360910996575186e-06, + "loss": 4.3731, + "step": 18632 + }, + { + "epoch": 1.5880848887752492, + "grad_norm": 34.93771657005902, + "learning_rate": 5.3604164404131554e-06, + "loss": 2.6706, + "step": 18633 + }, + { + "epoch": 1.5881701184692747, + "grad_norm": 38.274133244120144, + "learning_rate": 5.359921880706571e-06, + "loss": 2.8955, + "step": 18634 + }, + { + "epoch": 1.5882553481633002, + "grad_norm": 47.05493604435042, + "learning_rate": 5.359427317460299e-06, + "loss": 2.51, + "step": 18635 + }, + { + "epoch": 1.5883405778573256, + "grad_norm": 38.87881529634741, + "learning_rate": 5.3589327506791985e-06, + "loss": 3.2632, + "step": 18636 + }, + { + "epoch": 1.5884258075513509, + "grad_norm": 57.04067045689602, + "learning_rate": 5.358438180368141e-06, + "loss": 3.1881, + "step": 18637 + }, + { + "epoch": 1.5885110372453761, + "grad_norm": 29.575430764664983, + "learning_rate": 5.357943606531986e-06, + "loss": 2.9039, + "step": 18638 + }, + { + "epoch": 1.5885962669394016, + "grad_norm": 85.42765882083533, + "learning_rate": 5.357449029175596e-06, + "loss": 3.0081, + "step": 18639 + }, + { + "epoch": 1.588681496633427, + "grad_norm": 37.10677663872442, + "learning_rate": 5.356954448303838e-06, + "loss": 2.8191, + "step": 18640 + }, + { + "epoch": 1.5887667263274525, + "grad_norm": 55.83704175251035, + "learning_rate": 5.3564598639215745e-06, + "loss": 2.4278, + "step": 18641 + }, + { + "epoch": 1.588851956021478, + "grad_norm": 38.30585024797012, + "learning_rate": 5.35596527603367e-06, + "loss": 3.065, + "step": 18642 + }, + { + "epoch": 1.5889371857155032, + "grad_norm": 46.71402398751512, + "learning_rate": 5.355470684644989e-06, + "loss": 2.4331, + "step": 18643 + }, + { + "epoch": 1.5890224154095287, + "grad_norm": 56.01570097993332, + "learning_rate": 5.3549760897603946e-06, + "loss": 2.6095, + "step": 18644 + }, + { + "epoch": 1.589107645103554, + "grad_norm": 57.30228219149218, + "learning_rate": 5.3544814913847495e-06, + "loss": 2.8666, + "step": 18645 + }, + { + "epoch": 1.5891928747975794, + "grad_norm": 134.7012159256455, + "learning_rate": 5.353986889522923e-06, + "loss": 3.3785, + "step": 18646 + }, + { + "epoch": 1.589278104491605, + "grad_norm": 68.20681639900083, + "learning_rate": 5.353492284179774e-06, + "loss": 2.0898, + "step": 18647 + }, + { + "epoch": 1.5893633341856304, + "grad_norm": 88.35899140339387, + "learning_rate": 5.35299767536017e-06, + "loss": 4.0281, + "step": 18648 + }, + { + "epoch": 1.5894485638796558, + "grad_norm": 80.35809963408077, + "learning_rate": 5.352503063068974e-06, + "loss": 4.4256, + "step": 18649 + }, + { + "epoch": 1.589533793573681, + "grad_norm": 29.178324285625823, + "learning_rate": 5.35200844731105e-06, + "loss": 2.2236, + "step": 18650 + }, + { + "epoch": 1.5896190232677063, + "grad_norm": 30.23855988608693, + "learning_rate": 5.351513828091263e-06, + "loss": 2.0711, + "step": 18651 + }, + { + "epoch": 1.5897042529617318, + "grad_norm": 43.06029030243574, + "learning_rate": 5.351019205414476e-06, + "loss": 3.017, + "step": 18652 + }, + { + "epoch": 1.5897894826557573, + "grad_norm": 39.624251126049025, + "learning_rate": 5.350524579285556e-06, + "loss": 2.7653, + "step": 18653 + }, + { + "epoch": 1.5898747123497827, + "grad_norm": 62.709489763371884, + "learning_rate": 5.350029949709367e-06, + "loss": 3.191, + "step": 18654 + }, + { + "epoch": 1.5899599420438082, + "grad_norm": 86.17649140771984, + "learning_rate": 5.349535316690771e-06, + "loss": 3.0805, + "step": 18655 + }, + { + "epoch": 1.5900451717378334, + "grad_norm": 39.34927181947039, + "learning_rate": 5.349040680234634e-06, + "loss": 1.8835, + "step": 18656 + }, + { + "epoch": 1.590130401431859, + "grad_norm": 33.8519972235941, + "learning_rate": 5.348546040345821e-06, + "loss": 2.76, + "step": 18657 + }, + { + "epoch": 1.5902156311258842, + "grad_norm": 42.025074690047845, + "learning_rate": 5.348051397029196e-06, + "loss": 3.18, + "step": 18658 + }, + { + "epoch": 1.5903008608199096, + "grad_norm": 38.652608987030945, + "learning_rate": 5.3475567502896246e-06, + "loss": 3.2367, + "step": 18659 + }, + { + "epoch": 1.590386090513935, + "grad_norm": 39.82463122461587, + "learning_rate": 5.347062100131969e-06, + "loss": 3.4872, + "step": 18660 + }, + { + "epoch": 1.5904713202079606, + "grad_norm": 37.27237801908968, + "learning_rate": 5.346567446561097e-06, + "loss": 2.8233, + "step": 18661 + }, + { + "epoch": 1.5905565499019858, + "grad_norm": 97.0563936812031, + "learning_rate": 5.346072789581871e-06, + "loss": 4.6872, + "step": 18662 + }, + { + "epoch": 1.5906417795960113, + "grad_norm": 38.91913119831468, + "learning_rate": 5.345578129199157e-06, + "loss": 2.5736, + "step": 18663 + }, + { + "epoch": 1.5907270092900365, + "grad_norm": 52.931958493900645, + "learning_rate": 5.34508346541782e-06, + "loss": 3.1307, + "step": 18664 + }, + { + "epoch": 1.590812238984062, + "grad_norm": 125.65496107044162, + "learning_rate": 5.344588798242724e-06, + "loss": 2.1692, + "step": 18665 + }, + { + "epoch": 1.5908974686780875, + "grad_norm": 48.550180193123225, + "learning_rate": 5.344094127678733e-06, + "loss": 2.1199, + "step": 18666 + }, + { + "epoch": 1.590982698372113, + "grad_norm": 49.96004649572723, + "learning_rate": 5.343599453730714e-06, + "loss": 2.5521, + "step": 18667 + }, + { + "epoch": 1.5910679280661384, + "grad_norm": 39.36263036961708, + "learning_rate": 5.343104776403532e-06, + "loss": 3.3591, + "step": 18668 + }, + { + "epoch": 1.5911531577601636, + "grad_norm": 62.110770816475736, + "learning_rate": 5.342610095702049e-06, + "loss": 4.07, + "step": 18669 + }, + { + "epoch": 1.5912383874541889, + "grad_norm": 50.149649497320645, + "learning_rate": 5.342115411631132e-06, + "loss": 2.8844, + "step": 18670 + }, + { + "epoch": 1.5913236171482144, + "grad_norm": 23.469996113546408, + "learning_rate": 5.341620724195645e-06, + "loss": 1.6003, + "step": 18671 + }, + { + "epoch": 1.5914088468422398, + "grad_norm": 46.50280910716342, + "learning_rate": 5.341126033400456e-06, + "loss": 2.4499, + "step": 18672 + }, + { + "epoch": 1.5914940765362653, + "grad_norm": 27.84423735172665, + "learning_rate": 5.3406313392504274e-06, + "loss": 1.7281, + "step": 18673 + }, + { + "epoch": 1.5915793062302908, + "grad_norm": 31.26549666699668, + "learning_rate": 5.340136641750426e-06, + "loss": 2.9302, + "step": 18674 + }, + { + "epoch": 1.591664535924316, + "grad_norm": 50.81537852131578, + "learning_rate": 5.339641940905313e-06, + "loss": 2.9186, + "step": 18675 + }, + { + "epoch": 1.5917497656183415, + "grad_norm": 21.44453056910309, + "learning_rate": 5.3391472367199595e-06, + "loss": 1.3402, + "step": 18676 + }, + { + "epoch": 1.5918349953123667, + "grad_norm": 32.3960093298346, + "learning_rate": 5.338652529199225e-06, + "loss": 2.6709, + "step": 18677 + }, + { + "epoch": 1.5919202250063922, + "grad_norm": 34.88631839393203, + "learning_rate": 5.33815781834798e-06, + "loss": 2.8645, + "step": 18678 + }, + { + "epoch": 1.5920054547004177, + "grad_norm": 46.669747487516766, + "learning_rate": 5.337663104171084e-06, + "loss": 3.5737, + "step": 18679 + }, + { + "epoch": 1.5920906843944431, + "grad_norm": 95.78627900888421, + "learning_rate": 5.337168386673408e-06, + "loss": 2.9791, + "step": 18680 + }, + { + "epoch": 1.5921759140884684, + "grad_norm": 45.43670295846054, + "learning_rate": 5.336673665859816e-06, + "loss": 3.6638, + "step": 18681 + }, + { + "epoch": 1.5922611437824938, + "grad_norm": 50.035608115699205, + "learning_rate": 5.336178941735171e-06, + "loss": 3.4053, + "step": 18682 + }, + { + "epoch": 1.592346373476519, + "grad_norm": 85.25748076277586, + "learning_rate": 5.335684214304338e-06, + "loss": 2.8293, + "step": 18683 + }, + { + "epoch": 1.5924316031705446, + "grad_norm": 60.26549921053508, + "learning_rate": 5.335189483572185e-06, + "loss": 2.0735, + "step": 18684 + }, + { + "epoch": 1.59251683286457, + "grad_norm": 58.231001975602375, + "learning_rate": 5.3346947495435786e-06, + "loss": 3.0243, + "step": 18685 + }, + { + "epoch": 1.5926020625585955, + "grad_norm": 47.41400436891411, + "learning_rate": 5.3342000122233805e-06, + "loss": 2.5245, + "step": 18686 + }, + { + "epoch": 1.592687292252621, + "grad_norm": 41.938860908710716, + "learning_rate": 5.333705271616458e-06, + "loss": 2.8044, + "step": 18687 + }, + { + "epoch": 1.5927725219466462, + "grad_norm": 48.18199457478192, + "learning_rate": 5.3332105277276765e-06, + "loss": 3.5287, + "step": 18688 + }, + { + "epoch": 1.5928577516406714, + "grad_norm": 41.81188410571394, + "learning_rate": 5.332715780561904e-06, + "loss": 2.8586, + "step": 18689 + }, + { + "epoch": 1.592942981334697, + "grad_norm": 40.61506709393385, + "learning_rate": 5.332221030124002e-06, + "loss": 2.1095, + "step": 18690 + }, + { + "epoch": 1.5930282110287224, + "grad_norm": 50.56003751316639, + "learning_rate": 5.3317262764188385e-06, + "loss": 3.0755, + "step": 18691 + }, + { + "epoch": 1.5931134407227479, + "grad_norm": 42.34395446993836, + "learning_rate": 5.331231519451278e-06, + "loss": 3.2879, + "step": 18692 + }, + { + "epoch": 1.5931986704167733, + "grad_norm": 109.72819540986087, + "learning_rate": 5.3307367592261905e-06, + "loss": 3.4913, + "step": 18693 + }, + { + "epoch": 1.5932839001107986, + "grad_norm": 38.81834324262362, + "learning_rate": 5.330241995748435e-06, + "loss": 3.2025, + "step": 18694 + }, + { + "epoch": 1.593369129804824, + "grad_norm": 28.222607158244518, + "learning_rate": 5.329747229022883e-06, + "loss": 1.8654, + "step": 18695 + }, + { + "epoch": 1.5934543594988493, + "grad_norm": 35.837898706296166, + "learning_rate": 5.329252459054397e-06, + "loss": 2.5282, + "step": 18696 + }, + { + "epoch": 1.5935395891928748, + "grad_norm": 37.58183206319717, + "learning_rate": 5.328757685847843e-06, + "loss": 2.618, + "step": 18697 + }, + { + "epoch": 1.5936248188869002, + "grad_norm": 30.239377389989397, + "learning_rate": 5.328262909408088e-06, + "loss": 1.6776, + "step": 18698 + }, + { + "epoch": 1.5937100485809257, + "grad_norm": 47.73035583569445, + "learning_rate": 5.3277681297399984e-06, + "loss": 2.7513, + "step": 18699 + }, + { + "epoch": 1.5937952782749512, + "grad_norm": 26.488025106766365, + "learning_rate": 5.32727334684844e-06, + "loss": 2.0598, + "step": 18700 + }, + { + "epoch": 1.5938805079689764, + "grad_norm": 50.76858340380087, + "learning_rate": 5.326778560738277e-06, + "loss": 3.5224, + "step": 18701 + }, + { + "epoch": 1.5939657376630016, + "grad_norm": 76.37020854000636, + "learning_rate": 5.326283771414378e-06, + "loss": 2.7905, + "step": 18702 + }, + { + "epoch": 1.5940509673570271, + "grad_norm": 67.45013431511461, + "learning_rate": 5.325788978881607e-06, + "loss": 3.4277, + "step": 18703 + }, + { + "epoch": 1.5941361970510526, + "grad_norm": 87.54151405948767, + "learning_rate": 5.325294183144831e-06, + "loss": 2.5426, + "step": 18704 + }, + { + "epoch": 1.594221426745078, + "grad_norm": 36.4139763272944, + "learning_rate": 5.324799384208916e-06, + "loss": 2.5866, + "step": 18705 + }, + { + "epoch": 1.5943066564391035, + "grad_norm": 30.848335518907017, + "learning_rate": 5.324304582078727e-06, + "loss": 2.5857, + "step": 18706 + }, + { + "epoch": 1.5943918861331288, + "grad_norm": 62.547925673323924, + "learning_rate": 5.323809776759133e-06, + "loss": 3.2599, + "step": 18707 + }, + { + "epoch": 1.5944771158271542, + "grad_norm": 59.61115203653552, + "learning_rate": 5.323314968254999e-06, + "loss": 1.3084, + "step": 18708 + }, + { + "epoch": 1.5945623455211795, + "grad_norm": 35.10631010269697, + "learning_rate": 5.322820156571189e-06, + "loss": 3.3255, + "step": 18709 + }, + { + "epoch": 1.594647575215205, + "grad_norm": 51.40675309763153, + "learning_rate": 5.322325341712571e-06, + "loss": 3.0195, + "step": 18710 + }, + { + "epoch": 1.5947328049092304, + "grad_norm": 71.97337404796012, + "learning_rate": 5.3218305236840125e-06, + "loss": 2.675, + "step": 18711 + }, + { + "epoch": 1.5948180346032559, + "grad_norm": 46.30577542686656, + "learning_rate": 5.321335702490379e-06, + "loss": 2.2719, + "step": 18712 + }, + { + "epoch": 1.5949032642972811, + "grad_norm": 38.79890318182827, + "learning_rate": 5.320840878136536e-06, + "loss": 2.9021, + "step": 18713 + }, + { + "epoch": 1.5949884939913066, + "grad_norm": 46.3209236847172, + "learning_rate": 5.320346050627349e-06, + "loss": 2.5204, + "step": 18714 + }, + { + "epoch": 1.5950737236853318, + "grad_norm": 109.06509216411294, + "learning_rate": 5.3198512199676876e-06, + "loss": 2.6603, + "step": 18715 + }, + { + "epoch": 1.5951589533793573, + "grad_norm": 30.09093023623328, + "learning_rate": 5.319356386162415e-06, + "loss": 2.2857, + "step": 18716 + }, + { + "epoch": 1.5952441830733828, + "grad_norm": 58.412411451657526, + "learning_rate": 5.318861549216402e-06, + "loss": 2.737, + "step": 18717 + }, + { + "epoch": 1.5953294127674083, + "grad_norm": 44.53060822033602, + "learning_rate": 5.318366709134509e-06, + "loss": 3.6427, + "step": 18718 + }, + { + "epoch": 1.5954146424614337, + "grad_norm": 68.79596536845041, + "learning_rate": 5.3178718659216085e-06, + "loss": 3.324, + "step": 18719 + }, + { + "epoch": 1.595499872155459, + "grad_norm": 59.07644774361048, + "learning_rate": 5.317377019582564e-06, + "loss": 1.5111, + "step": 18720 + }, + { + "epoch": 1.5955851018494842, + "grad_norm": 42.551284947926085, + "learning_rate": 5.316882170122243e-06, + "loss": 2.7297, + "step": 18721 + }, + { + "epoch": 1.5956703315435097, + "grad_norm": 37.96295342763901, + "learning_rate": 5.31638731754551e-06, + "loss": 2.5901, + "step": 18722 + }, + { + "epoch": 1.5957555612375351, + "grad_norm": 53.820901594454895, + "learning_rate": 5.315892461857234e-06, + "loss": 3.4124, + "step": 18723 + }, + { + "epoch": 1.5958407909315606, + "grad_norm": 29.145097595902058, + "learning_rate": 5.315397603062281e-06, + "loss": 1.9478, + "step": 18724 + }, + { + "epoch": 1.595926020625586, + "grad_norm": 73.0563099951189, + "learning_rate": 5.31490274116552e-06, + "loss": 2.5156, + "step": 18725 + }, + { + "epoch": 1.5960112503196113, + "grad_norm": 49.94220167078169, + "learning_rate": 5.3144078761718135e-06, + "loss": 2.2971, + "step": 18726 + }, + { + "epoch": 1.5960964800136368, + "grad_norm": 68.52540013227964, + "learning_rate": 5.313913008086031e-06, + "loss": 2.9427, + "step": 18727 + }, + { + "epoch": 1.596181709707662, + "grad_norm": 95.0904352405366, + "learning_rate": 5.31341813691304e-06, + "loss": 4.0256, + "step": 18728 + }, + { + "epoch": 1.5962669394016875, + "grad_norm": 22.16659990716698, + "learning_rate": 5.3129232626577044e-06, + "loss": 1.4178, + "step": 18729 + }, + { + "epoch": 1.596352169095713, + "grad_norm": 95.71463927305938, + "learning_rate": 5.312428385324894e-06, + "loss": 4.3543, + "step": 18730 + }, + { + "epoch": 1.5964373987897384, + "grad_norm": 69.52755625330916, + "learning_rate": 5.311933504919473e-06, + "loss": 3.6153, + "step": 18731 + }, + { + "epoch": 1.5965226284837637, + "grad_norm": 88.8281756688704, + "learning_rate": 5.311438621446312e-06, + "loss": 3.9086, + "step": 18732 + }, + { + "epoch": 1.5966078581777892, + "grad_norm": 43.14774407476014, + "learning_rate": 5.310943734910274e-06, + "loss": 3.0427, + "step": 18733 + }, + { + "epoch": 1.5966930878718144, + "grad_norm": 49.6589122388002, + "learning_rate": 5.31044884531623e-06, + "loss": 3.2484, + "step": 18734 + }, + { + "epoch": 1.5967783175658399, + "grad_norm": 44.32414290440025, + "learning_rate": 5.309953952669042e-06, + "loss": 2.058, + "step": 18735 + }, + { + "epoch": 1.5968635472598653, + "grad_norm": 96.79161282625145, + "learning_rate": 5.309459056973584e-06, + "loss": 3.8132, + "step": 18736 + }, + { + "epoch": 1.5969487769538908, + "grad_norm": 127.28655214558344, + "learning_rate": 5.308964158234716e-06, + "loss": 2.7934, + "step": 18737 + }, + { + "epoch": 1.5970340066479163, + "grad_norm": 81.16402507218024, + "learning_rate": 5.308469256457309e-06, + "loss": 3.8879, + "step": 18738 + }, + { + "epoch": 1.5971192363419415, + "grad_norm": 43.10103420852817, + "learning_rate": 5.30797435164623e-06, + "loss": 3.2004, + "step": 18739 + }, + { + "epoch": 1.5972044660359668, + "grad_norm": 29.870021535253464, + "learning_rate": 5.307479443806346e-06, + "loss": 2.9203, + "step": 18740 + }, + { + "epoch": 1.5972896957299922, + "grad_norm": 131.27494125114418, + "learning_rate": 5.306984532942523e-06, + "loss": 1.8439, + "step": 18741 + }, + { + "epoch": 1.5973749254240177, + "grad_norm": 56.80581635532969, + "learning_rate": 5.306489619059629e-06, + "loss": 2.8831, + "step": 18742 + }, + { + "epoch": 1.5974601551180432, + "grad_norm": 57.925311273521885, + "learning_rate": 5.305994702162532e-06, + "loss": 3.8364, + "step": 18743 + }, + { + "epoch": 1.5975453848120686, + "grad_norm": 29.398049322082546, + "learning_rate": 5.305499782256098e-06, + "loss": 2.368, + "step": 18744 + }, + { + "epoch": 1.597630614506094, + "grad_norm": 43.235198150515735, + "learning_rate": 5.305004859345196e-06, + "loss": 2.2907, + "step": 18745 + }, + { + "epoch": 1.5977158442001194, + "grad_norm": 55.14589507618937, + "learning_rate": 5.3045099334346915e-06, + "loss": 3.4806, + "step": 18746 + }, + { + "epoch": 1.5978010738941446, + "grad_norm": 68.74387329885742, + "learning_rate": 5.304015004529453e-06, + "loss": 3.8636, + "step": 18747 + }, + { + "epoch": 1.59788630358817, + "grad_norm": 38.87727302830286, + "learning_rate": 5.303520072634348e-06, + "loss": 3.0993, + "step": 18748 + }, + { + "epoch": 1.5979715332821955, + "grad_norm": 90.98531820095096, + "learning_rate": 5.303025137754244e-06, + "loss": 3.3305, + "step": 18749 + }, + { + "epoch": 1.598056762976221, + "grad_norm": 48.04816828508251, + "learning_rate": 5.302530199894007e-06, + "loss": 2.7035, + "step": 18750 + }, + { + "epoch": 1.5981419926702463, + "grad_norm": 41.08075116822414, + "learning_rate": 5.3020352590585074e-06, + "loss": 2.9261, + "step": 18751 + }, + { + "epoch": 1.5982272223642717, + "grad_norm": 52.87276998995624, + "learning_rate": 5.3015403152526104e-06, + "loss": 2.3229, + "step": 18752 + }, + { + "epoch": 1.598312452058297, + "grad_norm": 33.97636625745173, + "learning_rate": 5.301045368481183e-06, + "loss": 3.4913, + "step": 18753 + }, + { + "epoch": 1.5983976817523224, + "grad_norm": 230.6728426414453, + "learning_rate": 5.3005504187490965e-06, + "loss": 2.1669, + "step": 18754 + }, + { + "epoch": 1.598482911446348, + "grad_norm": 51.233497097311364, + "learning_rate": 5.300055466061214e-06, + "loss": 2.9199, + "step": 18755 + }, + { + "epoch": 1.5985681411403734, + "grad_norm": 82.87215605472468, + "learning_rate": 5.299560510422407e-06, + "loss": 2.9647, + "step": 18756 + }, + { + "epoch": 1.5986533708343988, + "grad_norm": 29.430862068428016, + "learning_rate": 5.299065551837538e-06, + "loss": 2.6347, + "step": 18757 + }, + { + "epoch": 1.598738600528424, + "grad_norm": 35.0320318313767, + "learning_rate": 5.298570590311482e-06, + "loss": 2.7049, + "step": 18758 + }, + { + "epoch": 1.5988238302224493, + "grad_norm": 50.00319398679704, + "learning_rate": 5.2980756258491e-06, + "loss": 2.614, + "step": 18759 + }, + { + "epoch": 1.5989090599164748, + "grad_norm": 53.435382118943636, + "learning_rate": 5.297580658455265e-06, + "loss": 3.2603, + "step": 18760 + }, + { + "epoch": 1.5989942896105003, + "grad_norm": 68.20707163061259, + "learning_rate": 5.297085688134839e-06, + "loss": 3.5948, + "step": 18761 + }, + { + "epoch": 1.5990795193045257, + "grad_norm": 37.33833429153736, + "learning_rate": 5.2965907148926976e-06, + "loss": 2.1243, + "step": 18762 + }, + { + "epoch": 1.5991647489985512, + "grad_norm": 70.4765697819069, + "learning_rate": 5.296095738733702e-06, + "loss": 2.5646, + "step": 18763 + }, + { + "epoch": 1.5992499786925765, + "grad_norm": 17.47812253396407, + "learning_rate": 5.295600759662723e-06, + "loss": 1.5609, + "step": 18764 + }, + { + "epoch": 1.599335208386602, + "grad_norm": 72.72876293547719, + "learning_rate": 5.295105777684627e-06, + "loss": 2.9415, + "step": 18765 + }, + { + "epoch": 1.5994204380806272, + "grad_norm": 45.457484282599836, + "learning_rate": 5.294610792804284e-06, + "loss": 2.6815, + "step": 18766 + }, + { + "epoch": 1.5995056677746526, + "grad_norm": 36.1136721275152, + "learning_rate": 5.294115805026561e-06, + "loss": 2.3843, + "step": 18767 + }, + { + "epoch": 1.599590897468678, + "grad_norm": 36.442402236494175, + "learning_rate": 5.293620814356326e-06, + "loss": 2.8273, + "step": 18768 + }, + { + "epoch": 1.5996761271627036, + "grad_norm": 51.22556139196483, + "learning_rate": 5.293125820798446e-06, + "loss": 2.8253, + "step": 18769 + }, + { + "epoch": 1.599761356856729, + "grad_norm": 38.61578403129057, + "learning_rate": 5.292630824357791e-06, + "loss": 3.1518, + "step": 18770 + }, + { + "epoch": 1.5998465865507543, + "grad_norm": 39.99019925337925, + "learning_rate": 5.292135825039228e-06, + "loss": 2.5955, + "step": 18771 + }, + { + "epoch": 1.5999318162447795, + "grad_norm": 41.89063816394581, + "learning_rate": 5.2916408228476245e-06, + "loss": 3.4591, + "step": 18772 + }, + { + "epoch": 1.600017045938805, + "grad_norm": 41.946726038897275, + "learning_rate": 5.2911458177878505e-06, + "loss": 2.8469, + "step": 18773 + }, + { + "epoch": 1.6001022756328305, + "grad_norm": 33.21423827392997, + "learning_rate": 5.290650809864771e-06, + "loss": 2.4139, + "step": 18774 + }, + { + "epoch": 1.600187505326856, + "grad_norm": 37.49307929534306, + "learning_rate": 5.29015579908326e-06, + "loss": 2.5585, + "step": 18775 + }, + { + "epoch": 1.6002727350208814, + "grad_norm": 38.45108946037658, + "learning_rate": 5.2896607854481786e-06, + "loss": 2.8922, + "step": 18776 + }, + { + "epoch": 1.6003579647149067, + "grad_norm": 36.76154807049504, + "learning_rate": 5.289165768964399e-06, + "loss": 3.0436, + "step": 18777 + }, + { + "epoch": 1.6004431944089321, + "grad_norm": 34.00785420618056, + "learning_rate": 5.28867074963679e-06, + "loss": 2.8458, + "step": 18778 + }, + { + "epoch": 1.6005284241029574, + "grad_norm": 33.70096236667019, + "learning_rate": 5.2881757274702195e-06, + "loss": 2.7256, + "step": 18779 + }, + { + "epoch": 1.6006136537969828, + "grad_norm": 45.752531827796574, + "learning_rate": 5.2876807024695534e-06, + "loss": 2.4519, + "step": 18780 + }, + { + "epoch": 1.6006988834910083, + "grad_norm": 44.13967378635677, + "learning_rate": 5.287185674639662e-06, + "loss": 3.9753, + "step": 18781 + }, + { + "epoch": 1.6007841131850338, + "grad_norm": 144.69344300925744, + "learning_rate": 5.286690643985415e-06, + "loss": 3.2204, + "step": 18782 + }, + { + "epoch": 1.600869342879059, + "grad_norm": 64.03015081214414, + "learning_rate": 5.2861956105116785e-06, + "loss": 3.0681, + "step": 18783 + }, + { + "epoch": 1.6009545725730845, + "grad_norm": 31.246251931531773, + "learning_rate": 5.285700574223323e-06, + "loss": 1.909, + "step": 18784 + }, + { + "epoch": 1.6010398022671097, + "grad_norm": 57.19867634542985, + "learning_rate": 5.2852055351252144e-06, + "loss": 3.0395, + "step": 18785 + }, + { + "epoch": 1.6011250319611352, + "grad_norm": 60.89181564593446, + "learning_rate": 5.284710493222224e-06, + "loss": 2.7603, + "step": 18786 + }, + { + "epoch": 1.6012102616551607, + "grad_norm": 42.44555925411254, + "learning_rate": 5.284215448519218e-06, + "loss": 3.2424, + "step": 18787 + }, + { + "epoch": 1.6012954913491861, + "grad_norm": 66.27295160790972, + "learning_rate": 5.283720401021066e-06, + "loss": 2.4245, + "step": 18788 + }, + { + "epoch": 1.6013807210432116, + "grad_norm": 95.7853320948333, + "learning_rate": 5.283225350732637e-06, + "loss": 3.2828, + "step": 18789 + }, + { + "epoch": 1.6014659507372369, + "grad_norm": 36.7463987649432, + "learning_rate": 5.282730297658799e-06, + "loss": 2.7767, + "step": 18790 + }, + { + "epoch": 1.601551180431262, + "grad_norm": 43.51183891181784, + "learning_rate": 5.282235241804421e-06, + "loss": 3.0162, + "step": 18791 + }, + { + "epoch": 1.6016364101252876, + "grad_norm": 39.206567029728625, + "learning_rate": 5.281740183174372e-06, + "loss": 2.7263, + "step": 18792 + }, + { + "epoch": 1.601721639819313, + "grad_norm": 43.068614352527746, + "learning_rate": 5.28124512177352e-06, + "loss": 2.7694, + "step": 18793 + }, + { + "epoch": 1.6018068695133385, + "grad_norm": 66.58320116403726, + "learning_rate": 5.280750057606734e-06, + "loss": 2.8462, + "step": 18794 + }, + { + "epoch": 1.601892099207364, + "grad_norm": 53.35622473632944, + "learning_rate": 5.280254990678881e-06, + "loss": 2.6074, + "step": 18795 + }, + { + "epoch": 1.6019773289013892, + "grad_norm": 37.50570419845894, + "learning_rate": 5.2797599209948336e-06, + "loss": 2.9388, + "step": 18796 + }, + { + "epoch": 1.6020625585954147, + "grad_norm": 81.74899069184306, + "learning_rate": 5.279264848559458e-06, + "loss": 2.8478, + "step": 18797 + }, + { + "epoch": 1.60214778828944, + "grad_norm": 49.714195557616414, + "learning_rate": 5.278769773377625e-06, + "loss": 2.9416, + "step": 18798 + }, + { + "epoch": 1.6022330179834654, + "grad_norm": 47.97779572577989, + "learning_rate": 5.2782746954542e-06, + "loss": 1.5546, + "step": 18799 + }, + { + "epoch": 1.6023182476774909, + "grad_norm": 38.16824645945424, + "learning_rate": 5.277779614794054e-06, + "loss": 2.5636, + "step": 18800 + }, + { + "epoch": 1.6024034773715163, + "grad_norm": 47.970504546968684, + "learning_rate": 5.277284531402057e-06, + "loss": 2.4745, + "step": 18801 + }, + { + "epoch": 1.6024887070655416, + "grad_norm": 34.53650011830157, + "learning_rate": 5.276789445283076e-06, + "loss": 2.4456, + "step": 18802 + }, + { + "epoch": 1.602573936759567, + "grad_norm": 145.24889716996873, + "learning_rate": 5.276294356441982e-06, + "loss": 2.7663, + "step": 18803 + }, + { + "epoch": 1.6026591664535923, + "grad_norm": 52.31002897816984, + "learning_rate": 5.275799264883641e-06, + "loss": 2.936, + "step": 18804 + }, + { + "epoch": 1.6027443961476178, + "grad_norm": 47.09951672708291, + "learning_rate": 5.275304170612925e-06, + "loss": 3.2276, + "step": 18805 + }, + { + "epoch": 1.6028296258416432, + "grad_norm": 103.39941736137635, + "learning_rate": 5.274809073634701e-06, + "loss": 2.8704, + "step": 18806 + }, + { + "epoch": 1.6029148555356687, + "grad_norm": 40.89609677293441, + "learning_rate": 5.274313973953841e-06, + "loss": 3.5034, + "step": 18807 + }, + { + "epoch": 1.6030000852296942, + "grad_norm": 86.12402940342834, + "learning_rate": 5.273818871575209e-06, + "loss": 3.3783, + "step": 18808 + }, + { + "epoch": 1.6030853149237194, + "grad_norm": 76.66689619858023, + "learning_rate": 5.27332376650368e-06, + "loss": 3.4392, + "step": 18809 + }, + { + "epoch": 1.6031705446177447, + "grad_norm": 61.26559602562885, + "learning_rate": 5.272828658744118e-06, + "loss": 3.1919, + "step": 18810 + }, + { + "epoch": 1.6032557743117701, + "grad_norm": 29.29375753658017, + "learning_rate": 5.272333548301397e-06, + "loss": 1.7305, + "step": 18811 + }, + { + "epoch": 1.6033410040057956, + "grad_norm": 81.82656548536148, + "learning_rate": 5.271838435180381e-06, + "loss": 2.4733, + "step": 18812 + }, + { + "epoch": 1.603426233699821, + "grad_norm": 36.3295675642705, + "learning_rate": 5.271343319385945e-06, + "loss": 2.5653, + "step": 18813 + }, + { + "epoch": 1.6035114633938465, + "grad_norm": 41.20731274694065, + "learning_rate": 5.270848200922953e-06, + "loss": 2.8509, + "step": 18814 + }, + { + "epoch": 1.6035966930878718, + "grad_norm": 53.724098467498735, + "learning_rate": 5.270353079796278e-06, + "loss": 2.467, + "step": 18815 + }, + { + "epoch": 1.6036819227818973, + "grad_norm": 60.60721904084936, + "learning_rate": 5.269857956010787e-06, + "loss": 2.33, + "step": 18816 + }, + { + "epoch": 1.6037671524759225, + "grad_norm": 96.94772335518181, + "learning_rate": 5.26936282957135e-06, + "loss": 2.9863, + "step": 18817 + }, + { + "epoch": 1.603852382169948, + "grad_norm": 69.3709499641595, + "learning_rate": 5.268867700482838e-06, + "loss": 2.6575, + "step": 18818 + }, + { + "epoch": 1.6039376118639734, + "grad_norm": 57.38782882160754, + "learning_rate": 5.268372568750118e-06, + "loss": 3.856, + "step": 18819 + }, + { + "epoch": 1.604022841557999, + "grad_norm": 41.76759825540486, + "learning_rate": 5.267877434378059e-06, + "loss": 1.7244, + "step": 18820 + }, + { + "epoch": 1.6041080712520244, + "grad_norm": 85.42355050212954, + "learning_rate": 5.2673822973715334e-06, + "loss": 1.5354, + "step": 18821 + }, + { + "epoch": 1.6041933009460496, + "grad_norm": 38.25015895588585, + "learning_rate": 5.2668871577354095e-06, + "loss": 2.4837, + "step": 18822 + }, + { + "epoch": 1.6042785306400749, + "grad_norm": 35.13506170885036, + "learning_rate": 5.266392015474557e-06, + "loss": 2.5539, + "step": 18823 + }, + { + "epoch": 1.6043637603341003, + "grad_norm": 54.7769522731906, + "learning_rate": 5.265896870593843e-06, + "loss": 2.4484, + "step": 18824 + }, + { + "epoch": 1.6044489900281258, + "grad_norm": 66.53716453576027, + "learning_rate": 5.265401723098141e-06, + "loss": 3.2643, + "step": 18825 + }, + { + "epoch": 1.6045342197221513, + "grad_norm": 34.81985996097626, + "learning_rate": 5.264906572992317e-06, + "loss": 2.2559, + "step": 18826 + }, + { + "epoch": 1.6046194494161767, + "grad_norm": 64.01939363750127, + "learning_rate": 5.264411420281242e-06, + "loss": 2.8685, + "step": 18827 + }, + { + "epoch": 1.604704679110202, + "grad_norm": 139.86780774702575, + "learning_rate": 5.263916264969786e-06, + "loss": 3.0249, + "step": 18828 + }, + { + "epoch": 1.6047899088042272, + "grad_norm": 151.44510149721893, + "learning_rate": 5.263421107062819e-06, + "loss": 2.8775, + "step": 18829 + }, + { + "epoch": 1.6048751384982527, + "grad_norm": 42.20829628243571, + "learning_rate": 5.262925946565211e-06, + "loss": 2.5863, + "step": 18830 + }, + { + "epoch": 1.6049603681922782, + "grad_norm": 25.523926428280173, + "learning_rate": 5.262430783481829e-06, + "loss": 2.3747, + "step": 18831 + }, + { + "epoch": 1.6050455978863036, + "grad_norm": 78.09686607221158, + "learning_rate": 5.261935617817544e-06, + "loss": 4.1405, + "step": 18832 + }, + { + "epoch": 1.605130827580329, + "grad_norm": 152.43541429006052, + "learning_rate": 5.261440449577229e-06, + "loss": 2.2957, + "step": 18833 + }, + { + "epoch": 1.6052160572743543, + "grad_norm": 44.03461570385365, + "learning_rate": 5.260945278765749e-06, + "loss": 3.1704, + "step": 18834 + }, + { + "epoch": 1.6053012869683798, + "grad_norm": 91.47320604588094, + "learning_rate": 5.260450105387975e-06, + "loss": 3.4341, + "step": 18835 + }, + { + "epoch": 1.605386516662405, + "grad_norm": 64.72571409116202, + "learning_rate": 5.259954929448779e-06, + "loss": 3.3681, + "step": 18836 + }, + { + "epoch": 1.6054717463564305, + "grad_norm": 60.49873723139474, + "learning_rate": 5.259459750953031e-06, + "loss": 2.8609, + "step": 18837 + }, + { + "epoch": 1.605556976050456, + "grad_norm": 46.19659219695925, + "learning_rate": 5.258964569905597e-06, + "loss": 3.5419, + "step": 18838 + }, + { + "epoch": 1.6056422057444815, + "grad_norm": 46.072048117463325, + "learning_rate": 5.25846938631135e-06, + "loss": 2.4788, + "step": 18839 + }, + { + "epoch": 1.605727435438507, + "grad_norm": 41.92494697589808, + "learning_rate": 5.2579742001751595e-06, + "loss": 3.233, + "step": 18840 + }, + { + "epoch": 1.6058126651325322, + "grad_norm": 43.33073465250891, + "learning_rate": 5.257479011501896e-06, + "loss": 2.7119, + "step": 18841 + }, + { + "epoch": 1.6058978948265574, + "grad_norm": 61.40021821207289, + "learning_rate": 5.256983820296429e-06, + "loss": 1.9657, + "step": 18842 + }, + { + "epoch": 1.605983124520583, + "grad_norm": 37.685647234955105, + "learning_rate": 5.256488626563626e-06, + "loss": 2.8531, + "step": 18843 + }, + { + "epoch": 1.6060683542146084, + "grad_norm": 43.06169391601206, + "learning_rate": 5.255993430308362e-06, + "loss": 2.8534, + "step": 18844 + }, + { + "epoch": 1.6061535839086338, + "grad_norm": 58.43492668192055, + "learning_rate": 5.255498231535503e-06, + "loss": 2.9561, + "step": 18845 + }, + { + "epoch": 1.6062388136026593, + "grad_norm": 79.11182050890103, + "learning_rate": 5.255003030249921e-06, + "loss": 4.0123, + "step": 18846 + }, + { + "epoch": 1.6063240432966845, + "grad_norm": 50.570169720914336, + "learning_rate": 5.254507826456483e-06, + "loss": 4.1504, + "step": 18847 + }, + { + "epoch": 1.60640927299071, + "grad_norm": 39.375405453257095, + "learning_rate": 5.254012620160065e-06, + "loss": 2.6454, + "step": 18848 + }, + { + "epoch": 1.6064945026847353, + "grad_norm": 36.30883601235602, + "learning_rate": 5.2535174113655315e-06, + "loss": 2.5045, + "step": 18849 + }, + { + "epoch": 1.6065797323787607, + "grad_norm": 89.47840262476223, + "learning_rate": 5.253022200077757e-06, + "loss": 3.6615, + "step": 18850 + }, + { + "epoch": 1.6066649620727862, + "grad_norm": 50.251314044112064, + "learning_rate": 5.252526986301607e-06, + "loss": 2.8988, + "step": 18851 + }, + { + "epoch": 1.6067501917668117, + "grad_norm": 35.460359746732635, + "learning_rate": 5.2520317700419575e-06, + "loss": 2.3565, + "step": 18852 + }, + { + "epoch": 1.606835421460837, + "grad_norm": 71.01136187096544, + "learning_rate": 5.251536551303673e-06, + "loss": 2.3921, + "step": 18853 + }, + { + "epoch": 1.6069206511548624, + "grad_norm": 42.69628086229028, + "learning_rate": 5.251041330091629e-06, + "loss": 1.718, + "step": 18854 + }, + { + "epoch": 1.6070058808488876, + "grad_norm": 57.42761087323818, + "learning_rate": 5.250546106410691e-06, + "loss": 2.4757, + "step": 18855 + }, + { + "epoch": 1.607091110542913, + "grad_norm": 66.91625857818548, + "learning_rate": 5.250050880265733e-06, + "loss": 2.8343, + "step": 18856 + }, + { + "epoch": 1.6071763402369386, + "grad_norm": 62.30639685646912, + "learning_rate": 5.249555651661624e-06, + "loss": 3.4875, + "step": 18857 + }, + { + "epoch": 1.607261569930964, + "grad_norm": 167.31164392292897, + "learning_rate": 5.249060420603233e-06, + "loss": 5.5514, + "step": 18858 + }, + { + "epoch": 1.6073467996249895, + "grad_norm": 68.74748629954499, + "learning_rate": 5.2485651870954336e-06, + "loss": 4.583, + "step": 18859 + }, + { + "epoch": 1.6074320293190147, + "grad_norm": 41.01761681345654, + "learning_rate": 5.248069951143093e-06, + "loss": 3.099, + "step": 18860 + }, + { + "epoch": 1.60751725901304, + "grad_norm": 40.79630775139381, + "learning_rate": 5.247574712751084e-06, + "loss": 2.0437, + "step": 18861 + }, + { + "epoch": 1.6076024887070655, + "grad_norm": 59.54170857390893, + "learning_rate": 5.247079471924275e-06, + "loss": 2.7679, + "step": 18862 + }, + { + "epoch": 1.607687718401091, + "grad_norm": 54.430719935862356, + "learning_rate": 5.246584228667537e-06, + "loss": 2.2496, + "step": 18863 + }, + { + "epoch": 1.6077729480951164, + "grad_norm": 31.878898527256684, + "learning_rate": 5.246088982985742e-06, + "loss": 2.6025, + "step": 18864 + }, + { + "epoch": 1.6078581777891419, + "grad_norm": 36.50068007838865, + "learning_rate": 5.245593734883762e-06, + "loss": 3.4814, + "step": 18865 + }, + { + "epoch": 1.607943407483167, + "grad_norm": 48.63649221945914, + "learning_rate": 5.245098484366462e-06, + "loss": 3.2323, + "step": 18866 + }, + { + "epoch": 1.6080286371771926, + "grad_norm": 29.45609179693243, + "learning_rate": 5.244603231438717e-06, + "loss": 2.2576, + "step": 18867 + }, + { + "epoch": 1.6081138668712178, + "grad_norm": 62.10871174845493, + "learning_rate": 5.244107976105395e-06, + "loss": 3.8008, + "step": 18868 + }, + { + "epoch": 1.6081990965652433, + "grad_norm": 39.00587900642391, + "learning_rate": 5.24361271837137e-06, + "loss": 2.6833, + "step": 18869 + }, + { + "epoch": 1.6082843262592688, + "grad_norm": 40.30017210925537, + "learning_rate": 5.24311745824151e-06, + "loss": 2.7124, + "step": 18870 + }, + { + "epoch": 1.6083695559532942, + "grad_norm": 71.87342456372666, + "learning_rate": 5.242622195720686e-06, + "loss": 2.8234, + "step": 18871 + }, + { + "epoch": 1.6084547856473195, + "grad_norm": 44.147348622781806, + "learning_rate": 5.24212693081377e-06, + "loss": 2.3167, + "step": 18872 + }, + { + "epoch": 1.608540015341345, + "grad_norm": 20.923769545258896, + "learning_rate": 5.2416316635256315e-06, + "loss": 1.7816, + "step": 18873 + }, + { + "epoch": 1.6086252450353702, + "grad_norm": 79.64950947300416, + "learning_rate": 5.24113639386114e-06, + "loss": 3.0302, + "step": 18874 + }, + { + "epoch": 1.6087104747293957, + "grad_norm": 56.589245175207765, + "learning_rate": 5.24064112182517e-06, + "loss": 3.0835, + "step": 18875 + }, + { + "epoch": 1.6087957044234211, + "grad_norm": 35.8417577809927, + "learning_rate": 5.24014584742259e-06, + "loss": 2.9782, + "step": 18876 + }, + { + "epoch": 1.6088809341174466, + "grad_norm": 62.669989692373456, + "learning_rate": 5.23965057065827e-06, + "loss": 2.7056, + "step": 18877 + }, + { + "epoch": 1.608966163811472, + "grad_norm": 45.702116718987064, + "learning_rate": 5.239155291537083e-06, + "loss": 2.5323, + "step": 18878 + }, + { + "epoch": 1.6090513935054973, + "grad_norm": 51.76532834210267, + "learning_rate": 5.2386600100638975e-06, + "loss": 2.9642, + "step": 18879 + }, + { + "epoch": 1.6091366231995226, + "grad_norm": 68.0761424666435, + "learning_rate": 5.238164726243587e-06, + "loss": 2.2854, + "step": 18880 + }, + { + "epoch": 1.609221852893548, + "grad_norm": 66.52645080886822, + "learning_rate": 5.2376694400810195e-06, + "loss": 3.8759, + "step": 18881 + }, + { + "epoch": 1.6093070825875735, + "grad_norm": 66.84229501157579, + "learning_rate": 5.237174151581068e-06, + "loss": 2.8269, + "step": 18882 + }, + { + "epoch": 1.609392312281599, + "grad_norm": 41.28000671839385, + "learning_rate": 5.236678860748603e-06, + "loss": 2.753, + "step": 18883 + }, + { + "epoch": 1.6094775419756244, + "grad_norm": 55.77709954760078, + "learning_rate": 5.236183567588497e-06, + "loss": 2.6055, + "step": 18884 + }, + { + "epoch": 1.6095627716696497, + "grad_norm": 51.867693139322014, + "learning_rate": 5.235688272105617e-06, + "loss": 3.0502, + "step": 18885 + }, + { + "epoch": 1.6096480013636751, + "grad_norm": 78.69861370884229, + "learning_rate": 5.235192974304838e-06, + "loss": 3.0245, + "step": 18886 + }, + { + "epoch": 1.6097332310577004, + "grad_norm": 54.32436641851943, + "learning_rate": 5.2346976741910295e-06, + "loss": 3.5714, + "step": 18887 + }, + { + "epoch": 1.6098184607517259, + "grad_norm": 50.26356028459417, + "learning_rate": 5.234202371769063e-06, + "loss": 3.0478, + "step": 18888 + }, + { + "epoch": 1.6099036904457513, + "grad_norm": 66.88068288567226, + "learning_rate": 5.233707067043808e-06, + "loss": 2.6938, + "step": 18889 + }, + { + "epoch": 1.6099889201397768, + "grad_norm": 49.7087829965748, + "learning_rate": 5.233211760020136e-06, + "loss": 3.3022, + "step": 18890 + }, + { + "epoch": 1.6100741498338023, + "grad_norm": 72.49756162225653, + "learning_rate": 5.232716450702921e-06, + "loss": 3.4549, + "step": 18891 + }, + { + "epoch": 1.6101593795278275, + "grad_norm": 42.84655465739796, + "learning_rate": 5.232221139097032e-06, + "loss": 2.2919, + "step": 18892 + }, + { + "epoch": 1.6102446092218528, + "grad_norm": 34.08439825366507, + "learning_rate": 5.231725825207341e-06, + "loss": 2.8725, + "step": 18893 + }, + { + "epoch": 1.6103298389158782, + "grad_norm": 54.571620203694, + "learning_rate": 5.231230509038716e-06, + "loss": 3.1674, + "step": 18894 + }, + { + "epoch": 1.6104150686099037, + "grad_norm": 34.546264506528736, + "learning_rate": 5.230735190596034e-06, + "loss": 1.9296, + "step": 18895 + }, + { + "epoch": 1.6105002983039292, + "grad_norm": 138.40682194047253, + "learning_rate": 5.230239869884161e-06, + "loss": 2.9292, + "step": 18896 + }, + { + "epoch": 1.6105855279979546, + "grad_norm": 35.94488660894465, + "learning_rate": 5.229744546907971e-06, + "loss": 3.8828, + "step": 18897 + }, + { + "epoch": 1.6106707576919799, + "grad_norm": 34.935240828691605, + "learning_rate": 5.229249221672334e-06, + "loss": 2.612, + "step": 18898 + }, + { + "epoch": 1.6107559873860053, + "grad_norm": 55.92216245398795, + "learning_rate": 5.228753894182123e-06, + "loss": 3.3611, + "step": 18899 + }, + { + "epoch": 1.6108412170800306, + "grad_norm": 77.71419271775537, + "learning_rate": 5.228258564442207e-06, + "loss": 2.5004, + "step": 18900 + }, + { + "epoch": 1.610926446774056, + "grad_norm": 121.46694208862561, + "learning_rate": 5.227763232457461e-06, + "loss": 3.8992, + "step": 18901 + }, + { + "epoch": 1.6110116764680815, + "grad_norm": 30.147095094449167, + "learning_rate": 5.227267898232752e-06, + "loss": 3.0159, + "step": 18902 + }, + { + "epoch": 1.611096906162107, + "grad_norm": 51.96139833989853, + "learning_rate": 5.2267725617729535e-06, + "loss": 3.7181, + "step": 18903 + }, + { + "epoch": 1.6111821358561322, + "grad_norm": 63.28891117961418, + "learning_rate": 5.226277223082939e-06, + "loss": 3.3468, + "step": 18904 + }, + { + "epoch": 1.6112673655501577, + "grad_norm": 44.25079148493902, + "learning_rate": 5.225781882167576e-06, + "loss": 3.6149, + "step": 18905 + }, + { + "epoch": 1.611352595244183, + "grad_norm": 44.05444770679901, + "learning_rate": 5.2252865390317385e-06, + "loss": 2.6599, + "step": 18906 + }, + { + "epoch": 1.6114378249382084, + "grad_norm": 102.07108115798148, + "learning_rate": 5.224791193680297e-06, + "loss": 3.7634, + "step": 18907 + }, + { + "epoch": 1.6115230546322339, + "grad_norm": 58.04531687251874, + "learning_rate": 5.224295846118125e-06, + "loss": 2.8972, + "step": 18908 + }, + { + "epoch": 1.6116082843262594, + "grad_norm": 41.593466449963394, + "learning_rate": 5.223800496350091e-06, + "loss": 3.0686, + "step": 18909 + }, + { + "epoch": 1.6116935140202848, + "grad_norm": 57.808466731274535, + "learning_rate": 5.223305144381068e-06, + "loss": 3.3283, + "step": 18910 + }, + { + "epoch": 1.61177874371431, + "grad_norm": 71.64312195025876, + "learning_rate": 5.222809790215928e-06, + "loss": 2.5651, + "step": 18911 + }, + { + "epoch": 1.6118639734083353, + "grad_norm": 80.60022068811912, + "learning_rate": 5.222314433859543e-06, + "loss": 2.2742, + "step": 18912 + }, + { + "epoch": 1.6119492031023608, + "grad_norm": 37.94298029060228, + "learning_rate": 5.221819075316783e-06, + "loss": 3.7873, + "step": 18913 + }, + { + "epoch": 1.6120344327963863, + "grad_norm": 41.52717015091937, + "learning_rate": 5.22132371459252e-06, + "loss": 2.9504, + "step": 18914 + }, + { + "epoch": 1.6121196624904117, + "grad_norm": 124.04334648117776, + "learning_rate": 5.220828351691628e-06, + "loss": 3.8549, + "step": 18915 + }, + { + "epoch": 1.6122048921844372, + "grad_norm": 52.32495030782834, + "learning_rate": 5.220332986618976e-06, + "loss": 2.6153, + "step": 18916 + }, + { + "epoch": 1.6122901218784624, + "grad_norm": 41.424965929095634, + "learning_rate": 5.219837619379437e-06, + "loss": 3.261, + "step": 18917 + }, + { + "epoch": 1.612375351572488, + "grad_norm": 49.9262217017669, + "learning_rate": 5.2193422499778805e-06, + "loss": 3.3957, + "step": 18918 + }, + { + "epoch": 1.6124605812665131, + "grad_norm": 34.1103992484535, + "learning_rate": 5.218846878419183e-06, + "loss": 2.6675, + "step": 18919 + }, + { + "epoch": 1.6125458109605386, + "grad_norm": 39.13677732650805, + "learning_rate": 5.218351504708211e-06, + "loss": 2.9415, + "step": 18920 + }, + { + "epoch": 1.612631040654564, + "grad_norm": 30.30173983959165, + "learning_rate": 5.217856128849839e-06, + "loss": 3.0419, + "step": 18921 + }, + { + "epoch": 1.6127162703485896, + "grad_norm": 45.63308261587133, + "learning_rate": 5.217360750848939e-06, + "loss": 2.1318, + "step": 18922 + }, + { + "epoch": 1.6128015000426148, + "grad_norm": 47.22617339156918, + "learning_rate": 5.216865370710383e-06, + "loss": 1.1941, + "step": 18923 + }, + { + "epoch": 1.6128867297366403, + "grad_norm": 46.02500498156413, + "learning_rate": 5.2163699884390414e-06, + "loss": 2.1653, + "step": 18924 + }, + { + "epoch": 1.6129719594306655, + "grad_norm": 34.72592916664871, + "learning_rate": 5.215874604039788e-06, + "loss": 3.1923, + "step": 18925 + }, + { + "epoch": 1.613057189124691, + "grad_norm": 25.083207077473027, + "learning_rate": 5.215379217517493e-06, + "loss": 1.8411, + "step": 18926 + }, + { + "epoch": 1.6131424188187165, + "grad_norm": 36.95056013400587, + "learning_rate": 5.214883828877029e-06, + "loss": 3.1141, + "step": 18927 + }, + { + "epoch": 1.613227648512742, + "grad_norm": 58.07312224487053, + "learning_rate": 5.214388438123268e-06, + "loss": 2.8117, + "step": 18928 + }, + { + "epoch": 1.6133128782067674, + "grad_norm": 46.296734042569526, + "learning_rate": 5.213893045261081e-06, + "loss": 2.6925, + "step": 18929 + }, + { + "epoch": 1.6133981079007926, + "grad_norm": 61.45548400906047, + "learning_rate": 5.213397650295342e-06, + "loss": 3.3672, + "step": 18930 + }, + { + "epoch": 1.6134833375948179, + "grad_norm": 35.162141465595745, + "learning_rate": 5.212902253230922e-06, + "loss": 2.8254, + "step": 18931 + }, + { + "epoch": 1.6135685672888433, + "grad_norm": 57.82390099470897, + "learning_rate": 5.212406854072693e-06, + "loss": 3.8113, + "step": 18932 + }, + { + "epoch": 1.6136537969828688, + "grad_norm": 35.61811244681609, + "learning_rate": 5.2119114528255256e-06, + "loss": 2.6876, + "step": 18933 + }, + { + "epoch": 1.6137390266768943, + "grad_norm": 35.574964383432786, + "learning_rate": 5.211416049494294e-06, + "loss": 2.2818, + "step": 18934 + }, + { + "epoch": 1.6138242563709198, + "grad_norm": 79.42834827970445, + "learning_rate": 5.21092064408387e-06, + "loss": 4.9418, + "step": 18935 + }, + { + "epoch": 1.613909486064945, + "grad_norm": 42.57252812293679, + "learning_rate": 5.210425236599126e-06, + "loss": 2.957, + "step": 18936 + }, + { + "epoch": 1.6139947157589705, + "grad_norm": 79.58855994804063, + "learning_rate": 5.209929827044931e-06, + "loss": 2.8983, + "step": 18937 + }, + { + "epoch": 1.6140799454529957, + "grad_norm": 39.152772500963486, + "learning_rate": 5.209434415426161e-06, + "loss": 3.1259, + "step": 18938 + }, + { + "epoch": 1.6141651751470212, + "grad_norm": 79.09935235490967, + "learning_rate": 5.208939001747687e-06, + "loss": 3.3248, + "step": 18939 + }, + { + "epoch": 1.6142504048410466, + "grad_norm": 30.63807529041469, + "learning_rate": 5.208443586014381e-06, + "loss": 2.5339, + "step": 18940 + }, + { + "epoch": 1.6143356345350721, + "grad_norm": 88.27068118511642, + "learning_rate": 5.207948168231114e-06, + "loss": 3.9378, + "step": 18941 + }, + { + "epoch": 1.6144208642290974, + "grad_norm": 32.1815531705799, + "learning_rate": 5.2074527484027614e-06, + "loss": 2.2386, + "step": 18942 + }, + { + "epoch": 1.6145060939231228, + "grad_norm": 47.06079670451861, + "learning_rate": 5.206957326534191e-06, + "loss": 2.6898, + "step": 18943 + }, + { + "epoch": 1.614591323617148, + "grad_norm": 39.88321150750542, + "learning_rate": 5.20646190263028e-06, + "loss": 2.514, + "step": 18944 + }, + { + "epoch": 1.6146765533111735, + "grad_norm": 34.23377316646404, + "learning_rate": 5.2059664766958974e-06, + "loss": 2.8713, + "step": 18945 + }, + { + "epoch": 1.614761783005199, + "grad_norm": 48.062268353621235, + "learning_rate": 5.205471048735915e-06, + "loss": 2.8331, + "step": 18946 + }, + { + "epoch": 1.6148470126992245, + "grad_norm": 82.0779327873137, + "learning_rate": 5.204975618755208e-06, + "loss": 2.6887, + "step": 18947 + }, + { + "epoch": 1.61493224239325, + "grad_norm": 64.44853117060471, + "learning_rate": 5.2044801867586466e-06, + "loss": 3.9011, + "step": 18948 + }, + { + "epoch": 1.6150174720872752, + "grad_norm": 41.17473290545428, + "learning_rate": 5.203984752751103e-06, + "loss": 3.245, + "step": 18949 + }, + { + "epoch": 1.6151027017813004, + "grad_norm": 49.34962853003382, + "learning_rate": 5.203489316737452e-06, + "loss": 2.7621, + "step": 18950 + }, + { + "epoch": 1.615187931475326, + "grad_norm": 49.26067276097687, + "learning_rate": 5.202993878722564e-06, + "loss": 3.0208, + "step": 18951 + }, + { + "epoch": 1.6152731611693514, + "grad_norm": 67.12732272866491, + "learning_rate": 5.202498438711312e-06, + "loss": 3.4589, + "step": 18952 + }, + { + "epoch": 1.6153583908633768, + "grad_norm": 37.325436156905745, + "learning_rate": 5.202002996708567e-06, + "loss": 3.2618, + "step": 18953 + }, + { + "epoch": 1.6154436205574023, + "grad_norm": 24.36498408529128, + "learning_rate": 5.201507552719204e-06, + "loss": 1.9744, + "step": 18954 + }, + { + "epoch": 1.6155288502514276, + "grad_norm": 48.577771646030314, + "learning_rate": 5.201012106748095e-06, + "loss": 2.548, + "step": 18955 + }, + { + "epoch": 1.615614079945453, + "grad_norm": 46.74231286835039, + "learning_rate": 5.2005166588001095e-06, + "loss": 2.1714, + "step": 18956 + }, + { + "epoch": 1.6156993096394783, + "grad_norm": 85.8627432830068, + "learning_rate": 5.200021208880124e-06, + "loss": 4.0437, + "step": 18957 + }, + { + "epoch": 1.6157845393335037, + "grad_norm": 36.374010074968034, + "learning_rate": 5.199525756993009e-06, + "loss": 3.2043, + "step": 18958 + }, + { + "epoch": 1.6158697690275292, + "grad_norm": 45.46030058445945, + "learning_rate": 5.199030303143638e-06, + "loss": 3.2263, + "step": 18959 + }, + { + "epoch": 1.6159549987215547, + "grad_norm": 47.68684447597132, + "learning_rate": 5.198534847336882e-06, + "loss": 3.2051, + "step": 18960 + }, + { + "epoch": 1.6160402284155801, + "grad_norm": 40.153728333854914, + "learning_rate": 5.198039389577616e-06, + "loss": 3.0564, + "step": 18961 + }, + { + "epoch": 1.6161254581096054, + "grad_norm": 84.44545824806103, + "learning_rate": 5.197543929870711e-06, + "loss": 1.8119, + "step": 18962 + }, + { + "epoch": 1.6162106878036306, + "grad_norm": 137.39085062028448, + "learning_rate": 5.197048468221039e-06, + "loss": 3.7434, + "step": 18963 + }, + { + "epoch": 1.616295917497656, + "grad_norm": 57.21951312548219, + "learning_rate": 5.1965530046334745e-06, + "loss": 2.1561, + "step": 18964 + }, + { + "epoch": 1.6163811471916816, + "grad_norm": 70.53784634515362, + "learning_rate": 5.196057539112889e-06, + "loss": 2.8991, + "step": 18965 + }, + { + "epoch": 1.616466376885707, + "grad_norm": 19.44135189397429, + "learning_rate": 5.195562071664155e-06, + "loss": 1.054, + "step": 18966 + }, + { + "epoch": 1.6165516065797325, + "grad_norm": 31.578815641475504, + "learning_rate": 5.195066602292146e-06, + "loss": 1.1214, + "step": 18967 + }, + { + "epoch": 1.6166368362737578, + "grad_norm": 50.57247724802715, + "learning_rate": 5.1945711310017345e-06, + "loss": 3.03, + "step": 18968 + }, + { + "epoch": 1.6167220659677832, + "grad_norm": 80.03899596840198, + "learning_rate": 5.194075657797793e-06, + "loss": 3.5193, + "step": 18969 + }, + { + "epoch": 1.6168072956618085, + "grad_norm": 35.18368331514127, + "learning_rate": 5.193580182685196e-06, + "loss": 2.7522, + "step": 18970 + }, + { + "epoch": 1.616892525355834, + "grad_norm": 39.29368232407077, + "learning_rate": 5.193084705668814e-06, + "loss": 2.8053, + "step": 18971 + }, + { + "epoch": 1.6169777550498594, + "grad_norm": 35.985955002483486, + "learning_rate": 5.192589226753519e-06, + "loss": 3.2153, + "step": 18972 + }, + { + "epoch": 1.6170629847438849, + "grad_norm": 44.54698520012292, + "learning_rate": 5.192093745944187e-06, + "loss": 2.2733, + "step": 18973 + }, + { + "epoch": 1.6171482144379101, + "grad_norm": 67.51457691919224, + "learning_rate": 5.191598263245689e-06, + "loss": 2.975, + "step": 18974 + }, + { + "epoch": 1.6172334441319356, + "grad_norm": 79.93398225055013, + "learning_rate": 5.1911027786629e-06, + "loss": 3.7002, + "step": 18975 + }, + { + "epoch": 1.6173186738259608, + "grad_norm": 39.21306269597093, + "learning_rate": 5.190607292200688e-06, + "loss": 3.8657, + "step": 18976 + }, + { + "epoch": 1.6174039035199863, + "grad_norm": 42.319343380799914, + "learning_rate": 5.190111803863932e-06, + "loss": 2.7964, + "step": 18977 + }, + { + "epoch": 1.6174891332140118, + "grad_norm": 25.197299066228016, + "learning_rate": 5.1896163136575e-06, + "loss": 2.449, + "step": 18978 + }, + { + "epoch": 1.6175743629080372, + "grad_norm": 46.96818217302725, + "learning_rate": 5.189120821586268e-06, + "loss": 2.5617, + "step": 18979 + }, + { + "epoch": 1.6176595926020627, + "grad_norm": 28.96995691368098, + "learning_rate": 5.188625327655106e-06, + "loss": 2.1831, + "step": 18980 + }, + { + "epoch": 1.617744822296088, + "grad_norm": 37.29457185358138, + "learning_rate": 5.188129831868891e-06, + "loss": 2.889, + "step": 18981 + }, + { + "epoch": 1.6178300519901132, + "grad_norm": 41.388803246457, + "learning_rate": 5.187634334232492e-06, + "loss": 1.7491, + "step": 18982 + }, + { + "epoch": 1.6179152816841387, + "grad_norm": 48.561176646075445, + "learning_rate": 5.187138834750785e-06, + "loss": 2.6259, + "step": 18983 + }, + { + "epoch": 1.6180005113781641, + "grad_norm": 42.94119077020832, + "learning_rate": 5.186643333428642e-06, + "loss": 3.6454, + "step": 18984 + }, + { + "epoch": 1.6180857410721896, + "grad_norm": 44.08933407185207, + "learning_rate": 5.186147830270936e-06, + "loss": 3.3091, + "step": 18985 + }, + { + "epoch": 1.618170970766215, + "grad_norm": 57.87985527241045, + "learning_rate": 5.18565232528254e-06, + "loss": 2.826, + "step": 18986 + }, + { + "epoch": 1.6182562004602403, + "grad_norm": 107.12728706309933, + "learning_rate": 5.185156818468328e-06, + "loss": 3.1584, + "step": 18987 + }, + { + "epoch": 1.6183414301542658, + "grad_norm": 46.599759373868245, + "learning_rate": 5.18466130983317e-06, + "loss": 2.4811, + "step": 18988 + }, + { + "epoch": 1.618426659848291, + "grad_norm": 58.441910067631554, + "learning_rate": 5.184165799381943e-06, + "loss": 2.6714, + "step": 18989 + }, + { + "epoch": 1.6185118895423165, + "grad_norm": 54.3989204224092, + "learning_rate": 5.183670287119518e-06, + "loss": 2.1293, + "step": 18990 + }, + { + "epoch": 1.618597119236342, + "grad_norm": 79.709458608883, + "learning_rate": 5.183174773050768e-06, + "loss": 3.6097, + "step": 18991 + }, + { + "epoch": 1.6186823489303674, + "grad_norm": 45.24527011213661, + "learning_rate": 5.182679257180569e-06, + "loss": 3.0901, + "step": 18992 + }, + { + "epoch": 1.6187675786243927, + "grad_norm": 35.92263502746301, + "learning_rate": 5.1821837395137895e-06, + "loss": 1.689, + "step": 18993 + }, + { + "epoch": 1.6188528083184182, + "grad_norm": 96.55982943130779, + "learning_rate": 5.181688220055309e-06, + "loss": 4.284, + "step": 18994 + }, + { + "epoch": 1.6189380380124434, + "grad_norm": 67.91348538306912, + "learning_rate": 5.181192698809994e-06, + "loss": 3.93, + "step": 18995 + }, + { + "epoch": 1.6190232677064689, + "grad_norm": 33.34239746487308, + "learning_rate": 5.180697175782721e-06, + "loss": 2.458, + "step": 18996 + }, + { + "epoch": 1.6191084974004943, + "grad_norm": 66.96761936068273, + "learning_rate": 5.1802016509783635e-06, + "loss": 2.9941, + "step": 18997 + }, + { + "epoch": 1.6191937270945198, + "grad_norm": 98.44800626412486, + "learning_rate": 5.1797061244017955e-06, + "loss": 3.8179, + "step": 18998 + }, + { + "epoch": 1.6192789567885453, + "grad_norm": 42.39130441261876, + "learning_rate": 5.179210596057887e-06, + "loss": 2.9149, + "step": 18999 + }, + { + "epoch": 1.6193641864825705, + "grad_norm": 55.175831228625604, + "learning_rate": 5.178715065951515e-06, + "loss": 3.54, + "step": 19000 + }, + { + "epoch": 1.6194494161765958, + "grad_norm": 93.2216416627548, + "learning_rate": 5.178219534087551e-06, + "loss": 4.0501, + "step": 19001 + }, + { + "epoch": 1.6195346458706212, + "grad_norm": 70.77386470770786, + "learning_rate": 5.177724000470869e-06, + "loss": 2.4095, + "step": 19002 + }, + { + "epoch": 1.6196198755646467, + "grad_norm": 37.89196547205511, + "learning_rate": 5.177228465106341e-06, + "loss": 2.6867, + "step": 19003 + }, + { + "epoch": 1.6197051052586722, + "grad_norm": 83.42659203310171, + "learning_rate": 5.176732927998842e-06, + "loss": 3.0443, + "step": 19004 + }, + { + "epoch": 1.6197903349526976, + "grad_norm": 26.51315866832937, + "learning_rate": 5.176237389153246e-06, + "loss": 2.7809, + "step": 19005 + }, + { + "epoch": 1.6198755646467229, + "grad_norm": 57.866306370801716, + "learning_rate": 5.175741848574425e-06, + "loss": 2.4591, + "step": 19006 + }, + { + "epoch": 1.6199607943407484, + "grad_norm": 56.98699436543668, + "learning_rate": 5.175246306267251e-06, + "loss": 3.2679, + "step": 19007 + }, + { + "epoch": 1.6200460240347736, + "grad_norm": 52.12528082637738, + "learning_rate": 5.1747507622366e-06, + "loss": 3.0664, + "step": 19008 + }, + { + "epoch": 1.620131253728799, + "grad_norm": 37.84353940308104, + "learning_rate": 5.174255216487346e-06, + "loss": 3.2354, + "step": 19009 + }, + { + "epoch": 1.6202164834228245, + "grad_norm": 42.10356371609708, + "learning_rate": 5.173759669024361e-06, + "loss": 3.1303, + "step": 19010 + }, + { + "epoch": 1.62030171311685, + "grad_norm": 54.363904307657236, + "learning_rate": 5.173264119852517e-06, + "loss": 2.9344, + "step": 19011 + }, + { + "epoch": 1.6203869428108755, + "grad_norm": 37.39493292150705, + "learning_rate": 5.17276856897669e-06, + "loss": 2.1636, + "step": 19012 + }, + { + "epoch": 1.6204721725049007, + "grad_norm": 77.09173932257094, + "learning_rate": 5.1722730164017545e-06, + "loss": 2.7441, + "step": 19013 + }, + { + "epoch": 1.620557402198926, + "grad_norm": 42.99768894013375, + "learning_rate": 5.17177746213258e-06, + "loss": 4.1355, + "step": 19014 + }, + { + "epoch": 1.6206426318929514, + "grad_norm": 41.714416383967134, + "learning_rate": 5.1712819061740426e-06, + "loss": 3.0225, + "step": 19015 + }, + { + "epoch": 1.620727861586977, + "grad_norm": 35.75685041512139, + "learning_rate": 5.1707863485310165e-06, + "loss": 3.2617, + "step": 19016 + }, + { + "epoch": 1.6208130912810024, + "grad_norm": 56.99395680471746, + "learning_rate": 5.170290789208375e-06, + "loss": 2.7616, + "step": 19017 + }, + { + "epoch": 1.6208983209750278, + "grad_norm": 43.53304265178738, + "learning_rate": 5.169795228210991e-06, + "loss": 2.728, + "step": 19018 + }, + { + "epoch": 1.620983550669053, + "grad_norm": 28.261655439373207, + "learning_rate": 5.1692996655437364e-06, + "loss": 2.3907, + "step": 19019 + }, + { + "epoch": 1.6210687803630783, + "grad_norm": 86.26508180843297, + "learning_rate": 5.16880410121149e-06, + "loss": 2.8406, + "step": 19020 + }, + { + "epoch": 1.6211540100571038, + "grad_norm": 26.940821215418172, + "learning_rate": 5.168308535219121e-06, + "loss": 2.5549, + "step": 19021 + }, + { + "epoch": 1.6212392397511293, + "grad_norm": 68.85183113253564, + "learning_rate": 5.167812967571505e-06, + "loss": 3.3612, + "step": 19022 + }, + { + "epoch": 1.6213244694451547, + "grad_norm": 74.53494666755572, + "learning_rate": 5.167317398273512e-06, + "loss": 4.483, + "step": 19023 + }, + { + "epoch": 1.6214096991391802, + "grad_norm": 81.52015340048902, + "learning_rate": 5.166821827330023e-06, + "loss": 3.5111, + "step": 19024 + }, + { + "epoch": 1.6214949288332055, + "grad_norm": 31.396790360989606, + "learning_rate": 5.1663262547459046e-06, + "loss": 1.1877, + "step": 19025 + }, + { + "epoch": 1.621580158527231, + "grad_norm": 35.42649744385382, + "learning_rate": 5.165830680526036e-06, + "loss": 2.0621, + "step": 19026 + }, + { + "epoch": 1.6216653882212562, + "grad_norm": 33.89011702680049, + "learning_rate": 5.165335104675287e-06, + "loss": 2.63, + "step": 19027 + }, + { + "epoch": 1.6217506179152816, + "grad_norm": 44.19905681216867, + "learning_rate": 5.164839527198533e-06, + "loss": 2.8557, + "step": 19028 + }, + { + "epoch": 1.621835847609307, + "grad_norm": 56.904607541840164, + "learning_rate": 5.164343948100648e-06, + "loss": 3.6331, + "step": 19029 + }, + { + "epoch": 1.6219210773033326, + "grad_norm": 85.94061485067073, + "learning_rate": 5.163848367386507e-06, + "loss": 3.8809, + "step": 19030 + }, + { + "epoch": 1.622006306997358, + "grad_norm": 46.690920358554756, + "learning_rate": 5.16335278506098e-06, + "loss": 2.8718, + "step": 19031 + }, + { + "epoch": 1.6220915366913833, + "grad_norm": 46.51301868101942, + "learning_rate": 5.162857201128945e-06, + "loss": 2.7781, + "step": 19032 + }, + { + "epoch": 1.6221767663854085, + "grad_norm": 54.62891740197574, + "learning_rate": 5.162361615595274e-06, + "loss": 2.939, + "step": 19033 + }, + { + "epoch": 1.622261996079434, + "grad_norm": 31.61322506027153, + "learning_rate": 5.16186602846484e-06, + "loss": 1.5625, + "step": 19034 + }, + { + "epoch": 1.6223472257734595, + "grad_norm": 61.044500902248814, + "learning_rate": 5.161370439742519e-06, + "loss": 3.3351, + "step": 19035 + }, + { + "epoch": 1.622432455467485, + "grad_norm": 39.91241977437835, + "learning_rate": 5.1608748494331825e-06, + "loss": 2.9119, + "step": 19036 + }, + { + "epoch": 1.6225176851615104, + "grad_norm": 39.577133403526254, + "learning_rate": 5.1603792575417074e-06, + "loss": 2.9607, + "step": 19037 + }, + { + "epoch": 1.6226029148555356, + "grad_norm": 28.279886109572644, + "learning_rate": 5.1598836640729655e-06, + "loss": 2.4943, + "step": 19038 + }, + { + "epoch": 1.6226881445495611, + "grad_norm": 36.14763795726451, + "learning_rate": 5.15938806903183e-06, + "loss": 2.2829, + "step": 19039 + }, + { + "epoch": 1.6227733742435864, + "grad_norm": 78.11015761431491, + "learning_rate": 5.158892472423177e-06, + "loss": 2.7573, + "step": 19040 + }, + { + "epoch": 1.6228586039376118, + "grad_norm": 66.95440870434314, + "learning_rate": 5.158396874251881e-06, + "loss": 2.5544, + "step": 19041 + }, + { + "epoch": 1.6229438336316373, + "grad_norm": 33.30142584596776, + "learning_rate": 5.157901274522813e-06, + "loss": 2.3608, + "step": 19042 + }, + { + "epoch": 1.6230290633256628, + "grad_norm": 59.68657051384094, + "learning_rate": 5.157405673240849e-06, + "loss": 4.0744, + "step": 19043 + }, + { + "epoch": 1.623114293019688, + "grad_norm": 29.70351807148065, + "learning_rate": 5.1569100704108635e-06, + "loss": 2.5548, + "step": 19044 + }, + { + "epoch": 1.6231995227137135, + "grad_norm": 99.79436657301008, + "learning_rate": 5.1564144660377294e-06, + "loss": 3.4186, + "step": 19045 + }, + { + "epoch": 1.6232847524077387, + "grad_norm": 38.21446480711292, + "learning_rate": 5.155918860126321e-06, + "loss": 3.8648, + "step": 19046 + }, + { + "epoch": 1.6233699821017642, + "grad_norm": 36.6961898006193, + "learning_rate": 5.155423252681513e-06, + "loss": 3.6182, + "step": 19047 + }, + { + "epoch": 1.6234552117957897, + "grad_norm": 54.352890723715824, + "learning_rate": 5.154927643708179e-06, + "loss": 1.7876, + "step": 19048 + }, + { + "epoch": 1.6235404414898151, + "grad_norm": 46.59235425076038, + "learning_rate": 5.154432033211193e-06, + "loss": 2.4962, + "step": 19049 + }, + { + "epoch": 1.6236256711838406, + "grad_norm": 46.68958580970312, + "learning_rate": 5.153936421195429e-06, + "loss": 2.8348, + "step": 19050 + }, + { + "epoch": 1.6237109008778658, + "grad_norm": 20.45087145852533, + "learning_rate": 5.15344080766576e-06, + "loss": 2.2118, + "step": 19051 + }, + { + "epoch": 1.623796130571891, + "grad_norm": 62.103875260537166, + "learning_rate": 5.152945192627064e-06, + "loss": 2.2407, + "step": 19052 + }, + { + "epoch": 1.6238813602659166, + "grad_norm": 37.087109066811614, + "learning_rate": 5.152449576084213e-06, + "loss": 3.036, + "step": 19053 + }, + { + "epoch": 1.623966589959942, + "grad_norm": 49.26155945301014, + "learning_rate": 5.15195395804208e-06, + "loss": 3.3237, + "step": 19054 + }, + { + "epoch": 1.6240518196539675, + "grad_norm": 43.643981457098825, + "learning_rate": 5.15145833850554e-06, + "loss": 3.659, + "step": 19055 + }, + { + "epoch": 1.624137049347993, + "grad_norm": 33.368335416166595, + "learning_rate": 5.1509627174794686e-06, + "loss": 2.7507, + "step": 19056 + }, + { + "epoch": 1.6242222790420182, + "grad_norm": 68.91998068961718, + "learning_rate": 5.1504670949687375e-06, + "loss": 4.1369, + "step": 19057 + }, + { + "epoch": 1.6243075087360437, + "grad_norm": 36.75823384148621, + "learning_rate": 5.149971470978222e-06, + "loss": 1.731, + "step": 19058 + }, + { + "epoch": 1.624392738430069, + "grad_norm": 45.47398309953909, + "learning_rate": 5.149475845512798e-06, + "loss": 3.3431, + "step": 19059 + }, + { + "epoch": 1.6244779681240944, + "grad_norm": 43.96472559196875, + "learning_rate": 5.148980218577339e-06, + "loss": 3.2539, + "step": 19060 + }, + { + "epoch": 1.6245631978181199, + "grad_norm": 46.26528557911803, + "learning_rate": 5.1484845901767175e-06, + "loss": 2.2895, + "step": 19061 + }, + { + "epoch": 1.6246484275121453, + "grad_norm": 58.15669974663834, + "learning_rate": 5.147988960315808e-06, + "loss": 2.7367, + "step": 19062 + }, + { + "epoch": 1.6247336572061706, + "grad_norm": 46.10111732523068, + "learning_rate": 5.1474933289994885e-06, + "loss": 2.6731, + "step": 19063 + }, + { + "epoch": 1.624818886900196, + "grad_norm": 60.48491476211556, + "learning_rate": 5.146997696232629e-06, + "loss": 2.1658, + "step": 19064 + }, + { + "epoch": 1.6249041165942213, + "grad_norm": 48.68139908250671, + "learning_rate": 5.146502062020107e-06, + "loss": 2.9958, + "step": 19065 + }, + { + "epoch": 1.6249893462882468, + "grad_norm": 77.3476119075273, + "learning_rate": 5.146006426366793e-06, + "loss": 3.191, + "step": 19066 + }, + { + "epoch": 1.6250745759822722, + "grad_norm": 36.217296128481095, + "learning_rate": 5.145510789277566e-06, + "loss": 2.7141, + "step": 19067 + }, + { + "epoch": 1.6251598056762977, + "grad_norm": 41.70948584625008, + "learning_rate": 5.145015150757297e-06, + "loss": 2.9297, + "step": 19068 + }, + { + "epoch": 1.6252450353703232, + "grad_norm": 96.58477193679217, + "learning_rate": 5.144519510810862e-06, + "loss": 2.0227, + "step": 19069 + }, + { + "epoch": 1.6253302650643484, + "grad_norm": 49.93272091029095, + "learning_rate": 5.144023869443134e-06, + "loss": 2.943, + "step": 19070 + }, + { + "epoch": 1.6254154947583737, + "grad_norm": 74.15746458399326, + "learning_rate": 5.143528226658991e-06, + "loss": 3.1207, + "step": 19071 + }, + { + "epoch": 1.6255007244523991, + "grad_norm": 83.51483916755731, + "learning_rate": 5.143032582463303e-06, + "loss": 3.6141, + "step": 19072 + }, + { + "epoch": 1.6255859541464246, + "grad_norm": 42.91626349667804, + "learning_rate": 5.142536936860948e-06, + "loss": 2.9711, + "step": 19073 + }, + { + "epoch": 1.62567118384045, + "grad_norm": 103.11797843327037, + "learning_rate": 5.142041289856796e-06, + "loss": 5.006, + "step": 19074 + }, + { + "epoch": 1.6257564135344755, + "grad_norm": 45.878353688659985, + "learning_rate": 5.141545641455727e-06, + "loss": 2.9676, + "step": 19075 + }, + { + "epoch": 1.6258416432285008, + "grad_norm": 32.66467810404717, + "learning_rate": 5.141049991662612e-06, + "loss": 3.2903, + "step": 19076 + }, + { + "epoch": 1.6259268729225262, + "grad_norm": 54.2038123561616, + "learning_rate": 5.140554340482326e-06, + "loss": 3.3773, + "step": 19077 + }, + { + "epoch": 1.6260121026165515, + "grad_norm": 40.40586526744771, + "learning_rate": 5.140058687919745e-06, + "loss": 2.9649, + "step": 19078 + }, + { + "epoch": 1.626097332310577, + "grad_norm": 57.42959337458743, + "learning_rate": 5.139563033979741e-06, + "loss": 3.0248, + "step": 19079 + }, + { + "epoch": 1.6261825620046024, + "grad_norm": 112.43751556788646, + "learning_rate": 5.1390673786671914e-06, + "loss": 4.0259, + "step": 19080 + }, + { + "epoch": 1.626267791698628, + "grad_norm": 65.0533251491371, + "learning_rate": 5.138571721986969e-06, + "loss": 3.3953, + "step": 19081 + }, + { + "epoch": 1.6263530213926534, + "grad_norm": 47.31300087308239, + "learning_rate": 5.138076063943947e-06, + "loss": 3.406, + "step": 19082 + }, + { + "epoch": 1.6264382510866786, + "grad_norm": 44.23005750068207, + "learning_rate": 5.137580404543003e-06, + "loss": 3.2184, + "step": 19083 + }, + { + "epoch": 1.6265234807807039, + "grad_norm": 62.650743511060455, + "learning_rate": 5.137084743789013e-06, + "loss": 2.3908, + "step": 19084 + }, + { + "epoch": 1.6266087104747293, + "grad_norm": 83.70425252779022, + "learning_rate": 5.136589081686846e-06, + "loss": 2.5799, + "step": 19085 + }, + { + "epoch": 1.6266939401687548, + "grad_norm": 39.086867523070936, + "learning_rate": 5.136093418241379e-06, + "loss": 2.9931, + "step": 19086 + }, + { + "epoch": 1.6267791698627803, + "grad_norm": 88.56267411784917, + "learning_rate": 5.135597753457489e-06, + "loss": 3.847, + "step": 19087 + }, + { + "epoch": 1.6268643995568057, + "grad_norm": 73.05185472137626, + "learning_rate": 5.135102087340049e-06, + "loss": 2.9304, + "step": 19088 + }, + { + "epoch": 1.626949629250831, + "grad_norm": 98.82015130473712, + "learning_rate": 5.134606419893933e-06, + "loss": 3.0369, + "step": 19089 + }, + { + "epoch": 1.6270348589448562, + "grad_norm": 45.08355361125891, + "learning_rate": 5.134110751124016e-06, + "loss": 2.0734, + "step": 19090 + }, + { + "epoch": 1.6271200886388817, + "grad_norm": 50.85712124643483, + "learning_rate": 5.133615081035175e-06, + "loss": 2.4052, + "step": 19091 + }, + { + "epoch": 1.6272053183329072, + "grad_norm": 71.31829752951917, + "learning_rate": 5.133119409632281e-06, + "loss": 3.5319, + "step": 19092 + }, + { + "epoch": 1.6272905480269326, + "grad_norm": 25.82874114847942, + "learning_rate": 5.13262373692021e-06, + "loss": 2.8293, + "step": 19093 + }, + { + "epoch": 1.627375777720958, + "grad_norm": 45.782493871916856, + "learning_rate": 5.132128062903838e-06, + "loss": 3.5191, + "step": 19094 + }, + { + "epoch": 1.6274610074149833, + "grad_norm": 55.336609083668655, + "learning_rate": 5.13163238758804e-06, + "loss": 3.7413, + "step": 19095 + }, + { + "epoch": 1.6275462371090088, + "grad_norm": 34.097562717420764, + "learning_rate": 5.131136710977689e-06, + "loss": 2.6614, + "step": 19096 + }, + { + "epoch": 1.627631466803034, + "grad_norm": 68.79931783526129, + "learning_rate": 5.13064103307766e-06, + "loss": 3.101, + "step": 19097 + }, + { + "epoch": 1.6277166964970595, + "grad_norm": 35.45418361462608, + "learning_rate": 5.130145353892829e-06, + "loss": 2.245, + "step": 19098 + }, + { + "epoch": 1.627801926191085, + "grad_norm": 54.058210386213744, + "learning_rate": 5.129649673428071e-06, + "loss": 2.3538, + "step": 19099 + }, + { + "epoch": 1.6278871558851105, + "grad_norm": 54.00840965441633, + "learning_rate": 5.129153991688258e-06, + "loss": 2.9151, + "step": 19100 + }, + { + "epoch": 1.627972385579136, + "grad_norm": 44.3778315287534, + "learning_rate": 5.128658308678268e-06, + "loss": 3.4886, + "step": 19101 + }, + { + "epoch": 1.6280576152731612, + "grad_norm": 77.93838413501089, + "learning_rate": 5.128162624402974e-06, + "loss": 2.3374, + "step": 19102 + }, + { + "epoch": 1.6281428449671864, + "grad_norm": 30.50871232077848, + "learning_rate": 5.127666938867253e-06, + "loss": 2.774, + "step": 19103 + }, + { + "epoch": 1.6282280746612119, + "grad_norm": 34.58857520945834, + "learning_rate": 5.1271712520759764e-06, + "loss": 2.7892, + "step": 19104 + }, + { + "epoch": 1.6283133043552374, + "grad_norm": 45.731760357601146, + "learning_rate": 5.126675564034023e-06, + "loss": 3.3801, + "step": 19105 + }, + { + "epoch": 1.6283985340492628, + "grad_norm": 70.67997773151157, + "learning_rate": 5.1261798747462654e-06, + "loss": 3.13, + "step": 19106 + }, + { + "epoch": 1.6284837637432883, + "grad_norm": 24.04029038939313, + "learning_rate": 5.1256841842175775e-06, + "loss": 1.9739, + "step": 19107 + }, + { + "epoch": 1.6285689934373135, + "grad_norm": 99.19254477914527, + "learning_rate": 5.125188492452837e-06, + "loss": 3.5234, + "step": 19108 + }, + { + "epoch": 1.628654223131339, + "grad_norm": 64.26929951847828, + "learning_rate": 5.124692799456915e-06, + "loss": 2.7737, + "step": 19109 + }, + { + "epoch": 1.6287394528253643, + "grad_norm": 42.98275189946389, + "learning_rate": 5.124197105234692e-06, + "loss": 2.91, + "step": 19110 + }, + { + "epoch": 1.6288246825193897, + "grad_norm": 36.2141938839776, + "learning_rate": 5.123701409791039e-06, + "loss": 2.549, + "step": 19111 + }, + { + "epoch": 1.6289099122134152, + "grad_norm": 40.55831803458799, + "learning_rate": 5.123205713130831e-06, + "loss": 3.1264, + "step": 19112 + }, + { + "epoch": 1.6289951419074407, + "grad_norm": 25.48019455495124, + "learning_rate": 5.122710015258943e-06, + "loss": 1.9931, + "step": 19113 + }, + { + "epoch": 1.629080371601466, + "grad_norm": 37.7018366095916, + "learning_rate": 5.1222143161802525e-06, + "loss": 2.762, + "step": 19114 + }, + { + "epoch": 1.6291656012954914, + "grad_norm": 45.600250591969974, + "learning_rate": 5.121718615899631e-06, + "loss": 2.8678, + "step": 19115 + }, + { + "epoch": 1.6292508309895166, + "grad_norm": 58.6928747882856, + "learning_rate": 5.121222914421958e-06, + "loss": 3.5813, + "step": 19116 + }, + { + "epoch": 1.629336060683542, + "grad_norm": 36.336758636290426, + "learning_rate": 5.120727211752102e-06, + "loss": 2.5141, + "step": 19117 + }, + { + "epoch": 1.6294212903775676, + "grad_norm": 56.1683426127678, + "learning_rate": 5.120231507894945e-06, + "loss": 2.8666, + "step": 19118 + }, + { + "epoch": 1.629506520071593, + "grad_norm": 34.518668453660304, + "learning_rate": 5.119735802855358e-06, + "loss": 2.8634, + "step": 19119 + }, + { + "epoch": 1.6295917497656185, + "grad_norm": 42.537020044688106, + "learning_rate": 5.119240096638217e-06, + "loss": 3.2352, + "step": 19120 + }, + { + "epoch": 1.6296769794596437, + "grad_norm": 88.35586022204863, + "learning_rate": 5.118744389248396e-06, + "loss": 2.2209, + "step": 19121 + }, + { + "epoch": 1.629762209153669, + "grad_norm": 36.40523217540995, + "learning_rate": 5.118248680690773e-06, + "loss": 2.7791, + "step": 19122 + }, + { + "epoch": 1.6298474388476945, + "grad_norm": 43.63244755373267, + "learning_rate": 5.117752970970221e-06, + "loss": 2.1651, + "step": 19123 + }, + { + "epoch": 1.62993266854172, + "grad_norm": 34.62554890691027, + "learning_rate": 5.117257260091614e-06, + "loss": 3.5871, + "step": 19124 + }, + { + "epoch": 1.6300178982357454, + "grad_norm": 58.863225152319686, + "learning_rate": 5.116761548059829e-06, + "loss": 2.6678, + "step": 19125 + }, + { + "epoch": 1.6301031279297709, + "grad_norm": 78.32003983616279, + "learning_rate": 5.11626583487974e-06, + "loss": 4.0908, + "step": 19126 + }, + { + "epoch": 1.630188357623796, + "grad_norm": 50.03840958867974, + "learning_rate": 5.115770120556224e-06, + "loss": 2.8083, + "step": 19127 + }, + { + "epoch": 1.6302735873178216, + "grad_norm": 37.99589775295997, + "learning_rate": 5.115274405094154e-06, + "loss": 2.1528, + "step": 19128 + }, + { + "epoch": 1.6303588170118468, + "grad_norm": 31.588678879961613, + "learning_rate": 5.1147786884984055e-06, + "loss": 1.9639, + "step": 19129 + }, + { + "epoch": 1.6304440467058723, + "grad_norm": 68.35441517772816, + "learning_rate": 5.114282970773855e-06, + "loss": 3.0577, + "step": 19130 + }, + { + "epoch": 1.6305292763998978, + "grad_norm": 29.51876626157495, + "learning_rate": 5.113787251925377e-06, + "loss": 1.9143, + "step": 19131 + }, + { + "epoch": 1.6306145060939232, + "grad_norm": 30.99156570659834, + "learning_rate": 5.113291531957846e-06, + "loss": 2.257, + "step": 19132 + }, + { + "epoch": 1.6306997357879485, + "grad_norm": 42.95466522812378, + "learning_rate": 5.112795810876138e-06, + "loss": 2.2876, + "step": 19133 + }, + { + "epoch": 1.630784965481974, + "grad_norm": 57.54646886182547, + "learning_rate": 5.112300088685129e-06, + "loss": 2.6655, + "step": 19134 + }, + { + "epoch": 1.6308701951759992, + "grad_norm": 85.44701253029727, + "learning_rate": 5.111804365389692e-06, + "loss": 3.4019, + "step": 19135 + }, + { + "epoch": 1.6309554248700247, + "grad_norm": 54.17187781570934, + "learning_rate": 5.1113086409947035e-06, + "loss": 3.3946, + "step": 19136 + }, + { + "epoch": 1.6310406545640501, + "grad_norm": 43.92552747733114, + "learning_rate": 5.110812915505039e-06, + "loss": 2.796, + "step": 19137 + }, + { + "epoch": 1.6311258842580756, + "grad_norm": 50.33517229108639, + "learning_rate": 5.110317188925574e-06, + "loss": 3.888, + "step": 19138 + }, + { + "epoch": 1.631211113952101, + "grad_norm": 14.11867058060253, + "learning_rate": 5.109821461261183e-06, + "loss": 0.8791, + "step": 19139 + }, + { + "epoch": 1.6312963436461263, + "grad_norm": 34.22607809496363, + "learning_rate": 5.109325732516741e-06, + "loss": 2.7155, + "step": 19140 + }, + { + "epoch": 1.6313815733401515, + "grad_norm": 52.404905219501835, + "learning_rate": 5.1088300026971245e-06, + "loss": 3.5327, + "step": 19141 + }, + { + "epoch": 1.631466803034177, + "grad_norm": 76.78939893044372, + "learning_rate": 5.108334271807209e-06, + "loss": 4.2458, + "step": 19142 + }, + { + "epoch": 1.6315520327282025, + "grad_norm": 47.84266675051538, + "learning_rate": 5.107838539851867e-06, + "loss": 2.5652, + "step": 19143 + }, + { + "epoch": 1.631637262422228, + "grad_norm": 50.742401115232504, + "learning_rate": 5.1073428068359765e-06, + "loss": 2.895, + "step": 19144 + }, + { + "epoch": 1.6317224921162534, + "grad_norm": 52.751790945321545, + "learning_rate": 5.106847072764413e-06, + "loss": 2.4507, + "step": 19145 + }, + { + "epoch": 1.6318077218102787, + "grad_norm": 45.0420401577963, + "learning_rate": 5.106351337642052e-06, + "loss": 3.2837, + "step": 19146 + }, + { + "epoch": 1.6318929515043041, + "grad_norm": 34.99513130598759, + "learning_rate": 5.105855601473765e-06, + "loss": 2.5942, + "step": 19147 + }, + { + "epoch": 1.6319781811983294, + "grad_norm": 80.31589193890836, + "learning_rate": 5.1053598642644305e-06, + "loss": 2.8451, + "step": 19148 + }, + { + "epoch": 1.6320634108923548, + "grad_norm": 73.91809888330356, + "learning_rate": 5.104864126018926e-06, + "loss": 3.6302, + "step": 19149 + }, + { + "epoch": 1.6321486405863803, + "grad_norm": 53.56312674836205, + "learning_rate": 5.104368386742124e-06, + "loss": 2.6419, + "step": 19150 + }, + { + "epoch": 1.6322338702804058, + "grad_norm": 48.75084193787243, + "learning_rate": 5.1038726464389e-06, + "loss": 3.9506, + "step": 19151 + }, + { + "epoch": 1.6323190999744313, + "grad_norm": 40.34027767110348, + "learning_rate": 5.103376905114127e-06, + "loss": 2.8835, + "step": 19152 + }, + { + "epoch": 1.6324043296684565, + "grad_norm": 68.17090741700217, + "learning_rate": 5.102881162772687e-06, + "loss": 2.7593, + "step": 19153 + }, + { + "epoch": 1.6324895593624817, + "grad_norm": 60.16055832714408, + "learning_rate": 5.10238541941945e-06, + "loss": 2.9609, + "step": 19154 + }, + { + "epoch": 1.6325747890565072, + "grad_norm": 45.38359004982193, + "learning_rate": 5.101889675059294e-06, + "loss": 1.621, + "step": 19155 + }, + { + "epoch": 1.6326600187505327, + "grad_norm": 52.81307600994378, + "learning_rate": 5.1013939296970915e-06, + "loss": 2.9737, + "step": 19156 + }, + { + "epoch": 1.6327452484445582, + "grad_norm": 54.09061842363747, + "learning_rate": 5.100898183337721e-06, + "loss": 1.8423, + "step": 19157 + }, + { + "epoch": 1.6328304781385836, + "grad_norm": 73.13468555594349, + "learning_rate": 5.100402435986057e-06, + "loss": 2.8392, + "step": 19158 + }, + { + "epoch": 1.6329157078326089, + "grad_norm": 45.22247395045461, + "learning_rate": 5.099906687646976e-06, + "loss": 3.4848, + "step": 19159 + }, + { + "epoch": 1.6330009375266343, + "grad_norm": 39.65073075695034, + "learning_rate": 5.099410938325351e-06, + "loss": 3.1241, + "step": 19160 + }, + { + "epoch": 1.6330861672206596, + "grad_norm": 74.74538091575309, + "learning_rate": 5.09891518802606e-06, + "loss": 2.9835, + "step": 19161 + }, + { + "epoch": 1.633171396914685, + "grad_norm": 19.65408551699796, + "learning_rate": 5.098419436753976e-06, + "loss": 1.77, + "step": 19162 + }, + { + "epoch": 1.6332566266087105, + "grad_norm": 46.80028328481838, + "learning_rate": 5.097923684513979e-06, + "loss": 2.2373, + "step": 19163 + }, + { + "epoch": 1.633341856302736, + "grad_norm": 51.743985558066164, + "learning_rate": 5.0974279313109385e-06, + "loss": 2.5519, + "step": 19164 + }, + { + "epoch": 1.6334270859967612, + "grad_norm": 85.53248421038464, + "learning_rate": 5.096932177149734e-06, + "loss": 2.3697, + "step": 19165 + }, + { + "epoch": 1.6335123156907867, + "grad_norm": 38.190944669704, + "learning_rate": 5.09643642203524e-06, + "loss": 2.9258, + "step": 19166 + }, + { + "epoch": 1.633597545384812, + "grad_norm": 38.83913629137834, + "learning_rate": 5.095940665972332e-06, + "loss": 2.7348, + "step": 19167 + }, + { + "epoch": 1.6336827750788374, + "grad_norm": 35.88406152579796, + "learning_rate": 5.0954449089658854e-06, + "loss": 3.1456, + "step": 19168 + }, + { + "epoch": 1.6337680047728629, + "grad_norm": 87.57554683272039, + "learning_rate": 5.094949151020776e-06, + "loss": 2.6637, + "step": 19169 + }, + { + "epoch": 1.6338532344668883, + "grad_norm": 43.57169949529639, + "learning_rate": 5.094453392141882e-06, + "loss": 3.5279, + "step": 19170 + }, + { + "epoch": 1.6339384641609138, + "grad_norm": 43.5860374517632, + "learning_rate": 5.0939576323340735e-06, + "loss": 2.0328, + "step": 19171 + }, + { + "epoch": 1.634023693854939, + "grad_norm": 97.53622008620728, + "learning_rate": 5.0934618716022284e-06, + "loss": 3.0767, + "step": 19172 + }, + { + "epoch": 1.6341089235489643, + "grad_norm": 59.83763139550071, + "learning_rate": 5.092966109951224e-06, + "loss": 2.5564, + "step": 19173 + }, + { + "epoch": 1.6341941532429898, + "grad_norm": 43.783079544831686, + "learning_rate": 5.092470347385936e-06, + "loss": 2.3904, + "step": 19174 + }, + { + "epoch": 1.6342793829370152, + "grad_norm": 51.247866680822774, + "learning_rate": 5.0919745839112376e-06, + "loss": 2.8316, + "step": 19175 + }, + { + "epoch": 1.6343646126310407, + "grad_norm": 115.55550612223792, + "learning_rate": 5.091478819532007e-06, + "loss": 4.6247, + "step": 19176 + }, + { + "epoch": 1.6344498423250662, + "grad_norm": 33.746639777564255, + "learning_rate": 5.090983054253117e-06, + "loss": 2.5027, + "step": 19177 + }, + { + "epoch": 1.6345350720190914, + "grad_norm": 20.69615957144818, + "learning_rate": 5.090487288079447e-06, + "loss": 1.7305, + "step": 19178 + }, + { + "epoch": 1.634620301713117, + "grad_norm": 57.164412839057434, + "learning_rate": 5.089991521015868e-06, + "loss": 2.6442, + "step": 19179 + }, + { + "epoch": 1.6347055314071421, + "grad_norm": 36.82139275624942, + "learning_rate": 5.089495753067258e-06, + "loss": 2.0525, + "step": 19180 + }, + { + "epoch": 1.6347907611011676, + "grad_norm": 41.53281397324015, + "learning_rate": 5.0889999842384955e-06, + "loss": 3.2721, + "step": 19181 + }, + { + "epoch": 1.634875990795193, + "grad_norm": 48.660180058488955, + "learning_rate": 5.088504214534452e-06, + "loss": 2.2537, + "step": 19182 + }, + { + "epoch": 1.6349612204892185, + "grad_norm": 35.106022444560764, + "learning_rate": 5.088008443960004e-06, + "loss": 1.7802, + "step": 19183 + }, + { + "epoch": 1.6350464501832438, + "grad_norm": 40.70559114853231, + "learning_rate": 5.0875126725200285e-06, + "loss": 2.9384, + "step": 19184 + }, + { + "epoch": 1.6351316798772693, + "grad_norm": 54.651940909790945, + "learning_rate": 5.0870169002194015e-06, + "loss": 2.5595, + "step": 19185 + }, + { + "epoch": 1.6352169095712945, + "grad_norm": 73.59478477888389, + "learning_rate": 5.086521127062997e-06, + "loss": 3.5524, + "step": 19186 + }, + { + "epoch": 1.63530213926532, + "grad_norm": 45.841595128007796, + "learning_rate": 5.086025353055692e-06, + "loss": 3.4035, + "step": 19187 + }, + { + "epoch": 1.6353873689593454, + "grad_norm": 78.3489492009252, + "learning_rate": 5.085529578202361e-06, + "loss": 3.0638, + "step": 19188 + }, + { + "epoch": 1.635472598653371, + "grad_norm": 35.08889296383454, + "learning_rate": 5.085033802507882e-06, + "loss": 2.3952, + "step": 19189 + }, + { + "epoch": 1.6355578283473964, + "grad_norm": 51.344257716362605, + "learning_rate": 5.084538025977127e-06, + "loss": 2.4476, + "step": 19190 + }, + { + "epoch": 1.6356430580414216, + "grad_norm": 69.58185777210953, + "learning_rate": 5.0840422486149755e-06, + "loss": 2.7696, + "step": 19191 + }, + { + "epoch": 1.6357282877354469, + "grad_norm": 30.329049168043003, + "learning_rate": 5.0835464704263014e-06, + "loss": 2.2559, + "step": 19192 + }, + { + "epoch": 1.6358135174294723, + "grad_norm": 76.51067315020897, + "learning_rate": 5.083050691415983e-06, + "loss": 3.1319, + "step": 19193 + }, + { + "epoch": 1.6358987471234978, + "grad_norm": 80.95251025100515, + "learning_rate": 5.082554911588894e-06, + "loss": 4.3698, + "step": 19194 + }, + { + "epoch": 1.6359839768175233, + "grad_norm": 49.78478323764254, + "learning_rate": 5.0820591309499066e-06, + "loss": 2.6432, + "step": 19195 + }, + { + "epoch": 1.6360692065115487, + "grad_norm": 45.922050713066795, + "learning_rate": 5.081563349503904e-06, + "loss": 2.6448, + "step": 19196 + }, + { + "epoch": 1.636154436205574, + "grad_norm": 63.86126266447576, + "learning_rate": 5.081067567255757e-06, + "loss": 3.4898, + "step": 19197 + }, + { + "epoch": 1.6362396658995995, + "grad_norm": 38.517654961589564, + "learning_rate": 5.080571784210342e-06, + "loss": 2.6782, + "step": 19198 + }, + { + "epoch": 1.6363248955936247, + "grad_norm": 63.362545870902636, + "learning_rate": 5.0800760003725345e-06, + "loss": 2.9173, + "step": 19199 + }, + { + "epoch": 1.6364101252876502, + "grad_norm": 58.8575100066683, + "learning_rate": 5.079580215747214e-06, + "loss": 3.1386, + "step": 19200 + }, + { + "epoch": 1.6364953549816756, + "grad_norm": 65.12967505264952, + "learning_rate": 5.079084430339252e-06, + "loss": 3.415, + "step": 19201 + }, + { + "epoch": 1.6365805846757011, + "grad_norm": 64.19822393343708, + "learning_rate": 5.078588644153527e-06, + "loss": 2.4847, + "step": 19202 + }, + { + "epoch": 1.6366658143697264, + "grad_norm": 44.31927922018756, + "learning_rate": 5.078092857194913e-06, + "loss": 2.2314, + "step": 19203 + }, + { + "epoch": 1.6367510440637518, + "grad_norm": 50.757783925661506, + "learning_rate": 5.077597069468288e-06, + "loss": 2.9693, + "step": 19204 + }, + { + "epoch": 1.636836273757777, + "grad_norm": 51.989362257443204, + "learning_rate": 5.077101280978525e-06, + "loss": 2.2865, + "step": 19205 + }, + { + "epoch": 1.6369215034518025, + "grad_norm": 37.90826487593196, + "learning_rate": 5.0766054917305044e-06, + "loss": 3.2254, + "step": 19206 + }, + { + "epoch": 1.637006733145828, + "grad_norm": 67.83208908744201, + "learning_rate": 5.076109701729095e-06, + "loss": 1.935, + "step": 19207 + }, + { + "epoch": 1.6370919628398535, + "grad_norm": 51.17749965385478, + "learning_rate": 5.07561391097918e-06, + "loss": 2.4965, + "step": 19208 + }, + { + "epoch": 1.637177192533879, + "grad_norm": 128.541494831362, + "learning_rate": 5.075118119485633e-06, + "loss": 1.8587, + "step": 19209 + }, + { + "epoch": 1.6372624222279042, + "grad_norm": 41.05114334585112, + "learning_rate": 5.074622327253327e-06, + "loss": 2.8568, + "step": 19210 + }, + { + "epoch": 1.6373476519219294, + "grad_norm": 65.46943655983642, + "learning_rate": 5.07412653428714e-06, + "loss": 2.1429, + "step": 19211 + }, + { + "epoch": 1.637432881615955, + "grad_norm": 31.45327208311908, + "learning_rate": 5.073630740591948e-06, + "loss": 1.8598, + "step": 19212 + }, + { + "epoch": 1.6375181113099804, + "grad_norm": 48.60164785470311, + "learning_rate": 5.073134946172629e-06, + "loss": 1.9849, + "step": 19213 + }, + { + "epoch": 1.6376033410040058, + "grad_norm": 142.6421499355585, + "learning_rate": 5.072639151034055e-06, + "loss": 2.3096, + "step": 19214 + }, + { + "epoch": 1.6376885706980313, + "grad_norm": 45.78280462932332, + "learning_rate": 5.072143355181104e-06, + "loss": 3.5504, + "step": 19215 + }, + { + "epoch": 1.6377738003920566, + "grad_norm": 70.13570717271139, + "learning_rate": 5.071647558618652e-06, + "loss": 2.7562, + "step": 19216 + }, + { + "epoch": 1.637859030086082, + "grad_norm": 53.0221046819922, + "learning_rate": 5.071151761351575e-06, + "loss": 2.8974, + "step": 19217 + }, + { + "epoch": 1.6379442597801073, + "grad_norm": 54.07627824621815, + "learning_rate": 5.070655963384749e-06, + "loss": 3.4839, + "step": 19218 + }, + { + "epoch": 1.6380294894741327, + "grad_norm": 48.99502636591741, + "learning_rate": 5.070160164723048e-06, + "loss": 3.1937, + "step": 19219 + }, + { + "epoch": 1.6381147191681582, + "grad_norm": 32.72246816193826, + "learning_rate": 5.06966436537135e-06, + "loss": 2.471, + "step": 19220 + }, + { + "epoch": 1.6381999488621837, + "grad_norm": 95.1965026095941, + "learning_rate": 5.069168565334532e-06, + "loss": 3.2585, + "step": 19221 + }, + { + "epoch": 1.6382851785562091, + "grad_norm": 127.70833923661978, + "learning_rate": 5.0686727646174685e-06, + "loss": 2.7306, + "step": 19222 + }, + { + "epoch": 1.6383704082502344, + "grad_norm": 41.83270381909635, + "learning_rate": 5.068176963225036e-06, + "loss": 2.7822, + "step": 19223 + }, + { + "epoch": 1.6384556379442596, + "grad_norm": 53.62792603227171, + "learning_rate": 5.067681161162109e-06, + "loss": 2.2461, + "step": 19224 + }, + { + "epoch": 1.638540867638285, + "grad_norm": 66.0603949067537, + "learning_rate": 5.067185358433565e-06, + "loss": 2.8244, + "step": 19225 + }, + { + "epoch": 1.6386260973323106, + "grad_norm": 35.60832296825451, + "learning_rate": 5.06668955504428e-06, + "loss": 3.1201, + "step": 19226 + }, + { + "epoch": 1.638711327026336, + "grad_norm": 40.617773700918306, + "learning_rate": 5.066193750999129e-06, + "loss": 3.034, + "step": 19227 + }, + { + "epoch": 1.6387965567203615, + "grad_norm": 77.35333101716685, + "learning_rate": 5.06569794630299e-06, + "loss": 2.7497, + "step": 19228 + }, + { + "epoch": 1.6388817864143868, + "grad_norm": 291.9172673526146, + "learning_rate": 5.065202140960737e-06, + "loss": 3.7175, + "step": 19229 + }, + { + "epoch": 1.6389670161084122, + "grad_norm": 37.39088651189213, + "learning_rate": 5.064706334977246e-06, + "loss": 2.8805, + "step": 19230 + }, + { + "epoch": 1.6390522458024375, + "grad_norm": 47.3697648245399, + "learning_rate": 5.0642105283573935e-06, + "loss": 2.0694, + "step": 19231 + }, + { + "epoch": 1.639137475496463, + "grad_norm": 79.20860809391492, + "learning_rate": 5.0637147211060585e-06, + "loss": 3.2202, + "step": 19232 + }, + { + "epoch": 1.6392227051904884, + "grad_norm": 86.96500877369088, + "learning_rate": 5.063218913228114e-06, + "loss": 3.4447, + "step": 19233 + }, + { + "epoch": 1.6393079348845139, + "grad_norm": 66.9722307934162, + "learning_rate": 5.062723104728435e-06, + "loss": 2.8099, + "step": 19234 + }, + { + "epoch": 1.6393931645785391, + "grad_norm": 42.630059655524384, + "learning_rate": 5.0622272956119e-06, + "loss": 2.9138, + "step": 19235 + }, + { + "epoch": 1.6394783942725646, + "grad_norm": 26.086151820139147, + "learning_rate": 5.061731485883386e-06, + "loss": 2.8131, + "step": 19236 + }, + { + "epoch": 1.6395636239665898, + "grad_norm": 49.88861346038933, + "learning_rate": 5.061235675547764e-06, + "loss": 2.7063, + "step": 19237 + }, + { + "epoch": 1.6396488536606153, + "grad_norm": 37.34630516342108, + "learning_rate": 5.060739864609916e-06, + "loss": 2.6937, + "step": 19238 + }, + { + "epoch": 1.6397340833546408, + "grad_norm": 48.32559267404655, + "learning_rate": 5.060244053074716e-06, + "loss": 2.7094, + "step": 19239 + }, + { + "epoch": 1.6398193130486662, + "grad_norm": 33.44146582635496, + "learning_rate": 5.0597482409470376e-06, + "loss": 2.4742, + "step": 19240 + }, + { + "epoch": 1.6399045427426917, + "grad_norm": 69.52723337335753, + "learning_rate": 5.05925242823176e-06, + "loss": 2.5583, + "step": 19241 + }, + { + "epoch": 1.639989772436717, + "grad_norm": 52.172990080349486, + "learning_rate": 5.058756614933757e-06, + "loss": 2.1608, + "step": 19242 + }, + { + "epoch": 1.6400750021307422, + "grad_norm": 34.454840638717265, + "learning_rate": 5.058260801057908e-06, + "loss": 3.0468, + "step": 19243 + }, + { + "epoch": 1.6401602318247677, + "grad_norm": 51.6039566058258, + "learning_rate": 5.057764986609086e-06, + "loss": 2.4137, + "step": 19244 + }, + { + "epoch": 1.6402454615187931, + "grad_norm": 76.88327605495044, + "learning_rate": 5.05726917159217e-06, + "loss": 1.6774, + "step": 19245 + }, + { + "epoch": 1.6403306912128186, + "grad_norm": 27.20275643592124, + "learning_rate": 5.056773356012031e-06, + "loss": 2.1351, + "step": 19246 + }, + { + "epoch": 1.640415920906844, + "grad_norm": 107.96340721595575, + "learning_rate": 5.056277539873552e-06, + "loss": 4.0327, + "step": 19247 + }, + { + "epoch": 1.6405011506008693, + "grad_norm": 67.30614823036171, + "learning_rate": 5.0557817231816045e-06, + "loss": 4.113, + "step": 19248 + }, + { + "epoch": 1.6405863802948948, + "grad_norm": 152.13143929893315, + "learning_rate": 5.055285905941066e-06, + "loss": 3.3333, + "step": 19249 + }, + { + "epoch": 1.64067160998892, + "grad_norm": 37.43661924995931, + "learning_rate": 5.054790088156811e-06, + "loss": 2.7093, + "step": 19250 + }, + { + "epoch": 1.6407568396829455, + "grad_norm": 75.81127104043246, + "learning_rate": 5.05429426983372e-06, + "loss": 3.5591, + "step": 19251 + }, + { + "epoch": 1.640842069376971, + "grad_norm": 60.698996422542194, + "learning_rate": 5.053798450976666e-06, + "loss": 3.9093, + "step": 19252 + }, + { + "epoch": 1.6409272990709964, + "grad_norm": 43.64537548225728, + "learning_rate": 5.0533026315905246e-06, + "loss": 2.1967, + "step": 19253 + }, + { + "epoch": 1.6410125287650217, + "grad_norm": 76.85028939015366, + "learning_rate": 5.052806811680173e-06, + "loss": 3.2058, + "step": 19254 + }, + { + "epoch": 1.6410977584590472, + "grad_norm": 43.35741384686164, + "learning_rate": 5.052310991250488e-06, + "loss": 3.3204, + "step": 19255 + }, + { + "epoch": 1.6411829881530724, + "grad_norm": 73.96913803811297, + "learning_rate": 5.051815170306344e-06, + "loss": 3.5298, + "step": 19256 + }, + { + "epoch": 1.6412682178470979, + "grad_norm": 74.52570814562975, + "learning_rate": 5.051319348852619e-06, + "loss": 3.636, + "step": 19257 + }, + { + "epoch": 1.6413534475411233, + "grad_norm": 31.269741399404026, + "learning_rate": 5.050823526894188e-06, + "loss": 1.976, + "step": 19258 + }, + { + "epoch": 1.6414386772351488, + "grad_norm": 74.51978312350543, + "learning_rate": 5.050327704435929e-06, + "loss": 3.3591, + "step": 19259 + }, + { + "epoch": 1.6415239069291743, + "grad_norm": 57.671052052093835, + "learning_rate": 5.0498318814827176e-06, + "loss": 2.5793, + "step": 19260 + }, + { + "epoch": 1.6416091366231995, + "grad_norm": 49.18660726137205, + "learning_rate": 5.049336058039428e-06, + "loss": 2.3919, + "step": 19261 + }, + { + "epoch": 1.6416943663172248, + "grad_norm": 153.56588773809173, + "learning_rate": 5.0488402341109375e-06, + "loss": 4.1777, + "step": 19262 + }, + { + "epoch": 1.6417795960112502, + "grad_norm": 36.85788394027394, + "learning_rate": 5.048344409702124e-06, + "loss": 3.6951, + "step": 19263 + }, + { + "epoch": 1.6418648257052757, + "grad_norm": 53.80261063174326, + "learning_rate": 5.047848584817863e-06, + "loss": 3.169, + "step": 19264 + }, + { + "epoch": 1.6419500553993012, + "grad_norm": 41.535512565743495, + "learning_rate": 5.047352759463029e-06, + "loss": 2.9356, + "step": 19265 + }, + { + "epoch": 1.6420352850933266, + "grad_norm": 123.53858239008393, + "learning_rate": 5.0468569336424985e-06, + "loss": 2.4616, + "step": 19266 + }, + { + "epoch": 1.6421205147873519, + "grad_norm": 37.01431441154002, + "learning_rate": 5.04636110736115e-06, + "loss": 2.9267, + "step": 19267 + }, + { + "epoch": 1.6422057444813773, + "grad_norm": 34.49861490589574, + "learning_rate": 5.045865280623858e-06, + "loss": 2.4126, + "step": 19268 + }, + { + "epoch": 1.6422909741754026, + "grad_norm": 41.16419977647988, + "learning_rate": 5.0453694534354995e-06, + "loss": 3.2567, + "step": 19269 + }, + { + "epoch": 1.642376203869428, + "grad_norm": 65.62181689904041, + "learning_rate": 5.04487362580095e-06, + "loss": 2.9023, + "step": 19270 + }, + { + "epoch": 1.6424614335634535, + "grad_norm": 37.153556248385975, + "learning_rate": 5.0443777977250875e-06, + "loss": 3.1288, + "step": 19271 + }, + { + "epoch": 1.642546663257479, + "grad_norm": 43.487892727811776, + "learning_rate": 5.043881969212786e-06, + "loss": 2.7772, + "step": 19272 + }, + { + "epoch": 1.6426318929515045, + "grad_norm": 46.672047965226085, + "learning_rate": 5.043386140268922e-06, + "loss": 1.6664, + "step": 19273 + }, + { + "epoch": 1.6427171226455297, + "grad_norm": 54.75833248718828, + "learning_rate": 5.042890310898373e-06, + "loss": 3.3033, + "step": 19274 + }, + { + "epoch": 1.642802352339555, + "grad_norm": 56.91998301194335, + "learning_rate": 5.042394481106016e-06, + "loss": 2.4825, + "step": 19275 + }, + { + "epoch": 1.6428875820335804, + "grad_norm": 54.10594815282476, + "learning_rate": 5.041898650896725e-06, + "loss": 2.8383, + "step": 19276 + }, + { + "epoch": 1.642972811727606, + "grad_norm": 89.78687698448613, + "learning_rate": 5.041402820275377e-06, + "loss": 3.4208, + "step": 19277 + }, + { + "epoch": 1.6430580414216314, + "grad_norm": 19.437493917914658, + "learning_rate": 5.04090698924685e-06, + "loss": 1.4641, + "step": 19278 + }, + { + "epoch": 1.6431432711156568, + "grad_norm": 31.761661544819297, + "learning_rate": 5.040411157816019e-06, + "loss": 1.8283, + "step": 19279 + }, + { + "epoch": 1.643228500809682, + "grad_norm": 52.32652531227458, + "learning_rate": 5.03991532598776e-06, + "loss": 3.7399, + "step": 19280 + }, + { + "epoch": 1.6433137305037073, + "grad_norm": 44.12205886085598, + "learning_rate": 5.039419493766948e-06, + "loss": 3.1906, + "step": 19281 + }, + { + "epoch": 1.6433989601977328, + "grad_norm": 48.05081272216895, + "learning_rate": 5.038923661158463e-06, + "loss": 2.6648, + "step": 19282 + }, + { + "epoch": 1.6434841898917583, + "grad_norm": 46.92912004625948, + "learning_rate": 5.038427828167179e-06, + "loss": 4.4831, + "step": 19283 + }, + { + "epoch": 1.6435694195857837, + "grad_norm": 29.15845233767615, + "learning_rate": 5.037931994797972e-06, + "loss": 1.6864, + "step": 19284 + }, + { + "epoch": 1.6436546492798092, + "grad_norm": 45.326445813788034, + "learning_rate": 5.037436161055719e-06, + "loss": 2.1078, + "step": 19285 + }, + { + "epoch": 1.6437398789738344, + "grad_norm": 73.85650541445777, + "learning_rate": 5.036940326945296e-06, + "loss": 2.4304, + "step": 19286 + }, + { + "epoch": 1.64382510866786, + "grad_norm": 36.040623277969615, + "learning_rate": 5.03644449247158e-06, + "loss": 2.8084, + "step": 19287 + }, + { + "epoch": 1.6439103383618852, + "grad_norm": 42.79669556661117, + "learning_rate": 5.035948657639448e-06, + "loss": 2.7488, + "step": 19288 + }, + { + "epoch": 1.6439955680559106, + "grad_norm": 66.10812623182092, + "learning_rate": 5.035452822453772e-06, + "loss": 3.4114, + "step": 19289 + }, + { + "epoch": 1.644080797749936, + "grad_norm": 54.96644457548555, + "learning_rate": 5.0349569869194345e-06, + "loss": 3.0875, + "step": 19290 + }, + { + "epoch": 1.6441660274439616, + "grad_norm": 48.36856241496583, + "learning_rate": 5.034461151041309e-06, + "loss": 2.1575, + "step": 19291 + }, + { + "epoch": 1.644251257137987, + "grad_norm": 46.58263400182298, + "learning_rate": 5.03396531482427e-06, + "loss": 2.1304, + "step": 19292 + }, + { + "epoch": 1.6443364868320123, + "grad_norm": 38.07604343007074, + "learning_rate": 5.033469478273197e-06, + "loss": 2.8563, + "step": 19293 + }, + { + "epoch": 1.6444217165260375, + "grad_norm": 37.09126363124448, + "learning_rate": 5.032973641392964e-06, + "loss": 3.0846, + "step": 19294 + }, + { + "epoch": 1.644506946220063, + "grad_norm": 68.79231266612214, + "learning_rate": 5.0324778041884505e-06, + "loss": 3.1598, + "step": 19295 + }, + { + "epoch": 1.6445921759140885, + "grad_norm": 62.364595724119795, + "learning_rate": 5.031981966664527e-06, + "loss": 3.4334, + "step": 19296 + }, + { + "epoch": 1.644677405608114, + "grad_norm": 77.30598092201163, + "learning_rate": 5.031486128826076e-06, + "loss": 3.3961, + "step": 19297 + }, + { + "epoch": 1.6447626353021394, + "grad_norm": 95.84661229568029, + "learning_rate": 5.030990290677971e-06, + "loss": 2.8526, + "step": 19298 + }, + { + "epoch": 1.6448478649961646, + "grad_norm": 69.23422732883648, + "learning_rate": 5.03049445222509e-06, + "loss": 2.8525, + "step": 19299 + }, + { + "epoch": 1.6449330946901901, + "grad_norm": 79.97653481527826, + "learning_rate": 5.029998613472306e-06, + "loss": 5.1469, + "step": 19300 + }, + { + "epoch": 1.6450183243842154, + "grad_norm": 82.78966687441833, + "learning_rate": 5.029502774424498e-06, + "loss": 2.6948, + "step": 19301 + }, + { + "epoch": 1.6451035540782408, + "grad_norm": 34.13602732942823, + "learning_rate": 5.029006935086542e-06, + "loss": 2.8982, + "step": 19302 + }, + { + "epoch": 1.6451887837722663, + "grad_norm": 57.46457194812145, + "learning_rate": 5.0285110954633155e-06, + "loss": 3.5547, + "step": 19303 + }, + { + "epoch": 1.6452740134662918, + "grad_norm": 33.66594543693968, + "learning_rate": 5.028015255559693e-06, + "loss": 2.1489, + "step": 19304 + }, + { + "epoch": 1.645359243160317, + "grad_norm": 47.594584091241856, + "learning_rate": 5.027519415380552e-06, + "loss": 1.7281, + "step": 19305 + }, + { + "epoch": 1.6454444728543425, + "grad_norm": 39.77816073859647, + "learning_rate": 5.027023574930767e-06, + "loss": 3.6018, + "step": 19306 + }, + { + "epoch": 1.6455297025483677, + "grad_norm": 63.498080368680846, + "learning_rate": 5.026527734215218e-06, + "loss": 2.715, + "step": 19307 + }, + { + "epoch": 1.6456149322423932, + "grad_norm": 50.17036147717706, + "learning_rate": 5.026031893238777e-06, + "loss": 2.9198, + "step": 19308 + }, + { + "epoch": 1.6457001619364187, + "grad_norm": 39.17231051370479, + "learning_rate": 5.025536052006324e-06, + "loss": 2.4014, + "step": 19309 + }, + { + "epoch": 1.6457853916304441, + "grad_norm": 77.21531704589809, + "learning_rate": 5.025040210522735e-06, + "loss": 3.6355, + "step": 19310 + }, + { + "epoch": 1.6458706213244696, + "grad_norm": 38.70534824540925, + "learning_rate": 5.024544368792884e-06, + "loss": 2.719, + "step": 19311 + }, + { + "epoch": 1.6459558510184948, + "grad_norm": 42.74072965734522, + "learning_rate": 5.024048526821649e-06, + "loss": 2.3562, + "step": 19312 + }, + { + "epoch": 1.64604108071252, + "grad_norm": 62.70656237009095, + "learning_rate": 5.023552684613906e-06, + "loss": 2.6786, + "step": 19313 + }, + { + "epoch": 1.6461263104065456, + "grad_norm": 38.65934717334896, + "learning_rate": 5.023056842174533e-06, + "loss": 2.8051, + "step": 19314 + }, + { + "epoch": 1.646211540100571, + "grad_norm": 155.37088057143444, + "learning_rate": 5.022560999508405e-06, + "loss": 3.5661, + "step": 19315 + }, + { + "epoch": 1.6462967697945965, + "grad_norm": 74.22721651373107, + "learning_rate": 5.022065156620397e-06, + "loss": 2.847, + "step": 19316 + }, + { + "epoch": 1.646381999488622, + "grad_norm": 67.4060137895981, + "learning_rate": 5.021569313515388e-06, + "loss": 3.0105, + "step": 19317 + }, + { + "epoch": 1.6464672291826472, + "grad_norm": 46.74818231091766, + "learning_rate": 5.021073470198255e-06, + "loss": 2.6448, + "step": 19318 + }, + { + "epoch": 1.6465524588766727, + "grad_norm": 30.646833424554245, + "learning_rate": 5.0205776266738695e-06, + "loss": 2.2614, + "step": 19319 + }, + { + "epoch": 1.646637688570698, + "grad_norm": 40.628443736071176, + "learning_rate": 5.020081782947113e-06, + "loss": 2.826, + "step": 19320 + }, + { + "epoch": 1.6467229182647234, + "grad_norm": 68.15269775901204, + "learning_rate": 5.01958593902286e-06, + "loss": 3.2258, + "step": 19321 + }, + { + "epoch": 1.6468081479587489, + "grad_norm": 75.20557752883315, + "learning_rate": 5.019090094905987e-06, + "loss": 3.1836, + "step": 19322 + }, + { + "epoch": 1.6468933776527743, + "grad_norm": 46.74464869802552, + "learning_rate": 5.01859425060137e-06, + "loss": 2.5187, + "step": 19323 + }, + { + "epoch": 1.6469786073467996, + "grad_norm": 42.04194482329408, + "learning_rate": 5.018098406113886e-06, + "loss": 2.9985, + "step": 19324 + }, + { + "epoch": 1.647063837040825, + "grad_norm": 23.06083823949737, + "learning_rate": 5.017602561448413e-06, + "loss": 1.9624, + "step": 19325 + }, + { + "epoch": 1.6471490667348503, + "grad_norm": 33.52409770209725, + "learning_rate": 5.017106716609824e-06, + "loss": 2.8173, + "step": 19326 + }, + { + "epoch": 1.6472342964288758, + "grad_norm": 25.490916133258185, + "learning_rate": 5.016610871602998e-06, + "loss": 2.1522, + "step": 19327 + }, + { + "epoch": 1.6473195261229012, + "grad_norm": 30.453346840981357, + "learning_rate": 5.01611502643281e-06, + "loss": 2.5527, + "step": 19328 + }, + { + "epoch": 1.6474047558169267, + "grad_norm": 147.60532967416785, + "learning_rate": 5.015619181104139e-06, + "loss": 3.6305, + "step": 19329 + }, + { + "epoch": 1.6474899855109522, + "grad_norm": 73.44301802747934, + "learning_rate": 5.015123335621858e-06, + "loss": 3.1704, + "step": 19330 + }, + { + "epoch": 1.6475752152049774, + "grad_norm": 92.39629352130022, + "learning_rate": 5.014627489990844e-06, + "loss": 4.0227, + "step": 19331 + }, + { + "epoch": 1.6476604448990027, + "grad_norm": 64.66858229040079, + "learning_rate": 5.014131644215976e-06, + "loss": 3.1807, + "step": 19332 + }, + { + "epoch": 1.6477456745930281, + "grad_norm": 84.87589777220926, + "learning_rate": 5.013635798302129e-06, + "loss": 2.8897, + "step": 19333 + }, + { + "epoch": 1.6478309042870536, + "grad_norm": 79.31975054229873, + "learning_rate": 5.013139952254179e-06, + "loss": 3.6308, + "step": 19334 + }, + { + "epoch": 1.647916133981079, + "grad_norm": 30.242486937071455, + "learning_rate": 5.012644106077003e-06, + "loss": 1.9031, + "step": 19335 + }, + { + "epoch": 1.6480013636751045, + "grad_norm": 107.23273671333507, + "learning_rate": 5.012148259775475e-06, + "loss": 2.6776, + "step": 19336 + }, + { + "epoch": 1.6480865933691298, + "grad_norm": 39.61105751808089, + "learning_rate": 5.011652413354477e-06, + "loss": 2.7956, + "step": 19337 + }, + { + "epoch": 1.6481718230631552, + "grad_norm": 34.79036003420287, + "learning_rate": 5.011156566818881e-06, + "loss": 3.5278, + "step": 19338 + }, + { + "epoch": 1.6482570527571805, + "grad_norm": 40.48215802576651, + "learning_rate": 5.010660720173565e-06, + "loss": 2.7435, + "step": 19339 + }, + { + "epoch": 1.648342282451206, + "grad_norm": 88.70718551468974, + "learning_rate": 5.0101648734234045e-06, + "loss": 2.9648, + "step": 19340 + }, + { + "epoch": 1.6484275121452314, + "grad_norm": 23.487256388616057, + "learning_rate": 5.009669026573275e-06, + "loss": 2.1177, + "step": 19341 + }, + { + "epoch": 1.648512741839257, + "grad_norm": 55.14602959348242, + "learning_rate": 5.009173179628058e-06, + "loss": 2.5222, + "step": 19342 + }, + { + "epoch": 1.6485979715332824, + "grad_norm": 77.19238739617856, + "learning_rate": 5.008677332592623e-06, + "loss": 3.433, + "step": 19343 + }, + { + "epoch": 1.6486832012273076, + "grad_norm": 62.90262652300232, + "learning_rate": 5.008181485471851e-06, + "loss": 3.3683, + "step": 19344 + }, + { + "epoch": 1.6487684309213329, + "grad_norm": 75.24750576376198, + "learning_rate": 5.007685638270617e-06, + "loss": 3.9673, + "step": 19345 + }, + { + "epoch": 1.6488536606153583, + "grad_norm": 51.40327104014162, + "learning_rate": 5.0071897909938e-06, + "loss": 2.4154, + "step": 19346 + }, + { + "epoch": 1.6489388903093838, + "grad_norm": 53.44347494010153, + "learning_rate": 5.006693943646271e-06, + "loss": 2.1534, + "step": 19347 + }, + { + "epoch": 1.6490241200034093, + "grad_norm": 54.575568220588565, + "learning_rate": 5.0061980962329115e-06, + "loss": 2.8388, + "step": 19348 + }, + { + "epoch": 1.6491093496974347, + "grad_norm": 64.45958877486434, + "learning_rate": 5.005702248758596e-06, + "loss": 2.521, + "step": 19349 + }, + { + "epoch": 1.64919457939146, + "grad_norm": 76.03094104043315, + "learning_rate": 5.005206401228202e-06, + "loss": 2.9229, + "step": 19350 + }, + { + "epoch": 1.6492798090854854, + "grad_norm": 46.00680034442722, + "learning_rate": 5.004710553646605e-06, + "loss": 3.2826, + "step": 19351 + }, + { + "epoch": 1.6493650387795107, + "grad_norm": 53.46358244813143, + "learning_rate": 5.00421470601868e-06, + "loss": 2.0774, + "step": 19352 + }, + { + "epoch": 1.6494502684735362, + "grad_norm": 54.49295304749847, + "learning_rate": 5.003718858349306e-06, + "loss": 2.8696, + "step": 19353 + }, + { + "epoch": 1.6495354981675616, + "grad_norm": 71.45851828686692, + "learning_rate": 5.003223010643359e-06, + "loss": 3.1502, + "step": 19354 + }, + { + "epoch": 1.649620727861587, + "grad_norm": 62.898249189675994, + "learning_rate": 5.002727162905714e-06, + "loss": 2.4082, + "step": 19355 + }, + { + "epoch": 1.6497059575556123, + "grad_norm": 47.848505312462386, + "learning_rate": 5.002231315141248e-06, + "loss": 1.2477, + "step": 19356 + }, + { + "epoch": 1.6497911872496378, + "grad_norm": 40.98496861347841, + "learning_rate": 5.001735467354839e-06, + "loss": 1.823, + "step": 19357 + }, + { + "epoch": 1.649876416943663, + "grad_norm": 38.81872423626096, + "learning_rate": 5.001239619551362e-06, + "loss": 2.7286, + "step": 19358 + }, + { + "epoch": 1.6499616466376885, + "grad_norm": 33.50066328174879, + "learning_rate": 5.000743771735695e-06, + "loss": 2.7727, + "step": 19359 + }, + { + "epoch": 1.650046876331714, + "grad_norm": 116.32659898217722, + "learning_rate": 5.000247923912711e-06, + "loss": 2.9321, + "step": 19360 + }, + { + "epoch": 1.6501321060257395, + "grad_norm": 64.0837540123055, + "learning_rate": 4.999752076087291e-06, + "loss": 3.5921, + "step": 19361 + }, + { + "epoch": 1.650217335719765, + "grad_norm": 29.17781737632071, + "learning_rate": 4.999256228264308e-06, + "loss": 2.0635, + "step": 19362 + }, + { + "epoch": 1.6503025654137902, + "grad_norm": 56.723703519290346, + "learning_rate": 4.998760380448639e-06, + "loss": 3.0412, + "step": 19363 + }, + { + "epoch": 1.6503877951078154, + "grad_norm": 55.385705128060714, + "learning_rate": 4.998264532645162e-06, + "loss": 2.2638, + "step": 19364 + }, + { + "epoch": 1.6504730248018409, + "grad_norm": 49.68644938911799, + "learning_rate": 4.9977686848587535e-06, + "loss": 3.1103, + "step": 19365 + }, + { + "epoch": 1.6505582544958664, + "grad_norm": 26.789593940417458, + "learning_rate": 4.997272837094288e-06, + "loss": 2.4347, + "step": 19366 + }, + { + "epoch": 1.6506434841898918, + "grad_norm": 48.79105226417806, + "learning_rate": 4.996776989356643e-06, + "loss": 2.2601, + "step": 19367 + }, + { + "epoch": 1.6507287138839173, + "grad_norm": 49.617294391426746, + "learning_rate": 4.996281141650695e-06, + "loss": 3.5817, + "step": 19368 + }, + { + "epoch": 1.6508139435779425, + "grad_norm": 15.911050502427653, + "learning_rate": 4.995785293981322e-06, + "loss": 1.4092, + "step": 19369 + }, + { + "epoch": 1.650899173271968, + "grad_norm": 35.02378332864866, + "learning_rate": 4.995289446353398e-06, + "loss": 2.9884, + "step": 19370 + }, + { + "epoch": 1.6509844029659932, + "grad_norm": 44.134092280131085, + "learning_rate": 4.9947935987718e-06, + "loss": 2.1781, + "step": 19371 + }, + { + "epoch": 1.6510696326600187, + "grad_norm": 50.496258942154995, + "learning_rate": 4.994297751241404e-06, + "loss": 2.294, + "step": 19372 + }, + { + "epoch": 1.6511548623540442, + "grad_norm": 35.45222107043942, + "learning_rate": 4.993801903767089e-06, + "loss": 3.3048, + "step": 19373 + }, + { + "epoch": 1.6512400920480697, + "grad_norm": 33.4889047098037, + "learning_rate": 4.9933060563537295e-06, + "loss": 2.764, + "step": 19374 + }, + { + "epoch": 1.651325321742095, + "grad_norm": 35.212340040998214, + "learning_rate": 4.992810209006203e-06, + "loss": 2.7604, + "step": 19375 + }, + { + "epoch": 1.6514105514361204, + "grad_norm": 58.789384873426684, + "learning_rate": 4.992314361729383e-06, + "loss": 3.051, + "step": 19376 + }, + { + "epoch": 1.6514957811301456, + "grad_norm": 62.82405457785531, + "learning_rate": 4.99181851452815e-06, + "loss": 2.162, + "step": 19377 + }, + { + "epoch": 1.651581010824171, + "grad_norm": 65.92981462191608, + "learning_rate": 4.991322667407378e-06, + "loss": 2.4005, + "step": 19378 + }, + { + "epoch": 1.6516662405181965, + "grad_norm": 34.06602264112898, + "learning_rate": 4.990826820371945e-06, + "loss": 2.6905, + "step": 19379 + }, + { + "epoch": 1.651751470212222, + "grad_norm": 87.06987561272456, + "learning_rate": 4.990330973426725e-06, + "loss": 1.7493, + "step": 19380 + }, + { + "epoch": 1.6518366999062475, + "grad_norm": 55.38827211091591, + "learning_rate": 4.989835126576598e-06, + "loss": 3.6336, + "step": 19381 + }, + { + "epoch": 1.6519219296002727, + "grad_norm": 40.24024108928873, + "learning_rate": 4.989339279826437e-06, + "loss": 2.9266, + "step": 19382 + }, + { + "epoch": 1.652007159294298, + "grad_norm": 72.72461129041734, + "learning_rate": 4.988843433181121e-06, + "loss": 2.4741, + "step": 19383 + }, + { + "epoch": 1.6520923889883234, + "grad_norm": 194.29069309999758, + "learning_rate": 4.988347586645525e-06, + "loss": 3.4681, + "step": 19384 + }, + { + "epoch": 1.652177618682349, + "grad_norm": 34.51869330192934, + "learning_rate": 4.987851740224525e-06, + "loss": 2.3902, + "step": 19385 + }, + { + "epoch": 1.6522628483763744, + "grad_norm": 81.29414674671577, + "learning_rate": 4.987355893922999e-06, + "loss": 2.4864, + "step": 19386 + }, + { + "epoch": 1.6523480780703999, + "grad_norm": 38.60840769501013, + "learning_rate": 4.986860047745823e-06, + "loss": 3.1475, + "step": 19387 + }, + { + "epoch": 1.652433307764425, + "grad_norm": 38.63777020410073, + "learning_rate": 4.986364201697873e-06, + "loss": 2.7376, + "step": 19388 + }, + { + "epoch": 1.6525185374584506, + "grad_norm": 35.31286176419416, + "learning_rate": 4.985868355784024e-06, + "loss": 2.7329, + "step": 19389 + }, + { + "epoch": 1.6526037671524758, + "grad_norm": 39.937095622873905, + "learning_rate": 4.9853725100091575e-06, + "loss": 2.7039, + "step": 19390 + }, + { + "epoch": 1.6526889968465013, + "grad_norm": 40.203864586257154, + "learning_rate": 4.984876664378144e-06, + "loss": 3.2484, + "step": 19391 + }, + { + "epoch": 1.6527742265405267, + "grad_norm": 44.5110709435727, + "learning_rate": 4.984380818895864e-06, + "loss": 2.4146, + "step": 19392 + }, + { + "epoch": 1.6528594562345522, + "grad_norm": 69.30637133563204, + "learning_rate": 4.9838849735671905e-06, + "loss": 3.149, + "step": 19393 + }, + { + "epoch": 1.6529446859285775, + "grad_norm": 42.839541690514004, + "learning_rate": 4.9833891283970035e-06, + "loss": 3.1705, + "step": 19394 + }, + { + "epoch": 1.653029915622603, + "grad_norm": 66.05716198443247, + "learning_rate": 4.982893283390177e-06, + "loss": 3.2865, + "step": 19395 + }, + { + "epoch": 1.6531151453166282, + "grad_norm": 78.32609110425281, + "learning_rate": 4.98239743855159e-06, + "loss": 2.4952, + "step": 19396 + }, + { + "epoch": 1.6532003750106536, + "grad_norm": 50.829545293707724, + "learning_rate": 4.981901593886114e-06, + "loss": 2.8095, + "step": 19397 + }, + { + "epoch": 1.6532856047046791, + "grad_norm": 40.52570916288655, + "learning_rate": 4.981405749398631e-06, + "loss": 2.8799, + "step": 19398 + }, + { + "epoch": 1.6533708343987046, + "grad_norm": 77.41962595080729, + "learning_rate": 4.980909905094015e-06, + "loss": 3.1039, + "step": 19399 + }, + { + "epoch": 1.65345606409273, + "grad_norm": 19.307671265703327, + "learning_rate": 4.980414060977143e-06, + "loss": 1.0749, + "step": 19400 + }, + { + "epoch": 1.6535412937867553, + "grad_norm": 37.225794291101735, + "learning_rate": 4.979918217052889e-06, + "loss": 2.8205, + "step": 19401 + }, + { + "epoch": 1.6536265234807805, + "grad_norm": 99.33392260869061, + "learning_rate": 4.979422373326131e-06, + "loss": 3.4778, + "step": 19402 + }, + { + "epoch": 1.653711753174806, + "grad_norm": 27.797162111658437, + "learning_rate": 4.978926529801748e-06, + "loss": 1.9031, + "step": 19403 + }, + { + "epoch": 1.6537969828688315, + "grad_norm": 25.276472387955216, + "learning_rate": 4.978430686484614e-06, + "loss": 1.3008, + "step": 19404 + }, + { + "epoch": 1.653882212562857, + "grad_norm": 59.57411947976392, + "learning_rate": 4.977934843379604e-06, + "loss": 3.0907, + "step": 19405 + }, + { + "epoch": 1.6539674422568824, + "grad_norm": 79.80966714051532, + "learning_rate": 4.977439000491597e-06, + "loss": 2.717, + "step": 19406 + }, + { + "epoch": 1.6540526719509077, + "grad_norm": 38.80814630772229, + "learning_rate": 4.976943157825468e-06, + "loss": 3.3324, + "step": 19407 + }, + { + "epoch": 1.6541379016449331, + "grad_norm": 28.87040751357037, + "learning_rate": 4.9764473153860955e-06, + "loss": 3.1097, + "step": 19408 + }, + { + "epoch": 1.6542231313389584, + "grad_norm": 26.06685733952591, + "learning_rate": 4.975951473178353e-06, + "loss": 2.7231, + "step": 19409 + }, + { + "epoch": 1.6543083610329838, + "grad_norm": 66.4653435816989, + "learning_rate": 4.975455631207117e-06, + "loss": 3.1336, + "step": 19410 + }, + { + "epoch": 1.6543935907270093, + "grad_norm": 28.49784193606611, + "learning_rate": 4.974959789477267e-06, + "loss": 2.1074, + "step": 19411 + }, + { + "epoch": 1.6544788204210348, + "grad_norm": 52.06485146992336, + "learning_rate": 4.974463947993677e-06, + "loss": 2.7457, + "step": 19412 + }, + { + "epoch": 1.6545640501150602, + "grad_norm": 37.45686581732069, + "learning_rate": 4.973968106761224e-06, + "loss": 3.3404, + "step": 19413 + }, + { + "epoch": 1.6546492798090855, + "grad_norm": 70.42538509918825, + "learning_rate": 4.973472265784785e-06, + "loss": 3.5148, + "step": 19414 + }, + { + "epoch": 1.6547345095031107, + "grad_norm": 49.03272964568006, + "learning_rate": 4.972976425069233e-06, + "loss": 2.0166, + "step": 19415 + }, + { + "epoch": 1.6548197391971362, + "grad_norm": 102.38776142590808, + "learning_rate": 4.9724805846194506e-06, + "loss": 3.3523, + "step": 19416 + }, + { + "epoch": 1.6549049688911617, + "grad_norm": 26.850835429204977, + "learning_rate": 4.971984744440309e-06, + "loss": 0.9323, + "step": 19417 + }, + { + "epoch": 1.6549901985851871, + "grad_norm": 33.77808842987797, + "learning_rate": 4.971488904536686e-06, + "loss": 2.123, + "step": 19418 + }, + { + "epoch": 1.6550754282792126, + "grad_norm": 69.92984417585107, + "learning_rate": 4.970993064913458e-06, + "loss": 3.3405, + "step": 19419 + }, + { + "epoch": 1.6551606579732379, + "grad_norm": 41.824979947554375, + "learning_rate": 4.970497225575503e-06, + "loss": 2.4742, + "step": 19420 + }, + { + "epoch": 1.6552458876672633, + "grad_norm": 47.23466462948367, + "learning_rate": 4.970001386527695e-06, + "loss": 3.8576, + "step": 19421 + }, + { + "epoch": 1.6553311173612886, + "grad_norm": 36.90786909085898, + "learning_rate": 4.9695055477749135e-06, + "loss": 3.1245, + "step": 19422 + }, + { + "epoch": 1.655416347055314, + "grad_norm": 76.84957569236174, + "learning_rate": 4.96900970932203e-06, + "loss": 3.2681, + "step": 19423 + }, + { + "epoch": 1.6555015767493395, + "grad_norm": 33.20483032269249, + "learning_rate": 4.968513871173925e-06, + "loss": 2.4457, + "step": 19424 + }, + { + "epoch": 1.655586806443365, + "grad_norm": 269.03449054329667, + "learning_rate": 4.9680180333354735e-06, + "loss": 2.975, + "step": 19425 + }, + { + "epoch": 1.6556720361373902, + "grad_norm": 43.385085762897255, + "learning_rate": 4.967522195811553e-06, + "loss": 3.3816, + "step": 19426 + }, + { + "epoch": 1.6557572658314157, + "grad_norm": 64.37340544048885, + "learning_rate": 4.967026358607038e-06, + "loss": 3.0019, + "step": 19427 + }, + { + "epoch": 1.655842495525441, + "grad_norm": 36.832987050225015, + "learning_rate": 4.966530521726804e-06, + "loss": 2.9298, + "step": 19428 + }, + { + "epoch": 1.6559277252194664, + "grad_norm": 38.968730880126934, + "learning_rate": 4.9660346851757304e-06, + "loss": 2.3739, + "step": 19429 + }, + { + "epoch": 1.6560129549134919, + "grad_norm": 59.90289655465402, + "learning_rate": 4.965538848958693e-06, + "loss": 3.8386, + "step": 19430 + }, + { + "epoch": 1.6560981846075173, + "grad_norm": 55.93571909114423, + "learning_rate": 4.965043013080567e-06, + "loss": 3.3884, + "step": 19431 + }, + { + "epoch": 1.6561834143015428, + "grad_norm": 92.66882719105114, + "learning_rate": 4.964547177546227e-06, + "loss": 4.7756, + "step": 19432 + }, + { + "epoch": 1.656268643995568, + "grad_norm": 22.621123109257997, + "learning_rate": 4.964051342360553e-06, + "loss": 1.8839, + "step": 19433 + }, + { + "epoch": 1.6563538736895933, + "grad_norm": 103.41820964041386, + "learning_rate": 4.9635555075284205e-06, + "loss": 4.4166, + "step": 19434 + }, + { + "epoch": 1.6564391033836188, + "grad_norm": 34.372940024833646, + "learning_rate": 4.963059673054705e-06, + "loss": 2.8335, + "step": 19435 + }, + { + "epoch": 1.6565243330776442, + "grad_norm": 57.07196686532735, + "learning_rate": 4.962563838944282e-06, + "loss": 3.5034, + "step": 19436 + }, + { + "epoch": 1.6566095627716697, + "grad_norm": 27.852047679058327, + "learning_rate": 4.962068005202029e-06, + "loss": 2.9466, + "step": 19437 + }, + { + "epoch": 1.6566947924656952, + "grad_norm": 39.55293551668253, + "learning_rate": 4.961572171832822e-06, + "loss": 2.0689, + "step": 19438 + }, + { + "epoch": 1.6567800221597204, + "grad_norm": 81.17606454346291, + "learning_rate": 4.961076338841538e-06, + "loss": 2.9489, + "step": 19439 + }, + { + "epoch": 1.656865251853746, + "grad_norm": 68.2565539623258, + "learning_rate": 4.960580506233052e-06, + "loss": 2.228, + "step": 19440 + }, + { + "epoch": 1.6569504815477711, + "grad_norm": 37.653398914581274, + "learning_rate": 4.960084674012242e-06, + "loss": 3.0254, + "step": 19441 + }, + { + "epoch": 1.6570357112417966, + "grad_norm": 47.76172804343153, + "learning_rate": 4.959588842183982e-06, + "loss": 2.4679, + "step": 19442 + }, + { + "epoch": 1.657120940935822, + "grad_norm": 72.56786487541873, + "learning_rate": 4.9590930107531516e-06, + "loss": 3.0213, + "step": 19443 + }, + { + "epoch": 1.6572061706298475, + "grad_norm": 40.69619573315599, + "learning_rate": 4.9585971797246234e-06, + "loss": 3.2609, + "step": 19444 + }, + { + "epoch": 1.6572914003238728, + "grad_norm": 78.016780281153, + "learning_rate": 4.958101349103276e-06, + "loss": 3.1318, + "step": 19445 + }, + { + "epoch": 1.6573766300178983, + "grad_norm": 47.707322185743045, + "learning_rate": 4.957605518893985e-06, + "loss": 3.3461, + "step": 19446 + }, + { + "epoch": 1.6574618597119235, + "grad_norm": 45.64751430133607, + "learning_rate": 4.957109689101628e-06, + "loss": 3.2516, + "step": 19447 + }, + { + "epoch": 1.657547089405949, + "grad_norm": 70.87639233188173, + "learning_rate": 4.956613859731079e-06, + "loss": 3.704, + "step": 19448 + }, + { + "epoch": 1.6576323190999744, + "grad_norm": 80.67609611918503, + "learning_rate": 4.956118030787216e-06, + "loss": 4.0957, + "step": 19449 + }, + { + "epoch": 1.657717548794, + "grad_norm": 34.685118823481695, + "learning_rate": 4.955622202274914e-06, + "loss": 3.6338, + "step": 19450 + }, + { + "epoch": 1.6578027784880254, + "grad_norm": 40.61586590626316, + "learning_rate": 4.9551263741990514e-06, + "loss": 2.6423, + "step": 19451 + }, + { + "epoch": 1.6578880081820506, + "grad_norm": 70.0400311021935, + "learning_rate": 4.954630546564501e-06, + "loss": 3.3792, + "step": 19452 + }, + { + "epoch": 1.6579732378760759, + "grad_norm": 58.52537465221441, + "learning_rate": 4.954134719376143e-06, + "loss": 2.6474, + "step": 19453 + }, + { + "epoch": 1.6580584675701013, + "grad_norm": 28.020178282208775, + "learning_rate": 4.95363889263885e-06, + "loss": 1.7294, + "step": 19454 + }, + { + "epoch": 1.6581436972641268, + "grad_norm": 52.492102079099226, + "learning_rate": 4.953143066357502e-06, + "loss": 2.2934, + "step": 19455 + }, + { + "epoch": 1.6582289269581523, + "grad_norm": 61.276884957789065, + "learning_rate": 4.952647240536973e-06, + "loss": 2.5744, + "step": 19456 + }, + { + "epoch": 1.6583141566521777, + "grad_norm": 63.46638052826569, + "learning_rate": 4.9521514151821394e-06, + "loss": 2.6315, + "step": 19457 + }, + { + "epoch": 1.658399386346203, + "grad_norm": 30.836009874009246, + "learning_rate": 4.9516555902978765e-06, + "loss": 2.2353, + "step": 19458 + }, + { + "epoch": 1.6584846160402285, + "grad_norm": 55.25057460787288, + "learning_rate": 4.951159765889063e-06, + "loss": 3.6485, + "step": 19459 + }, + { + "epoch": 1.6585698457342537, + "grad_norm": 66.21925809297713, + "learning_rate": 4.950663941960574e-06, + "loss": 3.4188, + "step": 19460 + }, + { + "epoch": 1.6586550754282792, + "grad_norm": 48.7625055786206, + "learning_rate": 4.950168118517285e-06, + "loss": 2.3868, + "step": 19461 + }, + { + "epoch": 1.6587403051223046, + "grad_norm": 28.01790223756447, + "learning_rate": 4.949672295564072e-06, + "loss": 2.3588, + "step": 19462 + }, + { + "epoch": 1.65882553481633, + "grad_norm": 33.74926158856425, + "learning_rate": 4.949176473105813e-06, + "loss": 2.583, + "step": 19463 + }, + { + "epoch": 1.6589107645103556, + "grad_norm": 55.54229183247289, + "learning_rate": 4.948680651147382e-06, + "loss": 2.9435, + "step": 19464 + }, + { + "epoch": 1.6589959942043808, + "grad_norm": 25.719333198193542, + "learning_rate": 4.948184829693658e-06, + "loss": 2.2116, + "step": 19465 + }, + { + "epoch": 1.659081223898406, + "grad_norm": 47.65901284128178, + "learning_rate": 4.947689008749513e-06, + "loss": 2.7929, + "step": 19466 + }, + { + "epoch": 1.6591664535924315, + "grad_norm": 44.609331643371405, + "learning_rate": 4.947193188319829e-06, + "loss": 2.1327, + "step": 19467 + }, + { + "epoch": 1.659251683286457, + "grad_norm": 95.74111208446206, + "learning_rate": 4.946697368409477e-06, + "loss": 2.9307, + "step": 19468 + }, + { + "epoch": 1.6593369129804825, + "grad_norm": 69.39527957356553, + "learning_rate": 4.946201549023336e-06, + "loss": 2.545, + "step": 19469 + }, + { + "epoch": 1.659422142674508, + "grad_norm": 126.69219100638769, + "learning_rate": 4.94570573016628e-06, + "loss": 3.2514, + "step": 19470 + }, + { + "epoch": 1.6595073723685332, + "grad_norm": 61.325229943096254, + "learning_rate": 4.945209911843189e-06, + "loss": 2.1375, + "step": 19471 + }, + { + "epoch": 1.6595926020625584, + "grad_norm": 73.66507091184387, + "learning_rate": 4.944714094058935e-06, + "loss": 3.2185, + "step": 19472 + }, + { + "epoch": 1.659677831756584, + "grad_norm": 93.14074631596118, + "learning_rate": 4.944218276818397e-06, + "loss": 2.7603, + "step": 19473 + }, + { + "epoch": 1.6597630614506094, + "grad_norm": 31.63711395494214, + "learning_rate": 4.9437224601264505e-06, + "loss": 1.6846, + "step": 19474 + }, + { + "epoch": 1.6598482911446348, + "grad_norm": 54.75047461314517, + "learning_rate": 4.943226643987969e-06, + "loss": 3.6919, + "step": 19475 + }, + { + "epoch": 1.6599335208386603, + "grad_norm": 94.70583655006135, + "learning_rate": 4.942730828407832e-06, + "loss": 3.7504, + "step": 19476 + }, + { + "epoch": 1.6600187505326855, + "grad_norm": 47.137986657304936, + "learning_rate": 4.942235013390916e-06, + "loss": 3.0658, + "step": 19477 + }, + { + "epoch": 1.660103980226711, + "grad_norm": 95.5538956896955, + "learning_rate": 4.941739198942094e-06, + "loss": 3.262, + "step": 19478 + }, + { + "epoch": 1.6601892099207363, + "grad_norm": 37.45416083660446, + "learning_rate": 4.9412433850662434e-06, + "loss": 1.65, + "step": 19479 + }, + { + "epoch": 1.6602744396147617, + "grad_norm": 84.79696144276306, + "learning_rate": 4.940747571768241e-06, + "loss": 2.9649, + "step": 19480 + }, + { + "epoch": 1.6603596693087872, + "grad_norm": 65.69026853364635, + "learning_rate": 4.940251759052964e-06, + "loss": 2.1349, + "step": 19481 + }, + { + "epoch": 1.6604448990028127, + "grad_norm": 46.570147356613994, + "learning_rate": 4.9397559469252875e-06, + "loss": 2.9587, + "step": 19482 + }, + { + "epoch": 1.6605301286968381, + "grad_norm": 52.48749628601362, + "learning_rate": 4.939260135390085e-06, + "loss": 2.1979, + "step": 19483 + }, + { + "epoch": 1.6606153583908634, + "grad_norm": 34.145944739632505, + "learning_rate": 4.9387643244522375e-06, + "loss": 2.2417, + "step": 19484 + }, + { + "epoch": 1.6607005880848886, + "grad_norm": 50.288960813695425, + "learning_rate": 4.938268514116617e-06, + "loss": 2.5236, + "step": 19485 + }, + { + "epoch": 1.660785817778914, + "grad_norm": 44.9586292982124, + "learning_rate": 4.937772704388102e-06, + "loss": 3.3779, + "step": 19486 + }, + { + "epoch": 1.6608710474729396, + "grad_norm": 68.2179599846016, + "learning_rate": 4.937276895271566e-06, + "loss": 2.4203, + "step": 19487 + }, + { + "epoch": 1.660956277166965, + "grad_norm": 47.402348611851814, + "learning_rate": 4.936781086771888e-06, + "loss": 2.5416, + "step": 19488 + }, + { + "epoch": 1.6610415068609905, + "grad_norm": 42.1567429472563, + "learning_rate": 4.936285278893942e-06, + "loss": 2.6852, + "step": 19489 + }, + { + "epoch": 1.6611267365550157, + "grad_norm": 36.906081857620784, + "learning_rate": 4.935789471642607e-06, + "loss": 2.3781, + "step": 19490 + }, + { + "epoch": 1.6612119662490412, + "grad_norm": 67.49021462360868, + "learning_rate": 4.935293665022756e-06, + "loss": 2.908, + "step": 19491 + }, + { + "epoch": 1.6612971959430665, + "grad_norm": 42.5402824388552, + "learning_rate": 4.934797859039266e-06, + "loss": 2.9574, + "step": 19492 + }, + { + "epoch": 1.661382425637092, + "grad_norm": 109.92293873004448, + "learning_rate": 4.9343020536970125e-06, + "loss": 3.9834, + "step": 19493 + }, + { + "epoch": 1.6614676553311174, + "grad_norm": 38.289945716577876, + "learning_rate": 4.9338062490008735e-06, + "loss": 3.0793, + "step": 19494 + }, + { + "epoch": 1.6615528850251429, + "grad_norm": 42.81693987901954, + "learning_rate": 4.9333104449557226e-06, + "loss": 3.2396, + "step": 19495 + }, + { + "epoch": 1.6616381147191681, + "grad_norm": 62.56068527190473, + "learning_rate": 4.932814641566436e-06, + "loss": 2.1671, + "step": 19496 + }, + { + "epoch": 1.6617233444131936, + "grad_norm": 23.21871969957008, + "learning_rate": 4.932318838837892e-06, + "loss": 1.8159, + "step": 19497 + }, + { + "epoch": 1.6618085741072188, + "grad_norm": 62.092982957472316, + "learning_rate": 4.931823036774967e-06, + "loss": 2.2515, + "step": 19498 + }, + { + "epoch": 1.6618938038012443, + "grad_norm": 40.259519478864235, + "learning_rate": 4.931327235382533e-06, + "loss": 2.5641, + "step": 19499 + }, + { + "epoch": 1.6619790334952698, + "grad_norm": 36.27594367752492, + "learning_rate": 4.9308314346654695e-06, + "loss": 2.6571, + "step": 19500 + }, + { + "epoch": 1.6620642631892952, + "grad_norm": 32.2633447546223, + "learning_rate": 4.9303356346286496e-06, + "loss": 2.0413, + "step": 19501 + }, + { + "epoch": 1.6621494928833207, + "grad_norm": 35.9475276140696, + "learning_rate": 4.929839835276953e-06, + "loss": 2.3882, + "step": 19502 + }, + { + "epoch": 1.662234722577346, + "grad_norm": 22.731171997350838, + "learning_rate": 4.929344036615253e-06, + "loss": 1.6459, + "step": 19503 + }, + { + "epoch": 1.6623199522713712, + "grad_norm": 60.62387624878796, + "learning_rate": 4.9288482386484274e-06, + "loss": 3.0694, + "step": 19504 + }, + { + "epoch": 1.6624051819653967, + "grad_norm": 73.2891077076128, + "learning_rate": 4.9283524413813485e-06, + "loss": 2.6961, + "step": 19505 + }, + { + "epoch": 1.6624904116594221, + "grad_norm": 65.88373939699501, + "learning_rate": 4.927856644818898e-06, + "loss": 3.1304, + "step": 19506 + }, + { + "epoch": 1.6625756413534476, + "grad_norm": 99.36567075139176, + "learning_rate": 4.927360848965946e-06, + "loss": 3.9114, + "step": 19507 + }, + { + "epoch": 1.662660871047473, + "grad_norm": 67.8597423445224, + "learning_rate": 4.926865053827374e-06, + "loss": 3.0887, + "step": 19508 + }, + { + "epoch": 1.6627461007414983, + "grad_norm": 27.17719445485878, + "learning_rate": 4.926369259408052e-06, + "loss": 2.2452, + "step": 19509 + }, + { + "epoch": 1.6628313304355238, + "grad_norm": 92.84195924936238, + "learning_rate": 4.9258734657128616e-06, + "loss": 3.309, + "step": 19510 + }, + { + "epoch": 1.662916560129549, + "grad_norm": 42.90994427960817, + "learning_rate": 4.9253776727466746e-06, + "loss": 2.9403, + "step": 19511 + }, + { + "epoch": 1.6630017898235745, + "grad_norm": 49.70420507272102, + "learning_rate": 4.92488188051437e-06, + "loss": 2.5115, + "step": 19512 + }, + { + "epoch": 1.6630870195176, + "grad_norm": 36.228058193604355, + "learning_rate": 4.92438608902082e-06, + "loss": 1.6892, + "step": 19513 + }, + { + "epoch": 1.6631722492116254, + "grad_norm": 40.556509175844326, + "learning_rate": 4.9238902982709054e-06, + "loss": 2.7122, + "step": 19514 + }, + { + "epoch": 1.6632574789056507, + "grad_norm": 33.53748815760942, + "learning_rate": 4.923394508269498e-06, + "loss": 2.4513, + "step": 19515 + }, + { + "epoch": 1.6633427085996761, + "grad_norm": 112.19370178539408, + "learning_rate": 4.922898719021476e-06, + "loss": 3.0317, + "step": 19516 + }, + { + "epoch": 1.6634279382937014, + "grad_norm": 41.87284383296851, + "learning_rate": 4.922402930531715e-06, + "loss": 3.1823, + "step": 19517 + }, + { + "epoch": 1.6635131679877269, + "grad_norm": 62.50958096859166, + "learning_rate": 4.921907142805088e-06, + "loss": 3.3312, + "step": 19518 + }, + { + "epoch": 1.6635983976817523, + "grad_norm": 62.75810161759082, + "learning_rate": 4.921411355846474e-06, + "loss": 3.1037, + "step": 19519 + }, + { + "epoch": 1.6636836273757778, + "grad_norm": 42.9140441123179, + "learning_rate": 4.920915569660749e-06, + "loss": 2.5194, + "step": 19520 + }, + { + "epoch": 1.6637688570698033, + "grad_norm": 61.13214560090548, + "learning_rate": 4.920419784252789e-06, + "loss": 2.2228, + "step": 19521 + }, + { + "epoch": 1.6638540867638285, + "grad_norm": 35.75368264815509, + "learning_rate": 4.9199239996274655e-06, + "loss": 3.1778, + "step": 19522 + }, + { + "epoch": 1.6639393164578538, + "grad_norm": 108.16827720472065, + "learning_rate": 4.919428215789659e-06, + "loss": 3.1167, + "step": 19523 + }, + { + "epoch": 1.6640245461518792, + "grad_norm": 76.63967576269874, + "learning_rate": 4.918932432744246e-06, + "loss": 3.5006, + "step": 19524 + }, + { + "epoch": 1.6641097758459047, + "grad_norm": 78.11893157473365, + "learning_rate": 4.918436650496099e-06, + "loss": 3.4817, + "step": 19525 + }, + { + "epoch": 1.6641950055399302, + "grad_norm": 39.23027019698521, + "learning_rate": 4.917940869050093e-06, + "loss": 3.0676, + "step": 19526 + }, + { + "epoch": 1.6642802352339556, + "grad_norm": 25.327184227430482, + "learning_rate": 4.917445088411108e-06, + "loss": 1.7976, + "step": 19527 + }, + { + "epoch": 1.6643654649279809, + "grad_norm": 160.67243123522752, + "learning_rate": 4.916949308584018e-06, + "loss": 3.2079, + "step": 19528 + }, + { + "epoch": 1.6644506946220063, + "grad_norm": 34.291223022842544, + "learning_rate": 4.916453529573699e-06, + "loss": 2.1233, + "step": 19529 + }, + { + "epoch": 1.6645359243160316, + "grad_norm": 51.53745374301175, + "learning_rate": 4.915957751385025e-06, + "loss": 1.7422, + "step": 19530 + }, + { + "epoch": 1.664621154010057, + "grad_norm": 27.228459540345536, + "learning_rate": 4.9154619740228725e-06, + "loss": 2.3688, + "step": 19531 + }, + { + "epoch": 1.6647063837040825, + "grad_norm": 47.213834953295994, + "learning_rate": 4.91496619749212e-06, + "loss": 2.6773, + "step": 19532 + }, + { + "epoch": 1.664791613398108, + "grad_norm": 41.634365359697796, + "learning_rate": 4.914470421797641e-06, + "loss": 2.8213, + "step": 19533 + }, + { + "epoch": 1.6648768430921335, + "grad_norm": 47.700108371339326, + "learning_rate": 4.91397464694431e-06, + "loss": 2.8062, + "step": 19534 + }, + { + "epoch": 1.6649620727861587, + "grad_norm": 31.20824949256291, + "learning_rate": 4.913478872937005e-06, + "loss": 2.5349, + "step": 19535 + }, + { + "epoch": 1.665047302480184, + "grad_norm": 72.2264828211598, + "learning_rate": 4.9129830997806e-06, + "loss": 2.5605, + "step": 19536 + }, + { + "epoch": 1.6651325321742094, + "grad_norm": 83.53972692361897, + "learning_rate": 4.912487327479973e-06, + "loss": 2.7324, + "step": 19537 + }, + { + "epoch": 1.665217761868235, + "grad_norm": 53.721400130138505, + "learning_rate": 4.911991556039997e-06, + "loss": 2.5054, + "step": 19538 + }, + { + "epoch": 1.6653029915622604, + "grad_norm": 54.05155945607395, + "learning_rate": 4.91149578546555e-06, + "loss": 2.7594, + "step": 19539 + }, + { + "epoch": 1.6653882212562858, + "grad_norm": 30.660270727136243, + "learning_rate": 4.911000015761506e-06, + "loss": 3.1831, + "step": 19540 + }, + { + "epoch": 1.665473450950311, + "grad_norm": 37.8933667577889, + "learning_rate": 4.910504246932743e-06, + "loss": 2.4038, + "step": 19541 + }, + { + "epoch": 1.6655586806443365, + "grad_norm": 35.44476416718834, + "learning_rate": 4.910008478984134e-06, + "loss": 2.3274, + "step": 19542 + }, + { + "epoch": 1.6656439103383618, + "grad_norm": 31.819084515569763, + "learning_rate": 4.909512711920556e-06, + "loss": 1.4272, + "step": 19543 + }, + { + "epoch": 1.6657291400323873, + "grad_norm": 82.4455154546074, + "learning_rate": 4.909016945746883e-06, + "loss": 3.0777, + "step": 19544 + }, + { + "epoch": 1.6658143697264127, + "grad_norm": 68.23891393147066, + "learning_rate": 4.908521180467995e-06, + "loss": 2.3052, + "step": 19545 + }, + { + "epoch": 1.6658995994204382, + "grad_norm": 70.96170494292359, + "learning_rate": 4.908025416088763e-06, + "loss": 2.5269, + "step": 19546 + }, + { + "epoch": 1.6659848291144634, + "grad_norm": 35.54988205150106, + "learning_rate": 4.907529652614066e-06, + "loss": 2.0941, + "step": 19547 + }, + { + "epoch": 1.666070058808489, + "grad_norm": 47.35182299562181, + "learning_rate": 4.907033890048776e-06, + "loss": 2.9117, + "step": 19548 + }, + { + "epoch": 1.6661552885025142, + "grad_norm": 131.7038930027512, + "learning_rate": 4.906538128397772e-06, + "loss": 2.1541, + "step": 19549 + }, + { + "epoch": 1.6662405181965396, + "grad_norm": 58.39955996091494, + "learning_rate": 4.906042367665928e-06, + "loss": 3.6748, + "step": 19550 + }, + { + "epoch": 1.666325747890565, + "grad_norm": 50.16319904665865, + "learning_rate": 4.9055466078581215e-06, + "loss": 2.9559, + "step": 19551 + }, + { + "epoch": 1.6664109775845906, + "grad_norm": 52.97777768366683, + "learning_rate": 4.905050848979224e-06, + "loss": 2.1567, + "step": 19552 + }, + { + "epoch": 1.666496207278616, + "grad_norm": 44.13548826346402, + "learning_rate": 4.904555091034115e-06, + "loss": 2.391, + "step": 19553 + }, + { + "epoch": 1.6665814369726413, + "grad_norm": 36.627549806042985, + "learning_rate": 4.9040593340276695e-06, + "loss": 2.5195, + "step": 19554 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 32.23641005381781, + "learning_rate": 4.903563577964762e-06, + "loss": 2.6905, + "step": 19555 + }, + { + "epoch": 1.666751896360692, + "grad_norm": 38.95294769288822, + "learning_rate": 4.903067822850266e-06, + "loss": 2.9569, + "step": 19556 + }, + { + "epoch": 1.6668371260547175, + "grad_norm": 90.75835200814193, + "learning_rate": 4.902572068689062e-06, + "loss": 2.792, + "step": 19557 + }, + { + "epoch": 1.666922355748743, + "grad_norm": 44.32290765124504, + "learning_rate": 4.902076315486023e-06, + "loss": 1.719, + "step": 19558 + }, + { + "epoch": 1.6670075854427684, + "grad_norm": 80.34877216127336, + "learning_rate": 4.901580563246025e-06, + "loss": 2.8204, + "step": 19559 + }, + { + "epoch": 1.6670928151367936, + "grad_norm": 37.4026090212605, + "learning_rate": 4.901084811973943e-06, + "loss": 3.6667, + "step": 19560 + }, + { + "epoch": 1.667178044830819, + "grad_norm": 43.84049145233016, + "learning_rate": 4.900589061674649e-06, + "loss": 3.7693, + "step": 19561 + }, + { + "epoch": 1.6672632745248444, + "grad_norm": 19.297268353738282, + "learning_rate": 4.900093312353025e-06, + "loss": 1.8423, + "step": 19562 + }, + { + "epoch": 1.6673485042188698, + "grad_norm": 42.99396291906129, + "learning_rate": 4.8995975640139436e-06, + "loss": 3.0874, + "step": 19563 + }, + { + "epoch": 1.6674337339128953, + "grad_norm": 80.44611427146248, + "learning_rate": 4.899101816662281e-06, + "loss": 3.3697, + "step": 19564 + }, + { + "epoch": 1.6675189636069208, + "grad_norm": 81.6446840374153, + "learning_rate": 4.898606070302909e-06, + "loss": 4.0152, + "step": 19565 + }, + { + "epoch": 1.667604193300946, + "grad_norm": 49.027874533881615, + "learning_rate": 4.898110324940708e-06, + "loss": 3.7913, + "step": 19566 + }, + { + "epoch": 1.6676894229949715, + "grad_norm": 49.63652364727223, + "learning_rate": 4.897614580580552e-06, + "loss": 2.4647, + "step": 19567 + }, + { + "epoch": 1.6677746526889967, + "grad_norm": 68.29261276229221, + "learning_rate": 4.897118837227316e-06, + "loss": 2.2376, + "step": 19568 + }, + { + "epoch": 1.6678598823830222, + "grad_norm": 61.92203602507151, + "learning_rate": 4.896623094885873e-06, + "loss": 2.8792, + "step": 19569 + }, + { + "epoch": 1.6679451120770477, + "grad_norm": 67.26208547219379, + "learning_rate": 4.896127353561102e-06, + "loss": 3.2327, + "step": 19570 + }, + { + "epoch": 1.6680303417710731, + "grad_norm": 40.58784374485269, + "learning_rate": 4.895631613257879e-06, + "loss": 2.1879, + "step": 19571 + }, + { + "epoch": 1.6681155714650986, + "grad_norm": 45.15097151760452, + "learning_rate": 4.895135873981076e-06, + "loss": 2.785, + "step": 19572 + }, + { + "epoch": 1.6682008011591238, + "grad_norm": 26.83669203074333, + "learning_rate": 4.89464013573557e-06, + "loss": 2.1098, + "step": 19573 + }, + { + "epoch": 1.668286030853149, + "grad_norm": 52.17462195566368, + "learning_rate": 4.894144398526235e-06, + "loss": 2.8536, + "step": 19574 + }, + { + "epoch": 1.6683712605471746, + "grad_norm": 40.04461791926449, + "learning_rate": 4.893648662357951e-06, + "loss": 3.1395, + "step": 19575 + }, + { + "epoch": 1.6684564902412, + "grad_norm": 58.551945524541466, + "learning_rate": 4.893152927235589e-06, + "loss": 3.6313, + "step": 19576 + }, + { + "epoch": 1.6685417199352255, + "grad_norm": 36.4510121005392, + "learning_rate": 4.892657193164024e-06, + "loss": 2.3269, + "step": 19577 + }, + { + "epoch": 1.668626949629251, + "grad_norm": 65.93561843362642, + "learning_rate": 4.892161460148132e-06, + "loss": 3.004, + "step": 19578 + }, + { + "epoch": 1.6687121793232762, + "grad_norm": 32.984222724960205, + "learning_rate": 4.891665728192793e-06, + "loss": 2.756, + "step": 19579 + }, + { + "epoch": 1.6687974090173017, + "grad_norm": 34.110652050303855, + "learning_rate": 4.891169997302877e-06, + "loss": 2.4009, + "step": 19580 + }, + { + "epoch": 1.668882638711327, + "grad_norm": 35.18856174001707, + "learning_rate": 4.890674267483261e-06, + "loss": 2.6662, + "step": 19581 + }, + { + "epoch": 1.6689678684053524, + "grad_norm": 63.945854544929425, + "learning_rate": 4.890178538738817e-06, + "loss": 2.5393, + "step": 19582 + }, + { + "epoch": 1.6690530980993779, + "grad_norm": 52.51115448081069, + "learning_rate": 4.8896828110744275e-06, + "loss": 2.6582, + "step": 19583 + }, + { + "epoch": 1.6691383277934033, + "grad_norm": 36.28156999545254, + "learning_rate": 4.889187084494963e-06, + "loss": 2.8059, + "step": 19584 + }, + { + "epoch": 1.6692235574874286, + "grad_norm": 115.81223777525183, + "learning_rate": 4.888691359005297e-06, + "loss": 3.2953, + "step": 19585 + }, + { + "epoch": 1.669308787181454, + "grad_norm": 48.57451107397079, + "learning_rate": 4.888195634610309e-06, + "loss": 2.7922, + "step": 19586 + }, + { + "epoch": 1.6693940168754793, + "grad_norm": 45.59821640530898, + "learning_rate": 4.887699911314873e-06, + "loss": 2.1404, + "step": 19587 + }, + { + "epoch": 1.6694792465695047, + "grad_norm": 40.821129281184895, + "learning_rate": 4.8872041891238636e-06, + "loss": 2.873, + "step": 19588 + }, + { + "epoch": 1.6695644762635302, + "grad_norm": 44.47662985025845, + "learning_rate": 4.886708468042155e-06, + "loss": 3.7596, + "step": 19589 + }, + { + "epoch": 1.6696497059575557, + "grad_norm": 71.3517171471572, + "learning_rate": 4.886212748074625e-06, + "loss": 2.959, + "step": 19590 + }, + { + "epoch": 1.6697349356515812, + "grad_norm": 75.29294626078745, + "learning_rate": 4.8857170292261454e-06, + "loss": 3.4216, + "step": 19591 + }, + { + "epoch": 1.6698201653456064, + "grad_norm": 37.51892250296187, + "learning_rate": 4.885221311501595e-06, + "loss": 2.3558, + "step": 19592 + }, + { + "epoch": 1.6699053950396316, + "grad_norm": 32.176849998082695, + "learning_rate": 4.884725594905848e-06, + "loss": 2.7932, + "step": 19593 + }, + { + "epoch": 1.6699906247336571, + "grad_norm": 44.49369976935359, + "learning_rate": 4.884229879443779e-06, + "loss": 2.8093, + "step": 19594 + }, + { + "epoch": 1.6700758544276826, + "grad_norm": 43.35286390647435, + "learning_rate": 4.8837341651202606e-06, + "loss": 2.7782, + "step": 19595 + }, + { + "epoch": 1.670161084121708, + "grad_norm": 76.93147021084901, + "learning_rate": 4.883238451940173e-06, + "loss": 3.0873, + "step": 19596 + }, + { + "epoch": 1.6702463138157335, + "grad_norm": 42.75160561204814, + "learning_rate": 4.882742739908388e-06, + "loss": 2.4256, + "step": 19597 + }, + { + "epoch": 1.6703315435097588, + "grad_norm": 73.7925018734116, + "learning_rate": 4.8822470290297815e-06, + "loss": 3.378, + "step": 19598 + }, + { + "epoch": 1.6704167732037842, + "grad_norm": 35.34276530217992, + "learning_rate": 4.881751319309228e-06, + "loss": 2.173, + "step": 19599 + }, + { + "epoch": 1.6705020028978095, + "grad_norm": 129.75241836491756, + "learning_rate": 4.881255610751604e-06, + "loss": 4.4501, + "step": 19600 + }, + { + "epoch": 1.670587232591835, + "grad_norm": 67.8347756190658, + "learning_rate": 4.880759903361784e-06, + "loss": 3.238, + "step": 19601 + }, + { + "epoch": 1.6706724622858604, + "grad_norm": 68.66979262678085, + "learning_rate": 4.880264197144644e-06, + "loss": 3.6433, + "step": 19602 + }, + { + "epoch": 1.6707576919798859, + "grad_norm": 33.43433944717437, + "learning_rate": 4.879768492105057e-06, + "loss": 2.4337, + "step": 19603 + }, + { + "epoch": 1.6708429216739114, + "grad_norm": 67.12482629681713, + "learning_rate": 4.879272788247898e-06, + "loss": 4.0185, + "step": 19604 + }, + { + "epoch": 1.6709281513679366, + "grad_norm": 40.017643375677906, + "learning_rate": 4.878777085578044e-06, + "loss": 2.9691, + "step": 19605 + }, + { + "epoch": 1.6710133810619618, + "grad_norm": 78.3272580481565, + "learning_rate": 4.87828138410037e-06, + "loss": 3.2074, + "step": 19606 + }, + { + "epoch": 1.6710986107559873, + "grad_norm": 69.18153359781083, + "learning_rate": 4.87778568381975e-06, + "loss": 3.2055, + "step": 19607 + }, + { + "epoch": 1.6711838404500128, + "grad_norm": 34.6617037008233, + "learning_rate": 4.877289984741057e-06, + "loss": 2.497, + "step": 19608 + }, + { + "epoch": 1.6712690701440382, + "grad_norm": 42.80202684239844, + "learning_rate": 4.8767942868691704e-06, + "loss": 2.8833, + "step": 19609 + }, + { + "epoch": 1.6713542998380637, + "grad_norm": 36.98168281805439, + "learning_rate": 4.8762985902089634e-06, + "loss": 2.9531, + "step": 19610 + }, + { + "epoch": 1.671439529532089, + "grad_norm": 39.658839760714386, + "learning_rate": 4.875802894765311e-06, + "loss": 2.7416, + "step": 19611 + }, + { + "epoch": 1.6715247592261144, + "grad_norm": 67.25602273716773, + "learning_rate": 4.875307200543085e-06, + "loss": 3.5351, + "step": 19612 + }, + { + "epoch": 1.6716099889201397, + "grad_norm": 44.59171913270611, + "learning_rate": 4.8748115075471645e-06, + "loss": 3.1343, + "step": 19613 + }, + { + "epoch": 1.6716952186141651, + "grad_norm": 45.488907484178725, + "learning_rate": 4.874315815782424e-06, + "loss": 2.7925, + "step": 19614 + }, + { + "epoch": 1.6717804483081906, + "grad_norm": 53.860470815025806, + "learning_rate": 4.873820125253738e-06, + "loss": 3.0427, + "step": 19615 + }, + { + "epoch": 1.671865678002216, + "grad_norm": 50.82333324795045, + "learning_rate": 4.873324435965978e-06, + "loss": 1.9796, + "step": 19616 + }, + { + "epoch": 1.6719509076962413, + "grad_norm": 23.262999271099098, + "learning_rate": 4.8728287479240235e-06, + "loss": 2.3523, + "step": 19617 + }, + { + "epoch": 1.6720361373902668, + "grad_norm": 60.462915843658976, + "learning_rate": 4.872333061132748e-06, + "loss": 3.0567, + "step": 19618 + }, + { + "epoch": 1.672121367084292, + "grad_norm": 35.500702603291714, + "learning_rate": 4.871837375597027e-06, + "loss": 2.8199, + "step": 19619 + }, + { + "epoch": 1.6722065967783175, + "grad_norm": 42.87372481380905, + "learning_rate": 4.871341691321734e-06, + "loss": 3.2742, + "step": 19620 + }, + { + "epoch": 1.672291826472343, + "grad_norm": 35.05073445502354, + "learning_rate": 4.870846008311742e-06, + "loss": 2.5043, + "step": 19621 + }, + { + "epoch": 1.6723770561663684, + "grad_norm": 34.45051036880438, + "learning_rate": 4.870350326571931e-06, + "loss": 2.3946, + "step": 19622 + }, + { + "epoch": 1.672462285860394, + "grad_norm": 72.54666645675479, + "learning_rate": 4.869854646107173e-06, + "loss": 2.6371, + "step": 19623 + }, + { + "epoch": 1.6725475155544192, + "grad_norm": 42.34651709592563, + "learning_rate": 4.869358966922341e-06, + "loss": 3.0344, + "step": 19624 + }, + { + "epoch": 1.6726327452484444, + "grad_norm": 42.591688996311106, + "learning_rate": 4.868863289022311e-06, + "loss": 3.2244, + "step": 19625 + }, + { + "epoch": 1.6727179749424699, + "grad_norm": 33.76546745953146, + "learning_rate": 4.868367612411961e-06, + "loss": 2.4013, + "step": 19626 + }, + { + "epoch": 1.6728032046364953, + "grad_norm": 41.266627432945214, + "learning_rate": 4.867871937096163e-06, + "loss": 2.3351, + "step": 19627 + }, + { + "epoch": 1.6728884343305208, + "grad_norm": 17.263291398300627, + "learning_rate": 4.867376263079791e-06, + "loss": 1.3546, + "step": 19628 + }, + { + "epoch": 1.6729736640245463, + "grad_norm": 32.54620527047789, + "learning_rate": 4.866880590367719e-06, + "loss": 2.9081, + "step": 19629 + }, + { + "epoch": 1.6730588937185715, + "grad_norm": 39.86870791789329, + "learning_rate": 4.866384918964827e-06, + "loss": 2.8588, + "step": 19630 + }, + { + "epoch": 1.673144123412597, + "grad_norm": 37.21038448282757, + "learning_rate": 4.865889248875985e-06, + "loss": 1.6227, + "step": 19631 + }, + { + "epoch": 1.6732293531066222, + "grad_norm": 81.7661284149682, + "learning_rate": 4.865393580106068e-06, + "loss": 3.3683, + "step": 19632 + }, + { + "epoch": 1.6733145828006477, + "grad_norm": 33.66600596366513, + "learning_rate": 4.864897912659953e-06, + "loss": 2.7472, + "step": 19633 + }, + { + "epoch": 1.6733998124946732, + "grad_norm": 59.03316243622582, + "learning_rate": 4.864402246542511e-06, + "loss": 3.3093, + "step": 19634 + }, + { + "epoch": 1.6734850421886986, + "grad_norm": 69.21081770565497, + "learning_rate": 4.8639065817586215e-06, + "loss": 2.815, + "step": 19635 + }, + { + "epoch": 1.673570271882724, + "grad_norm": 21.109900968900963, + "learning_rate": 4.8634109183131564e-06, + "loss": 2.086, + "step": 19636 + }, + { + "epoch": 1.6736555015767494, + "grad_norm": 36.35000473056541, + "learning_rate": 4.862915256210991e-06, + "loss": 2.3749, + "step": 19637 + }, + { + "epoch": 1.6737407312707746, + "grad_norm": 170.80455923756813, + "learning_rate": 4.8624195954569966e-06, + "loss": 3.0316, + "step": 19638 + }, + { + "epoch": 1.6738259609648, + "grad_norm": 38.36256528270109, + "learning_rate": 4.861923936056054e-06, + "loss": 3.0682, + "step": 19639 + }, + { + "epoch": 1.6739111906588255, + "grad_norm": 58.5972844734275, + "learning_rate": 4.8614282780130325e-06, + "loss": 3.8398, + "step": 19640 + }, + { + "epoch": 1.673996420352851, + "grad_norm": 80.2723737448827, + "learning_rate": 4.860932621332811e-06, + "loss": 3.5307, + "step": 19641 + }, + { + "epoch": 1.6740816500468765, + "grad_norm": 66.71168441296243, + "learning_rate": 4.860436966020259e-06, + "loss": 3.1161, + "step": 19642 + }, + { + "epoch": 1.6741668797409017, + "grad_norm": 41.94947643542623, + "learning_rate": 4.859941312080257e-06, + "loss": 3.0367, + "step": 19643 + }, + { + "epoch": 1.674252109434927, + "grad_norm": 70.42707192490569, + "learning_rate": 4.8594456595176745e-06, + "loss": 2.4856, + "step": 19644 + }, + { + "epoch": 1.6743373391289524, + "grad_norm": 67.71955759447162, + "learning_rate": 4.85895000833739e-06, + "loss": 3.0186, + "step": 19645 + }, + { + "epoch": 1.674422568822978, + "grad_norm": 56.27184253483816, + "learning_rate": 4.858454358544275e-06, + "loss": 3.4238, + "step": 19646 + }, + { + "epoch": 1.6745077985170034, + "grad_norm": 58.929793342141544, + "learning_rate": 4.857958710143204e-06, + "loss": 3.0142, + "step": 19647 + }, + { + "epoch": 1.6745930282110288, + "grad_norm": 59.86038599554835, + "learning_rate": 4.857463063139053e-06, + "loss": 2.5204, + "step": 19648 + }, + { + "epoch": 1.674678257905054, + "grad_norm": 35.938224704112734, + "learning_rate": 4.856967417536698e-06, + "loss": 2.8493, + "step": 19649 + }, + { + "epoch": 1.6747634875990796, + "grad_norm": 37.94428337925081, + "learning_rate": 4.856471773341011e-06, + "loss": 2.4964, + "step": 19650 + }, + { + "epoch": 1.6748487172931048, + "grad_norm": 68.89223272705331, + "learning_rate": 4.855976130556866e-06, + "loss": 2.2532, + "step": 19651 + }, + { + "epoch": 1.6749339469871303, + "grad_norm": 71.54037139656693, + "learning_rate": 4.855480489189138e-06, + "loss": 2.914, + "step": 19652 + }, + { + "epoch": 1.6750191766811557, + "grad_norm": 53.550674823197085, + "learning_rate": 4.854984849242704e-06, + "loss": 3.3416, + "step": 19653 + }, + { + "epoch": 1.6751044063751812, + "grad_norm": 44.625373768843254, + "learning_rate": 4.854489210722436e-06, + "loss": 2.3565, + "step": 19654 + }, + { + "epoch": 1.6751896360692067, + "grad_norm": 37.52078091136628, + "learning_rate": 4.853993573633207e-06, + "loss": 3.1485, + "step": 19655 + }, + { + "epoch": 1.675274865763232, + "grad_norm": 74.88097121460353, + "learning_rate": 4.853497937979895e-06, + "loss": 2.9914, + "step": 19656 + }, + { + "epoch": 1.6753600954572572, + "grad_norm": 45.02519872216637, + "learning_rate": 4.853002303767372e-06, + "loss": 2.5096, + "step": 19657 + }, + { + "epoch": 1.6754453251512826, + "grad_norm": 81.72994250064397, + "learning_rate": 4.852506671000514e-06, + "loss": 3.3126, + "step": 19658 + }, + { + "epoch": 1.675530554845308, + "grad_norm": 62.07309128283589, + "learning_rate": 4.852011039684192e-06, + "loss": 2.7427, + "step": 19659 + }, + { + "epoch": 1.6756157845393336, + "grad_norm": 74.83087380647596, + "learning_rate": 4.851515409823283e-06, + "loss": 2.9939, + "step": 19660 + }, + { + "epoch": 1.675701014233359, + "grad_norm": 84.52351679640321, + "learning_rate": 4.851019781422663e-06, + "loss": 3.2052, + "step": 19661 + }, + { + "epoch": 1.6757862439273843, + "grad_norm": 35.108426432088585, + "learning_rate": 4.850524154487204e-06, + "loss": 3.2344, + "step": 19662 + }, + { + "epoch": 1.6758714736214095, + "grad_norm": 64.37689796490491, + "learning_rate": 4.850028529021779e-06, + "loss": 2.8328, + "step": 19663 + }, + { + "epoch": 1.675956703315435, + "grad_norm": 37.23754345316407, + "learning_rate": 4.8495329050312624e-06, + "loss": 2.766, + "step": 19664 + }, + { + "epoch": 1.6760419330094605, + "grad_norm": 53.391478122769364, + "learning_rate": 4.849037282520533e-06, + "loss": 3.5377, + "step": 19665 + }, + { + "epoch": 1.676127162703486, + "grad_norm": 31.91074238711825, + "learning_rate": 4.848541661494461e-06, + "loss": 2.6272, + "step": 19666 + }, + { + "epoch": 1.6762123923975114, + "grad_norm": 41.46364524898084, + "learning_rate": 4.848046041957922e-06, + "loss": 3.1008, + "step": 19667 + }, + { + "epoch": 1.6762976220915367, + "grad_norm": 36.69196616673499, + "learning_rate": 4.847550423915787e-06, + "loss": 2.9819, + "step": 19668 + }, + { + "epoch": 1.6763828517855621, + "grad_norm": 51.34351907975158, + "learning_rate": 4.8470548073729365e-06, + "loss": 2.7847, + "step": 19669 + }, + { + "epoch": 1.6764680814795874, + "grad_norm": 53.12287062636151, + "learning_rate": 4.8465591923342406e-06, + "loss": 2.291, + "step": 19670 + }, + { + "epoch": 1.6765533111736128, + "grad_norm": 33.7842835030202, + "learning_rate": 4.846063578804573e-06, + "loss": 3.5713, + "step": 19671 + }, + { + "epoch": 1.6766385408676383, + "grad_norm": 46.45628133713266, + "learning_rate": 4.845567966788808e-06, + "loss": 3.1474, + "step": 19672 + }, + { + "epoch": 1.6767237705616638, + "grad_norm": 44.01206895497803, + "learning_rate": 4.845072356291822e-06, + "loss": 3.041, + "step": 19673 + }, + { + "epoch": 1.6768090002556892, + "grad_norm": 26.116889227888727, + "learning_rate": 4.844576747318489e-06, + "loss": 2.0046, + "step": 19674 + }, + { + "epoch": 1.6768942299497145, + "grad_norm": 31.375370835677433, + "learning_rate": 4.84408113987368e-06, + "loss": 2.0004, + "step": 19675 + }, + { + "epoch": 1.6769794596437397, + "grad_norm": 78.21359022041352, + "learning_rate": 4.843585533962273e-06, + "loss": 3.4332, + "step": 19676 + }, + { + "epoch": 1.6770646893377652, + "grad_norm": 32.08929808152089, + "learning_rate": 4.843089929589137e-06, + "loss": 2.5108, + "step": 19677 + }, + { + "epoch": 1.6771499190317907, + "grad_norm": 34.875816329011016, + "learning_rate": 4.842594326759152e-06, + "loss": 2.3814, + "step": 19678 + }, + { + "epoch": 1.6772351487258161, + "grad_norm": 85.36843682593356, + "learning_rate": 4.842098725477188e-06, + "loss": 3.8623, + "step": 19679 + }, + { + "epoch": 1.6773203784198416, + "grad_norm": 42.66876485027683, + "learning_rate": 4.841603125748121e-06, + "loss": 1.4695, + "step": 19680 + }, + { + "epoch": 1.6774056081138669, + "grad_norm": 60.97344175080794, + "learning_rate": 4.8411075275768235e-06, + "loss": 3.9729, + "step": 19681 + }, + { + "epoch": 1.6774908378078923, + "grad_norm": 39.51796237649708, + "learning_rate": 4.840611930968171e-06, + "loss": 2.8166, + "step": 19682 + }, + { + "epoch": 1.6775760675019176, + "grad_norm": 44.82313552580934, + "learning_rate": 4.840116335927036e-06, + "loss": 2.6158, + "step": 19683 + }, + { + "epoch": 1.677661297195943, + "grad_norm": 38.12534160107495, + "learning_rate": 4.839620742458295e-06, + "loss": 2.1212, + "step": 19684 + }, + { + "epoch": 1.6777465268899685, + "grad_norm": 62.23884361737131, + "learning_rate": 4.839125150566818e-06, + "loss": 3.862, + "step": 19685 + }, + { + "epoch": 1.677831756583994, + "grad_norm": 54.05319094835658, + "learning_rate": 4.838629560257483e-06, + "loss": 3.7729, + "step": 19686 + }, + { + "epoch": 1.6779169862780192, + "grad_norm": 36.84923836238703, + "learning_rate": 4.838133971535161e-06, + "loss": 2.2907, + "step": 19687 + }, + { + "epoch": 1.6780022159720447, + "grad_norm": 60.11833772454318, + "learning_rate": 4.837638384404729e-06, + "loss": 2.4719, + "step": 19688 + }, + { + "epoch": 1.67808744566607, + "grad_norm": 27.50387315523403, + "learning_rate": 4.837142798871056e-06, + "loss": 2.544, + "step": 19689 + }, + { + "epoch": 1.6781726753600954, + "grad_norm": 65.11729022295921, + "learning_rate": 4.836647214939021e-06, + "loss": 3.5996, + "step": 19690 + }, + { + "epoch": 1.6782579050541209, + "grad_norm": 40.95375430220146, + "learning_rate": 4.836151632613494e-06, + "loss": 2.5586, + "step": 19691 + }, + { + "epoch": 1.6783431347481463, + "grad_norm": 46.80294229019989, + "learning_rate": 4.8356560518993525e-06, + "loss": 3.7466, + "step": 19692 + }, + { + "epoch": 1.6784283644421718, + "grad_norm": 47.51979079593615, + "learning_rate": 4.835160472801469e-06, + "loss": 3.0193, + "step": 19693 + }, + { + "epoch": 1.678513594136197, + "grad_norm": 43.11440136903738, + "learning_rate": 4.834664895324714e-06, + "loss": 2.8542, + "step": 19694 + }, + { + "epoch": 1.6785988238302223, + "grad_norm": 60.450147054679235, + "learning_rate": 4.834169319473965e-06, + "loss": 3.2401, + "step": 19695 + }, + { + "epoch": 1.6786840535242478, + "grad_norm": 38.6464158223072, + "learning_rate": 4.833673745254096e-06, + "loss": 3.2635, + "step": 19696 + }, + { + "epoch": 1.6787692832182732, + "grad_norm": 33.18196502602489, + "learning_rate": 4.83317817266998e-06, + "loss": 2.7171, + "step": 19697 + }, + { + "epoch": 1.6788545129122987, + "grad_norm": 66.45209191947674, + "learning_rate": 4.832682601726488e-06, + "loss": 3.2085, + "step": 19698 + }, + { + "epoch": 1.6789397426063242, + "grad_norm": 44.86198451919388, + "learning_rate": 4.832187032428497e-06, + "loss": 2.654, + "step": 19699 + }, + { + "epoch": 1.6790249723003494, + "grad_norm": 43.25783738573499, + "learning_rate": 4.831691464780881e-06, + "loss": 3.0513, + "step": 19700 + }, + { + "epoch": 1.6791102019943749, + "grad_norm": 46.34646945126589, + "learning_rate": 4.831195898788513e-06, + "loss": 3.5353, + "step": 19701 + }, + { + "epoch": 1.6791954316884001, + "grad_norm": 25.685387265646845, + "learning_rate": 4.8307003344562635e-06, + "loss": 1.5438, + "step": 19702 + }, + { + "epoch": 1.6792806613824256, + "grad_norm": 36.994618234930236, + "learning_rate": 4.83020477178901e-06, + "loss": 2.4681, + "step": 19703 + }, + { + "epoch": 1.679365891076451, + "grad_norm": 41.93092312911053, + "learning_rate": 4.829709210791627e-06, + "loss": 3.1964, + "step": 19704 + }, + { + "epoch": 1.6794511207704765, + "grad_norm": 30.690589982637317, + "learning_rate": 4.829213651468985e-06, + "loss": 1.6775, + "step": 19705 + }, + { + "epoch": 1.6795363504645018, + "grad_norm": 51.53523323297095, + "learning_rate": 4.828718093825959e-06, + "loss": 2.9459, + "step": 19706 + }, + { + "epoch": 1.6796215801585272, + "grad_norm": 75.39729230589869, + "learning_rate": 4.828222537867421e-06, + "loss": 3.9369, + "step": 19707 + }, + { + "epoch": 1.6797068098525525, + "grad_norm": 42.83252185508961, + "learning_rate": 4.827726983598248e-06, + "loss": 3.0129, + "step": 19708 + }, + { + "epoch": 1.679792039546578, + "grad_norm": 38.142060473141015, + "learning_rate": 4.8272314310233115e-06, + "loss": 2.3434, + "step": 19709 + }, + { + "epoch": 1.6798772692406034, + "grad_norm": 92.84170066230008, + "learning_rate": 4.826735880147484e-06, + "loss": 4.0612, + "step": 19710 + }, + { + "epoch": 1.679962498934629, + "grad_norm": 54.29931036009173, + "learning_rate": 4.82624033097564e-06, + "loss": 2.4869, + "step": 19711 + }, + { + "epoch": 1.6800477286286544, + "grad_norm": 83.38209543941245, + "learning_rate": 4.825744783512655e-06, + "loss": 2.9102, + "step": 19712 + }, + { + "epoch": 1.6801329583226796, + "grad_norm": 67.69250082050559, + "learning_rate": 4.825249237763401e-06, + "loss": 2.4431, + "step": 19713 + }, + { + "epoch": 1.6802181880167049, + "grad_norm": 34.308715334640816, + "learning_rate": 4.82475369373275e-06, + "loss": 2.2884, + "step": 19714 + }, + { + "epoch": 1.6803034177107303, + "grad_norm": 69.99854641074575, + "learning_rate": 4.824258151425576e-06, + "loss": 3.0297, + "step": 19715 + }, + { + "epoch": 1.6803886474047558, + "grad_norm": 56.976352402710404, + "learning_rate": 4.823762610846755e-06, + "loss": 3.3562, + "step": 19716 + }, + { + "epoch": 1.6804738770987813, + "grad_norm": 44.99220885670455, + "learning_rate": 4.82326707200116e-06, + "loss": 2.9454, + "step": 19717 + }, + { + "epoch": 1.6805591067928067, + "grad_norm": 53.16051333090949, + "learning_rate": 4.82277153489366e-06, + "loss": 2.7681, + "step": 19718 + }, + { + "epoch": 1.680644336486832, + "grad_norm": 52.011078144732416, + "learning_rate": 4.822275999529133e-06, + "loss": 2.0294, + "step": 19719 + }, + { + "epoch": 1.6807295661808574, + "grad_norm": 47.69267080756893, + "learning_rate": 4.82178046591245e-06, + "loss": 2.5761, + "step": 19720 + }, + { + "epoch": 1.6808147958748827, + "grad_norm": 73.24998127237946, + "learning_rate": 4.821284934048487e-06, + "loss": 3.8104, + "step": 19721 + }, + { + "epoch": 1.6809000255689082, + "grad_norm": 30.536925168382524, + "learning_rate": 4.8207894039421135e-06, + "loss": 1.6392, + "step": 19722 + }, + { + "epoch": 1.6809852552629336, + "grad_norm": 67.02846515358155, + "learning_rate": 4.820293875598208e-06, + "loss": 3.1704, + "step": 19723 + }, + { + "epoch": 1.681070484956959, + "grad_norm": 18.93134576371232, + "learning_rate": 4.819798349021637e-06, + "loss": 1.174, + "step": 19724 + }, + { + "epoch": 1.6811557146509846, + "grad_norm": 46.6384566977576, + "learning_rate": 4.819302824217281e-06, + "loss": 2.4714, + "step": 19725 + }, + { + "epoch": 1.6812409443450098, + "grad_norm": 56.48313089515161, + "learning_rate": 4.818807301190008e-06, + "loss": 2.6464, + "step": 19726 + }, + { + "epoch": 1.681326174039035, + "grad_norm": 54.42036443473099, + "learning_rate": 4.818311779944695e-06, + "loss": 3.0249, + "step": 19727 + }, + { + "epoch": 1.6814114037330605, + "grad_norm": 45.40651389813833, + "learning_rate": 4.81781626048621e-06, + "loss": 2.4832, + "step": 19728 + }, + { + "epoch": 1.681496633427086, + "grad_norm": 42.164031196874646, + "learning_rate": 4.817320742819433e-06, + "loss": 2.5737, + "step": 19729 + }, + { + "epoch": 1.6815818631211115, + "grad_norm": 56.52436378159543, + "learning_rate": 4.8168252269492326e-06, + "loss": 1.8146, + "step": 19730 + }, + { + "epoch": 1.681667092815137, + "grad_norm": 82.49258955999944, + "learning_rate": 4.816329712880483e-06, + "loss": 3.4877, + "step": 19731 + }, + { + "epoch": 1.6817523225091622, + "grad_norm": 51.2545161512403, + "learning_rate": 4.815834200618057e-06, + "loss": 3.7336, + "step": 19732 + }, + { + "epoch": 1.6818375522031874, + "grad_norm": 24.672455629874744, + "learning_rate": 4.815338690166831e-06, + "loss": 1.5242, + "step": 19733 + }, + { + "epoch": 1.681922781897213, + "grad_norm": 58.21162536233661, + "learning_rate": 4.814843181531674e-06, + "loss": 3.1613, + "step": 19734 + }, + { + "epoch": 1.6820080115912384, + "grad_norm": 35.54225683505467, + "learning_rate": 4.814347674717462e-06, + "loss": 2.4643, + "step": 19735 + }, + { + "epoch": 1.6820932412852638, + "grad_norm": 65.62283680015295, + "learning_rate": 4.813852169729066e-06, + "loss": 3.1037, + "step": 19736 + }, + { + "epoch": 1.6821784709792893, + "grad_norm": 74.85930101458646, + "learning_rate": 4.813356666571359e-06, + "loss": 3.1793, + "step": 19737 + }, + { + "epoch": 1.6822637006733145, + "grad_norm": 42.68810635630486, + "learning_rate": 4.8128611652492155e-06, + "loss": 2.8882, + "step": 19738 + }, + { + "epoch": 1.68234893036734, + "grad_norm": 37.13964175430284, + "learning_rate": 4.8123656657675086e-06, + "loss": 2.7744, + "step": 19739 + }, + { + "epoch": 1.6824341600613653, + "grad_norm": 34.72281962494168, + "learning_rate": 4.811870168131112e-06, + "loss": 1.3462, + "step": 19740 + }, + { + "epoch": 1.6825193897553907, + "grad_norm": 62.72709772448552, + "learning_rate": 4.811374672344895e-06, + "loss": 2.1443, + "step": 19741 + }, + { + "epoch": 1.6826046194494162, + "grad_norm": 49.75289315650308, + "learning_rate": 4.810879178413733e-06, + "loss": 2.4788, + "step": 19742 + }, + { + "epoch": 1.6826898491434417, + "grad_norm": 41.118324241809766, + "learning_rate": 4.810383686342502e-06, + "loss": 2.4091, + "step": 19743 + }, + { + "epoch": 1.6827750788374671, + "grad_norm": 32.89833158507092, + "learning_rate": 4.809888196136071e-06, + "loss": 2.0238, + "step": 19744 + }, + { + "epoch": 1.6828603085314924, + "grad_norm": 45.193741514716265, + "learning_rate": 4.809392707799312e-06, + "loss": 2.6025, + "step": 19745 + }, + { + "epoch": 1.6829455382255176, + "grad_norm": 22.985943484099966, + "learning_rate": 4.808897221337101e-06, + "loss": 2.0648, + "step": 19746 + }, + { + "epoch": 1.683030767919543, + "grad_norm": 47.242393615870455, + "learning_rate": 4.8084017367543116e-06, + "loss": 3.2542, + "step": 19747 + }, + { + "epoch": 1.6831159976135686, + "grad_norm": 65.55212524669386, + "learning_rate": 4.807906254055815e-06, + "loss": 2.9403, + "step": 19748 + }, + { + "epoch": 1.683201227307594, + "grad_norm": 81.06480204475618, + "learning_rate": 4.807410773246482e-06, + "loss": 2.3858, + "step": 19749 + }, + { + "epoch": 1.6832864570016195, + "grad_norm": 58.15550275389433, + "learning_rate": 4.806915294331187e-06, + "loss": 3.7957, + "step": 19750 + }, + { + "epoch": 1.6833716866956447, + "grad_norm": 46.34285663249418, + "learning_rate": 4.806419817314806e-06, + "loss": 3.2329, + "step": 19751 + }, + { + "epoch": 1.6834569163896702, + "grad_norm": 58.04213915293855, + "learning_rate": 4.805924342202208e-06, + "loss": 2.6452, + "step": 19752 + }, + { + "epoch": 1.6835421460836955, + "grad_norm": 35.87632940079475, + "learning_rate": 4.805428868998267e-06, + "loss": 2.8978, + "step": 19753 + }, + { + "epoch": 1.683627375777721, + "grad_norm": 66.85288562283343, + "learning_rate": 4.804933397707854e-06, + "loss": 2.3381, + "step": 19754 + }, + { + "epoch": 1.6837126054717464, + "grad_norm": 74.96023058766413, + "learning_rate": 4.8044379283358465e-06, + "loss": 3.4284, + "step": 19755 + }, + { + "epoch": 1.6837978351657719, + "grad_norm": 43.43588484383797, + "learning_rate": 4.803942460887113e-06, + "loss": 2.8967, + "step": 19756 + }, + { + "epoch": 1.683883064859797, + "grad_norm": 35.253797666137686, + "learning_rate": 4.803446995366527e-06, + "loss": 2.7677, + "step": 19757 + }, + { + "epoch": 1.6839682945538226, + "grad_norm": 42.110534248790934, + "learning_rate": 4.802951531778961e-06, + "loss": 2.278, + "step": 19758 + }, + { + "epoch": 1.6840535242478478, + "grad_norm": 43.113957798576244, + "learning_rate": 4.8024560701292905e-06, + "loss": 2.9507, + "step": 19759 + }, + { + "epoch": 1.6841387539418733, + "grad_norm": 39.12844258653145, + "learning_rate": 4.801960610422386e-06, + "loss": 2.88, + "step": 19760 + }, + { + "epoch": 1.6842239836358988, + "grad_norm": 47.60509453912718, + "learning_rate": 4.801465152663119e-06, + "loss": 3.2282, + "step": 19761 + }, + { + "epoch": 1.6843092133299242, + "grad_norm": 43.39653659507302, + "learning_rate": 4.800969696856362e-06, + "loss": 2.6448, + "step": 19762 + }, + { + "epoch": 1.6843944430239497, + "grad_norm": 138.80226803406546, + "learning_rate": 4.800474243006991e-06, + "loss": 2.2386, + "step": 19763 + }, + { + "epoch": 1.684479672717975, + "grad_norm": 34.20536661515275, + "learning_rate": 4.799978791119877e-06, + "loss": 2.5953, + "step": 19764 + }, + { + "epoch": 1.6845649024120002, + "grad_norm": 40.923446686657236, + "learning_rate": 4.799483341199891e-06, + "loss": 2.2961, + "step": 19765 + }, + { + "epoch": 1.6846501321060257, + "grad_norm": 39.50007522898607, + "learning_rate": 4.798987893251908e-06, + "loss": 2.8687, + "step": 19766 + }, + { + "epoch": 1.6847353618000511, + "grad_norm": 62.26876749783017, + "learning_rate": 4.798492447280797e-06, + "loss": 2.4179, + "step": 19767 + }, + { + "epoch": 1.6848205914940766, + "grad_norm": 99.90425328482513, + "learning_rate": 4.797997003291434e-06, + "loss": 3.9603, + "step": 19768 + }, + { + "epoch": 1.684905821188102, + "grad_norm": 20.590574289754738, + "learning_rate": 4.7975015612886894e-06, + "loss": 1.383, + "step": 19769 + }, + { + "epoch": 1.6849910508821273, + "grad_norm": 55.04461566115036, + "learning_rate": 4.797006121277438e-06, + "loss": 3.1339, + "step": 19770 + }, + { + "epoch": 1.6850762805761528, + "grad_norm": 25.57162797560116, + "learning_rate": 4.7965106832625485e-06, + "loss": 1.5051, + "step": 19771 + }, + { + "epoch": 1.685161510270178, + "grad_norm": 32.01235874027776, + "learning_rate": 4.7960152472488975e-06, + "loss": 2.4889, + "step": 19772 + }, + { + "epoch": 1.6852467399642035, + "grad_norm": 40.06490400753847, + "learning_rate": 4.795519813241354e-06, + "loss": 2.2562, + "step": 19773 + }, + { + "epoch": 1.685331969658229, + "grad_norm": 33.21904567310744, + "learning_rate": 4.7950243812447934e-06, + "loss": 3.2107, + "step": 19774 + }, + { + "epoch": 1.6854171993522544, + "grad_norm": 50.005074445617865, + "learning_rate": 4.7945289512640855e-06, + "loss": 2.7415, + "step": 19775 + }, + { + "epoch": 1.6855024290462797, + "grad_norm": 44.64344679753226, + "learning_rate": 4.794033523304105e-06, + "loss": 2.9064, + "step": 19776 + }, + { + "epoch": 1.6855876587403051, + "grad_norm": 56.247781847810764, + "learning_rate": 4.793538097369721e-06, + "loss": 3.3305, + "step": 19777 + }, + { + "epoch": 1.6856728884343304, + "grad_norm": 41.522024090673284, + "learning_rate": 4.7930426734658095e-06, + "loss": 2.6453, + "step": 19778 + }, + { + "epoch": 1.6857581181283559, + "grad_norm": 55.56943522663488, + "learning_rate": 4.792547251597242e-06, + "loss": 3.1725, + "step": 19779 + }, + { + "epoch": 1.6858433478223813, + "grad_norm": 59.141046333909834, + "learning_rate": 4.792051831768886e-06, + "loss": 2.501, + "step": 19780 + }, + { + "epoch": 1.6859285775164068, + "grad_norm": 59.327323326202794, + "learning_rate": 4.79155641398562e-06, + "loss": 2.9937, + "step": 19781 + }, + { + "epoch": 1.6860138072104323, + "grad_norm": 38.91753761551592, + "learning_rate": 4.791060998252314e-06, + "loss": 2.3005, + "step": 19782 + }, + { + "epoch": 1.6860990369044575, + "grad_norm": 39.431460526835906, + "learning_rate": 4.790565584573841e-06, + "loss": 3.2578, + "step": 19783 + }, + { + "epoch": 1.6861842665984828, + "grad_norm": 49.1044003577461, + "learning_rate": 4.790070172955069e-06, + "loss": 3.1514, + "step": 19784 + }, + { + "epoch": 1.6862694962925082, + "grad_norm": 40.457670185294525, + "learning_rate": 4.789574763400875e-06, + "loss": 2.6236, + "step": 19785 + }, + { + "epoch": 1.6863547259865337, + "grad_norm": 48.02033716656707, + "learning_rate": 4.7890793559161316e-06, + "loss": 3.1626, + "step": 19786 + }, + { + "epoch": 1.6864399556805592, + "grad_norm": 51.6041664998431, + "learning_rate": 4.788583950505708e-06, + "loss": 2.7738, + "step": 19787 + }, + { + "epoch": 1.6865251853745846, + "grad_norm": 142.92102362527314, + "learning_rate": 4.788088547174475e-06, + "loss": 4.7179, + "step": 19788 + }, + { + "epoch": 1.6866104150686099, + "grad_norm": 31.2541279183654, + "learning_rate": 4.7875931459273085e-06, + "loss": 2.3804, + "step": 19789 + }, + { + "epoch": 1.6866956447626353, + "grad_norm": 39.138277052127755, + "learning_rate": 4.78709774676908e-06, + "loss": 3.2825, + "step": 19790 + }, + { + "epoch": 1.6867808744566606, + "grad_norm": 44.506858250863644, + "learning_rate": 4.78660234970466e-06, + "loss": 2.6206, + "step": 19791 + }, + { + "epoch": 1.686866104150686, + "grad_norm": 40.42919519854914, + "learning_rate": 4.7861069547389205e-06, + "loss": 2.4325, + "step": 19792 + }, + { + "epoch": 1.6869513338447115, + "grad_norm": 36.49003869541287, + "learning_rate": 4.785611561876733e-06, + "loss": 2.2536, + "step": 19793 + }, + { + "epoch": 1.687036563538737, + "grad_norm": 25.816500408884746, + "learning_rate": 4.785116171122973e-06, + "loss": 3.0174, + "step": 19794 + }, + { + "epoch": 1.6871217932327625, + "grad_norm": 51.25038471468073, + "learning_rate": 4.784620782482509e-06, + "loss": 2.5969, + "step": 19795 + }, + { + "epoch": 1.6872070229267877, + "grad_norm": 46.99030804404984, + "learning_rate": 4.784125395960214e-06, + "loss": 2.9429, + "step": 19796 + }, + { + "epoch": 1.687292252620813, + "grad_norm": 30.154897448566764, + "learning_rate": 4.7836300115609585e-06, + "loss": 1.8802, + "step": 19797 + }, + { + "epoch": 1.6873774823148384, + "grad_norm": 85.69538024104797, + "learning_rate": 4.783134629289618e-06, + "loss": 3.9113, + "step": 19798 + }, + { + "epoch": 1.6874627120088639, + "grad_norm": 39.43290361945921, + "learning_rate": 4.7826392491510625e-06, + "loss": 2.7505, + "step": 19799 + }, + { + "epoch": 1.6875479417028894, + "grad_norm": 76.90931196435601, + "learning_rate": 4.782143871150162e-06, + "loss": 2.6793, + "step": 19800 + }, + { + "epoch": 1.6876331713969148, + "grad_norm": 41.01515511418431, + "learning_rate": 4.78164849529179e-06, + "loss": 2.5969, + "step": 19801 + }, + { + "epoch": 1.68771840109094, + "grad_norm": 60.66672880579113, + "learning_rate": 4.7811531215808195e-06, + "loss": 2.7809, + "step": 19802 + }, + { + "epoch": 1.6878036307849655, + "grad_norm": 23.30426082782592, + "learning_rate": 4.78065775002212e-06, + "loss": 1.6046, + "step": 19803 + }, + { + "epoch": 1.6878888604789908, + "grad_norm": 31.29048181372558, + "learning_rate": 4.780162380620566e-06, + "loss": 2.1541, + "step": 19804 + }, + { + "epoch": 1.6879740901730163, + "grad_norm": 55.381327499303794, + "learning_rate": 4.779667013381025e-06, + "loss": 3.2088, + "step": 19805 + }, + { + "epoch": 1.6880593198670417, + "grad_norm": 44.06369070832237, + "learning_rate": 4.779171648308373e-06, + "loss": 3.2972, + "step": 19806 + }, + { + "epoch": 1.6881445495610672, + "grad_norm": 39.55085922451375, + "learning_rate": 4.778676285407481e-06, + "loss": 3.6208, + "step": 19807 + }, + { + "epoch": 1.6882297792550924, + "grad_norm": 51.65993535610697, + "learning_rate": 4.7781809246832186e-06, + "loss": 3.6909, + "step": 19808 + }, + { + "epoch": 1.688315008949118, + "grad_norm": 75.66561882492876, + "learning_rate": 4.77768556614046e-06, + "loss": 3.0821, + "step": 19809 + }, + { + "epoch": 1.6884002386431431, + "grad_norm": 35.06044677311, + "learning_rate": 4.777190209784072e-06, + "loss": 2.6052, + "step": 19810 + }, + { + "epoch": 1.6884854683371686, + "grad_norm": 92.29614287149919, + "learning_rate": 4.776694855618933e-06, + "loss": 4.201, + "step": 19811 + }, + { + "epoch": 1.688570698031194, + "grad_norm": 29.28368843297977, + "learning_rate": 4.77619950364991e-06, + "loss": 1.8624, + "step": 19812 + }, + { + "epoch": 1.6886559277252196, + "grad_norm": 30.334064937776766, + "learning_rate": 4.775704153881877e-06, + "loss": 2.0962, + "step": 19813 + }, + { + "epoch": 1.688741157419245, + "grad_norm": 39.824891509539945, + "learning_rate": 4.775208806319703e-06, + "loss": 2.5227, + "step": 19814 + }, + { + "epoch": 1.6888263871132703, + "grad_norm": 41.17959370770665, + "learning_rate": 4.774713460968262e-06, + "loss": 2.6367, + "step": 19815 + }, + { + "epoch": 1.6889116168072955, + "grad_norm": 30.27907881251506, + "learning_rate": 4.7742181178324245e-06, + "loss": 2.0758, + "step": 19816 + }, + { + "epoch": 1.688996846501321, + "grad_norm": 49.605091939524314, + "learning_rate": 4.773722776917063e-06, + "loss": 1.53, + "step": 19817 + }, + { + "epoch": 1.6890820761953464, + "grad_norm": 38.91844264903118, + "learning_rate": 4.7732274382270464e-06, + "loss": 2.3164, + "step": 19818 + }, + { + "epoch": 1.689167305889372, + "grad_norm": 59.47019180028494, + "learning_rate": 4.772732101767249e-06, + "loss": 2.2127, + "step": 19819 + }, + { + "epoch": 1.6892525355833974, + "grad_norm": 44.4757200287435, + "learning_rate": 4.772236767542541e-06, + "loss": 2.9871, + "step": 19820 + }, + { + "epoch": 1.6893377652774226, + "grad_norm": 47.33383013110691, + "learning_rate": 4.771741435557794e-06, + "loss": 3.1042, + "step": 19821 + }, + { + "epoch": 1.689422994971448, + "grad_norm": 14.327342668323253, + "learning_rate": 4.77124610581788e-06, + "loss": 0.9814, + "step": 19822 + }, + { + "epoch": 1.6895082246654733, + "grad_norm": 32.930635476782975, + "learning_rate": 4.770750778327667e-06, + "loss": 3.6266, + "step": 19823 + }, + { + "epoch": 1.6895934543594988, + "grad_norm": 39.20406883465374, + "learning_rate": 4.77025545309203e-06, + "loss": 2.6721, + "step": 19824 + }, + { + "epoch": 1.6896786840535243, + "grad_norm": 45.676828614842094, + "learning_rate": 4.769760130115841e-06, + "loss": 3.1232, + "step": 19825 + }, + { + "epoch": 1.6897639137475498, + "grad_norm": 35.45980126947254, + "learning_rate": 4.769264809403969e-06, + "loss": 2.2677, + "step": 19826 + }, + { + "epoch": 1.689849143441575, + "grad_norm": 13.041460271771978, + "learning_rate": 4.768769490961284e-06, + "loss": 0.9444, + "step": 19827 + }, + { + "epoch": 1.6899343731356005, + "grad_norm": 62.58559045343563, + "learning_rate": 4.7682741747926605e-06, + "loss": 2.4729, + "step": 19828 + }, + { + "epoch": 1.6900196028296257, + "grad_norm": 67.95069456742138, + "learning_rate": 4.767778860902969e-06, + "loss": 2.8583, + "step": 19829 + }, + { + "epoch": 1.6901048325236512, + "grad_norm": 49.89803369489374, + "learning_rate": 4.767283549297081e-06, + "loss": 3.166, + "step": 19830 + }, + { + "epoch": 1.6901900622176766, + "grad_norm": 40.23379916916646, + "learning_rate": 4.766788239979864e-06, + "loss": 2.238, + "step": 19831 + }, + { + "epoch": 1.6902752919117021, + "grad_norm": 50.297309677588274, + "learning_rate": 4.766292932956193e-06, + "loss": 3.015, + "step": 19832 + }, + { + "epoch": 1.6903605216057276, + "grad_norm": 64.8713693152495, + "learning_rate": 4.765797628230939e-06, + "loss": 2.6633, + "step": 19833 + }, + { + "epoch": 1.6904457512997528, + "grad_norm": 144.03886270134421, + "learning_rate": 4.765302325808973e-06, + "loss": 3.4821, + "step": 19834 + }, + { + "epoch": 1.690530980993778, + "grad_norm": 73.90734457731456, + "learning_rate": 4.7648070256951626e-06, + "loss": 2.9651, + "step": 19835 + }, + { + "epoch": 1.6906162106878035, + "grad_norm": 41.643380094898845, + "learning_rate": 4.764311727894383e-06, + "loss": 2.7556, + "step": 19836 + }, + { + "epoch": 1.690701440381829, + "grad_norm": 60.497974061354, + "learning_rate": 4.763816432411505e-06, + "loss": 3.0955, + "step": 19837 + }, + { + "epoch": 1.6907866700758545, + "grad_norm": 42.526654114965396, + "learning_rate": 4.763321139251398e-06, + "loss": 3.3291, + "step": 19838 + }, + { + "epoch": 1.69087189976988, + "grad_norm": 71.5123474291261, + "learning_rate": 4.762825848418934e-06, + "loss": 3.0389, + "step": 19839 + }, + { + "epoch": 1.6909571294639052, + "grad_norm": 48.04730034689886, + "learning_rate": 4.7623305599189805e-06, + "loss": 3.8728, + "step": 19840 + }, + { + "epoch": 1.6910423591579307, + "grad_norm": 37.98765523071279, + "learning_rate": 4.761835273756415e-06, + "loss": 2.7725, + "step": 19841 + }, + { + "epoch": 1.691127588851956, + "grad_norm": 33.49902684982664, + "learning_rate": 4.761339989936105e-06, + "loss": 1.8715, + "step": 19842 + }, + { + "epoch": 1.6912128185459814, + "grad_norm": 39.33962304059006, + "learning_rate": 4.76084470846292e-06, + "loss": 2.9083, + "step": 19843 + }, + { + "epoch": 1.6912980482400068, + "grad_norm": 37.1675983044304, + "learning_rate": 4.76034942934173e-06, + "loss": 1.9061, + "step": 19844 + }, + { + "epoch": 1.6913832779340323, + "grad_norm": 57.73608154139974, + "learning_rate": 4.759854152577412e-06, + "loss": 2.7313, + "step": 19845 + }, + { + "epoch": 1.6914685076280576, + "grad_norm": 31.360205774620834, + "learning_rate": 4.759358878174832e-06, + "loss": 1.6477, + "step": 19846 + }, + { + "epoch": 1.691553737322083, + "grad_norm": 61.20753421015647, + "learning_rate": 4.758863606138861e-06, + "loss": 2.7449, + "step": 19847 + }, + { + "epoch": 1.6916389670161083, + "grad_norm": 60.98538225282907, + "learning_rate": 4.758368336474369e-06, + "loss": 3.5841, + "step": 19848 + }, + { + "epoch": 1.6917241967101337, + "grad_norm": 62.0832683818706, + "learning_rate": 4.7578730691862315e-06, + "loss": 2.9404, + "step": 19849 + }, + { + "epoch": 1.6918094264041592, + "grad_norm": 35.51548699681608, + "learning_rate": 4.757377804279316e-06, + "loss": 3.0747, + "step": 19850 + }, + { + "epoch": 1.6918946560981847, + "grad_norm": 25.14121333715879, + "learning_rate": 4.756882541758492e-06, + "loss": 2.9813, + "step": 19851 + }, + { + "epoch": 1.6919798857922101, + "grad_norm": 74.18496894250657, + "learning_rate": 4.756387281628633e-06, + "loss": 3.6445, + "step": 19852 + }, + { + "epoch": 1.6920651154862354, + "grad_norm": 73.4762823569591, + "learning_rate": 4.755892023894606e-06, + "loss": 4.1238, + "step": 19853 + }, + { + "epoch": 1.6921503451802606, + "grad_norm": 31.5922008335588, + "learning_rate": 4.755396768561285e-06, + "loss": 2.125, + "step": 19854 + }, + { + "epoch": 1.692235574874286, + "grad_norm": 80.04768038486479, + "learning_rate": 4.75490151563354e-06, + "loss": 3.539, + "step": 19855 + }, + { + "epoch": 1.6923208045683116, + "grad_norm": 29.162140368592972, + "learning_rate": 4.754406265116242e-06, + "loss": 2.391, + "step": 19856 + }, + { + "epoch": 1.692406034262337, + "grad_norm": 38.65552770876247, + "learning_rate": 4.753911017014258e-06, + "loss": 2.9331, + "step": 19857 + }, + { + "epoch": 1.6924912639563625, + "grad_norm": 70.05457071292172, + "learning_rate": 4.753415771332464e-06, + "loss": 2.6152, + "step": 19858 + }, + { + "epoch": 1.6925764936503878, + "grad_norm": 59.87696939517889, + "learning_rate": 4.752920528075726e-06, + "loss": 3.3586, + "step": 19859 + }, + { + "epoch": 1.6926617233444132, + "grad_norm": 69.48667061368229, + "learning_rate": 4.752425287248919e-06, + "loss": 1.7491, + "step": 19860 + }, + { + "epoch": 1.6927469530384385, + "grad_norm": 31.811266472752635, + "learning_rate": 4.751930048856908e-06, + "loss": 2.0023, + "step": 19861 + }, + { + "epoch": 1.692832182732464, + "grad_norm": 52.514164525069326, + "learning_rate": 4.751434812904568e-06, + "loss": 2.5713, + "step": 19862 + }, + { + "epoch": 1.6929174124264894, + "grad_norm": 43.79017763912647, + "learning_rate": 4.750939579396767e-06, + "loss": 3.1436, + "step": 19863 + }, + { + "epoch": 1.6930026421205149, + "grad_norm": 39.40923961187106, + "learning_rate": 4.7504443483383785e-06, + "loss": 2.8373, + "step": 19864 + }, + { + "epoch": 1.6930878718145403, + "grad_norm": 39.01389748169667, + "learning_rate": 4.749949119734269e-06, + "loss": 2.7387, + "step": 19865 + }, + { + "epoch": 1.6931731015085656, + "grad_norm": 108.72258805245532, + "learning_rate": 4.74945389358931e-06, + "loss": 2.7622, + "step": 19866 + }, + { + "epoch": 1.6932583312025908, + "grad_norm": 19.14954372002842, + "learning_rate": 4.748958669908372e-06, + "loss": 1.3891, + "step": 19867 + }, + { + "epoch": 1.6933435608966163, + "grad_norm": 47.081038082089556, + "learning_rate": 4.748463448696328e-06, + "loss": 3.6101, + "step": 19868 + }, + { + "epoch": 1.6934287905906418, + "grad_norm": 38.981440954183334, + "learning_rate": 4.747968229958045e-06, + "loss": 2.5117, + "step": 19869 + }, + { + "epoch": 1.6935140202846672, + "grad_norm": 65.14708285196835, + "learning_rate": 4.747473013698393e-06, + "loss": 3.4396, + "step": 19870 + }, + { + "epoch": 1.6935992499786927, + "grad_norm": 45.25847840591809, + "learning_rate": 4.7469777999222445e-06, + "loss": 3.1855, + "step": 19871 + }, + { + "epoch": 1.693684479672718, + "grad_norm": 22.705943077430543, + "learning_rate": 4.746482588634469e-06, + "loss": 1.5547, + "step": 19872 + }, + { + "epoch": 1.6937697093667434, + "grad_norm": 46.83496303572593, + "learning_rate": 4.745987379839938e-06, + "loss": 2.5503, + "step": 19873 + }, + { + "epoch": 1.6938549390607687, + "grad_norm": 110.97227265391207, + "learning_rate": 4.745492173543517e-06, + "loss": 3.8857, + "step": 19874 + }, + { + "epoch": 1.6939401687547941, + "grad_norm": 59.306333411349335, + "learning_rate": 4.7449969697500805e-06, + "loss": 3.352, + "step": 19875 + }, + { + "epoch": 1.6940253984488196, + "grad_norm": 30.979517034784582, + "learning_rate": 4.744501768464499e-06, + "loss": 1.9332, + "step": 19876 + }, + { + "epoch": 1.694110628142845, + "grad_norm": 29.373888858479017, + "learning_rate": 4.744006569691641e-06, + "loss": 2.7193, + "step": 19877 + }, + { + "epoch": 1.6941958578368703, + "grad_norm": 68.89942026672703, + "learning_rate": 4.7435113734363745e-06, + "loss": 2.4548, + "step": 19878 + }, + { + "epoch": 1.6942810875308958, + "grad_norm": 40.48011818794443, + "learning_rate": 4.743016179703572e-06, + "loss": 3.2216, + "step": 19879 + }, + { + "epoch": 1.694366317224921, + "grad_norm": 36.97556101824117, + "learning_rate": 4.742520988498105e-06, + "loss": 2.5054, + "step": 19880 + }, + { + "epoch": 1.6944515469189465, + "grad_norm": 40.93327178889836, + "learning_rate": 4.742025799824842e-06, + "loss": 3.4758, + "step": 19881 + }, + { + "epoch": 1.694536776612972, + "grad_norm": 35.175112506900575, + "learning_rate": 4.741530613688652e-06, + "loss": 2.4407, + "step": 19882 + }, + { + "epoch": 1.6946220063069974, + "grad_norm": 68.79844715584083, + "learning_rate": 4.741035430094403e-06, + "loss": 3.0555, + "step": 19883 + }, + { + "epoch": 1.694707236001023, + "grad_norm": 24.947593172539015, + "learning_rate": 4.740540249046971e-06, + "loss": 3.1039, + "step": 19884 + }, + { + "epoch": 1.6947924656950482, + "grad_norm": 77.41153184540174, + "learning_rate": 4.7400450705512225e-06, + "loss": 3.2698, + "step": 19885 + }, + { + "epoch": 1.6948776953890734, + "grad_norm": 75.69964546638886, + "learning_rate": 4.739549894612026e-06, + "loss": 2.9429, + "step": 19886 + }, + { + "epoch": 1.6949629250830989, + "grad_norm": 46.463257021574734, + "learning_rate": 4.739054721234252e-06, + "loss": 3.389, + "step": 19887 + }, + { + "epoch": 1.6950481547771243, + "grad_norm": 74.0221515008056, + "learning_rate": 4.7385595504227735e-06, + "loss": 2.1743, + "step": 19888 + }, + { + "epoch": 1.6951333844711498, + "grad_norm": 34.40114764312466, + "learning_rate": 4.738064382182457e-06, + "loss": 2.2843, + "step": 19889 + }, + { + "epoch": 1.6952186141651753, + "grad_norm": 65.83379287522183, + "learning_rate": 4.737569216518173e-06, + "loss": 2.9772, + "step": 19890 + }, + { + "epoch": 1.6953038438592005, + "grad_norm": 275.4405472653216, + "learning_rate": 4.73707405343479e-06, + "loss": 4.1852, + "step": 19891 + }, + { + "epoch": 1.695389073553226, + "grad_norm": 111.69459178801176, + "learning_rate": 4.736578892937182e-06, + "loss": 4.3487, + "step": 19892 + }, + { + "epoch": 1.6954743032472512, + "grad_norm": 98.18472540654977, + "learning_rate": 4.736083735030216e-06, + "loss": 2.8005, + "step": 19893 + }, + { + "epoch": 1.6955595329412767, + "grad_norm": 35.79267939526745, + "learning_rate": 4.735588579718759e-06, + "loss": 2.2535, + "step": 19894 + }, + { + "epoch": 1.6956447626353022, + "grad_norm": 40.23066928727273, + "learning_rate": 4.735093427007686e-06, + "loss": 2.8528, + "step": 19895 + }, + { + "epoch": 1.6957299923293276, + "grad_norm": 55.05255559981113, + "learning_rate": 4.73459827690186e-06, + "loss": 2.5969, + "step": 19896 + }, + { + "epoch": 1.6958152220233529, + "grad_norm": 43.19610077073998, + "learning_rate": 4.734103129406158e-06, + "loss": 2.4504, + "step": 19897 + }, + { + "epoch": 1.6959004517173784, + "grad_norm": 88.80351495562473, + "learning_rate": 4.733607984525445e-06, + "loss": 2.7486, + "step": 19898 + }, + { + "epoch": 1.6959856814114036, + "grad_norm": 30.83616907592624, + "learning_rate": 4.733112842264592e-06, + "loss": 2.4121, + "step": 19899 + }, + { + "epoch": 1.696070911105429, + "grad_norm": 60.6741128029627, + "learning_rate": 4.7326177026284665e-06, + "loss": 3.0695, + "step": 19900 + }, + { + "epoch": 1.6961561407994545, + "grad_norm": 49.07453173042211, + "learning_rate": 4.732122565621942e-06, + "loss": 3.3758, + "step": 19901 + }, + { + "epoch": 1.69624137049348, + "grad_norm": 88.40357260221977, + "learning_rate": 4.731627431249884e-06, + "loss": 4.0875, + "step": 19902 + }, + { + "epoch": 1.6963266001875055, + "grad_norm": 68.40658738571484, + "learning_rate": 4.731132299517165e-06, + "loss": 3.0779, + "step": 19903 + }, + { + "epoch": 1.6964118298815307, + "grad_norm": 50.70028245598837, + "learning_rate": 4.730637170428651e-06, + "loss": 2.4203, + "step": 19904 + }, + { + "epoch": 1.696497059575556, + "grad_norm": 85.37701612708206, + "learning_rate": 4.730142043989215e-06, + "loss": 1.7845, + "step": 19905 + }, + { + "epoch": 1.6965822892695814, + "grad_norm": 49.76390888319052, + "learning_rate": 4.729646920203724e-06, + "loss": 2.282, + "step": 19906 + }, + { + "epoch": 1.696667518963607, + "grad_norm": 49.080798444328074, + "learning_rate": 4.729151799077049e-06, + "loss": 4.3031, + "step": 19907 + }, + { + "epoch": 1.6967527486576324, + "grad_norm": 51.40494485180347, + "learning_rate": 4.728656680614056e-06, + "loss": 3.3333, + "step": 19908 + }, + { + "epoch": 1.6968379783516578, + "grad_norm": 51.84218066243553, + "learning_rate": 4.728161564819619e-06, + "loss": 2.9448, + "step": 19909 + }, + { + "epoch": 1.696923208045683, + "grad_norm": 79.37276925715486, + "learning_rate": 4.727666451698605e-06, + "loss": 2.871, + "step": 19910 + }, + { + "epoch": 1.6970084377397086, + "grad_norm": 34.61522191589787, + "learning_rate": 4.727171341255883e-06, + "loss": 2.6506, + "step": 19911 + }, + { + "epoch": 1.6970936674337338, + "grad_norm": 84.30905433960925, + "learning_rate": 4.726676233496324e-06, + "loss": 3.7057, + "step": 19912 + }, + { + "epoch": 1.6971788971277593, + "grad_norm": 29.92257200294219, + "learning_rate": 4.726181128424792e-06, + "loss": 1.8714, + "step": 19913 + }, + { + "epoch": 1.6972641268217847, + "grad_norm": 42.295509933058895, + "learning_rate": 4.725686026046161e-06, + "loss": 3.4573, + "step": 19914 + }, + { + "epoch": 1.6973493565158102, + "grad_norm": 42.80930958868086, + "learning_rate": 4.7251909263653e-06, + "loss": 3.0956, + "step": 19915 + }, + { + "epoch": 1.6974345862098357, + "grad_norm": 37.20731652437382, + "learning_rate": 4.724695829387078e-06, + "loss": 3.3916, + "step": 19916 + }, + { + "epoch": 1.697519815903861, + "grad_norm": 42.557436081430346, + "learning_rate": 4.72420073511636e-06, + "loss": 3.4233, + "step": 19917 + }, + { + "epoch": 1.6976050455978862, + "grad_norm": 36.400443395021895, + "learning_rate": 4.72370564355802e-06, + "loss": 3.2459, + "step": 19918 + }, + { + "epoch": 1.6976902752919116, + "grad_norm": 35.51078966639213, + "learning_rate": 4.723210554716925e-06, + "loss": 3.0985, + "step": 19919 + }, + { + "epoch": 1.697775504985937, + "grad_norm": 43.49708330122472, + "learning_rate": 4.7227154685979454e-06, + "loss": 3.2352, + "step": 19920 + }, + { + "epoch": 1.6978607346799626, + "grad_norm": 54.6390568619697, + "learning_rate": 4.722220385205947e-06, + "loss": 3.1898, + "step": 19921 + }, + { + "epoch": 1.697945964373988, + "grad_norm": 79.82284531545457, + "learning_rate": 4.7217253045458004e-06, + "loss": 2.7448, + "step": 19922 + }, + { + "epoch": 1.6980311940680133, + "grad_norm": 38.57447201225454, + "learning_rate": 4.721230226622378e-06, + "loss": 2.9625, + "step": 19923 + }, + { + "epoch": 1.6981164237620385, + "grad_norm": 44.004277769005114, + "learning_rate": 4.720735151440543e-06, + "loss": 3.3942, + "step": 19924 + }, + { + "epoch": 1.698201653456064, + "grad_norm": 51.517737714645456, + "learning_rate": 4.720240079005168e-06, + "loss": 2.6092, + "step": 19925 + }, + { + "epoch": 1.6982868831500895, + "grad_norm": 52.87424342541485, + "learning_rate": 4.719745009321119e-06, + "loss": 3.3326, + "step": 19926 + }, + { + "epoch": 1.698372112844115, + "grad_norm": 45.99161585287922, + "learning_rate": 4.719249942393268e-06, + "loss": 2.633, + "step": 19927 + }, + { + "epoch": 1.6984573425381404, + "grad_norm": 34.584308404596165, + "learning_rate": 4.718754878226483e-06, + "loss": 2.7297, + "step": 19928 + }, + { + "epoch": 1.6985425722321656, + "grad_norm": 33.279289182637086, + "learning_rate": 4.71825981682563e-06, + "loss": 2.4574, + "step": 19929 + }, + { + "epoch": 1.6986278019261911, + "grad_norm": 137.00367268253623, + "learning_rate": 4.717764758195579e-06, + "loss": 2.9134, + "step": 19930 + }, + { + "epoch": 1.6987130316202164, + "grad_norm": 52.03300752630739, + "learning_rate": 4.717269702341202e-06, + "loss": 3.033, + "step": 19931 + }, + { + "epoch": 1.6987982613142418, + "grad_norm": 75.34770908088267, + "learning_rate": 4.716774649267365e-06, + "loss": 2.676, + "step": 19932 + }, + { + "epoch": 1.6988834910082673, + "grad_norm": 65.9301693127558, + "learning_rate": 4.716279598978935e-06, + "loss": 3.0884, + "step": 19933 + }, + { + "epoch": 1.6989687207022928, + "grad_norm": 98.18058495945337, + "learning_rate": 4.715784551480783e-06, + "loss": 2.7477, + "step": 19934 + }, + { + "epoch": 1.6990539503963182, + "grad_norm": 36.81359546622491, + "learning_rate": 4.715289506777777e-06, + "loss": 2.7954, + "step": 19935 + }, + { + "epoch": 1.6991391800903435, + "grad_norm": 40.76140429911239, + "learning_rate": 4.714794464874787e-06, + "loss": 1.7303, + "step": 19936 + }, + { + "epoch": 1.6992244097843687, + "grad_norm": 36.111634283760466, + "learning_rate": 4.714299425776679e-06, + "loss": 2.9288, + "step": 19937 + }, + { + "epoch": 1.6993096394783942, + "grad_norm": 42.261963375878594, + "learning_rate": 4.713804389488323e-06, + "loss": 3.4536, + "step": 19938 + }, + { + "epoch": 1.6993948691724197, + "grad_norm": 41.09617496134652, + "learning_rate": 4.713309356014586e-06, + "loss": 2.7638, + "step": 19939 + }, + { + "epoch": 1.6994800988664451, + "grad_norm": 46.74837385795137, + "learning_rate": 4.712814325360339e-06, + "loss": 3.1667, + "step": 19940 + }, + { + "epoch": 1.6995653285604706, + "grad_norm": 71.58096348859691, + "learning_rate": 4.712319297530448e-06, + "loss": 2.6688, + "step": 19941 + }, + { + "epoch": 1.6996505582544958, + "grad_norm": 59.81984671578013, + "learning_rate": 4.711824272529784e-06, + "loss": 2.6179, + "step": 19942 + }, + { + "epoch": 1.6997357879485213, + "grad_norm": 81.7892852999794, + "learning_rate": 4.71132925036321e-06, + "loss": 4.2244, + "step": 19943 + }, + { + "epoch": 1.6998210176425466, + "grad_norm": 68.40444587884741, + "learning_rate": 4.710834231035602e-06, + "loss": 2.7508, + "step": 19944 + }, + { + "epoch": 1.699906247336572, + "grad_norm": 42.5355457604574, + "learning_rate": 4.710339214551823e-06, + "loss": 3.118, + "step": 19945 + }, + { + "epoch": 1.6999914770305975, + "grad_norm": 32.59144623852273, + "learning_rate": 4.709844200916743e-06, + "loss": 2.8669, + "step": 19946 + }, + { + "epoch": 1.700076706724623, + "grad_norm": 33.74250794975398, + "learning_rate": 4.709349190135229e-06, + "loss": 2.3461, + "step": 19947 + }, + { + "epoch": 1.7001619364186482, + "grad_norm": 24.62862457589347, + "learning_rate": 4.708854182212151e-06, + "loss": 1.8416, + "step": 19948 + }, + { + "epoch": 1.7002471661126737, + "grad_norm": 49.06884729161572, + "learning_rate": 4.708359177152376e-06, + "loss": 3.2845, + "step": 19949 + }, + { + "epoch": 1.700332395806699, + "grad_norm": 125.30561796026338, + "learning_rate": 4.707864174960774e-06, + "loss": 4.5503, + "step": 19950 + }, + { + "epoch": 1.7004176255007244, + "grad_norm": 33.39775785123958, + "learning_rate": 4.70736917564221e-06, + "loss": 3.0921, + "step": 19951 + }, + { + "epoch": 1.7005028551947499, + "grad_norm": 105.33872074141789, + "learning_rate": 4.706874179201555e-06, + "loss": 2.6378, + "step": 19952 + }, + { + "epoch": 1.7005880848887753, + "grad_norm": 37.58184179437141, + "learning_rate": 4.706379185643675e-06, + "loss": 2.7837, + "step": 19953 + }, + { + "epoch": 1.7006733145828008, + "grad_norm": 43.89917576817269, + "learning_rate": 4.705884194973441e-06, + "loss": 2.0743, + "step": 19954 + }, + { + "epoch": 1.700758544276826, + "grad_norm": 44.123440306223586, + "learning_rate": 4.705389207195718e-06, + "loss": 2.8877, + "step": 19955 + }, + { + "epoch": 1.7008437739708513, + "grad_norm": 26.578394056167056, + "learning_rate": 4.704894222315374e-06, + "loss": 2.948, + "step": 19956 + }, + { + "epoch": 1.7009290036648768, + "grad_norm": 63.30701882733918, + "learning_rate": 4.704399240337278e-06, + "loss": 2.7988, + "step": 19957 + }, + { + "epoch": 1.7010142333589022, + "grad_norm": 34.56525572740236, + "learning_rate": 4.7039042612663e-06, + "loss": 1.4432, + "step": 19958 + }, + { + "epoch": 1.7010994630529277, + "grad_norm": 27.378746130429565, + "learning_rate": 4.703409285107306e-06, + "loss": 1.4261, + "step": 19959 + }, + { + "epoch": 1.7011846927469532, + "grad_norm": 33.32942179960936, + "learning_rate": 4.7029143118651605e-06, + "loss": 2.8069, + "step": 19960 + }, + { + "epoch": 1.7012699224409784, + "grad_norm": 66.26202270215539, + "learning_rate": 4.702419341544737e-06, + "loss": 3.374, + "step": 19961 + }, + { + "epoch": 1.7013551521350039, + "grad_norm": 86.00603563578842, + "learning_rate": 4.701924374150901e-06, + "loss": 3.9114, + "step": 19962 + }, + { + "epoch": 1.7014403818290291, + "grad_norm": 32.82414864547889, + "learning_rate": 4.701429409688522e-06, + "loss": 2.6057, + "step": 19963 + }, + { + "epoch": 1.7015256115230546, + "grad_norm": 21.45787820545616, + "learning_rate": 4.700934448162462e-06, + "loss": 1.6312, + "step": 19964 + }, + { + "epoch": 1.70161084121708, + "grad_norm": 27.415195324678617, + "learning_rate": 4.7004394895775955e-06, + "loss": 2.5584, + "step": 19965 + }, + { + "epoch": 1.7016960709111055, + "grad_norm": 86.21218062073928, + "learning_rate": 4.699944533938788e-06, + "loss": 3.1331, + "step": 19966 + }, + { + "epoch": 1.7017813006051308, + "grad_norm": 76.43948706920388, + "learning_rate": 4.699449581250907e-06, + "loss": 2.1084, + "step": 19967 + }, + { + "epoch": 1.7018665302991562, + "grad_norm": 274.51796653083284, + "learning_rate": 4.698954631518818e-06, + "loss": 3.1669, + "step": 19968 + }, + { + "epoch": 1.7019517599931815, + "grad_norm": 79.76231561681904, + "learning_rate": 4.69845968474739e-06, + "loss": 2.7481, + "step": 19969 + }, + { + "epoch": 1.702036989687207, + "grad_norm": 42.980714178076234, + "learning_rate": 4.697964740941493e-06, + "loss": 2.6905, + "step": 19970 + }, + { + "epoch": 1.7021222193812324, + "grad_norm": 50.993185179848034, + "learning_rate": 4.697469800105994e-06, + "loss": 2.5883, + "step": 19971 + }, + { + "epoch": 1.702207449075258, + "grad_norm": 68.20051673712048, + "learning_rate": 4.696974862245757e-06, + "loss": 3.209, + "step": 19972 + }, + { + "epoch": 1.7022926787692834, + "grad_norm": 34.43590916499374, + "learning_rate": 4.696479927365652e-06, + "loss": 2.385, + "step": 19973 + }, + { + "epoch": 1.7023779084633086, + "grad_norm": 105.15036227135184, + "learning_rate": 4.6959849954705475e-06, + "loss": 3.4589, + "step": 19974 + }, + { + "epoch": 1.7024631381573339, + "grad_norm": 80.70118219910186, + "learning_rate": 4.695490066565309e-06, + "loss": 2.9967, + "step": 19975 + }, + { + "epoch": 1.7025483678513593, + "grad_norm": 59.744201087251376, + "learning_rate": 4.694995140654805e-06, + "loss": 3.0142, + "step": 19976 + }, + { + "epoch": 1.7026335975453848, + "grad_norm": 54.63506367045389, + "learning_rate": 4.6945002177439016e-06, + "loss": 3.0776, + "step": 19977 + }, + { + "epoch": 1.7027188272394103, + "grad_norm": 37.57085836805757, + "learning_rate": 4.694005297837469e-06, + "loss": 1.9048, + "step": 19978 + }, + { + "epoch": 1.7028040569334357, + "grad_norm": 75.5753049350993, + "learning_rate": 4.693510380940372e-06, + "loss": 2.4264, + "step": 19979 + }, + { + "epoch": 1.702889286627461, + "grad_norm": 60.88706008527727, + "learning_rate": 4.693015467057478e-06, + "loss": 3.1951, + "step": 19980 + }, + { + "epoch": 1.7029745163214864, + "grad_norm": 47.36551010567883, + "learning_rate": 4.692520556193655e-06, + "loss": 2.4682, + "step": 19981 + }, + { + "epoch": 1.7030597460155117, + "grad_norm": 57.94587778004816, + "learning_rate": 4.69202564835377e-06, + "loss": 3.0515, + "step": 19982 + }, + { + "epoch": 1.7031449757095372, + "grad_norm": 101.8687684136119, + "learning_rate": 4.6915307435426924e-06, + "loss": 3.9578, + "step": 19983 + }, + { + "epoch": 1.7032302054035626, + "grad_norm": 32.45553584798538, + "learning_rate": 4.691035841765285e-06, + "loss": 2.7303, + "step": 19984 + }, + { + "epoch": 1.703315435097588, + "grad_norm": 46.37444566610978, + "learning_rate": 4.690540943026418e-06, + "loss": 3.2607, + "step": 19985 + }, + { + "epoch": 1.7034006647916136, + "grad_norm": 35.98118216304476, + "learning_rate": 4.690046047330958e-06, + "loss": 3.0811, + "step": 19986 + }, + { + "epoch": 1.7034858944856388, + "grad_norm": 114.19447136993666, + "learning_rate": 4.689551154683772e-06, + "loss": 3.8854, + "step": 19987 + }, + { + "epoch": 1.703571124179664, + "grad_norm": 41.86756650524155, + "learning_rate": 4.689056265089727e-06, + "loss": 2.5967, + "step": 19988 + }, + { + "epoch": 1.7036563538736895, + "grad_norm": 36.10533291028388, + "learning_rate": 4.688561378553691e-06, + "loss": 2.5192, + "step": 19989 + }, + { + "epoch": 1.703741583567715, + "grad_norm": 47.60448696899797, + "learning_rate": 4.688066495080527e-06, + "loss": 3.2546, + "step": 19990 + }, + { + "epoch": 1.7038268132617405, + "grad_norm": 40.871289152072514, + "learning_rate": 4.6875716146751085e-06, + "loss": 2.9581, + "step": 19991 + }, + { + "epoch": 1.703912042955766, + "grad_norm": 55.562305323165624, + "learning_rate": 4.687076737342297e-06, + "loss": 3.7051, + "step": 19992 + }, + { + "epoch": 1.7039972726497912, + "grad_norm": 43.530900881065676, + "learning_rate": 4.686581863086963e-06, + "loss": 3.4521, + "step": 19993 + }, + { + "epoch": 1.7040825023438166, + "grad_norm": 106.54048697247276, + "learning_rate": 4.6860869919139696e-06, + "loss": 2.0732, + "step": 19994 + }, + { + "epoch": 1.7041677320378419, + "grad_norm": 39.873996281008466, + "learning_rate": 4.685592123828187e-06, + "loss": 2.8456, + "step": 19995 + }, + { + "epoch": 1.7042529617318674, + "grad_norm": 53.00772583651254, + "learning_rate": 4.685097258834482e-06, + "loss": 2.21, + "step": 19996 + }, + { + "epoch": 1.7043381914258928, + "grad_norm": 44.84597928815579, + "learning_rate": 4.6846023969377195e-06, + "loss": 2.2497, + "step": 19997 + }, + { + "epoch": 1.7044234211199183, + "grad_norm": 68.24872203331788, + "learning_rate": 4.684107538142768e-06, + "loss": 2.4544, + "step": 19998 + }, + { + "epoch": 1.7045086508139435, + "grad_norm": 40.59077991696395, + "learning_rate": 4.683612682454491e-06, + "loss": 2.8011, + "step": 19999 + }, + { + "epoch": 1.704593880507969, + "grad_norm": 56.312813173087555, + "learning_rate": 4.683117829877759e-06, + "loss": 2.557, + "step": 20000 + }, + { + "epoch": 1.7046791102019943, + "grad_norm": 28.396103175570065, + "learning_rate": 4.682622980417438e-06, + "loss": 2.4651, + "step": 20001 + }, + { + "epoch": 1.7047643398960197, + "grad_norm": 52.6504474930813, + "learning_rate": 4.682128134078394e-06, + "loss": 2.2907, + "step": 20002 + }, + { + "epoch": 1.7048495695900452, + "grad_norm": 25.806307749142043, + "learning_rate": 4.681633290865491e-06, + "loss": 2.5556, + "step": 20003 + }, + { + "epoch": 1.7049347992840707, + "grad_norm": 116.27995767782726, + "learning_rate": 4.6811384507836e-06, + "loss": 3.9261, + "step": 20004 + }, + { + "epoch": 1.7050200289780961, + "grad_norm": 85.53157623457525, + "learning_rate": 4.680643613837585e-06, + "loss": 4.0883, + "step": 20005 + }, + { + "epoch": 1.7051052586721214, + "grad_norm": 53.215325931808096, + "learning_rate": 4.680148780032315e-06, + "loss": 3.1148, + "step": 20006 + }, + { + "epoch": 1.7051904883661466, + "grad_norm": 46.41239522442634, + "learning_rate": 4.679653949372652e-06, + "loss": 2.7689, + "step": 20007 + }, + { + "epoch": 1.705275718060172, + "grad_norm": 36.9166672491902, + "learning_rate": 4.679159121863465e-06, + "loss": 2.7398, + "step": 20008 + }, + { + "epoch": 1.7053609477541976, + "grad_norm": 26.75982451563217, + "learning_rate": 4.678664297509623e-06, + "loss": 2.793, + "step": 20009 + }, + { + "epoch": 1.705446177448223, + "grad_norm": 42.2207135896037, + "learning_rate": 4.678169476315989e-06, + "loss": 2.7728, + "step": 20010 + }, + { + "epoch": 1.7055314071422485, + "grad_norm": 85.24783042697173, + "learning_rate": 4.677674658287429e-06, + "loss": 1.9626, + "step": 20011 + }, + { + "epoch": 1.7056166368362737, + "grad_norm": 54.96324783166995, + "learning_rate": 4.677179843428812e-06, + "loss": 3.3195, + "step": 20012 + }, + { + "epoch": 1.7057018665302992, + "grad_norm": 37.99939525177307, + "learning_rate": 4.676685031745003e-06, + "loss": 2.7312, + "step": 20013 + }, + { + "epoch": 1.7057870962243245, + "grad_norm": 30.592742017578704, + "learning_rate": 4.676190223240868e-06, + "loss": 1.9203, + "step": 20014 + }, + { + "epoch": 1.70587232591835, + "grad_norm": 19.904281952355078, + "learning_rate": 4.6756954179212735e-06, + "loss": 1.5627, + "step": 20015 + }, + { + "epoch": 1.7059575556123754, + "grad_norm": 77.07369034859181, + "learning_rate": 4.675200615791085e-06, + "loss": 2.8531, + "step": 20016 + }, + { + "epoch": 1.7060427853064009, + "grad_norm": 53.9026592393049, + "learning_rate": 4.67470581685517e-06, + "loss": 3.3348, + "step": 20017 + }, + { + "epoch": 1.706128015000426, + "grad_norm": 57.09918706444022, + "learning_rate": 4.674211021118395e-06, + "loss": 1.874, + "step": 20018 + }, + { + "epoch": 1.7062132446944516, + "grad_norm": 58.52843231273947, + "learning_rate": 4.673716228585623e-06, + "loss": 4.1686, + "step": 20019 + }, + { + "epoch": 1.7062984743884768, + "grad_norm": 86.25005733504265, + "learning_rate": 4.673221439261723e-06, + "loss": 3.3432, + "step": 20020 + }, + { + "epoch": 1.7063837040825023, + "grad_norm": 43.663111046210055, + "learning_rate": 4.6727266531515615e-06, + "loss": 3.1997, + "step": 20021 + }, + { + "epoch": 1.7064689337765278, + "grad_norm": 36.5436885841175, + "learning_rate": 4.672231870260002e-06, + "loss": 2.2412, + "step": 20022 + }, + { + "epoch": 1.7065541634705532, + "grad_norm": 48.72033820386457, + "learning_rate": 4.671737090591913e-06, + "loss": 2.6073, + "step": 20023 + }, + { + "epoch": 1.7066393931645787, + "grad_norm": 54.06583441129581, + "learning_rate": 4.671242314152157e-06, + "loss": 3.5741, + "step": 20024 + }, + { + "epoch": 1.706724622858604, + "grad_norm": 90.05988871443473, + "learning_rate": 4.670747540945605e-06, + "loss": 3.4901, + "step": 20025 + }, + { + "epoch": 1.7068098525526292, + "grad_norm": 54.253998691646025, + "learning_rate": 4.67025277097712e-06, + "loss": 2.6702, + "step": 20026 + }, + { + "epoch": 1.7068950822466546, + "grad_norm": 75.15705728297347, + "learning_rate": 4.6697580042515665e-06, + "loss": 2.5548, + "step": 20027 + }, + { + "epoch": 1.7069803119406801, + "grad_norm": 88.72687699184715, + "learning_rate": 4.669263240773813e-06, + "loss": 3.602, + "step": 20028 + }, + { + "epoch": 1.7070655416347056, + "grad_norm": 50.22175144958627, + "learning_rate": 4.668768480548722e-06, + "loss": 1.9466, + "step": 20029 + }, + { + "epoch": 1.707150771328731, + "grad_norm": 32.371777190750045, + "learning_rate": 4.668273723581162e-06, + "loss": 1.9373, + "step": 20030 + }, + { + "epoch": 1.7072360010227563, + "grad_norm": 55.53354728860408, + "learning_rate": 4.667778969875999e-06, + "loss": 2.3955, + "step": 20031 + }, + { + "epoch": 1.7073212307167818, + "grad_norm": 57.141996375154704, + "learning_rate": 4.667284219438099e-06, + "loss": 2.0977, + "step": 20032 + }, + { + "epoch": 1.707406460410807, + "grad_norm": 47.98027856095101, + "learning_rate": 4.6667894722723235e-06, + "loss": 3.0571, + "step": 20033 + }, + { + "epoch": 1.7074916901048325, + "grad_norm": 92.65091501889283, + "learning_rate": 4.666294728383543e-06, + "loss": 4.7392, + "step": 20034 + }, + { + "epoch": 1.707576919798858, + "grad_norm": 63.878586452729344, + "learning_rate": 4.665799987776621e-06, + "loss": 3.4851, + "step": 20035 + }, + { + "epoch": 1.7076621494928834, + "grad_norm": 43.52893165909261, + "learning_rate": 4.665305250456424e-06, + "loss": 2.8879, + "step": 20036 + }, + { + "epoch": 1.7077473791869087, + "grad_norm": 66.17602923257435, + "learning_rate": 4.6648105164278146e-06, + "loss": 3.1264, + "step": 20037 + }, + { + "epoch": 1.7078326088809341, + "grad_norm": 60.45978040043167, + "learning_rate": 4.6643157856956634e-06, + "loss": 2.7276, + "step": 20038 + }, + { + "epoch": 1.7079178385749594, + "grad_norm": 32.62680103947187, + "learning_rate": 4.663821058264832e-06, + "loss": 2.0782, + "step": 20039 + }, + { + "epoch": 1.7080030682689848, + "grad_norm": 38.80175474254965, + "learning_rate": 4.663326334140187e-06, + "loss": 2.7175, + "step": 20040 + }, + { + "epoch": 1.7080882979630103, + "grad_norm": 56.63608188003557, + "learning_rate": 4.662831613326594e-06, + "loss": 3.6142, + "step": 20041 + }, + { + "epoch": 1.7081735276570358, + "grad_norm": 87.6734548628073, + "learning_rate": 4.6623368958289165e-06, + "loss": 3.2965, + "step": 20042 + }, + { + "epoch": 1.7082587573510613, + "grad_norm": 95.20610286485234, + "learning_rate": 4.661842181652022e-06, + "loss": 3.0875, + "step": 20043 + }, + { + "epoch": 1.7083439870450865, + "grad_norm": 35.899465549124585, + "learning_rate": 4.661347470800776e-06, + "loss": 2.9454, + "step": 20044 + }, + { + "epoch": 1.7084292167391117, + "grad_norm": 27.484806594099688, + "learning_rate": 4.660852763280044e-06, + "loss": 2.5964, + "step": 20045 + }, + { + "epoch": 1.7085144464331372, + "grad_norm": 66.10799355600626, + "learning_rate": 4.660358059094687e-06, + "loss": 4.1591, + "step": 20046 + }, + { + "epoch": 1.7085996761271627, + "grad_norm": 47.79869322024666, + "learning_rate": 4.659863358249575e-06, + "loss": 2.869, + "step": 20047 + }, + { + "epoch": 1.7086849058211881, + "grad_norm": 56.75180278708542, + "learning_rate": 4.659368660749574e-06, + "loss": 2.4811, + "step": 20048 + }, + { + "epoch": 1.7087701355152136, + "grad_norm": 59.030903786677314, + "learning_rate": 4.658873966599546e-06, + "loss": 3.5181, + "step": 20049 + }, + { + "epoch": 1.7088553652092389, + "grad_norm": 122.05259733919934, + "learning_rate": 4.658379275804355e-06, + "loss": 4.7195, + "step": 20050 + }, + { + "epoch": 1.7089405949032643, + "grad_norm": 58.28534695527402, + "learning_rate": 4.657884588368869e-06, + "loss": 2.2595, + "step": 20051 + }, + { + "epoch": 1.7090258245972896, + "grad_norm": 79.5461746481489, + "learning_rate": 4.6573899042979535e-06, + "loss": 2.7701, + "step": 20052 + }, + { + "epoch": 1.709111054291315, + "grad_norm": 43.3753145240707, + "learning_rate": 4.656895223596472e-06, + "loss": 2.7799, + "step": 20053 + }, + { + "epoch": 1.7091962839853405, + "grad_norm": 62.696806757214354, + "learning_rate": 4.656400546269287e-06, + "loss": 2.817, + "step": 20054 + }, + { + "epoch": 1.709281513679366, + "grad_norm": 45.499731975736715, + "learning_rate": 4.655905872321267e-06, + "loss": 2.9205, + "step": 20055 + }, + { + "epoch": 1.7093667433733915, + "grad_norm": 56.06308866904938, + "learning_rate": 4.655411201757278e-06, + "loss": 3.4225, + "step": 20056 + }, + { + "epoch": 1.7094519730674167, + "grad_norm": 44.963660078862425, + "learning_rate": 4.654916534582182e-06, + "loss": 3.5991, + "step": 20057 + }, + { + "epoch": 1.709537202761442, + "grad_norm": 44.31476543674926, + "learning_rate": 4.654421870800844e-06, + "loss": 2.8641, + "step": 20058 + }, + { + "epoch": 1.7096224324554674, + "grad_norm": 80.37937579416229, + "learning_rate": 4.653927210418129e-06, + "loss": 3.0475, + "step": 20059 + }, + { + "epoch": 1.7097076621494929, + "grad_norm": 105.93523159794799, + "learning_rate": 4.6534325534389045e-06, + "loss": 3.6664, + "step": 20060 + }, + { + "epoch": 1.7097928918435183, + "grad_norm": 56.50023365680108, + "learning_rate": 4.652937899868033e-06, + "loss": 3.4037, + "step": 20061 + }, + { + "epoch": 1.7098781215375438, + "grad_norm": 58.93825604054571, + "learning_rate": 4.652443249710378e-06, + "loss": 2.9956, + "step": 20062 + }, + { + "epoch": 1.709963351231569, + "grad_norm": 111.07847252744058, + "learning_rate": 4.651948602970804e-06, + "loss": 2.9309, + "step": 20063 + }, + { + "epoch": 1.7100485809255945, + "grad_norm": 68.91293258049893, + "learning_rate": 4.65145395965418e-06, + "loss": 2.7101, + "step": 20064 + }, + { + "epoch": 1.7101338106196198, + "grad_norm": 88.17783786829632, + "learning_rate": 4.650959319765368e-06, + "loss": 3.1576, + "step": 20065 + }, + { + "epoch": 1.7102190403136452, + "grad_norm": 44.871131673818134, + "learning_rate": 4.650464683309231e-06, + "loss": 2.6244, + "step": 20066 + }, + { + "epoch": 1.7103042700076707, + "grad_norm": 33.5139030200731, + "learning_rate": 4.649970050290634e-06, + "loss": 2.2274, + "step": 20067 + }, + { + "epoch": 1.7103894997016962, + "grad_norm": 48.0011781319751, + "learning_rate": 4.649475420714445e-06, + "loss": 2.6534, + "step": 20068 + }, + { + "epoch": 1.7104747293957214, + "grad_norm": 33.1315799053478, + "learning_rate": 4.6489807945855245e-06, + "loss": 3.0719, + "step": 20069 + }, + { + "epoch": 1.710559959089747, + "grad_norm": 31.07344895144538, + "learning_rate": 4.648486171908739e-06, + "loss": 2.2994, + "step": 20070 + }, + { + "epoch": 1.7106451887837721, + "grad_norm": 46.291346499763684, + "learning_rate": 4.647991552688952e-06, + "loss": 2.7809, + "step": 20071 + }, + { + "epoch": 1.7107304184777976, + "grad_norm": 49.8708181506838, + "learning_rate": 4.647496936931027e-06, + "loss": 3.3334, + "step": 20072 + }, + { + "epoch": 1.710815648171823, + "grad_norm": 33.75315746528572, + "learning_rate": 4.647002324639832e-06, + "loss": 1.9992, + "step": 20073 + }, + { + "epoch": 1.7109008778658485, + "grad_norm": 101.91129855989702, + "learning_rate": 4.646507715820227e-06, + "loss": 2.2067, + "step": 20074 + }, + { + "epoch": 1.710986107559874, + "grad_norm": 118.72932427549918, + "learning_rate": 4.64601311047708e-06, + "loss": 3.3937, + "step": 20075 + }, + { + "epoch": 1.7110713372538993, + "grad_norm": 40.27334746172971, + "learning_rate": 4.64551850861525e-06, + "loss": 2.8988, + "step": 20076 + }, + { + "epoch": 1.7111565669479245, + "grad_norm": 30.075561936095454, + "learning_rate": 4.645023910239608e-06, + "loss": 2.1525, + "step": 20077 + }, + { + "epoch": 1.71124179664195, + "grad_norm": 78.28104756190997, + "learning_rate": 4.644529315355013e-06, + "loss": 3.551, + "step": 20078 + }, + { + "epoch": 1.7113270263359754, + "grad_norm": 43.543151375963994, + "learning_rate": 4.644034723966332e-06, + "loss": 3.0794, + "step": 20079 + }, + { + "epoch": 1.711412256030001, + "grad_norm": 75.41938277574384, + "learning_rate": 4.643540136078426e-06, + "loss": 4.0486, + "step": 20080 + }, + { + "epoch": 1.7114974857240264, + "grad_norm": 79.31756637829412, + "learning_rate": 4.643045551696164e-06, + "loss": 4.1929, + "step": 20081 + }, + { + "epoch": 1.7115827154180516, + "grad_norm": 30.29184076740031, + "learning_rate": 4.6425509708244045e-06, + "loss": 2.7254, + "step": 20082 + }, + { + "epoch": 1.711667945112077, + "grad_norm": 34.36747691127226, + "learning_rate": 4.642056393468017e-06, + "loss": 2.9219, + "step": 20083 + }, + { + "epoch": 1.7117531748061023, + "grad_norm": 52.83786245542114, + "learning_rate": 4.641561819631859e-06, + "loss": 2.7525, + "step": 20084 + }, + { + "epoch": 1.7118384045001278, + "grad_norm": 34.55198891642362, + "learning_rate": 4.641067249320801e-06, + "loss": 2.7182, + "step": 20085 + }, + { + "epoch": 1.7119236341941533, + "grad_norm": 43.37680856662717, + "learning_rate": 4.640572682539703e-06, + "loss": 1.9864, + "step": 20086 + }, + { + "epoch": 1.7120088638881787, + "grad_norm": 59.95316044475638, + "learning_rate": 4.640078119293431e-06, + "loss": 2.0356, + "step": 20087 + }, + { + "epoch": 1.712094093582204, + "grad_norm": 34.77710365256896, + "learning_rate": 4.639583559586847e-06, + "loss": 3.2377, + "step": 20088 + }, + { + "epoch": 1.7121793232762295, + "grad_norm": 80.61520552717936, + "learning_rate": 4.6390890034248135e-06, + "loss": 3.2093, + "step": 20089 + }, + { + "epoch": 1.7122645529702547, + "grad_norm": 47.555963586124456, + "learning_rate": 4.638594450812198e-06, + "loss": 2.0252, + "step": 20090 + }, + { + "epoch": 1.7123497826642802, + "grad_norm": 37.02371611373623, + "learning_rate": 4.638099901753863e-06, + "loss": 1.9786, + "step": 20091 + }, + { + "epoch": 1.7124350123583056, + "grad_norm": 46.23252231407429, + "learning_rate": 4.637605356254672e-06, + "loss": 2.4238, + "step": 20092 + }, + { + "epoch": 1.712520242052331, + "grad_norm": 37.672684046944546, + "learning_rate": 4.637110814319485e-06, + "loss": 3.215, + "step": 20093 + }, + { + "epoch": 1.7126054717463566, + "grad_norm": 34.9730753842641, + "learning_rate": 4.636616275953172e-06, + "loss": 2.3326, + "step": 20094 + }, + { + "epoch": 1.7126907014403818, + "grad_norm": 54.19473810285499, + "learning_rate": 4.636121741160593e-06, + "loss": 2.9685, + "step": 20095 + }, + { + "epoch": 1.712775931134407, + "grad_norm": 36.26117873382227, + "learning_rate": 4.635627209946613e-06, + "loss": 3.0462, + "step": 20096 + }, + { + "epoch": 1.7128611608284325, + "grad_norm": 42.35444613430768, + "learning_rate": 4.635132682316092e-06, + "loss": 2.9, + "step": 20097 + }, + { + "epoch": 1.712946390522458, + "grad_norm": 39.275848187229116, + "learning_rate": 4.634638158273897e-06, + "loss": 2.6989, + "step": 20098 + }, + { + "epoch": 1.7130316202164835, + "grad_norm": 58.50956275373738, + "learning_rate": 4.634143637824892e-06, + "loss": 1.9629, + "step": 20099 + }, + { + "epoch": 1.713116849910509, + "grad_norm": 142.1739428851712, + "learning_rate": 4.633649120973939e-06, + "loss": 2.7109, + "step": 20100 + }, + { + "epoch": 1.7132020796045342, + "grad_norm": 41.06921689188928, + "learning_rate": 4.6331546077259e-06, + "loss": 3.1882, + "step": 20101 + }, + { + "epoch": 1.7132873092985597, + "grad_norm": 139.9389785780794, + "learning_rate": 4.632660098085638e-06, + "loss": 3.6416, + "step": 20102 + }, + { + "epoch": 1.713372538992585, + "grad_norm": 49.706532003850754, + "learning_rate": 4.632165592058021e-06, + "loss": 2.712, + "step": 20103 + }, + { + "epoch": 1.7134577686866104, + "grad_norm": 30.721599728946504, + "learning_rate": 4.631671089647909e-06, + "loss": 2.4378, + "step": 20104 + }, + { + "epoch": 1.7135429983806358, + "grad_norm": 37.22843358013257, + "learning_rate": 4.631176590860164e-06, + "loss": 2.8879, + "step": 20105 + }, + { + "epoch": 1.7136282280746613, + "grad_norm": 97.70664632144462, + "learning_rate": 4.63068209569965e-06, + "loss": 4.4476, + "step": 20106 + }, + { + "epoch": 1.7137134577686868, + "grad_norm": 73.90343297454697, + "learning_rate": 4.630187604171233e-06, + "loss": 3.4076, + "step": 20107 + }, + { + "epoch": 1.713798687462712, + "grad_norm": 41.33168808368, + "learning_rate": 4.629693116279774e-06, + "loss": 2.8099, + "step": 20108 + }, + { + "epoch": 1.7138839171567373, + "grad_norm": 65.29018048393115, + "learning_rate": 4.6291986320301355e-06, + "loss": 2.4793, + "step": 20109 + }, + { + "epoch": 1.7139691468507627, + "grad_norm": 61.51394112210878, + "learning_rate": 4.628704151427179e-06, + "loss": 2.4903, + "step": 20110 + }, + { + "epoch": 1.7140543765447882, + "grad_norm": 33.27468531905441, + "learning_rate": 4.628209674475773e-06, + "loss": 2.4898, + "step": 20111 + }, + { + "epoch": 1.7141396062388137, + "grad_norm": 76.17172083505027, + "learning_rate": 4.627715201180776e-06, + "loss": 1.3031, + "step": 20112 + }, + { + "epoch": 1.7142248359328391, + "grad_norm": 31.815974049517433, + "learning_rate": 4.627220731547051e-06, + "loss": 2.4383, + "step": 20113 + }, + { + "epoch": 1.7143100656268644, + "grad_norm": 27.96224494464118, + "learning_rate": 4.626726265579464e-06, + "loss": 2.4508, + "step": 20114 + }, + { + "epoch": 1.7143952953208896, + "grad_norm": 36.62904663329939, + "learning_rate": 4.626231803282873e-06, + "loss": 2.8092, + "step": 20115 + }, + { + "epoch": 1.714480525014915, + "grad_norm": 47.58995441435916, + "learning_rate": 4.6257373446621455e-06, + "loss": 2.7219, + "step": 20116 + }, + { + "epoch": 1.7145657547089406, + "grad_norm": 42.07291540219004, + "learning_rate": 4.6252428897221416e-06, + "loss": 3.1082, + "step": 20117 + }, + { + "epoch": 1.714650984402966, + "grad_norm": 68.04342177095731, + "learning_rate": 4.624748438467726e-06, + "loss": 3.1799, + "step": 20118 + }, + { + "epoch": 1.7147362140969915, + "grad_norm": 61.388226624170585, + "learning_rate": 4.624253990903759e-06, + "loss": 2.9218, + "step": 20119 + }, + { + "epoch": 1.7148214437910168, + "grad_norm": 53.55857610725861, + "learning_rate": 4.6237595470351065e-06, + "loss": 3.3484, + "step": 20120 + }, + { + "epoch": 1.7149066734850422, + "grad_norm": 88.8084913972806, + "learning_rate": 4.623265106866629e-06, + "loss": 3.0843, + "step": 20121 + }, + { + "epoch": 1.7149919031790675, + "grad_norm": 92.81593585196805, + "learning_rate": 4.622770670403189e-06, + "loss": 2.321, + "step": 20122 + }, + { + "epoch": 1.715077132873093, + "grad_norm": 95.04640285836095, + "learning_rate": 4.622276237649648e-06, + "loss": 2.8262, + "step": 20123 + }, + { + "epoch": 1.7151623625671184, + "grad_norm": 53.16111716759346, + "learning_rate": 4.621781808610872e-06, + "loss": 3.8035, + "step": 20124 + }, + { + "epoch": 1.7152475922611439, + "grad_norm": 39.53982914819291, + "learning_rate": 4.621287383291721e-06, + "loss": 3.9462, + "step": 20125 + }, + { + "epoch": 1.7153328219551693, + "grad_norm": 105.14249459745753, + "learning_rate": 4.62079296169706e-06, + "loss": 3.2234, + "step": 20126 + }, + { + "epoch": 1.7154180516491946, + "grad_norm": 78.53313794101584, + "learning_rate": 4.620298543831746e-06, + "loss": 2.8086, + "step": 20127 + }, + { + "epoch": 1.7155032813432198, + "grad_norm": 73.31188104591303, + "learning_rate": 4.619804129700648e-06, + "loss": 2.235, + "step": 20128 + }, + { + "epoch": 1.7155885110372453, + "grad_norm": 90.77986965522724, + "learning_rate": 4.619309719308623e-06, + "loss": 2.5412, + "step": 20129 + }, + { + "epoch": 1.7156737407312708, + "grad_norm": 43.71114236941714, + "learning_rate": 4.6188153126605385e-06, + "loss": 2.5549, + "step": 20130 + }, + { + "epoch": 1.7157589704252962, + "grad_norm": 70.70876805519032, + "learning_rate": 4.618320909761252e-06, + "loss": 3.56, + "step": 20131 + }, + { + "epoch": 1.7158442001193217, + "grad_norm": 78.34442653483542, + "learning_rate": 4.617826510615628e-06, + "loss": 2.9216, + "step": 20132 + }, + { + "epoch": 1.715929429813347, + "grad_norm": 75.53692097593434, + "learning_rate": 4.617332115228527e-06, + "loss": 3.4338, + "step": 20133 + }, + { + "epoch": 1.7160146595073724, + "grad_norm": 24.571614129294687, + "learning_rate": 4.616837723604815e-06, + "loss": 1.2652, + "step": 20134 + }, + { + "epoch": 1.7160998892013977, + "grad_norm": 51.90319565788997, + "learning_rate": 4.616343335749352e-06, + "loss": 2.6861, + "step": 20135 + }, + { + "epoch": 1.7161851188954231, + "grad_norm": 92.6594717620851, + "learning_rate": 4.615848951666996e-06, + "loss": 2.9455, + "step": 20136 + }, + { + "epoch": 1.7162703485894486, + "grad_norm": 105.78750191480898, + "learning_rate": 4.615354571362615e-06, + "loss": 2.1558, + "step": 20137 + }, + { + "epoch": 1.716355578283474, + "grad_norm": 43.82031811113961, + "learning_rate": 4.6148601948410706e-06, + "loss": 3.771, + "step": 20138 + }, + { + "epoch": 1.7164408079774993, + "grad_norm": 72.89602889350228, + "learning_rate": 4.614365822107223e-06, + "loss": 4.5677, + "step": 20139 + }, + { + "epoch": 1.7165260376715248, + "grad_norm": 226.0768336953422, + "learning_rate": 4.613871453165931e-06, + "loss": 2.8561, + "step": 20140 + }, + { + "epoch": 1.71661126736555, + "grad_norm": 49.51922767948956, + "learning_rate": 4.6133770880220615e-06, + "loss": 1.552, + "step": 20141 + }, + { + "epoch": 1.7166964970595755, + "grad_norm": 55.084148064593606, + "learning_rate": 4.612882726680476e-06, + "loss": 2.9482, + "step": 20142 + }, + { + "epoch": 1.716781726753601, + "grad_norm": 50.01669044186328, + "learning_rate": 4.612388369146036e-06, + "loss": 3.1248, + "step": 20143 + }, + { + "epoch": 1.7168669564476264, + "grad_norm": 36.57042638069882, + "learning_rate": 4.611894015423599e-06, + "loss": 2.9041, + "step": 20144 + }, + { + "epoch": 1.716952186141652, + "grad_norm": 71.0669615941928, + "learning_rate": 4.611399665518031e-06, + "loss": 3.4449, + "step": 20145 + }, + { + "epoch": 1.7170374158356771, + "grad_norm": 57.90831131614232, + "learning_rate": 4.610905319434195e-06, + "loss": 3.627, + "step": 20146 + }, + { + "epoch": 1.7171226455297024, + "grad_norm": 38.86012936971369, + "learning_rate": 4.61041097717695e-06, + "loss": 3.4564, + "step": 20147 + }, + { + "epoch": 1.7172078752237279, + "grad_norm": 125.99775907249463, + "learning_rate": 4.6099166387511566e-06, + "loss": 3.1664, + "step": 20148 + }, + { + "epoch": 1.7172931049177533, + "grad_norm": 50.05169182451154, + "learning_rate": 4.609422304161677e-06, + "loss": 2.5571, + "step": 20149 + }, + { + "epoch": 1.7173783346117788, + "grad_norm": 142.42335146856064, + "learning_rate": 4.608927973413377e-06, + "loss": 3.5096, + "step": 20150 + }, + { + "epoch": 1.7174635643058043, + "grad_norm": 32.70884066656823, + "learning_rate": 4.608433646511114e-06, + "loss": 2.3754, + "step": 20151 + }, + { + "epoch": 1.7175487939998295, + "grad_norm": 55.0494981914577, + "learning_rate": 4.607939323459751e-06, + "loss": 2.4238, + "step": 20152 + }, + { + "epoch": 1.717634023693855, + "grad_norm": 38.10207896087939, + "learning_rate": 4.607445004264147e-06, + "loss": 3.437, + "step": 20153 + }, + { + "epoch": 1.7177192533878802, + "grad_norm": 49.13291458555488, + "learning_rate": 4.606950688929167e-06, + "loss": 3.4597, + "step": 20154 + }, + { + "epoch": 1.7178044830819057, + "grad_norm": 46.95164765978474, + "learning_rate": 4.606456377459671e-06, + "loss": 2.924, + "step": 20155 + }, + { + "epoch": 1.7178897127759312, + "grad_norm": 52.81417674414919, + "learning_rate": 4.605962069860519e-06, + "loss": 2.2765, + "step": 20156 + }, + { + "epoch": 1.7179749424699566, + "grad_norm": 39.28688683800296, + "learning_rate": 4.6054677661365734e-06, + "loss": 3.4572, + "step": 20157 + }, + { + "epoch": 1.7180601721639819, + "grad_norm": 28.964775585949216, + "learning_rate": 4.6049734662926965e-06, + "loss": 2.2124, + "step": 20158 + }, + { + "epoch": 1.7181454018580073, + "grad_norm": 51.7214365127659, + "learning_rate": 4.604479170333749e-06, + "loss": 3.2761, + "step": 20159 + }, + { + "epoch": 1.7182306315520326, + "grad_norm": 28.37974908357546, + "learning_rate": 4.603984878264591e-06, + "loss": 2.4431, + "step": 20160 + }, + { + "epoch": 1.718315861246058, + "grad_norm": 82.22995405105434, + "learning_rate": 4.6034905900900845e-06, + "loss": 3.7627, + "step": 20161 + }, + { + "epoch": 1.7184010909400835, + "grad_norm": 67.777054479072, + "learning_rate": 4.602996305815089e-06, + "loss": 3.5662, + "step": 20162 + }, + { + "epoch": 1.718486320634109, + "grad_norm": 34.58248838047312, + "learning_rate": 4.602502025444469e-06, + "loss": 2.1937, + "step": 20163 + }, + { + "epoch": 1.7185715503281345, + "grad_norm": 37.38216076532514, + "learning_rate": 4.602007748983082e-06, + "loss": 2.8835, + "step": 20164 + }, + { + "epoch": 1.7186567800221597, + "grad_norm": 66.3908029476545, + "learning_rate": 4.6015134764357935e-06, + "loss": 2.8218, + "step": 20165 + }, + { + "epoch": 1.718742009716185, + "grad_norm": 57.38052820813083, + "learning_rate": 4.601019207807458e-06, + "loss": 2.6235, + "step": 20166 + }, + { + "epoch": 1.7188272394102104, + "grad_norm": 75.5925577355663, + "learning_rate": 4.600524943102942e-06, + "loss": 4.1855, + "step": 20167 + }, + { + "epoch": 1.718912469104236, + "grad_norm": 60.073320982218966, + "learning_rate": 4.600030682327105e-06, + "loss": 3.5494, + "step": 20168 + }, + { + "epoch": 1.7189976987982614, + "grad_norm": 39.57010710949187, + "learning_rate": 4.599536425484807e-06, + "loss": 3.5595, + "step": 20169 + }, + { + "epoch": 1.7190829284922868, + "grad_norm": 83.5108384357742, + "learning_rate": 4.5990421725809075e-06, + "loss": 3.0798, + "step": 20170 + }, + { + "epoch": 1.719168158186312, + "grad_norm": 91.38611729817889, + "learning_rate": 4.598547923620271e-06, + "loss": 2.6622, + "step": 20171 + }, + { + "epoch": 1.7192533878803375, + "grad_norm": 34.04859974284169, + "learning_rate": 4.5980536786077546e-06, + "loss": 2.9575, + "step": 20172 + }, + { + "epoch": 1.7193386175743628, + "grad_norm": 94.05579622376649, + "learning_rate": 4.597559437548222e-06, + "loss": 2.4792, + "step": 20173 + }, + { + "epoch": 1.7194238472683883, + "grad_norm": 38.26673954430856, + "learning_rate": 4.5970652004465324e-06, + "loss": 1.9099, + "step": 20174 + }, + { + "epoch": 1.7195090769624137, + "grad_norm": 60.37762344076787, + "learning_rate": 4.5965709673075445e-06, + "loss": 2.6505, + "step": 20175 + }, + { + "epoch": 1.7195943066564392, + "grad_norm": 55.38170798902694, + "learning_rate": 4.596076738136122e-06, + "loss": 3.0136, + "step": 20176 + }, + { + "epoch": 1.7196795363504647, + "grad_norm": 57.923000926191456, + "learning_rate": 4.595582512937125e-06, + "loss": 2.8896, + "step": 20177 + }, + { + "epoch": 1.71976476604449, + "grad_norm": 49.52076198596574, + "learning_rate": 4.595088291715413e-06, + "loss": 2.5486, + "step": 20178 + }, + { + "epoch": 1.7198499957385152, + "grad_norm": 38.52056816358492, + "learning_rate": 4.594594074475844e-06, + "loss": 1.5232, + "step": 20179 + }, + { + "epoch": 1.7199352254325406, + "grad_norm": 48.552960717805675, + "learning_rate": 4.594099861223284e-06, + "loss": 2.2085, + "step": 20180 + }, + { + "epoch": 1.720020455126566, + "grad_norm": 95.72719664990089, + "learning_rate": 4.59360565196259e-06, + "loss": 3.4108, + "step": 20181 + }, + { + "epoch": 1.7201056848205916, + "grad_norm": 56.40756712617415, + "learning_rate": 4.593111446698625e-06, + "loss": 3.6136, + "step": 20182 + }, + { + "epoch": 1.720190914514617, + "grad_norm": 57.44807619621398, + "learning_rate": 4.592617245436244e-06, + "loss": 2.3886, + "step": 20183 + }, + { + "epoch": 1.7202761442086423, + "grad_norm": 51.53815275299757, + "learning_rate": 4.592123048180311e-06, + "loss": 3.2578, + "step": 20184 + }, + { + "epoch": 1.7203613739026675, + "grad_norm": 55.517699675021035, + "learning_rate": 4.591628854935687e-06, + "loss": 2.4371, + "step": 20185 + }, + { + "epoch": 1.720446603596693, + "grad_norm": 48.11508258502285, + "learning_rate": 4.591134665707232e-06, + "loss": 3.0109, + "step": 20186 + }, + { + "epoch": 1.7205318332907185, + "grad_norm": 36.810906718618256, + "learning_rate": 4.590640480499802e-06, + "loss": 2.8709, + "step": 20187 + }, + { + "epoch": 1.720617062984744, + "grad_norm": 42.38786676781172, + "learning_rate": 4.5901462993182614e-06, + "loss": 2.6392, + "step": 20188 + }, + { + "epoch": 1.7207022926787694, + "grad_norm": 79.70023014457394, + "learning_rate": 4.5896521221674695e-06, + "loss": 2.311, + "step": 20189 + }, + { + "epoch": 1.7207875223727946, + "grad_norm": 54.12707903464143, + "learning_rate": 4.589157949052287e-06, + "loss": 3.3071, + "step": 20190 + }, + { + "epoch": 1.72087275206682, + "grad_norm": 67.11787432737617, + "learning_rate": 4.5886637799775715e-06, + "loss": 2.7807, + "step": 20191 + }, + { + "epoch": 1.7209579817608454, + "grad_norm": 64.31588720373885, + "learning_rate": 4.588169614948183e-06, + "loss": 3.5582, + "step": 20192 + }, + { + "epoch": 1.7210432114548708, + "grad_norm": 33.589269953978814, + "learning_rate": 4.5876754539689844e-06, + "loss": 2.7728, + "step": 20193 + }, + { + "epoch": 1.7211284411488963, + "grad_norm": 51.254200008964176, + "learning_rate": 4.5871812970448344e-06, + "loss": 3.1604, + "step": 20194 + }, + { + "epoch": 1.7212136708429218, + "grad_norm": 58.06773265831159, + "learning_rate": 4.586687144180591e-06, + "loss": 2.7383, + "step": 20195 + }, + { + "epoch": 1.7212989005369472, + "grad_norm": 29.390634463108, + "learning_rate": 4.586192995381114e-06, + "loss": 2.5716, + "step": 20196 + }, + { + "epoch": 1.7213841302309725, + "grad_norm": 44.90835992312167, + "learning_rate": 4.585698850651268e-06, + "loss": 3.8142, + "step": 20197 + }, + { + "epoch": 1.7214693599249977, + "grad_norm": 49.997049823783556, + "learning_rate": 4.585204709995908e-06, + "loss": 2.5435, + "step": 20198 + }, + { + "epoch": 1.7215545896190232, + "grad_norm": 73.93022766653966, + "learning_rate": 4.584710573419894e-06, + "loss": 2.7858, + "step": 20199 + }, + { + "epoch": 1.7216398193130487, + "grad_norm": 49.15938439975211, + "learning_rate": 4.584216440928085e-06, + "loss": 2.3317, + "step": 20200 + }, + { + "epoch": 1.7217250490070741, + "grad_norm": 49.90336951486062, + "learning_rate": 4.5837223125253445e-06, + "loss": 3.0787, + "step": 20201 + }, + { + "epoch": 1.7218102787010996, + "grad_norm": 30.27589311540829, + "learning_rate": 4.58322818821653e-06, + "loss": 2.6249, + "step": 20202 + }, + { + "epoch": 1.7218955083951248, + "grad_norm": 40.165122561659196, + "learning_rate": 4.5827340680065e-06, + "loss": 2.7532, + "step": 20203 + }, + { + "epoch": 1.7219807380891503, + "grad_norm": 32.101214922772975, + "learning_rate": 4.5822399519001145e-06, + "loss": 2.5775, + "step": 20204 + }, + { + "epoch": 1.7220659677831756, + "grad_norm": 41.95673341186705, + "learning_rate": 4.581745839902232e-06, + "loss": 2.6759, + "step": 20205 + }, + { + "epoch": 1.722151197477201, + "grad_norm": 77.15175655526761, + "learning_rate": 4.581251732017714e-06, + "loss": 3.4025, + "step": 20206 + }, + { + "epoch": 1.7222364271712265, + "grad_norm": 61.37339260458796, + "learning_rate": 4.580757628251419e-06, + "loss": 3.5462, + "step": 20207 + }, + { + "epoch": 1.722321656865252, + "grad_norm": 32.27580066759578, + "learning_rate": 4.580263528608207e-06, + "loss": 1.9596, + "step": 20208 + }, + { + "epoch": 1.7224068865592772, + "grad_norm": 75.47954962729646, + "learning_rate": 4.579769433092933e-06, + "loss": 3.7096, + "step": 20209 + }, + { + "epoch": 1.7224921162533027, + "grad_norm": 38.394036631466044, + "learning_rate": 4.579275341710463e-06, + "loss": 3.744, + "step": 20210 + }, + { + "epoch": 1.722577345947328, + "grad_norm": 37.8988168554407, + "learning_rate": 4.578781254465651e-06, + "loss": 2.9193, + "step": 20211 + }, + { + "epoch": 1.7226625756413534, + "grad_norm": 37.34818462978154, + "learning_rate": 4.578287171363359e-06, + "loss": 2.7716, + "step": 20212 + }, + { + "epoch": 1.7227478053353789, + "grad_norm": 96.87672559501318, + "learning_rate": 4.577793092408444e-06, + "loss": 3.4903, + "step": 20213 + }, + { + "epoch": 1.7228330350294043, + "grad_norm": 42.22970621674323, + "learning_rate": 4.577299017605768e-06, + "loss": 2.8312, + "step": 20214 + }, + { + "epoch": 1.7229182647234298, + "grad_norm": 36.77671061509869, + "learning_rate": 4.576804946960186e-06, + "loss": 2.4284, + "step": 20215 + }, + { + "epoch": 1.723003494417455, + "grad_norm": 39.642287426229785, + "learning_rate": 4.576310880476561e-06, + "loss": 2.4878, + "step": 20216 + }, + { + "epoch": 1.7230887241114803, + "grad_norm": 51.7272391969836, + "learning_rate": 4.575816818159751e-06, + "loss": 3.1485, + "step": 20217 + }, + { + "epoch": 1.7231739538055058, + "grad_norm": 45.870723892686115, + "learning_rate": 4.575322760014611e-06, + "loss": 3.5439, + "step": 20218 + }, + { + "epoch": 1.7232591834995312, + "grad_norm": 30.708249692626207, + "learning_rate": 4.574828706046004e-06, + "loss": 1.6878, + "step": 20219 + }, + { + "epoch": 1.7233444131935567, + "grad_norm": 32.40163120761484, + "learning_rate": 4.57433465625879e-06, + "loss": 2.5387, + "step": 20220 + }, + { + "epoch": 1.7234296428875822, + "grad_norm": 31.255820240730095, + "learning_rate": 4.5738406106578234e-06, + "loss": 2.1664, + "step": 20221 + }, + { + "epoch": 1.7235148725816074, + "grad_norm": 68.06092680528947, + "learning_rate": 4.5733465692479636e-06, + "loss": 2.6186, + "step": 20222 + }, + { + "epoch": 1.7236001022756329, + "grad_norm": 53.76690778909633, + "learning_rate": 4.572852532034073e-06, + "loss": 3.2213, + "step": 20223 + }, + { + "epoch": 1.7236853319696581, + "grad_norm": 99.2808137047268, + "learning_rate": 4.572358499021008e-06, + "loss": 2.0023, + "step": 20224 + }, + { + "epoch": 1.7237705616636836, + "grad_norm": 34.17029969607172, + "learning_rate": 4.571864470213627e-06, + "loss": 2.6097, + "step": 20225 + }, + { + "epoch": 1.723855791357709, + "grad_norm": 65.69249127782817, + "learning_rate": 4.571370445616788e-06, + "loss": 3.097, + "step": 20226 + }, + { + "epoch": 1.7239410210517345, + "grad_norm": 31.641379084097267, + "learning_rate": 4.57087642523535e-06, + "loss": 2.8959, + "step": 20227 + }, + { + "epoch": 1.7240262507457598, + "grad_norm": 42.0215962703774, + "learning_rate": 4.570382409074173e-06, + "loss": 1.96, + "step": 20228 + }, + { + "epoch": 1.7241114804397852, + "grad_norm": 56.461402108975214, + "learning_rate": 4.569888397138115e-06, + "loss": 3.6477, + "step": 20229 + }, + { + "epoch": 1.7241967101338105, + "grad_norm": 130.06056374897764, + "learning_rate": 4.569394389432031e-06, + "loss": 3.4036, + "step": 20230 + }, + { + "epoch": 1.724281939827836, + "grad_norm": 46.71686834069053, + "learning_rate": 4.568900385960784e-06, + "loss": 2.7517, + "step": 20231 + }, + { + "epoch": 1.7243671695218614, + "grad_norm": 38.2891930645436, + "learning_rate": 4.568406386729231e-06, + "loss": 2.9683, + "step": 20232 + }, + { + "epoch": 1.7244523992158869, + "grad_norm": 146.6427246631645, + "learning_rate": 4.56791239174223e-06, + "loss": 2.436, + "step": 20233 + }, + { + "epoch": 1.7245376289099124, + "grad_norm": 29.072618930733494, + "learning_rate": 4.567418401004637e-06, + "loss": 2.9931, + "step": 20234 + }, + { + "epoch": 1.7246228586039376, + "grad_norm": 72.28595373164707, + "learning_rate": 4.5669244145213125e-06, + "loss": 2.497, + "step": 20235 + }, + { + "epoch": 1.7247080882979628, + "grad_norm": 40.92442001294817, + "learning_rate": 4.566430432297115e-06, + "loss": 3.1108, + "step": 20236 + }, + { + "epoch": 1.7247933179919883, + "grad_norm": 46.412442934615136, + "learning_rate": 4.565936454336902e-06, + "loss": 3.4391, + "step": 20237 + }, + { + "epoch": 1.7248785476860138, + "grad_norm": 40.68559025342919, + "learning_rate": 4.565442480645531e-06, + "loss": 1.4345, + "step": 20238 + }, + { + "epoch": 1.7249637773800393, + "grad_norm": 59.904055165224904, + "learning_rate": 4.564948511227859e-06, + "loss": 3.6565, + "step": 20239 + }, + { + "epoch": 1.7250490070740647, + "grad_norm": 42.08037538121731, + "learning_rate": 4.564454546088749e-06, + "loss": 2.335, + "step": 20240 + }, + { + "epoch": 1.72513423676809, + "grad_norm": 58.12257126539883, + "learning_rate": 4.563960585233054e-06, + "loss": 2.7718, + "step": 20241 + }, + { + "epoch": 1.7252194664621154, + "grad_norm": 36.08745273583307, + "learning_rate": 4.5634666286656325e-06, + "loss": 2.6407, + "step": 20242 + }, + { + "epoch": 1.7253046961561407, + "grad_norm": 41.48927624984363, + "learning_rate": 4.5629726763913425e-06, + "loss": 2.7609, + "step": 20243 + }, + { + "epoch": 1.7253899258501662, + "grad_norm": 39.9848455704213, + "learning_rate": 4.562478728415044e-06, + "loss": 2.9377, + "step": 20244 + }, + { + "epoch": 1.7254751555441916, + "grad_norm": 30.16479891808285, + "learning_rate": 4.561984784741595e-06, + "loss": 1.7733, + "step": 20245 + }, + { + "epoch": 1.725560385238217, + "grad_norm": 65.76774251064636, + "learning_rate": 4.561490845375849e-06, + "loss": 2.2315, + "step": 20246 + }, + { + "epoch": 1.7256456149322426, + "grad_norm": 66.40915131297928, + "learning_rate": 4.560996910322668e-06, + "loss": 3.0764, + "step": 20247 + }, + { + "epoch": 1.7257308446262678, + "grad_norm": 66.48624539355458, + "learning_rate": 4.5605029795869054e-06, + "loss": 2.9767, + "step": 20248 + }, + { + "epoch": 1.725816074320293, + "grad_norm": 42.6124150231046, + "learning_rate": 4.560009053173423e-06, + "loss": 2.9667, + "step": 20249 + }, + { + "epoch": 1.7259013040143185, + "grad_norm": 39.13680197361465, + "learning_rate": 4.559515131087078e-06, + "loss": 3.1624, + "step": 20250 + }, + { + "epoch": 1.725986533708344, + "grad_norm": 38.80557690498191, + "learning_rate": 4.559021213332725e-06, + "loss": 3.4106, + "step": 20251 + }, + { + "epoch": 1.7260717634023695, + "grad_norm": 46.007908861047824, + "learning_rate": 4.558527299915222e-06, + "loss": 2.7435, + "step": 20252 + }, + { + "epoch": 1.726156993096395, + "grad_norm": 64.48850421415361, + "learning_rate": 4.5580333908394304e-06, + "loss": 4.0561, + "step": 20253 + }, + { + "epoch": 1.7262422227904202, + "grad_norm": 80.53272779295249, + "learning_rate": 4.557539486110203e-06, + "loss": 3.6273, + "step": 20254 + }, + { + "epoch": 1.7263274524844456, + "grad_norm": 70.75515643209295, + "learning_rate": 4.5570455857324e-06, + "loss": 3.6041, + "step": 20255 + }, + { + "epoch": 1.7264126821784709, + "grad_norm": 52.13502724482734, + "learning_rate": 4.556551689710876e-06, + "loss": 1.943, + "step": 20256 + }, + { + "epoch": 1.7264979118724963, + "grad_norm": 81.0642017868527, + "learning_rate": 4.5560577980504915e-06, + "loss": 3.1964, + "step": 20257 + }, + { + "epoch": 1.7265831415665218, + "grad_norm": 36.77297366255086, + "learning_rate": 4.555563910756102e-06, + "loss": 2.6217, + "step": 20258 + }, + { + "epoch": 1.7266683712605473, + "grad_norm": 30.40854516886308, + "learning_rate": 4.555070027832565e-06, + "loss": 2.6477, + "step": 20259 + }, + { + "epoch": 1.7267536009545725, + "grad_norm": 49.2342286323388, + "learning_rate": 4.554576149284737e-06, + "loss": 2.1365, + "step": 20260 + }, + { + "epoch": 1.726838830648598, + "grad_norm": 74.56051755884008, + "learning_rate": 4.5540822751174745e-06, + "loss": 4.0808, + "step": 20261 + }, + { + "epoch": 1.7269240603426232, + "grad_norm": 21.787872282763864, + "learning_rate": 4.553588405335637e-06, + "loss": 1.6361, + "step": 20262 + }, + { + "epoch": 1.7270092900366487, + "grad_norm": 50.594242985162104, + "learning_rate": 4.55309453994408e-06, + "loss": 2.2561, + "step": 20263 + }, + { + "epoch": 1.7270945197306742, + "grad_norm": 71.5808598874113, + "learning_rate": 4.552600678947661e-06, + "loss": 3.6421, + "step": 20264 + }, + { + "epoch": 1.7271797494246997, + "grad_norm": 123.63119170954359, + "learning_rate": 4.552106822351235e-06, + "loss": 2.9985, + "step": 20265 + }, + { + "epoch": 1.7272649791187251, + "grad_norm": 41.34353830237921, + "learning_rate": 4.5516129701596615e-06, + "loss": 2.6764, + "step": 20266 + }, + { + "epoch": 1.7273502088127504, + "grad_norm": 42.450725670310725, + "learning_rate": 4.551119122377796e-06, + "loss": 3.2312, + "step": 20267 + }, + { + "epoch": 1.7274354385067756, + "grad_norm": 53.06352316834878, + "learning_rate": 4.550625279010497e-06, + "loss": 2.3745, + "step": 20268 + }, + { + "epoch": 1.727520668200801, + "grad_norm": 37.739794556693255, + "learning_rate": 4.550131440062617e-06, + "loss": 3.2177, + "step": 20269 + }, + { + "epoch": 1.7276058978948265, + "grad_norm": 23.787896337716067, + "learning_rate": 4.549637605539017e-06, + "loss": 1.7697, + "step": 20270 + }, + { + "epoch": 1.727691127588852, + "grad_norm": 55.68442984079184, + "learning_rate": 4.549143775444553e-06, + "loss": 3.0618, + "step": 20271 + }, + { + "epoch": 1.7277763572828775, + "grad_norm": 73.93582113499158, + "learning_rate": 4.548649949784081e-06, + "loss": 2.746, + "step": 20272 + }, + { + "epoch": 1.7278615869769027, + "grad_norm": 90.77964909851097, + "learning_rate": 4.5481561285624545e-06, + "loss": 3.8772, + "step": 20273 + }, + { + "epoch": 1.7279468166709282, + "grad_norm": 36.633248307378736, + "learning_rate": 4.5476623117845345e-06, + "loss": 2.73, + "step": 20274 + }, + { + "epoch": 1.7280320463649534, + "grad_norm": 61.98871993065241, + "learning_rate": 4.547168499455177e-06, + "loss": 2.8885, + "step": 20275 + }, + { + "epoch": 1.728117276058979, + "grad_norm": 88.32550012770417, + "learning_rate": 4.546674691579238e-06, + "loss": 3.0759, + "step": 20276 + }, + { + "epoch": 1.7282025057530044, + "grad_norm": 36.532750493311056, + "learning_rate": 4.5461808881615714e-06, + "loss": 2.8767, + "step": 20277 + }, + { + "epoch": 1.7282877354470298, + "grad_norm": 19.96651637717032, + "learning_rate": 4.545687089207035e-06, + "loss": 1.5773, + "step": 20278 + }, + { + "epoch": 1.728372965141055, + "grad_norm": 41.606636302774376, + "learning_rate": 4.545193294720488e-06, + "loss": 2.6773, + "step": 20279 + }, + { + "epoch": 1.7284581948350806, + "grad_norm": 80.93717228462975, + "learning_rate": 4.544699504706783e-06, + "loss": 3.1656, + "step": 20280 + }, + { + "epoch": 1.7285434245291058, + "grad_norm": 58.808826673852835, + "learning_rate": 4.544205719170777e-06, + "loss": 2.5155, + "step": 20281 + }, + { + "epoch": 1.7286286542231313, + "grad_norm": 42.40905241770051, + "learning_rate": 4.543711938117325e-06, + "loss": 3.01, + "step": 20282 + }, + { + "epoch": 1.7287138839171567, + "grad_norm": 23.719602214308654, + "learning_rate": 4.543218161551288e-06, + "loss": 1.563, + "step": 20283 + }, + { + "epoch": 1.7287991136111822, + "grad_norm": 81.94962406087183, + "learning_rate": 4.542724389477518e-06, + "loss": 2.9337, + "step": 20284 + }, + { + "epoch": 1.7288843433052077, + "grad_norm": 70.80638381857534, + "learning_rate": 4.542230621900871e-06, + "loss": 3.5091, + "step": 20285 + }, + { + "epoch": 1.728969572999233, + "grad_norm": 38.19516618536734, + "learning_rate": 4.5417368588262024e-06, + "loss": 2.9949, + "step": 20286 + }, + { + "epoch": 1.7290548026932582, + "grad_norm": 37.62535158550312, + "learning_rate": 4.541243100258373e-06, + "loss": 2.7848, + "step": 20287 + }, + { + "epoch": 1.7291400323872836, + "grad_norm": 44.625990032947925, + "learning_rate": 4.540749346202235e-06, + "loss": 2.9705, + "step": 20288 + }, + { + "epoch": 1.7292252620813091, + "grad_norm": 67.63169535243983, + "learning_rate": 4.540255596662643e-06, + "loss": 2.9283, + "step": 20289 + }, + { + "epoch": 1.7293104917753346, + "grad_norm": 72.30845804345633, + "learning_rate": 4.5397618516444565e-06, + "loss": 2.257, + "step": 20290 + }, + { + "epoch": 1.72939572146936, + "grad_norm": 34.43049043190422, + "learning_rate": 4.539268111152527e-06, + "loss": 2.7476, + "step": 20291 + }, + { + "epoch": 1.7294809511633853, + "grad_norm": 36.055828612162244, + "learning_rate": 4.5387743751917144e-06, + "loss": 3.1422, + "step": 20292 + }, + { + "epoch": 1.7295661808574108, + "grad_norm": 43.18011212949198, + "learning_rate": 4.538280643766871e-06, + "loss": 2.382, + "step": 20293 + }, + { + "epoch": 1.729651410551436, + "grad_norm": 91.81939514747023, + "learning_rate": 4.537786916882856e-06, + "loss": 3.8842, + "step": 20294 + }, + { + "epoch": 1.7297366402454615, + "grad_norm": 95.39627853323664, + "learning_rate": 4.5372931945445205e-06, + "loss": 3.5084, + "step": 20295 + }, + { + "epoch": 1.729821869939487, + "grad_norm": 39.09011835747418, + "learning_rate": 4.5367994767567245e-06, + "loss": 2.8008, + "step": 20296 + }, + { + "epoch": 1.7299070996335124, + "grad_norm": 48.614991600169574, + "learning_rate": 4.536305763524321e-06, + "loss": 3.7802, + "step": 20297 + }, + { + "epoch": 1.7299923293275377, + "grad_norm": 36.78622070728595, + "learning_rate": 4.5358120548521665e-06, + "loss": 2.6098, + "step": 20298 + }, + { + "epoch": 1.7300775590215631, + "grad_norm": 80.99777415787223, + "learning_rate": 4.535318350745114e-06, + "loss": 1.8417, + "step": 20299 + }, + { + "epoch": 1.7301627887155884, + "grad_norm": 87.39991530905017, + "learning_rate": 4.534824651208024e-06, + "loss": 3.1475, + "step": 20300 + }, + { + "epoch": 1.7302480184096138, + "grad_norm": 49.18721242918247, + "learning_rate": 4.534330956245746e-06, + "loss": 2.5245, + "step": 20301 + }, + { + "epoch": 1.7303332481036393, + "grad_norm": 39.584179475085456, + "learning_rate": 4.533837265863141e-06, + "loss": 2.062, + "step": 20302 + }, + { + "epoch": 1.7304184777976648, + "grad_norm": 35.524882803303534, + "learning_rate": 4.533343580065057e-06, + "loss": 3.0842, + "step": 20303 + }, + { + "epoch": 1.7305037074916902, + "grad_norm": 73.17003437401917, + "learning_rate": 4.532849898856357e-06, + "loss": 2.7393, + "step": 20304 + }, + { + "epoch": 1.7305889371857155, + "grad_norm": 74.94197156401988, + "learning_rate": 4.532356222241891e-06, + "loss": 2.8304, + "step": 20305 + }, + { + "epoch": 1.7306741668797407, + "grad_norm": 39.97555796162677, + "learning_rate": 4.531862550226517e-06, + "loss": 2.5899, + "step": 20306 + }, + { + "epoch": 1.7307593965737662, + "grad_norm": 81.14154189907374, + "learning_rate": 4.531368882815089e-06, + "loss": 2.5846, + "step": 20307 + }, + { + "epoch": 1.7308446262677917, + "grad_norm": 39.43599075337538, + "learning_rate": 4.530875220012459e-06, + "loss": 2.9175, + "step": 20308 + }, + { + "epoch": 1.7309298559618171, + "grad_norm": 55.81874541592443, + "learning_rate": 4.5303815618234865e-06, + "loss": 3.2798, + "step": 20309 + }, + { + "epoch": 1.7310150856558426, + "grad_norm": 124.0204599558052, + "learning_rate": 4.529887908253026e-06, + "loss": 3.847, + "step": 20310 + }, + { + "epoch": 1.7311003153498679, + "grad_norm": 52.99657128115348, + "learning_rate": 4.52939425930593e-06, + "loss": 3.8416, + "step": 20311 + }, + { + "epoch": 1.7311855450438933, + "grad_norm": 120.73626803969155, + "learning_rate": 4.528900614987053e-06, + "loss": 3.0686, + "step": 20312 + }, + { + "epoch": 1.7312707747379186, + "grad_norm": 51.197209964866374, + "learning_rate": 4.5284069753012525e-06, + "loss": 2.5848, + "step": 20313 + }, + { + "epoch": 1.731356004431944, + "grad_norm": 43.27365261064215, + "learning_rate": 4.527913340253383e-06, + "loss": 2.84, + "step": 20314 + }, + { + "epoch": 1.7314412341259695, + "grad_norm": 45.88395305422825, + "learning_rate": 4.527419709848298e-06, + "loss": 3.0348, + "step": 20315 + }, + { + "epoch": 1.731526463819995, + "grad_norm": 51.70043847009267, + "learning_rate": 4.52692608409085e-06, + "loss": 2.5135, + "step": 20316 + }, + { + "epoch": 1.7316116935140204, + "grad_norm": 36.472353139569016, + "learning_rate": 4.526432462985898e-06, + "loss": 2.329, + "step": 20317 + }, + { + "epoch": 1.7316969232080457, + "grad_norm": 56.68178560706852, + "learning_rate": 4.525938846538294e-06, + "loss": 2.6193, + "step": 20318 + }, + { + "epoch": 1.731782152902071, + "grad_norm": 53.80213268517193, + "learning_rate": 4.525445234752895e-06, + "loss": 3.3752, + "step": 20319 + }, + { + "epoch": 1.7318673825960964, + "grad_norm": 37.892698684334945, + "learning_rate": 4.524951627634552e-06, + "loss": 2.8989, + "step": 20320 + }, + { + "epoch": 1.7319526122901219, + "grad_norm": 98.19093217448825, + "learning_rate": 4.524458025188119e-06, + "loss": 2.9377, + "step": 20321 + }, + { + "epoch": 1.7320378419841473, + "grad_norm": 21.536346925191644, + "learning_rate": 4.5239644274184555e-06, + "loss": 1.4948, + "step": 20322 + }, + { + "epoch": 1.7321230716781728, + "grad_norm": 114.96255873840619, + "learning_rate": 4.523470834330412e-06, + "loss": 3.4108, + "step": 20323 + }, + { + "epoch": 1.732208301372198, + "grad_norm": 91.74986958229691, + "learning_rate": 4.522977245928842e-06, + "loss": 2.785, + "step": 20324 + }, + { + "epoch": 1.7322935310662235, + "grad_norm": 50.66409657107862, + "learning_rate": 4.522483662218601e-06, + "loss": 3.1162, + "step": 20325 + }, + { + "epoch": 1.7323787607602488, + "grad_norm": 34.78429800667263, + "learning_rate": 4.5219900832045455e-06, + "loss": 2.5971, + "step": 20326 + }, + { + "epoch": 1.7324639904542742, + "grad_norm": 60.621722619374914, + "learning_rate": 4.521496508891528e-06, + "loss": 2.961, + "step": 20327 + }, + { + "epoch": 1.7325492201482997, + "grad_norm": 42.40836343757282, + "learning_rate": 4.5210029392844e-06, + "loss": 2.966, + "step": 20328 + }, + { + "epoch": 1.7326344498423252, + "grad_norm": 44.98024235476464, + "learning_rate": 4.520509374388017e-06, + "loss": 3.033, + "step": 20329 + }, + { + "epoch": 1.7327196795363504, + "grad_norm": 61.65222267939134, + "learning_rate": 4.520015814207237e-06, + "loss": 2.7368, + "step": 20330 + }, + { + "epoch": 1.732804909230376, + "grad_norm": 49.560060239526514, + "learning_rate": 4.519522258746909e-06, + "loss": 3.7059, + "step": 20331 + }, + { + "epoch": 1.7328901389244011, + "grad_norm": 56.73791359213708, + "learning_rate": 4.519028708011889e-06, + "loss": 2.6395, + "step": 20332 + }, + { + "epoch": 1.7329753686184266, + "grad_norm": 64.28673264231503, + "learning_rate": 4.5185351620070315e-06, + "loss": 1.8488, + "step": 20333 + }, + { + "epoch": 1.733060598312452, + "grad_norm": 43.403497800178116, + "learning_rate": 4.518041620737187e-06, + "loss": 1.7727, + "step": 20334 + }, + { + "epoch": 1.7331458280064775, + "grad_norm": 42.37965353232508, + "learning_rate": 4.517548084207213e-06, + "loss": 2.7693, + "step": 20335 + }, + { + "epoch": 1.733231057700503, + "grad_norm": 39.891998612438414, + "learning_rate": 4.517054552421962e-06, + "loss": 2.6462, + "step": 20336 + }, + { + "epoch": 1.7333162873945283, + "grad_norm": 73.07991593311586, + "learning_rate": 4.516561025386288e-06, + "loss": 4.8398, + "step": 20337 + }, + { + "epoch": 1.7334015170885535, + "grad_norm": 29.528641036919996, + "learning_rate": 4.5160675031050425e-06, + "loss": 2.2299, + "step": 20338 + }, + { + "epoch": 1.733486746782579, + "grad_norm": 37.66883778118739, + "learning_rate": 4.515573985583082e-06, + "loss": 2.0821, + "step": 20339 + }, + { + "epoch": 1.7335719764766044, + "grad_norm": 39.12673625151499, + "learning_rate": 4.515080472825259e-06, + "loss": 1.9672, + "step": 20340 + }, + { + "epoch": 1.73365720617063, + "grad_norm": 66.39656404924449, + "learning_rate": 4.514586964836427e-06, + "loss": 2.047, + "step": 20341 + }, + { + "epoch": 1.7337424358646554, + "grad_norm": 38.797547185411254, + "learning_rate": 4.514093461621437e-06, + "loss": 2.5212, + "step": 20342 + }, + { + "epoch": 1.7338276655586806, + "grad_norm": 39.48270374813333, + "learning_rate": 4.513599963185148e-06, + "loss": 3.0091, + "step": 20343 + }, + { + "epoch": 1.733912895252706, + "grad_norm": 60.971317720003086, + "learning_rate": 4.513106469532408e-06, + "loss": 3.1496, + "step": 20344 + }, + { + "epoch": 1.7339981249467313, + "grad_norm": 62.56472169047512, + "learning_rate": 4.512612980668074e-06, + "loss": 2.9733, + "step": 20345 + }, + { + "epoch": 1.7340833546407568, + "grad_norm": 79.09064426684164, + "learning_rate": 4.5121194965969956e-06, + "loss": 2.8192, + "step": 20346 + }, + { + "epoch": 1.7341685843347823, + "grad_norm": 36.84470990216722, + "learning_rate": 4.51162601732403e-06, + "loss": 3.2489, + "step": 20347 + }, + { + "epoch": 1.7342538140288077, + "grad_norm": 52.50172586330224, + "learning_rate": 4.511132542854027e-06, + "loss": 2.5625, + "step": 20348 + }, + { + "epoch": 1.734339043722833, + "grad_norm": 60.63844045448667, + "learning_rate": 4.510639073191843e-06, + "loss": 3.4467, + "step": 20349 + }, + { + "epoch": 1.7344242734168585, + "grad_norm": 32.423843058074304, + "learning_rate": 4.510145608342329e-06, + "loss": 2.1141, + "step": 20350 + }, + { + "epoch": 1.7345095031108837, + "grad_norm": 45.03876854310009, + "learning_rate": 4.509652148310336e-06, + "loss": 2.3657, + "step": 20351 + }, + { + "epoch": 1.7345947328049092, + "grad_norm": 38.31536723589109, + "learning_rate": 4.509158693100721e-06, + "loss": 2.8819, + "step": 20352 + }, + { + "epoch": 1.7346799624989346, + "grad_norm": 43.1390865037521, + "learning_rate": 4.508665242718335e-06, + "loss": 2.9724, + "step": 20353 + }, + { + "epoch": 1.73476519219296, + "grad_norm": 34.86642689756626, + "learning_rate": 4.508171797168031e-06, + "loss": 2.6359, + "step": 20354 + }, + { + "epoch": 1.7348504218869856, + "grad_norm": 63.06151024674721, + "learning_rate": 4.507678356454662e-06, + "loss": 3.7064, + "step": 20355 + }, + { + "epoch": 1.7349356515810108, + "grad_norm": 38.865058813797916, + "learning_rate": 4.50718492058308e-06, + "loss": 2.8032, + "step": 20356 + }, + { + "epoch": 1.735020881275036, + "grad_norm": 66.20285650277859, + "learning_rate": 4.50669148955814e-06, + "loss": 2.855, + "step": 20357 + }, + { + "epoch": 1.7351061109690615, + "grad_norm": 71.2659301695589, + "learning_rate": 4.506198063384693e-06, + "loss": 3.2629, + "step": 20358 + }, + { + "epoch": 1.735191340663087, + "grad_norm": 46.888052070054385, + "learning_rate": 4.5057046420675894e-06, + "loss": 2.2575, + "step": 20359 + }, + { + "epoch": 1.7352765703571125, + "grad_norm": 43.20092506351004, + "learning_rate": 4.5052112256116866e-06, + "loss": 3.3071, + "step": 20360 + }, + { + "epoch": 1.735361800051138, + "grad_norm": 28.75346990822544, + "learning_rate": 4.504717814021835e-06, + "loss": 2.3961, + "step": 20361 + }, + { + "epoch": 1.7354470297451632, + "grad_norm": 49.23786558432069, + "learning_rate": 4.504224407302886e-06, + "loss": 3.0382, + "step": 20362 + }, + { + "epoch": 1.7355322594391887, + "grad_norm": 57.986204593805226, + "learning_rate": 4.5037310054596936e-06, + "loss": 2.6775, + "step": 20363 + }, + { + "epoch": 1.735617489133214, + "grad_norm": 32.72081526749782, + "learning_rate": 4.503237608497107e-06, + "loss": 2.9523, + "step": 20364 + }, + { + "epoch": 1.7357027188272394, + "grad_norm": 33.50657067047487, + "learning_rate": 4.502744216419984e-06, + "loss": 2.8707, + "step": 20365 + }, + { + "epoch": 1.7357879485212648, + "grad_norm": 58.13479351677664, + "learning_rate": 4.502250829233174e-06, + "loss": 2.815, + "step": 20366 + }, + { + "epoch": 1.7358731782152903, + "grad_norm": 28.4087611803884, + "learning_rate": 4.501757446941528e-06, + "loss": 2.4595, + "step": 20367 + }, + { + "epoch": 1.7359584079093158, + "grad_norm": 60.52091236022085, + "learning_rate": 4.5012640695498984e-06, + "loss": 3.2761, + "step": 20368 + }, + { + "epoch": 1.736043637603341, + "grad_norm": 70.15279177450205, + "learning_rate": 4.500770697063141e-06, + "loss": 2.9116, + "step": 20369 + }, + { + "epoch": 1.7361288672973663, + "grad_norm": 50.74188979542485, + "learning_rate": 4.500277329486106e-06, + "loss": 2.726, + "step": 20370 + }, + { + "epoch": 1.7362140969913917, + "grad_norm": 35.614134241423585, + "learning_rate": 4.499783966823643e-06, + "loss": 2.9773, + "step": 20371 + }, + { + "epoch": 1.7362993266854172, + "grad_norm": 108.60865221144618, + "learning_rate": 4.499290609080605e-06, + "loss": 2.4418, + "step": 20372 + }, + { + "epoch": 1.7363845563794427, + "grad_norm": 60.01822815943185, + "learning_rate": 4.498797256261847e-06, + "loss": 3.5096, + "step": 20373 + }, + { + "epoch": 1.7364697860734681, + "grad_norm": 32.150073138862275, + "learning_rate": 4.498303908372219e-06, + "loss": 3.5282, + "step": 20374 + }, + { + "epoch": 1.7365550157674934, + "grad_norm": 107.38526824191048, + "learning_rate": 4.497810565416571e-06, + "loss": 3.6756, + "step": 20375 + }, + { + "epoch": 1.7366402454615186, + "grad_norm": 50.112466389290965, + "learning_rate": 4.497317227399756e-06, + "loss": 3.2867, + "step": 20376 + }, + { + "epoch": 1.736725475155544, + "grad_norm": 46.198927006982444, + "learning_rate": 4.496823894326629e-06, + "loss": 2.6068, + "step": 20377 + }, + { + "epoch": 1.7368107048495696, + "grad_norm": 48.2711243181514, + "learning_rate": 4.496330566202038e-06, + "loss": 2.9282, + "step": 20378 + }, + { + "epoch": 1.736895934543595, + "grad_norm": 46.91194632597732, + "learning_rate": 4.495837243030835e-06, + "loss": 3.2861, + "step": 20379 + }, + { + "epoch": 1.7369811642376205, + "grad_norm": 47.13356301084033, + "learning_rate": 4.495343924817874e-06, + "loss": 3.2193, + "step": 20380 + }, + { + "epoch": 1.7370663939316457, + "grad_norm": 47.762157904069824, + "learning_rate": 4.494850611568002e-06, + "loss": 2.801, + "step": 20381 + }, + { + "epoch": 1.7371516236256712, + "grad_norm": 96.9621654631348, + "learning_rate": 4.494357303286076e-06, + "loss": 2.9979, + "step": 20382 + }, + { + "epoch": 1.7372368533196965, + "grad_norm": 40.772927017087326, + "learning_rate": 4.493863999976945e-06, + "loss": 2.9309, + "step": 20383 + }, + { + "epoch": 1.737322083013722, + "grad_norm": 86.2753194015658, + "learning_rate": 4.493370701645461e-06, + "loss": 2.8075, + "step": 20384 + }, + { + "epoch": 1.7374073127077474, + "grad_norm": 35.594693021759575, + "learning_rate": 4.492877408296471e-06, + "loss": 2.2847, + "step": 20385 + }, + { + "epoch": 1.7374925424017729, + "grad_norm": 41.77209532691463, + "learning_rate": 4.492384119934834e-06, + "loss": 2.3182, + "step": 20386 + }, + { + "epoch": 1.7375777720957983, + "grad_norm": 34.015007551390475, + "learning_rate": 4.491890836565396e-06, + "loss": 3.5127, + "step": 20387 + }, + { + "epoch": 1.7376630017898236, + "grad_norm": 38.26914668319966, + "learning_rate": 4.4913975581930106e-06, + "loss": 1.8921, + "step": 20388 + }, + { + "epoch": 1.7377482314838488, + "grad_norm": 67.44047200423272, + "learning_rate": 4.490904284822526e-06, + "loss": 2.6642, + "step": 20389 + }, + { + "epoch": 1.7378334611778743, + "grad_norm": 58.2178307630193, + "learning_rate": 4.4904110164587975e-06, + "loss": 3.2006, + "step": 20390 + }, + { + "epoch": 1.7379186908718998, + "grad_norm": 77.32894575774523, + "learning_rate": 4.489917753106673e-06, + "loss": 3.3269, + "step": 20391 + }, + { + "epoch": 1.7380039205659252, + "grad_norm": 33.23905518237723, + "learning_rate": 4.4894244947710065e-06, + "loss": 3.1585, + "step": 20392 + }, + { + "epoch": 1.7380891502599507, + "grad_norm": 33.99153100875596, + "learning_rate": 4.488931241456647e-06, + "loss": 2.3454, + "step": 20393 + }, + { + "epoch": 1.738174379953976, + "grad_norm": 45.67494506136627, + "learning_rate": 4.488437993168443e-06, + "loss": 3.0758, + "step": 20394 + }, + { + "epoch": 1.7382596096480014, + "grad_norm": 26.574026892497248, + "learning_rate": 4.487944749911249e-06, + "loss": 2.4162, + "step": 20395 + }, + { + "epoch": 1.7383448393420267, + "grad_norm": 27.854382012163573, + "learning_rate": 4.487451511689917e-06, + "loss": 2.7738, + "step": 20396 + }, + { + "epoch": 1.7384300690360521, + "grad_norm": 51.550304454212124, + "learning_rate": 4.4869582785092955e-06, + "loss": 2.5914, + "step": 20397 + }, + { + "epoch": 1.7385152987300776, + "grad_norm": 44.39357319509029, + "learning_rate": 4.486465050374233e-06, + "loss": 2.3027, + "step": 20398 + }, + { + "epoch": 1.738600528424103, + "grad_norm": 80.66920512164567, + "learning_rate": 4.485971827289584e-06, + "loss": 3.1901, + "step": 20399 + }, + { + "epoch": 1.7386857581181283, + "grad_norm": 35.05178158566513, + "learning_rate": 4.485478609260198e-06, + "loss": 2.8061, + "step": 20400 + }, + { + "epoch": 1.7387709878121538, + "grad_norm": 28.356017296780987, + "learning_rate": 4.484985396290926e-06, + "loss": 2.4308, + "step": 20401 + }, + { + "epoch": 1.738856217506179, + "grad_norm": 34.17907505999252, + "learning_rate": 4.484492188386616e-06, + "loss": 1.7081, + "step": 20402 + }, + { + "epoch": 1.7389414472002045, + "grad_norm": 42.38096241526248, + "learning_rate": 4.483998985552121e-06, + "loss": 3.0259, + "step": 20403 + }, + { + "epoch": 1.73902667689423, + "grad_norm": 47.58059109921056, + "learning_rate": 4.483505787792293e-06, + "loss": 2.5715, + "step": 20404 + }, + { + "epoch": 1.7391119065882554, + "grad_norm": 58.492676946149516, + "learning_rate": 4.48301259511198e-06, + "loss": 3.1031, + "step": 20405 + }, + { + "epoch": 1.739197136282281, + "grad_norm": 38.79512738451405, + "learning_rate": 4.482519407516031e-06, + "loss": 2.3499, + "step": 20406 + }, + { + "epoch": 1.7392823659763061, + "grad_norm": 43.669546920769925, + "learning_rate": 4.482026225009297e-06, + "loss": 2.2221, + "step": 20407 + }, + { + "epoch": 1.7393675956703314, + "grad_norm": 84.63288737270341, + "learning_rate": 4.4815330475966326e-06, + "loss": 3.2483, + "step": 20408 + }, + { + "epoch": 1.7394528253643569, + "grad_norm": 85.45717749910162, + "learning_rate": 4.4810398752828835e-06, + "loss": 3.6729, + "step": 20409 + }, + { + "epoch": 1.7395380550583823, + "grad_norm": 51.93064675699613, + "learning_rate": 4.4805467080729e-06, + "loss": 3.2105, + "step": 20410 + }, + { + "epoch": 1.7396232847524078, + "grad_norm": 32.86661842394837, + "learning_rate": 4.4800535459715324e-06, + "loss": 2.037, + "step": 20411 + }, + { + "epoch": 1.7397085144464333, + "grad_norm": 46.177016716144976, + "learning_rate": 4.479560388983633e-06, + "loss": 3.1243, + "step": 20412 + }, + { + "epoch": 1.7397937441404585, + "grad_norm": 31.628984975311397, + "learning_rate": 4.479067237114052e-06, + "loss": 2.1736, + "step": 20413 + }, + { + "epoch": 1.739878973834484, + "grad_norm": 32.63485529279532, + "learning_rate": 4.478574090367636e-06, + "loss": 2.4739, + "step": 20414 + }, + { + "epoch": 1.7399642035285092, + "grad_norm": 67.46216884276127, + "learning_rate": 4.478080948749235e-06, + "loss": 2.7617, + "step": 20415 + }, + { + "epoch": 1.7400494332225347, + "grad_norm": 44.098254028835434, + "learning_rate": 4.477587812263703e-06, + "loss": 2.0454, + "step": 20416 + }, + { + "epoch": 1.7401346629165602, + "grad_norm": 36.22481149110686, + "learning_rate": 4.4770946809158875e-06, + "loss": 1.7322, + "step": 20417 + }, + { + "epoch": 1.7402198926105856, + "grad_norm": 52.57568839327287, + "learning_rate": 4.476601554710636e-06, + "loss": 2.9761, + "step": 20418 + }, + { + "epoch": 1.7403051223046109, + "grad_norm": 45.42152128289574, + "learning_rate": 4.4761084336528e-06, + "loss": 3.0791, + "step": 20419 + }, + { + "epoch": 1.7403903519986363, + "grad_norm": 38.42577623976297, + "learning_rate": 4.475615317747231e-06, + "loss": 2.3212, + "step": 20420 + }, + { + "epoch": 1.7404755816926616, + "grad_norm": 67.48170610334869, + "learning_rate": 4.475122206998777e-06, + "loss": 2.8809, + "step": 20421 + }, + { + "epoch": 1.740560811386687, + "grad_norm": 166.69433155649352, + "learning_rate": 4.474629101412287e-06, + "loss": 3.605, + "step": 20422 + }, + { + "epoch": 1.7406460410807125, + "grad_norm": 77.21363719976947, + "learning_rate": 4.474136000992612e-06, + "loss": 3.3984, + "step": 20423 + }, + { + "epoch": 1.740731270774738, + "grad_norm": 72.4625281238196, + "learning_rate": 4.473642905744599e-06, + "loss": 3.6797, + "step": 20424 + }, + { + "epoch": 1.7408165004687635, + "grad_norm": 46.99319965074974, + "learning_rate": 4.473149815673099e-06, + "loss": 2.8974, + "step": 20425 + }, + { + "epoch": 1.7409017301627887, + "grad_norm": 87.42854308588805, + "learning_rate": 4.472656730782962e-06, + "loss": 4.2358, + "step": 20426 + }, + { + "epoch": 1.740986959856814, + "grad_norm": 67.76943645743026, + "learning_rate": 4.472163651079036e-06, + "loss": 2.6647, + "step": 20427 + }, + { + "epoch": 1.7410721895508394, + "grad_norm": 82.64275374481849, + "learning_rate": 4.47167057656617e-06, + "loss": 2.9447, + "step": 20428 + }, + { + "epoch": 1.741157419244865, + "grad_norm": 49.988607271819376, + "learning_rate": 4.471177507249216e-06, + "loss": 3.4607, + "step": 20429 + }, + { + "epoch": 1.7412426489388904, + "grad_norm": 67.664699298957, + "learning_rate": 4.4706844431330196e-06, + "loss": 3.8425, + "step": 20430 + }, + { + "epoch": 1.7413278786329158, + "grad_norm": 48.307665722076294, + "learning_rate": 4.470191384222432e-06, + "loss": 2.8549, + "step": 20431 + }, + { + "epoch": 1.741413108326941, + "grad_norm": 79.6592372866311, + "learning_rate": 4.4696983305223e-06, + "loss": 4.5796, + "step": 20432 + }, + { + "epoch": 1.7414983380209665, + "grad_norm": 74.57878280162457, + "learning_rate": 4.469205282037477e-06, + "loss": 3.3621, + "step": 20433 + }, + { + "epoch": 1.7415835677149918, + "grad_norm": 46.57549812162037, + "learning_rate": 4.468712238772806e-06, + "loss": 2.3169, + "step": 20434 + }, + { + "epoch": 1.7416687974090173, + "grad_norm": 45.441171374446284, + "learning_rate": 4.468219200733142e-06, + "loss": 3.1258, + "step": 20435 + }, + { + "epoch": 1.7417540271030427, + "grad_norm": 69.95062788055448, + "learning_rate": 4.4677261679233305e-06, + "loss": 3.114, + "step": 20436 + }, + { + "epoch": 1.7418392567970682, + "grad_norm": 34.23642250737441, + "learning_rate": 4.467233140348218e-06, + "loss": 2.5681, + "step": 20437 + }, + { + "epoch": 1.7419244864910937, + "grad_norm": 48.95953346388265, + "learning_rate": 4.466740118012657e-06, + "loss": 2.4142, + "step": 20438 + }, + { + "epoch": 1.742009716185119, + "grad_norm": 55.09988783864051, + "learning_rate": 4.466247100921497e-06, + "loss": 2.2017, + "step": 20439 + }, + { + "epoch": 1.7420949458791442, + "grad_norm": 37.5124448259321, + "learning_rate": 4.465754089079583e-06, + "loss": 2.2443, + "step": 20440 + }, + { + "epoch": 1.7421801755731696, + "grad_norm": 31.271925325862284, + "learning_rate": 4.465261082491765e-06, + "loss": 2.1543, + "step": 20441 + }, + { + "epoch": 1.742265405267195, + "grad_norm": 84.46461522000092, + "learning_rate": 4.464768081162892e-06, + "loss": 3.1079, + "step": 20442 + }, + { + "epoch": 1.7423506349612206, + "grad_norm": 42.127696915901296, + "learning_rate": 4.464275085097814e-06, + "loss": 2.7708, + "step": 20443 + }, + { + "epoch": 1.742435864655246, + "grad_norm": 54.34307219664783, + "learning_rate": 4.463782094301376e-06, + "loss": 2.1491, + "step": 20444 + }, + { + "epoch": 1.7425210943492713, + "grad_norm": 66.69275040016498, + "learning_rate": 4.463289108778427e-06, + "loss": 3.0895, + "step": 20445 + }, + { + "epoch": 1.7426063240432967, + "grad_norm": 55.126033305307075, + "learning_rate": 4.462796128533817e-06, + "loss": 3.1614, + "step": 20446 + }, + { + "epoch": 1.742691553737322, + "grad_norm": 49.68534050536917, + "learning_rate": 4.462303153572395e-06, + "loss": 3.0512, + "step": 20447 + }, + { + "epoch": 1.7427767834313475, + "grad_norm": 97.50310858683203, + "learning_rate": 4.461810183899007e-06, + "loss": 3.1115, + "step": 20448 + }, + { + "epoch": 1.742862013125373, + "grad_norm": 30.922449651248137, + "learning_rate": 4.461317219518501e-06, + "loss": 1.8996, + "step": 20449 + }, + { + "epoch": 1.7429472428193984, + "grad_norm": 40.82743493998143, + "learning_rate": 4.4608242604357265e-06, + "loss": 2.4265, + "step": 20450 + }, + { + "epoch": 1.7430324725134236, + "grad_norm": 70.32986274869114, + "learning_rate": 4.4603313066555325e-06, + "loss": 2.8005, + "step": 20451 + }, + { + "epoch": 1.743117702207449, + "grad_norm": 24.079287184806205, + "learning_rate": 4.459838358182766e-06, + "loss": 2.3477, + "step": 20452 + }, + { + "epoch": 1.7432029319014744, + "grad_norm": 36.4168776444183, + "learning_rate": 4.4593454150222735e-06, + "loss": 2.6593, + "step": 20453 + }, + { + "epoch": 1.7432881615954998, + "grad_norm": 31.406100691926877, + "learning_rate": 4.458852477178902e-06, + "loss": 2.1147, + "step": 20454 + }, + { + "epoch": 1.7433733912895253, + "grad_norm": 42.62350342720156, + "learning_rate": 4.458359544657504e-06, + "loss": 3.4927, + "step": 20455 + }, + { + "epoch": 1.7434586209835508, + "grad_norm": 80.62102640512529, + "learning_rate": 4.457866617462925e-06, + "loss": 3.2094, + "step": 20456 + }, + { + "epoch": 1.7435438506775762, + "grad_norm": 33.66594511282988, + "learning_rate": 4.457373695600012e-06, + "loss": 2.713, + "step": 20457 + }, + { + "epoch": 1.7436290803716015, + "grad_norm": 90.84213678447789, + "learning_rate": 4.45688077907361e-06, + "loss": 2.3011, + "step": 20458 + }, + { + "epoch": 1.7437143100656267, + "grad_norm": 44.47491643930335, + "learning_rate": 4.456387867888573e-06, + "loss": 3.4108, + "step": 20459 + }, + { + "epoch": 1.7437995397596522, + "grad_norm": 106.11624538023666, + "learning_rate": 4.4558949620497455e-06, + "loss": 3.2739, + "step": 20460 + }, + { + "epoch": 1.7438847694536777, + "grad_norm": 62.51719626655719, + "learning_rate": 4.455402061561973e-06, + "loss": 2.8359, + "step": 20461 + }, + { + "epoch": 1.7439699991477031, + "grad_norm": 50.210903396467046, + "learning_rate": 4.454909166430105e-06, + "loss": 2.8938, + "step": 20462 + }, + { + "epoch": 1.7440552288417286, + "grad_norm": 78.17819895717476, + "learning_rate": 4.45441627665899e-06, + "loss": 2.2387, + "step": 20463 + }, + { + "epoch": 1.7441404585357538, + "grad_norm": 66.15381469365066, + "learning_rate": 4.453923392253474e-06, + "loss": 2.8488, + "step": 20464 + }, + { + "epoch": 1.7442256882297793, + "grad_norm": 56.06375709129596, + "learning_rate": 4.453430513218404e-06, + "loss": 2.6954, + "step": 20465 + }, + { + "epoch": 1.7443109179238045, + "grad_norm": 52.54975481375095, + "learning_rate": 4.452937639558628e-06, + "loss": 3.0952, + "step": 20466 + }, + { + "epoch": 1.74439614761783, + "grad_norm": 41.66950677797534, + "learning_rate": 4.452444771278991e-06, + "loss": 3.0029, + "step": 20467 + }, + { + "epoch": 1.7444813773118555, + "grad_norm": 30.154047203858557, + "learning_rate": 4.451951908384345e-06, + "loss": 2.3423, + "step": 20468 + }, + { + "epoch": 1.744566607005881, + "grad_norm": 55.064395592324594, + "learning_rate": 4.451459050879533e-06, + "loss": 3.9293, + "step": 20469 + }, + { + "epoch": 1.7446518366999062, + "grad_norm": 22.780524225299796, + "learning_rate": 4.450966198769403e-06, + "loss": 1.9783, + "step": 20470 + }, + { + "epoch": 1.7447370663939317, + "grad_norm": 45.024131212889095, + "learning_rate": 4.450473352058802e-06, + "loss": 3.5913, + "step": 20471 + }, + { + "epoch": 1.744822296087957, + "grad_norm": 55.721053880984606, + "learning_rate": 4.449980510752578e-06, + "loss": 3.984, + "step": 20472 + }, + { + "epoch": 1.7449075257819824, + "grad_norm": 106.37982476839721, + "learning_rate": 4.4494876748555764e-06, + "loss": 3.4853, + "step": 20473 + }, + { + "epoch": 1.7449927554760079, + "grad_norm": 36.34188088116523, + "learning_rate": 4.448994844372646e-06, + "loss": 2.9651, + "step": 20474 + }, + { + "epoch": 1.7450779851700333, + "grad_norm": 55.18820767695012, + "learning_rate": 4.44850201930863e-06, + "loss": 2.3448, + "step": 20475 + }, + { + "epoch": 1.7451632148640588, + "grad_norm": 34.39516669128845, + "learning_rate": 4.44800919966838e-06, + "loss": 2.4793, + "step": 20476 + }, + { + "epoch": 1.745248444558084, + "grad_norm": 40.45112259784464, + "learning_rate": 4.447516385456739e-06, + "loss": 2.6824, + "step": 20477 + }, + { + "epoch": 1.7453336742521093, + "grad_norm": 38.76882981971001, + "learning_rate": 4.4470235766785555e-06, + "loss": 2.7125, + "step": 20478 + }, + { + "epoch": 1.7454189039461347, + "grad_norm": 53.82743080803627, + "learning_rate": 4.446530773338676e-06, + "loss": 2.8383, + "step": 20479 + }, + { + "epoch": 1.7455041336401602, + "grad_norm": 76.21347921949621, + "learning_rate": 4.446037975441943e-06, + "loss": 3.0375, + "step": 20480 + }, + { + "epoch": 1.7455893633341857, + "grad_norm": 56.58454801369524, + "learning_rate": 4.445545182993209e-06, + "loss": 1.3974, + "step": 20481 + }, + { + "epoch": 1.7456745930282112, + "grad_norm": 43.014337313972376, + "learning_rate": 4.445052395997319e-06, + "loss": 2.5419, + "step": 20482 + }, + { + "epoch": 1.7457598227222364, + "grad_norm": 45.52287725522904, + "learning_rate": 4.444559614459117e-06, + "loss": 2.6433, + "step": 20483 + }, + { + "epoch": 1.7458450524162619, + "grad_norm": 33.42055440885929, + "learning_rate": 4.444066838383448e-06, + "loss": 2.8555, + "step": 20484 + }, + { + "epoch": 1.7459302821102871, + "grad_norm": 73.82729046368154, + "learning_rate": 4.443574067775163e-06, + "loss": 2.6841, + "step": 20485 + }, + { + "epoch": 1.7460155118043126, + "grad_norm": 56.928982202105374, + "learning_rate": 4.443081302639107e-06, + "loss": 3.4758, + "step": 20486 + }, + { + "epoch": 1.746100741498338, + "grad_norm": 72.63117380897428, + "learning_rate": 4.4425885429801245e-06, + "loss": 2.9566, + "step": 20487 + }, + { + "epoch": 1.7461859711923635, + "grad_norm": 46.60966294013684, + "learning_rate": 4.44209578880306e-06, + "loss": 2.7335, + "step": 20488 + }, + { + "epoch": 1.7462712008863888, + "grad_norm": 54.948264190574776, + "learning_rate": 4.441603040112763e-06, + "loss": 2.6607, + "step": 20489 + }, + { + "epoch": 1.7463564305804142, + "grad_norm": 76.82833404347657, + "learning_rate": 4.44111029691408e-06, + "loss": 3.1438, + "step": 20490 + }, + { + "epoch": 1.7464416602744395, + "grad_norm": 60.93629467355214, + "learning_rate": 4.440617559211854e-06, + "loss": 3.1146, + "step": 20491 + }, + { + "epoch": 1.746526889968465, + "grad_norm": 49.50481143255567, + "learning_rate": 4.4401248270109306e-06, + "loss": 2.9504, + "step": 20492 + }, + { + "epoch": 1.7466121196624904, + "grad_norm": 41.77087297842743, + "learning_rate": 4.439632100316158e-06, + "loss": 3.055, + "step": 20493 + }, + { + "epoch": 1.7466973493565159, + "grad_norm": 43.6425419695706, + "learning_rate": 4.439139379132381e-06, + "loss": 3.6486, + "step": 20494 + }, + { + "epoch": 1.7467825790505414, + "grad_norm": 38.83453791401549, + "learning_rate": 4.438646663464447e-06, + "loss": 2.1645, + "step": 20495 + }, + { + "epoch": 1.7468678087445666, + "grad_norm": 47.870800216968206, + "learning_rate": 4.438153953317199e-06, + "loss": 3.3043, + "step": 20496 + }, + { + "epoch": 1.7469530384385918, + "grad_norm": 42.83648174365231, + "learning_rate": 4.437661248695481e-06, + "loss": 2.8377, + "step": 20497 + }, + { + "epoch": 1.7470382681326173, + "grad_norm": 86.46994026471552, + "learning_rate": 4.437168549604144e-06, + "loss": 3.2339, + "step": 20498 + }, + { + "epoch": 1.7471234978266428, + "grad_norm": 34.22854537335957, + "learning_rate": 4.436675856048031e-06, + "loss": 2.057, + "step": 20499 + }, + { + "epoch": 1.7472087275206682, + "grad_norm": 36.78268036883484, + "learning_rate": 4.436183168031986e-06, + "loss": 2.7955, + "step": 20500 + }, + { + "epoch": 1.7472939572146937, + "grad_norm": 47.99604839694808, + "learning_rate": 4.435690485560854e-06, + "loss": 2.883, + "step": 20501 + }, + { + "epoch": 1.747379186908719, + "grad_norm": 33.6173938272225, + "learning_rate": 4.4351978086394845e-06, + "loss": 2.2778, + "step": 20502 + }, + { + "epoch": 1.7474644166027444, + "grad_norm": 25.10425834617465, + "learning_rate": 4.43470513727272e-06, + "loss": 2.2545, + "step": 20503 + }, + { + "epoch": 1.7475496462967697, + "grad_norm": 140.14406559223147, + "learning_rate": 4.434212471465405e-06, + "loss": 3.6206, + "step": 20504 + }, + { + "epoch": 1.7476348759907951, + "grad_norm": 55.966675865351355, + "learning_rate": 4.433719811222383e-06, + "loss": 2.5318, + "step": 20505 + }, + { + "epoch": 1.7477201056848206, + "grad_norm": 72.26287453189475, + "learning_rate": 4.4332271565485055e-06, + "loss": 2.3677, + "step": 20506 + }, + { + "epoch": 1.747805335378846, + "grad_norm": 29.16101606921397, + "learning_rate": 4.432734507448613e-06, + "loss": 3.0038, + "step": 20507 + }, + { + "epoch": 1.7478905650728715, + "grad_norm": 43.95373532061507, + "learning_rate": 4.43224186392755e-06, + "loss": 2.9823, + "step": 20508 + }, + { + "epoch": 1.7479757947668968, + "grad_norm": 44.36172499187233, + "learning_rate": 4.431749225990164e-06, + "loss": 3.109, + "step": 20509 + }, + { + "epoch": 1.748061024460922, + "grad_norm": 101.01149864594427, + "learning_rate": 4.431256593641297e-06, + "loss": 3.6403, + "step": 20510 + }, + { + "epoch": 1.7481462541549475, + "grad_norm": 30.192137229585885, + "learning_rate": 4.4307639668857965e-06, + "loss": 1.8891, + "step": 20511 + }, + { + "epoch": 1.748231483848973, + "grad_norm": 74.53607100101667, + "learning_rate": 4.430271345728505e-06, + "loss": 3.4035, + "step": 20512 + }, + { + "epoch": 1.7483167135429984, + "grad_norm": 66.58435611483732, + "learning_rate": 4.42977873017427e-06, + "loss": 2.6716, + "step": 20513 + }, + { + "epoch": 1.748401943237024, + "grad_norm": 53.11714197061321, + "learning_rate": 4.429286120227933e-06, + "loss": 2.7959, + "step": 20514 + }, + { + "epoch": 1.7484871729310492, + "grad_norm": 35.85244520467879, + "learning_rate": 4.428793515894342e-06, + "loss": 2.785, + "step": 20515 + }, + { + "epoch": 1.7485724026250746, + "grad_norm": 51.24553345508345, + "learning_rate": 4.428300917178339e-06, + "loss": 2.5349, + "step": 20516 + }, + { + "epoch": 1.7486576323190999, + "grad_norm": 42.972935528945506, + "learning_rate": 4.427808324084769e-06, + "loss": 3.918, + "step": 20517 + }, + { + "epoch": 1.7487428620131253, + "grad_norm": 40.5968533907217, + "learning_rate": 4.427315736618476e-06, + "loss": 2.8186, + "step": 20518 + }, + { + "epoch": 1.7488280917071508, + "grad_norm": 34.232461037596565, + "learning_rate": 4.426823154784306e-06, + "loss": 3.2596, + "step": 20519 + }, + { + "epoch": 1.7489133214011763, + "grad_norm": 55.898076052067694, + "learning_rate": 4.426330578587103e-06, + "loss": 2.8634, + "step": 20520 + }, + { + "epoch": 1.7489985510952015, + "grad_norm": 33.943521752356624, + "learning_rate": 4.425838008031711e-06, + "loss": 2.8883, + "step": 20521 + }, + { + "epoch": 1.749083780789227, + "grad_norm": 79.25638992627839, + "learning_rate": 4.425345443122973e-06, + "loss": 3.2322, + "step": 20522 + }, + { + "epoch": 1.7491690104832522, + "grad_norm": 42.28780190912642, + "learning_rate": 4.4248528838657355e-06, + "loss": 2.7973, + "step": 20523 + }, + { + "epoch": 1.7492542401772777, + "grad_norm": 59.7643691912823, + "learning_rate": 4.4243603302648395e-06, + "loss": 1.9499, + "step": 20524 + }, + { + "epoch": 1.7493394698713032, + "grad_norm": 42.14130427691119, + "learning_rate": 4.423867782325133e-06, + "loss": 2.4806, + "step": 20525 + }, + { + "epoch": 1.7494246995653286, + "grad_norm": 43.41837231426404, + "learning_rate": 4.423375240051458e-06, + "loss": 2.7733, + "step": 20526 + }, + { + "epoch": 1.7495099292593541, + "grad_norm": 57.67996188769092, + "learning_rate": 4.422882703448656e-06, + "loss": 2.9107, + "step": 20527 + }, + { + "epoch": 1.7495951589533794, + "grad_norm": 58.27847063229831, + "learning_rate": 4.422390172521574e-06, + "loss": 3.4588, + "step": 20528 + }, + { + "epoch": 1.7496803886474046, + "grad_norm": 57.17498319393892, + "learning_rate": 4.421897647275057e-06, + "loss": 2.3959, + "step": 20529 + }, + { + "epoch": 1.74976561834143, + "grad_norm": 52.98752739849256, + "learning_rate": 4.421405127713946e-06, + "loss": 2.2635, + "step": 20530 + }, + { + "epoch": 1.7498508480354555, + "grad_norm": 44.81080625896607, + "learning_rate": 4.4209126138430835e-06, + "loss": 2.7089, + "step": 20531 + }, + { + "epoch": 1.749936077729481, + "grad_norm": 56.959781901491944, + "learning_rate": 4.420420105667317e-06, + "loss": 2.9221, + "step": 20532 + }, + { + "epoch": 1.7500213074235065, + "grad_norm": 63.672537186057646, + "learning_rate": 4.41992760319149e-06, + "loss": 2.6722, + "step": 20533 + }, + { + "epoch": 1.7501065371175317, + "grad_norm": 48.06293825542545, + "learning_rate": 4.419435106420443e-06, + "loss": 2.9064, + "step": 20534 + }, + { + "epoch": 1.7501917668115572, + "grad_norm": 30.440844224157782, + "learning_rate": 4.41894261535902e-06, + "loss": 2.5989, + "step": 20535 + }, + { + "epoch": 1.7502769965055824, + "grad_norm": 57.27209445103345, + "learning_rate": 4.418450130012066e-06, + "loss": 2.1495, + "step": 20536 + }, + { + "epoch": 1.750362226199608, + "grad_norm": 42.535499289046044, + "learning_rate": 4.417957650384426e-06, + "loss": 2.8076, + "step": 20537 + }, + { + "epoch": 1.7504474558936334, + "grad_norm": 38.630446833725436, + "learning_rate": 4.417465176480941e-06, + "loss": 2.6535, + "step": 20538 + }, + { + "epoch": 1.7505326855876588, + "grad_norm": 41.24531396309127, + "learning_rate": 4.416972708306452e-06, + "loss": 3.1487, + "step": 20539 + }, + { + "epoch": 1.750617915281684, + "grad_norm": 73.73373580626426, + "learning_rate": 4.416480245865805e-06, + "loss": 2.5952, + "step": 20540 + }, + { + "epoch": 1.7507031449757096, + "grad_norm": 56.483943747739374, + "learning_rate": 4.415987789163844e-06, + "loss": 3.0804, + "step": 20541 + }, + { + "epoch": 1.7507883746697348, + "grad_norm": 44.57936117699226, + "learning_rate": 4.415495338205412e-06, + "loss": 2.8798, + "step": 20542 + }, + { + "epoch": 1.7508736043637603, + "grad_norm": 81.29290261322627, + "learning_rate": 4.41500289299535e-06, + "loss": 3.2883, + "step": 20543 + }, + { + "epoch": 1.7509588340577857, + "grad_norm": 76.30336942695371, + "learning_rate": 4.414510453538501e-06, + "loss": 2.9484, + "step": 20544 + }, + { + "epoch": 1.7510440637518112, + "grad_norm": 57.823363501776555, + "learning_rate": 4.414018019839711e-06, + "loss": 2.6, + "step": 20545 + }, + { + "epoch": 1.7511292934458367, + "grad_norm": 105.42996318098267, + "learning_rate": 4.413525591903821e-06, + "loss": 3.7967, + "step": 20546 + }, + { + "epoch": 1.751214523139862, + "grad_norm": 66.09131330378767, + "learning_rate": 4.413033169735672e-06, + "loss": 2.9796, + "step": 20547 + }, + { + "epoch": 1.7512997528338872, + "grad_norm": 60.28612924474413, + "learning_rate": 4.412540753340108e-06, + "loss": 2.0602, + "step": 20548 + }, + { + "epoch": 1.7513849825279126, + "grad_norm": 31.917189648965696, + "learning_rate": 4.412048342721975e-06, + "loss": 2.4518, + "step": 20549 + }, + { + "epoch": 1.751470212221938, + "grad_norm": 83.11626252871585, + "learning_rate": 4.4115559378861125e-06, + "loss": 3.6499, + "step": 20550 + }, + { + "epoch": 1.7515554419159636, + "grad_norm": 27.843855664694484, + "learning_rate": 4.411063538837362e-06, + "loss": 2.1372, + "step": 20551 + }, + { + "epoch": 1.751640671609989, + "grad_norm": 63.846266999928716, + "learning_rate": 4.410571145580569e-06, + "loss": 2.7374, + "step": 20552 + }, + { + "epoch": 1.7517259013040143, + "grad_norm": 44.01972399504593, + "learning_rate": 4.410078758120572e-06, + "loss": 1.8743, + "step": 20553 + }, + { + "epoch": 1.7518111309980398, + "grad_norm": 71.10646625062427, + "learning_rate": 4.409586376462219e-06, + "loss": 2.4439, + "step": 20554 + }, + { + "epoch": 1.751896360692065, + "grad_norm": 26.524838750385406, + "learning_rate": 4.4090940006103475e-06, + "loss": 2.4274, + "step": 20555 + }, + { + "epoch": 1.7519815903860905, + "grad_norm": 40.58237911518845, + "learning_rate": 4.408601630569803e-06, + "loss": 3.1017, + "step": 20556 + }, + { + "epoch": 1.752066820080116, + "grad_norm": 55.075750021846744, + "learning_rate": 4.408109266345425e-06, + "loss": 2.7595, + "step": 20557 + }, + { + "epoch": 1.7521520497741414, + "grad_norm": 30.33845376851161, + "learning_rate": 4.407616907942059e-06, + "loss": 2.3247, + "step": 20558 + }, + { + "epoch": 1.7522372794681669, + "grad_norm": 69.6039426727898, + "learning_rate": 4.4071245553645434e-06, + "loss": 3.1075, + "step": 20559 + }, + { + "epoch": 1.7523225091621921, + "grad_norm": 41.73324430102775, + "learning_rate": 4.406632208617724e-06, + "loss": 3.0864, + "step": 20560 + }, + { + "epoch": 1.7524077388562174, + "grad_norm": 29.92862171975905, + "learning_rate": 4.406139867706438e-06, + "loss": 2.2158, + "step": 20561 + }, + { + "epoch": 1.7524929685502428, + "grad_norm": 49.68541820023595, + "learning_rate": 4.405647532635534e-06, + "loss": 3.2413, + "step": 20562 + }, + { + "epoch": 1.7525781982442683, + "grad_norm": 61.18602552790694, + "learning_rate": 4.405155203409848e-06, + "loss": 3.3362, + "step": 20563 + }, + { + "epoch": 1.7526634279382938, + "grad_norm": 26.9701890956637, + "learning_rate": 4.4046628800342265e-06, + "loss": 1.852, + "step": 20564 + }, + { + "epoch": 1.7527486576323192, + "grad_norm": 47.36895518692537, + "learning_rate": 4.404170562513506e-06, + "loss": 2.0643, + "step": 20565 + }, + { + "epoch": 1.7528338873263445, + "grad_norm": 44.875059061957124, + "learning_rate": 4.403678250852534e-06, + "loss": 3.023, + "step": 20566 + }, + { + "epoch": 1.7529191170203697, + "grad_norm": 22.124284561663774, + "learning_rate": 4.403185945056148e-06, + "loss": 1.9463, + "step": 20567 + }, + { + "epoch": 1.7530043467143952, + "grad_norm": 71.99863247318042, + "learning_rate": 4.402693645129192e-06, + "loss": 2.3618, + "step": 20568 + }, + { + "epoch": 1.7530895764084207, + "grad_norm": 39.73202598883815, + "learning_rate": 4.402201351076507e-06, + "loss": 2.2696, + "step": 20569 + }, + { + "epoch": 1.7531748061024461, + "grad_norm": 80.81152690000245, + "learning_rate": 4.401709062902932e-06, + "loss": 3.029, + "step": 20570 + }, + { + "epoch": 1.7532600357964716, + "grad_norm": 39.387017134779086, + "learning_rate": 4.401216780613311e-06, + "loss": 3.0143, + "step": 20571 + }, + { + "epoch": 1.7533452654904969, + "grad_norm": 320.3105486448734, + "learning_rate": 4.400724504212487e-06, + "loss": 3.1225, + "step": 20572 + }, + { + "epoch": 1.7534304951845223, + "grad_norm": 35.72299123181024, + "learning_rate": 4.400232233705299e-06, + "loss": 2.5635, + "step": 20573 + }, + { + "epoch": 1.7535157248785476, + "grad_norm": 49.80632343672911, + "learning_rate": 4.399739969096586e-06, + "loss": 2.2648, + "step": 20574 + }, + { + "epoch": 1.753600954572573, + "grad_norm": 45.42158047886774, + "learning_rate": 4.399247710391194e-06, + "loss": 2.4019, + "step": 20575 + }, + { + "epoch": 1.7536861842665985, + "grad_norm": 129.09719198370325, + "learning_rate": 4.398755457593963e-06, + "loss": 2.8624, + "step": 20576 + }, + { + "epoch": 1.753771413960624, + "grad_norm": 92.07551168563494, + "learning_rate": 4.398263210709732e-06, + "loss": 4.0719, + "step": 20577 + }, + { + "epoch": 1.7538566436546494, + "grad_norm": 44.99546241823668, + "learning_rate": 4.397770969743342e-06, + "loss": 3.0265, + "step": 20578 + }, + { + "epoch": 1.7539418733486747, + "grad_norm": 57.60310488765422, + "learning_rate": 4.397278734699637e-06, + "loss": 2.4598, + "step": 20579 + }, + { + "epoch": 1.7540271030427, + "grad_norm": 59.97078133042573, + "learning_rate": 4.396786505583455e-06, + "loss": 2.6369, + "step": 20580 + }, + { + "epoch": 1.7541123327367254, + "grad_norm": 28.90104326734689, + "learning_rate": 4.39629428239964e-06, + "loss": 2.8016, + "step": 20581 + }, + { + "epoch": 1.7541975624307509, + "grad_norm": 106.67058278013647, + "learning_rate": 4.395802065153029e-06, + "loss": 4.2222, + "step": 20582 + }, + { + "epoch": 1.7542827921247763, + "grad_norm": 158.83576549050323, + "learning_rate": 4.395309853848464e-06, + "loss": 3.3425, + "step": 20583 + }, + { + "epoch": 1.7543680218188018, + "grad_norm": 91.84932801271886, + "learning_rate": 4.394817648490788e-06, + "loss": 4.1442, + "step": 20584 + }, + { + "epoch": 1.754453251512827, + "grad_norm": 73.68632884399418, + "learning_rate": 4.39432544908484e-06, + "loss": 2.5389, + "step": 20585 + }, + { + "epoch": 1.7545384812068525, + "grad_norm": 38.58676648598695, + "learning_rate": 4.3938332556354604e-06, + "loss": 2.9355, + "step": 20586 + }, + { + "epoch": 1.7546237109008778, + "grad_norm": 32.89420123007976, + "learning_rate": 4.393341068147487e-06, + "loss": 2.7605, + "step": 20587 + }, + { + "epoch": 1.7547089405949032, + "grad_norm": 27.312447737073658, + "learning_rate": 4.392848886625767e-06, + "loss": 2.2762, + "step": 20588 + }, + { + "epoch": 1.7547941702889287, + "grad_norm": 88.86833488190283, + "learning_rate": 4.392356711075136e-06, + "loss": 2.6451, + "step": 20589 + }, + { + "epoch": 1.7548793999829542, + "grad_norm": 63.85556946254645, + "learning_rate": 4.391864541500435e-06, + "loss": 3.5852, + "step": 20590 + }, + { + "epoch": 1.7549646296769794, + "grad_norm": 34.782434080612894, + "learning_rate": 4.391372377906502e-06, + "loss": 2.1267, + "step": 20591 + }, + { + "epoch": 1.7550498593710049, + "grad_norm": 49.697534345976756, + "learning_rate": 4.390880220298183e-06, + "loss": 2.2839, + "step": 20592 + }, + { + "epoch": 1.7551350890650301, + "grad_norm": 30.88754450273019, + "learning_rate": 4.3903880686803155e-06, + "loss": 3.2016, + "step": 20593 + }, + { + "epoch": 1.7552203187590556, + "grad_norm": 52.402519958027, + "learning_rate": 4.389895923057737e-06, + "loss": 3.1095, + "step": 20594 + }, + { + "epoch": 1.755305548453081, + "grad_norm": 48.209097556548464, + "learning_rate": 4.389403783435289e-06, + "loss": 2.2552, + "step": 20595 + }, + { + "epoch": 1.7553907781471065, + "grad_norm": 56.14114858469746, + "learning_rate": 4.388911649817814e-06, + "loss": 3.3166, + "step": 20596 + }, + { + "epoch": 1.755476007841132, + "grad_norm": 129.7060302661058, + "learning_rate": 4.38841952221015e-06, + "loss": 3.508, + "step": 20597 + }, + { + "epoch": 1.7555612375351572, + "grad_norm": 53.53834411793536, + "learning_rate": 4.387927400617137e-06, + "loss": 2.2387, + "step": 20598 + }, + { + "epoch": 1.7556464672291825, + "grad_norm": 21.934031289730406, + "learning_rate": 4.387435285043615e-06, + "loss": 1.8146, + "step": 20599 + }, + { + "epoch": 1.755731696923208, + "grad_norm": 46.19467903373276, + "learning_rate": 4.3869431754944215e-06, + "loss": 2.6258, + "step": 20600 + }, + { + "epoch": 1.7558169266172334, + "grad_norm": 46.6020150887741, + "learning_rate": 4.386451071974401e-06, + "loss": 2.8532, + "step": 20601 + }, + { + "epoch": 1.755902156311259, + "grad_norm": 37.708784817957685, + "learning_rate": 4.385958974488389e-06, + "loss": 2.9457, + "step": 20602 + }, + { + "epoch": 1.7559873860052844, + "grad_norm": 37.51560427969198, + "learning_rate": 4.385466883041228e-06, + "loss": 2.0011, + "step": 20603 + }, + { + "epoch": 1.7560726156993096, + "grad_norm": 55.78947788300381, + "learning_rate": 4.384974797637752e-06, + "loss": 2.9037, + "step": 20604 + }, + { + "epoch": 1.756157845393335, + "grad_norm": 50.00982429487614, + "learning_rate": 4.3844827182828085e-06, + "loss": 2.2552, + "step": 20605 + }, + { + "epoch": 1.7562430750873603, + "grad_norm": 68.70268239749525, + "learning_rate": 4.383990644981231e-06, + "loss": 2.642, + "step": 20606 + }, + { + "epoch": 1.7563283047813858, + "grad_norm": 69.79041418301887, + "learning_rate": 4.383498577737862e-06, + "loss": 3.0563, + "step": 20607 + }, + { + "epoch": 1.7564135344754113, + "grad_norm": 51.36963321781598, + "learning_rate": 4.383006516557537e-06, + "loss": 2.7464, + "step": 20608 + }, + { + "epoch": 1.7564987641694367, + "grad_norm": 46.81997733527974, + "learning_rate": 4.3825144614451e-06, + "loss": 3.085, + "step": 20609 + }, + { + "epoch": 1.756583993863462, + "grad_norm": 53.306040682733965, + "learning_rate": 4.3820224124053876e-06, + "loss": 3.0793, + "step": 20610 + }, + { + "epoch": 1.7566692235574874, + "grad_norm": 62.25514032648215, + "learning_rate": 4.3815303694432395e-06, + "loss": 1.9955, + "step": 20611 + }, + { + "epoch": 1.7567544532515127, + "grad_norm": 31.570754569952115, + "learning_rate": 4.381038332563495e-06, + "loss": 2.7416, + "step": 20612 + }, + { + "epoch": 1.7568396829455382, + "grad_norm": 44.63244817310378, + "learning_rate": 4.3805463017709905e-06, + "loss": 2.9247, + "step": 20613 + }, + { + "epoch": 1.7569249126395636, + "grad_norm": 75.71834793789797, + "learning_rate": 4.380054277070567e-06, + "loss": 3.6544, + "step": 20614 + }, + { + "epoch": 1.757010142333589, + "grad_norm": 59.10727675005727, + "learning_rate": 4.3795622584670655e-06, + "loss": 1.9087, + "step": 20615 + }, + { + "epoch": 1.7570953720276146, + "grad_norm": 50.42508860793651, + "learning_rate": 4.3790702459653225e-06, + "loss": 2.4894, + "step": 20616 + }, + { + "epoch": 1.7571806017216398, + "grad_norm": 56.217024372626724, + "learning_rate": 4.3785782395701746e-06, + "loss": 2.6785, + "step": 20617 + }, + { + "epoch": 1.757265831415665, + "grad_norm": 38.0947530214247, + "learning_rate": 4.378086239286463e-06, + "loss": 3.0786, + "step": 20618 + }, + { + "epoch": 1.7573510611096905, + "grad_norm": 65.16485522693542, + "learning_rate": 4.377594245119028e-06, + "loss": 2.5651, + "step": 20619 + }, + { + "epoch": 1.757436290803716, + "grad_norm": 67.02934207919418, + "learning_rate": 4.377102257072707e-06, + "loss": 2.9049, + "step": 20620 + }, + { + "epoch": 1.7575215204977415, + "grad_norm": 59.11593809272856, + "learning_rate": 4.3766102751523345e-06, + "loss": 2.8335, + "step": 20621 + }, + { + "epoch": 1.757606750191767, + "grad_norm": 58.08595906520499, + "learning_rate": 4.376118299362754e-06, + "loss": 3.6342, + "step": 20622 + }, + { + "epoch": 1.7576919798857922, + "grad_norm": 85.82234919589533, + "learning_rate": 4.3756263297088024e-06, + "loss": 3.3875, + "step": 20623 + }, + { + "epoch": 1.7577772095798176, + "grad_norm": 50.90026159730223, + "learning_rate": 4.375134366195318e-06, + "loss": 2.1429, + "step": 20624 + }, + { + "epoch": 1.757862439273843, + "grad_norm": 37.26288395397931, + "learning_rate": 4.374642408827137e-06, + "loss": 2.1544, + "step": 20625 + }, + { + "epoch": 1.7579476689678684, + "grad_norm": 44.28451721400434, + "learning_rate": 4.374150457609101e-06, + "loss": 2.9366, + "step": 20626 + }, + { + "epoch": 1.7580328986618938, + "grad_norm": 69.9899025578281, + "learning_rate": 4.373658512546047e-06, + "loss": 3.144, + "step": 20627 + }, + { + "epoch": 1.7581181283559193, + "grad_norm": 98.96227433765365, + "learning_rate": 4.373166573642812e-06, + "loss": 3.2032, + "step": 20628 + }, + { + "epoch": 1.7582033580499448, + "grad_norm": 67.95803819619009, + "learning_rate": 4.372674640904235e-06, + "loss": 2.8339, + "step": 20629 + }, + { + "epoch": 1.75828858774397, + "grad_norm": 35.245521518177796, + "learning_rate": 4.3721827143351515e-06, + "loss": 2.4368, + "step": 20630 + }, + { + "epoch": 1.7583738174379953, + "grad_norm": 72.25827045701338, + "learning_rate": 4.371690793940404e-06, + "loss": 3.3261, + "step": 20631 + }, + { + "epoch": 1.7584590471320207, + "grad_norm": 43.765676780073434, + "learning_rate": 4.371198879724828e-06, + "loss": 2.8085, + "step": 20632 + }, + { + "epoch": 1.7585442768260462, + "grad_norm": 53.96675309696157, + "learning_rate": 4.3707069716932595e-06, + "loss": 2.2846, + "step": 20633 + }, + { + "epoch": 1.7586295065200717, + "grad_norm": 54.98806342944266, + "learning_rate": 4.3702150698505376e-06, + "loss": 2.7234, + "step": 20634 + }, + { + "epoch": 1.7587147362140971, + "grad_norm": 63.98527981655356, + "learning_rate": 4.369723174201503e-06, + "loss": 2.9709, + "step": 20635 + }, + { + "epoch": 1.7587999659081224, + "grad_norm": 22.97396978280268, + "learning_rate": 4.369231284750989e-06, + "loss": 1.7829, + "step": 20636 + }, + { + "epoch": 1.7588851956021476, + "grad_norm": 79.18413577837701, + "learning_rate": 4.368739401503833e-06, + "loss": 3.4964, + "step": 20637 + }, + { + "epoch": 1.758970425296173, + "grad_norm": 41.889163039463526, + "learning_rate": 4.3682475244648745e-06, + "loss": 3.7568, + "step": 20638 + }, + { + "epoch": 1.7590556549901986, + "grad_norm": 85.89118752812473, + "learning_rate": 4.367755653638953e-06, + "loss": 3.2681, + "step": 20639 + }, + { + "epoch": 1.759140884684224, + "grad_norm": 134.87513291732654, + "learning_rate": 4.367263789030903e-06, + "loss": 1.8284, + "step": 20640 + }, + { + "epoch": 1.7592261143782495, + "grad_norm": 50.06083417496644, + "learning_rate": 4.366771930645561e-06, + "loss": 2.5398, + "step": 20641 + }, + { + "epoch": 1.7593113440722747, + "grad_norm": 55.632635431023715, + "learning_rate": 4.3662800784877664e-06, + "loss": 1.9949, + "step": 20642 + }, + { + "epoch": 1.7593965737663002, + "grad_norm": 50.828769673231285, + "learning_rate": 4.365788232562354e-06, + "loss": 2.4294, + "step": 20643 + }, + { + "epoch": 1.7594818034603255, + "grad_norm": 49.31491280232888, + "learning_rate": 4.365296392874163e-06, + "loss": 2.6505, + "step": 20644 + }, + { + "epoch": 1.759567033154351, + "grad_norm": 45.029682859592434, + "learning_rate": 4.36480455942803e-06, + "loss": 3.3794, + "step": 20645 + }, + { + "epoch": 1.7596522628483764, + "grad_norm": 28.635241136868817, + "learning_rate": 4.364312732228792e-06, + "loss": 1.9492, + "step": 20646 + }, + { + "epoch": 1.7597374925424019, + "grad_norm": 60.137581230977645, + "learning_rate": 4.363820911281284e-06, + "loss": 3.6044, + "step": 20647 + }, + { + "epoch": 1.7598227222364273, + "grad_norm": 66.39312314605583, + "learning_rate": 4.363329096590347e-06, + "loss": 2.6203, + "step": 20648 + }, + { + "epoch": 1.7599079519304526, + "grad_norm": 80.1860930220504, + "learning_rate": 4.362837288160814e-06, + "loss": 3.3187, + "step": 20649 + }, + { + "epoch": 1.7599931816244778, + "grad_norm": 70.66810594735178, + "learning_rate": 4.362345485997523e-06, + "loss": 3.1575, + "step": 20650 + }, + { + "epoch": 1.7600784113185033, + "grad_norm": 135.1149092587814, + "learning_rate": 4.361853690105311e-06, + "loss": 5.0289, + "step": 20651 + }, + { + "epoch": 1.7601636410125288, + "grad_norm": 88.25031596046686, + "learning_rate": 4.361361900489015e-06, + "loss": 3.7835, + "step": 20652 + }, + { + "epoch": 1.7602488707065542, + "grad_norm": 67.76740445885204, + "learning_rate": 4.36087011715347e-06, + "loss": 4.3887, + "step": 20653 + }, + { + "epoch": 1.7603341004005797, + "grad_norm": 29.06184063298087, + "learning_rate": 4.360378340103515e-06, + "loss": 2.3622, + "step": 20654 + }, + { + "epoch": 1.760419330094605, + "grad_norm": 43.10321377010162, + "learning_rate": 4.359886569343984e-06, + "loss": 2.8358, + "step": 20655 + }, + { + "epoch": 1.7605045597886304, + "grad_norm": 18.65065635721897, + "learning_rate": 4.359394804879712e-06, + "loss": 1.6059, + "step": 20656 + }, + { + "epoch": 1.7605897894826557, + "grad_norm": 71.91594712738099, + "learning_rate": 4.358903046715539e-06, + "loss": 1.6723, + "step": 20657 + }, + { + "epoch": 1.7606750191766811, + "grad_norm": 50.028753511966016, + "learning_rate": 4.358411294856301e-06, + "loss": 2.6708, + "step": 20658 + }, + { + "epoch": 1.7607602488707066, + "grad_norm": 46.45594125734262, + "learning_rate": 4.357919549306833e-06, + "loss": 2.9674, + "step": 20659 + }, + { + "epoch": 1.760845478564732, + "grad_norm": 37.054552027726125, + "learning_rate": 4.357427810071969e-06, + "loss": 2.6298, + "step": 20660 + }, + { + "epoch": 1.7609307082587573, + "grad_norm": 33.70789784564728, + "learning_rate": 4.356936077156549e-06, + "loss": 2.0038, + "step": 20661 + }, + { + "epoch": 1.7610159379527828, + "grad_norm": 183.68688522413936, + "learning_rate": 4.356444350565406e-06, + "loss": 3.1177, + "step": 20662 + }, + { + "epoch": 1.761101167646808, + "grad_norm": 40.431002747320164, + "learning_rate": 4.355952630303379e-06, + "loss": 2.9675, + "step": 20663 + }, + { + "epoch": 1.7611863973408335, + "grad_norm": 75.1623199851795, + "learning_rate": 4.355460916375299e-06, + "loss": 2.4154, + "step": 20664 + }, + { + "epoch": 1.761271627034859, + "grad_norm": 29.46196380907668, + "learning_rate": 4.354969208786007e-06, + "loss": 2.6339, + "step": 20665 + }, + { + "epoch": 1.7613568567288844, + "grad_norm": 31.41508948532704, + "learning_rate": 4.354477507540336e-06, + "loss": 1.8024, + "step": 20666 + }, + { + "epoch": 1.76144208642291, + "grad_norm": 48.341179603913226, + "learning_rate": 4.353985812643123e-06, + "loss": 3.2931, + "step": 20667 + }, + { + "epoch": 1.7615273161169351, + "grad_norm": 35.68297464665174, + "learning_rate": 4.3534941240992005e-06, + "loss": 2.7988, + "step": 20668 + }, + { + "epoch": 1.7616125458109604, + "grad_norm": 34.189026223394386, + "learning_rate": 4.353002441913408e-06, + "loss": 2.0987, + "step": 20669 + }, + { + "epoch": 1.7616977755049859, + "grad_norm": 60.086338181450714, + "learning_rate": 4.35251076609058e-06, + "loss": 2.5724, + "step": 20670 + }, + { + "epoch": 1.7617830051990113, + "grad_norm": 26.832421953889252, + "learning_rate": 4.3520190966355515e-06, + "loss": 2.5185, + "step": 20671 + }, + { + "epoch": 1.7618682348930368, + "grad_norm": 46.74504697959616, + "learning_rate": 4.351527433553156e-06, + "loss": 2.5094, + "step": 20672 + }, + { + "epoch": 1.7619534645870623, + "grad_norm": 43.29070742047253, + "learning_rate": 4.35103577684823e-06, + "loss": 3.2145, + "step": 20673 + }, + { + "epoch": 1.7620386942810875, + "grad_norm": 49.69592999031335, + "learning_rate": 4.350544126525611e-06, + "loss": 3.1337, + "step": 20674 + }, + { + "epoch": 1.762123923975113, + "grad_norm": 48.35300061008021, + "learning_rate": 4.350052482590133e-06, + "loss": 2.3314, + "step": 20675 + }, + { + "epoch": 1.7622091536691382, + "grad_norm": 68.53400278595844, + "learning_rate": 4.34956084504663e-06, + "loss": 3.0309, + "step": 20676 + }, + { + "epoch": 1.7622943833631637, + "grad_norm": 25.276380360979704, + "learning_rate": 4.349069213899936e-06, + "loss": 2.0887, + "step": 20677 + }, + { + "epoch": 1.7623796130571892, + "grad_norm": 61.0119656403132, + "learning_rate": 4.348577589154889e-06, + "loss": 2.7167, + "step": 20678 + }, + { + "epoch": 1.7624648427512146, + "grad_norm": 49.31276212456129, + "learning_rate": 4.348085970816323e-06, + "loss": 2.3706, + "step": 20679 + }, + { + "epoch": 1.7625500724452399, + "grad_norm": 45.6792891765067, + "learning_rate": 4.347594358889072e-06, + "loss": 3.1688, + "step": 20680 + }, + { + "epoch": 1.7626353021392653, + "grad_norm": 56.736152882279164, + "learning_rate": 4.347102753377969e-06, + "loss": 3.3198, + "step": 20681 + }, + { + "epoch": 1.7627205318332906, + "grad_norm": 47.01578775067897, + "learning_rate": 4.3466111542878544e-06, + "loss": 3.0362, + "step": 20682 + }, + { + "epoch": 1.762805761527316, + "grad_norm": 49.63145462748531, + "learning_rate": 4.346119561623559e-06, + "loss": 2.2662, + "step": 20683 + }, + { + "epoch": 1.7628909912213415, + "grad_norm": 30.844433150564566, + "learning_rate": 4.345627975389916e-06, + "loss": 2.5945, + "step": 20684 + }, + { + "epoch": 1.762976220915367, + "grad_norm": 24.12033324483608, + "learning_rate": 4.3451363955917644e-06, + "loss": 1.9907, + "step": 20685 + }, + { + "epoch": 1.7630614506093925, + "grad_norm": 43.827444374812984, + "learning_rate": 4.344644822233934e-06, + "loss": 2.4915, + "step": 20686 + }, + { + "epoch": 1.7631466803034177, + "grad_norm": 31.762326299922783, + "learning_rate": 4.344153255321262e-06, + "loss": 3.1244, + "step": 20687 + }, + { + "epoch": 1.763231909997443, + "grad_norm": 56.95545541098841, + "learning_rate": 4.343661694858582e-06, + "loss": 2.5994, + "step": 20688 + }, + { + "epoch": 1.7633171396914684, + "grad_norm": 39.67826684355091, + "learning_rate": 4.3431701408507284e-06, + "loss": 3.2815, + "step": 20689 + }, + { + "epoch": 1.7634023693854939, + "grad_norm": 37.25134480819733, + "learning_rate": 4.3426785933025344e-06, + "loss": 1.7764, + "step": 20690 + }, + { + "epoch": 1.7634875990795194, + "grad_norm": 55.97601421177491, + "learning_rate": 4.3421870522188375e-06, + "loss": 2.9165, + "step": 20691 + }, + { + "epoch": 1.7635728287735448, + "grad_norm": 34.787647679705465, + "learning_rate": 4.341695517604468e-06, + "loss": 2.4352, + "step": 20692 + }, + { + "epoch": 1.76365805846757, + "grad_norm": 71.02896280426495, + "learning_rate": 4.341203989464263e-06, + "loss": 3.0501, + "step": 20693 + }, + { + "epoch": 1.7637432881615955, + "grad_norm": 78.164863239778, + "learning_rate": 4.340712467803052e-06, + "loss": 3.3384, + "step": 20694 + }, + { + "epoch": 1.7638285178556208, + "grad_norm": 56.818102319093406, + "learning_rate": 4.340220952625675e-06, + "loss": 3.6691, + "step": 20695 + }, + { + "epoch": 1.7639137475496462, + "grad_norm": 41.22254223964009, + "learning_rate": 4.339729443936961e-06, + "loss": 2.8565, + "step": 20696 + }, + { + "epoch": 1.7639989772436717, + "grad_norm": 34.696634352335856, + "learning_rate": 4.339237941741748e-06, + "loss": 2.6336, + "step": 20697 + }, + { + "epoch": 1.7640842069376972, + "grad_norm": 51.73885701463669, + "learning_rate": 4.338746446044864e-06, + "loss": 3.0321, + "step": 20698 + }, + { + "epoch": 1.7641694366317227, + "grad_norm": 153.90220670071022, + "learning_rate": 4.338254956851148e-06, + "loss": 3.4622, + "step": 20699 + }, + { + "epoch": 1.764254666325748, + "grad_norm": 37.809423164544526, + "learning_rate": 4.33776347416543e-06, + "loss": 2.5147, + "step": 20700 + }, + { + "epoch": 1.7643398960197731, + "grad_norm": 68.45896858168706, + "learning_rate": 4.337271997992547e-06, + "loss": 3.0477, + "step": 20701 + }, + { + "epoch": 1.7644251257137986, + "grad_norm": 124.16895665867207, + "learning_rate": 4.33678052833733e-06, + "loss": 4.2255, + "step": 20702 + }, + { + "epoch": 1.764510355407824, + "grad_norm": 41.984317855219594, + "learning_rate": 4.336289065204612e-06, + "loss": 2.7172, + "step": 20703 + }, + { + "epoch": 1.7645955851018496, + "grad_norm": 94.62813625412178, + "learning_rate": 4.335797608599227e-06, + "loss": 3.872, + "step": 20704 + }, + { + "epoch": 1.764680814795875, + "grad_norm": 45.903991721096226, + "learning_rate": 4.33530615852601e-06, + "loss": 2.8101, + "step": 20705 + }, + { + "epoch": 1.7647660444899003, + "grad_norm": 62.05708171157422, + "learning_rate": 4.334814714989794e-06, + "loss": 3.6037, + "step": 20706 + }, + { + "epoch": 1.7648512741839257, + "grad_norm": 34.457925635858246, + "learning_rate": 4.334323277995407e-06, + "loss": 2.6256, + "step": 20707 + }, + { + "epoch": 1.764936503877951, + "grad_norm": 38.140568538405624, + "learning_rate": 4.333831847547688e-06, + "loss": 2.1749, + "step": 20708 + }, + { + "epoch": 1.7650217335719764, + "grad_norm": 28.60772811843564, + "learning_rate": 4.333340423651469e-06, + "loss": 2.4804, + "step": 20709 + }, + { + "epoch": 1.765106963266002, + "grad_norm": 46.121660312573916, + "learning_rate": 4.332849006311582e-06, + "loss": 2.5955, + "step": 20710 + }, + { + "epoch": 1.7651921929600274, + "grad_norm": 60.86984733692522, + "learning_rate": 4.332357595532858e-06, + "loss": 2.9959, + "step": 20711 + }, + { + "epoch": 1.7652774226540526, + "grad_norm": 57.29250179281117, + "learning_rate": 4.331866191320133e-06, + "loss": 2.8479, + "step": 20712 + }, + { + "epoch": 1.765362652348078, + "grad_norm": 77.60717524559752, + "learning_rate": 4.331374793678239e-06, + "loss": 3.3755, + "step": 20713 + }, + { + "epoch": 1.7654478820421033, + "grad_norm": 36.14891870320618, + "learning_rate": 4.330883402612009e-06, + "loss": 1.6473, + "step": 20714 + }, + { + "epoch": 1.7655331117361288, + "grad_norm": 42.143481331012545, + "learning_rate": 4.330392018126273e-06, + "loss": 2.4165, + "step": 20715 + }, + { + "epoch": 1.7656183414301543, + "grad_norm": 74.89970549552162, + "learning_rate": 4.329900640225865e-06, + "loss": 2.3183, + "step": 20716 + }, + { + "epoch": 1.7657035711241797, + "grad_norm": 37.19770103373148, + "learning_rate": 4.3294092689156195e-06, + "loss": 2.7877, + "step": 20717 + }, + { + "epoch": 1.7657888008182052, + "grad_norm": 51.854443737324026, + "learning_rate": 4.328917904200368e-06, + "loss": 3.0667, + "step": 20718 + }, + { + "epoch": 1.7658740305122305, + "grad_norm": 56.48792977933102, + "learning_rate": 4.328426546084941e-06, + "loss": 2.7783, + "step": 20719 + }, + { + "epoch": 1.7659592602062557, + "grad_norm": 56.72702892983416, + "learning_rate": 4.327935194574171e-06, + "loss": 2.851, + "step": 20720 + }, + { + "epoch": 1.7660444899002812, + "grad_norm": 82.50270968508593, + "learning_rate": 4.327443849672893e-06, + "loss": 3.4657, + "step": 20721 + }, + { + "epoch": 1.7661297195943066, + "grad_norm": 73.93800647821368, + "learning_rate": 4.3269525113859375e-06, + "loss": 2.2359, + "step": 20722 + }, + { + "epoch": 1.7662149492883321, + "grad_norm": 35.28676635756213, + "learning_rate": 4.326461179718136e-06, + "loss": 3.2357, + "step": 20723 + }, + { + "epoch": 1.7663001789823576, + "grad_norm": 49.31912092579147, + "learning_rate": 4.325969854674319e-06, + "loss": 3.4886, + "step": 20724 + }, + { + "epoch": 1.7663854086763828, + "grad_norm": 91.55157136076615, + "learning_rate": 4.325478536259325e-06, + "loss": 3.1763, + "step": 20725 + }, + { + "epoch": 1.7664706383704083, + "grad_norm": 59.345297373364986, + "learning_rate": 4.324987224477979e-06, + "loss": 2.9534, + "step": 20726 + }, + { + "epoch": 1.7665558680644335, + "grad_norm": 40.20788556509366, + "learning_rate": 4.324495919335116e-06, + "loss": 2.2128, + "step": 20727 + }, + { + "epoch": 1.766641097758459, + "grad_norm": 70.43496956708535, + "learning_rate": 4.324004620835568e-06, + "loss": 3.7582, + "step": 20728 + }, + { + "epoch": 1.7667263274524845, + "grad_norm": 42.72997112200545, + "learning_rate": 4.323513328984164e-06, + "loss": 2.998, + "step": 20729 + }, + { + "epoch": 1.76681155714651, + "grad_norm": 44.26261324656669, + "learning_rate": 4.323022043785739e-06, + "loss": 3.0641, + "step": 20730 + }, + { + "epoch": 1.7668967868405352, + "grad_norm": 50.90636434912161, + "learning_rate": 4.322530765245122e-06, + "loss": 2.0442, + "step": 20731 + }, + { + "epoch": 1.7669820165345607, + "grad_norm": 75.07864835991127, + "learning_rate": 4.322039493367148e-06, + "loss": 3.016, + "step": 20732 + }, + { + "epoch": 1.767067246228586, + "grad_norm": 31.884787625220977, + "learning_rate": 4.3215482281566436e-06, + "loss": 2.8155, + "step": 20733 + }, + { + "epoch": 1.7671524759226114, + "grad_norm": 69.45431399330415, + "learning_rate": 4.321056969618444e-06, + "loss": 3.3062, + "step": 20734 + }, + { + "epoch": 1.7672377056166368, + "grad_norm": 53.84413041378574, + "learning_rate": 4.320565717757379e-06, + "loss": 3.4017, + "step": 20735 + }, + { + "epoch": 1.7673229353106623, + "grad_norm": 65.86772468379058, + "learning_rate": 4.320074472578281e-06, + "loss": 2.3909, + "step": 20736 + }, + { + "epoch": 1.7674081650046878, + "grad_norm": 84.41762925035164, + "learning_rate": 4.319583234085979e-06, + "loss": 3.7528, + "step": 20737 + }, + { + "epoch": 1.767493394698713, + "grad_norm": 49.444388549192404, + "learning_rate": 4.319092002285306e-06, + "loss": 2.27, + "step": 20738 + }, + { + "epoch": 1.7675786243927383, + "grad_norm": 39.198523212006045, + "learning_rate": 4.318600777181093e-06, + "loss": 2.6793, + "step": 20739 + }, + { + "epoch": 1.7676638540867637, + "grad_norm": 79.74186446163291, + "learning_rate": 4.318109558778172e-06, + "loss": 2.5678, + "step": 20740 + }, + { + "epoch": 1.7677490837807892, + "grad_norm": 153.0587865216855, + "learning_rate": 4.317618347081369e-06, + "loss": 4.9696, + "step": 20741 + }, + { + "epoch": 1.7678343134748147, + "grad_norm": 85.17200118056313, + "learning_rate": 4.317127142095522e-06, + "loss": 2.6646, + "step": 20742 + }, + { + "epoch": 1.7679195431688401, + "grad_norm": 57.35143984591068, + "learning_rate": 4.316635943825456e-06, + "loss": 2.173, + "step": 20743 + }, + { + "epoch": 1.7680047728628654, + "grad_norm": 54.226006818308775, + "learning_rate": 4.3161447522760055e-06, + "loss": 2.0973, + "step": 20744 + }, + { + "epoch": 1.7680900025568909, + "grad_norm": 44.32741104037439, + "learning_rate": 4.3156535674519995e-06, + "loss": 2.2499, + "step": 20745 + }, + { + "epoch": 1.768175232250916, + "grad_norm": 53.96562355645514, + "learning_rate": 4.315162389358267e-06, + "loss": 2.9644, + "step": 20746 + }, + { + "epoch": 1.7682604619449416, + "grad_norm": 42.77183914316529, + "learning_rate": 4.314671217999641e-06, + "loss": 3.1494, + "step": 20747 + }, + { + "epoch": 1.768345691638967, + "grad_norm": 35.3982642264884, + "learning_rate": 4.314180053380952e-06, + "loss": 2.4179, + "step": 20748 + }, + { + "epoch": 1.7684309213329925, + "grad_norm": 27.54518809638553, + "learning_rate": 4.31368889550703e-06, + "loss": 1.6837, + "step": 20749 + }, + { + "epoch": 1.7685161510270178, + "grad_norm": 43.369203933091526, + "learning_rate": 4.313197744382703e-06, + "loss": 2.7887, + "step": 20750 + }, + { + "epoch": 1.7686013807210432, + "grad_norm": 26.81286645130329, + "learning_rate": 4.312706600012803e-06, + "loss": 2.1525, + "step": 20751 + }, + { + "epoch": 1.7686866104150685, + "grad_norm": 54.8692814305462, + "learning_rate": 4.312215462402163e-06, + "loss": 3.5649, + "step": 20752 + }, + { + "epoch": 1.768771840109094, + "grad_norm": 105.4381669485946, + "learning_rate": 4.31172433155561e-06, + "loss": 2.9694, + "step": 20753 + }, + { + "epoch": 1.7688570698031194, + "grad_norm": 53.69048023599981, + "learning_rate": 4.311233207477974e-06, + "loss": 3.541, + "step": 20754 + }, + { + "epoch": 1.7689422994971449, + "grad_norm": 64.43894782845236, + "learning_rate": 4.310742090174086e-06, + "loss": 2.8669, + "step": 20755 + }, + { + "epoch": 1.7690275291911703, + "grad_norm": 29.04239179353272, + "learning_rate": 4.310250979648776e-06, + "loss": 2.4002, + "step": 20756 + }, + { + "epoch": 1.7691127588851956, + "grad_norm": 77.3534781612901, + "learning_rate": 4.309759875906873e-06, + "loss": 4.1231, + "step": 20757 + }, + { + "epoch": 1.7691979885792208, + "grad_norm": 39.34376000136371, + "learning_rate": 4.309268778953208e-06, + "loss": 2.916, + "step": 20758 + }, + { + "epoch": 1.7692832182732463, + "grad_norm": 66.88499387813775, + "learning_rate": 4.308777688792608e-06, + "loss": 2.5274, + "step": 20759 + }, + { + "epoch": 1.7693684479672718, + "grad_norm": 41.5951689595588, + "learning_rate": 4.308286605429907e-06, + "loss": 2.8754, + "step": 20760 + }, + { + "epoch": 1.7694536776612972, + "grad_norm": 38.69898362360379, + "learning_rate": 4.307795528869933e-06, + "loss": 2.8138, + "step": 20761 + }, + { + "epoch": 1.7695389073553227, + "grad_norm": 37.22046404189046, + "learning_rate": 4.307304459117513e-06, + "loss": 1.2068, + "step": 20762 + }, + { + "epoch": 1.769624137049348, + "grad_norm": 49.205549449095116, + "learning_rate": 4.306813396177478e-06, + "loss": 2.8883, + "step": 20763 + }, + { + "epoch": 1.7697093667433734, + "grad_norm": 37.423835795527545, + "learning_rate": 4.30632234005466e-06, + "loss": 2.6268, + "step": 20764 + }, + { + "epoch": 1.7697945964373987, + "grad_norm": 39.41899848539928, + "learning_rate": 4.305831290753885e-06, + "loss": 2.847, + "step": 20765 + }, + { + "epoch": 1.7698798261314241, + "grad_norm": 35.50949222816813, + "learning_rate": 4.305340248279983e-06, + "loss": 2.3642, + "step": 20766 + }, + { + "epoch": 1.7699650558254496, + "grad_norm": 34.3969163042993, + "learning_rate": 4.304849212637783e-06, + "loss": 2.1591, + "step": 20767 + }, + { + "epoch": 1.770050285519475, + "grad_norm": 35.40565587507243, + "learning_rate": 4.304358183832116e-06, + "loss": 2.0524, + "step": 20768 + }, + { + "epoch": 1.7701355152135005, + "grad_norm": 48.1813395520736, + "learning_rate": 4.303867161867811e-06, + "loss": 1.3324, + "step": 20769 + }, + { + "epoch": 1.7702207449075258, + "grad_norm": 22.869782704185532, + "learning_rate": 4.303376146749694e-06, + "loss": 1.7837, + "step": 20770 + }, + { + "epoch": 1.770305974601551, + "grad_norm": 43.375922312403475, + "learning_rate": 4.302885138482595e-06, + "loss": 2.5528, + "step": 20771 + }, + { + "epoch": 1.7703912042955765, + "grad_norm": 71.99177465478002, + "learning_rate": 4.3023941370713455e-06, + "loss": 4.797, + "step": 20772 + }, + { + "epoch": 1.770476433989602, + "grad_norm": 57.2742063338432, + "learning_rate": 4.301903142520773e-06, + "loss": 3.1618, + "step": 20773 + }, + { + "epoch": 1.7705616636836274, + "grad_norm": 46.39160901537153, + "learning_rate": 4.301412154835704e-06, + "loss": 2.8234, + "step": 20774 + }, + { + "epoch": 1.770646893377653, + "grad_norm": 55.53021283233915, + "learning_rate": 4.3009211740209705e-06, + "loss": 2.4404, + "step": 20775 + }, + { + "epoch": 1.7707321230716782, + "grad_norm": 75.02551101696898, + "learning_rate": 4.300430200081397e-06, + "loss": 2.9062, + "step": 20776 + }, + { + "epoch": 1.7708173527657036, + "grad_norm": 45.991928032391286, + "learning_rate": 4.299939233021817e-06, + "loss": 2.9196, + "step": 20777 + }, + { + "epoch": 1.7709025824597289, + "grad_norm": 35.4926710368278, + "learning_rate": 4.299448272847055e-06, + "loss": 2.8933, + "step": 20778 + }, + { + "epoch": 1.7709878121537543, + "grad_norm": 56.10347525981762, + "learning_rate": 4.298957319561942e-06, + "loss": 2.389, + "step": 20779 + }, + { + "epoch": 1.7710730418477798, + "grad_norm": 38.57892720192731, + "learning_rate": 4.2984663731713026e-06, + "loss": 2.5627, + "step": 20780 + }, + { + "epoch": 1.7711582715418053, + "grad_norm": 41.3283518737248, + "learning_rate": 4.29797543367997e-06, + "loss": 2.641, + "step": 20781 + }, + { + "epoch": 1.7712435012358305, + "grad_norm": 26.374219048783612, + "learning_rate": 4.29748450109277e-06, + "loss": 1.9942, + "step": 20782 + }, + { + "epoch": 1.771328730929856, + "grad_norm": 91.79211507195048, + "learning_rate": 4.296993575414531e-06, + "loss": 2.2989, + "step": 20783 + }, + { + "epoch": 1.7714139606238812, + "grad_norm": 33.360822184708944, + "learning_rate": 4.296502656650079e-06, + "loss": 2.5494, + "step": 20784 + }, + { + "epoch": 1.7714991903179067, + "grad_norm": 87.69156828823881, + "learning_rate": 4.296011744804246e-06, + "loss": 3.7541, + "step": 20785 + }, + { + "epoch": 1.7715844200119322, + "grad_norm": 38.12379178816389, + "learning_rate": 4.295520839881856e-06, + "loss": 2.0374, + "step": 20786 + }, + { + "epoch": 1.7716696497059576, + "grad_norm": 40.935828602502255, + "learning_rate": 4.295029941887741e-06, + "loss": 2.8616, + "step": 20787 + }, + { + "epoch": 1.771754879399983, + "grad_norm": 36.3196533447741, + "learning_rate": 4.294539050826725e-06, + "loss": 2.8876, + "step": 20788 + }, + { + "epoch": 1.7718401090940084, + "grad_norm": 33.946674676710735, + "learning_rate": 4.294048166703636e-06, + "loss": 2.9057, + "step": 20789 + }, + { + "epoch": 1.7719253387880336, + "grad_norm": 25.82232705072927, + "learning_rate": 4.293557289523302e-06, + "loss": 2.088, + "step": 20790 + }, + { + "epoch": 1.772010568482059, + "grad_norm": 37.248404271390804, + "learning_rate": 4.293066419290555e-06, + "loss": 2.0819, + "step": 20791 + }, + { + "epoch": 1.7720957981760845, + "grad_norm": 40.85325783918265, + "learning_rate": 4.292575556010217e-06, + "loss": 3.6137, + "step": 20792 + }, + { + "epoch": 1.77218102787011, + "grad_norm": 48.21034734899414, + "learning_rate": 4.292084699687116e-06, + "loss": 2.5477, + "step": 20793 + }, + { + "epoch": 1.7722662575641355, + "grad_norm": 67.9715780975166, + "learning_rate": 4.2915938503260815e-06, + "loss": 3.0922, + "step": 20794 + }, + { + "epoch": 1.7723514872581607, + "grad_norm": 48.88794306895619, + "learning_rate": 4.291103007931941e-06, + "loss": 1.83, + "step": 20795 + }, + { + "epoch": 1.7724367169521862, + "grad_norm": 26.680332917747492, + "learning_rate": 4.2906121725095205e-06, + "loss": 2.2767, + "step": 20796 + }, + { + "epoch": 1.7725219466462114, + "grad_norm": 50.27995276275001, + "learning_rate": 4.290121344063645e-06, + "loss": 1.3802, + "step": 20797 + }, + { + "epoch": 1.772607176340237, + "grad_norm": 50.3351902082757, + "learning_rate": 4.289630522599146e-06, + "loss": 2.7249, + "step": 20798 + }, + { + "epoch": 1.7726924060342624, + "grad_norm": 37.58386440097096, + "learning_rate": 4.289139708120849e-06, + "loss": 2.2202, + "step": 20799 + }, + { + "epoch": 1.7727776357282878, + "grad_norm": 27.008544887434816, + "learning_rate": 4.288648900633582e-06, + "loss": 2.5082, + "step": 20800 + }, + { + "epoch": 1.772862865422313, + "grad_norm": 43.31487721705694, + "learning_rate": 4.288158100142168e-06, + "loss": 3.5004, + "step": 20801 + }, + { + "epoch": 1.7729480951163386, + "grad_norm": 45.527355097456166, + "learning_rate": 4.2876673066514355e-06, + "loss": 2.8638, + "step": 20802 + }, + { + "epoch": 1.7730333248103638, + "grad_norm": 42.33472491803802, + "learning_rate": 4.287176520166213e-06, + "loss": 3.1084, + "step": 20803 + }, + { + "epoch": 1.7731185545043893, + "grad_norm": 83.7988104991545, + "learning_rate": 4.2866857406913275e-06, + "loss": 3.2644, + "step": 20804 + }, + { + "epoch": 1.7732037841984147, + "grad_norm": 18.517782878431092, + "learning_rate": 4.286194968231603e-06, + "loss": 1.5817, + "step": 20805 + }, + { + "epoch": 1.7732890138924402, + "grad_norm": 57.61206843562745, + "learning_rate": 4.285704202791866e-06, + "loss": 2.5977, + "step": 20806 + }, + { + "epoch": 1.7733742435864657, + "grad_norm": 40.794159217695324, + "learning_rate": 4.285213444376946e-06, + "loss": 2.327, + "step": 20807 + }, + { + "epoch": 1.773459473280491, + "grad_norm": 61.21119967239325, + "learning_rate": 4.2847226929916685e-06, + "loss": 2.3202, + "step": 20808 + }, + { + "epoch": 1.7735447029745162, + "grad_norm": 58.852303923283294, + "learning_rate": 4.284231948640858e-06, + "loss": 3.4306, + "step": 20809 + }, + { + "epoch": 1.7736299326685416, + "grad_norm": 31.932862839599846, + "learning_rate": 4.283741211329341e-06, + "loss": 2.5684, + "step": 20810 + }, + { + "epoch": 1.773715162362567, + "grad_norm": 33.60523245703579, + "learning_rate": 4.283250481061946e-06, + "loss": 2.4141, + "step": 20811 + }, + { + "epoch": 1.7738003920565926, + "grad_norm": 52.362497346035696, + "learning_rate": 4.282759757843498e-06, + "loss": 2.8236, + "step": 20812 + }, + { + "epoch": 1.773885621750618, + "grad_norm": 26.784223486315412, + "learning_rate": 4.282269041678822e-06, + "loss": 2.221, + "step": 20813 + }, + { + "epoch": 1.7739708514446433, + "grad_norm": 87.40680783590042, + "learning_rate": 4.281778332572743e-06, + "loss": 2.0584, + "step": 20814 + }, + { + "epoch": 1.7740560811386688, + "grad_norm": 53.28428839121241, + "learning_rate": 4.281287630530091e-06, + "loss": 2.4871, + "step": 20815 + }, + { + "epoch": 1.774141310832694, + "grad_norm": 50.712507611004014, + "learning_rate": 4.28079693555569e-06, + "loss": 2.6828, + "step": 20816 + }, + { + "epoch": 1.7742265405267195, + "grad_norm": 79.45198891589959, + "learning_rate": 4.280306247654364e-06, + "loss": 3.3312, + "step": 20817 + }, + { + "epoch": 1.774311770220745, + "grad_norm": 46.47516934501323, + "learning_rate": 4.279815566830942e-06, + "loss": 3.168, + "step": 20818 + }, + { + "epoch": 1.7743969999147704, + "grad_norm": 39.22601805876664, + "learning_rate": 4.279324893090244e-06, + "loss": 2.7407, + "step": 20819 + }, + { + "epoch": 1.7744822296087959, + "grad_norm": 50.797556453850916, + "learning_rate": 4.278834226437103e-06, + "loss": 2.4316, + "step": 20820 + }, + { + "epoch": 1.7745674593028211, + "grad_norm": 35.66481106278024, + "learning_rate": 4.278343566876339e-06, + "loss": 2.8626, + "step": 20821 + }, + { + "epoch": 1.7746526889968464, + "grad_norm": 113.86340078200945, + "learning_rate": 4.27785291441278e-06, + "loss": 3.3801, + "step": 20822 + }, + { + "epoch": 1.7747379186908718, + "grad_norm": 35.24533905553581, + "learning_rate": 4.277362269051249e-06, + "loss": 2.802, + "step": 20823 + }, + { + "epoch": 1.7748231483848973, + "grad_norm": 37.93161936893159, + "learning_rate": 4.2768716307965755e-06, + "loss": 1.6529, + "step": 20824 + }, + { + "epoch": 1.7749083780789228, + "grad_norm": 58.42714102643262, + "learning_rate": 4.2763809996535796e-06, + "loss": 2.6649, + "step": 20825 + }, + { + "epoch": 1.7749936077729482, + "grad_norm": 33.33424229986415, + "learning_rate": 4.275890375627091e-06, + "loss": 2.9094, + "step": 20826 + }, + { + "epoch": 1.7750788374669735, + "grad_norm": 37.96136761741222, + "learning_rate": 4.275399758721931e-06, + "loss": 2.7749, + "step": 20827 + }, + { + "epoch": 1.7751640671609987, + "grad_norm": 51.53451486644615, + "learning_rate": 4.274909148942928e-06, + "loss": 2.7786, + "step": 20828 + }, + { + "epoch": 1.7752492968550242, + "grad_norm": 65.31879184439555, + "learning_rate": 4.274418546294905e-06, + "loss": 3.1439, + "step": 20829 + }, + { + "epoch": 1.7753345265490497, + "grad_norm": 24.548456236074927, + "learning_rate": 4.273927950782688e-06, + "loss": 1.9516, + "step": 20830 + }, + { + "epoch": 1.7754197562430751, + "grad_norm": 94.50483311070144, + "learning_rate": 4.273437362411102e-06, + "loss": 3.769, + "step": 20831 + }, + { + "epoch": 1.7755049859371006, + "grad_norm": 34.54692628885498, + "learning_rate": 4.272946781184967e-06, + "loss": 2.6214, + "step": 20832 + }, + { + "epoch": 1.7755902156311258, + "grad_norm": 40.4956314951924, + "learning_rate": 4.272456207109113e-06, + "loss": 3.66, + "step": 20833 + }, + { + "epoch": 1.7756754453251513, + "grad_norm": 58.41737740301443, + "learning_rate": 4.271965640188365e-06, + "loss": 2.4674, + "step": 20834 + }, + { + "epoch": 1.7757606750191766, + "grad_norm": 39.639701228225086, + "learning_rate": 4.271475080427545e-06, + "loss": 2.9342, + "step": 20835 + }, + { + "epoch": 1.775845904713202, + "grad_norm": 28.458057804291915, + "learning_rate": 4.270984527831476e-06, + "loss": 2.1388, + "step": 20836 + }, + { + "epoch": 1.7759311344072275, + "grad_norm": 40.68642569957132, + "learning_rate": 4.2704939824049855e-06, + "loss": 3.0654, + "step": 20837 + }, + { + "epoch": 1.776016364101253, + "grad_norm": 52.600850273210426, + "learning_rate": 4.270003444152898e-06, + "loss": 3.4289, + "step": 20838 + }, + { + "epoch": 1.7761015937952784, + "grad_norm": 52.26357395218101, + "learning_rate": 4.269512913080037e-06, + "loss": 2.6875, + "step": 20839 + }, + { + "epoch": 1.7761868234893037, + "grad_norm": 33.07922481777304, + "learning_rate": 4.2690223891912235e-06, + "loss": 2.7948, + "step": 20840 + }, + { + "epoch": 1.776272053183329, + "grad_norm": 37.606589093583494, + "learning_rate": 4.268531872491286e-06, + "loss": 2.6487, + "step": 20841 + }, + { + "epoch": 1.7763572828773544, + "grad_norm": 101.2368668206937, + "learning_rate": 4.268041362985047e-06, + "loss": 4.7544, + "step": 20842 + }, + { + "epoch": 1.7764425125713799, + "grad_norm": 65.31647294247263, + "learning_rate": 4.267550860677331e-06, + "loss": 2.3317, + "step": 20843 + }, + { + "epoch": 1.7765277422654053, + "grad_norm": 58.60824847858158, + "learning_rate": 4.267060365572959e-06, + "loss": 2.0697, + "step": 20844 + }, + { + "epoch": 1.7766129719594308, + "grad_norm": 74.89356099957232, + "learning_rate": 4.266569877676758e-06, + "loss": 4.4345, + "step": 20845 + }, + { + "epoch": 1.776698201653456, + "grad_norm": 63.95101116504358, + "learning_rate": 4.266079396993553e-06, + "loss": 2.4469, + "step": 20846 + }, + { + "epoch": 1.7767834313474815, + "grad_norm": 76.83398697160172, + "learning_rate": 4.265588923528164e-06, + "loss": 3.3614, + "step": 20847 + }, + { + "epoch": 1.7768686610415068, + "grad_norm": 55.264691639051385, + "learning_rate": 4.265098457285416e-06, + "loss": 3.4037, + "step": 20848 + }, + { + "epoch": 1.7769538907355322, + "grad_norm": 19.01359810575191, + "learning_rate": 4.264607998270131e-06, + "loss": 1.1943, + "step": 20849 + }, + { + "epoch": 1.7770391204295577, + "grad_norm": 85.14746491036942, + "learning_rate": 4.2641175464871366e-06, + "loss": 3.3844, + "step": 20850 + }, + { + "epoch": 1.7771243501235832, + "grad_norm": 43.61454778599558, + "learning_rate": 4.263627101941254e-06, + "loss": 1.8823, + "step": 20851 + }, + { + "epoch": 1.7772095798176084, + "grad_norm": 63.53225159207215, + "learning_rate": 4.263136664637304e-06, + "loss": 2.7333, + "step": 20852 + }, + { + "epoch": 1.7772948095116339, + "grad_norm": 50.079125156489695, + "learning_rate": 4.262646234580112e-06, + "loss": 3.4702, + "step": 20853 + }, + { + "epoch": 1.7773800392056591, + "grad_norm": 70.79447045074588, + "learning_rate": 4.262155811774503e-06, + "loss": 2.6651, + "step": 20854 + }, + { + "epoch": 1.7774652688996846, + "grad_norm": 67.54485618432443, + "learning_rate": 4.2616653962252985e-06, + "loss": 3.3421, + "step": 20855 + }, + { + "epoch": 1.77755049859371, + "grad_norm": 25.633542617187448, + "learning_rate": 4.261174987937321e-06, + "loss": 1.4422, + "step": 20856 + }, + { + "epoch": 1.7776357282877355, + "grad_norm": 52.786128841068255, + "learning_rate": 4.260684586915391e-06, + "loss": 3.5917, + "step": 20857 + }, + { + "epoch": 1.777720957981761, + "grad_norm": 82.19448456706569, + "learning_rate": 4.260194193164338e-06, + "loss": 3.2482, + "step": 20858 + }, + { + "epoch": 1.7778061876757862, + "grad_norm": 57.755897643989506, + "learning_rate": 4.259703806688981e-06, + "loss": 2.8802, + "step": 20859 + }, + { + "epoch": 1.7778914173698115, + "grad_norm": 40.2296084445456, + "learning_rate": 4.259213427494142e-06, + "loss": 3.5289, + "step": 20860 + }, + { + "epoch": 1.777976647063837, + "grad_norm": 43.828259140051074, + "learning_rate": 4.258723055584646e-06, + "loss": 3.9212, + "step": 20861 + }, + { + "epoch": 1.7780618767578624, + "grad_norm": 44.6312795579239, + "learning_rate": 4.25823269096531e-06, + "loss": 2.6493, + "step": 20862 + }, + { + "epoch": 1.778147106451888, + "grad_norm": 19.30061917757018, + "learning_rate": 4.257742333640965e-06, + "loss": 2.3936, + "step": 20863 + }, + { + "epoch": 1.7782323361459134, + "grad_norm": 62.12775091342692, + "learning_rate": 4.257251983616427e-06, + "loss": 3.6257, + "step": 20864 + }, + { + "epoch": 1.7783175658399386, + "grad_norm": 43.63235309586719, + "learning_rate": 4.256761640896522e-06, + "loss": 2.8906, + "step": 20865 + }, + { + "epoch": 1.778402795533964, + "grad_norm": 24.72454246936862, + "learning_rate": 4.25627130548607e-06, + "loss": 1.7903, + "step": 20866 + }, + { + "epoch": 1.7784880252279893, + "grad_norm": 29.006686179388318, + "learning_rate": 4.255780977389894e-06, + "loss": 2.3984, + "step": 20867 + }, + { + "epoch": 1.7785732549220148, + "grad_norm": 96.45363175475993, + "learning_rate": 4.255290656612817e-06, + "loss": 3.0912, + "step": 20868 + }, + { + "epoch": 1.7786584846160403, + "grad_norm": 43.92524460276674, + "learning_rate": 4.254800343159661e-06, + "loss": 3.1728, + "step": 20869 + }, + { + "epoch": 1.7787437143100657, + "grad_norm": 30.857497195409678, + "learning_rate": 4.254310037035245e-06, + "loss": 2.5671, + "step": 20870 + }, + { + "epoch": 1.778828944004091, + "grad_norm": 77.08677944844752, + "learning_rate": 4.253819738244397e-06, + "loss": 3.401, + "step": 20871 + }, + { + "epoch": 1.7789141736981164, + "grad_norm": 46.03014800717812, + "learning_rate": 4.2533294467919325e-06, + "loss": 1.2401, + "step": 20872 + }, + { + "epoch": 1.7789994033921417, + "grad_norm": 83.75129394145145, + "learning_rate": 4.252839162682678e-06, + "loss": 3.3073, + "step": 20873 + }, + { + "epoch": 1.7790846330861672, + "grad_norm": 46.343698228930066, + "learning_rate": 4.252348885921454e-06, + "loss": 3.1651, + "step": 20874 + }, + { + "epoch": 1.7791698627801926, + "grad_norm": 38.6681245360067, + "learning_rate": 4.251858616513079e-06, + "loss": 2.548, + "step": 20875 + }, + { + "epoch": 1.779255092474218, + "grad_norm": 116.50177053859065, + "learning_rate": 4.251368354462378e-06, + "loss": 3.445, + "step": 20876 + }, + { + "epoch": 1.7793403221682436, + "grad_norm": 30.281025411491814, + "learning_rate": 4.250878099774173e-06, + "loss": 2.263, + "step": 20877 + }, + { + "epoch": 1.7794255518622688, + "grad_norm": 70.12132822375403, + "learning_rate": 4.250387852453285e-06, + "loss": 3.1784, + "step": 20878 + }, + { + "epoch": 1.779510781556294, + "grad_norm": 58.19470775250118, + "learning_rate": 4.249897612504532e-06, + "loss": 2.9626, + "step": 20879 + }, + { + "epoch": 1.7795960112503195, + "grad_norm": 39.23957478114458, + "learning_rate": 4.249407379932738e-06, + "loss": 2.7805, + "step": 20880 + }, + { + "epoch": 1.779681240944345, + "grad_norm": 55.771350075960456, + "learning_rate": 4.248917154742726e-06, + "loss": 3.5084, + "step": 20881 + }, + { + "epoch": 1.7797664706383705, + "grad_norm": 39.180311409541936, + "learning_rate": 4.2484269369393165e-06, + "loss": 3.0182, + "step": 20882 + }, + { + "epoch": 1.779851700332396, + "grad_norm": 80.41653676727647, + "learning_rate": 4.2479367265273255e-06, + "loss": 3.6493, + "step": 20883 + }, + { + "epoch": 1.7799369300264212, + "grad_norm": 48.464546363610864, + "learning_rate": 4.24744652351158e-06, + "loss": 2.434, + "step": 20884 + }, + { + "epoch": 1.7800221597204466, + "grad_norm": 96.90330322233554, + "learning_rate": 4.246956327896899e-06, + "loss": 4.6448, + "step": 20885 + }, + { + "epoch": 1.7801073894144719, + "grad_norm": 61.959530709666296, + "learning_rate": 4.246466139688105e-06, + "loss": 2.207, + "step": 20886 + }, + { + "epoch": 1.7801926191084974, + "grad_norm": 39.073008993288234, + "learning_rate": 4.245975958890013e-06, + "loss": 2.4848, + "step": 20887 + }, + { + "epoch": 1.7802778488025228, + "grad_norm": 60.497323573666776, + "learning_rate": 4.24548578550745e-06, + "loss": 2.5556, + "step": 20888 + }, + { + "epoch": 1.7803630784965483, + "grad_norm": 76.44249282182848, + "learning_rate": 4.2449956195452354e-06, + "loss": 3.2279, + "step": 20889 + }, + { + "epoch": 1.7804483081905738, + "grad_norm": 52.8881248147607, + "learning_rate": 4.244505461008189e-06, + "loss": 2.9907, + "step": 20890 + }, + { + "epoch": 1.780533537884599, + "grad_norm": 77.14114539471716, + "learning_rate": 4.2440153099011294e-06, + "loss": 3.004, + "step": 20891 + }, + { + "epoch": 1.7806187675786243, + "grad_norm": 56.58697202388274, + "learning_rate": 4.243525166228879e-06, + "loss": 2.0559, + "step": 20892 + }, + { + "epoch": 1.7807039972726497, + "grad_norm": 42.627078173897814, + "learning_rate": 4.243035029996259e-06, + "loss": 2.9391, + "step": 20893 + }, + { + "epoch": 1.7807892269666752, + "grad_norm": 62.002979160343145, + "learning_rate": 4.242544901208089e-06, + "loss": 3.6438, + "step": 20894 + }, + { + "epoch": 1.7808744566607007, + "grad_norm": 32.79775892726179, + "learning_rate": 4.242054779869188e-06, + "loss": 1.7421, + "step": 20895 + }, + { + "epoch": 1.7809596863547261, + "grad_norm": 23.347902475594925, + "learning_rate": 4.241564665984375e-06, + "loss": 1.6287, + "step": 20896 + }, + { + "epoch": 1.7810449160487514, + "grad_norm": 50.914233436353335, + "learning_rate": 4.2410745595584764e-06, + "loss": 2.2693, + "step": 20897 + }, + { + "epoch": 1.7811301457427768, + "grad_norm": 55.87743072855501, + "learning_rate": 4.240584460596307e-06, + "loss": 3.507, + "step": 20898 + }, + { + "epoch": 1.781215375436802, + "grad_norm": 34.44966184974763, + "learning_rate": 4.2400943691026855e-06, + "loss": 2.218, + "step": 20899 + }, + { + "epoch": 1.7813006051308276, + "grad_norm": 75.86157517152694, + "learning_rate": 4.239604285082434e-06, + "loss": 3.533, + "step": 20900 + }, + { + "epoch": 1.781385834824853, + "grad_norm": 56.67311472182367, + "learning_rate": 4.2391142085403745e-06, + "loss": 3.0272, + "step": 20901 + }, + { + "epoch": 1.7814710645188785, + "grad_norm": 47.32086328654108, + "learning_rate": 4.238624139481324e-06, + "loss": 3.5658, + "step": 20902 + }, + { + "epoch": 1.7815562942129037, + "grad_norm": 33.549609839704, + "learning_rate": 4.238134077910101e-06, + "loss": 2.8232, + "step": 20903 + }, + { + "epoch": 1.7816415239069292, + "grad_norm": 73.79362700719025, + "learning_rate": 4.237644023831529e-06, + "loss": 3.2805, + "step": 20904 + }, + { + "epoch": 1.7817267536009544, + "grad_norm": 49.6614459687254, + "learning_rate": 4.237153977250421e-06, + "loss": 1.8125, + "step": 20905 + }, + { + "epoch": 1.78181198329498, + "grad_norm": 76.87749549104699, + "learning_rate": 4.236663938171604e-06, + "loss": 3.2398, + "step": 20906 + }, + { + "epoch": 1.7818972129890054, + "grad_norm": 36.551659928453006, + "learning_rate": 4.236173906599893e-06, + "loss": 2.4888, + "step": 20907 + }, + { + "epoch": 1.7819824426830309, + "grad_norm": 50.44378050771951, + "learning_rate": 4.235683882540108e-06, + "loss": 4.0042, + "step": 20908 + }, + { + "epoch": 1.7820676723770563, + "grad_norm": 34.55093070853702, + "learning_rate": 4.235193865997067e-06, + "loss": 2.223, + "step": 20909 + }, + { + "epoch": 1.7821529020710816, + "grad_norm": 40.54737357780296, + "learning_rate": 4.234703856975593e-06, + "loss": 2.3843, + "step": 20910 + }, + { + "epoch": 1.7822381317651068, + "grad_norm": 55.12315065471335, + "learning_rate": 4.234213855480501e-06, + "loss": 3.3156, + "step": 20911 + }, + { + "epoch": 1.7823233614591323, + "grad_norm": 81.70907454379775, + "learning_rate": 4.233723861516612e-06, + "loss": 3.3586, + "step": 20912 + }, + { + "epoch": 1.7824085911531578, + "grad_norm": 73.5277053924517, + "learning_rate": 4.2332338750887425e-06, + "loss": 3.0494, + "step": 20913 + }, + { + "epoch": 1.7824938208471832, + "grad_norm": 36.37528557121616, + "learning_rate": 4.232743896201715e-06, + "loss": 2.3912, + "step": 20914 + }, + { + "epoch": 1.7825790505412087, + "grad_norm": 42.57089642664999, + "learning_rate": 4.232253924860345e-06, + "loss": 3.0345, + "step": 20915 + }, + { + "epoch": 1.782664280235234, + "grad_norm": 26.144870708206618, + "learning_rate": 4.231763961069455e-06, + "loss": 2.2718, + "step": 20916 + }, + { + "epoch": 1.7827495099292594, + "grad_norm": 105.58756515672941, + "learning_rate": 4.231274004833859e-06, + "loss": 4.0185, + "step": 20917 + }, + { + "epoch": 1.7828347396232846, + "grad_norm": 59.72636552179753, + "learning_rate": 4.230784056158379e-06, + "loss": 2.0734, + "step": 20918 + }, + { + "epoch": 1.7829199693173101, + "grad_norm": 22.981624178147644, + "learning_rate": 4.230294115047831e-06, + "loss": 1.1828, + "step": 20919 + }, + { + "epoch": 1.7830051990113356, + "grad_norm": 54.755712555838365, + "learning_rate": 4.229804181507036e-06, + "loss": 2.587, + "step": 20920 + }, + { + "epoch": 1.783090428705361, + "grad_norm": 68.21052100024157, + "learning_rate": 4.22931425554081e-06, + "loss": 3.457, + "step": 20921 + }, + { + "epoch": 1.7831756583993863, + "grad_norm": 39.37026159450845, + "learning_rate": 4.228824337153971e-06, + "loss": 2.9615, + "step": 20922 + }, + { + "epoch": 1.7832608880934118, + "grad_norm": 79.16978351218472, + "learning_rate": 4.228334426351339e-06, + "loss": 3.2041, + "step": 20923 + }, + { + "epoch": 1.783346117787437, + "grad_norm": 42.98736925432731, + "learning_rate": 4.227844523137732e-06, + "loss": 2.557, + "step": 20924 + }, + { + "epoch": 1.7834313474814625, + "grad_norm": 42.82811242750749, + "learning_rate": 4.227354627517968e-06, + "loss": 2.1997, + "step": 20925 + }, + { + "epoch": 1.783516577175488, + "grad_norm": 61.44374375865417, + "learning_rate": 4.226864739496861e-06, + "loss": 2.0333, + "step": 20926 + }, + { + "epoch": 1.7836018068695134, + "grad_norm": 108.20646551838273, + "learning_rate": 4.226374859079234e-06, + "loss": 3.4842, + "step": 20927 + }, + { + "epoch": 1.7836870365635389, + "grad_norm": 66.86010345466539, + "learning_rate": 4.225884986269904e-06, + "loss": 2.4929, + "step": 20928 + }, + { + "epoch": 1.7837722662575641, + "grad_norm": 28.10963119566665, + "learning_rate": 4.2253951210736874e-06, + "loss": 1.9721, + "step": 20929 + }, + { + "epoch": 1.7838574959515894, + "grad_norm": 72.13343392163407, + "learning_rate": 4.224905263495399e-06, + "loss": 2.7699, + "step": 20930 + }, + { + "epoch": 1.7839427256456148, + "grad_norm": 42.56839753528613, + "learning_rate": 4.224415413539861e-06, + "loss": 2.9552, + "step": 20931 + }, + { + "epoch": 1.7840279553396403, + "grad_norm": 31.498053030264487, + "learning_rate": 4.223925571211891e-06, + "loss": 2.391, + "step": 20932 + }, + { + "epoch": 1.7841131850336658, + "grad_norm": 37.3682947059723, + "learning_rate": 4.223435736516304e-06, + "loss": 2.6477, + "step": 20933 + }, + { + "epoch": 1.7841984147276913, + "grad_norm": 37.42119590628015, + "learning_rate": 4.222945909457916e-06, + "loss": 3.1334, + "step": 20934 + }, + { + "epoch": 1.7842836444217165, + "grad_norm": 25.30527067216013, + "learning_rate": 4.2224560900415465e-06, + "loss": 1.7687, + "step": 20935 + }, + { + "epoch": 1.784368874115742, + "grad_norm": 46.54063531400624, + "learning_rate": 4.221966278272015e-06, + "loss": 2.5735, + "step": 20936 + }, + { + "epoch": 1.7844541038097672, + "grad_norm": 27.95423479857158, + "learning_rate": 4.221476474154135e-06, + "loss": 2.8883, + "step": 20937 + }, + { + "epoch": 1.7845393335037927, + "grad_norm": 53.98245526361205, + "learning_rate": 4.220986677692723e-06, + "loss": 2.4941, + "step": 20938 + }, + { + "epoch": 1.7846245631978181, + "grad_norm": 19.38552883113208, + "learning_rate": 4.220496888892597e-06, + "loss": 1.5993, + "step": 20939 + }, + { + "epoch": 1.7847097928918436, + "grad_norm": 38.142465620384186, + "learning_rate": 4.220007107758576e-06, + "loss": 3.2183, + "step": 20940 + }, + { + "epoch": 1.7847950225858689, + "grad_norm": 70.84239223762577, + "learning_rate": 4.219517334295476e-06, + "loss": 2.9695, + "step": 20941 + }, + { + "epoch": 1.7848802522798943, + "grad_norm": 52.71850671151621, + "learning_rate": 4.219027568508111e-06, + "loss": 3.1138, + "step": 20942 + }, + { + "epoch": 1.7849654819739196, + "grad_norm": 26.08245363180718, + "learning_rate": 4.218537810401299e-06, + "loss": 2.2806, + "step": 20943 + }, + { + "epoch": 1.785050711667945, + "grad_norm": 61.55549860261326, + "learning_rate": 4.218048059979859e-06, + "loss": 3.1741, + "step": 20944 + }, + { + "epoch": 1.7851359413619705, + "grad_norm": 20.359465055609, + "learning_rate": 4.217558317248605e-06, + "loss": 1.7413, + "step": 20945 + }, + { + "epoch": 1.785221171055996, + "grad_norm": 70.02969122905444, + "learning_rate": 4.217068582212355e-06, + "loss": 2.8742, + "step": 20946 + }, + { + "epoch": 1.7853064007500214, + "grad_norm": 48.723564060063694, + "learning_rate": 4.216578854875924e-06, + "loss": 2.0332, + "step": 20947 + }, + { + "epoch": 1.7853916304440467, + "grad_norm": 77.52835900115767, + "learning_rate": 4.216089135244126e-06, + "loss": 2.695, + "step": 20948 + }, + { + "epoch": 1.785476860138072, + "grad_norm": 29.573456423004952, + "learning_rate": 4.215599423321783e-06, + "loss": 3.2061, + "step": 20949 + }, + { + "epoch": 1.7855620898320974, + "grad_norm": 50.223114971867204, + "learning_rate": 4.2151097191137064e-06, + "loss": 2.7526, + "step": 20950 + }, + { + "epoch": 1.7856473195261229, + "grad_norm": 50.776258699480245, + "learning_rate": 4.214620022624715e-06, + "loss": 2.5474, + "step": 20951 + }, + { + "epoch": 1.7857325492201483, + "grad_norm": 63.201743810642675, + "learning_rate": 4.214130333859621e-06, + "loss": 3.5609, + "step": 20952 + }, + { + "epoch": 1.7858177789141738, + "grad_norm": 39.59255156624567, + "learning_rate": 4.213640652823247e-06, + "loss": 2.7512, + "step": 20953 + }, + { + "epoch": 1.785903008608199, + "grad_norm": 51.89379489560913, + "learning_rate": 4.213150979520402e-06, + "loss": 2.8479, + "step": 20954 + }, + { + "epoch": 1.7859882383022245, + "grad_norm": 47.768240489814644, + "learning_rate": 4.2126613139559056e-06, + "loss": 2.8481, + "step": 20955 + }, + { + "epoch": 1.7860734679962498, + "grad_norm": 47.94066628184715, + "learning_rate": 4.21217165613457e-06, + "loss": 2.6018, + "step": 20956 + }, + { + "epoch": 1.7861586976902752, + "grad_norm": 35.63453783871091, + "learning_rate": 4.211682006061215e-06, + "loss": 2.7616, + "step": 20957 + }, + { + "epoch": 1.7862439273843007, + "grad_norm": 43.4492636475977, + "learning_rate": 4.211192363740654e-06, + "loss": 3.3925, + "step": 20958 + }, + { + "epoch": 1.7863291570783262, + "grad_norm": 60.03747253757353, + "learning_rate": 4.2107027291777035e-06, + "loss": 2.4319, + "step": 20959 + }, + { + "epoch": 1.7864143867723516, + "grad_norm": 45.827324605855175, + "learning_rate": 4.2102131023771755e-06, + "loss": 3.1789, + "step": 20960 + }, + { + "epoch": 1.786499616466377, + "grad_norm": 68.84384013501847, + "learning_rate": 4.2097234833438906e-06, + "loss": 2.9085, + "step": 20961 + }, + { + "epoch": 1.7865848461604021, + "grad_norm": 52.145298599196195, + "learning_rate": 4.20923387208266e-06, + "loss": 2.8183, + "step": 20962 + }, + { + "epoch": 1.7866700758544276, + "grad_norm": 45.5561383359111, + "learning_rate": 4.2087442685983e-06, + "loss": 2.7998, + "step": 20963 + }, + { + "epoch": 1.786755305548453, + "grad_norm": 38.506686853635664, + "learning_rate": 4.208254672895627e-06, + "loss": 3.5414, + "step": 20964 + }, + { + "epoch": 1.7868405352424785, + "grad_norm": 36.01505767624515, + "learning_rate": 4.207765084979453e-06, + "loss": 2.296, + "step": 20965 + }, + { + "epoch": 1.786925764936504, + "grad_norm": 33.29647457205937, + "learning_rate": 4.207275504854594e-06, + "loss": 2.2717, + "step": 20966 + }, + { + "epoch": 1.7870109946305293, + "grad_norm": 33.17359904266534, + "learning_rate": 4.206785932525867e-06, + "loss": 1.3688, + "step": 20967 + }, + { + "epoch": 1.7870962243245547, + "grad_norm": 72.54259897970131, + "learning_rate": 4.206296367998086e-06, + "loss": 2.6247, + "step": 20968 + }, + { + "epoch": 1.78718145401858, + "grad_norm": 36.293289321845286, + "learning_rate": 4.205806811276062e-06, + "loss": 2.3507, + "step": 20969 + }, + { + "epoch": 1.7872666837126054, + "grad_norm": 47.25206932975164, + "learning_rate": 4.205317262364614e-06, + "loss": 3.3659, + "step": 20970 + }, + { + "epoch": 1.787351913406631, + "grad_norm": 47.25277128972524, + "learning_rate": 4.204827721268556e-06, + "loss": 2.7174, + "step": 20971 + }, + { + "epoch": 1.7874371431006564, + "grad_norm": 46.210748970468494, + "learning_rate": 4.204338187992701e-06, + "loss": 3.1352, + "step": 20972 + }, + { + "epoch": 1.7875223727946816, + "grad_norm": 40.107457071044124, + "learning_rate": 4.203848662541861e-06, + "loss": 2.1672, + "step": 20973 + }, + { + "epoch": 1.787607602488707, + "grad_norm": 41.289622782248756, + "learning_rate": 4.2033591449208556e-06, + "loss": 3.3775, + "step": 20974 + }, + { + "epoch": 1.7876928321827323, + "grad_norm": 55.074594580102094, + "learning_rate": 4.202869635134496e-06, + "loss": 3.1279, + "step": 20975 + }, + { + "epoch": 1.7877780618767578, + "grad_norm": 53.75832818613141, + "learning_rate": 4.202380133187598e-06, + "loss": 1.3397, + "step": 20976 + }, + { + "epoch": 1.7878632915707833, + "grad_norm": 52.84956188876141, + "learning_rate": 4.201890639084973e-06, + "loss": 2.2675, + "step": 20977 + }, + { + "epoch": 1.7879485212648087, + "grad_norm": 25.89046238283056, + "learning_rate": 4.201401152831435e-06, + "loss": 2.388, + "step": 20978 + }, + { + "epoch": 1.7880337509588342, + "grad_norm": 59.51334078282498, + "learning_rate": 4.200911674431802e-06, + "loss": 3.063, + "step": 20979 + }, + { + "epoch": 1.7881189806528595, + "grad_norm": 127.12811063250496, + "learning_rate": 4.2004222038908845e-06, + "loss": 2.9268, + "step": 20980 + }, + { + "epoch": 1.7882042103468847, + "grad_norm": 47.828512367180046, + "learning_rate": 4.199932741213496e-06, + "loss": 2.988, + "step": 20981 + }, + { + "epoch": 1.7882894400409102, + "grad_norm": 51.530506773942726, + "learning_rate": 4.19944328640445e-06, + "loss": 2.6269, + "step": 20982 + }, + { + "epoch": 1.7883746697349356, + "grad_norm": 68.43816532133191, + "learning_rate": 4.198953839468564e-06, + "loss": 3.0206, + "step": 20983 + }, + { + "epoch": 1.788459899428961, + "grad_norm": 32.04288268895104, + "learning_rate": 4.1984644004106485e-06, + "loss": 2.5711, + "step": 20984 + }, + { + "epoch": 1.7885451291229866, + "grad_norm": 23.434652893003047, + "learning_rate": 4.197974969235515e-06, + "loss": 1.615, + "step": 20985 + }, + { + "epoch": 1.7886303588170118, + "grad_norm": 93.90185826873132, + "learning_rate": 4.197485545947979e-06, + "loss": 3.312, + "step": 20986 + }, + { + "epoch": 1.7887155885110373, + "grad_norm": 33.29000211526983, + "learning_rate": 4.196996130552856e-06, + "loss": 2.8189, + "step": 20987 + }, + { + "epoch": 1.7888008182050625, + "grad_norm": 54.585162467328345, + "learning_rate": 4.196506723054956e-06, + "loss": 2.0484, + "step": 20988 + }, + { + "epoch": 1.788886047899088, + "grad_norm": 51.9150353279841, + "learning_rate": 4.196017323459093e-06, + "loss": 3.3217, + "step": 20989 + }, + { + "epoch": 1.7889712775931135, + "grad_norm": 34.82776340920842, + "learning_rate": 4.195527931770079e-06, + "loss": 2.6625, + "step": 20990 + }, + { + "epoch": 1.789056507287139, + "grad_norm": 37.627674349790475, + "learning_rate": 4.19503854799273e-06, + "loss": 1.8716, + "step": 20991 + }, + { + "epoch": 1.7891417369811642, + "grad_norm": 40.49774493619693, + "learning_rate": 4.194549172131856e-06, + "loss": 2.8104, + "step": 20992 + }, + { + "epoch": 1.7892269666751897, + "grad_norm": 60.110458877793, + "learning_rate": 4.194059804192271e-06, + "loss": 2.7448, + "step": 20993 + }, + { + "epoch": 1.789312196369215, + "grad_norm": 66.2905650704757, + "learning_rate": 4.193570444178789e-06, + "loss": 2.4596, + "step": 20994 + }, + { + "epoch": 1.7893974260632404, + "grad_norm": 42.8586045921799, + "learning_rate": 4.193081092096218e-06, + "loss": 3.6672, + "step": 20995 + }, + { + "epoch": 1.7894826557572658, + "grad_norm": 39.17883166674905, + "learning_rate": 4.192591747949377e-06, + "loss": 2.7378, + "step": 20996 + }, + { + "epoch": 1.7895678854512913, + "grad_norm": 68.41535097975715, + "learning_rate": 4.192102411743073e-06, + "loss": 2.6976, + "step": 20997 + }, + { + "epoch": 1.7896531151453168, + "grad_norm": 43.55302020490983, + "learning_rate": 4.191613083482123e-06, + "loss": 3.0287, + "step": 20998 + }, + { + "epoch": 1.789738344839342, + "grad_norm": 39.034618913191586, + "learning_rate": 4.191123763171334e-06, + "loss": 3.1903, + "step": 20999 + }, + { + "epoch": 1.7898235745333673, + "grad_norm": 65.35866761681484, + "learning_rate": 4.190634450815524e-06, + "loss": 2.8259, + "step": 21000 + }, + { + "epoch": 1.7899088042273927, + "grad_norm": 65.81131679457629, + "learning_rate": 4.190145146419502e-06, + "loss": 2.6962, + "step": 21001 + }, + { + "epoch": 1.7899940339214182, + "grad_norm": 50.59872431442053, + "learning_rate": 4.189655849988081e-06, + "loss": 2.2177, + "step": 21002 + }, + { + "epoch": 1.7900792636154437, + "grad_norm": 74.37617914960306, + "learning_rate": 4.189166561526071e-06, + "loss": 2.5997, + "step": 21003 + }, + { + "epoch": 1.7901644933094691, + "grad_norm": 65.28286037148061, + "learning_rate": 4.188677281038288e-06, + "loss": 3.0685, + "step": 21004 + }, + { + "epoch": 1.7902497230034944, + "grad_norm": 38.0836833436134, + "learning_rate": 4.188188008529539e-06, + "loss": 2.9673, + "step": 21005 + }, + { + "epoch": 1.7903349526975199, + "grad_norm": 34.21733410791913, + "learning_rate": 4.187698744004641e-06, + "loss": 2.9991, + "step": 21006 + }, + { + "epoch": 1.790420182391545, + "grad_norm": 75.82037959705511, + "learning_rate": 4.187209487468401e-06, + "loss": 3.4066, + "step": 21007 + }, + { + "epoch": 1.7905054120855706, + "grad_norm": 70.73519854685779, + "learning_rate": 4.186720238925634e-06, + "loss": 2.5395, + "step": 21008 + }, + { + "epoch": 1.790590641779596, + "grad_norm": 141.4279590818473, + "learning_rate": 4.186230998381149e-06, + "loss": 3.1735, + "step": 21009 + }, + { + "epoch": 1.7906758714736215, + "grad_norm": 55.83838119634453, + "learning_rate": 4.1857417658397605e-06, + "loss": 2.7299, + "step": 21010 + }, + { + "epoch": 1.790761101167647, + "grad_norm": 38.114939856059195, + "learning_rate": 4.185252541306277e-06, + "loss": 3.1433, + "step": 21011 + }, + { + "epoch": 1.7908463308616722, + "grad_norm": 68.2205806094418, + "learning_rate": 4.184763324785511e-06, + "loss": 2.2373, + "step": 21012 + }, + { + "epoch": 1.7909315605556975, + "grad_norm": 64.26666867189338, + "learning_rate": 4.184274116282274e-06, + "loss": 3.3408, + "step": 21013 + }, + { + "epoch": 1.791016790249723, + "grad_norm": 47.82051324916272, + "learning_rate": 4.183784915801378e-06, + "loss": 4.0232, + "step": 21014 + }, + { + "epoch": 1.7911020199437484, + "grad_norm": 89.69966677869114, + "learning_rate": 4.1832957233476326e-06, + "loss": 2.6475, + "step": 21015 + }, + { + "epoch": 1.7911872496377739, + "grad_norm": 28.558964345133166, + "learning_rate": 4.1828065389258475e-06, + "loss": 2.1748, + "step": 21016 + }, + { + "epoch": 1.7912724793317993, + "grad_norm": 58.53722238774809, + "learning_rate": 4.1823173625408355e-06, + "loss": 3.3793, + "step": 21017 + }, + { + "epoch": 1.7913577090258246, + "grad_norm": 63.507378268071406, + "learning_rate": 4.181828194197409e-06, + "loss": 3.3028, + "step": 21018 + }, + { + "epoch": 1.7914429387198498, + "grad_norm": 42.719885416100986, + "learning_rate": 4.181339033900378e-06, + "loss": 3.6243, + "step": 21019 + }, + { + "epoch": 1.7915281684138753, + "grad_norm": 27.739144280863083, + "learning_rate": 4.1808498816545505e-06, + "loss": 2.1803, + "step": 21020 + }, + { + "epoch": 1.7916133981079008, + "grad_norm": 52.25914729101745, + "learning_rate": 4.180360737464737e-06, + "loss": 2.8635, + "step": 21021 + }, + { + "epoch": 1.7916986278019262, + "grad_norm": 37.29125003359552, + "learning_rate": 4.179871601335753e-06, + "loss": 2.6077, + "step": 21022 + }, + { + "epoch": 1.7917838574959517, + "grad_norm": 34.57629451326991, + "learning_rate": 4.179382473272406e-06, + "loss": 2.1805, + "step": 21023 + }, + { + "epoch": 1.791869087189977, + "grad_norm": 31.529984640605285, + "learning_rate": 4.1788933532795055e-06, + "loss": 2.2176, + "step": 21024 + }, + { + "epoch": 1.7919543168840024, + "grad_norm": 36.0274739441762, + "learning_rate": 4.178404241361861e-06, + "loss": 3.3853, + "step": 21025 + }, + { + "epoch": 1.7920395465780277, + "grad_norm": 40.48704289056551, + "learning_rate": 4.177915137524286e-06, + "loss": 2.8333, + "step": 21026 + }, + { + "epoch": 1.7921247762720531, + "grad_norm": 43.23482528551127, + "learning_rate": 4.177426041771589e-06, + "loss": 2.9256, + "step": 21027 + }, + { + "epoch": 1.7922100059660786, + "grad_norm": 79.07675263289616, + "learning_rate": 4.17693695410858e-06, + "loss": 3.4114, + "step": 21028 + }, + { + "epoch": 1.792295235660104, + "grad_norm": 46.94940282521639, + "learning_rate": 4.1764478745400675e-06, + "loss": 2.9348, + "step": 21029 + }, + { + "epoch": 1.7923804653541295, + "grad_norm": 30.03577837526889, + "learning_rate": 4.1759588030708645e-06, + "loss": 2.6363, + "step": 21030 + }, + { + "epoch": 1.7924656950481548, + "grad_norm": 51.45965142146678, + "learning_rate": 4.17546973970578e-06, + "loss": 2.9979, + "step": 21031 + }, + { + "epoch": 1.79255092474218, + "grad_norm": 46.51606235272062, + "learning_rate": 4.174980684449621e-06, + "loss": 1.725, + "step": 21032 + }, + { + "epoch": 1.7926361544362055, + "grad_norm": 61.88923833468866, + "learning_rate": 4.174491637307199e-06, + "loss": 2.9068, + "step": 21033 + }, + { + "epoch": 1.792721384130231, + "grad_norm": 75.77085555703678, + "learning_rate": 4.174002598283325e-06, + "loss": 3.3904, + "step": 21034 + }, + { + "epoch": 1.7928066138242564, + "grad_norm": 48.315752530016574, + "learning_rate": 4.173513567382808e-06, + "loss": 3.0051, + "step": 21035 + }, + { + "epoch": 1.792891843518282, + "grad_norm": 43.43675863846155, + "learning_rate": 4.173024544610455e-06, + "loss": 2.2185, + "step": 21036 + }, + { + "epoch": 1.7929770732123071, + "grad_norm": 30.00705919125806, + "learning_rate": 4.172535529971078e-06, + "loss": 2.4951, + "step": 21037 + }, + { + "epoch": 1.7930623029063326, + "grad_norm": 72.93092002269343, + "learning_rate": 4.172046523469484e-06, + "loss": 3.5339, + "step": 21038 + }, + { + "epoch": 1.7931475326003579, + "grad_norm": 77.09200418948038, + "learning_rate": 4.171557525110485e-06, + "loss": 3.5641, + "step": 21039 + }, + { + "epoch": 1.7932327622943833, + "grad_norm": 54.670147726270564, + "learning_rate": 4.171068534898888e-06, + "loss": 2.5017, + "step": 21040 + }, + { + "epoch": 1.7933179919884088, + "grad_norm": 35.6138626973243, + "learning_rate": 4.170579552839503e-06, + "loss": 2.9223, + "step": 21041 + }, + { + "epoch": 1.7934032216824343, + "grad_norm": 55.34795855148851, + "learning_rate": 4.170090578937136e-06, + "loss": 1.4784, + "step": 21042 + }, + { + "epoch": 1.7934884513764595, + "grad_norm": 31.66634399224995, + "learning_rate": 4.169601613196601e-06, + "loss": 2.6489, + "step": 21043 + }, + { + "epoch": 1.793573681070485, + "grad_norm": 17.171606410391277, + "learning_rate": 4.169112655622703e-06, + "loss": 1.3553, + "step": 21044 + }, + { + "epoch": 1.7936589107645102, + "grad_norm": 59.047361712395904, + "learning_rate": 4.1686237062202536e-06, + "loss": 2.9459, + "step": 21045 + }, + { + "epoch": 1.7937441404585357, + "grad_norm": 55.58520073619232, + "learning_rate": 4.168134764994057e-06, + "loss": 2.9563, + "step": 21046 + }, + { + "epoch": 1.7938293701525612, + "grad_norm": 65.48666472782503, + "learning_rate": 4.167645831948926e-06, + "loss": 2.488, + "step": 21047 + }, + { + "epoch": 1.7939145998465866, + "grad_norm": 85.52773512622343, + "learning_rate": 4.167156907089668e-06, + "loss": 3.0317, + "step": 21048 + }, + { + "epoch": 1.793999829540612, + "grad_norm": 55.78662579789421, + "learning_rate": 4.16666799042109e-06, + "loss": 2.415, + "step": 21049 + }, + { + "epoch": 1.7940850592346373, + "grad_norm": 48.188340516694026, + "learning_rate": 4.166179081948002e-06, + "loss": 2.1287, + "step": 21050 + }, + { + "epoch": 1.7941702889286626, + "grad_norm": 31.022950262429227, + "learning_rate": 4.16569018167521e-06, + "loss": 2.0269, + "step": 21051 + }, + { + "epoch": 1.794255518622688, + "grad_norm": 41.60075540931507, + "learning_rate": 4.165201289607524e-06, + "loss": 2.833, + "step": 21052 + }, + { + "epoch": 1.7943407483167135, + "grad_norm": 33.295426685895336, + "learning_rate": 4.164712405749753e-06, + "loss": 2.5729, + "step": 21053 + }, + { + "epoch": 1.794425978010739, + "grad_norm": 87.88996159027538, + "learning_rate": 4.164223530106702e-06, + "loss": 3.3893, + "step": 21054 + }, + { + "epoch": 1.7945112077047645, + "grad_norm": 43.4516125635832, + "learning_rate": 4.1637346626831806e-06, + "loss": 2.898, + "step": 21055 + }, + { + "epoch": 1.7945964373987897, + "grad_norm": 149.36559996588346, + "learning_rate": 4.163245803483997e-06, + "loss": 2.4299, + "step": 21056 + }, + { + "epoch": 1.7946816670928152, + "grad_norm": 26.701533769191926, + "learning_rate": 4.162756952513959e-06, + "loss": 2.0983, + "step": 21057 + }, + { + "epoch": 1.7947668967868404, + "grad_norm": 41.51731491844117, + "learning_rate": 4.162268109777873e-06, + "loss": 3.0273, + "step": 21058 + }, + { + "epoch": 1.794852126480866, + "grad_norm": 124.73337365384634, + "learning_rate": 4.161779275280547e-06, + "loss": 3.0174, + "step": 21059 + }, + { + "epoch": 1.7949373561748914, + "grad_norm": 36.53361485741533, + "learning_rate": 4.161290449026788e-06, + "loss": 2.6816, + "step": 21060 + }, + { + "epoch": 1.7950225858689168, + "grad_norm": 46.58448570829214, + "learning_rate": 4.160801631021406e-06, + "loss": 2.9867, + "step": 21061 + }, + { + "epoch": 1.795107815562942, + "grad_norm": 47.544373577347045, + "learning_rate": 4.160312821269207e-06, + "loss": 2.2779, + "step": 21062 + }, + { + "epoch": 1.7951930452569675, + "grad_norm": 25.076582718276807, + "learning_rate": 4.159824019774995e-06, + "loss": 1.9821, + "step": 21063 + }, + { + "epoch": 1.7952782749509928, + "grad_norm": 49.25278794485418, + "learning_rate": 4.159335226543581e-06, + "loss": 3.0086, + "step": 21064 + }, + { + "epoch": 1.7953635046450183, + "grad_norm": 66.69446052146156, + "learning_rate": 4.158846441579772e-06, + "loss": 2.6657, + "step": 21065 + }, + { + "epoch": 1.7954487343390437, + "grad_norm": 40.10886616220836, + "learning_rate": 4.158357664888374e-06, + "loss": 2.7516, + "step": 21066 + }, + { + "epoch": 1.7955339640330692, + "grad_norm": 133.58220368051062, + "learning_rate": 4.157868896474193e-06, + "loss": 3.1993, + "step": 21067 + }, + { + "epoch": 1.7956191937270947, + "grad_norm": 46.73079577175278, + "learning_rate": 4.1573801363420354e-06, + "loss": 3.3214, + "step": 21068 + }, + { + "epoch": 1.79570442342112, + "grad_norm": 79.39671253726493, + "learning_rate": 4.156891384496712e-06, + "loss": 2.708, + "step": 21069 + }, + { + "epoch": 1.7957896531151452, + "grad_norm": 64.11429037659397, + "learning_rate": 4.156402640943026e-06, + "loss": 3.0725, + "step": 21070 + }, + { + "epoch": 1.7958748828091706, + "grad_norm": 45.83595761542576, + "learning_rate": 4.155913905685784e-06, + "loss": 2.7803, + "step": 21071 + }, + { + "epoch": 1.795960112503196, + "grad_norm": 81.98334552358611, + "learning_rate": 4.155425178729793e-06, + "loss": 3.6238, + "step": 21072 + }, + { + "epoch": 1.7960453421972216, + "grad_norm": 47.27203237126711, + "learning_rate": 4.154936460079861e-06, + "loss": 2.6186, + "step": 21073 + }, + { + "epoch": 1.796130571891247, + "grad_norm": 28.740589011963905, + "learning_rate": 4.1544477497407946e-06, + "loss": 1.8539, + "step": 21074 + }, + { + "epoch": 1.7962158015852723, + "grad_norm": 51.66304121705026, + "learning_rate": 4.153959047717396e-06, + "loss": 2.3902, + "step": 21075 + }, + { + "epoch": 1.7963010312792977, + "grad_norm": 16.66821361128668, + "learning_rate": 4.153470354014474e-06, + "loss": 0.9447, + "step": 21076 + }, + { + "epoch": 1.796386260973323, + "grad_norm": 65.92565722422412, + "learning_rate": 4.152981668636836e-06, + "loss": 2.9311, + "step": 21077 + }, + { + "epoch": 1.7964714906673485, + "grad_norm": 42.71203191139099, + "learning_rate": 4.152492991589287e-06, + "loss": 3.1336, + "step": 21078 + }, + { + "epoch": 1.796556720361374, + "grad_norm": 90.11638085293565, + "learning_rate": 4.152004322876631e-06, + "loss": 3.7662, + "step": 21079 + }, + { + "epoch": 1.7966419500553994, + "grad_norm": 90.73181137675004, + "learning_rate": 4.151515662503677e-06, + "loss": 2.8174, + "step": 21080 + }, + { + "epoch": 1.7967271797494249, + "grad_norm": 51.05823370146332, + "learning_rate": 4.151027010475228e-06, + "loss": 2.9706, + "step": 21081 + }, + { + "epoch": 1.79681240944345, + "grad_norm": 41.891599366974354, + "learning_rate": 4.150538366796092e-06, + "loss": 2.7697, + "step": 21082 + }, + { + "epoch": 1.7968976391374754, + "grad_norm": 52.17015654058387, + "learning_rate": 4.150049731471073e-06, + "loss": 3.426, + "step": 21083 + }, + { + "epoch": 1.7969828688315008, + "grad_norm": 58.300904459670974, + "learning_rate": 4.149561104504979e-06, + "loss": 2.8686, + "step": 21084 + }, + { + "epoch": 1.7970680985255263, + "grad_norm": 39.974068766731484, + "learning_rate": 4.14907248590261e-06, + "loss": 2.8499, + "step": 21085 + }, + { + "epoch": 1.7971533282195518, + "grad_norm": 76.59335399943805, + "learning_rate": 4.148583875668778e-06, + "loss": 2.8437, + "step": 21086 + }, + { + "epoch": 1.7972385579135772, + "grad_norm": 48.92482953793271, + "learning_rate": 4.148095273808284e-06, + "loss": 3.5809, + "step": 21087 + }, + { + "epoch": 1.7973237876076025, + "grad_norm": 54.89574658207895, + "learning_rate": 4.147606680325935e-06, + "loss": 2.2362, + "step": 21088 + }, + { + "epoch": 1.797409017301628, + "grad_norm": 38.52005895128705, + "learning_rate": 4.1471180952265345e-06, + "loss": 2.9537, + "step": 21089 + }, + { + "epoch": 1.7974942469956532, + "grad_norm": 59.030365389270685, + "learning_rate": 4.14662951851489e-06, + "loss": 1.4316, + "step": 21090 + }, + { + "epoch": 1.7975794766896787, + "grad_norm": 76.26024596908789, + "learning_rate": 4.146140950195804e-06, + "loss": 3.0991, + "step": 21091 + }, + { + "epoch": 1.7976647063837041, + "grad_norm": 28.459522094335394, + "learning_rate": 4.145652390274084e-06, + "loss": 2.1741, + "step": 21092 + }, + { + "epoch": 1.7977499360777296, + "grad_norm": 33.63814008755558, + "learning_rate": 4.145163838754532e-06, + "loss": 2.315, + "step": 21093 + }, + { + "epoch": 1.7978351657717548, + "grad_norm": 25.503722438638594, + "learning_rate": 4.144675295641955e-06, + "loss": 2.9475, + "step": 21094 + }, + { + "epoch": 1.7979203954657803, + "grad_norm": 41.07536297934548, + "learning_rate": 4.144186760941155e-06, + "loss": 3.1104, + "step": 21095 + }, + { + "epoch": 1.7980056251598056, + "grad_norm": 43.92084094783176, + "learning_rate": 4.14369823465694e-06, + "loss": 2.1021, + "step": 21096 + }, + { + "epoch": 1.798090854853831, + "grad_norm": 32.93091611202924, + "learning_rate": 4.143209716794111e-06, + "loss": 2.7122, + "step": 21097 + }, + { + "epoch": 1.7981760845478565, + "grad_norm": 52.730167307851836, + "learning_rate": 4.142721207357474e-06, + "loss": 3.054, + "step": 21098 + }, + { + "epoch": 1.798261314241882, + "grad_norm": 35.202388372683544, + "learning_rate": 4.142232706351834e-06, + "loss": 2.8029, + "step": 21099 + }, + { + "epoch": 1.7983465439359074, + "grad_norm": 75.02766284343986, + "learning_rate": 4.141744213781995e-06, + "loss": 3.8756, + "step": 21100 + }, + { + "epoch": 1.7984317736299327, + "grad_norm": 77.85070278898162, + "learning_rate": 4.14125572965276e-06, + "loss": 3.9276, + "step": 21101 + }, + { + "epoch": 1.798517003323958, + "grad_norm": 42.593652161990285, + "learning_rate": 4.140767253968933e-06, + "loss": 2.512, + "step": 21102 + }, + { + "epoch": 1.7986022330179834, + "grad_norm": 57.72491242150831, + "learning_rate": 4.1402787867353185e-06, + "loss": 2.7776, + "step": 21103 + }, + { + "epoch": 1.7986874627120089, + "grad_norm": 40.84879875628933, + "learning_rate": 4.139790327956723e-06, + "loss": 3.8295, + "step": 21104 + }, + { + "epoch": 1.7987726924060343, + "grad_norm": 110.11926523312452, + "learning_rate": 4.139301877637946e-06, + "loss": 4.0135, + "step": 21105 + }, + { + "epoch": 1.7988579221000598, + "grad_norm": 51.39896415324945, + "learning_rate": 4.138813435783792e-06, + "loss": 3.0285, + "step": 21106 + }, + { + "epoch": 1.798943151794085, + "grad_norm": 91.54102164454956, + "learning_rate": 4.1383250023990666e-06, + "loss": 3.2625, + "step": 21107 + }, + { + "epoch": 1.7990283814881105, + "grad_norm": 42.56887975635173, + "learning_rate": 4.137836577488573e-06, + "loss": 2.8351, + "step": 21108 + }, + { + "epoch": 1.7991136111821358, + "grad_norm": 60.57474506866041, + "learning_rate": 4.137348161057114e-06, + "loss": 3.2893, + "step": 21109 + }, + { + "epoch": 1.7991988408761612, + "grad_norm": 30.051957819588704, + "learning_rate": 4.1368597531094936e-06, + "loss": 1.8924, + "step": 21110 + }, + { + "epoch": 1.7992840705701867, + "grad_norm": 56.54713803631147, + "learning_rate": 4.1363713536505125e-06, + "loss": 2.9185, + "step": 21111 + }, + { + "epoch": 1.7993693002642122, + "grad_norm": 45.581632299307984, + "learning_rate": 4.135882962684977e-06, + "loss": 2.5999, + "step": 21112 + }, + { + "epoch": 1.7994545299582374, + "grad_norm": 34.501905383566, + "learning_rate": 4.1353945802176905e-06, + "loss": 2.4384, + "step": 21113 + }, + { + "epoch": 1.7995397596522629, + "grad_norm": 96.0060444105518, + "learning_rate": 4.134906206253453e-06, + "loss": 2.9626, + "step": 21114 + }, + { + "epoch": 1.7996249893462881, + "grad_norm": 30.180895628944533, + "learning_rate": 4.134417840797068e-06, + "loss": 1.4958, + "step": 21115 + }, + { + "epoch": 1.7997102190403136, + "grad_norm": 56.353974257593414, + "learning_rate": 4.133929483853343e-06, + "loss": 3.1805, + "step": 21116 + }, + { + "epoch": 1.799795448734339, + "grad_norm": 25.776962587150976, + "learning_rate": 4.1334411354270765e-06, + "loss": 2.2999, + "step": 21117 + }, + { + "epoch": 1.7998806784283645, + "grad_norm": 23.187007577480113, + "learning_rate": 4.132952795523071e-06, + "loss": 2.1587, + "step": 21118 + }, + { + "epoch": 1.79996590812239, + "grad_norm": 48.18755074631734, + "learning_rate": 4.132464464146129e-06, + "loss": 3.3488, + "step": 21119 + }, + { + "epoch": 1.8000511378164152, + "grad_norm": 31.466324209725986, + "learning_rate": 4.131976141301057e-06, + "loss": 2.7024, + "step": 21120 + }, + { + "epoch": 1.8001363675104405, + "grad_norm": 28.641969587697893, + "learning_rate": 4.131487826992654e-06, + "loss": 2.1536, + "step": 21121 + }, + { + "epoch": 1.800221597204466, + "grad_norm": 129.0307329945622, + "learning_rate": 4.130999521225723e-06, + "loss": 2.8764, + "step": 21122 + }, + { + "epoch": 1.8003068268984914, + "grad_norm": 69.71652427183722, + "learning_rate": 4.130511224005066e-06, + "loss": 2.4162, + "step": 21123 + }, + { + "epoch": 1.8003920565925169, + "grad_norm": 87.2075172472787, + "learning_rate": 4.1300229353354845e-06, + "loss": 2.9716, + "step": 21124 + }, + { + "epoch": 1.8004772862865424, + "grad_norm": 40.555892539125544, + "learning_rate": 4.129534655221784e-06, + "loss": 3.9548, + "step": 21125 + }, + { + "epoch": 1.8005625159805676, + "grad_norm": 61.89260651175902, + "learning_rate": 4.129046383668763e-06, + "loss": 2.9296, + "step": 21126 + }, + { + "epoch": 1.800647745674593, + "grad_norm": 34.16849648414191, + "learning_rate": 4.128558120681226e-06, + "loss": 2.5834, + "step": 21127 + }, + { + "epoch": 1.8007329753686183, + "grad_norm": 38.52587122630976, + "learning_rate": 4.12806986626397e-06, + "loss": 2.849, + "step": 21128 + }, + { + "epoch": 1.8008182050626438, + "grad_norm": 30.763210415767276, + "learning_rate": 4.1275816204218045e-06, + "loss": 1.6537, + "step": 21129 + }, + { + "epoch": 1.8009034347566693, + "grad_norm": 62.91680910094187, + "learning_rate": 4.127093383159525e-06, + "loss": 3.0598, + "step": 21130 + }, + { + "epoch": 1.8009886644506947, + "grad_norm": 43.368578924091594, + "learning_rate": 4.126605154481937e-06, + "loss": 2.6028, + "step": 21131 + }, + { + "epoch": 1.80107389414472, + "grad_norm": 45.77273297076193, + "learning_rate": 4.1261169343938364e-06, + "loss": 3.703, + "step": 21132 + }, + { + "epoch": 1.8011591238387454, + "grad_norm": 80.27299354650391, + "learning_rate": 4.125628722900032e-06, + "loss": 3.0047, + "step": 21133 + }, + { + "epoch": 1.8012443535327707, + "grad_norm": 86.6423233487936, + "learning_rate": 4.12514052000532e-06, + "loss": 3.2207, + "step": 21134 + }, + { + "epoch": 1.8013295832267961, + "grad_norm": 56.38800211324153, + "learning_rate": 4.1246523257145064e-06, + "loss": 2.4273, + "step": 21135 + }, + { + "epoch": 1.8014148129208216, + "grad_norm": 36.53490115058606, + "learning_rate": 4.124164140032385e-06, + "loss": 2.2466, + "step": 21136 + }, + { + "epoch": 1.801500042614847, + "grad_norm": 52.5021853282771, + "learning_rate": 4.123675962963765e-06, + "loss": 2.132, + "step": 21137 + }, + { + "epoch": 1.8015852723088726, + "grad_norm": 50.30437433530637, + "learning_rate": 4.123187794513441e-06, + "loss": 2.4244, + "step": 21138 + }, + { + "epoch": 1.8016705020028978, + "grad_norm": 49.844849211976815, + "learning_rate": 4.122699634686218e-06, + "loss": 3.2548, + "step": 21139 + }, + { + "epoch": 1.801755731696923, + "grad_norm": 48.52099122463813, + "learning_rate": 4.122211483486895e-06, + "loss": 3.5191, + "step": 21140 + }, + { + "epoch": 1.8018409613909485, + "grad_norm": 29.309420628973285, + "learning_rate": 4.121723340920274e-06, + "loss": 2.203, + "step": 21141 + }, + { + "epoch": 1.801926191084974, + "grad_norm": 84.85972842137608, + "learning_rate": 4.1212352069911535e-06, + "loss": 4.0158, + "step": 21142 + }, + { + "epoch": 1.8020114207789995, + "grad_norm": 36.574086785304765, + "learning_rate": 4.120747081704337e-06, + "loss": 2.6154, + "step": 21143 + }, + { + "epoch": 1.802096650473025, + "grad_norm": 37.41893856080266, + "learning_rate": 4.120258965064623e-06, + "loss": 2.4812, + "step": 21144 + }, + { + "epoch": 1.8021818801670502, + "grad_norm": 25.670299683146613, + "learning_rate": 4.119770857076812e-06, + "loss": 1.7516, + "step": 21145 + }, + { + "epoch": 1.8022671098610756, + "grad_norm": 57.20486909610113, + "learning_rate": 4.1192827577457055e-06, + "loss": 2.4944, + "step": 21146 + }, + { + "epoch": 1.8023523395551009, + "grad_norm": 64.87580116154251, + "learning_rate": 4.118794667076103e-06, + "loss": 3.1229, + "step": 21147 + }, + { + "epoch": 1.8024375692491263, + "grad_norm": 42.800323990076855, + "learning_rate": 4.118306585072805e-06, + "loss": 2.6319, + "step": 21148 + }, + { + "epoch": 1.8025227989431518, + "grad_norm": 58.14460986158806, + "learning_rate": 4.1178185117406106e-06, + "loss": 3.0615, + "step": 21149 + }, + { + "epoch": 1.8026080286371773, + "grad_norm": 49.53668067420533, + "learning_rate": 4.117330447084321e-06, + "loss": 2.3648, + "step": 21150 + }, + { + "epoch": 1.8026932583312028, + "grad_norm": 50.094277723156196, + "learning_rate": 4.1168423911087364e-06, + "loss": 2.9157, + "step": 21151 + }, + { + "epoch": 1.802778488025228, + "grad_norm": 61.95913550661962, + "learning_rate": 4.116354343818655e-06, + "loss": 2.9597, + "step": 21152 + }, + { + "epoch": 1.8028637177192532, + "grad_norm": 41.54884842246228, + "learning_rate": 4.115866305218878e-06, + "loss": 2.7321, + "step": 21153 + }, + { + "epoch": 1.8029489474132787, + "grad_norm": 54.63041360252211, + "learning_rate": 4.115378275314203e-06, + "loss": 3.3076, + "step": 21154 + }, + { + "epoch": 1.8030341771073042, + "grad_norm": 53.88489712421654, + "learning_rate": 4.114890254109434e-06, + "loss": 2.1289, + "step": 21155 + }, + { + "epoch": 1.8031194068013296, + "grad_norm": 51.43681766703751, + "learning_rate": 4.114402241609366e-06, + "loss": 2.3525, + "step": 21156 + }, + { + "epoch": 1.8032046364953551, + "grad_norm": 74.19553316458064, + "learning_rate": 4.113914237818801e-06, + "loss": 2.5595, + "step": 21157 + }, + { + "epoch": 1.8032898661893804, + "grad_norm": 41.87281463928971, + "learning_rate": 4.113426242742536e-06, + "loss": 2.6823, + "step": 21158 + }, + { + "epoch": 1.8033750958834058, + "grad_norm": 34.34037023961497, + "learning_rate": 4.112938256385373e-06, + "loss": 3.4561, + "step": 21159 + }, + { + "epoch": 1.803460325577431, + "grad_norm": 75.04226048582527, + "learning_rate": 4.11245027875211e-06, + "loss": 3.8311, + "step": 21160 + }, + { + "epoch": 1.8035455552714565, + "grad_norm": 54.521148234423336, + "learning_rate": 4.111962309847544e-06, + "loss": 2.617, + "step": 21161 + }, + { + "epoch": 1.803630784965482, + "grad_norm": 81.65478955840275, + "learning_rate": 4.111474349676476e-06, + "loss": 3.7645, + "step": 21162 + }, + { + "epoch": 1.8037160146595075, + "grad_norm": 51.50101278436955, + "learning_rate": 4.110986398243705e-06, + "loss": 2.3894, + "step": 21163 + }, + { + "epoch": 1.8038012443535327, + "grad_norm": 35.78909601566254, + "learning_rate": 4.1104984555540306e-06, + "loss": 2.9194, + "step": 21164 + }, + { + "epoch": 1.8038864740475582, + "grad_norm": 30.569089517530468, + "learning_rate": 4.11001052161225e-06, + "loss": 1.7073, + "step": 21165 + }, + { + "epoch": 1.8039717037415834, + "grad_norm": 28.346032514257043, + "learning_rate": 4.109522596423162e-06, + "loss": 1.7615, + "step": 21166 + }, + { + "epoch": 1.804056933435609, + "grad_norm": 56.658213558924025, + "learning_rate": 4.109034679991565e-06, + "loss": 2.6955, + "step": 21167 + }, + { + "epoch": 1.8041421631296344, + "grad_norm": 39.901852078884474, + "learning_rate": 4.108546772322259e-06, + "loss": 2.4076, + "step": 21168 + }, + { + "epoch": 1.8042273928236598, + "grad_norm": 46.23762365071803, + "learning_rate": 4.10805887342004e-06, + "loss": 3.3611, + "step": 21169 + }, + { + "epoch": 1.8043126225176853, + "grad_norm": 39.12948783373055, + "learning_rate": 4.107570983289709e-06, + "loss": 3.2305, + "step": 21170 + }, + { + "epoch": 1.8043978522117106, + "grad_norm": 255.51451873403641, + "learning_rate": 4.10708310193606e-06, + "loss": 3.7551, + "step": 21171 + }, + { + "epoch": 1.8044830819057358, + "grad_norm": 106.21735495133639, + "learning_rate": 4.1065952293638965e-06, + "loss": 3.5675, + "step": 21172 + }, + { + "epoch": 1.8045683115997613, + "grad_norm": 36.195534776686216, + "learning_rate": 4.1061073655780135e-06, + "loss": 2.8946, + "step": 21173 + }, + { + "epoch": 1.8046535412937867, + "grad_norm": 32.510937348404326, + "learning_rate": 4.10561951058321e-06, + "loss": 2.1561, + "step": 21174 + }, + { + "epoch": 1.8047387709878122, + "grad_norm": 47.538743286061916, + "learning_rate": 4.105131664384281e-06, + "loss": 2.7674, + "step": 21175 + }, + { + "epoch": 1.8048240006818377, + "grad_norm": 31.98251208906579, + "learning_rate": 4.1046438269860285e-06, + "loss": 1.9077, + "step": 21176 + }, + { + "epoch": 1.804909230375863, + "grad_norm": 39.96322843400345, + "learning_rate": 4.104155998393246e-06, + "loss": 2.9548, + "step": 21177 + }, + { + "epoch": 1.8049944600698884, + "grad_norm": 84.75987964983565, + "learning_rate": 4.103668178610736e-06, + "loss": 3.0898, + "step": 21178 + }, + { + "epoch": 1.8050796897639136, + "grad_norm": 54.07405421565745, + "learning_rate": 4.1031803676432914e-06, + "loss": 3.5661, + "step": 21179 + }, + { + "epoch": 1.805164919457939, + "grad_norm": 66.04108040060298, + "learning_rate": 4.102692565495713e-06, + "loss": 3.2517, + "step": 21180 + }, + { + "epoch": 1.8052501491519646, + "grad_norm": 31.88081799648985, + "learning_rate": 4.102204772172795e-06, + "loss": 1.7613, + "step": 21181 + }, + { + "epoch": 1.80533537884599, + "grad_norm": 36.07883643212346, + "learning_rate": 4.101716987679338e-06, + "loss": 3.1009, + "step": 21182 + }, + { + "epoch": 1.8054206085400153, + "grad_norm": 47.02905878633885, + "learning_rate": 4.101229212020136e-06, + "loss": 2.0783, + "step": 21183 + }, + { + "epoch": 1.8055058382340408, + "grad_norm": 66.81830880513817, + "learning_rate": 4.1007414451999886e-06, + "loss": 3.2223, + "step": 21184 + }, + { + "epoch": 1.805591067928066, + "grad_norm": 49.46469408804639, + "learning_rate": 4.100253687223691e-06, + "loss": 3.0051, + "step": 21185 + }, + { + "epoch": 1.8056762976220915, + "grad_norm": 42.38470982319135, + "learning_rate": 4.099765938096042e-06, + "loss": 3.1419, + "step": 21186 + }, + { + "epoch": 1.805761527316117, + "grad_norm": 16.785820951220135, + "learning_rate": 4.099278197821836e-06, + "loss": 1.1343, + "step": 21187 + }, + { + "epoch": 1.8058467570101424, + "grad_norm": 42.8774256555216, + "learning_rate": 4.098790466405871e-06, + "loss": 2.7825, + "step": 21188 + }, + { + "epoch": 1.8059319867041679, + "grad_norm": 47.244505724630436, + "learning_rate": 4.098302743852944e-06, + "loss": 3.5196, + "step": 21189 + }, + { + "epoch": 1.8060172163981931, + "grad_norm": 82.13682521050681, + "learning_rate": 4.097815030167852e-06, + "loss": 3.1314, + "step": 21190 + }, + { + "epoch": 1.8061024460922184, + "grad_norm": 45.70582084134271, + "learning_rate": 4.09732732535539e-06, + "loss": 2.1554, + "step": 21191 + }, + { + "epoch": 1.8061876757862438, + "grad_norm": 39.0340812094983, + "learning_rate": 4.096839629420355e-06, + "loss": 2.0235, + "step": 21192 + }, + { + "epoch": 1.8062729054802693, + "grad_norm": 26.95498442657129, + "learning_rate": 4.096351942367543e-06, + "loss": 2.0282, + "step": 21193 + }, + { + "epoch": 1.8063581351742948, + "grad_norm": 49.95669170034825, + "learning_rate": 4.095864264201751e-06, + "loss": 2.3827, + "step": 21194 + }, + { + "epoch": 1.8064433648683202, + "grad_norm": 72.22330616490072, + "learning_rate": 4.095376594927775e-06, + "loss": 2.7606, + "step": 21195 + }, + { + "epoch": 1.8065285945623455, + "grad_norm": 94.76841418915855, + "learning_rate": 4.094888934550412e-06, + "loss": 3.9206, + "step": 21196 + }, + { + "epoch": 1.806613824256371, + "grad_norm": 32.06522263152586, + "learning_rate": 4.094401283074453e-06, + "loss": 2.2846, + "step": 21197 + }, + { + "epoch": 1.8066990539503962, + "grad_norm": 40.73782865089721, + "learning_rate": 4.093913640504701e-06, + "loss": 2.6147, + "step": 21198 + }, + { + "epoch": 1.8067842836444217, + "grad_norm": 43.28630913776184, + "learning_rate": 4.093426006845946e-06, + "loss": 2.6476, + "step": 21199 + }, + { + "epoch": 1.8068695133384471, + "grad_norm": 42.28494303513749, + "learning_rate": 4.092938382102988e-06, + "loss": 1.2726, + "step": 21200 + }, + { + "epoch": 1.8069547430324726, + "grad_norm": 45.95524235739577, + "learning_rate": 4.092450766280618e-06, + "loss": 3.0853, + "step": 21201 + }, + { + "epoch": 1.807039972726498, + "grad_norm": 42.29120783594351, + "learning_rate": 4.091963159383637e-06, + "loss": 2.6268, + "step": 21202 + }, + { + "epoch": 1.8071252024205233, + "grad_norm": 69.33043175737605, + "learning_rate": 4.091475561416835e-06, + "loss": 2.1365, + "step": 21203 + }, + { + "epoch": 1.8072104321145486, + "grad_norm": 217.9874039257843, + "learning_rate": 4.090987972385012e-06, + "loss": 4.3668, + "step": 21204 + }, + { + "epoch": 1.807295661808574, + "grad_norm": 42.59326505367574, + "learning_rate": 4.090500392292959e-06, + "loss": 3.0852, + "step": 21205 + }, + { + "epoch": 1.8073808915025995, + "grad_norm": 60.535514107374176, + "learning_rate": 4.090012821145475e-06, + "loss": 2.0389, + "step": 21206 + }, + { + "epoch": 1.807466121196625, + "grad_norm": 78.55462594049665, + "learning_rate": 4.0895252589473525e-06, + "loss": 2.5022, + "step": 21207 + }, + { + "epoch": 1.8075513508906504, + "grad_norm": 61.289300597356444, + "learning_rate": 4.089037705703388e-06, + "loss": 2.5859, + "step": 21208 + }, + { + "epoch": 1.8076365805846757, + "grad_norm": 86.53231773402574, + "learning_rate": 4.088550161418374e-06, + "loss": 2.6998, + "step": 21209 + }, + { + "epoch": 1.807721810278701, + "grad_norm": 60.078951875091846, + "learning_rate": 4.088062626097109e-06, + "loss": 3.2126, + "step": 21210 + }, + { + "epoch": 1.8078070399727264, + "grad_norm": 81.45195628432414, + "learning_rate": 4.087575099744385e-06, + "loss": 4.0355, + "step": 21211 + }, + { + "epoch": 1.8078922696667519, + "grad_norm": 53.896757658302995, + "learning_rate": 4.087087582364997e-06, + "loss": 1.9384, + "step": 21212 + }, + { + "epoch": 1.8079774993607773, + "grad_norm": 42.937859740234636, + "learning_rate": 4.086600073963741e-06, + "loss": 2.5396, + "step": 21213 + }, + { + "epoch": 1.8080627290548028, + "grad_norm": 69.26118674426586, + "learning_rate": 4.086112574545408e-06, + "loss": 3.6203, + "step": 21214 + }, + { + "epoch": 1.808147958748828, + "grad_norm": 35.56861263319836, + "learning_rate": 4.085625084114797e-06, + "loss": 2.4181, + "step": 21215 + }, + { + "epoch": 1.8082331884428535, + "grad_norm": 60.22030943592898, + "learning_rate": 4.085137602676699e-06, + "loss": 3.3623, + "step": 21216 + }, + { + "epoch": 1.8083184181368788, + "grad_norm": 35.25452955070377, + "learning_rate": 4.084650130235909e-06, + "loss": 3.3729, + "step": 21217 + }, + { + "epoch": 1.8084036478309042, + "grad_norm": 34.769102137375015, + "learning_rate": 4.08416266679722e-06, + "loss": 2.5619, + "step": 21218 + }, + { + "epoch": 1.8084888775249297, + "grad_norm": 43.88446895751893, + "learning_rate": 4.08367521236543e-06, + "loss": 3.1343, + "step": 21219 + }, + { + "epoch": 1.8085741072189552, + "grad_norm": 44.75069222726988, + "learning_rate": 4.083187766945328e-06, + "loss": 2.9835, + "step": 21220 + }, + { + "epoch": 1.8086593369129806, + "grad_norm": 55.811614724458856, + "learning_rate": 4.0827003305417115e-06, + "loss": 3.878, + "step": 21221 + }, + { + "epoch": 1.8087445666070059, + "grad_norm": 47.60973250321057, + "learning_rate": 4.082212903159371e-06, + "loss": 2.8877, + "step": 21222 + }, + { + "epoch": 1.8088297963010311, + "grad_norm": 64.71869297746802, + "learning_rate": 4.081725484803104e-06, + "loss": 2.4342, + "step": 21223 + }, + { + "epoch": 1.8089150259950566, + "grad_norm": 42.6206239018901, + "learning_rate": 4.0812380754777005e-06, + "loss": 2.7516, + "step": 21224 + }, + { + "epoch": 1.809000255689082, + "grad_norm": 47.994471128002814, + "learning_rate": 4.0807506751879565e-06, + "loss": 3.0656, + "step": 21225 + }, + { + "epoch": 1.8090854853831075, + "grad_norm": 46.88468836641307, + "learning_rate": 4.080263283938664e-06, + "loss": 3.4339, + "step": 21226 + }, + { + "epoch": 1.809170715077133, + "grad_norm": 39.18613354759673, + "learning_rate": 4.079775901734616e-06, + "loss": 2.3775, + "step": 21227 + }, + { + "epoch": 1.8092559447711583, + "grad_norm": 45.043565213735256, + "learning_rate": 4.079288528580606e-06, + "loss": 2.4775, + "step": 21228 + }, + { + "epoch": 1.8093411744651837, + "grad_norm": 83.78417862382625, + "learning_rate": 4.078801164481428e-06, + "loss": 4.2696, + "step": 21229 + }, + { + "epoch": 1.809426404159209, + "grad_norm": 25.845116601913954, + "learning_rate": 4.078313809441875e-06, + "loss": 1.8183, + "step": 21230 + }, + { + "epoch": 1.8095116338532344, + "grad_norm": 37.579523248395525, + "learning_rate": 4.077826463466739e-06, + "loss": 2.7694, + "step": 21231 + }, + { + "epoch": 1.80959686354726, + "grad_norm": 53.14984618367387, + "learning_rate": 4.0773391265608135e-06, + "loss": 2.4277, + "step": 21232 + }, + { + "epoch": 1.8096820932412854, + "grad_norm": 67.97753848432161, + "learning_rate": 4.076851798728891e-06, + "loss": 3.2277, + "step": 21233 + }, + { + "epoch": 1.8097673229353106, + "grad_norm": 24.142306763880846, + "learning_rate": 4.076364479975764e-06, + "loss": 1.86, + "step": 21234 + }, + { + "epoch": 1.809852552629336, + "grad_norm": 50.05010610762488, + "learning_rate": 4.0758771703062255e-06, + "loss": 2.2992, + "step": 21235 + }, + { + "epoch": 1.8099377823233613, + "grad_norm": 48.506601707542195, + "learning_rate": 4.075389869725067e-06, + "loss": 3.4685, + "step": 21236 + }, + { + "epoch": 1.8100230120173868, + "grad_norm": 98.13086046449293, + "learning_rate": 4.074902578237084e-06, + "loss": 3.1197, + "step": 21237 + }, + { + "epoch": 1.8101082417114123, + "grad_norm": 33.92134647779819, + "learning_rate": 4.074415295847065e-06, + "loss": 2.5592, + "step": 21238 + }, + { + "epoch": 1.8101934714054377, + "grad_norm": 50.70235641054465, + "learning_rate": 4.0739280225598035e-06, + "loss": 3.9838, + "step": 21239 + }, + { + "epoch": 1.8102787010994632, + "grad_norm": 42.8615735423746, + "learning_rate": 4.073440758380091e-06, + "loss": 2.496, + "step": 21240 + }, + { + "epoch": 1.8103639307934885, + "grad_norm": 56.81924796555024, + "learning_rate": 4.072953503312723e-06, + "loss": 2.32, + "step": 21241 + }, + { + "epoch": 1.8104491604875137, + "grad_norm": 53.99363480521791, + "learning_rate": 4.072466257362487e-06, + "loss": 3.2279, + "step": 21242 + }, + { + "epoch": 1.8105343901815392, + "grad_norm": 76.17911887178663, + "learning_rate": 4.071979020534179e-06, + "loss": 3.2782, + "step": 21243 + }, + { + "epoch": 1.8106196198755646, + "grad_norm": 43.491762938250815, + "learning_rate": 4.071491792832586e-06, + "loss": 1.5932, + "step": 21244 + }, + { + "epoch": 1.81070484956959, + "grad_norm": 90.15944234536698, + "learning_rate": 4.0710045742625035e-06, + "loss": 2.9751, + "step": 21245 + }, + { + "epoch": 1.8107900792636156, + "grad_norm": 44.19950332080512, + "learning_rate": 4.070517364828722e-06, + "loss": 2.3866, + "step": 21246 + }, + { + "epoch": 1.8108753089576408, + "grad_norm": 58.8286087056216, + "learning_rate": 4.070030164536033e-06, + "loss": 2.8132, + "step": 21247 + }, + { + "epoch": 1.8109605386516663, + "grad_norm": 95.16895311521343, + "learning_rate": 4.069542973389226e-06, + "loss": 2.2039, + "step": 21248 + }, + { + "epoch": 1.8110457683456915, + "grad_norm": 32.36533949599816, + "learning_rate": 4.069055791393096e-06, + "loss": 2.6002, + "step": 21249 + }, + { + "epoch": 1.811130998039717, + "grad_norm": 63.14017220633142, + "learning_rate": 4.068568618552431e-06, + "loss": 2.6897, + "step": 21250 + }, + { + "epoch": 1.8112162277337425, + "grad_norm": 68.69188456460897, + "learning_rate": 4.068081454872026e-06, + "loss": 3.3001, + "step": 21251 + }, + { + "epoch": 1.811301457427768, + "grad_norm": 32.0142352692364, + "learning_rate": 4.067594300356667e-06, + "loss": 2.2278, + "step": 21252 + }, + { + "epoch": 1.8113866871217932, + "grad_norm": 41.29925923629661, + "learning_rate": 4.067107155011149e-06, + "loss": 2.9022, + "step": 21253 + }, + { + "epoch": 1.8114719168158187, + "grad_norm": 80.65794094451282, + "learning_rate": 4.066620018840261e-06, + "loss": 2.3622, + "step": 21254 + }, + { + "epoch": 1.811557146509844, + "grad_norm": 29.03689115188191, + "learning_rate": 4.066132891848794e-06, + "loss": 2.551, + "step": 21255 + }, + { + "epoch": 1.8116423762038694, + "grad_norm": 54.83583766080207, + "learning_rate": 4.06564577404154e-06, + "loss": 2.0095, + "step": 21256 + }, + { + "epoch": 1.8117276058978948, + "grad_norm": 157.59410763741073, + "learning_rate": 4.065158665423287e-06, + "loss": 4.2268, + "step": 21257 + }, + { + "epoch": 1.8118128355919203, + "grad_norm": 48.80878907653583, + "learning_rate": 4.0646715659988275e-06, + "loss": 2.6468, + "step": 21258 + }, + { + "epoch": 1.8118980652859458, + "grad_norm": 60.99400172636438, + "learning_rate": 4.064184475772951e-06, + "loss": 2.8963, + "step": 21259 + }, + { + "epoch": 1.811983294979971, + "grad_norm": 83.70628507386496, + "learning_rate": 4.06369739475045e-06, + "loss": 3.7007, + "step": 21260 + }, + { + "epoch": 1.8120685246739963, + "grad_norm": 29.041136937780625, + "learning_rate": 4.063210322936111e-06, + "loss": 2.4363, + "step": 21261 + }, + { + "epoch": 1.8121537543680217, + "grad_norm": 66.54621263319699, + "learning_rate": 4.062723260334728e-06, + "loss": 3.185, + "step": 21262 + }, + { + "epoch": 1.8122389840620472, + "grad_norm": 36.280876298168366, + "learning_rate": 4.0622362069510885e-06, + "loss": 3.1258, + "step": 21263 + }, + { + "epoch": 1.8123242137560727, + "grad_norm": 35.665693439377684, + "learning_rate": 4.061749162789984e-06, + "loss": 2.9575, + "step": 21264 + }, + { + "epoch": 1.8124094434500981, + "grad_norm": 44.18233894987721, + "learning_rate": 4.061262127856202e-06, + "loss": 2.2149, + "step": 21265 + }, + { + "epoch": 1.8124946731441234, + "grad_norm": 86.42742004915252, + "learning_rate": 4.060775102154536e-06, + "loss": 3.6538, + "step": 21266 + }, + { + "epoch": 1.8125799028381488, + "grad_norm": 82.31939906375548, + "learning_rate": 4.060288085689772e-06, + "loss": 3.0542, + "step": 21267 + }, + { + "epoch": 1.812665132532174, + "grad_norm": 45.24935164139982, + "learning_rate": 4.059801078466703e-06, + "loss": 3.1306, + "step": 21268 + }, + { + "epoch": 1.8127503622261996, + "grad_norm": 48.47655043345223, + "learning_rate": 4.059314080490115e-06, + "loss": 2.0556, + "step": 21269 + }, + { + "epoch": 1.812835591920225, + "grad_norm": 62.260637165127676, + "learning_rate": 4.0588270917648e-06, + "loss": 2.6775, + "step": 21270 + }, + { + "epoch": 1.8129208216142505, + "grad_norm": 44.79369594888092, + "learning_rate": 4.0583401122955455e-06, + "loss": 2.2565, + "step": 21271 + }, + { + "epoch": 1.813006051308276, + "grad_norm": 74.546568369325, + "learning_rate": 4.057853142087144e-06, + "loss": 2.6038, + "step": 21272 + }, + { + "epoch": 1.8130912810023012, + "grad_norm": 36.50125413729274, + "learning_rate": 4.057366181144381e-06, + "loss": 2.7083, + "step": 21273 + }, + { + "epoch": 1.8131765106963265, + "grad_norm": 34.36796136978636, + "learning_rate": 4.056879229472047e-06, + "loss": 2.935, + "step": 21274 + }, + { + "epoch": 1.813261740390352, + "grad_norm": 64.54249555911673, + "learning_rate": 4.056392287074931e-06, + "loss": 2.9509, + "step": 21275 + }, + { + "epoch": 1.8133469700843774, + "grad_norm": 44.84013894900252, + "learning_rate": 4.055905353957823e-06, + "loss": 3.0889, + "step": 21276 + }, + { + "epoch": 1.8134321997784029, + "grad_norm": 76.27118318940306, + "learning_rate": 4.0554184301255094e-06, + "loss": 2.4024, + "step": 21277 + }, + { + "epoch": 1.8135174294724283, + "grad_norm": 36.015453819276296, + "learning_rate": 4.054931515582781e-06, + "loss": 3.3023, + "step": 21278 + }, + { + "epoch": 1.8136026591664536, + "grad_norm": 39.50536762917905, + "learning_rate": 4.054444610334425e-06, + "loss": 2.6599, + "step": 21279 + }, + { + "epoch": 1.8136878888604788, + "grad_norm": 48.45750589235282, + "learning_rate": 4.053957714385232e-06, + "loss": 2.5597, + "step": 21280 + }, + { + "epoch": 1.8137731185545043, + "grad_norm": 33.89184974141363, + "learning_rate": 4.053470827739987e-06, + "loss": 2.241, + "step": 21281 + }, + { + "epoch": 1.8138583482485298, + "grad_norm": 81.08262868260984, + "learning_rate": 4.05298395040348e-06, + "loss": 3.2903, + "step": 21282 + }, + { + "epoch": 1.8139435779425552, + "grad_norm": 60.57652870823149, + "learning_rate": 4.0524970823805016e-06, + "loss": 3.3647, + "step": 21283 + }, + { + "epoch": 1.8140288076365807, + "grad_norm": 37.37981055576086, + "learning_rate": 4.052010223675837e-06, + "loss": 2.7747, + "step": 21284 + }, + { + "epoch": 1.814114037330606, + "grad_norm": 33.198741318270926, + "learning_rate": 4.051523374294276e-06, + "loss": 2.4655, + "step": 21285 + }, + { + "epoch": 1.8141992670246314, + "grad_norm": 63.6777322968379, + "learning_rate": 4.051036534240605e-06, + "loss": 2.752, + "step": 21286 + }, + { + "epoch": 1.8142844967186567, + "grad_norm": 40.96449242219726, + "learning_rate": 4.0505497035196115e-06, + "loss": 2.7227, + "step": 21287 + }, + { + "epoch": 1.8143697264126821, + "grad_norm": 42.89867994055199, + "learning_rate": 4.050062882136086e-06, + "loss": 2.4675, + "step": 21288 + }, + { + "epoch": 1.8144549561067076, + "grad_norm": 56.70589468764753, + "learning_rate": 4.0495760700948134e-06, + "loss": 2.881, + "step": 21289 + }, + { + "epoch": 1.814540185800733, + "grad_norm": 51.82641634624499, + "learning_rate": 4.049089267400584e-06, + "loss": 3.169, + "step": 21290 + }, + { + "epoch": 1.8146254154947585, + "grad_norm": 114.50608765701102, + "learning_rate": 4.048602474058182e-06, + "loss": 3.1993, + "step": 21291 + }, + { + "epoch": 1.8147106451887838, + "grad_norm": 36.852991609981125, + "learning_rate": 4.048115690072398e-06, + "loss": 2.9495, + "step": 21292 + }, + { + "epoch": 1.814795874882809, + "grad_norm": 54.617462296426694, + "learning_rate": 4.047628915448017e-06, + "loss": 2.4601, + "step": 21293 + }, + { + "epoch": 1.8148811045768345, + "grad_norm": 70.86334520839762, + "learning_rate": 4.047142150189828e-06, + "loss": 3.4496, + "step": 21294 + }, + { + "epoch": 1.81496633427086, + "grad_norm": 111.94724443832435, + "learning_rate": 4.0466553943026156e-06, + "loss": 2.6513, + "step": 21295 + }, + { + "epoch": 1.8150515639648854, + "grad_norm": 96.54082149516914, + "learning_rate": 4.046168647791171e-06, + "loss": 3.9531, + "step": 21296 + }, + { + "epoch": 1.815136793658911, + "grad_norm": 81.24908007726239, + "learning_rate": 4.045681910660277e-06, + "loss": 3.9939, + "step": 21297 + }, + { + "epoch": 1.8152220233529361, + "grad_norm": 31.046233108864776, + "learning_rate": 4.045195182914724e-06, + "loss": 2.3367, + "step": 21298 + }, + { + "epoch": 1.8153072530469616, + "grad_norm": 34.318822312861236, + "learning_rate": 4.044708464559296e-06, + "loss": 2.8175, + "step": 21299 + }, + { + "epoch": 1.8153924827409869, + "grad_norm": 82.10462128743873, + "learning_rate": 4.04422175559878e-06, + "loss": 3.2988, + "step": 21300 + }, + { + "epoch": 1.8154777124350123, + "grad_norm": 46.891121323608026, + "learning_rate": 4.043735056037963e-06, + "loss": 2.5917, + "step": 21301 + }, + { + "epoch": 1.8155629421290378, + "grad_norm": 44.67995124318194, + "learning_rate": 4.043248365881633e-06, + "loss": 3.3821, + "step": 21302 + }, + { + "epoch": 1.8156481718230633, + "grad_norm": 57.54749140710498, + "learning_rate": 4.042761685134575e-06, + "loss": 4.2863, + "step": 21303 + }, + { + "epoch": 1.8157334015170885, + "grad_norm": 29.261479545089813, + "learning_rate": 4.0422750138015745e-06, + "loss": 2.2343, + "step": 21304 + }, + { + "epoch": 1.815818631211114, + "grad_norm": 44.69673949643361, + "learning_rate": 4.041788351887418e-06, + "loss": 2.5264, + "step": 21305 + }, + { + "epoch": 1.8159038609051392, + "grad_norm": 100.46790832980953, + "learning_rate": 4.041301699396894e-06, + "loss": 2.8044, + "step": 21306 + }, + { + "epoch": 1.8159890905991647, + "grad_norm": 89.68474754494288, + "learning_rate": 4.040815056334788e-06, + "loss": 4.7978, + "step": 21307 + }, + { + "epoch": 1.8160743202931902, + "grad_norm": 55.500678620134906, + "learning_rate": 4.040328422705882e-06, + "loss": 2.8701, + "step": 21308 + }, + { + "epoch": 1.8161595499872156, + "grad_norm": 87.33282422847974, + "learning_rate": 4.0398417985149664e-06, + "loss": 3.2176, + "step": 21309 + }, + { + "epoch": 1.816244779681241, + "grad_norm": 30.467455408547398, + "learning_rate": 4.039355183766824e-06, + "loss": 2.022, + "step": 21310 + }, + { + "epoch": 1.8163300093752663, + "grad_norm": 78.89570866324664, + "learning_rate": 4.038868578466243e-06, + "loss": 3.0754, + "step": 21311 + }, + { + "epoch": 1.8164152390692916, + "grad_norm": 34.20290960954475, + "learning_rate": 4.038381982618006e-06, + "loss": 1.9576, + "step": 21312 + }, + { + "epoch": 1.816500468763317, + "grad_norm": 36.23022539851024, + "learning_rate": 4.037895396226901e-06, + "loss": 2.6193, + "step": 21313 + }, + { + "epoch": 1.8165856984573425, + "grad_norm": 32.95756474206926, + "learning_rate": 4.037408819297712e-06, + "loss": 2.6327, + "step": 21314 + }, + { + "epoch": 1.816670928151368, + "grad_norm": 39.81256919046218, + "learning_rate": 4.036922251835226e-06, + "loss": 2.3278, + "step": 21315 + }, + { + "epoch": 1.8167561578453935, + "grad_norm": 45.292632667402174, + "learning_rate": 4.0364356938442265e-06, + "loss": 3.2157, + "step": 21316 + }, + { + "epoch": 1.8168413875394187, + "grad_norm": 35.59171972084404, + "learning_rate": 4.035949145329497e-06, + "loss": 3.1381, + "step": 21317 + }, + { + "epoch": 1.8169266172334442, + "grad_norm": 67.78898115625807, + "learning_rate": 4.035462606295827e-06, + "loss": 2.7181, + "step": 21318 + }, + { + "epoch": 1.8170118469274694, + "grad_norm": 38.19985639657166, + "learning_rate": 4.034976076747999e-06, + "loss": 2.5711, + "step": 21319 + }, + { + "epoch": 1.8170970766214949, + "grad_norm": 36.33284764516111, + "learning_rate": 4.034489556690797e-06, + "loss": 2.4949, + "step": 21320 + }, + { + "epoch": 1.8171823063155204, + "grad_norm": 69.13991888354983, + "learning_rate": 4.034003046129006e-06, + "loss": 3.6628, + "step": 21321 + }, + { + "epoch": 1.8172675360095458, + "grad_norm": 29.118687163199873, + "learning_rate": 4.033516545067411e-06, + "loss": 1.9323, + "step": 21322 + }, + { + "epoch": 1.817352765703571, + "grad_norm": 42.54058222757865, + "learning_rate": 4.033030053510799e-06, + "loss": 3.1204, + "step": 21323 + }, + { + "epoch": 1.8174379953975965, + "grad_norm": 43.82707846428252, + "learning_rate": 4.03254357146395e-06, + "loss": 2.5198, + "step": 21324 + }, + { + "epoch": 1.8175232250916218, + "grad_norm": 61.135054581058434, + "learning_rate": 4.032057098931651e-06, + "loss": 2.4518, + "step": 21325 + }, + { + "epoch": 1.8176084547856473, + "grad_norm": 124.06105064306718, + "learning_rate": 4.031570635918686e-06, + "loss": 3.422, + "step": 21326 + }, + { + "epoch": 1.8176936844796727, + "grad_norm": 45.641129422173215, + "learning_rate": 4.031084182429839e-06, + "loss": 2.6895, + "step": 21327 + }, + { + "epoch": 1.8177789141736982, + "grad_norm": 33.55599399947259, + "learning_rate": 4.030597738469895e-06, + "loss": 2.2578, + "step": 21328 + }, + { + "epoch": 1.8178641438677237, + "grad_norm": 32.63635352565856, + "learning_rate": 4.030111304043637e-06, + "loss": 2.392, + "step": 21329 + }, + { + "epoch": 1.817949373561749, + "grad_norm": 139.33305873905704, + "learning_rate": 4.029624879155846e-06, + "loss": 4.8595, + "step": 21330 + }, + { + "epoch": 1.8180346032557742, + "grad_norm": 37.15387051142362, + "learning_rate": 4.029138463811311e-06, + "loss": 2.9813, + "step": 21331 + }, + { + "epoch": 1.8181198329497996, + "grad_norm": 64.20302017377443, + "learning_rate": 4.028652058014813e-06, + "loss": 2.8556, + "step": 21332 + }, + { + "epoch": 1.818205062643825, + "grad_norm": 63.43100070250818, + "learning_rate": 4.028165661771138e-06, + "loss": 2.6983, + "step": 21333 + }, + { + "epoch": 1.8182902923378506, + "grad_norm": 36.90200927979264, + "learning_rate": 4.027679275085065e-06, + "loss": 3.1539, + "step": 21334 + }, + { + "epoch": 1.818375522031876, + "grad_norm": 18.95703369594686, + "learning_rate": 4.027192897961381e-06, + "loss": 1.5026, + "step": 21335 + }, + { + "epoch": 1.8184607517259013, + "grad_norm": 65.65411345790444, + "learning_rate": 4.026706530404868e-06, + "loss": 1.9817, + "step": 21336 + }, + { + "epoch": 1.8185459814199267, + "grad_norm": 35.651070010786285, + "learning_rate": 4.02622017242031e-06, + "loss": 2.7094, + "step": 21337 + }, + { + "epoch": 1.818631211113952, + "grad_norm": 30.17821838511312, + "learning_rate": 4.025733824012488e-06, + "loss": 1.9624, + "step": 21338 + }, + { + "epoch": 1.8187164408079775, + "grad_norm": 18.33703911164321, + "learning_rate": 4.025247485186189e-06, + "loss": 1.5395, + "step": 21339 + }, + { + "epoch": 1.818801670502003, + "grad_norm": 38.20932335735237, + "learning_rate": 4.0247611559461915e-06, + "loss": 2.3149, + "step": 21340 + }, + { + "epoch": 1.8188869001960284, + "grad_norm": 47.30675389562381, + "learning_rate": 4.024274836297283e-06, + "loss": 2.9869, + "step": 21341 + }, + { + "epoch": 1.8189721298900539, + "grad_norm": 61.22629695798277, + "learning_rate": 4.023788526244243e-06, + "loss": 3.6734, + "step": 21342 + }, + { + "epoch": 1.819057359584079, + "grad_norm": 75.95832427544607, + "learning_rate": 4.023302225791853e-06, + "loss": 3.2321, + "step": 21343 + }, + { + "epoch": 1.8191425892781043, + "grad_norm": 49.58528637132535, + "learning_rate": 4.0228159349448985e-06, + "loss": 3.4676, + "step": 21344 + }, + { + "epoch": 1.8192278189721298, + "grad_norm": 54.31301114326431, + "learning_rate": 4.022329653708162e-06, + "loss": 1.8676, + "step": 21345 + }, + { + "epoch": 1.8193130486661553, + "grad_norm": 33.15598117524862, + "learning_rate": 4.0218433820864255e-06, + "loss": 1.8671, + "step": 21346 + }, + { + "epoch": 1.8193982783601808, + "grad_norm": 58.34394264340514, + "learning_rate": 4.021357120084468e-06, + "loss": 1.7345, + "step": 21347 + }, + { + "epoch": 1.8194835080542062, + "grad_norm": 42.645080991165884, + "learning_rate": 4.020870867707075e-06, + "loss": 3.109, + "step": 21348 + }, + { + "epoch": 1.8195687377482315, + "grad_norm": 37.77593593637254, + "learning_rate": 4.020384624959029e-06, + "loss": 3.1825, + "step": 21349 + }, + { + "epoch": 1.819653967442257, + "grad_norm": 56.84957345000609, + "learning_rate": 4.019898391845112e-06, + "loss": 3.4018, + "step": 21350 + }, + { + "epoch": 1.8197391971362822, + "grad_norm": 75.55004973868208, + "learning_rate": 4.019412168370102e-06, + "loss": 4.8703, + "step": 21351 + }, + { + "epoch": 1.8198244268303077, + "grad_norm": 67.84216350112123, + "learning_rate": 4.018925954538785e-06, + "loss": 2.9963, + "step": 21352 + }, + { + "epoch": 1.8199096565243331, + "grad_norm": 125.88086801092025, + "learning_rate": 4.018439750355943e-06, + "loss": 1.5847, + "step": 21353 + }, + { + "epoch": 1.8199948862183586, + "grad_norm": 40.61082115059697, + "learning_rate": 4.017953555826355e-06, + "loss": 2.1962, + "step": 21354 + }, + { + "epoch": 1.8200801159123838, + "grad_norm": 32.02449327211419, + "learning_rate": 4.017467370954802e-06, + "loss": 1.8894, + "step": 21355 + }, + { + "epoch": 1.8201653456064093, + "grad_norm": 72.1099249586104, + "learning_rate": 4.016981195746069e-06, + "loss": 3.3584, + "step": 21356 + }, + { + "epoch": 1.8202505753004345, + "grad_norm": 58.383856402056686, + "learning_rate": 4.016495030204934e-06, + "loss": 2.699, + "step": 21357 + }, + { + "epoch": 1.82033580499446, + "grad_norm": 52.24690728081086, + "learning_rate": 4.0160088743361806e-06, + "loss": 2.5601, + "step": 21358 + }, + { + "epoch": 1.8204210346884855, + "grad_norm": 34.933246582056555, + "learning_rate": 4.015522728144588e-06, + "loss": 2.8835, + "step": 21359 + }, + { + "epoch": 1.820506264382511, + "grad_norm": 89.90397597400666, + "learning_rate": 4.015036591634938e-06, + "loss": 4.0739, + "step": 21360 + }, + { + "epoch": 1.8205914940765364, + "grad_norm": 31.20274464327, + "learning_rate": 4.014550464812012e-06, + "loss": 2.8018, + "step": 21361 + }, + { + "epoch": 1.8206767237705617, + "grad_norm": 32.334897955772185, + "learning_rate": 4.014064347680592e-06, + "loss": 1.5428, + "step": 21362 + }, + { + "epoch": 1.820761953464587, + "grad_norm": 33.8888731838816, + "learning_rate": 4.013578240245456e-06, + "loss": 2.2915, + "step": 21363 + }, + { + "epoch": 1.8208471831586124, + "grad_norm": 57.467746546309705, + "learning_rate": 4.013092142511385e-06, + "loss": 3.4408, + "step": 21364 + }, + { + "epoch": 1.8209324128526378, + "grad_norm": 28.53432194465018, + "learning_rate": 4.012606054483162e-06, + "loss": 2.0222, + "step": 21365 + }, + { + "epoch": 1.8210176425466633, + "grad_norm": 86.48312264618049, + "learning_rate": 4.0121199761655675e-06, + "loss": 3.6416, + "step": 21366 + }, + { + "epoch": 1.8211028722406888, + "grad_norm": 53.64677965335775, + "learning_rate": 4.011633907563378e-06, + "loss": 1.9661, + "step": 21367 + }, + { + "epoch": 1.821188101934714, + "grad_norm": 32.623394562416536, + "learning_rate": 4.011147848681377e-06, + "loss": 2.1847, + "step": 21368 + }, + { + "epoch": 1.8212733316287395, + "grad_norm": 45.60929196910994, + "learning_rate": 4.0106617995243435e-06, + "loss": 3.1292, + "step": 21369 + }, + { + "epoch": 1.8213585613227647, + "grad_norm": 18.48797000253131, + "learning_rate": 4.01017576009706e-06, + "loss": 1.8006, + "step": 21370 + }, + { + "epoch": 1.8214437910167902, + "grad_norm": 50.32430722841298, + "learning_rate": 4.009689730404303e-06, + "loss": 3.5546, + "step": 21371 + }, + { + "epoch": 1.8215290207108157, + "grad_norm": 54.769969555493425, + "learning_rate": 4.009203710450855e-06, + "loss": 3.1085, + "step": 21372 + }, + { + "epoch": 1.8216142504048412, + "grad_norm": 43.48385835086155, + "learning_rate": 4.008717700241493e-06, + "loss": 3.1223, + "step": 21373 + }, + { + "epoch": 1.8216994800988664, + "grad_norm": 52.99508523310876, + "learning_rate": 4.008231699781001e-06, + "loss": 2.5212, + "step": 21374 + }, + { + "epoch": 1.8217847097928919, + "grad_norm": 41.62620443996174, + "learning_rate": 4.007745709074154e-06, + "loss": 2.6432, + "step": 21375 + }, + { + "epoch": 1.8218699394869171, + "grad_norm": 68.13963849536226, + "learning_rate": 4.0072597281257354e-06, + "loss": 3.7192, + "step": 21376 + }, + { + "epoch": 1.8219551691809426, + "grad_norm": 34.66887627171041, + "learning_rate": 4.00677375694052e-06, + "loss": 2.4554, + "step": 21377 + }, + { + "epoch": 1.822040398874968, + "grad_norm": 40.879026828423115, + "learning_rate": 4.0062877955232935e-06, + "loss": 2.4878, + "step": 21378 + }, + { + "epoch": 1.8221256285689935, + "grad_norm": 43.846831624118025, + "learning_rate": 4.0058018438788296e-06, + "loss": 2.6889, + "step": 21379 + }, + { + "epoch": 1.822210858263019, + "grad_norm": 34.69370004668235, + "learning_rate": 4.005315902011911e-06, + "loss": 2.5825, + "step": 21380 + }, + { + "epoch": 1.8222960879570442, + "grad_norm": 41.98860170703785, + "learning_rate": 4.004829969927313e-06, + "loss": 2.603, + "step": 21381 + }, + { + "epoch": 1.8223813176510695, + "grad_norm": 44.12012111451916, + "learning_rate": 4.004344047629819e-06, + "loss": 2.8968, + "step": 21382 + }, + { + "epoch": 1.822466547345095, + "grad_norm": 69.25059291855601, + "learning_rate": 4.003858135124205e-06, + "loss": 2.7415, + "step": 21383 + }, + { + "epoch": 1.8225517770391204, + "grad_norm": 36.27255286899574, + "learning_rate": 4.00337223241525e-06, + "loss": 2.7549, + "step": 21384 + }, + { + "epoch": 1.8226370067331459, + "grad_norm": 47.006433053773456, + "learning_rate": 4.002886339507732e-06, + "loss": 3.0518, + "step": 21385 + }, + { + "epoch": 1.8227222364271713, + "grad_norm": 59.674401473171486, + "learning_rate": 4.0024004564064335e-06, + "loss": 3.3365, + "step": 21386 + }, + { + "epoch": 1.8228074661211966, + "grad_norm": 68.72636594905536, + "learning_rate": 4.001914583116128e-06, + "loss": 2.5455, + "step": 21387 + }, + { + "epoch": 1.822892695815222, + "grad_norm": 140.0520936946403, + "learning_rate": 4.001428719641597e-06, + "loss": 3.773, + "step": 21388 + }, + { + "epoch": 1.8229779255092473, + "grad_norm": 39.50316886592511, + "learning_rate": 4.0009428659876184e-06, + "loss": 3.0227, + "step": 21389 + }, + { + "epoch": 1.8230631552032728, + "grad_norm": 52.88957744013437, + "learning_rate": 4.000457022158967e-06, + "loss": 2.8636, + "step": 21390 + }, + { + "epoch": 1.8231483848972982, + "grad_norm": 37.58003976926396, + "learning_rate": 3.999971188160425e-06, + "loss": 2.301, + "step": 21391 + }, + { + "epoch": 1.8232336145913237, + "grad_norm": 41.33069802072494, + "learning_rate": 3.9994853639967705e-06, + "loss": 2.5452, + "step": 21392 + }, + { + "epoch": 1.823318844285349, + "grad_norm": 40.41894503446589, + "learning_rate": 3.998999549672779e-06, + "loss": 3.054, + "step": 21393 + }, + { + "epoch": 1.8234040739793744, + "grad_norm": 41.26428064034163, + "learning_rate": 3.998513745193228e-06, + "loss": 2.7511, + "step": 21394 + }, + { + "epoch": 1.8234893036733997, + "grad_norm": 72.22900196563941, + "learning_rate": 3.998027950562896e-06, + "loss": 2.9034, + "step": 21395 + }, + { + "epoch": 1.8235745333674251, + "grad_norm": 39.00898663489984, + "learning_rate": 3.997542165786563e-06, + "loss": 2.9893, + "step": 21396 + }, + { + "epoch": 1.8236597630614506, + "grad_norm": 69.7745823544312, + "learning_rate": 3.997056390869004e-06, + "loss": 3.0229, + "step": 21397 + }, + { + "epoch": 1.823744992755476, + "grad_norm": 35.34665235745117, + "learning_rate": 3.996570625814995e-06, + "loss": 2.9539, + "step": 21398 + }, + { + "epoch": 1.8238302224495015, + "grad_norm": 51.778541339980656, + "learning_rate": 3.996084870629317e-06, + "loss": 2.4998, + "step": 21399 + }, + { + "epoch": 1.8239154521435268, + "grad_norm": 40.396144639333095, + "learning_rate": 3.995599125316745e-06, + "loss": 1.9534, + "step": 21400 + }, + { + "epoch": 1.824000681837552, + "grad_norm": 54.17749344523425, + "learning_rate": 3.995113389882058e-06, + "loss": 3.4342, + "step": 21401 + }, + { + "epoch": 1.8240859115315775, + "grad_norm": 43.85970046315582, + "learning_rate": 3.994627664330029e-06, + "loss": 3.0359, + "step": 21402 + }, + { + "epoch": 1.824171141225603, + "grad_norm": 33.887107684921574, + "learning_rate": 3.994141948665437e-06, + "loss": 2.2556, + "step": 21403 + }, + { + "epoch": 1.8242563709196284, + "grad_norm": 46.84863011597038, + "learning_rate": 3.993656242893061e-06, + "loss": 3.2339, + "step": 21404 + }, + { + "epoch": 1.824341600613654, + "grad_norm": 65.16928036234499, + "learning_rate": 3.993170547017678e-06, + "loss": 2.5744, + "step": 21405 + }, + { + "epoch": 1.8244268303076792, + "grad_norm": 56.84877482056797, + "learning_rate": 3.992684861044059e-06, + "loss": 2.8514, + "step": 21406 + }, + { + "epoch": 1.8245120600017046, + "grad_norm": 42.423565249808846, + "learning_rate": 3.992199184976985e-06, + "loss": 2.6914, + "step": 21407 + }, + { + "epoch": 1.8245972896957299, + "grad_norm": 45.35853137645211, + "learning_rate": 3.991713518821232e-06, + "loss": 3.1853, + "step": 21408 + }, + { + "epoch": 1.8246825193897553, + "grad_norm": 34.434012713425574, + "learning_rate": 3.991227862581576e-06, + "loss": 1.7088, + "step": 21409 + }, + { + "epoch": 1.8247677490837808, + "grad_norm": 51.015408959739815, + "learning_rate": 3.990742216262793e-06, + "loss": 2.9995, + "step": 21410 + }, + { + "epoch": 1.8248529787778063, + "grad_norm": 54.0470364984966, + "learning_rate": 3.990256579869659e-06, + "loss": 3.282, + "step": 21411 + }, + { + "epoch": 1.8249382084718317, + "grad_norm": 88.75835801835406, + "learning_rate": 3.98977095340695e-06, + "loss": 4.2425, + "step": 21412 + }, + { + "epoch": 1.825023438165857, + "grad_norm": 68.01373573492509, + "learning_rate": 3.989285336879443e-06, + "loss": 2.5435, + "step": 21413 + }, + { + "epoch": 1.8251086678598822, + "grad_norm": 85.95419362602239, + "learning_rate": 3.988799730291913e-06, + "loss": 3.0154, + "step": 21414 + }, + { + "epoch": 1.8251938975539077, + "grad_norm": 57.185144689713475, + "learning_rate": 3.988314133649136e-06, + "loss": 3.0887, + "step": 21415 + }, + { + "epoch": 1.8252791272479332, + "grad_norm": 37.48126657642679, + "learning_rate": 3.987828546955886e-06, + "loss": 1.6659, + "step": 21416 + }, + { + "epoch": 1.8253643569419586, + "grad_norm": 50.513624288135816, + "learning_rate": 3.987342970216942e-06, + "loss": 2.8927, + "step": 21417 + }, + { + "epoch": 1.8254495866359841, + "grad_norm": 51.5858276409504, + "learning_rate": 3.986857403437076e-06, + "loss": 3.7098, + "step": 21418 + }, + { + "epoch": 1.8255348163300094, + "grad_norm": 49.896355566959734, + "learning_rate": 3.986371846621067e-06, + "loss": 3.3039, + "step": 21419 + }, + { + "epoch": 1.8256200460240348, + "grad_norm": 66.37038262803485, + "learning_rate": 3.985886299773685e-06, + "loss": 3.4808, + "step": 21420 + }, + { + "epoch": 1.82570527571806, + "grad_norm": 45.40764425739288, + "learning_rate": 3.985400762899711e-06, + "loss": 3.2122, + "step": 21421 + }, + { + "epoch": 1.8257905054120855, + "grad_norm": 16.653631642899878, + "learning_rate": 3.984915236003915e-06, + "loss": 1.2642, + "step": 21422 + }, + { + "epoch": 1.825875735106111, + "grad_norm": 30.102825950782044, + "learning_rate": 3.984429719091077e-06, + "loss": 2.2789, + "step": 21423 + }, + { + "epoch": 1.8259609648001365, + "grad_norm": 108.68729771487922, + "learning_rate": 3.983944212165966e-06, + "loss": 3.1318, + "step": 21424 + }, + { + "epoch": 1.8260461944941617, + "grad_norm": 43.61720811230737, + "learning_rate": 3.983458715233363e-06, + "loss": 2.4917, + "step": 21425 + }, + { + "epoch": 1.8261314241881872, + "grad_norm": 59.59564162079613, + "learning_rate": 3.9829732282980375e-06, + "loss": 2.095, + "step": 21426 + }, + { + "epoch": 1.8262166538822124, + "grad_norm": 67.75076856171236, + "learning_rate": 3.982487751364768e-06, + "loss": 2.3933, + "step": 21427 + }, + { + "epoch": 1.826301883576238, + "grad_norm": 67.97194663700999, + "learning_rate": 3.982002284438325e-06, + "loss": 3.2118, + "step": 21428 + }, + { + "epoch": 1.8263871132702634, + "grad_norm": 84.63332283925406, + "learning_rate": 3.981516827523486e-06, + "loss": 4.874, + "step": 21429 + }, + { + "epoch": 1.8264723429642888, + "grad_norm": 53.42196695023688, + "learning_rate": 3.981031380625024e-06, + "loss": 3.3155, + "step": 21430 + }, + { + "epoch": 1.8265575726583143, + "grad_norm": 43.90272053359561, + "learning_rate": 3.980545943747715e-06, + "loss": 2.9589, + "step": 21431 + }, + { + "epoch": 1.8266428023523396, + "grad_norm": 53.08096409724004, + "learning_rate": 3.980060516896332e-06, + "loss": 3.608, + "step": 21432 + }, + { + "epoch": 1.8267280320463648, + "grad_norm": 56.58187792931466, + "learning_rate": 3.979575100075644e-06, + "loss": 2.6696, + "step": 21433 + }, + { + "epoch": 1.8268132617403903, + "grad_norm": 69.8945412572974, + "learning_rate": 3.979089693290433e-06, + "loss": 2.7687, + "step": 21434 + }, + { + "epoch": 1.8268984914344157, + "grad_norm": 49.91245066623548, + "learning_rate": 3.978604296545469e-06, + "loss": 3.3167, + "step": 21435 + }, + { + "epoch": 1.8269837211284412, + "grad_norm": 21.653157237571975, + "learning_rate": 3.978118909845526e-06, + "loss": 1.5355, + "step": 21436 + }, + { + "epoch": 1.8270689508224667, + "grad_norm": 46.66652632511462, + "learning_rate": 3.977633533195376e-06, + "loss": 3.0929, + "step": 21437 + }, + { + "epoch": 1.827154180516492, + "grad_norm": 32.32429879613493, + "learning_rate": 3.977148166599794e-06, + "loss": 1.9641, + "step": 21438 + }, + { + "epoch": 1.8272394102105174, + "grad_norm": 62.503219357888014, + "learning_rate": 3.976662810063555e-06, + "loss": 3.1867, + "step": 21439 + }, + { + "epoch": 1.8273246399045426, + "grad_norm": 116.76647929748816, + "learning_rate": 3.976177463591431e-06, + "loss": 4.8597, + "step": 21440 + }, + { + "epoch": 1.827409869598568, + "grad_norm": 41.74460741174998, + "learning_rate": 3.975692127188192e-06, + "loss": 2.8578, + "step": 21441 + }, + { + "epoch": 1.8274950992925936, + "grad_norm": 76.232581650628, + "learning_rate": 3.975206800858615e-06, + "loss": 3.0964, + "step": 21442 + }, + { + "epoch": 1.827580328986619, + "grad_norm": 44.108639506141074, + "learning_rate": 3.974721484607474e-06, + "loss": 2.7169, + "step": 21443 + }, + { + "epoch": 1.8276655586806443, + "grad_norm": 51.0294683782113, + "learning_rate": 3.974236178439539e-06, + "loss": 3.232, + "step": 21444 + }, + { + "epoch": 1.8277507883746698, + "grad_norm": 24.170837744284327, + "learning_rate": 3.973750882359582e-06, + "loss": 2.0153, + "step": 21445 + }, + { + "epoch": 1.827836018068695, + "grad_norm": 63.23708679069336, + "learning_rate": 3.973265596372377e-06, + "loss": 3.1354, + "step": 21446 + }, + { + "epoch": 1.8279212477627205, + "grad_norm": 31.641460349180683, + "learning_rate": 3.972780320482699e-06, + "loss": 2.1607, + "step": 21447 + }, + { + "epoch": 1.828006477456746, + "grad_norm": 42.416435987244384, + "learning_rate": 3.972295054695318e-06, + "loss": 2.386, + "step": 21448 + }, + { + "epoch": 1.8280917071507714, + "grad_norm": 38.42995756029114, + "learning_rate": 3.9718097990150055e-06, + "loss": 3.1539, + "step": 21449 + }, + { + "epoch": 1.8281769368447969, + "grad_norm": 67.79901904932824, + "learning_rate": 3.971324553446535e-06, + "loss": 2.8765, + "step": 21450 + }, + { + "epoch": 1.8282621665388221, + "grad_norm": 26.828054191994344, + "learning_rate": 3.97083931799468e-06, + "loss": 2.707, + "step": 21451 + }, + { + "epoch": 1.8283473962328474, + "grad_norm": 70.20187360525509, + "learning_rate": 3.970354092664211e-06, + "loss": 2.9058, + "step": 21452 + }, + { + "epoch": 1.8284326259268728, + "grad_norm": 65.48194829443784, + "learning_rate": 3.9698688774599e-06, + "loss": 2.8665, + "step": 21453 + }, + { + "epoch": 1.8285178556208983, + "grad_norm": 34.825251062030375, + "learning_rate": 3.969383672386518e-06, + "loss": 2.5624, + "step": 21454 + }, + { + "epoch": 1.8286030853149238, + "grad_norm": 27.69418537038654, + "learning_rate": 3.968898477448841e-06, + "loss": 2.0128, + "step": 21455 + }, + { + "epoch": 1.8286883150089492, + "grad_norm": 38.953247151823525, + "learning_rate": 3.968413292651636e-06, + "loss": 1.9866, + "step": 21456 + }, + { + "epoch": 1.8287735447029745, + "grad_norm": 66.89400410011761, + "learning_rate": 3.967928117999677e-06, + "loss": 2.893, + "step": 21457 + }, + { + "epoch": 1.828858774397, + "grad_norm": 32.380809475508734, + "learning_rate": 3.967442953497734e-06, + "loss": 1.9609, + "step": 21458 + }, + { + "epoch": 1.8289440040910252, + "grad_norm": 66.80787612397684, + "learning_rate": 3.966957799150579e-06, + "loss": 2.28, + "step": 21459 + }, + { + "epoch": 1.8290292337850507, + "grad_norm": 39.97812823076866, + "learning_rate": 3.966472654962985e-06, + "loss": 2.3813, + "step": 21460 + }, + { + "epoch": 1.8291144634790761, + "grad_norm": 38.974655878603436, + "learning_rate": 3.965987520939721e-06, + "loss": 2.8194, + "step": 21461 + }, + { + "epoch": 1.8291996931731016, + "grad_norm": 80.87018247083846, + "learning_rate": 3.965502397085559e-06, + "loss": 3.3136, + "step": 21462 + }, + { + "epoch": 1.829284922867127, + "grad_norm": 65.60381670648411, + "learning_rate": 3.965017283405269e-06, + "loss": 2.6109, + "step": 21463 + }, + { + "epoch": 1.8293701525611523, + "grad_norm": 41.401045254180765, + "learning_rate": 3.964532179903624e-06, + "loss": 3.1752, + "step": 21464 + }, + { + "epoch": 1.8294553822551776, + "grad_norm": 65.14447542748869, + "learning_rate": 3.964047086585393e-06, + "loss": 2.7065, + "step": 21465 + }, + { + "epoch": 1.829540611949203, + "grad_norm": 35.717949633717474, + "learning_rate": 3.963562003455348e-06, + "loss": 2.7723, + "step": 21466 + }, + { + "epoch": 1.8296258416432285, + "grad_norm": 34.03048057286887, + "learning_rate": 3.9630769305182564e-06, + "loss": 2.0536, + "step": 21467 + }, + { + "epoch": 1.829711071337254, + "grad_norm": 49.29282227410212, + "learning_rate": 3.962591867778894e-06, + "loss": 3.3307, + "step": 21468 + }, + { + "epoch": 1.8297963010312794, + "grad_norm": 67.69138829143644, + "learning_rate": 3.962106815242027e-06, + "loss": 2.6791, + "step": 21469 + }, + { + "epoch": 1.8298815307253047, + "grad_norm": 57.09032701012859, + "learning_rate": 3.961621772912428e-06, + "loss": 2.8906, + "step": 21470 + }, + { + "epoch": 1.82996676041933, + "grad_norm": 44.23146483172692, + "learning_rate": 3.961136740794864e-06, + "loss": 2.054, + "step": 21471 + }, + { + "epoch": 1.8300519901133554, + "grad_norm": 69.38938627993262, + "learning_rate": 3.96065171889411e-06, + "loss": 3.7531, + "step": 21472 + }, + { + "epoch": 1.8301372198073809, + "grad_norm": 48.30791673887217, + "learning_rate": 3.960166707214932e-06, + "loss": 2.3304, + "step": 21473 + }, + { + "epoch": 1.8302224495014063, + "grad_norm": 75.60199774007249, + "learning_rate": 3.959681705762101e-06, + "loss": 2.8906, + "step": 21474 + }, + { + "epoch": 1.8303076791954318, + "grad_norm": 62.6396818709194, + "learning_rate": 3.959196714540389e-06, + "loss": 2.3573, + "step": 21475 + }, + { + "epoch": 1.830392908889457, + "grad_norm": 68.50526346733098, + "learning_rate": 3.958711733554561e-06, + "loss": 2.9472, + "step": 21476 + }, + { + "epoch": 1.8304781385834825, + "grad_norm": 71.3446518880573, + "learning_rate": 3.958226762809391e-06, + "loss": 2.9572, + "step": 21477 + }, + { + "epoch": 1.8305633682775078, + "grad_norm": 57.375493359309864, + "learning_rate": 3.957741802309647e-06, + "loss": 2.4171, + "step": 21478 + }, + { + "epoch": 1.8306485979715332, + "grad_norm": 72.6021031694238, + "learning_rate": 3.957256852060099e-06, + "loss": 4.1827, + "step": 21479 + }, + { + "epoch": 1.8307338276655587, + "grad_norm": 60.30897941461709, + "learning_rate": 3.956771912065513e-06, + "loss": 3.254, + "step": 21480 + }, + { + "epoch": 1.8308190573595842, + "grad_norm": 62.90403142670499, + "learning_rate": 3.956286982330661e-06, + "loss": 2.397, + "step": 21481 + }, + { + "epoch": 1.8309042870536096, + "grad_norm": 34.63948994813322, + "learning_rate": 3.955802062860313e-06, + "loss": 2.5673, + "step": 21482 + }, + { + "epoch": 1.8309895167476349, + "grad_norm": 44.814437617005225, + "learning_rate": 3.955317153659237e-06, + "loss": 3.5929, + "step": 21483 + }, + { + "epoch": 1.8310747464416601, + "grad_norm": 35.2245689071356, + "learning_rate": 3.9548322547322e-06, + "loss": 1.9242, + "step": 21484 + }, + { + "epoch": 1.8311599761356856, + "grad_norm": 51.75218810631055, + "learning_rate": 3.954347366083973e-06, + "loss": 3.0314, + "step": 21485 + }, + { + "epoch": 1.831245205829711, + "grad_norm": 37.44613521158988, + "learning_rate": 3.953862487719326e-06, + "loss": 2.235, + "step": 21486 + }, + { + "epoch": 1.8313304355237365, + "grad_norm": 31.93846891928139, + "learning_rate": 3.9533776196430254e-06, + "loss": 2.4756, + "step": 21487 + }, + { + "epoch": 1.831415665217762, + "grad_norm": 48.11888403377906, + "learning_rate": 3.952892761859839e-06, + "loss": 3.3744, + "step": 21488 + }, + { + "epoch": 1.8315008949117872, + "grad_norm": 48.92677809746457, + "learning_rate": 3.9524079143745344e-06, + "loss": 3.0066, + "step": 21489 + }, + { + "epoch": 1.8315861246058127, + "grad_norm": 55.93175520914729, + "learning_rate": 3.951923077191885e-06, + "loss": 3.094, + "step": 21490 + }, + { + "epoch": 1.831671354299838, + "grad_norm": 44.98729281150636, + "learning_rate": 3.951438250316655e-06, + "loss": 3.6409, + "step": 21491 + }, + { + "epoch": 1.8317565839938634, + "grad_norm": 51.93983636878844, + "learning_rate": 3.950953433753612e-06, + "loss": 2.965, + "step": 21492 + }, + { + "epoch": 1.831841813687889, + "grad_norm": 50.611033478505725, + "learning_rate": 3.950468627507524e-06, + "loss": 3.5518, + "step": 21493 + }, + { + "epoch": 1.8319270433819144, + "grad_norm": 32.93418545815365, + "learning_rate": 3.9499838315831625e-06, + "loss": 2.088, + "step": 21494 + }, + { + "epoch": 1.8320122730759396, + "grad_norm": 29.045390439048013, + "learning_rate": 3.949499045985292e-06, + "loss": 2.3157, + "step": 21495 + }, + { + "epoch": 1.832097502769965, + "grad_norm": 94.74956949211071, + "learning_rate": 3.9490142707186805e-06, + "loss": 2.84, + "step": 21496 + }, + { + "epoch": 1.8321827324639903, + "grad_norm": 57.59528795641535, + "learning_rate": 3.948529505788096e-06, + "loss": 3.4119, + "step": 21497 + }, + { + "epoch": 1.8322679621580158, + "grad_norm": 56.21089185248292, + "learning_rate": 3.948044751198307e-06, + "loss": 1.4872, + "step": 21498 + }, + { + "epoch": 1.8323531918520413, + "grad_norm": 58.066293543283415, + "learning_rate": 3.94756000695408e-06, + "loss": 3.4983, + "step": 21499 + }, + { + "epoch": 1.8324384215460667, + "grad_norm": 23.49194973281629, + "learning_rate": 3.94707527306018e-06, + "loss": 1.602, + "step": 21500 + }, + { + "epoch": 1.8325236512400922, + "grad_norm": 81.77587538575293, + "learning_rate": 3.946590549521377e-06, + "loss": 3.5096, + "step": 21501 + }, + { + "epoch": 1.8326088809341174, + "grad_norm": 33.21714682107467, + "learning_rate": 3.946105836342439e-06, + "loss": 1.8907, + "step": 21502 + }, + { + "epoch": 1.8326941106281427, + "grad_norm": 39.01083656605093, + "learning_rate": 3.94562113352813e-06, + "loss": 2.9588, + "step": 21503 + }, + { + "epoch": 1.8327793403221682, + "grad_norm": 39.14639068937269, + "learning_rate": 3.9451364410832185e-06, + "loss": 2.8243, + "step": 21504 + }, + { + "epoch": 1.8328645700161936, + "grad_norm": 54.2768775842753, + "learning_rate": 3.944651759012472e-06, + "loss": 2.4991, + "step": 21505 + }, + { + "epoch": 1.832949799710219, + "grad_norm": 58.95813834764433, + "learning_rate": 3.944167087320654e-06, + "loss": 2.8361, + "step": 21506 + }, + { + "epoch": 1.8330350294042446, + "grad_norm": 44.235199062308446, + "learning_rate": 3.943682426012535e-06, + "loss": 3.0937, + "step": 21507 + }, + { + "epoch": 1.8331202590982698, + "grad_norm": 54.058203500446595, + "learning_rate": 3.943197775092879e-06, + "loss": 3.0271, + "step": 21508 + }, + { + "epoch": 1.8332054887922953, + "grad_norm": 37.78947867740608, + "learning_rate": 3.942713134566452e-06, + "loss": 2.6267, + "step": 21509 + }, + { + "epoch": 1.8332907184863205, + "grad_norm": 26.5305102529973, + "learning_rate": 3.942228504438021e-06, + "loss": 1.8138, + "step": 21510 + }, + { + "epoch": 1.833375948180346, + "grad_norm": 81.59673944001842, + "learning_rate": 3.941743884712355e-06, + "loss": 2.9173, + "step": 21511 + }, + { + "epoch": 1.8334611778743715, + "grad_norm": 45.88390196622243, + "learning_rate": 3.941259275394215e-06, + "loss": 3.0406, + "step": 21512 + }, + { + "epoch": 1.833546407568397, + "grad_norm": 107.85079238437835, + "learning_rate": 3.94077467648837e-06, + "loss": 2.6024, + "step": 21513 + }, + { + "epoch": 1.8336316372624222, + "grad_norm": 69.10617244272075, + "learning_rate": 3.940290087999584e-06, + "loss": 2.2358, + "step": 21514 + }, + { + "epoch": 1.8337168669564476, + "grad_norm": 76.07519469576684, + "learning_rate": 3.939805509932626e-06, + "loss": 3.8813, + "step": 21515 + }, + { + "epoch": 1.833802096650473, + "grad_norm": 93.78364927418487, + "learning_rate": 3.939320942292258e-06, + "loss": 3.7011, + "step": 21516 + }, + { + "epoch": 1.8338873263444984, + "grad_norm": 42.555489710843496, + "learning_rate": 3.9388363850832486e-06, + "loss": 2.7533, + "step": 21517 + }, + { + "epoch": 1.8339725560385238, + "grad_norm": 43.99085505266545, + "learning_rate": 3.938351838310362e-06, + "loss": 2.6756, + "step": 21518 + }, + { + "epoch": 1.8340577857325493, + "grad_norm": 136.37083380498657, + "learning_rate": 3.93786730197836e-06, + "loss": 4.6098, + "step": 21519 + }, + { + "epoch": 1.8341430154265748, + "grad_norm": 37.18572065593992, + "learning_rate": 3.937382776092013e-06, + "loss": 2.5425, + "step": 21520 + }, + { + "epoch": 1.8342282451206, + "grad_norm": 76.73330068345724, + "learning_rate": 3.936898260656084e-06, + "loss": 2.8851, + "step": 21521 + }, + { + "epoch": 1.8343134748146253, + "grad_norm": 67.5454320758178, + "learning_rate": 3.93641375567534e-06, + "loss": 3.0973, + "step": 21522 + }, + { + "epoch": 1.8343987045086507, + "grad_norm": 35.53438102588326, + "learning_rate": 3.93592926115454e-06, + "loss": 3.0214, + "step": 21523 + }, + { + "epoch": 1.8344839342026762, + "grad_norm": 38.20349801273094, + "learning_rate": 3.935444777098456e-06, + "loss": 1.8888, + "step": 21524 + }, + { + "epoch": 1.8345691638967017, + "grad_norm": 49.9044588884029, + "learning_rate": 3.934960303511849e-06, + "loss": 2.7285, + "step": 21525 + }, + { + "epoch": 1.8346543935907271, + "grad_norm": 38.45050213860297, + "learning_rate": 3.934475840399485e-06, + "loss": 3.018, + "step": 21526 + }, + { + "epoch": 1.8347396232847524, + "grad_norm": 128.80902164565865, + "learning_rate": 3.933991387766125e-06, + "loss": 3.1045, + "step": 21527 + }, + { + "epoch": 1.8348248529787778, + "grad_norm": 32.09395148890701, + "learning_rate": 3.933506945616539e-06, + "loss": 2.8203, + "step": 21528 + }, + { + "epoch": 1.834910082672803, + "grad_norm": 89.68013378974322, + "learning_rate": 3.933022513955488e-06, + "loss": 2.5995, + "step": 21529 + }, + { + "epoch": 1.8349953123668286, + "grad_norm": 42.0303580697291, + "learning_rate": 3.932538092787736e-06, + "loss": 3.263, + "step": 21530 + }, + { + "epoch": 1.835080542060854, + "grad_norm": 58.408968603078236, + "learning_rate": 3.932053682118046e-06, + "loss": 2.517, + "step": 21531 + }, + { + "epoch": 1.8351657717548795, + "grad_norm": 40.05111205421259, + "learning_rate": 3.931569281951185e-06, + "loss": 2.1722, + "step": 21532 + }, + { + "epoch": 1.835251001448905, + "grad_norm": 44.735250232170316, + "learning_rate": 3.931084892291917e-06, + "loss": 3.597, + "step": 21533 + }, + { + "epoch": 1.8353362311429302, + "grad_norm": 34.886208839720574, + "learning_rate": 3.930600513145004e-06, + "loss": 2.8748, + "step": 21534 + }, + { + "epoch": 1.8354214608369555, + "grad_norm": 71.253068335013, + "learning_rate": 3.9301161445152095e-06, + "loss": 2.4033, + "step": 21535 + }, + { + "epoch": 1.835506690530981, + "grad_norm": 51.39596549877629, + "learning_rate": 3.929631786407295e-06, + "loss": 2.2364, + "step": 21536 + }, + { + "epoch": 1.8355919202250064, + "grad_norm": 45.63208726397057, + "learning_rate": 3.92914743882603e-06, + "loss": 2.3276, + "step": 21537 + }, + { + "epoch": 1.8356771499190319, + "grad_norm": 62.34869094805784, + "learning_rate": 3.928663101776174e-06, + "loss": 2.6357, + "step": 21538 + }, + { + "epoch": 1.8357623796130573, + "grad_norm": 192.87743321205505, + "learning_rate": 3.928178775262489e-06, + "loss": 3.5412, + "step": 21539 + }, + { + "epoch": 1.8358476093070826, + "grad_norm": 50.56192793034241, + "learning_rate": 3.92769445928974e-06, + "loss": 2.1337, + "step": 21540 + }, + { + "epoch": 1.835932839001108, + "grad_norm": 36.04145043812639, + "learning_rate": 3.927210153862691e-06, + "loss": 2.8059, + "step": 21541 + }, + { + "epoch": 1.8360180686951333, + "grad_norm": 66.75648455737728, + "learning_rate": 3.926725858986104e-06, + "loss": 1.7881, + "step": 21542 + }, + { + "epoch": 1.8361032983891588, + "grad_norm": 38.02475364760068, + "learning_rate": 3.926241574664742e-06, + "loss": 2.3585, + "step": 21543 + }, + { + "epoch": 1.8361885280831842, + "grad_norm": 44.94762671477363, + "learning_rate": 3.925757300903365e-06, + "loss": 3.1499, + "step": 21544 + }, + { + "epoch": 1.8362737577772097, + "grad_norm": 24.826757198490174, + "learning_rate": 3.92527303770674e-06, + "loss": 1.6865, + "step": 21545 + }, + { + "epoch": 1.836358987471235, + "grad_norm": 36.226792710113806, + "learning_rate": 3.924788785079629e-06, + "loss": 2.1739, + "step": 21546 + }, + { + "epoch": 1.8364442171652604, + "grad_norm": 41.63649679507676, + "learning_rate": 3.9243045430267916e-06, + "loss": 3.0609, + "step": 21547 + }, + { + "epoch": 1.8365294468592857, + "grad_norm": 73.27246785842627, + "learning_rate": 3.9238203115529915e-06, + "loss": 3.1041, + "step": 21548 + }, + { + "epoch": 1.8366146765533111, + "grad_norm": 41.708854557280844, + "learning_rate": 3.92333609066299e-06, + "loss": 1.4068, + "step": 21549 + }, + { + "epoch": 1.8366999062473366, + "grad_norm": 60.39695172688459, + "learning_rate": 3.922851880361552e-06, + "loss": 2.0833, + "step": 21550 + }, + { + "epoch": 1.836785135941362, + "grad_norm": 80.53209518726568, + "learning_rate": 3.922367680653436e-06, + "loss": 3.1882, + "step": 21551 + }, + { + "epoch": 1.8368703656353875, + "grad_norm": 95.59481077773273, + "learning_rate": 3.921883491543408e-06, + "loss": 4.6593, + "step": 21552 + }, + { + "epoch": 1.8369555953294128, + "grad_norm": 102.62382364693292, + "learning_rate": 3.921399313036224e-06, + "loss": 2.9412, + "step": 21553 + }, + { + "epoch": 1.837040825023438, + "grad_norm": 74.61213398742836, + "learning_rate": 3.920915145136652e-06, + "loss": 2.821, + "step": 21554 + }, + { + "epoch": 1.8371260547174635, + "grad_norm": 37.79521943548834, + "learning_rate": 3.920430987849449e-06, + "loss": 2.4918, + "step": 21555 + }, + { + "epoch": 1.837211284411489, + "grad_norm": 46.576537787494516, + "learning_rate": 3.919946841179381e-06, + "loss": 1.2993, + "step": 21556 + }, + { + "epoch": 1.8372965141055144, + "grad_norm": 29.249620140421683, + "learning_rate": 3.919462705131204e-06, + "loss": 2.3611, + "step": 21557 + }, + { + "epoch": 1.83738174379954, + "grad_norm": 71.14195971589061, + "learning_rate": 3.918978579709683e-06, + "loss": 3.2954, + "step": 21558 + }, + { + "epoch": 1.8374669734935651, + "grad_norm": 47.095887685707595, + "learning_rate": 3.918494464919577e-06, + "loss": 3.1319, + "step": 21559 + }, + { + "epoch": 1.8375522031875906, + "grad_norm": 46.71302179348924, + "learning_rate": 3.918010360765651e-06, + "loss": 2.9167, + "step": 21560 + }, + { + "epoch": 1.8376374328816159, + "grad_norm": 64.74690073733156, + "learning_rate": 3.917526267252661e-06, + "loss": 2.199, + "step": 21561 + }, + { + "epoch": 1.8377226625756413, + "grad_norm": 38.20023334915759, + "learning_rate": 3.917042184385369e-06, + "loss": 2.716, + "step": 21562 + }, + { + "epoch": 1.8378078922696668, + "grad_norm": 32.568409136427846, + "learning_rate": 3.916558112168538e-06, + "loss": 2.5084, + "step": 21563 + }, + { + "epoch": 1.8378931219636923, + "grad_norm": 48.64470879832339, + "learning_rate": 3.916074050606928e-06, + "loss": 4.0908, + "step": 21564 + }, + { + "epoch": 1.8379783516577175, + "grad_norm": 43.46489749646406, + "learning_rate": 3.915589999705299e-06, + "loss": 2.7615, + "step": 21565 + }, + { + "epoch": 1.838063581351743, + "grad_norm": 77.06132252008986, + "learning_rate": 3.91510595946841e-06, + "loss": 3.3791, + "step": 21566 + }, + { + "epoch": 1.8381488110457682, + "grad_norm": 56.05574208835715, + "learning_rate": 3.914621929901024e-06, + "loss": 3.2785, + "step": 21567 + }, + { + "epoch": 1.8382340407397937, + "grad_norm": 27.679274744420276, + "learning_rate": 3.914137911007901e-06, + "loss": 2.1185, + "step": 21568 + }, + { + "epoch": 1.8383192704338192, + "grad_norm": 84.34800970141653, + "learning_rate": 3.913653902793799e-06, + "loss": 2.6412, + "step": 21569 + }, + { + "epoch": 1.8384045001278446, + "grad_norm": 61.64745128170054, + "learning_rate": 3.913169905263478e-06, + "loss": 2.9743, + "step": 21570 + }, + { + "epoch": 1.83848972982187, + "grad_norm": 44.28057944796963, + "learning_rate": 3.912685918421699e-06, + "loss": 2.7804, + "step": 21571 + }, + { + "epoch": 1.8385749595158953, + "grad_norm": 122.85469523527391, + "learning_rate": 3.912201942273224e-06, + "loss": 4.5776, + "step": 21572 + }, + { + "epoch": 1.8386601892099206, + "grad_norm": 54.69438454757667, + "learning_rate": 3.9117179768228105e-06, + "loss": 3.56, + "step": 21573 + }, + { + "epoch": 1.838745418903946, + "grad_norm": 92.54354893159044, + "learning_rate": 3.911234022075216e-06, + "loss": 2.259, + "step": 21574 + }, + { + "epoch": 1.8388306485979715, + "grad_norm": 33.75091025057149, + "learning_rate": 3.910750078035204e-06, + "loss": 2.1467, + "step": 21575 + }, + { + "epoch": 1.838915878291997, + "grad_norm": 70.01600977159212, + "learning_rate": 3.910266144707533e-06, + "loss": 2.2582, + "step": 21576 + }, + { + "epoch": 1.8390011079860225, + "grad_norm": 57.49074460564407, + "learning_rate": 3.909782222096962e-06, + "loss": 3.5501, + "step": 21577 + }, + { + "epoch": 1.8390863376800477, + "grad_norm": 54.69416563911946, + "learning_rate": 3.909298310208247e-06, + "loss": 2.8814, + "step": 21578 + }, + { + "epoch": 1.8391715673740732, + "grad_norm": 28.261487204380153, + "learning_rate": 3.9088144090461496e-06, + "loss": 2.2956, + "step": 21579 + }, + { + "epoch": 1.8392567970680984, + "grad_norm": 66.64451604584868, + "learning_rate": 3.908330518615431e-06, + "loss": 3.5219, + "step": 21580 + }, + { + "epoch": 1.8393420267621239, + "grad_norm": 89.65949165788415, + "learning_rate": 3.907846638920848e-06, + "loss": 2.8454, + "step": 21581 + }, + { + "epoch": 1.8394272564561494, + "grad_norm": 33.86294695641431, + "learning_rate": 3.907362769967158e-06, + "loss": 2.1854, + "step": 21582 + }, + { + "epoch": 1.8395124861501748, + "grad_norm": 39.85545781537466, + "learning_rate": 3.90687891175912e-06, + "loss": 2.8056, + "step": 21583 + }, + { + "epoch": 1.8395977158442, + "grad_norm": 30.857905372233116, + "learning_rate": 3.906395064301496e-06, + "loss": 2.313, + "step": 21584 + }, + { + "epoch": 1.8396829455382255, + "grad_norm": 43.49330301471438, + "learning_rate": 3.9059112275990415e-06, + "loss": 2.287, + "step": 21585 + }, + { + "epoch": 1.8397681752322508, + "grad_norm": 57.368540086510926, + "learning_rate": 3.905427401656514e-06, + "loss": 1.9928, + "step": 21586 + }, + { + "epoch": 1.8398534049262762, + "grad_norm": 21.356269416683105, + "learning_rate": 3.904943586478673e-06, + "loss": 1.5007, + "step": 21587 + }, + { + "epoch": 1.8399386346203017, + "grad_norm": 63.543989558199115, + "learning_rate": 3.904459782070278e-06, + "loss": 3.1541, + "step": 21588 + }, + { + "epoch": 1.8400238643143272, + "grad_norm": 59.301380223028175, + "learning_rate": 3.903975988436086e-06, + "loss": 2.9101, + "step": 21589 + }, + { + "epoch": 1.8401090940083527, + "grad_norm": 49.679857942673294, + "learning_rate": 3.903492205580853e-06, + "loss": 3.2689, + "step": 21590 + }, + { + "epoch": 1.840194323702378, + "grad_norm": 82.81931897371334, + "learning_rate": 3.9030084335093405e-06, + "loss": 2.5584, + "step": 21591 + }, + { + "epoch": 1.8402795533964031, + "grad_norm": 27.78890576710324, + "learning_rate": 3.902524672226302e-06, + "loss": 2.1683, + "step": 21592 + }, + { + "epoch": 1.8403647830904286, + "grad_norm": 59.01811514599815, + "learning_rate": 3.902040921736498e-06, + "loss": 4.33, + "step": 21593 + }, + { + "epoch": 1.840450012784454, + "grad_norm": 42.05146915443467, + "learning_rate": 3.901557182044686e-06, + "loss": 2.5822, + "step": 21594 + }, + { + "epoch": 1.8405352424784795, + "grad_norm": 60.76415352818003, + "learning_rate": 3.901073453155623e-06, + "loss": 3.6137, + "step": 21595 + }, + { + "epoch": 1.840620472172505, + "grad_norm": 45.32917767186387, + "learning_rate": 3.900589735074064e-06, + "loss": 3.0364, + "step": 21596 + }, + { + "epoch": 1.8407057018665303, + "grad_norm": 91.17136932409058, + "learning_rate": 3.9001060278047695e-06, + "loss": 2.9664, + "step": 21597 + }, + { + "epoch": 1.8407909315605557, + "grad_norm": 110.75627720877715, + "learning_rate": 3.899622331352494e-06, + "loss": 3.4361, + "step": 21598 + }, + { + "epoch": 1.840876161254581, + "grad_norm": 77.64414439708555, + "learning_rate": 3.899138645721997e-06, + "loss": 2.7455, + "step": 21599 + }, + { + "epoch": 1.8409613909486064, + "grad_norm": 38.191101984333756, + "learning_rate": 3.8986549709180325e-06, + "loss": 2.6813, + "step": 21600 + }, + { + "epoch": 1.841046620642632, + "grad_norm": 39.827897378179046, + "learning_rate": 3.89817130694536e-06, + "loss": 3.3135, + "step": 21601 + }, + { + "epoch": 1.8411318503366574, + "grad_norm": 36.170040949926694, + "learning_rate": 3.897687653808734e-06, + "loss": 2.7744, + "step": 21602 + }, + { + "epoch": 1.8412170800306829, + "grad_norm": 35.23509231270728, + "learning_rate": 3.897204011512913e-06, + "loss": 2.6457, + "step": 21603 + }, + { + "epoch": 1.841302309724708, + "grad_norm": 72.78615264039037, + "learning_rate": 3.896720380062651e-06, + "loss": 2.1739, + "step": 21604 + }, + { + "epoch": 1.8413875394187333, + "grad_norm": 27.14747380106148, + "learning_rate": 3.896236759462707e-06, + "loss": 1.3285, + "step": 21605 + }, + { + "epoch": 1.8414727691127588, + "grad_norm": 163.85937156457553, + "learning_rate": 3.895753149717835e-06, + "loss": 5.2544, + "step": 21606 + }, + { + "epoch": 1.8415579988067843, + "grad_norm": 39.016899134299926, + "learning_rate": 3.895269550832793e-06, + "loss": 2.6657, + "step": 21607 + }, + { + "epoch": 1.8416432285008097, + "grad_norm": 39.579599573864925, + "learning_rate": 3.894785962812336e-06, + "loss": 2.2656, + "step": 21608 + }, + { + "epoch": 1.8417284581948352, + "grad_norm": 53.61942572290556, + "learning_rate": 3.8943023856612174e-06, + "loss": 1.7932, + "step": 21609 + }, + { + "epoch": 1.8418136878888605, + "grad_norm": 62.19636722363297, + "learning_rate": 3.893818819384198e-06, + "loss": 2.826, + "step": 21610 + }, + { + "epoch": 1.841898917582886, + "grad_norm": 37.53799217872751, + "learning_rate": 3.89333526398603e-06, + "loss": 2.6317, + "step": 21611 + }, + { + "epoch": 1.8419841472769112, + "grad_norm": 34.59223600537087, + "learning_rate": 3.892851719471471e-06, + "loss": 2.4289, + "step": 21612 + }, + { + "epoch": 1.8420693769709366, + "grad_norm": 34.03361740216323, + "learning_rate": 3.892368185845273e-06, + "loss": 1.8887, + "step": 21613 + }, + { + "epoch": 1.8421546066649621, + "grad_norm": 106.79322223034372, + "learning_rate": 3.891884663112195e-06, + "loss": 2.8875, + "step": 21614 + }, + { + "epoch": 1.8422398363589876, + "grad_norm": 40.42777837458875, + "learning_rate": 3.891401151276993e-06, + "loss": 3.0052, + "step": 21615 + }, + { + "epoch": 1.8423250660530128, + "grad_norm": 165.00387647542706, + "learning_rate": 3.890917650344419e-06, + "loss": 2.313, + "step": 21616 + }, + { + "epoch": 1.8424102957470383, + "grad_norm": 56.09381104287217, + "learning_rate": 3.890434160319227e-06, + "loss": 2.8174, + "step": 21617 + }, + { + "epoch": 1.8424955254410635, + "grad_norm": 44.12909856346422, + "learning_rate": 3.889950681206176e-06, + "loss": 3.2067, + "step": 21618 + }, + { + "epoch": 1.842580755135089, + "grad_norm": 88.33975000184388, + "learning_rate": 3.889467213010019e-06, + "loss": 3.1361, + "step": 21619 + }, + { + "epoch": 1.8426659848291145, + "grad_norm": 34.55198550020992, + "learning_rate": 3.888983755735511e-06, + "loss": 1.8791, + "step": 21620 + }, + { + "epoch": 1.84275121452314, + "grad_norm": 66.9106165340363, + "learning_rate": 3.888500309387405e-06, + "loss": 2.5449, + "step": 21621 + }, + { + "epoch": 1.8428364442171654, + "grad_norm": 65.55773562044764, + "learning_rate": 3.888016873970456e-06, + "loss": 3.7822, + "step": 21622 + }, + { + "epoch": 1.8429216739111907, + "grad_norm": 48.26441560098495, + "learning_rate": 3.887533449489422e-06, + "loss": 3.0248, + "step": 21623 + }, + { + "epoch": 1.843006903605216, + "grad_norm": 320.769192353636, + "learning_rate": 3.8870500359490535e-06, + "loss": 3.0524, + "step": 21624 + }, + { + "epoch": 1.8430921332992414, + "grad_norm": 45.66901937906189, + "learning_rate": 3.886566633354105e-06, + "loss": 2.8216, + "step": 21625 + }, + { + "epoch": 1.8431773629932668, + "grad_norm": 49.738137284359134, + "learning_rate": 3.88608324170933e-06, + "loss": 2.397, + "step": 21626 + }, + { + "epoch": 1.8432625926872923, + "grad_norm": 61.22711277495418, + "learning_rate": 3.885599861019487e-06, + "loss": 3.4626, + "step": 21627 + }, + { + "epoch": 1.8433478223813178, + "grad_norm": 89.87473174252837, + "learning_rate": 3.885116491289325e-06, + "loss": 2.3511, + "step": 21628 + }, + { + "epoch": 1.843433052075343, + "grad_norm": 50.840447313950705, + "learning_rate": 3.884633132523599e-06, + "loss": 3.1776, + "step": 21629 + }, + { + "epoch": 1.8435182817693685, + "grad_norm": 29.647795507497495, + "learning_rate": 3.884149784727062e-06, + "loss": 2.889, + "step": 21630 + }, + { + "epoch": 1.8436035114633937, + "grad_norm": 93.48878539111487, + "learning_rate": 3.883666447904469e-06, + "loss": 3.6315, + "step": 21631 + }, + { + "epoch": 1.8436887411574192, + "grad_norm": 81.4930959029402, + "learning_rate": 3.883183122060575e-06, + "loss": 4.4662, + "step": 21632 + }, + { + "epoch": 1.8437739708514447, + "grad_norm": 49.493977179872495, + "learning_rate": 3.882699807200129e-06, + "loss": 3.2974, + "step": 21633 + }, + { + "epoch": 1.8438592005454701, + "grad_norm": 46.223946956313966, + "learning_rate": 3.882216503327888e-06, + "loss": 3.0984, + "step": 21634 + }, + { + "epoch": 1.8439444302394954, + "grad_norm": 86.36185832593887, + "learning_rate": 3.881733210448601e-06, + "loss": 3.9529, + "step": 21635 + }, + { + "epoch": 1.8440296599335209, + "grad_norm": 60.12668218420061, + "learning_rate": 3.881249928567026e-06, + "loss": 2.1362, + "step": 21636 + }, + { + "epoch": 1.844114889627546, + "grad_norm": 45.124350888848376, + "learning_rate": 3.8807666576879126e-06, + "loss": 1.2973, + "step": 21637 + }, + { + "epoch": 1.8442001193215716, + "grad_norm": 64.45327501654131, + "learning_rate": 3.880283397816015e-06, + "loss": 2.8249, + "step": 21638 + }, + { + "epoch": 1.844285349015597, + "grad_norm": 56.51916246358451, + "learning_rate": 3.879800148956082e-06, + "loss": 1.9816, + "step": 21639 + }, + { + "epoch": 1.8443705787096225, + "grad_norm": 60.569025324012614, + "learning_rate": 3.879316911112873e-06, + "loss": 3.0807, + "step": 21640 + }, + { + "epoch": 1.844455808403648, + "grad_norm": 61.80502507073703, + "learning_rate": 3.878833684291135e-06, + "loss": 2.4018, + "step": 21641 + }, + { + "epoch": 1.8445410380976732, + "grad_norm": 35.869146955724474, + "learning_rate": 3.878350468495625e-06, + "loss": 2.2047, + "step": 21642 + }, + { + "epoch": 1.8446262677916985, + "grad_norm": 38.94945946171556, + "learning_rate": 3.877867263731089e-06, + "loss": 2.401, + "step": 21643 + }, + { + "epoch": 1.844711497485724, + "grad_norm": 105.50168085585973, + "learning_rate": 3.877384070002285e-06, + "loss": 4.2389, + "step": 21644 + }, + { + "epoch": 1.8447967271797494, + "grad_norm": 65.58394183570994, + "learning_rate": 3.876900887313961e-06, + "loss": 2.9846, + "step": 21645 + }, + { + "epoch": 1.8448819568737749, + "grad_norm": 76.59040588582239, + "learning_rate": 3.876417715670872e-06, + "loss": 3.3153, + "step": 21646 + }, + { + "epoch": 1.8449671865678003, + "grad_norm": 44.63461957216313, + "learning_rate": 3.875934555077766e-06, + "loss": 3.0714, + "step": 21647 + }, + { + "epoch": 1.8450524162618256, + "grad_norm": 39.760118770749195, + "learning_rate": 3.875451405539398e-06, + "loss": 2.5119, + "step": 21648 + }, + { + "epoch": 1.845137645955851, + "grad_norm": 33.77627314021654, + "learning_rate": 3.874968267060519e-06, + "loss": 2.6565, + "step": 21649 + }, + { + "epoch": 1.8452228756498763, + "grad_norm": 80.28148908579175, + "learning_rate": 3.874485139645881e-06, + "loss": 3.2369, + "step": 21650 + }, + { + "epoch": 1.8453081053439018, + "grad_norm": 36.094510273268554, + "learning_rate": 3.874002023300234e-06, + "loss": 2.7754, + "step": 21651 + }, + { + "epoch": 1.8453933350379272, + "grad_norm": 55.725383034690495, + "learning_rate": 3.873518918028329e-06, + "loss": 1.8725, + "step": 21652 + }, + { + "epoch": 1.8454785647319527, + "grad_norm": 33.81708184991697, + "learning_rate": 3.873035823834916e-06, + "loss": 3.3258, + "step": 21653 + }, + { + "epoch": 1.8455637944259782, + "grad_norm": 54.092403423209745, + "learning_rate": 3.872552740724751e-06, + "loss": 2.3743, + "step": 21654 + }, + { + "epoch": 1.8456490241200034, + "grad_norm": 73.36361751935665, + "learning_rate": 3.872069668702582e-06, + "loss": 3.9472, + "step": 21655 + }, + { + "epoch": 1.8457342538140287, + "grad_norm": 51.34097620915667, + "learning_rate": 3.871586607773156e-06, + "loss": 2.6113, + "step": 21656 + }, + { + "epoch": 1.8458194835080541, + "grad_norm": 51.036960008259534, + "learning_rate": 3.871103557941229e-06, + "loss": 3.4683, + "step": 21657 + }, + { + "epoch": 1.8459047132020796, + "grad_norm": 65.485511257108, + "learning_rate": 3.870620519211552e-06, + "loss": 2.8707, + "step": 21658 + }, + { + "epoch": 1.845989942896105, + "grad_norm": 86.62314195019513, + "learning_rate": 3.870137491588873e-06, + "loss": 2.3172, + "step": 21659 + }, + { + "epoch": 1.8460751725901305, + "grad_norm": 51.17124865213394, + "learning_rate": 3.8696544750779394e-06, + "loss": 2.5289, + "step": 21660 + }, + { + "epoch": 1.8461604022841558, + "grad_norm": 58.82107812641428, + "learning_rate": 3.869171469683507e-06, + "loss": 3.3381, + "step": 21661 + }, + { + "epoch": 1.846245631978181, + "grad_norm": 45.94499851463746, + "learning_rate": 3.8686884754103245e-06, + "loss": 2.1495, + "step": 21662 + }, + { + "epoch": 1.8463308616722065, + "grad_norm": 125.57012764603506, + "learning_rate": 3.8682054922631415e-06, + "loss": 4.1237, + "step": 21663 + }, + { + "epoch": 1.846416091366232, + "grad_norm": 46.99206695367427, + "learning_rate": 3.8677225202467074e-06, + "loss": 3.2038, + "step": 21664 + }, + { + "epoch": 1.8465013210602574, + "grad_norm": 64.91719237861972, + "learning_rate": 3.86723955936577e-06, + "loss": 4.3709, + "step": 21665 + }, + { + "epoch": 1.846586550754283, + "grad_norm": 38.55429720014282, + "learning_rate": 3.8667566096250845e-06, + "loss": 2.8391, + "step": 21666 + }, + { + "epoch": 1.8466717804483082, + "grad_norm": 46.614304787465876, + "learning_rate": 3.866273671029398e-06, + "loss": 2.8578, + "step": 21667 + }, + { + "epoch": 1.8467570101423336, + "grad_norm": 43.18664054682039, + "learning_rate": 3.865790743583457e-06, + "loss": 2.6278, + "step": 21668 + }, + { + "epoch": 1.8468422398363589, + "grad_norm": 64.01753591252364, + "learning_rate": 3.865307827292013e-06, + "loss": 2.5218, + "step": 21669 + }, + { + "epoch": 1.8469274695303843, + "grad_norm": 35.22499076920521, + "learning_rate": 3.8648249221598176e-06, + "loss": 2.4832, + "step": 21670 + }, + { + "epoch": 1.8470126992244098, + "grad_norm": 38.96077184320198, + "learning_rate": 3.8643420281916175e-06, + "loss": 1.7843, + "step": 21671 + }, + { + "epoch": 1.8470979289184353, + "grad_norm": 50.10819575788572, + "learning_rate": 3.863859145392162e-06, + "loss": 3.0883, + "step": 21672 + }, + { + "epoch": 1.8471831586124607, + "grad_norm": 75.85894511378655, + "learning_rate": 3.863376273766199e-06, + "loss": 2.2303, + "step": 21673 + }, + { + "epoch": 1.847268388306486, + "grad_norm": 31.827259395417038, + "learning_rate": 3.862893413318481e-06, + "loss": 2.2335, + "step": 21674 + }, + { + "epoch": 1.8473536180005112, + "grad_norm": 42.75313405539021, + "learning_rate": 3.862410564053754e-06, + "loss": 2.892, + "step": 21675 + }, + { + "epoch": 1.8474388476945367, + "grad_norm": 38.96325475066024, + "learning_rate": 3.861927725976766e-06, + "loss": 3.2498, + "step": 21676 + }, + { + "epoch": 1.8475240773885622, + "grad_norm": 33.81916280268497, + "learning_rate": 3.861444899092266e-06, + "loss": 2.8831, + "step": 21677 + }, + { + "epoch": 1.8476093070825876, + "grad_norm": 42.766488973740806, + "learning_rate": 3.860962083405005e-06, + "loss": 3.3448, + "step": 21678 + }, + { + "epoch": 1.847694536776613, + "grad_norm": 32.830821609178, + "learning_rate": 3.86047927891973e-06, + "loss": 2.8918, + "step": 21679 + }, + { + "epoch": 1.8477797664706384, + "grad_norm": 45.04436052685518, + "learning_rate": 3.859996485641186e-06, + "loss": 2.3589, + "step": 21680 + }, + { + "epoch": 1.8478649961646638, + "grad_norm": 32.84996928533261, + "learning_rate": 3.859513703574125e-06, + "loss": 2.8579, + "step": 21681 + }, + { + "epoch": 1.847950225858689, + "grad_norm": 45.87271122094021, + "learning_rate": 3.859030932723291e-06, + "loss": 2.2921, + "step": 21682 + }, + { + "epoch": 1.8480354555527145, + "grad_norm": 55.95883643343868, + "learning_rate": 3.8585481730934385e-06, + "loss": 3.3098, + "step": 21683 + }, + { + "epoch": 1.84812068524674, + "grad_norm": 22.051629379556164, + "learning_rate": 3.858065424689308e-06, + "loss": 1.5345, + "step": 21684 + }, + { + "epoch": 1.8482059149407655, + "grad_norm": 71.72342209222312, + "learning_rate": 3.857582687515652e-06, + "loss": 4.2958, + "step": 21685 + }, + { + "epoch": 1.8482911446347907, + "grad_norm": 39.432004648740175, + "learning_rate": 3.857099961577215e-06, + "loss": 2.8483, + "step": 21686 + }, + { + "epoch": 1.8483763743288162, + "grad_norm": 37.59959741830925, + "learning_rate": 3.856617246878747e-06, + "loss": 2.8955, + "step": 21687 + }, + { + "epoch": 1.8484616040228414, + "grad_norm": 41.969001366831826, + "learning_rate": 3.856134543424993e-06, + "loss": 2.1283, + "step": 21688 + }, + { + "epoch": 1.848546833716867, + "grad_norm": 46.41778329234866, + "learning_rate": 3.855651851220703e-06, + "loss": 2.8338, + "step": 21689 + }, + { + "epoch": 1.8486320634108924, + "grad_norm": 30.97332356846327, + "learning_rate": 3.85516917027062e-06, + "loss": 2.4362, + "step": 21690 + }, + { + "epoch": 1.8487172931049178, + "grad_norm": 69.01590945370047, + "learning_rate": 3.8546865005794945e-06, + "loss": 2.7381, + "step": 21691 + }, + { + "epoch": 1.8488025227989433, + "grad_norm": 46.43168967815356, + "learning_rate": 3.854203842152072e-06, + "loss": 2.0742, + "step": 21692 + }, + { + "epoch": 1.8488877524929686, + "grad_norm": 31.070430292146252, + "learning_rate": 3.8537211949931e-06, + "loss": 2.4725, + "step": 21693 + }, + { + "epoch": 1.8489729821869938, + "grad_norm": 34.63506052464239, + "learning_rate": 3.853238559107325e-06, + "loss": 2.2319, + "step": 21694 + }, + { + "epoch": 1.8490582118810193, + "grad_norm": 37.68834735753811, + "learning_rate": 3.8527559344994905e-06, + "loss": 2.4494, + "step": 21695 + }, + { + "epoch": 1.8491434415750447, + "grad_norm": 29.94719592224091, + "learning_rate": 3.852273321174347e-06, + "loss": 2.2208, + "step": 21696 + }, + { + "epoch": 1.8492286712690702, + "grad_norm": 18.984306870478328, + "learning_rate": 3.85179071913664e-06, + "loss": 0.7492, + "step": 21697 + }, + { + "epoch": 1.8493139009630957, + "grad_norm": 49.9358800346928, + "learning_rate": 3.8513081283911155e-06, + "loss": 3.5855, + "step": 21698 + }, + { + "epoch": 1.849399130657121, + "grad_norm": 60.10449056848144, + "learning_rate": 3.850825548942516e-06, + "loss": 3.059, + "step": 21699 + }, + { + "epoch": 1.8494843603511464, + "grad_norm": 57.67398200464287, + "learning_rate": 3.8503429807955925e-06, + "loss": 3.2945, + "step": 21700 + }, + { + "epoch": 1.8495695900451716, + "grad_norm": 49.55127459328719, + "learning_rate": 3.84986042395509e-06, + "loss": 2.989, + "step": 21701 + }, + { + "epoch": 1.849654819739197, + "grad_norm": 82.25814902690576, + "learning_rate": 3.849377878425753e-06, + "loss": 3.5219, + "step": 21702 + }, + { + "epoch": 1.8497400494332226, + "grad_norm": 51.01343734743188, + "learning_rate": 3.8488953442123255e-06, + "loss": 2.3886, + "step": 21703 + }, + { + "epoch": 1.849825279127248, + "grad_norm": 32.443141762911964, + "learning_rate": 3.848412821319556e-06, + "loss": 2.0789, + "step": 21704 + }, + { + "epoch": 1.8499105088212733, + "grad_norm": 45.91390509098055, + "learning_rate": 3.84793030975219e-06, + "loss": 1.6247, + "step": 21705 + }, + { + "epoch": 1.8499957385152987, + "grad_norm": 109.79821874628722, + "learning_rate": 3.8474478095149705e-06, + "loss": 4.141, + "step": 21706 + }, + { + "epoch": 1.850080968209324, + "grad_norm": 30.605335085806974, + "learning_rate": 3.846965320612643e-06, + "loss": 2.6282, + "step": 21707 + }, + { + "epoch": 1.8501661979033495, + "grad_norm": 70.83787019268185, + "learning_rate": 3.846482843049954e-06, + "loss": 3.8796, + "step": 21708 + }, + { + "epoch": 1.850251427597375, + "grad_norm": 43.8125418812105, + "learning_rate": 3.846000376831648e-06, + "loss": 2.7247, + "step": 21709 + }, + { + "epoch": 1.8503366572914004, + "grad_norm": 101.84873932786051, + "learning_rate": 3.845517921962471e-06, + "loss": 3.5426, + "step": 21710 + }, + { + "epoch": 1.8504218869854259, + "grad_norm": 53.45201912833934, + "learning_rate": 3.8450354784471646e-06, + "loss": 2.2456, + "step": 21711 + }, + { + "epoch": 1.8505071166794511, + "grad_norm": 59.06354628944557, + "learning_rate": 3.844553046290474e-06, + "loss": 2.7286, + "step": 21712 + }, + { + "epoch": 1.8505923463734764, + "grad_norm": 34.5242174115057, + "learning_rate": 3.844070625497149e-06, + "loss": 2.6283, + "step": 21713 + }, + { + "epoch": 1.8506775760675018, + "grad_norm": 21.843062048983622, + "learning_rate": 3.843588216071929e-06, + "loss": 1.7003, + "step": 21714 + }, + { + "epoch": 1.8507628057615273, + "grad_norm": 25.41791330556322, + "learning_rate": 3.843105818019558e-06, + "loss": 2.1395, + "step": 21715 + }, + { + "epoch": 1.8508480354555528, + "grad_norm": 57.03489999396678, + "learning_rate": 3.842623431344781e-06, + "loss": 2.7591, + "step": 21716 + }, + { + "epoch": 1.8509332651495782, + "grad_norm": 35.81688189469297, + "learning_rate": 3.842141056052344e-06, + "loss": 2.8467, + "step": 21717 + }, + { + "epoch": 1.8510184948436035, + "grad_norm": 36.122211189241916, + "learning_rate": 3.84165869214699e-06, + "loss": 2.659, + "step": 21718 + }, + { + "epoch": 1.851103724537629, + "grad_norm": 53.54508034568671, + "learning_rate": 3.8411763396334625e-06, + "loss": 2.9576, + "step": 21719 + }, + { + "epoch": 1.8511889542316542, + "grad_norm": 45.21337030091444, + "learning_rate": 3.840693998516503e-06, + "loss": 2.9104, + "step": 21720 + }, + { + "epoch": 1.8512741839256797, + "grad_norm": 77.50329800875018, + "learning_rate": 3.840211668800859e-06, + "loss": 2.6039, + "step": 21721 + }, + { + "epoch": 1.8513594136197051, + "grad_norm": 47.924966917774384, + "learning_rate": 3.839729350491274e-06, + "loss": 2.8679, + "step": 21722 + }, + { + "epoch": 1.8514446433137306, + "grad_norm": 96.79362327780748, + "learning_rate": 3.839247043592488e-06, + "loss": 3.7914, + "step": 21723 + }, + { + "epoch": 1.851529873007756, + "grad_norm": 46.58476613806396, + "learning_rate": 3.838764748109247e-06, + "loss": 2.5386, + "step": 21724 + }, + { + "epoch": 1.8516151027017813, + "grad_norm": 54.59417609620444, + "learning_rate": 3.8382824640462904e-06, + "loss": 2.1156, + "step": 21725 + }, + { + "epoch": 1.8517003323958066, + "grad_norm": 58.29008054021707, + "learning_rate": 3.837800191408368e-06, + "loss": 2.7712, + "step": 21726 + }, + { + "epoch": 1.851785562089832, + "grad_norm": 21.91202503326919, + "learning_rate": 3.837317930200216e-06, + "loss": 1.3072, + "step": 21727 + }, + { + "epoch": 1.8518707917838575, + "grad_norm": 54.86513273587888, + "learning_rate": 3.836835680426582e-06, + "loss": 2.5339, + "step": 21728 + }, + { + "epoch": 1.851956021477883, + "grad_norm": 73.43550565548321, + "learning_rate": 3.836353442092205e-06, + "loss": 3.2146, + "step": 21729 + }, + { + "epoch": 1.8520412511719084, + "grad_norm": 108.22879361796663, + "learning_rate": 3.835871215201832e-06, + "loss": 3.5783, + "step": 21730 + }, + { + "epoch": 1.8521264808659337, + "grad_norm": 58.5669145781878, + "learning_rate": 3.835388999760201e-06, + "loss": 2.9561, + "step": 21731 + }, + { + "epoch": 1.852211710559959, + "grad_norm": 103.12075502072577, + "learning_rate": 3.8349067957720575e-06, + "loss": 3.0145, + "step": 21732 + }, + { + "epoch": 1.8522969402539844, + "grad_norm": 75.30155177941855, + "learning_rate": 3.834424603242142e-06, + "loss": 3.1922, + "step": 21733 + }, + { + "epoch": 1.8523821699480099, + "grad_norm": 57.25169522698318, + "learning_rate": 3.833942422175198e-06, + "loss": 2.8096, + "step": 21734 + }, + { + "epoch": 1.8524673996420353, + "grad_norm": 50.2202234388659, + "learning_rate": 3.833460252575966e-06, + "loss": 2.7553, + "step": 21735 + }, + { + "epoch": 1.8525526293360608, + "grad_norm": 45.030774271988214, + "learning_rate": 3.832978094449191e-06, + "loss": 3.5554, + "step": 21736 + }, + { + "epoch": 1.852637859030086, + "grad_norm": 42.55160382237611, + "learning_rate": 3.832495947799612e-06, + "loss": 1.8394, + "step": 21737 + }, + { + "epoch": 1.8527230887241115, + "grad_norm": 84.53769084108797, + "learning_rate": 3.832013812631969e-06, + "loss": 3.8427, + "step": 21738 + }, + { + "epoch": 1.8528083184181368, + "grad_norm": 46.86050440415996, + "learning_rate": 3.831531688951007e-06, + "loss": 3.7814, + "step": 21739 + }, + { + "epoch": 1.8528935481121622, + "grad_norm": 60.70733788760162, + "learning_rate": 3.831049576761469e-06, + "loss": 3.0882, + "step": 21740 + }, + { + "epoch": 1.8529787778061877, + "grad_norm": 59.92302718478548, + "learning_rate": 3.8305674760680925e-06, + "loss": 2.1615, + "step": 21741 + }, + { + "epoch": 1.8530640075002132, + "grad_norm": 54.28483374038317, + "learning_rate": 3.830085386875618e-06, + "loss": 2.2303, + "step": 21742 + }, + { + "epoch": 1.8531492371942386, + "grad_norm": 45.88540007424653, + "learning_rate": 3.829603309188791e-06, + "loss": 3.2392, + "step": 21743 + }, + { + "epoch": 1.8532344668882639, + "grad_norm": 54.25665148863487, + "learning_rate": 3.82912124301235e-06, + "loss": 2.5516, + "step": 21744 + }, + { + "epoch": 1.8533196965822891, + "grad_norm": 40.20871430002412, + "learning_rate": 3.828639188351038e-06, + "loss": 2.7412, + "step": 21745 + }, + { + "epoch": 1.8534049262763146, + "grad_norm": 49.71292843805983, + "learning_rate": 3.8281571452095914e-06, + "loss": 3.0351, + "step": 21746 + }, + { + "epoch": 1.85349015597034, + "grad_norm": 75.80505769711682, + "learning_rate": 3.827675113592753e-06, + "loss": 2.6964, + "step": 21747 + }, + { + "epoch": 1.8535753856643655, + "grad_norm": 41.05216796398416, + "learning_rate": 3.827193093505268e-06, + "loss": 2.5903, + "step": 21748 + }, + { + "epoch": 1.853660615358391, + "grad_norm": 38.1877090649413, + "learning_rate": 3.82671108495187e-06, + "loss": 2.6482, + "step": 21749 + }, + { + "epoch": 1.8537458450524162, + "grad_norm": 38.287939617486614, + "learning_rate": 3.826229087937303e-06, + "loss": 2.7487, + "step": 21750 + }, + { + "epoch": 1.8538310747464417, + "grad_norm": 62.87932748434193, + "learning_rate": 3.825747102466306e-06, + "loss": 2.318, + "step": 21751 + }, + { + "epoch": 1.853916304440467, + "grad_norm": 67.10652554397605, + "learning_rate": 3.825265128543621e-06, + "loss": 2.2302, + "step": 21752 + }, + { + "epoch": 1.8540015341344924, + "grad_norm": 103.49339514128147, + "learning_rate": 3.824783166173986e-06, + "loss": 4.0357, + "step": 21753 + }, + { + "epoch": 1.854086763828518, + "grad_norm": 44.1661755650736, + "learning_rate": 3.824301215362141e-06, + "loss": 3.0103, + "step": 21754 + }, + { + "epoch": 1.8541719935225434, + "grad_norm": 96.02769384277057, + "learning_rate": 3.823819276112825e-06, + "loss": 2.8336, + "step": 21755 + }, + { + "epoch": 1.8542572232165686, + "grad_norm": 48.15678830771028, + "learning_rate": 3.823337348430782e-06, + "loss": 1.5047, + "step": 21756 + }, + { + "epoch": 1.854342452910594, + "grad_norm": 59.55005914759234, + "learning_rate": 3.822855432320748e-06, + "loss": 2.7041, + "step": 21757 + }, + { + "epoch": 1.8544276826046193, + "grad_norm": 47.50076585582619, + "learning_rate": 3.822373527787462e-06, + "loss": 2.3893, + "step": 21758 + }, + { + "epoch": 1.8545129122986448, + "grad_norm": 34.39941551455166, + "learning_rate": 3.821891634835664e-06, + "loss": 1.6873, + "step": 21759 + }, + { + "epoch": 1.8545981419926703, + "grad_norm": 70.47608733723457, + "learning_rate": 3.821409753470095e-06, + "loss": 2.418, + "step": 21760 + }, + { + "epoch": 1.8546833716866957, + "grad_norm": 47.423705520904655, + "learning_rate": 3.8209278836954926e-06, + "loss": 1.7301, + "step": 21761 + }, + { + "epoch": 1.8547686013807212, + "grad_norm": 43.4286671390013, + "learning_rate": 3.8204460255165955e-06, + "loss": 2.9972, + "step": 21762 + }, + { + "epoch": 1.8548538310747464, + "grad_norm": 29.167788202236032, + "learning_rate": 3.819964178938142e-06, + "loss": 1.3641, + "step": 21763 + }, + { + "epoch": 1.8549390607687717, + "grad_norm": 78.46071832176551, + "learning_rate": 3.8194823439648735e-06, + "loss": 3.5951, + "step": 21764 + }, + { + "epoch": 1.8550242904627972, + "grad_norm": 68.51071907770172, + "learning_rate": 3.819000520601528e-06, + "loss": 3.2958, + "step": 21765 + }, + { + "epoch": 1.8551095201568226, + "grad_norm": 35.85480059204649, + "learning_rate": 3.818518708852841e-06, + "loss": 2.1374, + "step": 21766 + }, + { + "epoch": 1.855194749850848, + "grad_norm": 23.197991337913134, + "learning_rate": 3.818036908723554e-06, + "loss": 2.1309, + "step": 21767 + }, + { + "epoch": 1.8552799795448736, + "grad_norm": 29.28772626415028, + "learning_rate": 3.817555120218403e-06, + "loss": 1.963, + "step": 21768 + }, + { + "epoch": 1.8553652092388988, + "grad_norm": 24.513166958684728, + "learning_rate": 3.817073343342129e-06, + "loss": 1.7196, + "step": 21769 + }, + { + "epoch": 1.8554504389329243, + "grad_norm": 52.59778838134191, + "learning_rate": 3.8165915780994685e-06, + "loss": 2.0544, + "step": 21770 + }, + { + "epoch": 1.8555356686269495, + "grad_norm": 32.92901777741363, + "learning_rate": 3.81610982449516e-06, + "loss": 2.5163, + "step": 21771 + }, + { + "epoch": 1.855620898320975, + "grad_norm": 29.934144802386747, + "learning_rate": 3.815628082533939e-06, + "loss": 2.4145, + "step": 21772 + }, + { + "epoch": 1.8557061280150005, + "grad_norm": 34.33679847528866, + "learning_rate": 3.815146352220547e-06, + "loss": 2.8805, + "step": 21773 + }, + { + "epoch": 1.855791357709026, + "grad_norm": 38.03932067286348, + "learning_rate": 3.8146646335597185e-06, + "loss": 2.2477, + "step": 21774 + }, + { + "epoch": 1.8558765874030512, + "grad_norm": 67.79997704982082, + "learning_rate": 3.814182926556194e-06, + "loss": 2.7715, + "step": 21775 + }, + { + "epoch": 1.8559618170970766, + "grad_norm": 38.02275756241542, + "learning_rate": 3.813701231214707e-06, + "loss": 2.9156, + "step": 21776 + }, + { + "epoch": 1.8560470467911019, + "grad_norm": 40.74488303823443, + "learning_rate": 3.8132195475399992e-06, + "loss": 2.7936, + "step": 21777 + }, + { + "epoch": 1.8561322764851274, + "grad_norm": 74.31513167884835, + "learning_rate": 3.812737875536804e-06, + "loss": 3.419, + "step": 21778 + }, + { + "epoch": 1.8562175061791528, + "grad_norm": 35.11990178755547, + "learning_rate": 3.8122562152098613e-06, + "loss": 3.5507, + "step": 21779 + }, + { + "epoch": 1.8563027358731783, + "grad_norm": 85.39108387196761, + "learning_rate": 3.811774566563907e-06, + "loss": 3.8856, + "step": 21780 + }, + { + "epoch": 1.8563879655672038, + "grad_norm": 55.83438669364044, + "learning_rate": 3.8112929296036756e-06, + "loss": 2.3455, + "step": 21781 + }, + { + "epoch": 1.856473195261229, + "grad_norm": 36.96412112346453, + "learning_rate": 3.8108113043339067e-06, + "loss": 2.4667, + "step": 21782 + }, + { + "epoch": 1.8565584249552542, + "grad_norm": 60.90362201959482, + "learning_rate": 3.810329690759337e-06, + "loss": 3.2139, + "step": 21783 + }, + { + "epoch": 1.8566436546492797, + "grad_norm": 61.525514407732345, + "learning_rate": 3.809848088884702e-06, + "loss": 3.0109, + "step": 21784 + }, + { + "epoch": 1.8567288843433052, + "grad_norm": 75.0783645594634, + "learning_rate": 3.8093664987147367e-06, + "loss": 2.7044, + "step": 21785 + }, + { + "epoch": 1.8568141140373307, + "grad_norm": 31.352969481153995, + "learning_rate": 3.8088849202541793e-06, + "loss": 2.1891, + "step": 21786 + }, + { + "epoch": 1.8568993437313561, + "grad_norm": 61.99093803964031, + "learning_rate": 3.808403353507767e-06, + "loss": 1.7283, + "step": 21787 + }, + { + "epoch": 1.8569845734253814, + "grad_norm": 21.79209678215675, + "learning_rate": 3.8079217984802335e-06, + "loss": 1.6324, + "step": 21788 + }, + { + "epoch": 1.8570698031194068, + "grad_norm": 33.10250470039117, + "learning_rate": 3.807440255176314e-06, + "loss": 2.5494, + "step": 21789 + }, + { + "epoch": 1.857155032813432, + "grad_norm": 43.23956324481254, + "learning_rate": 3.806958723600746e-06, + "loss": 2.8994, + "step": 21790 + }, + { + "epoch": 1.8572402625074576, + "grad_norm": 46.97811599190599, + "learning_rate": 3.8064772037582663e-06, + "loss": 3.4262, + "step": 21791 + }, + { + "epoch": 1.857325492201483, + "grad_norm": 48.6475375759782, + "learning_rate": 3.8059956956536094e-06, + "loss": 3.3511, + "step": 21792 + }, + { + "epoch": 1.8574107218955085, + "grad_norm": 38.83398833763511, + "learning_rate": 3.805514199291508e-06, + "loss": 3.0952, + "step": 21793 + }, + { + "epoch": 1.857495951589534, + "grad_norm": 29.91461424860675, + "learning_rate": 3.805032714676701e-06, + "loss": 1.7017, + "step": 21794 + }, + { + "epoch": 1.8575811812835592, + "grad_norm": 75.89747235206951, + "learning_rate": 3.804551241813923e-06, + "loss": 2.5256, + "step": 21795 + }, + { + "epoch": 1.8576664109775844, + "grad_norm": 29.11950308641793, + "learning_rate": 3.8040697807079084e-06, + "loss": 3.0032, + "step": 21796 + }, + { + "epoch": 1.85775164067161, + "grad_norm": 21.67353424873368, + "learning_rate": 3.803588331363392e-06, + "loss": 2.3357, + "step": 21797 + }, + { + "epoch": 1.8578368703656354, + "grad_norm": 73.23490937716676, + "learning_rate": 3.803106893785108e-06, + "loss": 2.9722, + "step": 21798 + }, + { + "epoch": 1.8579221000596609, + "grad_norm": 50.423390743571204, + "learning_rate": 3.8026254679777936e-06, + "loss": 3.3279, + "step": 21799 + }, + { + "epoch": 1.8580073297536863, + "grad_norm": 37.8530812918911, + "learning_rate": 3.802144053946182e-06, + "loss": 2.9683, + "step": 21800 + }, + { + "epoch": 1.8580925594477116, + "grad_norm": 32.970320043524545, + "learning_rate": 3.8016626516950067e-06, + "loss": 2.2106, + "step": 21801 + }, + { + "epoch": 1.858177789141737, + "grad_norm": 72.70668024446874, + "learning_rate": 3.8011812612290015e-06, + "loss": 2.5112, + "step": 21802 + }, + { + "epoch": 1.8582630188357623, + "grad_norm": 85.68432485815875, + "learning_rate": 3.800699882552905e-06, + "loss": 4.6595, + "step": 21803 + }, + { + "epoch": 1.8583482485297877, + "grad_norm": 39.31660029963963, + "learning_rate": 3.800218515671449e-06, + "loss": 2.4277, + "step": 21804 + }, + { + "epoch": 1.8584334782238132, + "grad_norm": 32.29877615794025, + "learning_rate": 3.799737160589365e-06, + "loss": 2.7618, + "step": 21805 + }, + { + "epoch": 1.8585187079178387, + "grad_norm": 41.69756415460743, + "learning_rate": 3.7992558173113887e-06, + "loss": 3.4476, + "step": 21806 + }, + { + "epoch": 1.858603937611864, + "grad_norm": 80.3057734939642, + "learning_rate": 3.798774485842256e-06, + "loss": 4.877, + "step": 21807 + }, + { + "epoch": 1.8586891673058894, + "grad_norm": 40.23782898415105, + "learning_rate": 3.7982931661866997e-06, + "loss": 2.7667, + "step": 21808 + }, + { + "epoch": 1.8587743969999146, + "grad_norm": 59.740291398073715, + "learning_rate": 3.7978118583494505e-06, + "loss": 2.3824, + "step": 21809 + }, + { + "epoch": 1.8588596266939401, + "grad_norm": 32.282177345738596, + "learning_rate": 3.7973305623352456e-06, + "loss": 2.569, + "step": 21810 + }, + { + "epoch": 1.8589448563879656, + "grad_norm": 72.92330685865139, + "learning_rate": 3.796849278148815e-06, + "loss": 2.9198, + "step": 21811 + }, + { + "epoch": 1.859030086081991, + "grad_norm": 51.604572374022666, + "learning_rate": 3.7963680057948948e-06, + "loss": 3.1484, + "step": 21812 + }, + { + "epoch": 1.8591153157760165, + "grad_norm": 42.36503353283798, + "learning_rate": 3.7958867452782165e-06, + "loss": 3.1472, + "step": 21813 + }, + { + "epoch": 1.8592005454700418, + "grad_norm": 39.14927142193723, + "learning_rate": 3.795405496603515e-06, + "loss": 2.2857, + "step": 21814 + }, + { + "epoch": 1.859285775164067, + "grad_norm": 44.64834292056042, + "learning_rate": 3.794924259775519e-06, + "loss": 2.5469, + "step": 21815 + }, + { + "epoch": 1.8593710048580925, + "grad_norm": 39.88048002364592, + "learning_rate": 3.794443034798967e-06, + "loss": 2.5273, + "step": 21816 + }, + { + "epoch": 1.859456234552118, + "grad_norm": 32.72080611293315, + "learning_rate": 3.793961821678587e-06, + "loss": 2.9721, + "step": 21817 + }, + { + "epoch": 1.8595414642461434, + "grad_norm": 63.02868284576643, + "learning_rate": 3.793480620419114e-06, + "loss": 2.6557, + "step": 21818 + }, + { + "epoch": 1.8596266939401689, + "grad_norm": 53.95269733859648, + "learning_rate": 3.792999431025278e-06, + "loss": 3.5021, + "step": 21819 + }, + { + "epoch": 1.8597119236341941, + "grad_norm": 47.005130040474235, + "learning_rate": 3.792518253501815e-06, + "loss": 2.9888, + "step": 21820 + }, + { + "epoch": 1.8597971533282196, + "grad_norm": 57.761048629519976, + "learning_rate": 3.792037087853454e-06, + "loss": 3.6855, + "step": 21821 + }, + { + "epoch": 1.8598823830222448, + "grad_norm": 57.42562169736268, + "learning_rate": 3.79155593408493e-06, + "loss": 2.8101, + "step": 21822 + }, + { + "epoch": 1.8599676127162703, + "grad_norm": 48.438790786705994, + "learning_rate": 3.7910747922009705e-06, + "loss": 2.4946, + "step": 21823 + }, + { + "epoch": 1.8600528424102958, + "grad_norm": 33.73310876625156, + "learning_rate": 3.790593662206312e-06, + "loss": 2.6455, + "step": 21824 + }, + { + "epoch": 1.8601380721043212, + "grad_norm": 22.732562666476234, + "learning_rate": 3.7901125441056836e-06, + "loss": 1.6176, + "step": 21825 + }, + { + "epoch": 1.8602233017983465, + "grad_norm": 82.3527509915895, + "learning_rate": 3.7896314379038186e-06, + "loss": 2.7796, + "step": 21826 + }, + { + "epoch": 1.860308531492372, + "grad_norm": 73.4966861195762, + "learning_rate": 3.7891503436054473e-06, + "loss": 3.2546, + "step": 21827 + }, + { + "epoch": 1.8603937611863972, + "grad_norm": 51.816301669231244, + "learning_rate": 3.788669261215299e-06, + "loss": 3.9864, + "step": 21828 + }, + { + "epoch": 1.8604789908804227, + "grad_norm": 34.168342031593305, + "learning_rate": 3.7881881907381085e-06, + "loss": 2.194, + "step": 21829 + }, + { + "epoch": 1.8605642205744481, + "grad_norm": 35.85623261790039, + "learning_rate": 3.7877071321786064e-06, + "loss": 1.7637, + "step": 21830 + }, + { + "epoch": 1.8606494502684736, + "grad_norm": 72.50856835556891, + "learning_rate": 3.7872260855415234e-06, + "loss": 3.2766, + "step": 21831 + }, + { + "epoch": 1.860734679962499, + "grad_norm": 95.3901885513496, + "learning_rate": 3.786745050831587e-06, + "loss": 4.2398, + "step": 21832 + }, + { + "epoch": 1.8608199096565243, + "grad_norm": 38.34394008756577, + "learning_rate": 3.7862640280535325e-06, + "loss": 2.9283, + "step": 21833 + }, + { + "epoch": 1.8609051393505496, + "grad_norm": 65.20530015599488, + "learning_rate": 3.7857830172120902e-06, + "loss": 3.2917, + "step": 21834 + }, + { + "epoch": 1.860990369044575, + "grad_norm": 61.515946828736354, + "learning_rate": 3.7853020183119892e-06, + "loss": 2.4928, + "step": 21835 + }, + { + "epoch": 1.8610755987386005, + "grad_norm": 28.927975955021672, + "learning_rate": 3.784821031357958e-06, + "loss": 2.4758, + "step": 21836 + }, + { + "epoch": 1.861160828432626, + "grad_norm": 69.70777113805015, + "learning_rate": 3.7843400563547306e-06, + "loss": 2.2815, + "step": 21837 + }, + { + "epoch": 1.8612460581266514, + "grad_norm": 72.83923264548068, + "learning_rate": 3.7838590933070364e-06, + "loss": 2.3555, + "step": 21838 + }, + { + "epoch": 1.8613312878206767, + "grad_norm": 45.23045112185161, + "learning_rate": 3.783378142219605e-06, + "loss": 3.1673, + "step": 21839 + }, + { + "epoch": 1.8614165175147022, + "grad_norm": 133.5733808045182, + "learning_rate": 3.7828972030971644e-06, + "loss": 3.3174, + "step": 21840 + }, + { + "epoch": 1.8615017472087274, + "grad_norm": 36.23289343227859, + "learning_rate": 3.7824162759444462e-06, + "loss": 2.6153, + "step": 21841 + }, + { + "epoch": 1.8615869769027529, + "grad_norm": 35.64547537097654, + "learning_rate": 3.7819353607661812e-06, + "loss": 2.5898, + "step": 21842 + }, + { + "epoch": 1.8616722065967783, + "grad_norm": 51.39174828457098, + "learning_rate": 3.7814544575670987e-06, + "loss": 3.0602, + "step": 21843 + }, + { + "epoch": 1.8617574362908038, + "grad_norm": 49.55195895880219, + "learning_rate": 3.780973566351925e-06, + "loss": 2.0848, + "step": 21844 + }, + { + "epoch": 1.861842665984829, + "grad_norm": 51.805528378376465, + "learning_rate": 3.780492687125392e-06, + "loss": 2.4318, + "step": 21845 + }, + { + "epoch": 1.8619278956788545, + "grad_norm": 33.45083741485544, + "learning_rate": 3.780011819892231e-06, + "loss": 2.5128, + "step": 21846 + }, + { + "epoch": 1.8620131253728798, + "grad_norm": 68.82943082279522, + "learning_rate": 3.779530964657169e-06, + "loss": 2.7173, + "step": 21847 + }, + { + "epoch": 1.8620983550669052, + "grad_norm": 25.521780796209057, + "learning_rate": 3.7790501214249337e-06, + "loss": 2.4923, + "step": 21848 + }, + { + "epoch": 1.8621835847609307, + "grad_norm": 88.00644039469408, + "learning_rate": 3.778569290200254e-06, + "loss": 3.6019, + "step": 21849 + }, + { + "epoch": 1.8622688144549562, + "grad_norm": 47.56117661782687, + "learning_rate": 3.7780884709878625e-06, + "loss": 3.2258, + "step": 21850 + }, + { + "epoch": 1.8623540441489816, + "grad_norm": 70.9788766191169, + "learning_rate": 3.7776076637924853e-06, + "loss": 2.7714, + "step": 21851 + }, + { + "epoch": 1.862439273843007, + "grad_norm": 33.54830396307124, + "learning_rate": 3.7771268686188493e-06, + "loss": 2.5966, + "step": 21852 + }, + { + "epoch": 1.8625245035370321, + "grad_norm": 49.381466800132394, + "learning_rate": 3.776646085471684e-06, + "loss": 2.6175, + "step": 21853 + }, + { + "epoch": 1.8626097332310576, + "grad_norm": 38.333507814212375, + "learning_rate": 3.776165314355721e-06, + "loss": 2.7171, + "step": 21854 + }, + { + "epoch": 1.862694962925083, + "grad_norm": 36.695434038772866, + "learning_rate": 3.7756845552756848e-06, + "loss": 2.9543, + "step": 21855 + }, + { + "epoch": 1.8627801926191085, + "grad_norm": 73.94286810578943, + "learning_rate": 3.775203808236304e-06, + "loss": 1.949, + "step": 21856 + }, + { + "epoch": 1.862865422313134, + "grad_norm": 46.43060248089828, + "learning_rate": 3.7747230732423084e-06, + "loss": 3.1844, + "step": 21857 + }, + { + "epoch": 1.8629506520071593, + "grad_norm": 41.77238626798471, + "learning_rate": 3.7742423502984223e-06, + "loss": 2.8839, + "step": 21858 + }, + { + "epoch": 1.8630358817011847, + "grad_norm": 67.15080588869863, + "learning_rate": 3.7737616394093774e-06, + "loss": 2.9005, + "step": 21859 + }, + { + "epoch": 1.86312111139521, + "grad_norm": 33.99992701860939, + "learning_rate": 3.7732809405798985e-06, + "loss": 2.4758, + "step": 21860 + }, + { + "epoch": 1.8632063410892354, + "grad_norm": 43.073463281210216, + "learning_rate": 3.7728002538147153e-06, + "loss": 2.314, + "step": 21861 + }, + { + "epoch": 1.863291570783261, + "grad_norm": 39.331406068995, + "learning_rate": 3.772319579118552e-06, + "loss": 1.8743, + "step": 21862 + }, + { + "epoch": 1.8633768004772864, + "grad_norm": 29.866631233572733, + "learning_rate": 3.7718389164961396e-06, + "loss": 1.7087, + "step": 21863 + }, + { + "epoch": 1.8634620301713118, + "grad_norm": 52.93261110146164, + "learning_rate": 3.7713582659522024e-06, + "loss": 2.8239, + "step": 21864 + }, + { + "epoch": 1.863547259865337, + "grad_norm": 44.79012383674154, + "learning_rate": 3.7708776274914693e-06, + "loss": 4.0854, + "step": 21865 + }, + { + "epoch": 1.8636324895593623, + "grad_norm": 84.68226534042915, + "learning_rate": 3.7703970011186646e-06, + "loss": 2.4168, + "step": 21866 + }, + { + "epoch": 1.8637177192533878, + "grad_norm": 34.731774517331, + "learning_rate": 3.769916386838519e-06, + "loss": 3.0425, + "step": 21867 + }, + { + "epoch": 1.8638029489474133, + "grad_norm": 38.173056817564955, + "learning_rate": 3.7694357846557556e-06, + "loss": 1.3408, + "step": 21868 + }, + { + "epoch": 1.8638881786414387, + "grad_norm": 91.49385048946883, + "learning_rate": 3.7689551945751033e-06, + "loss": 2.9674, + "step": 21869 + }, + { + "epoch": 1.8639734083354642, + "grad_norm": 49.946159206801546, + "learning_rate": 3.7684746166012873e-06, + "loss": 3.2549, + "step": 21870 + }, + { + "epoch": 1.8640586380294895, + "grad_norm": 39.67125953088159, + "learning_rate": 3.767994050739033e-06, + "loss": 2.6543, + "step": 21871 + }, + { + "epoch": 1.864143867723515, + "grad_norm": 44.83622616093649, + "learning_rate": 3.7675134969930675e-06, + "loss": 2.9336, + "step": 21872 + }, + { + "epoch": 1.8642290974175402, + "grad_norm": 54.43627707896443, + "learning_rate": 3.767032955368119e-06, + "loss": 2.8299, + "step": 21873 + }, + { + "epoch": 1.8643143271115656, + "grad_norm": 68.36006276643722, + "learning_rate": 3.7665524258689107e-06, + "loss": 2.8408, + "step": 21874 + }, + { + "epoch": 1.864399556805591, + "grad_norm": 13.815286868574583, + "learning_rate": 3.7660719085001674e-06, + "loss": 0.7132, + "step": 21875 + }, + { + "epoch": 1.8644847864996166, + "grad_norm": 58.44098587320691, + "learning_rate": 3.7655914032666174e-06, + "loss": 2.4409, + "step": 21876 + }, + { + "epoch": 1.8645700161936418, + "grad_norm": 80.8285041660538, + "learning_rate": 3.765110910172987e-06, + "loss": 3.5316, + "step": 21877 + }, + { + "epoch": 1.8646552458876673, + "grad_norm": 36.45528740769089, + "learning_rate": 3.764630429224e-06, + "loss": 2.9471, + "step": 21878 + }, + { + "epoch": 1.8647404755816925, + "grad_norm": 34.383049690038234, + "learning_rate": 3.7641499604243792e-06, + "loss": 2.7458, + "step": 21879 + }, + { + "epoch": 1.864825705275718, + "grad_norm": 37.45585845174462, + "learning_rate": 3.763669503778854e-06, + "loss": 3.082, + "step": 21880 + }, + { + "epoch": 1.8649109349697435, + "grad_norm": 129.23543375455327, + "learning_rate": 3.763189059292149e-06, + "loss": 2.1036, + "step": 21881 + }, + { + "epoch": 1.864996164663769, + "grad_norm": 54.73565239044298, + "learning_rate": 3.7627086269689885e-06, + "loss": 2.4043, + "step": 21882 + }, + { + "epoch": 1.8650813943577944, + "grad_norm": 67.23476968450477, + "learning_rate": 3.7622282068140946e-06, + "loss": 3.3811, + "step": 21883 + }, + { + "epoch": 1.8651666240518197, + "grad_norm": 74.98761765797862, + "learning_rate": 3.761747798832195e-06, + "loss": 3.8061, + "step": 21884 + }, + { + "epoch": 1.865251853745845, + "grad_norm": 55.182873251136236, + "learning_rate": 3.761267403028015e-06, + "loss": 2.9664, + "step": 21885 + }, + { + "epoch": 1.8653370834398704, + "grad_norm": 69.23946899369975, + "learning_rate": 3.7607870194062777e-06, + "loss": 2.7123, + "step": 21886 + }, + { + "epoch": 1.8654223131338958, + "grad_norm": 24.316305176944297, + "learning_rate": 3.760306647971707e-06, + "loss": 2.228, + "step": 21887 + }, + { + "epoch": 1.8655075428279213, + "grad_norm": 36.338330582507915, + "learning_rate": 3.7598262887290265e-06, + "loss": 2.4883, + "step": 21888 + }, + { + "epoch": 1.8655927725219468, + "grad_norm": 80.97211168856941, + "learning_rate": 3.7593459416829638e-06, + "loss": 3.2409, + "step": 21889 + }, + { + "epoch": 1.865678002215972, + "grad_norm": 27.963475705327184, + "learning_rate": 3.7588656068382413e-06, + "loss": 2.1346, + "step": 21890 + }, + { + "epoch": 1.8657632319099975, + "grad_norm": 50.89157785091706, + "learning_rate": 3.7583852841995803e-06, + "loss": 0.8427, + "step": 21891 + }, + { + "epoch": 1.8658484616040227, + "grad_norm": 53.2778716417073, + "learning_rate": 3.7579049737717065e-06, + "loss": 3.2257, + "step": 21892 + }, + { + "epoch": 1.8659336912980482, + "grad_norm": 61.61316553134985, + "learning_rate": 3.757424675559346e-06, + "loss": 2.453, + "step": 21893 + }, + { + "epoch": 1.8660189209920737, + "grad_norm": 55.91947280566233, + "learning_rate": 3.7569443895672194e-06, + "loss": 3.2539, + "step": 21894 + }, + { + "epoch": 1.8661041506860991, + "grad_norm": 56.61829644543324, + "learning_rate": 3.7564641158000496e-06, + "loss": 2.2923, + "step": 21895 + }, + { + "epoch": 1.8661893803801244, + "grad_norm": 69.86289823938776, + "learning_rate": 3.7559838542625605e-06, + "loss": 2.186, + "step": 21896 + }, + { + "epoch": 1.8662746100741499, + "grad_norm": 49.785270010567935, + "learning_rate": 3.7555036049594777e-06, + "loss": 3.4537, + "step": 21897 + }, + { + "epoch": 1.866359839768175, + "grad_norm": 62.2133412479141, + "learning_rate": 3.755023367895523e-06, + "loss": 2.4863, + "step": 21898 + }, + { + "epoch": 1.8664450694622006, + "grad_norm": 40.25507043961357, + "learning_rate": 3.754543143075417e-06, + "loss": 2.5686, + "step": 21899 + }, + { + "epoch": 1.866530299156226, + "grad_norm": 60.459744639019945, + "learning_rate": 3.7540629305038857e-06, + "loss": 2.8444, + "step": 21900 + }, + { + "epoch": 1.8666155288502515, + "grad_norm": 43.01237476616969, + "learning_rate": 3.753582730185648e-06, + "loss": 3.2072, + "step": 21901 + }, + { + "epoch": 1.866700758544277, + "grad_norm": 64.86306318419113, + "learning_rate": 3.7531025421254314e-06, + "loss": 2.4722, + "step": 21902 + }, + { + "epoch": 1.8667859882383022, + "grad_norm": 74.85853733713729, + "learning_rate": 3.752622366327955e-06, + "loss": 2.2303, + "step": 21903 + }, + { + "epoch": 1.8668712179323275, + "grad_norm": 78.00180299881802, + "learning_rate": 3.752142202797942e-06, + "loss": 1.9356, + "step": 21904 + }, + { + "epoch": 1.866956447626353, + "grad_norm": 66.41726313367857, + "learning_rate": 3.7516620515401136e-06, + "loss": 2.7673, + "step": 21905 + }, + { + "epoch": 1.8670416773203784, + "grad_norm": 36.36600110413939, + "learning_rate": 3.751181912559194e-06, + "loss": 1.4656, + "step": 21906 + }, + { + "epoch": 1.8671269070144039, + "grad_norm": 60.32509186182073, + "learning_rate": 3.7507017858599036e-06, + "loss": 2.9015, + "step": 21907 + }, + { + "epoch": 1.8672121367084293, + "grad_norm": 31.28546164519463, + "learning_rate": 3.7502216714469658e-06, + "loss": 1.7678, + "step": 21908 + }, + { + "epoch": 1.8672973664024546, + "grad_norm": 61.46368138477931, + "learning_rate": 3.7497415693250986e-06, + "loss": 3.7236, + "step": 21909 + }, + { + "epoch": 1.86738259609648, + "grad_norm": 60.669812100795276, + "learning_rate": 3.749261479499029e-06, + "loss": 3.3344, + "step": 21910 + }, + { + "epoch": 1.8674678257905053, + "grad_norm": 42.81989703306513, + "learning_rate": 3.748781401973475e-06, + "loss": 1.6872, + "step": 21911 + }, + { + "epoch": 1.8675530554845308, + "grad_norm": 46.524100382877364, + "learning_rate": 3.7483013367531583e-06, + "loss": 3.0037, + "step": 21912 + }, + { + "epoch": 1.8676382851785562, + "grad_norm": 48.87178954831803, + "learning_rate": 3.7478212838428018e-06, + "loss": 2.2695, + "step": 21913 + }, + { + "epoch": 1.8677235148725817, + "grad_norm": 51.46621485618347, + "learning_rate": 3.7473412432471235e-06, + "loss": 2.9667, + "step": 21914 + }, + { + "epoch": 1.8678087445666072, + "grad_norm": 29.783882865978203, + "learning_rate": 3.7468612149708465e-06, + "loss": 2.3311, + "step": 21915 + }, + { + "epoch": 1.8678939742606324, + "grad_norm": 69.07338151129264, + "learning_rate": 3.7463811990186927e-06, + "loss": 3.1323, + "step": 21916 + }, + { + "epoch": 1.8679792039546577, + "grad_norm": 52.2129519920282, + "learning_rate": 3.745901195395382e-06, + "loss": 3.1905, + "step": 21917 + }, + { + "epoch": 1.8680644336486831, + "grad_norm": 33.89440426736257, + "learning_rate": 3.7454212041056326e-06, + "loss": 2.6393, + "step": 21918 + }, + { + "epoch": 1.8681496633427086, + "grad_norm": 15.52235309689082, + "learning_rate": 3.7449412251541683e-06, + "loss": 0.7732, + "step": 21919 + }, + { + "epoch": 1.868234893036734, + "grad_norm": 48.991362061094456, + "learning_rate": 3.744461258545709e-06, + "loss": 2.7574, + "step": 21920 + }, + { + "epoch": 1.8683201227307595, + "grad_norm": 58.80133170213099, + "learning_rate": 3.743981304284975e-06, + "loss": 2.3791, + "step": 21921 + }, + { + "epoch": 1.8684053524247848, + "grad_norm": 79.7388534556436, + "learning_rate": 3.7435013623766826e-06, + "loss": 1.9732, + "step": 21922 + }, + { + "epoch": 1.86849058211881, + "grad_norm": 31.024429101965328, + "learning_rate": 3.7430214328255567e-06, + "loss": 1.8723, + "step": 21923 + }, + { + "epoch": 1.8685758118128355, + "grad_norm": 46.11092744082535, + "learning_rate": 3.7425415156363167e-06, + "loss": 2.9788, + "step": 21924 + }, + { + "epoch": 1.868661041506861, + "grad_norm": 63.79569038394213, + "learning_rate": 3.742061610813681e-06, + "loss": 2.949, + "step": 21925 + }, + { + "epoch": 1.8687462712008864, + "grad_norm": 41.09991512217347, + "learning_rate": 3.7415817183623677e-06, + "loss": 2.4572, + "step": 21926 + }, + { + "epoch": 1.868831500894912, + "grad_norm": 45.82877688539451, + "learning_rate": 3.741101838287099e-06, + "loss": 2.256, + "step": 21927 + }, + { + "epoch": 1.8689167305889371, + "grad_norm": 56.653699773522455, + "learning_rate": 3.7406219705925945e-06, + "loss": 2.4726, + "step": 21928 + }, + { + "epoch": 1.8690019602829626, + "grad_norm": 64.17011288463425, + "learning_rate": 3.7401421152835726e-06, + "loss": 3.4912, + "step": 21929 + }, + { + "epoch": 1.8690871899769879, + "grad_norm": 70.15472233520862, + "learning_rate": 3.7396622723647513e-06, + "loss": 2.6422, + "step": 21930 + }, + { + "epoch": 1.8691724196710133, + "grad_norm": 28.168630367653297, + "learning_rate": 3.7391824418408495e-06, + "loss": 2.6023, + "step": 21931 + }, + { + "epoch": 1.8692576493650388, + "grad_norm": 41.290625687408564, + "learning_rate": 3.738702623716589e-06, + "loss": 2.817, + "step": 21932 + }, + { + "epoch": 1.8693428790590643, + "grad_norm": 37.69453776952559, + "learning_rate": 3.7382228179966886e-06, + "loss": 4.0126, + "step": 21933 + }, + { + "epoch": 1.8694281087530897, + "grad_norm": 53.65767341823947, + "learning_rate": 3.7377430246858625e-06, + "loss": 3.1371, + "step": 21934 + }, + { + "epoch": 1.869513338447115, + "grad_norm": 33.17355588003561, + "learning_rate": 3.7372632437888324e-06, + "loss": 2.532, + "step": 21935 + }, + { + "epoch": 1.8695985681411402, + "grad_norm": 40.99096986297619, + "learning_rate": 3.7367834753103184e-06, + "loss": 2.5306, + "step": 21936 + }, + { + "epoch": 1.8696837978351657, + "grad_norm": 48.55280054040144, + "learning_rate": 3.7363037192550367e-06, + "loss": 3.0306, + "step": 21937 + }, + { + "epoch": 1.8697690275291912, + "grad_norm": 43.03856454534221, + "learning_rate": 3.735823975627705e-06, + "loss": 2.4209, + "step": 21938 + }, + { + "epoch": 1.8698542572232166, + "grad_norm": 26.337284664861915, + "learning_rate": 3.7353442444330405e-06, + "loss": 2.5865, + "step": 21939 + }, + { + "epoch": 1.869939486917242, + "grad_norm": 48.27034972836524, + "learning_rate": 3.7348645256757654e-06, + "loss": 3.4295, + "step": 21940 + }, + { + "epoch": 1.8700247166112673, + "grad_norm": 108.4007966517495, + "learning_rate": 3.734384819360595e-06, + "loss": 4.3752, + "step": 21941 + }, + { + "epoch": 1.8701099463052928, + "grad_norm": 70.34279765075947, + "learning_rate": 3.733905125492246e-06, + "loss": 1.7898, + "step": 21942 + }, + { + "epoch": 1.870195175999318, + "grad_norm": 26.296972840002052, + "learning_rate": 3.733425444075438e-06, + "loss": 2.1341, + "step": 21943 + }, + { + "epoch": 1.8702804056933435, + "grad_norm": 37.99013621074078, + "learning_rate": 3.7329457751148856e-06, + "loss": 2.8947, + "step": 21944 + }, + { + "epoch": 1.870365635387369, + "grad_norm": 57.90274738695744, + "learning_rate": 3.7324661186153093e-06, + "loss": 3.658, + "step": 21945 + }, + { + "epoch": 1.8704508650813945, + "grad_norm": 55.333872624542906, + "learning_rate": 3.731986474581425e-06, + "loss": 2.3489, + "step": 21946 + }, + { + "epoch": 1.8705360947754197, + "grad_norm": 69.63243989796118, + "learning_rate": 3.73150684301795e-06, + "loss": 3.0397, + "step": 21947 + }, + { + "epoch": 1.8706213244694452, + "grad_norm": 53.0242679493293, + "learning_rate": 3.7310272239295997e-06, + "loss": 3.1214, + "step": 21948 + }, + { + "epoch": 1.8707065541634704, + "grad_norm": 35.17171924825033, + "learning_rate": 3.730547617321094e-06, + "loss": 2.6596, + "step": 21949 + }, + { + "epoch": 1.870791783857496, + "grad_norm": 55.4462321933717, + "learning_rate": 3.7300680231971476e-06, + "loss": 2.6322, + "step": 21950 + }, + { + "epoch": 1.8708770135515214, + "grad_norm": 60.22605069348592, + "learning_rate": 3.7295884415624783e-06, + "loss": 3.1955, + "step": 21951 + }, + { + "epoch": 1.8709622432455468, + "grad_norm": 71.23050160381969, + "learning_rate": 3.7291088724218e-06, + "loss": 3.1403, + "step": 21952 + }, + { + "epoch": 1.8710474729395723, + "grad_norm": 28.158338851372495, + "learning_rate": 3.7286293157798326e-06, + "loss": 2.23, + "step": 21953 + }, + { + "epoch": 1.8711327026335975, + "grad_norm": 98.78956402933501, + "learning_rate": 3.7281497716412897e-06, + "loss": 3.8884, + "step": 21954 + }, + { + "epoch": 1.8712179323276228, + "grad_norm": 37.511236410196155, + "learning_rate": 3.72767024001089e-06, + "loss": 2.6896, + "step": 21955 + }, + { + "epoch": 1.8713031620216483, + "grad_norm": 69.94952590999699, + "learning_rate": 3.7271907208933477e-06, + "loss": 3.066, + "step": 21956 + }, + { + "epoch": 1.8713883917156737, + "grad_norm": 24.850710957872327, + "learning_rate": 3.7267112142933763e-06, + "loss": 2.4356, + "step": 21957 + }, + { + "epoch": 1.8714736214096992, + "grad_norm": 60.86496835394286, + "learning_rate": 3.7262317202156955e-06, + "loss": 3.8049, + "step": 21958 + }, + { + "epoch": 1.8715588511037247, + "grad_norm": 41.91061176688662, + "learning_rate": 3.725752238665021e-06, + "loss": 3.0829, + "step": 21959 + }, + { + "epoch": 1.87164408079775, + "grad_norm": 51.21568458550052, + "learning_rate": 3.7252727696460667e-06, + "loss": 3.1492, + "step": 21960 + }, + { + "epoch": 1.8717293104917754, + "grad_norm": 70.48920392157929, + "learning_rate": 3.724793313163546e-06, + "loss": 3.0634, + "step": 21961 + }, + { + "epoch": 1.8718145401858006, + "grad_norm": 34.46897290410468, + "learning_rate": 3.724313869222177e-06, + "loss": 2.0076, + "step": 21962 + }, + { + "epoch": 1.871899769879826, + "grad_norm": 102.06100645801946, + "learning_rate": 3.7238344378266755e-06, + "loss": 3.3589, + "step": 21963 + }, + { + "epoch": 1.8719849995738516, + "grad_norm": 54.593011343248975, + "learning_rate": 3.723355018981756e-06, + "loss": 2.5965, + "step": 21964 + }, + { + "epoch": 1.872070229267877, + "grad_norm": 35.33159607890922, + "learning_rate": 3.7228756126921297e-06, + "loss": 2.4945, + "step": 21965 + }, + { + "epoch": 1.8721554589619023, + "grad_norm": 68.61996488764795, + "learning_rate": 3.722396218962515e-06, + "loss": 3.675, + "step": 21966 + }, + { + "epoch": 1.8722406886559277, + "grad_norm": 39.70079628867359, + "learning_rate": 3.721916837797627e-06, + "loss": 2.3053, + "step": 21967 + }, + { + "epoch": 1.872325918349953, + "grad_norm": 21.71965999191106, + "learning_rate": 3.7214374692021795e-06, + "loss": 1.4542, + "step": 21968 + }, + { + "epoch": 1.8724111480439785, + "grad_norm": 53.09092132055122, + "learning_rate": 3.7209581131808843e-06, + "loss": 2.84, + "step": 21969 + }, + { + "epoch": 1.872496377738004, + "grad_norm": 113.88367347554757, + "learning_rate": 3.7204787697384587e-06, + "loss": 3.7982, + "step": 21970 + }, + { + "epoch": 1.8725816074320294, + "grad_norm": 49.88086375267335, + "learning_rate": 3.719999438879617e-06, + "loss": 2.7725, + "step": 21971 + }, + { + "epoch": 1.8726668371260549, + "grad_norm": 63.546444545816854, + "learning_rate": 3.7195201206090724e-06, + "loss": 1.6437, + "step": 21972 + }, + { + "epoch": 1.87275206682008, + "grad_norm": 26.5287055947176, + "learning_rate": 3.719040814931537e-06, + "loss": 2.5743, + "step": 21973 + }, + { + "epoch": 1.8728372965141054, + "grad_norm": 35.1062278398255, + "learning_rate": 3.718561521851726e-06, + "loss": 1.7258, + "step": 21974 + }, + { + "epoch": 1.8729225262081308, + "grad_norm": 47.55596648771251, + "learning_rate": 3.7180822413743546e-06, + "loss": 2.2302, + "step": 21975 + }, + { + "epoch": 1.8730077559021563, + "grad_norm": 79.75422704077326, + "learning_rate": 3.7176029735041352e-06, + "loss": 2.7747, + "step": 21976 + }, + { + "epoch": 1.8730929855961818, + "grad_norm": 93.46300933869169, + "learning_rate": 3.71712371824578e-06, + "loss": 3.6385, + "step": 21977 + }, + { + "epoch": 1.8731782152902072, + "grad_norm": 27.51797553381738, + "learning_rate": 3.716644475604002e-06, + "loss": 2.5462, + "step": 21978 + }, + { + "epoch": 1.8732634449842325, + "grad_norm": 40.32035029762042, + "learning_rate": 3.7161652455835186e-06, + "loss": 2.5781, + "step": 21979 + }, + { + "epoch": 1.873348674678258, + "grad_norm": 72.93001970695202, + "learning_rate": 3.715686028189039e-06, + "loss": 2.7708, + "step": 21980 + }, + { + "epoch": 1.8734339043722832, + "grad_norm": 110.59814015534228, + "learning_rate": 3.7152068234252757e-06, + "loss": 2.9053, + "step": 21981 + }, + { + "epoch": 1.8735191340663087, + "grad_norm": 79.14630077904194, + "learning_rate": 3.714727631296943e-06, + "loss": 3.0108, + "step": 21982 + }, + { + "epoch": 1.8736043637603341, + "grad_norm": 42.335788979891895, + "learning_rate": 3.714248451808754e-06, + "loss": 2.6538, + "step": 21983 + }, + { + "epoch": 1.8736895934543596, + "grad_norm": 37.86666079627909, + "learning_rate": 3.713769284965421e-06, + "loss": 3.2226, + "step": 21984 + }, + { + "epoch": 1.873774823148385, + "grad_norm": 69.6254959034154, + "learning_rate": 3.713290130771655e-06, + "loss": 2.6366, + "step": 21985 + }, + { + "epoch": 1.8738600528424103, + "grad_norm": 37.92598182300014, + "learning_rate": 3.7128109892321707e-06, + "loss": 2.8721, + "step": 21986 + }, + { + "epoch": 1.8739452825364356, + "grad_norm": 67.4506796991413, + "learning_rate": 3.712331860351676e-06, + "loss": 3.0703, + "step": 21987 + }, + { + "epoch": 1.874030512230461, + "grad_norm": 104.16292421027254, + "learning_rate": 3.7118527441348884e-06, + "loss": 3.8081, + "step": 21988 + }, + { + "epoch": 1.8741157419244865, + "grad_norm": 33.832330062151485, + "learning_rate": 3.7113736405865164e-06, + "loss": 2.8873, + "step": 21989 + }, + { + "epoch": 1.874200971618512, + "grad_norm": 60.82201822282877, + "learning_rate": 3.710894549711273e-06, + "loss": 3.3141, + "step": 21990 + }, + { + "epoch": 1.8742862013125374, + "grad_norm": 42.105964659067816, + "learning_rate": 3.7104154715138673e-06, + "loss": 2.9356, + "step": 21991 + }, + { + "epoch": 1.8743714310065627, + "grad_norm": 57.14269666437628, + "learning_rate": 3.709936405999016e-06, + "loss": 3.0016, + "step": 21992 + }, + { + "epoch": 1.8744566607005881, + "grad_norm": 38.391543378539964, + "learning_rate": 3.7094573531714263e-06, + "loss": 2.4252, + "step": 21993 + }, + { + "epoch": 1.8745418903946134, + "grad_norm": 54.06715225518113, + "learning_rate": 3.7089783130358113e-06, + "loss": 3.0918, + "step": 21994 + }, + { + "epoch": 1.8746271200886389, + "grad_norm": 58.925594035257, + "learning_rate": 3.708499285596879e-06, + "loss": 3.0175, + "step": 21995 + }, + { + "epoch": 1.8747123497826643, + "grad_norm": 162.78332528919728, + "learning_rate": 3.7080202708593462e-06, + "loss": 3.4398, + "step": 21996 + }, + { + "epoch": 1.8747975794766898, + "grad_norm": 42.10137586793234, + "learning_rate": 3.707541268827919e-06, + "loss": 2.6438, + "step": 21997 + }, + { + "epoch": 1.874882809170715, + "grad_norm": 69.34235164965985, + "learning_rate": 3.7070622795073106e-06, + "loss": 3.3615, + "step": 21998 + }, + { + "epoch": 1.8749680388647405, + "grad_norm": 41.67499500426702, + "learning_rate": 3.70658330290223e-06, + "loss": 2.7382, + "step": 21999 + }, + { + "epoch": 1.8750532685587658, + "grad_norm": 65.37518790095005, + "learning_rate": 3.70610433901739e-06, + "loss": 2.951, + "step": 22000 + }, + { + "epoch": 1.8751384982527912, + "grad_norm": 25.119945735909347, + "learning_rate": 3.705625387857499e-06, + "loss": 1.9572, + "step": 22001 + }, + { + "epoch": 1.8752237279468167, + "grad_norm": 63.31964203351649, + "learning_rate": 3.705146449427268e-06, + "loss": 2.7606, + "step": 22002 + }, + { + "epoch": 1.8753089576408422, + "grad_norm": 37.26822018102999, + "learning_rate": 3.7046675237314078e-06, + "loss": 2.6701, + "step": 22003 + }, + { + "epoch": 1.8753941873348676, + "grad_norm": 40.53466456999957, + "learning_rate": 3.704188610774626e-06, + "loss": 1.8906, + "step": 22004 + }, + { + "epoch": 1.8754794170288929, + "grad_norm": 103.94415353368147, + "learning_rate": 3.703709710561635e-06, + "loss": 2.7754, + "step": 22005 + }, + { + "epoch": 1.8755646467229181, + "grad_norm": 48.776628560165676, + "learning_rate": 3.7032308230971446e-06, + "loss": 3.5539, + "step": 22006 + }, + { + "epoch": 1.8756498764169436, + "grad_norm": 66.27946523702776, + "learning_rate": 3.7027519483858643e-06, + "loss": 2.5077, + "step": 22007 + }, + { + "epoch": 1.875735106110969, + "grad_norm": 57.5666439948574, + "learning_rate": 3.7022730864325015e-06, + "loss": 3.4503, + "step": 22008 + }, + { + "epoch": 1.8758203358049945, + "grad_norm": 64.84511028465197, + "learning_rate": 3.701794237241767e-06, + "loss": 2.1509, + "step": 22009 + }, + { + "epoch": 1.87590556549902, + "grad_norm": 41.56933506373508, + "learning_rate": 3.7013154008183727e-06, + "loss": 3.4554, + "step": 22010 + }, + { + "epoch": 1.8759907951930452, + "grad_norm": 35.93576221130524, + "learning_rate": 3.7008365771670245e-06, + "loss": 1.925, + "step": 22011 + }, + { + "epoch": 1.8760760248870707, + "grad_norm": 35.53133692910316, + "learning_rate": 3.7003577662924306e-06, + "loss": 2.523, + "step": 22012 + }, + { + "epoch": 1.876161254581096, + "grad_norm": 26.683356739417402, + "learning_rate": 3.6998789681993025e-06, + "loss": 1.5047, + "step": 22013 + }, + { + "epoch": 1.8762464842751214, + "grad_norm": 68.8964076493224, + "learning_rate": 3.6994001828923493e-06, + "loss": 2.6931, + "step": 22014 + }, + { + "epoch": 1.8763317139691469, + "grad_norm": 69.94104945764704, + "learning_rate": 3.698921410376278e-06, + "loss": 2.9028, + "step": 22015 + }, + { + "epoch": 1.8764169436631724, + "grad_norm": 67.14506363810203, + "learning_rate": 3.6984426506557974e-06, + "loss": 2.886, + "step": 22016 + }, + { + "epoch": 1.8765021733571976, + "grad_norm": 63.10019516435793, + "learning_rate": 3.6979639037356147e-06, + "loss": 2.7117, + "step": 22017 + }, + { + "epoch": 1.876587403051223, + "grad_norm": 41.55708032161245, + "learning_rate": 3.6974851696204415e-06, + "loss": 2.9479, + "step": 22018 + }, + { + "epoch": 1.8766726327452483, + "grad_norm": 56.1050764265879, + "learning_rate": 3.697006448314985e-06, + "loss": 2.7856, + "step": 22019 + }, + { + "epoch": 1.8767578624392738, + "grad_norm": 80.40218534165406, + "learning_rate": 3.69652773982395e-06, + "loss": 2.9266, + "step": 22020 + }, + { + "epoch": 1.8768430921332993, + "grad_norm": 44.65478065451756, + "learning_rate": 3.6960490441520465e-06, + "loss": 2.3544, + "step": 22021 + }, + { + "epoch": 1.8769283218273247, + "grad_norm": 102.69898314678785, + "learning_rate": 3.6955703613039846e-06, + "loss": 3.2163, + "step": 22022 + }, + { + "epoch": 1.8770135515213502, + "grad_norm": 92.12574746349584, + "learning_rate": 3.6950916912844693e-06, + "loss": 4.6436, + "step": 22023 + }, + { + "epoch": 1.8770987812153754, + "grad_norm": 22.023309159885187, + "learning_rate": 3.6946130340982078e-06, + "loss": 1.9491, + "step": 22024 + }, + { + "epoch": 1.8771840109094007, + "grad_norm": 54.08101904604693, + "learning_rate": 3.6941343897499084e-06, + "loss": 2.7003, + "step": 22025 + }, + { + "epoch": 1.8772692406034261, + "grad_norm": 31.465000302156863, + "learning_rate": 3.693655758244279e-06, + "loss": 2.9803, + "step": 22026 + }, + { + "epoch": 1.8773544702974516, + "grad_norm": 16.223993524859456, + "learning_rate": 3.693177139586027e-06, + "loss": 1.2787, + "step": 22027 + }, + { + "epoch": 1.877439699991477, + "grad_norm": 50.05723420649101, + "learning_rate": 3.692698533779857e-06, + "loss": 2.7212, + "step": 22028 + }, + { + "epoch": 1.8775249296855026, + "grad_norm": 22.58974016956956, + "learning_rate": 3.6922199408304793e-06, + "loss": 1.3739, + "step": 22029 + }, + { + "epoch": 1.8776101593795278, + "grad_norm": 70.87027414998056, + "learning_rate": 3.6917413607425957e-06, + "loss": 2.7341, + "step": 22030 + }, + { + "epoch": 1.8776953890735533, + "grad_norm": 19.062598244703512, + "learning_rate": 3.691262793520919e-06, + "loss": 1.4681, + "step": 22031 + }, + { + "epoch": 1.8777806187675785, + "grad_norm": 40.25421083800571, + "learning_rate": 3.690784239170151e-06, + "loss": 3.1453, + "step": 22032 + }, + { + "epoch": 1.877865848461604, + "grad_norm": 53.2970824196374, + "learning_rate": 3.690305697695001e-06, + "loss": 2.8786, + "step": 22033 + }, + { + "epoch": 1.8779510781556294, + "grad_norm": 40.34278347850587, + "learning_rate": 3.689827169100172e-06, + "loss": 2.6674, + "step": 22034 + }, + { + "epoch": 1.878036307849655, + "grad_norm": 36.07604812342093, + "learning_rate": 3.6893486533903744e-06, + "loss": 1.961, + "step": 22035 + }, + { + "epoch": 1.8781215375436802, + "grad_norm": 43.623700975924734, + "learning_rate": 3.6888701505703106e-06, + "loss": 1.9607, + "step": 22036 + }, + { + "epoch": 1.8782067672377056, + "grad_norm": 58.12807865231365, + "learning_rate": 3.688391660644689e-06, + "loss": 3.2418, + "step": 22037 + }, + { + "epoch": 1.8782919969317309, + "grad_norm": 42.410052943673826, + "learning_rate": 3.6879131836182124e-06, + "loss": 2.7603, + "step": 22038 + }, + { + "epoch": 1.8783772266257563, + "grad_norm": 56.86442058751885, + "learning_rate": 3.68743471949559e-06, + "loss": 2.9875, + "step": 22039 + }, + { + "epoch": 1.8784624563197818, + "grad_norm": 46.563622518333865, + "learning_rate": 3.6869562682815247e-06, + "loss": 2.8748, + "step": 22040 + }, + { + "epoch": 1.8785476860138073, + "grad_norm": 31.443608743861603, + "learning_rate": 3.6864778299807237e-06, + "loss": 1.856, + "step": 22041 + }, + { + "epoch": 1.8786329157078328, + "grad_norm": 25.994169636285438, + "learning_rate": 3.6859994045978895e-06, + "loss": 2.0389, + "step": 22042 + }, + { + "epoch": 1.878718145401858, + "grad_norm": 42.692991383588435, + "learning_rate": 3.685520992137731e-06, + "loss": 2.3551, + "step": 22043 + }, + { + "epoch": 1.8788033750958832, + "grad_norm": 68.38413377709924, + "learning_rate": 3.68504259260495e-06, + "loss": 2.4756, + "step": 22044 + }, + { + "epoch": 1.8788886047899087, + "grad_norm": 37.96782570155124, + "learning_rate": 3.684564206004254e-06, + "loss": 2.2006, + "step": 22045 + }, + { + "epoch": 1.8789738344839342, + "grad_norm": 83.24559001960229, + "learning_rate": 3.684085832340346e-06, + "loss": 2.6687, + "step": 22046 + }, + { + "epoch": 1.8790590641779596, + "grad_norm": 35.51590483502708, + "learning_rate": 3.6836074716179286e-06, + "loss": 2.5103, + "step": 22047 + }, + { + "epoch": 1.8791442938719851, + "grad_norm": 47.290336606848406, + "learning_rate": 3.68312912384171e-06, + "loss": 2.4203, + "step": 22048 + }, + { + "epoch": 1.8792295235660104, + "grad_norm": 105.24639198332602, + "learning_rate": 3.6826507890163943e-06, + "loss": 3.6384, + "step": 22049 + }, + { + "epoch": 1.8793147532600358, + "grad_norm": 37.42477398618223, + "learning_rate": 3.682172467146685e-06, + "loss": 2.1642, + "step": 22050 + }, + { + "epoch": 1.879399982954061, + "grad_norm": 34.37603510339806, + "learning_rate": 3.681694158237283e-06, + "loss": 2.4835, + "step": 22051 + }, + { + "epoch": 1.8794852126480865, + "grad_norm": 41.9660163022071, + "learning_rate": 3.681215862292896e-06, + "loss": 2.0207, + "step": 22052 + }, + { + "epoch": 1.879570442342112, + "grad_norm": 37.94730996900663, + "learning_rate": 3.680737579318229e-06, + "loss": 2.6293, + "step": 22053 + }, + { + "epoch": 1.8796556720361375, + "grad_norm": 37.3408869582213, + "learning_rate": 3.680259309317983e-06, + "loss": 2.6305, + "step": 22054 + }, + { + "epoch": 1.879740901730163, + "grad_norm": 49.53893476928249, + "learning_rate": 3.6797810522968603e-06, + "loss": 2.8066, + "step": 22055 + }, + { + "epoch": 1.8798261314241882, + "grad_norm": 51.99116348181069, + "learning_rate": 3.679302808259568e-06, + "loss": 1.9105, + "step": 22056 + }, + { + "epoch": 1.8799113611182134, + "grad_norm": 48.105850956096056, + "learning_rate": 3.678824577210808e-06, + "loss": 2.2236, + "step": 22057 + }, + { + "epoch": 1.879996590812239, + "grad_norm": 93.62276078014202, + "learning_rate": 3.6783463591552837e-06, + "loss": 3.5896, + "step": 22058 + }, + { + "epoch": 1.8800818205062644, + "grad_norm": 43.7901149454781, + "learning_rate": 3.6778681540976966e-06, + "loss": 2.7287, + "step": 22059 + }, + { + "epoch": 1.8801670502002898, + "grad_norm": 65.00912414164743, + "learning_rate": 3.67738996204275e-06, + "loss": 3.9144, + "step": 22060 + }, + { + "epoch": 1.8802522798943153, + "grad_norm": 123.5753400786611, + "learning_rate": 3.6769117829951494e-06, + "loss": 3.6749, + "step": 22061 + }, + { + "epoch": 1.8803375095883406, + "grad_norm": 30.92651855932291, + "learning_rate": 3.676433616959596e-06, + "loss": 3.3153, + "step": 22062 + }, + { + "epoch": 1.880422739282366, + "grad_norm": 48.719733277498065, + "learning_rate": 3.67595546394079e-06, + "loss": 1.8134, + "step": 22063 + }, + { + "epoch": 1.8805079689763913, + "grad_norm": 45.332030885610465, + "learning_rate": 3.675477323943436e-06, + "loss": 3.0004, + "step": 22064 + }, + { + "epoch": 1.8805931986704167, + "grad_norm": 43.98037020497013, + "learning_rate": 3.6749991969722376e-06, + "loss": 2.5445, + "step": 22065 + }, + { + "epoch": 1.8806784283644422, + "grad_norm": 40.720198895763794, + "learning_rate": 3.674521083031896e-06, + "loss": 2.9817, + "step": 22066 + }, + { + "epoch": 1.8807636580584677, + "grad_norm": 84.62266704619587, + "learning_rate": 3.674042982127112e-06, + "loss": 2.5168, + "step": 22067 + }, + { + "epoch": 1.880848887752493, + "grad_norm": 58.73040759567531, + "learning_rate": 3.6735648942625868e-06, + "loss": 2.5921, + "step": 22068 + }, + { + "epoch": 1.8809341174465184, + "grad_norm": 58.69867874579888, + "learning_rate": 3.673086819443026e-06, + "loss": 2.6009, + "step": 22069 + }, + { + "epoch": 1.8810193471405436, + "grad_norm": 57.193678836155925, + "learning_rate": 3.6726087576731296e-06, + "loss": 1.9236, + "step": 22070 + }, + { + "epoch": 1.881104576834569, + "grad_norm": 48.40057806191573, + "learning_rate": 3.672130708957597e-06, + "loss": 2.9245, + "step": 22071 + }, + { + "epoch": 1.8811898065285946, + "grad_norm": 77.68696402305676, + "learning_rate": 3.6716526733011305e-06, + "loss": 3.5257, + "step": 22072 + }, + { + "epoch": 1.88127503622262, + "grad_norm": 21.38385442568485, + "learning_rate": 3.671174650708433e-06, + "loss": 1.2854, + "step": 22073 + }, + { + "epoch": 1.8813602659166455, + "grad_norm": 31.93594268472185, + "learning_rate": 3.6706966411842066e-06, + "loss": 3.0264, + "step": 22074 + }, + { + "epoch": 1.8814454956106708, + "grad_norm": 60.8495164039008, + "learning_rate": 3.670218644733149e-06, + "loss": 3.749, + "step": 22075 + }, + { + "epoch": 1.881530725304696, + "grad_norm": 50.70948035208432, + "learning_rate": 3.669740661359963e-06, + "loss": 2.9134, + "step": 22076 + }, + { + "epoch": 1.8816159549987215, + "grad_norm": 47.90877372623326, + "learning_rate": 3.6692626910693475e-06, + "loss": 3.2411, + "step": 22077 + }, + { + "epoch": 1.881701184692747, + "grad_norm": 61.52570856082264, + "learning_rate": 3.6687847338660064e-06, + "loss": 3.3794, + "step": 22078 + }, + { + "epoch": 1.8817864143867724, + "grad_norm": 28.441526724889552, + "learning_rate": 3.668306789754638e-06, + "loss": 1.9529, + "step": 22079 + }, + { + "epoch": 1.8818716440807979, + "grad_norm": 72.38125076217466, + "learning_rate": 3.6678288587399436e-06, + "loss": 2.0218, + "step": 22080 + }, + { + "epoch": 1.8819568737748231, + "grad_norm": 53.935769923600375, + "learning_rate": 3.6673509408266205e-06, + "loss": 2.2775, + "step": 22081 + }, + { + "epoch": 1.8820421034688486, + "grad_norm": 39.278913188415316, + "learning_rate": 3.666873036019374e-06, + "loss": 3.1054, + "step": 22082 + }, + { + "epoch": 1.8821273331628738, + "grad_norm": 36.450555083346586, + "learning_rate": 3.6663951443229007e-06, + "loss": 2.4901, + "step": 22083 + }, + { + "epoch": 1.8822125628568993, + "grad_norm": 32.663941241870056, + "learning_rate": 3.6659172657419024e-06, + "loss": 2.7486, + "step": 22084 + }, + { + "epoch": 1.8822977925509248, + "grad_norm": 79.4225632691854, + "learning_rate": 3.665439400281075e-06, + "loss": 4.0356, + "step": 22085 + }, + { + "epoch": 1.8823830222449502, + "grad_norm": 38.035775330400874, + "learning_rate": 3.6649615479451228e-06, + "loss": 3.2116, + "step": 22086 + }, + { + "epoch": 1.8824682519389755, + "grad_norm": 55.262287327644074, + "learning_rate": 3.664483708738743e-06, + "loss": 2.5517, + "step": 22087 + }, + { + "epoch": 1.882553481633001, + "grad_norm": 89.11111596195074, + "learning_rate": 3.6640058826666356e-06, + "loss": 3.8218, + "step": 22088 + }, + { + "epoch": 1.8826387113270262, + "grad_norm": 116.27759283821261, + "learning_rate": 3.6635280697334994e-06, + "loss": 2.5233, + "step": 22089 + }, + { + "epoch": 1.8827239410210517, + "grad_norm": 36.964469999403995, + "learning_rate": 3.6630502699440322e-06, + "loss": 2.487, + "step": 22090 + }, + { + "epoch": 1.8828091707150771, + "grad_norm": 92.98322430506238, + "learning_rate": 3.662572483302935e-06, + "loss": 3.0194, + "step": 22091 + }, + { + "epoch": 1.8828944004091026, + "grad_norm": 64.61504408987352, + "learning_rate": 3.6620947098149073e-06, + "loss": 1.7751, + "step": 22092 + }, + { + "epoch": 1.882979630103128, + "grad_norm": 62.94179890397215, + "learning_rate": 3.6616169494846458e-06, + "loss": 3.3008, + "step": 22093 + }, + { + "epoch": 1.8830648597971533, + "grad_norm": 52.230833098270665, + "learning_rate": 3.661139202316848e-06, + "loss": 2.3212, + "step": 22094 + }, + { + "epoch": 1.8831500894911786, + "grad_norm": 59.19217399015368, + "learning_rate": 3.6606614683162155e-06, + "loss": 2.811, + "step": 22095 + }, + { + "epoch": 1.883235319185204, + "grad_norm": 45.338304416508954, + "learning_rate": 3.6601837474874464e-06, + "loss": 3.1366, + "step": 22096 + }, + { + "epoch": 1.8833205488792295, + "grad_norm": 35.251116588022406, + "learning_rate": 3.659706039835237e-06, + "loss": 2.4305, + "step": 22097 + }, + { + "epoch": 1.883405778573255, + "grad_norm": 38.72026331335905, + "learning_rate": 3.6592283453642846e-06, + "loss": 2.8884, + "step": 22098 + }, + { + "epoch": 1.8834910082672804, + "grad_norm": 58.434678414532755, + "learning_rate": 3.6587506640792897e-06, + "loss": 2.8229, + "step": 22099 + }, + { + "epoch": 1.8835762379613057, + "grad_norm": 58.2805967539704, + "learning_rate": 3.65827299598495e-06, + "loss": 2.8339, + "step": 22100 + }, + { + "epoch": 1.8836614676553312, + "grad_norm": 54.785088235230624, + "learning_rate": 3.6577953410859623e-06, + "loss": 2.6846, + "step": 22101 + }, + { + "epoch": 1.8837466973493564, + "grad_norm": 37.59241169321274, + "learning_rate": 3.657317699387023e-06, + "loss": 2.6912, + "step": 22102 + }, + { + "epoch": 1.8838319270433819, + "grad_norm": 30.8950535264734, + "learning_rate": 3.6568400708928292e-06, + "loss": 2.6166, + "step": 22103 + }, + { + "epoch": 1.8839171567374073, + "grad_norm": 34.099882342973885, + "learning_rate": 3.6563624556080825e-06, + "loss": 2.851, + "step": 22104 + }, + { + "epoch": 1.8840023864314328, + "grad_norm": 46.46624097142949, + "learning_rate": 3.655884853537476e-06, + "loss": 3.3953, + "step": 22105 + }, + { + "epoch": 1.8840876161254583, + "grad_norm": 58.706161757989065, + "learning_rate": 3.6554072646857073e-06, + "loss": 2.9081, + "step": 22106 + }, + { + "epoch": 1.8841728458194835, + "grad_norm": 74.29601695707903, + "learning_rate": 3.6549296890574733e-06, + "loss": 2.3926, + "step": 22107 + }, + { + "epoch": 1.8842580755135088, + "grad_norm": 54.505501171565555, + "learning_rate": 3.654452126657473e-06, + "loss": 2.7865, + "step": 22108 + }, + { + "epoch": 1.8843433052075342, + "grad_norm": 31.554365189669205, + "learning_rate": 3.6539745774904012e-06, + "loss": 2.7272, + "step": 22109 + }, + { + "epoch": 1.8844285349015597, + "grad_norm": 38.56415425766893, + "learning_rate": 3.653497041560954e-06, + "loss": 1.3504, + "step": 22110 + }, + { + "epoch": 1.8845137645955852, + "grad_norm": 47.16095012302409, + "learning_rate": 3.653019518873827e-06, + "loss": 2.7961, + "step": 22111 + }, + { + "epoch": 1.8845989942896106, + "grad_norm": 45.677444240053774, + "learning_rate": 3.65254200943372e-06, + "loss": 3.1904, + "step": 22112 + }, + { + "epoch": 1.8846842239836359, + "grad_norm": 59.80314621932244, + "learning_rate": 3.652064513245327e-06, + "loss": 2.4742, + "step": 22113 + }, + { + "epoch": 1.8847694536776611, + "grad_norm": 48.22009804419899, + "learning_rate": 3.6515870303133426e-06, + "loss": 2.1165, + "step": 22114 + }, + { + "epoch": 1.8848546833716866, + "grad_norm": 41.93308261955694, + "learning_rate": 3.651109560642463e-06, + "loss": 2.9557, + "step": 22115 + }, + { + "epoch": 1.884939913065712, + "grad_norm": 63.14029817221079, + "learning_rate": 3.650632104237387e-06, + "loss": 3.1942, + "step": 22116 + }, + { + "epoch": 1.8850251427597375, + "grad_norm": 70.81756194795508, + "learning_rate": 3.650154661102808e-06, + "loss": 2.8814, + "step": 22117 + }, + { + "epoch": 1.885110372453763, + "grad_norm": 33.80928931977041, + "learning_rate": 3.6496772312434204e-06, + "loss": 2.296, + "step": 22118 + }, + { + "epoch": 1.8851956021477883, + "grad_norm": 31.24500218206083, + "learning_rate": 3.649199814663922e-06, + "loss": 2.8006, + "step": 22119 + }, + { + "epoch": 1.8852808318418137, + "grad_norm": 68.65397561848087, + "learning_rate": 3.6487224113690044e-06, + "loss": 2.9202, + "step": 22120 + }, + { + "epoch": 1.885366061535839, + "grad_norm": 83.90885343749169, + "learning_rate": 3.6482450213633668e-06, + "loss": 3.2386, + "step": 22121 + }, + { + "epoch": 1.8854512912298644, + "grad_norm": 61.35461138703802, + "learning_rate": 3.647767644651702e-06, + "loss": 2.6189, + "step": 22122 + }, + { + "epoch": 1.88553652092389, + "grad_norm": 47.8075771457975, + "learning_rate": 3.647290281238705e-06, + "loss": 3.7586, + "step": 22123 + }, + { + "epoch": 1.8856217506179154, + "grad_norm": 20.619330341358403, + "learning_rate": 3.646812931129069e-06, + "loss": 2.0874, + "step": 22124 + }, + { + "epoch": 1.8857069803119408, + "grad_norm": 45.19846033590045, + "learning_rate": 3.6463355943274925e-06, + "loss": 2.176, + "step": 22125 + }, + { + "epoch": 1.885792210005966, + "grad_norm": 82.84333359614395, + "learning_rate": 3.6458582708386665e-06, + "loss": 3.312, + "step": 22126 + }, + { + "epoch": 1.8858774396999913, + "grad_norm": 75.3237336600433, + "learning_rate": 3.6453809606672873e-06, + "loss": 2.6718, + "step": 22127 + }, + { + "epoch": 1.8859626693940168, + "grad_norm": 31.98996254300194, + "learning_rate": 3.6449036638180457e-06, + "loss": 1.956, + "step": 22128 + }, + { + "epoch": 1.8860478990880423, + "grad_norm": 72.21384685884838, + "learning_rate": 3.6444263802956405e-06, + "loss": 2.414, + "step": 22129 + }, + { + "epoch": 1.8861331287820677, + "grad_norm": 46.426854131279235, + "learning_rate": 3.643949110104762e-06, + "loss": 3.1225, + "step": 22130 + }, + { + "epoch": 1.8862183584760932, + "grad_norm": 81.90423111429268, + "learning_rate": 3.6434718532501057e-06, + "loss": 2.7954, + "step": 22131 + }, + { + "epoch": 1.8863035881701185, + "grad_norm": 64.62564682355557, + "learning_rate": 3.6429946097363655e-06, + "loss": 2.6847, + "step": 22132 + }, + { + "epoch": 1.886388817864144, + "grad_norm": 38.093059657217566, + "learning_rate": 3.6425173795682315e-06, + "loss": 2.9553, + "step": 22133 + }, + { + "epoch": 1.8864740475581692, + "grad_norm": 87.05389727731185, + "learning_rate": 3.642040162750401e-06, + "loss": 3.1491, + "step": 22134 + }, + { + "epoch": 1.8865592772521946, + "grad_norm": 79.71235564377128, + "learning_rate": 3.641562959287567e-06, + "loss": 3.4228, + "step": 22135 + }, + { + "epoch": 1.88664450694622, + "grad_norm": 52.52697865455872, + "learning_rate": 3.6410857691844216e-06, + "loss": 2.9731, + "step": 22136 + }, + { + "epoch": 1.8867297366402456, + "grad_norm": 102.48733026314255, + "learning_rate": 3.6406085924456547e-06, + "loss": 2.9161, + "step": 22137 + }, + { + "epoch": 1.8868149663342708, + "grad_norm": 62.73538237520485, + "learning_rate": 3.6401314290759637e-06, + "loss": 2.3779, + "step": 22138 + }, + { + "epoch": 1.8869001960282963, + "grad_norm": 66.33735316249798, + "learning_rate": 3.6396542790800416e-06, + "loss": 3.5554, + "step": 22139 + }, + { + "epoch": 1.8869854257223215, + "grad_norm": 174.0147445103705, + "learning_rate": 3.6391771424625775e-06, + "loss": 2.6219, + "step": 22140 + }, + { + "epoch": 1.887070655416347, + "grad_norm": 42.75270383555386, + "learning_rate": 3.638700019228265e-06, + "loss": 2.7817, + "step": 22141 + }, + { + "epoch": 1.8871558851103725, + "grad_norm": 39.49504772394983, + "learning_rate": 3.638222909381797e-06, + "loss": 3.2306, + "step": 22142 + }, + { + "epoch": 1.887241114804398, + "grad_norm": 37.578743110843405, + "learning_rate": 3.637745812927867e-06, + "loss": 2.3761, + "step": 22143 + }, + { + "epoch": 1.8873263444984234, + "grad_norm": 80.92347347189325, + "learning_rate": 3.637268729871165e-06, + "loss": 3.5525, + "step": 22144 + }, + { + "epoch": 1.8874115741924486, + "grad_norm": 34.88810526400896, + "learning_rate": 3.636791660216382e-06, + "loss": 2.7299, + "step": 22145 + }, + { + "epoch": 1.887496803886474, + "grad_norm": 40.03574775410263, + "learning_rate": 3.636314603968213e-06, + "loss": 3.7333, + "step": 22146 + }, + { + "epoch": 1.8875820335804994, + "grad_norm": 35.97046341317454, + "learning_rate": 3.635837561131348e-06, + "loss": 2.5617, + "step": 22147 + }, + { + "epoch": 1.8876672632745248, + "grad_norm": 35.9573688727083, + "learning_rate": 3.635360531710479e-06, + "loss": 2.8069, + "step": 22148 + }, + { + "epoch": 1.8877524929685503, + "grad_norm": 59.80906767744344, + "learning_rate": 3.6348835157102957e-06, + "loss": 3.1542, + "step": 22149 + }, + { + "epoch": 1.8878377226625758, + "grad_norm": 33.90186071747282, + "learning_rate": 3.634406513135489e-06, + "loss": 2.6005, + "step": 22150 + }, + { + "epoch": 1.887922952356601, + "grad_norm": 53.65162597924023, + "learning_rate": 3.6339295239907546e-06, + "loss": 2.9151, + "step": 22151 + }, + { + "epoch": 1.8880081820506265, + "grad_norm": 75.18923598925613, + "learning_rate": 3.6334525482807803e-06, + "loss": 2.038, + "step": 22152 + }, + { + "epoch": 1.8880934117446517, + "grad_norm": 43.63584043335506, + "learning_rate": 3.6329755860102554e-06, + "loss": 2.8434, + "step": 22153 + }, + { + "epoch": 1.8881786414386772, + "grad_norm": 65.51143871219034, + "learning_rate": 3.632498637183872e-06, + "loss": 3.3215, + "step": 22154 + }, + { + "epoch": 1.8882638711327027, + "grad_norm": 36.70423712977215, + "learning_rate": 3.6320217018063237e-06, + "loss": 2.2332, + "step": 22155 + }, + { + "epoch": 1.8883491008267281, + "grad_norm": 36.39601671382654, + "learning_rate": 3.6315447798822977e-06, + "loss": 2.7148, + "step": 22156 + }, + { + "epoch": 1.8884343305207534, + "grad_norm": 32.06857774128535, + "learning_rate": 3.631067871416484e-06, + "loss": 2.4801, + "step": 22157 + }, + { + "epoch": 1.8885195602147788, + "grad_norm": 29.499389353835298, + "learning_rate": 3.6305909764135726e-06, + "loss": 2.4656, + "step": 22158 + }, + { + "epoch": 1.888604789908804, + "grad_norm": 33.101282369576055, + "learning_rate": 3.6301140948782574e-06, + "loss": 2.217, + "step": 22159 + }, + { + "epoch": 1.8886900196028296, + "grad_norm": 33.49734844350809, + "learning_rate": 3.6296372268152257e-06, + "loss": 1.9586, + "step": 22160 + }, + { + "epoch": 1.888775249296855, + "grad_norm": 65.97034286194555, + "learning_rate": 3.6291603722291658e-06, + "loss": 2.9407, + "step": 22161 + }, + { + "epoch": 1.8888604789908805, + "grad_norm": 49.68597063147538, + "learning_rate": 3.6286835311247702e-06, + "loss": 3.3622, + "step": 22162 + }, + { + "epoch": 1.888945708684906, + "grad_norm": 50.960390922808855, + "learning_rate": 3.6282067035067257e-06, + "loss": 3.5415, + "step": 22163 + }, + { + "epoch": 1.8890309383789312, + "grad_norm": 68.07656250824625, + "learning_rate": 3.6277298893797244e-06, + "loss": 3.2293, + "step": 22164 + }, + { + "epoch": 1.8891161680729565, + "grad_norm": 52.59498796499759, + "learning_rate": 3.6272530887484536e-06, + "loss": 2.3266, + "step": 22165 + }, + { + "epoch": 1.889201397766982, + "grad_norm": 65.33514716579053, + "learning_rate": 3.6267763016176044e-06, + "loss": 3.0406, + "step": 22166 + }, + { + "epoch": 1.8892866274610074, + "grad_norm": 35.793372449644906, + "learning_rate": 3.6262995279918624e-06, + "loss": 3.4182, + "step": 22167 + }, + { + "epoch": 1.8893718571550329, + "grad_norm": 61.07682331526453, + "learning_rate": 3.6258227678759208e-06, + "loss": 3.0153, + "step": 22168 + }, + { + "epoch": 1.8894570868490583, + "grad_norm": 36.442500196787655, + "learning_rate": 3.625346021274465e-06, + "loss": 3.0933, + "step": 22169 + }, + { + "epoch": 1.8895423165430836, + "grad_norm": 39.162166895853005, + "learning_rate": 3.6248692881921865e-06, + "loss": 3.3657, + "step": 22170 + }, + { + "epoch": 1.889627546237109, + "grad_norm": 68.13907017151143, + "learning_rate": 3.624392568633769e-06, + "loss": 2.9427, + "step": 22171 + }, + { + "epoch": 1.8897127759311343, + "grad_norm": 94.34093491919002, + "learning_rate": 3.6239158626039073e-06, + "loss": 3.9566, + "step": 22172 + }, + { + "epoch": 1.8897980056251598, + "grad_norm": 57.96029390919516, + "learning_rate": 3.6234391701072842e-06, + "loss": 2.9354, + "step": 22173 + }, + { + "epoch": 1.8898832353191852, + "grad_norm": 46.533971285470614, + "learning_rate": 3.6229624911485915e-06, + "loss": 2.905, + "step": 22174 + }, + { + "epoch": 1.8899684650132107, + "grad_norm": 78.09290805241582, + "learning_rate": 3.622485825732515e-06, + "loss": 2.6791, + "step": 22175 + }, + { + "epoch": 1.8900536947072362, + "grad_norm": 79.47462545908603, + "learning_rate": 3.6220091738637414e-06, + "loss": 3.4388, + "step": 22176 + }, + { + "epoch": 1.8901389244012614, + "grad_norm": 84.86589689820177, + "learning_rate": 3.621532535546961e-06, + "loss": 4.1803, + "step": 22177 + }, + { + "epoch": 1.8902241540952867, + "grad_norm": 63.51832947819814, + "learning_rate": 3.6210559107868613e-06, + "loss": 2.7238, + "step": 22178 + }, + { + "epoch": 1.8903093837893121, + "grad_norm": 49.659900098460206, + "learning_rate": 3.620579299588129e-06, + "loss": 3.3841, + "step": 22179 + }, + { + "epoch": 1.8903946134833376, + "grad_norm": 75.00275028591342, + "learning_rate": 3.6201027019554487e-06, + "loss": 2.2, + "step": 22180 + }, + { + "epoch": 1.890479843177363, + "grad_norm": 23.21877801740656, + "learning_rate": 3.619626117893511e-06, + "loss": 1.8593, + "step": 22181 + }, + { + "epoch": 1.8905650728713885, + "grad_norm": 47.977959788920266, + "learning_rate": 3.6191495474070036e-06, + "loss": 3.4995, + "step": 22182 + }, + { + "epoch": 1.8906503025654138, + "grad_norm": 69.88152283383815, + "learning_rate": 3.618672990500611e-06, + "loss": 2.6343, + "step": 22183 + }, + { + "epoch": 1.890735532259439, + "grad_norm": 38.81998029290101, + "learning_rate": 3.6181964471790186e-06, + "loss": 2.6751, + "step": 22184 + }, + { + "epoch": 1.8908207619534645, + "grad_norm": 46.60048736857973, + "learning_rate": 3.617719917446917e-06, + "loss": 3.8212, + "step": 22185 + }, + { + "epoch": 1.89090599164749, + "grad_norm": 70.0888382402208, + "learning_rate": 3.617243401308991e-06, + "loss": 3.301, + "step": 22186 + }, + { + "epoch": 1.8909912213415154, + "grad_norm": 39.151415764199456, + "learning_rate": 3.6167668987699265e-06, + "loss": 1.2754, + "step": 22187 + }, + { + "epoch": 1.891076451035541, + "grad_norm": 59.290540510120806, + "learning_rate": 3.6162904098344086e-06, + "loss": 2.4655, + "step": 22188 + }, + { + "epoch": 1.8911616807295661, + "grad_norm": 38.525787248855366, + "learning_rate": 3.615813934507125e-06, + "loss": 2.9951, + "step": 22189 + }, + { + "epoch": 1.8912469104235916, + "grad_norm": 96.71848479815387, + "learning_rate": 3.6153374727927626e-06, + "loss": 3.5298, + "step": 22190 + }, + { + "epoch": 1.8913321401176169, + "grad_norm": 26.607829765211335, + "learning_rate": 3.6148610246960065e-06, + "loss": 1.5913, + "step": 22191 + }, + { + "epoch": 1.8914173698116423, + "grad_norm": 73.14348743177867, + "learning_rate": 3.61438459022154e-06, + "loss": 2.6829, + "step": 22192 + }, + { + "epoch": 1.8915025995056678, + "grad_norm": 58.795528203257106, + "learning_rate": 3.6139081693740495e-06, + "loss": 2.687, + "step": 22193 + }, + { + "epoch": 1.8915878291996933, + "grad_norm": 27.56613430720812, + "learning_rate": 3.6134317621582243e-06, + "loss": 2.3351, + "step": 22194 + }, + { + "epoch": 1.8916730588937187, + "grad_norm": 65.35927419811193, + "learning_rate": 3.612955368578746e-06, + "loss": 3.02, + "step": 22195 + }, + { + "epoch": 1.891758288587744, + "grad_norm": 54.55196627204728, + "learning_rate": 3.6124789886403e-06, + "loss": 3.4845, + "step": 22196 + }, + { + "epoch": 1.8918435182817692, + "grad_norm": 43.31572800754773, + "learning_rate": 3.61200262234757e-06, + "loss": 2.5335, + "step": 22197 + }, + { + "epoch": 1.8919287479757947, + "grad_norm": 99.64688651285077, + "learning_rate": 3.6115262697052455e-06, + "loss": 3.4054, + "step": 22198 + }, + { + "epoch": 1.8920139776698202, + "grad_norm": 89.23585491670052, + "learning_rate": 3.611049930718008e-06, + "loss": 2.5412, + "step": 22199 + }, + { + "epoch": 1.8920992073638456, + "grad_norm": 115.3851876526501, + "learning_rate": 3.6105736053905416e-06, + "loss": 3.0314, + "step": 22200 + }, + { + "epoch": 1.892184437057871, + "grad_norm": 29.480149247854598, + "learning_rate": 3.6100972937275304e-06, + "loss": 2.2471, + "step": 22201 + }, + { + "epoch": 1.8922696667518963, + "grad_norm": 47.52039227321279, + "learning_rate": 3.609620995733662e-06, + "loss": 3.1119, + "step": 22202 + }, + { + "epoch": 1.8923548964459218, + "grad_norm": 42.359224784675376, + "learning_rate": 3.6091447114136193e-06, + "loss": 3.2608, + "step": 22203 + }, + { + "epoch": 1.892440126139947, + "grad_norm": 34.92817675393603, + "learning_rate": 3.608668440772084e-06, + "loss": 1.8733, + "step": 22204 + }, + { + "epoch": 1.8925253558339725, + "grad_norm": 41.745313570016975, + "learning_rate": 3.608192183813743e-06, + "loss": 3.4409, + "step": 22205 + }, + { + "epoch": 1.892610585527998, + "grad_norm": 43.845387558729726, + "learning_rate": 3.6077159405432768e-06, + "loss": 1.602, + "step": 22206 + }, + { + "epoch": 1.8926958152220235, + "grad_norm": 60.953762284151814, + "learning_rate": 3.607239710965373e-06, + "loss": 2.836, + "step": 22207 + }, + { + "epoch": 1.8927810449160487, + "grad_norm": 43.014715594518606, + "learning_rate": 3.606763495084712e-06, + "loss": 2.9237, + "step": 22208 + }, + { + "epoch": 1.8928662746100742, + "grad_norm": 84.69768746281537, + "learning_rate": 3.60628729290598e-06, + "loss": 2.9136, + "step": 22209 + }, + { + "epoch": 1.8929515043040994, + "grad_norm": 114.85886837029328, + "learning_rate": 3.605811104433856e-06, + "loss": 3.5801, + "step": 22210 + }, + { + "epoch": 1.8930367339981249, + "grad_norm": 50.89927738761556, + "learning_rate": 3.605334929673029e-06, + "loss": 3.4599, + "step": 22211 + }, + { + "epoch": 1.8931219636921504, + "grad_norm": 34.82340168488987, + "learning_rate": 3.6048587686281765e-06, + "loss": 2.7908, + "step": 22212 + }, + { + "epoch": 1.8932071933861758, + "grad_norm": 25.700452297041103, + "learning_rate": 3.604382621303985e-06, + "loss": 2.0657, + "step": 22213 + }, + { + "epoch": 1.8932924230802013, + "grad_norm": 35.207025744973876, + "learning_rate": 3.603906487705134e-06, + "loss": 2.6725, + "step": 22214 + }, + { + "epoch": 1.8933776527742265, + "grad_norm": 46.56112358916684, + "learning_rate": 3.6034303678363098e-06, + "loss": 3.5546, + "step": 22215 + }, + { + "epoch": 1.8934628824682518, + "grad_norm": 68.44215408850151, + "learning_rate": 3.6029542617021916e-06, + "loss": 2.859, + "step": 22216 + }, + { + "epoch": 1.8935481121622773, + "grad_norm": 65.31036957047917, + "learning_rate": 3.602478169307465e-06, + "loss": 2.0766, + "step": 22217 + }, + { + "epoch": 1.8936333418563027, + "grad_norm": 80.3978871846041, + "learning_rate": 3.6020020906568077e-06, + "loss": 2.7223, + "step": 22218 + }, + { + "epoch": 1.8937185715503282, + "grad_norm": 29.281349885072608, + "learning_rate": 3.6015260257549067e-06, + "loss": 2.1689, + "step": 22219 + }, + { + "epoch": 1.8938038012443537, + "grad_norm": 61.539628433111936, + "learning_rate": 3.60104997460644e-06, + "loss": 3.7126, + "step": 22220 + }, + { + "epoch": 1.893889030938379, + "grad_norm": 74.35167260414077, + "learning_rate": 3.6005739372160926e-06, + "loss": 3.0145, + "step": 22221 + }, + { + "epoch": 1.8939742606324044, + "grad_norm": 63.158168888335666, + "learning_rate": 3.600097913588544e-06, + "loss": 3.2342, + "step": 22222 + }, + { + "epoch": 1.8940594903264296, + "grad_norm": 58.057311101905675, + "learning_rate": 3.599621903728474e-06, + "loss": 2.8878, + "step": 22223 + }, + { + "epoch": 1.894144720020455, + "grad_norm": 45.11824405609287, + "learning_rate": 3.5991459076405676e-06, + "loss": 2.5061, + "step": 22224 + }, + { + "epoch": 1.8942299497144806, + "grad_norm": 36.07845785801057, + "learning_rate": 3.5986699253295056e-06, + "loss": 2.5698, + "step": 22225 + }, + { + "epoch": 1.894315179408506, + "grad_norm": 41.31157361496873, + "learning_rate": 3.5981939567999678e-06, + "loss": 2.8255, + "step": 22226 + }, + { + "epoch": 1.8944004091025313, + "grad_norm": 83.28285293944101, + "learning_rate": 3.597718002056633e-06, + "loss": 3.7134, + "step": 22227 + }, + { + "epoch": 1.8944856387965567, + "grad_norm": 27.73762763796649, + "learning_rate": 3.597242061104186e-06, + "loss": 2.4769, + "step": 22228 + }, + { + "epoch": 1.894570868490582, + "grad_norm": 31.005679080982894, + "learning_rate": 3.596766133947307e-06, + "loss": 2.145, + "step": 22229 + }, + { + "epoch": 1.8946560981846075, + "grad_norm": 28.076263131506636, + "learning_rate": 3.5962902205906745e-06, + "loss": 2.1865, + "step": 22230 + }, + { + "epoch": 1.894741327878633, + "grad_norm": 73.07004503340532, + "learning_rate": 3.595814321038969e-06, + "loss": 3.523, + "step": 22231 + }, + { + "epoch": 1.8948265575726584, + "grad_norm": 39.37445398786455, + "learning_rate": 3.5953384352968722e-06, + "loss": 2.402, + "step": 22232 + }, + { + "epoch": 1.8949117872666839, + "grad_norm": 67.98043486656655, + "learning_rate": 3.594862563369065e-06, + "loss": 2.5372, + "step": 22233 + }, + { + "epoch": 1.894997016960709, + "grad_norm": 74.31048880073698, + "learning_rate": 3.594386705260226e-06, + "loss": 3.541, + "step": 22234 + }, + { + "epoch": 1.8950822466547343, + "grad_norm": 46.3808810623369, + "learning_rate": 3.593910860975034e-06, + "loss": 1.8113, + "step": 22235 + }, + { + "epoch": 1.8951674763487598, + "grad_norm": 45.13694371637341, + "learning_rate": 3.593435030518169e-06, + "loss": 2.391, + "step": 22236 + }, + { + "epoch": 1.8952527060427853, + "grad_norm": 24.489049362545128, + "learning_rate": 3.592959213894314e-06, + "loss": 1.2893, + "step": 22237 + }, + { + "epoch": 1.8953379357368108, + "grad_norm": 45.309312960848644, + "learning_rate": 3.5924834111081458e-06, + "loss": 2.6253, + "step": 22238 + }, + { + "epoch": 1.8954231654308362, + "grad_norm": 120.93549151971438, + "learning_rate": 3.592007622164343e-06, + "loss": 2.897, + "step": 22239 + }, + { + "epoch": 1.8955083951248615, + "grad_norm": 35.4595616534309, + "learning_rate": 3.591531847067584e-06, + "loss": 2.1156, + "step": 22240 + }, + { + "epoch": 1.895593624818887, + "grad_norm": 49.822841117192894, + "learning_rate": 3.591056085822553e-06, + "loss": 3.1484, + "step": 22241 + }, + { + "epoch": 1.8956788545129122, + "grad_norm": 55.91050706066942, + "learning_rate": 3.5905803384339244e-06, + "loss": 2.6653, + "step": 22242 + }, + { + "epoch": 1.8957640842069376, + "grad_norm": 30.47622405562062, + "learning_rate": 3.5901046049063777e-06, + "loss": 2.4412, + "step": 22243 + }, + { + "epoch": 1.8958493139009631, + "grad_norm": 97.51912586701204, + "learning_rate": 3.5896288852445905e-06, + "loss": 3.7853, + "step": 22244 + }, + { + "epoch": 1.8959345435949886, + "grad_norm": 34.21890480541153, + "learning_rate": 3.589153179453245e-06, + "loss": 2.6614, + "step": 22245 + }, + { + "epoch": 1.896019773289014, + "grad_norm": 49.08684286498218, + "learning_rate": 3.588677487537018e-06, + "loss": 2.1208, + "step": 22246 + }, + { + "epoch": 1.8961050029830393, + "grad_norm": 49.629511366632826, + "learning_rate": 3.5882018095005854e-06, + "loss": 2.6469, + "step": 22247 + }, + { + "epoch": 1.8961902326770645, + "grad_norm": 35.942477672454466, + "learning_rate": 3.587726145348628e-06, + "loss": 3.3021, + "step": 22248 + }, + { + "epoch": 1.89627546237109, + "grad_norm": 49.12584052158346, + "learning_rate": 3.587250495085821e-06, + "loss": 1.8795, + "step": 22249 + }, + { + "epoch": 1.8963606920651155, + "grad_norm": 274.73591361623136, + "learning_rate": 3.586774858716846e-06, + "loss": 3.0607, + "step": 22250 + }, + { + "epoch": 1.896445921759141, + "grad_norm": 92.35694953384457, + "learning_rate": 3.586299236246378e-06, + "loss": 3.4898, + "step": 22251 + }, + { + "epoch": 1.8965311514531664, + "grad_norm": 71.94908274537919, + "learning_rate": 3.585823627679096e-06, + "loss": 4.0448, + "step": 22252 + }, + { + "epoch": 1.8966163811471917, + "grad_norm": 33.313075809728986, + "learning_rate": 3.5853480330196743e-06, + "loss": 2.2505, + "step": 22253 + }, + { + "epoch": 1.8967016108412171, + "grad_norm": 58.55733320388543, + "learning_rate": 3.5848724522727952e-06, + "loss": 2.9446, + "step": 22254 + }, + { + "epoch": 1.8967868405352424, + "grad_norm": 37.060526027458344, + "learning_rate": 3.5843968854431316e-06, + "loss": 2.0738, + "step": 22255 + }, + { + "epoch": 1.8968720702292678, + "grad_norm": 64.31026437868934, + "learning_rate": 3.5839213325353634e-06, + "loss": 2.8058, + "step": 22256 + }, + { + "epoch": 1.8969572999232933, + "grad_norm": 46.85107880077441, + "learning_rate": 3.5834457935541644e-06, + "loss": 2.6714, + "step": 22257 + }, + { + "epoch": 1.8970425296173188, + "grad_norm": 74.43734486272507, + "learning_rate": 3.5829702685042145e-06, + "loss": 2.8087, + "step": 22258 + }, + { + "epoch": 1.897127759311344, + "grad_norm": 55.26459203615092, + "learning_rate": 3.5824947573901886e-06, + "loss": 2.6413, + "step": 22259 + }, + { + "epoch": 1.8972129890053695, + "grad_norm": 45.60952355432054, + "learning_rate": 3.582019260216765e-06, + "loss": 2.9862, + "step": 22260 + }, + { + "epoch": 1.8972982186993947, + "grad_norm": 38.43903099014142, + "learning_rate": 3.5815437769886153e-06, + "loss": 3.1261, + "step": 22261 + }, + { + "epoch": 1.8973834483934202, + "grad_norm": 33.9740134360895, + "learning_rate": 3.581068307710421e-06, + "loss": 3.1475, + "step": 22262 + }, + { + "epoch": 1.8974686780874457, + "grad_norm": 65.87745763164213, + "learning_rate": 3.580592852386856e-06, + "loss": 2.3362, + "step": 22263 + }, + { + "epoch": 1.8975539077814711, + "grad_norm": 35.31972220424051, + "learning_rate": 3.5801174110225967e-06, + "loss": 2.6449, + "step": 22264 + }, + { + "epoch": 1.8976391374754966, + "grad_norm": 78.41542885634908, + "learning_rate": 3.5796419836223184e-06, + "loss": 3.4782, + "step": 22265 + }, + { + "epoch": 1.8977243671695219, + "grad_norm": 52.99279190026608, + "learning_rate": 3.579166570190695e-06, + "loss": 2.7464, + "step": 22266 + }, + { + "epoch": 1.897809596863547, + "grad_norm": 44.74575266418678, + "learning_rate": 3.5786911707324045e-06, + "loss": 3.0744, + "step": 22267 + }, + { + "epoch": 1.8978948265575726, + "grad_norm": 55.81697009357572, + "learning_rate": 3.5782157852521226e-06, + "loss": 3.3462, + "step": 22268 + }, + { + "epoch": 1.897980056251598, + "grad_norm": 50.834607584443745, + "learning_rate": 3.5777404137545243e-06, + "loss": 3.4955, + "step": 22269 + }, + { + "epoch": 1.8980652859456235, + "grad_norm": 48.401525584841664, + "learning_rate": 3.5772650562442812e-06, + "loss": 2.5767, + "step": 22270 + }, + { + "epoch": 1.898150515639649, + "grad_norm": 42.30112761501123, + "learning_rate": 3.576789712726072e-06, + "loss": 2.9447, + "step": 22271 + }, + { + "epoch": 1.8982357453336742, + "grad_norm": 52.190664814866416, + "learning_rate": 3.576314383204571e-06, + "loss": 2.8433, + "step": 22272 + }, + { + "epoch": 1.8983209750276997, + "grad_norm": 58.733182543652454, + "learning_rate": 3.5758390676844527e-06, + "loss": 2.5341, + "step": 22273 + }, + { + "epoch": 1.898406204721725, + "grad_norm": 97.42904909132841, + "learning_rate": 3.5753637661703893e-06, + "loss": 2.7327, + "step": 22274 + }, + { + "epoch": 1.8984914344157504, + "grad_norm": 59.098970858985105, + "learning_rate": 3.5748884786670586e-06, + "loss": 2.4559, + "step": 22275 + }, + { + "epoch": 1.8985766641097759, + "grad_norm": 31.437346058642717, + "learning_rate": 3.5744132051791335e-06, + "loss": 3.025, + "step": 22276 + }, + { + "epoch": 1.8986618938038013, + "grad_norm": 35.06250046974707, + "learning_rate": 3.573937945711289e-06, + "loss": 2.0871, + "step": 22277 + }, + { + "epoch": 1.8987471234978266, + "grad_norm": 73.30718127708452, + "learning_rate": 3.5734627002681964e-06, + "loss": 3.3469, + "step": 22278 + }, + { + "epoch": 1.898832353191852, + "grad_norm": 28.967239653882, + "learning_rate": 3.5729874688545307e-06, + "loss": 2.4628, + "step": 22279 + }, + { + "epoch": 1.8989175828858773, + "grad_norm": 47.838287802049585, + "learning_rate": 3.5725122514749684e-06, + "loss": 3.2443, + "step": 22280 + }, + { + "epoch": 1.8990028125799028, + "grad_norm": 41.495748839800584, + "learning_rate": 3.5720370481341806e-06, + "loss": 3.4167, + "step": 22281 + }, + { + "epoch": 1.8990880422739282, + "grad_norm": 70.27480672293055, + "learning_rate": 3.5715618588368395e-06, + "loss": 2.1649, + "step": 22282 + }, + { + "epoch": 1.8991732719679537, + "grad_norm": 37.112984168629666, + "learning_rate": 3.5710866835876203e-06, + "loss": 2.3816, + "step": 22283 + }, + { + "epoch": 1.8992585016619792, + "grad_norm": 40.56484452428299, + "learning_rate": 3.5706115223911965e-06, + "loss": 2.5985, + "step": 22284 + }, + { + "epoch": 1.8993437313560044, + "grad_norm": 68.09963421988348, + "learning_rate": 3.570136375252241e-06, + "loss": 2.9411, + "step": 22285 + }, + { + "epoch": 1.8994289610500297, + "grad_norm": 50.01053429646975, + "learning_rate": 3.5696612421754244e-06, + "loss": 2.7351, + "step": 22286 + }, + { + "epoch": 1.8995141907440551, + "grad_norm": 39.46480514973493, + "learning_rate": 3.5691861231654207e-06, + "loss": 2.4136, + "step": 22287 + }, + { + "epoch": 1.8995994204380806, + "grad_norm": 73.80266915860464, + "learning_rate": 3.5687110182269046e-06, + "loss": 2.5221, + "step": 22288 + }, + { + "epoch": 1.899684650132106, + "grad_norm": 36.91995533641443, + "learning_rate": 3.5682359273645473e-06, + "loss": 2.773, + "step": 22289 + }, + { + "epoch": 1.8997698798261315, + "grad_norm": 71.18770384932115, + "learning_rate": 3.5677608505830196e-06, + "loss": 2.9766, + "step": 22290 + }, + { + "epoch": 1.8998551095201568, + "grad_norm": 29.233860310717976, + "learning_rate": 3.5672857878869938e-06, + "loss": 2.61, + "step": 22291 + }, + { + "epoch": 1.8999403392141823, + "grad_norm": 69.37759064646981, + "learning_rate": 3.5668107392811446e-06, + "loss": 3.3442, + "step": 22292 + }, + { + "epoch": 1.9000255689082075, + "grad_norm": 56.76132767292622, + "learning_rate": 3.566335704770143e-06, + "loss": 2.4674, + "step": 22293 + }, + { + "epoch": 1.900110798602233, + "grad_norm": 37.85297991668546, + "learning_rate": 3.5658606843586584e-06, + "loss": 2.8838, + "step": 22294 + }, + { + "epoch": 1.9001960282962584, + "grad_norm": 43.22936722861652, + "learning_rate": 3.565385678051366e-06, + "loss": 3.7169, + "step": 22295 + }, + { + "epoch": 1.900281257990284, + "grad_norm": 45.4019813749848, + "learning_rate": 3.5649106858529326e-06, + "loss": 3.0621, + "step": 22296 + }, + { + "epoch": 1.9003664876843092, + "grad_norm": 74.59089341525689, + "learning_rate": 3.564435707768034e-06, + "loss": 3.4388, + "step": 22297 + }, + { + "epoch": 1.9004517173783346, + "grad_norm": 45.26900004993782, + "learning_rate": 3.5639607438013394e-06, + "loss": 2.1104, + "step": 22298 + }, + { + "epoch": 1.9005369470723599, + "grad_norm": 66.12090209975366, + "learning_rate": 3.563485793957521e-06, + "loss": 1.7707, + "step": 22299 + }, + { + "epoch": 1.9006221767663853, + "grad_norm": 37.59320989964681, + "learning_rate": 3.563010858241247e-06, + "loss": 2.8219, + "step": 22300 + }, + { + "epoch": 1.9007074064604108, + "grad_norm": 45.66700742329917, + "learning_rate": 3.562535936657193e-06, + "loss": 2.6905, + "step": 22301 + }, + { + "epoch": 1.9007926361544363, + "grad_norm": 76.68231371800624, + "learning_rate": 3.562061029210025e-06, + "loss": 2.3895, + "step": 22302 + }, + { + "epoch": 1.9008778658484617, + "grad_norm": 26.501212471720113, + "learning_rate": 3.561586135904417e-06, + "loss": 2.0857, + "step": 22303 + }, + { + "epoch": 1.900963095542487, + "grad_norm": 41.303922012340344, + "learning_rate": 3.561111256745035e-06, + "loss": 2.7202, + "step": 22304 + }, + { + "epoch": 1.9010483252365122, + "grad_norm": 47.44803170396075, + "learning_rate": 3.560636391736555e-06, + "loss": 2.844, + "step": 22305 + }, + { + "epoch": 1.9011335549305377, + "grad_norm": 32.29610983605702, + "learning_rate": 3.560161540883642e-06, + "loss": 2.3366, + "step": 22306 + }, + { + "epoch": 1.9012187846245632, + "grad_norm": 46.677585389534904, + "learning_rate": 3.55968670419097e-06, + "loss": 2.9176, + "step": 22307 + }, + { + "epoch": 1.9013040143185886, + "grad_norm": 28.551096369609564, + "learning_rate": 3.5592118816632064e-06, + "loss": 1.4693, + "step": 22308 + }, + { + "epoch": 1.901389244012614, + "grad_norm": 71.72564728275395, + "learning_rate": 3.5587370733050196e-06, + "loss": 2.4243, + "step": 22309 + }, + { + "epoch": 1.9014744737066394, + "grad_norm": 57.39420746728321, + "learning_rate": 3.558262279121082e-06, + "loss": 2.4542, + "step": 22310 + }, + { + "epoch": 1.9015597034006648, + "grad_norm": 57.690024677836675, + "learning_rate": 3.557787499116063e-06, + "loss": 3.1272, + "step": 22311 + }, + { + "epoch": 1.90164493309469, + "grad_norm": 61.76073203598115, + "learning_rate": 3.5573127332946312e-06, + "loss": 2.4417, + "step": 22312 + }, + { + "epoch": 1.9017301627887155, + "grad_norm": 92.50802523567336, + "learning_rate": 3.5568379816614527e-06, + "loss": 3.8542, + "step": 22313 + }, + { + "epoch": 1.901815392482741, + "grad_norm": 50.99253507126252, + "learning_rate": 3.556363244221201e-06, + "loss": 2.5167, + "step": 22314 + }, + { + "epoch": 1.9019006221767665, + "grad_norm": 35.073411056557575, + "learning_rate": 3.5558885209785437e-06, + "loss": 3.1309, + "step": 22315 + }, + { + "epoch": 1.901985851870792, + "grad_norm": 32.9644958731241, + "learning_rate": 3.5554138119381492e-06, + "loss": 2.8269, + "step": 22316 + }, + { + "epoch": 1.9020710815648172, + "grad_norm": 84.40465071231006, + "learning_rate": 3.554939117104683e-06, + "loss": 2.5227, + "step": 22317 + }, + { + "epoch": 1.9021563112588424, + "grad_norm": 33.02758576480544, + "learning_rate": 3.5544644364828183e-06, + "loss": 2.4816, + "step": 22318 + }, + { + "epoch": 1.902241540952868, + "grad_norm": 38.579736938338094, + "learning_rate": 3.5539897700772224e-06, + "loss": 2.8983, + "step": 22319 + }, + { + "epoch": 1.9023267706468934, + "grad_norm": 43.02113391513055, + "learning_rate": 3.5535151178925627e-06, + "loss": 2.9354, + "step": 22320 + }, + { + "epoch": 1.9024120003409188, + "grad_norm": 63.57509869995193, + "learning_rate": 3.5530404799335054e-06, + "loss": 3.0244, + "step": 22321 + }, + { + "epoch": 1.9024972300349443, + "grad_norm": 63.669011646416685, + "learning_rate": 3.5525658562047193e-06, + "loss": 2.6748, + "step": 22322 + }, + { + "epoch": 1.9025824597289696, + "grad_norm": 86.88198061711182, + "learning_rate": 3.552091246710875e-06, + "loss": 4.1666, + "step": 22323 + }, + { + "epoch": 1.902667689422995, + "grad_norm": 39.501912502300115, + "learning_rate": 3.5516166514566375e-06, + "loss": 3.2531, + "step": 22324 + }, + { + "epoch": 1.9027529191170203, + "grad_norm": 59.552232554056125, + "learning_rate": 3.551142070446674e-06, + "loss": 2.3803, + "step": 22325 + }, + { + "epoch": 1.9028381488110457, + "grad_norm": 65.42561839119068, + "learning_rate": 3.550667503685652e-06, + "loss": 3.6126, + "step": 22326 + }, + { + "epoch": 1.9029233785050712, + "grad_norm": 53.38841586409712, + "learning_rate": 3.5501929511782406e-06, + "loss": 2.8805, + "step": 22327 + }, + { + "epoch": 1.9030086081990967, + "grad_norm": 79.25857364787562, + "learning_rate": 3.5497184129291053e-06, + "loss": 3.191, + "step": 22328 + }, + { + "epoch": 1.903093837893122, + "grad_norm": 60.77298797631213, + "learning_rate": 3.5492438889429126e-06, + "loss": 2.4265, + "step": 22329 + }, + { + "epoch": 1.9031790675871474, + "grad_norm": 96.25032543952302, + "learning_rate": 3.5487693792243284e-06, + "loss": 2.2985, + "step": 22330 + }, + { + "epoch": 1.9032642972811726, + "grad_norm": 45.112373172646954, + "learning_rate": 3.5482948837780228e-06, + "loss": 3.328, + "step": 22331 + }, + { + "epoch": 1.903349526975198, + "grad_norm": 48.2034174589778, + "learning_rate": 3.54782040260866e-06, + "loss": 2.5268, + "step": 22332 + }, + { + "epoch": 1.9034347566692236, + "grad_norm": 58.085609397389554, + "learning_rate": 3.5473459357209057e-06, + "loss": 2.6284, + "step": 22333 + }, + { + "epoch": 1.903519986363249, + "grad_norm": 74.64557814521217, + "learning_rate": 3.5468714831194256e-06, + "loss": 1.7845, + "step": 22334 + }, + { + "epoch": 1.9036052160572745, + "grad_norm": 50.99044268718252, + "learning_rate": 3.5463970448088893e-06, + "loss": 2.9517, + "step": 22335 + }, + { + "epoch": 1.9036904457512998, + "grad_norm": 42.97065217265939, + "learning_rate": 3.5459226207939605e-06, + "loss": 3.3121, + "step": 22336 + }, + { + "epoch": 1.903775675445325, + "grad_norm": 39.816968796181214, + "learning_rate": 3.5454482110793044e-06, + "loss": 2.8101, + "step": 22337 + }, + { + "epoch": 1.9038609051393505, + "grad_norm": 42.712040944458515, + "learning_rate": 3.5449738156695875e-06, + "loss": 3.0029, + "step": 22338 + }, + { + "epoch": 1.903946134833376, + "grad_norm": 48.971557976148944, + "learning_rate": 3.544499434569473e-06, + "loss": 3.0982, + "step": 22339 + }, + { + "epoch": 1.9040313645274014, + "grad_norm": 55.493094509111714, + "learning_rate": 3.5440250677836307e-06, + "loss": 3.0536, + "step": 22340 + }, + { + "epoch": 1.9041165942214269, + "grad_norm": 33.33884820950505, + "learning_rate": 3.5435507153167216e-06, + "loss": 2.5074, + "step": 22341 + }, + { + "epoch": 1.9042018239154521, + "grad_norm": 36.87793526717135, + "learning_rate": 3.543076377173414e-06, + "loss": 2.477, + "step": 22342 + }, + { + "epoch": 1.9042870536094776, + "grad_norm": 22.22711408391591, + "learning_rate": 3.542602053358369e-06, + "loss": 1.8176, + "step": 22343 + }, + { + "epoch": 1.9043722833035028, + "grad_norm": 48.62799229116532, + "learning_rate": 3.542127743876256e-06, + "loss": 2.1769, + "step": 22344 + }, + { + "epoch": 1.9044575129975283, + "grad_norm": 86.22076812552007, + "learning_rate": 3.5416534487317357e-06, + "loss": 2.9153, + "step": 22345 + }, + { + "epoch": 1.9045427426915538, + "grad_norm": 47.659395877738305, + "learning_rate": 3.541179167929476e-06, + "loss": 2.4424, + "step": 22346 + }, + { + "epoch": 1.9046279723855792, + "grad_norm": 72.31437886157791, + "learning_rate": 3.540704901474137e-06, + "loss": 3.1597, + "step": 22347 + }, + { + "epoch": 1.9047132020796045, + "grad_norm": 35.68382045465277, + "learning_rate": 3.5402306493703877e-06, + "loss": 2.5755, + "step": 22348 + }, + { + "epoch": 1.90479843177363, + "grad_norm": 58.510752484209135, + "learning_rate": 3.539756411622889e-06, + "loss": 3.3018, + "step": 22349 + }, + { + "epoch": 1.9048836614676552, + "grad_norm": 59.22002187746703, + "learning_rate": 3.5392821882363064e-06, + "loss": 2.9458, + "step": 22350 + }, + { + "epoch": 1.9049688911616807, + "grad_norm": 98.04322471363234, + "learning_rate": 3.538807979215303e-06, + "loss": 4.3351, + "step": 22351 + }, + { + "epoch": 1.9050541208557061, + "grad_norm": 36.07504983425633, + "learning_rate": 3.538333784564541e-06, + "loss": 2.467, + "step": 22352 + }, + { + "epoch": 1.9051393505497316, + "grad_norm": 50.297426784826676, + "learning_rate": 3.537859604288686e-06, + "loss": 2.8812, + "step": 22353 + }, + { + "epoch": 1.905224580243757, + "grad_norm": 51.103341317146956, + "learning_rate": 3.537385438392402e-06, + "loss": 2.7038, + "step": 22354 + }, + { + "epoch": 1.9053098099377823, + "grad_norm": 71.31897338625485, + "learning_rate": 3.536911286880351e-06, + "loss": 3.4751, + "step": 22355 + }, + { + "epoch": 1.9053950396318076, + "grad_norm": 87.48464755440658, + "learning_rate": 3.536437149757194e-06, + "loss": 3.3399, + "step": 22356 + }, + { + "epoch": 1.905480269325833, + "grad_norm": 53.76507290037302, + "learning_rate": 3.5359630270275967e-06, + "loss": 2.89, + "step": 22357 + }, + { + "epoch": 1.9055654990198585, + "grad_norm": 39.384725466109494, + "learning_rate": 3.535488918696223e-06, + "loss": 2.7771, + "step": 22358 + }, + { + "epoch": 1.905650728713884, + "grad_norm": 37.45244527533216, + "learning_rate": 3.535014824767734e-06, + "loss": 2.7644, + "step": 22359 + }, + { + "epoch": 1.9057359584079094, + "grad_norm": 50.39218637730962, + "learning_rate": 3.5345407452467895e-06, + "loss": 2.1891, + "step": 22360 + }, + { + "epoch": 1.9058211881019347, + "grad_norm": 65.64097321631719, + "learning_rate": 3.5340666801380554e-06, + "loss": 2.5746, + "step": 22361 + }, + { + "epoch": 1.9059064177959602, + "grad_norm": 46.564342373664836, + "learning_rate": 3.5335926294461943e-06, + "loss": 2.7258, + "step": 22362 + }, + { + "epoch": 1.9059916474899854, + "grad_norm": 66.31593180316258, + "learning_rate": 3.533118593175867e-06, + "loss": 3.0904, + "step": 22363 + }, + { + "epoch": 1.9060768771840109, + "grad_norm": 60.807693491891655, + "learning_rate": 3.5326445713317335e-06, + "loss": 3.4064, + "step": 22364 + }, + { + "epoch": 1.9061621068780363, + "grad_norm": 44.93591953535416, + "learning_rate": 3.5321705639184585e-06, + "loss": 3.1589, + "step": 22365 + }, + { + "epoch": 1.9062473365720618, + "grad_norm": 74.26531716770164, + "learning_rate": 3.5316965709407034e-06, + "loss": 3.8378, + "step": 22366 + }, + { + "epoch": 1.9063325662660873, + "grad_norm": 65.47781906832515, + "learning_rate": 3.5312225924031296e-06, + "loss": 2.2251, + "step": 22367 + }, + { + "epoch": 1.9064177959601125, + "grad_norm": 42.77853197560557, + "learning_rate": 3.5307486283103966e-06, + "loss": 2.8834, + "step": 22368 + }, + { + "epoch": 1.9065030256541378, + "grad_norm": 69.54121602731666, + "learning_rate": 3.530274678667166e-06, + "loss": 2.9806, + "step": 22369 + }, + { + "epoch": 1.9065882553481632, + "grad_norm": 33.242861331780205, + "learning_rate": 3.529800743478102e-06, + "loss": 2.2316, + "step": 22370 + }, + { + "epoch": 1.9066734850421887, + "grad_norm": 73.65126682593352, + "learning_rate": 3.529326822747864e-06, + "loss": 2.6634, + "step": 22371 + }, + { + "epoch": 1.9067587147362142, + "grad_norm": 61.37197827761617, + "learning_rate": 3.52885291648111e-06, + "loss": 2.9446, + "step": 22372 + }, + { + "epoch": 1.9068439444302396, + "grad_norm": 29.177415966543805, + "learning_rate": 3.5283790246825033e-06, + "loss": 2.3995, + "step": 22373 + }, + { + "epoch": 1.9069291741242649, + "grad_norm": 32.67474974707389, + "learning_rate": 3.527905147356706e-06, + "loss": 2.4055, + "step": 22374 + }, + { + "epoch": 1.9070144038182901, + "grad_norm": 44.98469995574832, + "learning_rate": 3.527431284508376e-06, + "loss": 3.4461, + "step": 22375 + }, + { + "epoch": 1.9070996335123156, + "grad_norm": 33.321489723263596, + "learning_rate": 3.5269574361421733e-06, + "loss": 2.3807, + "step": 22376 + }, + { + "epoch": 1.907184863206341, + "grad_norm": 41.98494944402029, + "learning_rate": 3.526483602262758e-06, + "loss": 3.1037, + "step": 22377 + }, + { + "epoch": 1.9072700929003665, + "grad_norm": 118.28229093619903, + "learning_rate": 3.526009782874793e-06, + "loss": 4.0095, + "step": 22378 + }, + { + "epoch": 1.907355322594392, + "grad_norm": 103.34552399221603, + "learning_rate": 3.5255359779829368e-06, + "loss": 3.8351, + "step": 22379 + }, + { + "epoch": 1.9074405522884172, + "grad_norm": 71.06038572910244, + "learning_rate": 3.525062187591846e-06, + "loss": 2.9978, + "step": 22380 + }, + { + "epoch": 1.9075257819824427, + "grad_norm": 62.89291151281808, + "learning_rate": 3.5245884117061846e-06, + "loss": 3.7584, + "step": 22381 + }, + { + "epoch": 1.907611011676468, + "grad_norm": 38.1420806702347, + "learning_rate": 3.5241146503306074e-06, + "loss": 3.042, + "step": 22382 + }, + { + "epoch": 1.9076962413704934, + "grad_norm": 62.031895501162516, + "learning_rate": 3.5236409034697783e-06, + "loss": 3.2167, + "step": 22383 + }, + { + "epoch": 1.907781471064519, + "grad_norm": 77.76382113137724, + "learning_rate": 3.5231671711283535e-06, + "loss": 3.1059, + "step": 22384 + }, + { + "epoch": 1.9078667007585444, + "grad_norm": 34.53841538932813, + "learning_rate": 3.522693453310993e-06, + "loss": 1.5679, + "step": 22385 + }, + { + "epoch": 1.9079519304525698, + "grad_norm": 58.855267318889815, + "learning_rate": 3.522219750022354e-06, + "loss": 3.0612, + "step": 22386 + }, + { + "epoch": 1.908037160146595, + "grad_norm": 34.74631506474398, + "learning_rate": 3.5217460612670977e-06, + "loss": 2.9624, + "step": 22387 + }, + { + "epoch": 1.9081223898406203, + "grad_norm": 117.80083285101429, + "learning_rate": 3.521272387049881e-06, + "loss": 2.7123, + "step": 22388 + }, + { + "epoch": 1.9082076195346458, + "grad_norm": 81.10799729988176, + "learning_rate": 3.5207987273753643e-06, + "loss": 2.9932, + "step": 22389 + }, + { + "epoch": 1.9082928492286713, + "grad_norm": 42.71039135835205, + "learning_rate": 3.5203250822482015e-06, + "loss": 2.6111, + "step": 22390 + }, + { + "epoch": 1.9083780789226967, + "grad_norm": 50.83942584637926, + "learning_rate": 3.5198514516730563e-06, + "loss": 2.4218, + "step": 22391 + }, + { + "epoch": 1.9084633086167222, + "grad_norm": 43.78744217615663, + "learning_rate": 3.5193778356545827e-06, + "loss": 2.948, + "step": 22392 + }, + { + "epoch": 1.9085485383107474, + "grad_norm": 50.54717739359561, + "learning_rate": 3.5189042341974406e-06, + "loss": 2.863, + "step": 22393 + }, + { + "epoch": 1.908633768004773, + "grad_norm": 67.73501857801506, + "learning_rate": 3.5184306473062866e-06, + "loss": 3.3534, + "step": 22394 + }, + { + "epoch": 1.9087189976987982, + "grad_norm": 43.40835692913023, + "learning_rate": 3.5179570749857773e-06, + "loss": 2.7768, + "step": 22395 + }, + { + "epoch": 1.9088042273928236, + "grad_norm": 25.128807956419706, + "learning_rate": 3.5174835172405714e-06, + "loss": 1.9726, + "step": 22396 + }, + { + "epoch": 1.908889457086849, + "grad_norm": 34.010873705515095, + "learning_rate": 3.517009974075327e-06, + "loss": 2.6234, + "step": 22397 + }, + { + "epoch": 1.9089746867808746, + "grad_norm": 47.87422132045545, + "learning_rate": 3.516536445494701e-06, + "loss": 3.2859, + "step": 22398 + }, + { + "epoch": 1.9090599164748998, + "grad_norm": 52.11004926309001, + "learning_rate": 3.5160629315033468e-06, + "loss": 2.5773, + "step": 22399 + }, + { + "epoch": 1.9091451461689253, + "grad_norm": 45.70018216108891, + "learning_rate": 3.5155894321059258e-06, + "loss": 3.3062, + "step": 22400 + }, + { + "epoch": 1.9092303758629505, + "grad_norm": 45.740087544161355, + "learning_rate": 3.515115947307093e-06, + "loss": 2.0552, + "step": 22401 + }, + { + "epoch": 1.909315605556976, + "grad_norm": 27.175895322705927, + "learning_rate": 3.514642477111505e-06, + "loss": 1.9365, + "step": 22402 + }, + { + "epoch": 1.9094008352510015, + "grad_norm": 961.0449248007086, + "learning_rate": 3.5141690215238165e-06, + "loss": 1.7754, + "step": 22403 + }, + { + "epoch": 1.909486064945027, + "grad_norm": 49.50434692109111, + "learning_rate": 3.513695580548686e-06, + "loss": 2.1444, + "step": 22404 + }, + { + "epoch": 1.9095712946390524, + "grad_norm": 49.72081966527185, + "learning_rate": 3.5132221541907702e-06, + "loss": 3.3367, + "step": 22405 + }, + { + "epoch": 1.9096565243330776, + "grad_norm": 58.33213870577402, + "learning_rate": 3.5127487424547235e-06, + "loss": 2.9984, + "step": 22406 + }, + { + "epoch": 1.909741754027103, + "grad_norm": 30.54246259979958, + "learning_rate": 3.5122753453451994e-06, + "loss": 2.0621, + "step": 22407 + }, + { + "epoch": 1.9098269837211284, + "grad_norm": 34.48775893715524, + "learning_rate": 3.5118019628668574e-06, + "loss": 2.6063, + "step": 22408 + }, + { + "epoch": 1.9099122134151538, + "grad_norm": 46.690069219083156, + "learning_rate": 3.5113285950243537e-06, + "loss": 2.4668, + "step": 22409 + }, + { + "epoch": 1.9099974431091793, + "grad_norm": 34.7431532371028, + "learning_rate": 3.5108552418223405e-06, + "loss": 1.9348, + "step": 22410 + }, + { + "epoch": 1.9100826728032048, + "grad_norm": 50.37714331139301, + "learning_rate": 3.510381903265474e-06, + "loss": 3.2148, + "step": 22411 + }, + { + "epoch": 1.91016790249723, + "grad_norm": 87.89338362473067, + "learning_rate": 3.509908579358408e-06, + "loss": 3.3058, + "step": 22412 + }, + { + "epoch": 1.9102531321912555, + "grad_norm": 42.23857579435014, + "learning_rate": 3.5094352701058017e-06, + "loss": 3.1182, + "step": 22413 + }, + { + "epoch": 1.9103383618852807, + "grad_norm": 46.72797247349046, + "learning_rate": 3.5089619755123067e-06, + "loss": 2.0631, + "step": 22414 + }, + { + "epoch": 1.9104235915793062, + "grad_norm": 39.41870382015389, + "learning_rate": 3.508488695582577e-06, + "loss": 2.674, + "step": 22415 + }, + { + "epoch": 1.9105088212733317, + "grad_norm": 36.23986833752502, + "learning_rate": 3.5080154303212677e-06, + "loss": 3.0331, + "step": 22416 + }, + { + "epoch": 1.9105940509673571, + "grad_norm": 74.38315662739217, + "learning_rate": 3.5075421797330355e-06, + "loss": 3.5334, + "step": 22417 + }, + { + "epoch": 1.9106792806613824, + "grad_norm": 39.786653481028594, + "learning_rate": 3.5070689438225337e-06, + "loss": 2.3164, + "step": 22418 + }, + { + "epoch": 1.9107645103554078, + "grad_norm": 37.28262857358552, + "learning_rate": 3.506595722594414e-06, + "loss": 1.6833, + "step": 22419 + }, + { + "epoch": 1.910849740049433, + "grad_norm": 53.16869368092541, + "learning_rate": 3.5061225160533306e-06, + "loss": 3.5631, + "step": 22420 + }, + { + "epoch": 1.9109349697434586, + "grad_norm": 17.506100231213928, + "learning_rate": 3.5056493242039415e-06, + "loss": 1.5016, + "step": 22421 + }, + { + "epoch": 1.911020199437484, + "grad_norm": 55.977795906983616, + "learning_rate": 3.5051761470508964e-06, + "loss": 2.4249, + "step": 22422 + }, + { + "epoch": 1.9111054291315095, + "grad_norm": 51.097896559274034, + "learning_rate": 3.50470298459885e-06, + "loss": 2.1123, + "step": 22423 + }, + { + "epoch": 1.911190658825535, + "grad_norm": 53.817781451424544, + "learning_rate": 3.504229836852456e-06, + "loss": 3.481, + "step": 22424 + }, + { + "epoch": 1.9112758885195602, + "grad_norm": 37.40850775670952, + "learning_rate": 3.5037567038163657e-06, + "loss": 2.6096, + "step": 22425 + }, + { + "epoch": 1.9113611182135855, + "grad_norm": 64.57641498404178, + "learning_rate": 3.5032835854952352e-06, + "loss": 2.3264, + "step": 22426 + }, + { + "epoch": 1.911446347907611, + "grad_norm": 46.29564472840153, + "learning_rate": 3.5028104818937146e-06, + "loss": 3.1992, + "step": 22427 + }, + { + "epoch": 1.9115315776016364, + "grad_norm": 32.83640066760201, + "learning_rate": 3.5023373930164595e-06, + "loss": 1.2396, + "step": 22428 + }, + { + "epoch": 1.9116168072956619, + "grad_norm": 36.6312021203797, + "learning_rate": 3.501864318868119e-06, + "loss": 2.6182, + "step": 22429 + }, + { + "epoch": 1.9117020369896873, + "grad_norm": 32.477992704517234, + "learning_rate": 3.5013912594533496e-06, + "loss": 2.3615, + "step": 22430 + }, + { + "epoch": 1.9117872666837126, + "grad_norm": 99.90340178877004, + "learning_rate": 3.5009182147768e-06, + "loss": 2.2921, + "step": 22431 + }, + { + "epoch": 1.911872496377738, + "grad_norm": 26.049708552806383, + "learning_rate": 3.500445184843126e-06, + "loss": 2.1203, + "step": 22432 + }, + { + "epoch": 1.9119577260717633, + "grad_norm": 65.58684964802103, + "learning_rate": 3.4999721696569754e-06, + "loss": 2.8844, + "step": 22433 + }, + { + "epoch": 1.9120429557657888, + "grad_norm": 31.5503881870738, + "learning_rate": 3.499499169223004e-06, + "loss": 2.5171, + "step": 22434 + }, + { + "epoch": 1.9121281854598142, + "grad_norm": 41.01290969559513, + "learning_rate": 3.4990261835458616e-06, + "loss": 1.9851, + "step": 22435 + }, + { + "epoch": 1.9122134151538397, + "grad_norm": 41.8404725142147, + "learning_rate": 3.4985532126302014e-06, + "loss": 3.147, + "step": 22436 + }, + { + "epoch": 1.9122986448478652, + "grad_norm": 46.71626295682169, + "learning_rate": 3.4980802564806717e-06, + "loss": 2.9233, + "step": 22437 + }, + { + "epoch": 1.9123838745418904, + "grad_norm": 54.85982746378578, + "learning_rate": 3.4976073151019274e-06, + "loss": 2.5834, + "step": 22438 + }, + { + "epoch": 1.9124691042359157, + "grad_norm": 32.94008310599171, + "learning_rate": 3.4971343884986175e-06, + "loss": 1.949, + "step": 22439 + }, + { + "epoch": 1.9125543339299411, + "grad_norm": 29.48882336609039, + "learning_rate": 3.4966614766753948e-06, + "loss": 2.2494, + "step": 22440 + }, + { + "epoch": 1.9126395636239666, + "grad_norm": 92.5104394824468, + "learning_rate": 3.496188579636909e-06, + "loss": 3.7283, + "step": 22441 + }, + { + "epoch": 1.912724793317992, + "grad_norm": 57.74780302538137, + "learning_rate": 3.495715697387809e-06, + "loss": 2.3492, + "step": 22442 + }, + { + "epoch": 1.9128100230120175, + "grad_norm": 37.351858599594145, + "learning_rate": 3.495242829932748e-06, + "loss": 2.3946, + "step": 22443 + }, + { + "epoch": 1.9128952527060428, + "grad_norm": 97.70581631375876, + "learning_rate": 3.494769977276377e-06, + "loss": 3.1169, + "step": 22444 + }, + { + "epoch": 1.9129804824000682, + "grad_norm": 42.40174094930178, + "learning_rate": 3.4942971394233454e-06, + "loss": 3.0756, + "step": 22445 + }, + { + "epoch": 1.9130657120940935, + "grad_norm": 42.806993552916765, + "learning_rate": 3.493824316378301e-06, + "loss": 2.9657, + "step": 22446 + }, + { + "epoch": 1.913150941788119, + "grad_norm": 100.3532502807617, + "learning_rate": 3.493351508145897e-06, + "loss": 3.9318, + "step": 22447 + }, + { + "epoch": 1.9132361714821444, + "grad_norm": 73.37023965892487, + "learning_rate": 3.492878714730783e-06, + "loss": 4.0566, + "step": 22448 + }, + { + "epoch": 1.91332140117617, + "grad_norm": 69.74874859176224, + "learning_rate": 3.4924059361376083e-06, + "loss": 3.9652, + "step": 22449 + }, + { + "epoch": 1.9134066308701951, + "grad_norm": 37.89493897772475, + "learning_rate": 3.49193317237102e-06, + "loss": 2.5207, + "step": 22450 + }, + { + "epoch": 1.9134918605642206, + "grad_norm": 22.786837309831647, + "learning_rate": 3.491460423435671e-06, + "loss": 1.5474, + "step": 22451 + }, + { + "epoch": 1.9135770902582458, + "grad_norm": 41.28007871359806, + "learning_rate": 3.4909876893362092e-06, + "loss": 2.7226, + "step": 22452 + }, + { + "epoch": 1.9136623199522713, + "grad_norm": 37.72599756054727, + "learning_rate": 3.490514970077285e-06, + "loss": 2.9501, + "step": 22453 + }, + { + "epoch": 1.9137475496462968, + "grad_norm": 36.00147479377527, + "learning_rate": 3.490042265663545e-06, + "loss": 3.2987, + "step": 22454 + }, + { + "epoch": 1.9138327793403223, + "grad_norm": 43.16244607559806, + "learning_rate": 3.4895695760996384e-06, + "loss": 2.447, + "step": 22455 + }, + { + "epoch": 1.9139180090343477, + "grad_norm": 258.91413200672207, + "learning_rate": 3.4890969013902165e-06, + "loss": 3.805, + "step": 22456 + }, + { + "epoch": 1.914003238728373, + "grad_norm": 47.77631336463261, + "learning_rate": 3.4886242415399264e-06, + "loss": 2.7241, + "step": 22457 + }, + { + "epoch": 1.9140884684223982, + "grad_norm": 29.04179538728557, + "learning_rate": 3.488151596553415e-06, + "loss": 2.3288, + "step": 22458 + }, + { + "epoch": 1.9141736981164237, + "grad_norm": 56.369798939934114, + "learning_rate": 3.4876789664353315e-06, + "loss": 3.1646, + "step": 22459 + }, + { + "epoch": 1.9142589278104492, + "grad_norm": 52.39527379733648, + "learning_rate": 3.4872063511903263e-06, + "loss": 2.1741, + "step": 22460 + }, + { + "epoch": 1.9143441575044746, + "grad_norm": 67.79206984775001, + "learning_rate": 3.486733750823045e-06, + "loss": 1.9319, + "step": 22461 + }, + { + "epoch": 1.9144293871985, + "grad_norm": 49.99317667041569, + "learning_rate": 3.486261165338135e-06, + "loss": 2.9181, + "step": 22462 + }, + { + "epoch": 1.9145146168925253, + "grad_norm": 49.881038704616635, + "learning_rate": 3.4857885947402446e-06, + "loss": 2.7703, + "step": 22463 + }, + { + "epoch": 1.9145998465865508, + "grad_norm": 64.62014520024373, + "learning_rate": 3.4853160390340234e-06, + "loss": 2.7691, + "step": 22464 + }, + { + "epoch": 1.914685076280576, + "grad_norm": 73.63937225842967, + "learning_rate": 3.484843498224117e-06, + "loss": 2.9338, + "step": 22465 + }, + { + "epoch": 1.9147703059746015, + "grad_norm": 72.66264908955033, + "learning_rate": 3.484370972315172e-06, + "loss": 2.5776, + "step": 22466 + }, + { + "epoch": 1.914855535668627, + "grad_norm": 30.889184181917962, + "learning_rate": 3.4838984613118344e-06, + "loss": 2.3034, + "step": 22467 + }, + { + "epoch": 1.9149407653626525, + "grad_norm": 49.632007184699255, + "learning_rate": 3.4834259652187556e-06, + "loss": 3.3152, + "step": 22468 + }, + { + "epoch": 1.9150259950566777, + "grad_norm": 47.26396553928787, + "learning_rate": 3.48295348404058e-06, + "loss": 3.3347, + "step": 22469 + }, + { + "epoch": 1.9151112247507032, + "grad_norm": 29.883753493922605, + "learning_rate": 3.482481017781952e-06, + "loss": 2.9361, + "step": 22470 + }, + { + "epoch": 1.9151964544447284, + "grad_norm": 33.71836129523446, + "learning_rate": 3.482008566447523e-06, + "loss": 2.711, + "step": 22471 + }, + { + "epoch": 1.9152816841387539, + "grad_norm": 41.25223827616115, + "learning_rate": 3.4815361300419332e-06, + "loss": 2.892, + "step": 22472 + }, + { + "epoch": 1.9153669138327793, + "grad_norm": 71.18304515420277, + "learning_rate": 3.4810637085698347e-06, + "loss": 3.2787, + "step": 22473 + }, + { + "epoch": 1.9154521435268048, + "grad_norm": 59.43895222966084, + "learning_rate": 3.4805913020358695e-06, + "loss": 2.1241, + "step": 22474 + }, + { + "epoch": 1.9155373732208303, + "grad_norm": 33.53053755961118, + "learning_rate": 3.480118910444687e-06, + "loss": 2.7786, + "step": 22475 + }, + { + "epoch": 1.9156226029148555, + "grad_norm": 49.15499968618946, + "learning_rate": 3.4796465338009283e-06, + "loss": 2.9854, + "step": 22476 + }, + { + "epoch": 1.9157078326088808, + "grad_norm": 31.626937142390805, + "learning_rate": 3.4791741721092444e-06, + "loss": 2.7186, + "step": 22477 + }, + { + "epoch": 1.9157930623029062, + "grad_norm": 68.67625743361347, + "learning_rate": 3.4787018253742767e-06, + "loss": 2.9068, + "step": 22478 + }, + { + "epoch": 1.9158782919969317, + "grad_norm": 48.48553656888106, + "learning_rate": 3.478229493600673e-06, + "loss": 1.5744, + "step": 22479 + }, + { + "epoch": 1.9159635216909572, + "grad_norm": 29.787593376052218, + "learning_rate": 3.477757176793076e-06, + "loss": 2.3763, + "step": 22480 + }, + { + "epoch": 1.9160487513849827, + "grad_norm": 35.21878716833415, + "learning_rate": 3.477284874956134e-06, + "loss": 2.7342, + "step": 22481 + }, + { + "epoch": 1.916133981079008, + "grad_norm": 40.52602473241062, + "learning_rate": 3.4768125880944893e-06, + "loss": 3.0383, + "step": 22482 + }, + { + "epoch": 1.9162192107730334, + "grad_norm": 81.28960528620992, + "learning_rate": 3.4763403162127884e-06, + "loss": 3.4339, + "step": 22483 + }, + { + "epoch": 1.9163044404670586, + "grad_norm": 43.438456201029524, + "learning_rate": 3.475868059315675e-06, + "loss": 2.8359, + "step": 22484 + }, + { + "epoch": 1.916389670161084, + "grad_norm": 37.76516457781735, + "learning_rate": 3.475395817407792e-06, + "loss": 2.3064, + "step": 22485 + }, + { + "epoch": 1.9164748998551095, + "grad_norm": 54.52398508883612, + "learning_rate": 3.474923590493786e-06, + "loss": 2.6398, + "step": 22486 + }, + { + "epoch": 1.916560129549135, + "grad_norm": 31.511573619180915, + "learning_rate": 3.474451378578302e-06, + "loss": 2.1199, + "step": 22487 + }, + { + "epoch": 1.9166453592431603, + "grad_norm": 47.65809382193477, + "learning_rate": 3.4739791816659816e-06, + "loss": 2.4832, + "step": 22488 + }, + { + "epoch": 1.9167305889371857, + "grad_norm": 35.31918728138977, + "learning_rate": 3.4735069997614677e-06, + "loss": 3.1642, + "step": 22489 + }, + { + "epoch": 1.916815818631211, + "grad_norm": 27.68025838190866, + "learning_rate": 3.4730348328694075e-06, + "loss": 1.9235, + "step": 22490 + }, + { + "epoch": 1.9169010483252364, + "grad_norm": 67.58923228126055, + "learning_rate": 3.4725626809944435e-06, + "loss": 2.8616, + "step": 22491 + }, + { + "epoch": 1.916986278019262, + "grad_norm": 50.3218781921586, + "learning_rate": 3.4720905441412188e-06, + "loss": 2.5233, + "step": 22492 + }, + { + "epoch": 1.9170715077132874, + "grad_norm": 43.683733013384476, + "learning_rate": 3.4716184223143746e-06, + "loss": 2.6751, + "step": 22493 + }, + { + "epoch": 1.9171567374073128, + "grad_norm": 27.972682020365895, + "learning_rate": 3.4711463155185566e-06, + "loss": 1.7433, + "step": 22494 + }, + { + "epoch": 1.917241967101338, + "grad_norm": 37.04956134338304, + "learning_rate": 3.4706742237584077e-06, + "loss": 2.7672, + "step": 22495 + }, + { + "epoch": 1.9173271967953633, + "grad_norm": 64.83869116044312, + "learning_rate": 3.470202147038571e-06, + "loss": 3.1374, + "step": 22496 + }, + { + "epoch": 1.9174124264893888, + "grad_norm": 86.1540199089347, + "learning_rate": 3.469730085363687e-06, + "loss": 2.2996, + "step": 22497 + }, + { + "epoch": 1.9174976561834143, + "grad_norm": 56.33008723407295, + "learning_rate": 3.469258038738398e-06, + "loss": 3.6561, + "step": 22498 + }, + { + "epoch": 1.9175828858774397, + "grad_norm": 39.19550867038584, + "learning_rate": 3.4687860071673507e-06, + "loss": 3.1272, + "step": 22499 + }, + { + "epoch": 1.9176681155714652, + "grad_norm": 83.20865959972741, + "learning_rate": 3.4683139906551843e-06, + "loss": 4.431, + "step": 22500 + }, + { + "epoch": 1.9177533452654905, + "grad_norm": 61.634792804838064, + "learning_rate": 3.4678419892065395e-06, + "loss": 3.4594, + "step": 22501 + }, + { + "epoch": 1.917838574959516, + "grad_norm": 55.04066889044702, + "learning_rate": 3.46737000282606e-06, + "loss": 3.0719, + "step": 22502 + }, + { + "epoch": 1.9179238046535412, + "grad_norm": 71.3699087563439, + "learning_rate": 3.4668980315183888e-06, + "loss": 2.6617, + "step": 22503 + }, + { + "epoch": 1.9180090343475666, + "grad_norm": 93.50274615999727, + "learning_rate": 3.4664260752881668e-06, + "loss": 3.8554, + "step": 22504 + }, + { + "epoch": 1.9180942640415921, + "grad_norm": 30.29506012950551, + "learning_rate": 3.465954134140034e-06, + "loss": 2.2626, + "step": 22505 + }, + { + "epoch": 1.9181794937356176, + "grad_norm": 43.20406881045927, + "learning_rate": 3.465482208078631e-06, + "loss": 2.8441, + "step": 22506 + }, + { + "epoch": 1.918264723429643, + "grad_norm": 32.52810847210465, + "learning_rate": 3.465010297108603e-06, + "loss": 2.6114, + "step": 22507 + }, + { + "epoch": 1.9183499531236683, + "grad_norm": 29.645514226373024, + "learning_rate": 3.464538401234589e-06, + "loss": 1.8679, + "step": 22508 + }, + { + "epoch": 1.9184351828176935, + "grad_norm": 63.08849026757144, + "learning_rate": 3.464066520461229e-06, + "loss": 2.3964, + "step": 22509 + }, + { + "epoch": 1.918520412511719, + "grad_norm": 81.50986184330664, + "learning_rate": 3.4635946547931626e-06, + "loss": 3.3292, + "step": 22510 + }, + { + "epoch": 1.9186056422057445, + "grad_norm": 43.34951914085417, + "learning_rate": 3.463122804235035e-06, + "loss": 3.3467, + "step": 22511 + }, + { + "epoch": 1.91869087189977, + "grad_norm": 81.72187988102665, + "learning_rate": 3.462650968791483e-06, + "loss": 3.6393, + "step": 22512 + }, + { + "epoch": 1.9187761015937954, + "grad_norm": 74.73462975496808, + "learning_rate": 3.462179148467148e-06, + "loss": 2.6412, + "step": 22513 + }, + { + "epoch": 1.9188613312878207, + "grad_norm": 57.54225613697799, + "learning_rate": 3.4617073432666704e-06, + "loss": 2.3599, + "step": 22514 + }, + { + "epoch": 1.9189465609818461, + "grad_norm": 42.637514327460266, + "learning_rate": 3.4612355531946877e-06, + "loss": 2.9809, + "step": 22515 + }, + { + "epoch": 1.9190317906758714, + "grad_norm": 65.25544516238945, + "learning_rate": 3.4607637782558447e-06, + "loss": 2.2733, + "step": 22516 + }, + { + "epoch": 1.9191170203698968, + "grad_norm": 44.501088470449965, + "learning_rate": 3.460292018454776e-06, + "loss": 3.0505, + "step": 22517 + }, + { + "epoch": 1.9192022500639223, + "grad_norm": 42.912470589589475, + "learning_rate": 3.4598202737961255e-06, + "loss": 3.0968, + "step": 22518 + }, + { + "epoch": 1.9192874797579478, + "grad_norm": 41.31041669765266, + "learning_rate": 3.459348544284528e-06, + "loss": 1.9161, + "step": 22519 + }, + { + "epoch": 1.919372709451973, + "grad_norm": 22.60660412147093, + "learning_rate": 3.4588768299246277e-06, + "loss": 1.5125, + "step": 22520 + }, + { + "epoch": 1.9194579391459985, + "grad_norm": 72.6120041529617, + "learning_rate": 3.458405130721061e-06, + "loss": 2.8124, + "step": 22521 + }, + { + "epoch": 1.9195431688400237, + "grad_norm": 65.77801359210356, + "learning_rate": 3.4579334466784674e-06, + "loss": 3.3732, + "step": 22522 + }, + { + "epoch": 1.9196283985340492, + "grad_norm": 42.53722709224691, + "learning_rate": 3.4574617778014834e-06, + "loss": 2.9787, + "step": 22523 + }, + { + "epoch": 1.9197136282280747, + "grad_norm": 50.94288235299415, + "learning_rate": 3.4569901240947523e-06, + "loss": 3.3481, + "step": 22524 + }, + { + "epoch": 1.9197988579221001, + "grad_norm": 34.242725473365475, + "learning_rate": 3.4565184855629096e-06, + "loss": 2.5406, + "step": 22525 + }, + { + "epoch": 1.9198840876161256, + "grad_norm": 75.654084232724, + "learning_rate": 3.456046862210595e-06, + "loss": 2.6472, + "step": 22526 + }, + { + "epoch": 1.9199693173101509, + "grad_norm": 35.62643644634195, + "learning_rate": 3.455575254042446e-06, + "loss": 2.4802, + "step": 22527 + }, + { + "epoch": 1.920054547004176, + "grad_norm": 114.37250865959172, + "learning_rate": 3.4551036610630982e-06, + "loss": 3.1378, + "step": 22528 + }, + { + "epoch": 1.9201397766982016, + "grad_norm": 44.772145704151455, + "learning_rate": 3.4546320832771927e-06, + "loss": 3.985, + "step": 22529 + }, + { + "epoch": 1.920225006392227, + "grad_norm": 86.03561062527031, + "learning_rate": 3.4541605206893686e-06, + "loss": 4.0026, + "step": 22530 + }, + { + "epoch": 1.9203102360862525, + "grad_norm": 50.04968935574366, + "learning_rate": 3.453688973304261e-06, + "loss": 2.946, + "step": 22531 + }, + { + "epoch": 1.920395465780278, + "grad_norm": 77.37169393620803, + "learning_rate": 3.4532174411265064e-06, + "loss": 3.0704, + "step": 22532 + }, + { + "epoch": 1.9204806954743032, + "grad_norm": 83.35731347869954, + "learning_rate": 3.452745924160744e-06, + "loss": 3.321, + "step": 22533 + }, + { + "epoch": 1.9205659251683287, + "grad_norm": 77.73503929178794, + "learning_rate": 3.4522744224116124e-06, + "loss": 2.582, + "step": 22534 + }, + { + "epoch": 1.920651154862354, + "grad_norm": 46.18523202918086, + "learning_rate": 3.451802935883746e-06, + "loss": 2.59, + "step": 22535 + }, + { + "epoch": 1.9207363845563794, + "grad_norm": 78.00501574755596, + "learning_rate": 3.451331464581781e-06, + "loss": 2.8642, + "step": 22536 + }, + { + "epoch": 1.9208216142504049, + "grad_norm": 41.64953457944911, + "learning_rate": 3.4508600085103565e-06, + "loss": 3.1215, + "step": 22537 + }, + { + "epoch": 1.9209068439444303, + "grad_norm": 57.87589631385666, + "learning_rate": 3.4503885676741096e-06, + "loss": 2.5358, + "step": 22538 + }, + { + "epoch": 1.9209920736384556, + "grad_norm": 19.9400464305731, + "learning_rate": 3.449917142077675e-06, + "loss": 1.7695, + "step": 22539 + }, + { + "epoch": 1.921077303332481, + "grad_norm": 45.258899164890884, + "learning_rate": 3.4494457317256874e-06, + "loss": 2.9348, + "step": 22540 + }, + { + "epoch": 1.9211625330265063, + "grad_norm": 66.35267095800027, + "learning_rate": 3.4489743366227856e-06, + "loss": 3.3557, + "step": 22541 + }, + { + "epoch": 1.9212477627205318, + "grad_norm": 38.87577346052171, + "learning_rate": 3.448502956773607e-06, + "loss": 2.393, + "step": 22542 + }, + { + "epoch": 1.9213329924145572, + "grad_norm": 39.26389278747821, + "learning_rate": 3.4480315921827843e-06, + "loss": 3.077, + "step": 22543 + }, + { + "epoch": 1.9214182221085827, + "grad_norm": 17.869175949008746, + "learning_rate": 3.4475602428549534e-06, + "loss": 0.9152, + "step": 22544 + }, + { + "epoch": 1.9215034518026082, + "grad_norm": 42.770814115766775, + "learning_rate": 3.4470889087947494e-06, + "loss": 2.6022, + "step": 22545 + }, + { + "epoch": 1.9215886814966334, + "grad_norm": 49.73694348383101, + "learning_rate": 3.4466175900068112e-06, + "loss": 1.75, + "step": 22546 + }, + { + "epoch": 1.9216739111906587, + "grad_norm": 40.3060779432454, + "learning_rate": 3.4461462864957717e-06, + "loss": 2.6726, + "step": 22547 + }, + { + "epoch": 1.9217591408846841, + "grad_norm": 36.511348752910244, + "learning_rate": 3.4456749982662653e-06, + "loss": 2.083, + "step": 22548 + }, + { + "epoch": 1.9218443705787096, + "grad_norm": 66.86641385913425, + "learning_rate": 3.4452037253229266e-06, + "loss": 2.8376, + "step": 22549 + }, + { + "epoch": 1.921929600272735, + "grad_norm": 30.348189978761294, + "learning_rate": 3.444732467670393e-06, + "loss": 1.9795, + "step": 22550 + }, + { + "epoch": 1.9220148299667605, + "grad_norm": 41.431705145404635, + "learning_rate": 3.444261225313298e-06, + "loss": 1.8336, + "step": 22551 + }, + { + "epoch": 1.9221000596607858, + "grad_norm": 54.161513635764656, + "learning_rate": 3.4437899982562744e-06, + "loss": 2.6303, + "step": 22552 + }, + { + "epoch": 1.9221852893548113, + "grad_norm": 59.26485156239762, + "learning_rate": 3.4433187865039574e-06, + "loss": 2.3069, + "step": 22553 + }, + { + "epoch": 1.9222705190488365, + "grad_norm": 67.55971059993969, + "learning_rate": 3.4428475900609825e-06, + "loss": 2.6355, + "step": 22554 + }, + { + "epoch": 1.922355748742862, + "grad_norm": 38.20850142707576, + "learning_rate": 3.4423764089319834e-06, + "loss": 2.67, + "step": 22555 + }, + { + "epoch": 1.9224409784368874, + "grad_norm": 72.84657634836962, + "learning_rate": 3.4419052431215925e-06, + "loss": 2.9987, + "step": 22556 + }, + { + "epoch": 1.922526208130913, + "grad_norm": 38.25642674883038, + "learning_rate": 3.4414340926344457e-06, + "loss": 2.903, + "step": 22557 + }, + { + "epoch": 1.9226114378249384, + "grad_norm": 45.13886706244441, + "learning_rate": 3.4409629574751723e-06, + "loss": 2.7509, + "step": 22558 + }, + { + "epoch": 1.9226966675189636, + "grad_norm": 26.926893036206593, + "learning_rate": 3.440491837648412e-06, + "loss": 1.8285, + "step": 22559 + }, + { + "epoch": 1.9227818972129889, + "grad_norm": 31.132530462663734, + "learning_rate": 3.4400207331587927e-06, + "loss": 1.9716, + "step": 22560 + }, + { + "epoch": 1.9228671269070143, + "grad_norm": 31.248469819321283, + "learning_rate": 3.4395496440109512e-06, + "loss": 2.5971, + "step": 22561 + }, + { + "epoch": 1.9229523566010398, + "grad_norm": 42.98156816514109, + "learning_rate": 3.4390785702095164e-06, + "loss": 3.3718, + "step": 22562 + }, + { + "epoch": 1.9230375862950653, + "grad_norm": 39.397228139475864, + "learning_rate": 3.4386075117591256e-06, + "loss": 2.5037, + "step": 22563 + }, + { + "epoch": 1.9231228159890907, + "grad_norm": 16.770373252013023, + "learning_rate": 3.438136468664409e-06, + "loss": 0.9448, + "step": 22564 + }, + { + "epoch": 1.923208045683116, + "grad_norm": 74.81119293095789, + "learning_rate": 3.4376654409300004e-06, + "loss": 2.4295, + "step": 22565 + }, + { + "epoch": 1.9232932753771412, + "grad_norm": 56.564447432014234, + "learning_rate": 3.4371944285605296e-06, + "loss": 3.3938, + "step": 22566 + }, + { + "epoch": 1.9233785050711667, + "grad_norm": 38.50920633249076, + "learning_rate": 3.4367234315606325e-06, + "loss": 2.7282, + "step": 22567 + }, + { + "epoch": 1.9234637347651922, + "grad_norm": 41.42362018862382, + "learning_rate": 3.4362524499349383e-06, + "loss": 3.1836, + "step": 22568 + }, + { + "epoch": 1.9235489644592176, + "grad_norm": 101.64581335526721, + "learning_rate": 3.4357814836880796e-06, + "loss": 4.9172, + "step": 22569 + }, + { + "epoch": 1.923634194153243, + "grad_norm": 43.53721024210507, + "learning_rate": 3.4353105328246904e-06, + "loss": 3.1594, + "step": 22570 + }, + { + "epoch": 1.9237194238472684, + "grad_norm": 33.09881795914722, + "learning_rate": 3.4348395973493974e-06, + "loss": 2.3215, + "step": 22571 + }, + { + "epoch": 1.9238046535412938, + "grad_norm": 71.05098371043925, + "learning_rate": 3.434368677266836e-06, + "loss": 3.5621, + "step": 22572 + }, + { + "epoch": 1.923889883235319, + "grad_norm": 89.60458168748667, + "learning_rate": 3.433897772581638e-06, + "loss": 3.1531, + "step": 22573 + }, + { + "epoch": 1.9239751129293445, + "grad_norm": 75.91347262239498, + "learning_rate": 3.4334268832984324e-06, + "loss": 2.2298, + "step": 22574 + }, + { + "epoch": 1.92406034262337, + "grad_norm": 40.93168707321672, + "learning_rate": 3.4329560094218495e-06, + "loss": 2.5125, + "step": 22575 + }, + { + "epoch": 1.9241455723173955, + "grad_norm": 62.34968149823863, + "learning_rate": 3.432485150956522e-06, + "loss": 2.7969, + "step": 22576 + }, + { + "epoch": 1.924230802011421, + "grad_norm": 75.81034700211275, + "learning_rate": 3.4320143079070815e-06, + "loss": 3.07, + "step": 22577 + }, + { + "epoch": 1.9243160317054462, + "grad_norm": 50.23814960382765, + "learning_rate": 3.431543480278158e-06, + "loss": 2.4121, + "step": 22578 + }, + { + "epoch": 1.9244012613994714, + "grad_norm": 41.205679327536274, + "learning_rate": 3.431072668074378e-06, + "loss": 3.0173, + "step": 22579 + }, + { + "epoch": 1.924486491093497, + "grad_norm": 41.69611396126355, + "learning_rate": 3.4306018713003765e-06, + "loss": 3.0628, + "step": 22580 + }, + { + "epoch": 1.9245717207875224, + "grad_norm": 45.051724125083844, + "learning_rate": 3.430131089960782e-06, + "loss": 2.0364, + "step": 22581 + }, + { + "epoch": 1.9246569504815478, + "grad_norm": 42.44180186859461, + "learning_rate": 3.4296603240602253e-06, + "loss": 1.9975, + "step": 22582 + }, + { + "epoch": 1.9247421801755733, + "grad_norm": 29.88022420164051, + "learning_rate": 3.429189573603333e-06, + "loss": 2.0902, + "step": 22583 + }, + { + "epoch": 1.9248274098695985, + "grad_norm": 57.505209673208704, + "learning_rate": 3.428718838594739e-06, + "loss": 2.3517, + "step": 22584 + }, + { + "epoch": 1.924912639563624, + "grad_norm": 41.13245146869787, + "learning_rate": 3.4282481190390705e-06, + "loss": 3.2481, + "step": 22585 + }, + { + "epoch": 1.9249978692576493, + "grad_norm": 56.11539773512064, + "learning_rate": 3.4277774149409584e-06, + "loss": 2.8823, + "step": 22586 + }, + { + "epoch": 1.9250830989516747, + "grad_norm": 101.47072692290034, + "learning_rate": 3.4273067263050287e-06, + "loss": 2.3284, + "step": 22587 + }, + { + "epoch": 1.9251683286457002, + "grad_norm": 58.858990800806254, + "learning_rate": 3.4268360531359117e-06, + "loss": 3.5172, + "step": 22588 + }, + { + "epoch": 1.9252535583397257, + "grad_norm": 36.7258909460248, + "learning_rate": 3.426365395438239e-06, + "loss": 3.6464, + "step": 22589 + }, + { + "epoch": 1.925338788033751, + "grad_norm": 37.41891497031975, + "learning_rate": 3.4258947532166376e-06, + "loss": 2.5276, + "step": 22590 + }, + { + "epoch": 1.9254240177277764, + "grad_norm": 60.819316200975116, + "learning_rate": 3.4254241264757347e-06, + "loss": 2.5449, + "step": 22591 + }, + { + "epoch": 1.9255092474218016, + "grad_norm": 78.35362281075246, + "learning_rate": 3.4249535152201585e-06, + "loss": 3.3239, + "step": 22592 + }, + { + "epoch": 1.925594477115827, + "grad_norm": 33.08317372222727, + "learning_rate": 3.4244829194545416e-06, + "loss": 2.0049, + "step": 22593 + }, + { + "epoch": 1.9256797068098526, + "grad_norm": 41.69588175921715, + "learning_rate": 3.4240123391835077e-06, + "loss": 2.8236, + "step": 22594 + }, + { + "epoch": 1.925764936503878, + "grad_norm": 39.837818413034704, + "learning_rate": 3.423541774411686e-06, + "loss": 2.6412, + "step": 22595 + }, + { + "epoch": 1.9258501661979035, + "grad_norm": 41.01524399681261, + "learning_rate": 3.423071225143704e-06, + "loss": 3.1852, + "step": 22596 + }, + { + "epoch": 1.9259353958919287, + "grad_norm": 67.4112547220131, + "learning_rate": 3.4226006913841914e-06, + "loss": 2.1826, + "step": 22597 + }, + { + "epoch": 1.926020625585954, + "grad_norm": 33.53235177675411, + "learning_rate": 3.4221301731377745e-06, + "loss": 2.3017, + "step": 22598 + }, + { + "epoch": 1.9261058552799795, + "grad_norm": 61.78987424888721, + "learning_rate": 3.4216596704090786e-06, + "loss": 2.7302, + "step": 22599 + }, + { + "epoch": 1.926191084974005, + "grad_norm": 79.49572919897243, + "learning_rate": 3.4211891832027337e-06, + "loss": 4.0286, + "step": 22600 + }, + { + "epoch": 1.9262763146680304, + "grad_norm": 34.87226601860074, + "learning_rate": 3.4207187115233642e-06, + "loss": 2.3093, + "step": 22601 + }, + { + "epoch": 1.9263615443620559, + "grad_norm": 55.4529182498137, + "learning_rate": 3.420248255375601e-06, + "loss": 2.3186, + "step": 22602 + }, + { + "epoch": 1.9264467740560811, + "grad_norm": 31.961495952056843, + "learning_rate": 3.419777814764067e-06, + "loss": 3.104, + "step": 22603 + }, + { + "epoch": 1.9265320037501066, + "grad_norm": 76.38436332810569, + "learning_rate": 3.4193073896933914e-06, + "loss": 4.0777, + "step": 22604 + }, + { + "epoch": 1.9266172334441318, + "grad_norm": 34.426099213362086, + "learning_rate": 3.4188369801681975e-06, + "loss": 2.7079, + "step": 22605 + }, + { + "epoch": 1.9267024631381573, + "grad_norm": 54.05215673077053, + "learning_rate": 3.4183665861931158e-06, + "loss": 3.5125, + "step": 22606 + }, + { + "epoch": 1.9267876928321828, + "grad_norm": 60.674524010870215, + "learning_rate": 3.417896207772769e-06, + "loss": 2.2357, + "step": 22607 + }, + { + "epoch": 1.9268729225262082, + "grad_norm": 38.95919740687437, + "learning_rate": 3.417425844911786e-06, + "loss": 2.5008, + "step": 22608 + }, + { + "epoch": 1.9269581522202335, + "grad_norm": 41.026956284903974, + "learning_rate": 3.416955497614788e-06, + "loss": 2.5702, + "step": 22609 + }, + { + "epoch": 1.927043381914259, + "grad_norm": 23.685201282506988, + "learning_rate": 3.416485165886406e-06, + "loss": 1.4799, + "step": 22610 + }, + { + "epoch": 1.9271286116082842, + "grad_norm": 43.96577386796511, + "learning_rate": 3.4160148497312617e-06, + "loss": 2.8906, + "step": 22611 + }, + { + "epoch": 1.9272138413023097, + "grad_norm": 54.1872537967152, + "learning_rate": 3.415544549153984e-06, + "loss": 2.5932, + "step": 22612 + }, + { + "epoch": 1.9272990709963351, + "grad_norm": 63.6099103414873, + "learning_rate": 3.4150742641591926e-06, + "loss": 2.7573, + "step": 22613 + }, + { + "epoch": 1.9273843006903606, + "grad_norm": 55.50546638726707, + "learning_rate": 3.4146039947515196e-06, + "loss": 2.6218, + "step": 22614 + }, + { + "epoch": 1.927469530384386, + "grad_norm": 65.40811110151327, + "learning_rate": 3.414133740935585e-06, + "loss": 2.835, + "step": 22615 + }, + { + "epoch": 1.9275547600784113, + "grad_norm": 32.81367893175592, + "learning_rate": 3.4136635027160147e-06, + "loss": 2.6089, + "step": 22616 + }, + { + "epoch": 1.9276399897724366, + "grad_norm": 42.466521309271386, + "learning_rate": 3.413193280097435e-06, + "loss": 3.1227, + "step": 22617 + }, + { + "epoch": 1.927725219466462, + "grad_norm": 46.48161223400303, + "learning_rate": 3.4127230730844662e-06, + "loss": 3.1815, + "step": 22618 + }, + { + "epoch": 1.9278104491604875, + "grad_norm": 36.552933113514946, + "learning_rate": 3.4122528816817357e-06, + "loss": 2.4828, + "step": 22619 + }, + { + "epoch": 1.927895678854513, + "grad_norm": 77.09429252914103, + "learning_rate": 3.411782705893869e-06, + "loss": 3.8328, + "step": 22620 + }, + { + "epoch": 1.9279809085485384, + "grad_norm": 56.289025077211654, + "learning_rate": 3.411312545725488e-06, + "loss": 2.4457, + "step": 22621 + }, + { + "epoch": 1.9280661382425637, + "grad_norm": 26.342971913833303, + "learning_rate": 3.4108424011812146e-06, + "loss": 2.4072, + "step": 22622 + }, + { + "epoch": 1.9281513679365891, + "grad_norm": 35.81198653605514, + "learning_rate": 3.4103722722656763e-06, + "loss": 2.6998, + "step": 22623 + }, + { + "epoch": 1.9282365976306144, + "grad_norm": 56.07901169691567, + "learning_rate": 3.4099021589834956e-06, + "loss": 2.7392, + "step": 22624 + }, + { + "epoch": 1.9283218273246399, + "grad_norm": 60.57710668445751, + "learning_rate": 3.4094320613392956e-06, + "loss": 2.84, + "step": 22625 + }, + { + "epoch": 1.9284070570186653, + "grad_norm": 48.38981178503587, + "learning_rate": 3.408961979337697e-06, + "loss": 2.6881, + "step": 22626 + }, + { + "epoch": 1.9284922867126908, + "grad_norm": 111.0205962012674, + "learning_rate": 3.408491912983326e-06, + "loss": 3.8012, + "step": 22627 + }, + { + "epoch": 1.9285775164067163, + "grad_norm": 46.10470419383138, + "learning_rate": 3.408021862280806e-06, + "loss": 3.1221, + "step": 22628 + }, + { + "epoch": 1.9286627461007415, + "grad_norm": 29.97739675351054, + "learning_rate": 3.4075518272347586e-06, + "loss": 2.8604, + "step": 22629 + }, + { + "epoch": 1.9287479757947668, + "grad_norm": 68.41785759898764, + "learning_rate": 3.4070818078498046e-06, + "loss": 2.8324, + "step": 22630 + }, + { + "epoch": 1.9288332054887922, + "grad_norm": 49.24849775515008, + "learning_rate": 3.4066118041305673e-06, + "loss": 2.5138, + "step": 22631 + }, + { + "epoch": 1.9289184351828177, + "grad_norm": 23.6535184552037, + "learning_rate": 3.4061418160816717e-06, + "loss": 1.773, + "step": 22632 + }, + { + "epoch": 1.9290036648768432, + "grad_norm": 45.12971527041663, + "learning_rate": 3.4056718437077387e-06, + "loss": 3.1326, + "step": 22633 + }, + { + "epoch": 1.9290888945708686, + "grad_norm": 92.3962109961754, + "learning_rate": 3.4052018870133876e-06, + "loss": 3.7459, + "step": 22634 + }, + { + "epoch": 1.9291741242648939, + "grad_norm": 64.4627605544736, + "learning_rate": 3.4047319460032414e-06, + "loss": 2.2575, + "step": 22635 + }, + { + "epoch": 1.9292593539589193, + "grad_norm": 91.6469543272428, + "learning_rate": 3.4042620206819242e-06, + "loss": 3.7641, + "step": 22636 + }, + { + "epoch": 1.9293445836529446, + "grad_norm": 60.35326881280385, + "learning_rate": 3.4037921110540574e-06, + "loss": 2.2419, + "step": 22637 + }, + { + "epoch": 1.92942981334697, + "grad_norm": 74.23214440473048, + "learning_rate": 3.4033222171242586e-06, + "loss": 3.6178, + "step": 22638 + }, + { + "epoch": 1.9295150430409955, + "grad_norm": 65.46565780851749, + "learning_rate": 3.4028523388971512e-06, + "loss": 2.3182, + "step": 22639 + }, + { + "epoch": 1.929600272735021, + "grad_norm": 64.74602671413962, + "learning_rate": 3.4023824763773576e-06, + "loss": 2.9109, + "step": 22640 + }, + { + "epoch": 1.9296855024290462, + "grad_norm": 85.72727854890198, + "learning_rate": 3.4019126295694983e-06, + "loss": 2.4287, + "step": 22641 + }, + { + "epoch": 1.9297707321230717, + "grad_norm": 42.20590079176608, + "learning_rate": 3.401442798478192e-06, + "loss": 2.7682, + "step": 22642 + }, + { + "epoch": 1.929855961817097, + "grad_norm": 148.20279256991464, + "learning_rate": 3.400972983108061e-06, + "loss": 4.3482, + "step": 22643 + }, + { + "epoch": 1.9299411915111224, + "grad_norm": 47.910447475765665, + "learning_rate": 3.4005031834637235e-06, + "loss": 3.3204, + "step": 22644 + }, + { + "epoch": 1.930026421205148, + "grad_norm": 41.41082551954843, + "learning_rate": 3.400033399549804e-06, + "loss": 2.5567, + "step": 22645 + }, + { + "epoch": 1.9301116508991734, + "grad_norm": 66.61692154931725, + "learning_rate": 3.399563631370919e-06, + "loss": 3.086, + "step": 22646 + }, + { + "epoch": 1.9301968805931988, + "grad_norm": 79.48524670069024, + "learning_rate": 3.399093878931691e-06, + "loss": 2.3684, + "step": 22647 + }, + { + "epoch": 1.930282110287224, + "grad_norm": 70.09184488264852, + "learning_rate": 3.3986241422367365e-06, + "loss": 3.1125, + "step": 22648 + }, + { + "epoch": 1.9303673399812493, + "grad_norm": 55.890864271393625, + "learning_rate": 3.3981544212906793e-06, + "loss": 2.6323, + "step": 22649 + }, + { + "epoch": 1.9304525696752748, + "grad_norm": 39.831059235040534, + "learning_rate": 3.3976847160981357e-06, + "loss": 2.8037, + "step": 22650 + }, + { + "epoch": 1.9305377993693003, + "grad_norm": 51.81139234446658, + "learning_rate": 3.3972150266637273e-06, + "loss": 1.9238, + "step": 22651 + }, + { + "epoch": 1.9306230290633257, + "grad_norm": 81.54380364662464, + "learning_rate": 3.39674535299207e-06, + "loss": 2.5555, + "step": 22652 + }, + { + "epoch": 1.9307082587573512, + "grad_norm": 39.80726541744329, + "learning_rate": 3.3962756950877874e-06, + "loss": 3.0566, + "step": 22653 + }, + { + "epoch": 1.9307934884513764, + "grad_norm": 89.29266335566354, + "learning_rate": 3.3958060529554948e-06, + "loss": 3.6742, + "step": 22654 + }, + { + "epoch": 1.930878718145402, + "grad_norm": 51.73036313536601, + "learning_rate": 3.3953364265998134e-06, + "loss": 2.3788, + "step": 22655 + }, + { + "epoch": 1.9309639478394272, + "grad_norm": 25.92846886577157, + "learning_rate": 3.3948668160253583e-06, + "loss": 2.0148, + "step": 22656 + }, + { + "epoch": 1.9310491775334526, + "grad_norm": 79.68146636903016, + "learning_rate": 3.3943972212367525e-06, + "loss": 3.6975, + "step": 22657 + }, + { + "epoch": 1.931134407227478, + "grad_norm": 34.76944988851611, + "learning_rate": 3.3939276422386112e-06, + "loss": 2.0957, + "step": 22658 + }, + { + "epoch": 1.9312196369215036, + "grad_norm": 87.76004551429146, + "learning_rate": 3.393458079035554e-06, + "loss": 2.7272, + "step": 22659 + }, + { + "epoch": 1.9313048666155288, + "grad_norm": 40.626717726018335, + "learning_rate": 3.3929885316321987e-06, + "loss": 2.6184, + "step": 22660 + }, + { + "epoch": 1.9313900963095543, + "grad_norm": 63.613375249504585, + "learning_rate": 3.392519000033161e-06, + "loss": 1.7662, + "step": 22661 + }, + { + "epoch": 1.9314753260035795, + "grad_norm": 59.18047753926118, + "learning_rate": 3.39204948424306e-06, + "loss": 2.0435, + "step": 22662 + }, + { + "epoch": 1.931560555697605, + "grad_norm": 53.862853142250444, + "learning_rate": 3.391579984266515e-06, + "loss": 3.1143, + "step": 22663 + }, + { + "epoch": 1.9316457853916305, + "grad_norm": 19.077922744771538, + "learning_rate": 3.391110500108142e-06, + "loss": 0.7166, + "step": 22664 + }, + { + "epoch": 1.931731015085656, + "grad_norm": 105.41716630256933, + "learning_rate": 3.390641031772555e-06, + "loss": 2.5857, + "step": 22665 + }, + { + "epoch": 1.9318162447796814, + "grad_norm": 68.71691570609671, + "learning_rate": 3.3901715792643756e-06, + "loss": 1.7603, + "step": 22666 + }, + { + "epoch": 1.9319014744737066, + "grad_norm": 47.756990688300355, + "learning_rate": 3.38970214258822e-06, + "loss": 3.6088, + "step": 22667 + }, + { + "epoch": 1.9319867041677319, + "grad_norm": 60.25271547552343, + "learning_rate": 3.3892327217487044e-06, + "loss": 3.2271, + "step": 22668 + }, + { + "epoch": 1.9320719338617574, + "grad_norm": 37.147519611719474, + "learning_rate": 3.388763316750442e-06, + "loss": 2.7255, + "step": 22669 + }, + { + "epoch": 1.9321571635557828, + "grad_norm": 53.745037068102945, + "learning_rate": 3.3882939275980535e-06, + "loss": 2.1351, + "step": 22670 + }, + { + "epoch": 1.9322423932498083, + "grad_norm": 44.39461768293494, + "learning_rate": 3.3878245542961552e-06, + "loss": 2.8946, + "step": 22671 + }, + { + "epoch": 1.9323276229438338, + "grad_norm": 98.16587383179635, + "learning_rate": 3.387355196849361e-06, + "loss": 2.9925, + "step": 22672 + }, + { + "epoch": 1.932412852637859, + "grad_norm": 58.799511189563304, + "learning_rate": 3.3868858552622873e-06, + "loss": 2.857, + "step": 22673 + }, + { + "epoch": 1.9324980823318845, + "grad_norm": 66.93409401331036, + "learning_rate": 3.386416529539549e-06, + "loss": 2.2088, + "step": 22674 + }, + { + "epoch": 1.9325833120259097, + "grad_norm": 63.48354276834602, + "learning_rate": 3.385947219685765e-06, + "loss": 2.6296, + "step": 22675 + }, + { + "epoch": 1.9326685417199352, + "grad_norm": 31.273971852408888, + "learning_rate": 3.385477925705549e-06, + "loss": 2.0429, + "step": 22676 + }, + { + "epoch": 1.9327537714139607, + "grad_norm": 42.33654301737771, + "learning_rate": 3.3850086476035147e-06, + "loss": 2.952, + "step": 22677 + }, + { + "epoch": 1.9328390011079861, + "grad_norm": 52.72926358598465, + "learning_rate": 3.3845393853842777e-06, + "loss": 2.5461, + "step": 22678 + }, + { + "epoch": 1.9329242308020114, + "grad_norm": 92.07686241518861, + "learning_rate": 3.3840701390524556e-06, + "loss": 2.3734, + "step": 22679 + }, + { + "epoch": 1.9330094604960368, + "grad_norm": 34.19866550122724, + "learning_rate": 3.383600908612662e-06, + "loss": 2.0022, + "step": 22680 + }, + { + "epoch": 1.933094690190062, + "grad_norm": 40.48436605953155, + "learning_rate": 3.3831316940695103e-06, + "loss": 2.5815, + "step": 22681 + }, + { + "epoch": 1.9331799198840875, + "grad_norm": 40.48630352685196, + "learning_rate": 3.3826624954276142e-06, + "loss": 2.7276, + "step": 22682 + }, + { + "epoch": 1.933265149578113, + "grad_norm": 49.12805908315825, + "learning_rate": 3.382193312691593e-06, + "loss": 2.7752, + "step": 22683 + }, + { + "epoch": 1.9333503792721385, + "grad_norm": 15.340467520002834, + "learning_rate": 3.3817241458660564e-06, + "loss": 1.0238, + "step": 22684 + }, + { + "epoch": 1.933435608966164, + "grad_norm": 46.43377636434129, + "learning_rate": 3.3812549949556193e-06, + "loss": 2.9944, + "step": 22685 + }, + { + "epoch": 1.9335208386601892, + "grad_norm": 64.21404183902798, + "learning_rate": 3.380785859964895e-06, + "loss": 2.7564, + "step": 22686 + }, + { + "epoch": 1.9336060683542144, + "grad_norm": 75.69335448368336, + "learning_rate": 3.3803167408985006e-06, + "loss": 3.2789, + "step": 22687 + }, + { + "epoch": 1.93369129804824, + "grad_norm": 92.65422477819352, + "learning_rate": 3.379847637761048e-06, + "loss": 2.2077, + "step": 22688 + }, + { + "epoch": 1.9337765277422654, + "grad_norm": 131.3136173631071, + "learning_rate": 3.379378550557148e-06, + "loss": 2.6931, + "step": 22689 + }, + { + "epoch": 1.9338617574362909, + "grad_norm": 62.73331139566571, + "learning_rate": 3.378909479291418e-06, + "loss": 1.8565, + "step": 22690 + }, + { + "epoch": 1.9339469871303163, + "grad_norm": 38.10517485010704, + "learning_rate": 3.378440423968466e-06, + "loss": 2.1381, + "step": 22691 + }, + { + "epoch": 1.9340322168243416, + "grad_norm": 61.14907609491086, + "learning_rate": 3.377971384592911e-06, + "loss": 2.9582, + "step": 22692 + }, + { + "epoch": 1.934117446518367, + "grad_norm": 81.30775495455305, + "learning_rate": 3.3775023611693615e-06, + "loss": 3.6038, + "step": 22693 + }, + { + "epoch": 1.9342026762123923, + "grad_norm": 56.996882838942895, + "learning_rate": 3.3770333537024323e-06, + "loss": 2.7082, + "step": 22694 + }, + { + "epoch": 1.9342879059064177, + "grad_norm": 34.055481873913585, + "learning_rate": 3.3765643621967337e-06, + "loss": 2.8511, + "step": 22695 + }, + { + "epoch": 1.9343731356004432, + "grad_norm": 36.79668718181224, + "learning_rate": 3.3760953866568814e-06, + "loss": 2.3642, + "step": 22696 + }, + { + "epoch": 1.9344583652944687, + "grad_norm": 62.143476748697346, + "learning_rate": 3.375626427087484e-06, + "loss": 2.392, + "step": 22697 + }, + { + "epoch": 1.9345435949884942, + "grad_norm": 67.0680580281023, + "learning_rate": 3.3751574834931565e-06, + "loss": 3.6874, + "step": 22698 + }, + { + "epoch": 1.9346288246825194, + "grad_norm": 51.657455958149235, + "learning_rate": 3.374688555878508e-06, + "loss": 2.831, + "step": 22699 + }, + { + "epoch": 1.9347140543765446, + "grad_norm": 34.050971144731704, + "learning_rate": 3.3742196442481535e-06, + "loss": 2.1106, + "step": 22700 + }, + { + "epoch": 1.9347992840705701, + "grad_norm": 18.417810330506878, + "learning_rate": 3.3737507486067024e-06, + "loss": 1.428, + "step": 22701 + }, + { + "epoch": 1.9348845137645956, + "grad_norm": 92.76384151034522, + "learning_rate": 3.3732818689587666e-06, + "loss": 3.294, + "step": 22702 + }, + { + "epoch": 1.934969743458621, + "grad_norm": 53.34941353532714, + "learning_rate": 3.3728130053089576e-06, + "loss": 2.1632, + "step": 22703 + }, + { + "epoch": 1.9350549731526465, + "grad_norm": 77.27227975372045, + "learning_rate": 3.3723441576618837e-06, + "loss": 3.3383, + "step": 22704 + }, + { + "epoch": 1.9351402028466718, + "grad_norm": 76.26341293234535, + "learning_rate": 3.371875326022159e-06, + "loss": 2.4824, + "step": 22705 + }, + { + "epoch": 1.9352254325406972, + "grad_norm": 35.661696327727334, + "learning_rate": 3.371406510394396e-06, + "loss": 2.5695, + "step": 22706 + }, + { + "epoch": 1.9353106622347225, + "grad_norm": 55.0351071186327, + "learning_rate": 3.370937710783202e-06, + "loss": 3.5012, + "step": 22707 + }, + { + "epoch": 1.935395891928748, + "grad_norm": 76.52221720948624, + "learning_rate": 3.3704689271931856e-06, + "loss": 2.5522, + "step": 22708 + }, + { + "epoch": 1.9354811216227734, + "grad_norm": 34.392571871510874, + "learning_rate": 3.3700001596289615e-06, + "loss": 2.1528, + "step": 22709 + }, + { + "epoch": 1.9355663513167989, + "grad_norm": 46.4932971131623, + "learning_rate": 3.3695314080951393e-06, + "loss": 2.2887, + "step": 22710 + }, + { + "epoch": 1.9356515810108241, + "grad_norm": 31.244172388071075, + "learning_rate": 3.369062672596327e-06, + "loss": 2.62, + "step": 22711 + }, + { + "epoch": 1.9357368107048496, + "grad_norm": 73.84811319037611, + "learning_rate": 3.3685939531371336e-06, + "loss": 3.5652, + "step": 22712 + }, + { + "epoch": 1.9358220403988748, + "grad_norm": 27.93888126693052, + "learning_rate": 3.368125249722172e-06, + "loss": 2.2915, + "step": 22713 + }, + { + "epoch": 1.9359072700929003, + "grad_norm": 96.07222922433226, + "learning_rate": 3.367656562356051e-06, + "loss": 4.006, + "step": 22714 + }, + { + "epoch": 1.9359924997869258, + "grad_norm": 34.67642824471209, + "learning_rate": 3.3671878910433786e-06, + "loss": 1.6817, + "step": 22715 + }, + { + "epoch": 1.9360777294809512, + "grad_norm": 91.0031613095471, + "learning_rate": 3.3667192357887633e-06, + "loss": 4.2601, + "step": 22716 + }, + { + "epoch": 1.9361629591749767, + "grad_norm": 34.21260476023262, + "learning_rate": 3.366250596596815e-06, + "loss": 2.2468, + "step": 22717 + }, + { + "epoch": 1.936248188869002, + "grad_norm": 36.650694023259554, + "learning_rate": 3.3657819734721444e-06, + "loss": 2.352, + "step": 22718 + }, + { + "epoch": 1.9363334185630272, + "grad_norm": 33.813774285038676, + "learning_rate": 3.3653133664193594e-06, + "loss": 2.8736, + "step": 22719 + }, + { + "epoch": 1.9364186482570527, + "grad_norm": 46.119982869044804, + "learning_rate": 3.364844775443066e-06, + "loss": 1.878, + "step": 22720 + }, + { + "epoch": 1.9365038779510781, + "grad_norm": 30.260078343363855, + "learning_rate": 3.364376200547874e-06, + "loss": 2.0885, + "step": 22721 + }, + { + "epoch": 1.9365891076451036, + "grad_norm": 125.80887179813045, + "learning_rate": 3.3639076417383944e-06, + "loss": 3.2712, + "step": 22722 + }, + { + "epoch": 1.936674337339129, + "grad_norm": 47.37570719362467, + "learning_rate": 3.3634390990192322e-06, + "loss": 2.2538, + "step": 22723 + }, + { + "epoch": 1.9367595670331543, + "grad_norm": 29.655916423834576, + "learning_rate": 3.362970572394996e-06, + "loss": 1.911, + "step": 22724 + }, + { + "epoch": 1.9368447967271798, + "grad_norm": 86.43428985253524, + "learning_rate": 3.362502061870292e-06, + "loss": 3.8261, + "step": 22725 + }, + { + "epoch": 1.936930026421205, + "grad_norm": 43.42005984726693, + "learning_rate": 3.3620335674497318e-06, + "loss": 2.6059, + "step": 22726 + }, + { + "epoch": 1.9370152561152305, + "grad_norm": 51.92382449544585, + "learning_rate": 3.3615650891379203e-06, + "loss": 2.924, + "step": 22727 + }, + { + "epoch": 1.937100485809256, + "grad_norm": 65.16052398796859, + "learning_rate": 3.3610966269394647e-06, + "loss": 2.3394, + "step": 22728 + }, + { + "epoch": 1.9371857155032814, + "grad_norm": 37.5689054084902, + "learning_rate": 3.3606281808589712e-06, + "loss": 2.6718, + "step": 22729 + }, + { + "epoch": 1.9372709451973067, + "grad_norm": 64.05706149549124, + "learning_rate": 3.36015975090105e-06, + "loss": 2.4641, + "step": 22730 + }, + { + "epoch": 1.9373561748913322, + "grad_norm": 62.18207539636038, + "learning_rate": 3.3596913370703067e-06, + "loss": 3.1067, + "step": 22731 + }, + { + "epoch": 1.9374414045853574, + "grad_norm": 39.66057279714528, + "learning_rate": 3.3592229393713454e-06, + "loss": 2.6163, + "step": 22732 + }, + { + "epoch": 1.9375266342793829, + "grad_norm": 38.38909569627002, + "learning_rate": 3.358754557808776e-06, + "loss": 2.7066, + "step": 22733 + }, + { + "epoch": 1.9376118639734083, + "grad_norm": 73.11758431378644, + "learning_rate": 3.358286192387201e-06, + "loss": 2.8458, + "step": 22734 + }, + { + "epoch": 1.9376970936674338, + "grad_norm": 47.074430051325166, + "learning_rate": 3.357817843111232e-06, + "loss": 3.5476, + "step": 22735 + }, + { + "epoch": 1.9377823233614593, + "grad_norm": 50.64361862352319, + "learning_rate": 3.35734950998547e-06, + "loss": 3.0257, + "step": 22736 + }, + { + "epoch": 1.9378675530554845, + "grad_norm": 57.393196336671174, + "learning_rate": 3.356881193014524e-06, + "loss": 1.8016, + "step": 22737 + }, + { + "epoch": 1.9379527827495098, + "grad_norm": 54.966053345939486, + "learning_rate": 3.3564128922029965e-06, + "loss": 2.4875, + "step": 22738 + }, + { + "epoch": 1.9380380124435352, + "grad_norm": 43.22064275518354, + "learning_rate": 3.355944607555497e-06, + "loss": 2.4813, + "step": 22739 + }, + { + "epoch": 1.9381232421375607, + "grad_norm": 120.0664806857126, + "learning_rate": 3.3554763390766287e-06, + "loss": 3.8104, + "step": 22740 + }, + { + "epoch": 1.9382084718315862, + "grad_norm": 42.55739746189595, + "learning_rate": 3.3550080867709977e-06, + "loss": 2.4325, + "step": 22741 + }, + { + "epoch": 1.9382937015256116, + "grad_norm": 100.20481257356212, + "learning_rate": 3.354539850643206e-06, + "loss": 2.1226, + "step": 22742 + }, + { + "epoch": 1.938378931219637, + "grad_norm": 59.10589788817666, + "learning_rate": 3.3540716306978637e-06, + "loss": 2.4836, + "step": 22743 + }, + { + "epoch": 1.9384641609136624, + "grad_norm": 63.2408743328886, + "learning_rate": 3.3536034269395716e-06, + "loss": 2.8962, + "step": 22744 + }, + { + "epoch": 1.9385493906076876, + "grad_norm": 33.898922685172515, + "learning_rate": 3.3531352393729365e-06, + "loss": 1.7817, + "step": 22745 + }, + { + "epoch": 1.938634620301713, + "grad_norm": 54.39175684142651, + "learning_rate": 3.3526670680025623e-06, + "loss": 1.6496, + "step": 22746 + }, + { + "epoch": 1.9387198499957385, + "grad_norm": 43.187560227613034, + "learning_rate": 3.3521989128330507e-06, + "loss": 2.3703, + "step": 22747 + }, + { + "epoch": 1.938805079689764, + "grad_norm": 78.53688948072745, + "learning_rate": 3.351730773869009e-06, + "loss": 3.5768, + "step": 22748 + }, + { + "epoch": 1.9388903093837895, + "grad_norm": 21.58162223077297, + "learning_rate": 3.3512626511150416e-06, + "loss": 2.1116, + "step": 22749 + }, + { + "epoch": 1.9389755390778147, + "grad_norm": 37.61428178645004, + "learning_rate": 3.3507945445757506e-06, + "loss": 3.4475, + "step": 22750 + }, + { + "epoch": 1.93906076877184, + "grad_norm": 124.62474330079304, + "learning_rate": 3.3503264542557374e-06, + "loss": 3.0624, + "step": 22751 + }, + { + "epoch": 1.9391459984658654, + "grad_norm": 64.39170745019884, + "learning_rate": 3.3498583801596095e-06, + "loss": 2.6594, + "step": 22752 + }, + { + "epoch": 1.939231228159891, + "grad_norm": 62.047965781562894, + "learning_rate": 3.34939032229197e-06, + "loss": 1.5497, + "step": 22753 + }, + { + "epoch": 1.9393164578539164, + "grad_norm": 59.43448435493674, + "learning_rate": 3.34892228065742e-06, + "loss": 2.888, + "step": 22754 + }, + { + "epoch": 1.9394016875479418, + "grad_norm": 99.5248083857929, + "learning_rate": 3.3484542552605613e-06, + "loss": 3.6875, + "step": 22755 + }, + { + "epoch": 1.939486917241967, + "grad_norm": 42.236627685654994, + "learning_rate": 3.347986246106001e-06, + "loss": 2.4551, + "step": 22756 + }, + { + "epoch": 1.9395721469359923, + "grad_norm": 52.73418896451115, + "learning_rate": 3.3475182531983393e-06, + "loss": 3.0642, + "step": 22757 + }, + { + "epoch": 1.9396573766300178, + "grad_norm": 45.157806457161094, + "learning_rate": 3.347050276542179e-06, + "loss": 2.8497, + "step": 22758 + }, + { + "epoch": 1.9397426063240433, + "grad_norm": 86.77428006303268, + "learning_rate": 3.3465823161421208e-06, + "loss": 4.037, + "step": 22759 + }, + { + "epoch": 1.9398278360180687, + "grad_norm": 45.33461979237185, + "learning_rate": 3.34611437200277e-06, + "loss": 2.4898, + "step": 22760 + }, + { + "epoch": 1.9399130657120942, + "grad_norm": 60.0234600283729, + "learning_rate": 3.345646444128727e-06, + "loss": 3.7166, + "step": 22761 + }, + { + "epoch": 1.9399982954061195, + "grad_norm": 68.84687902775065, + "learning_rate": 3.3451785325245944e-06, + "loss": 4.45, + "step": 22762 + }, + { + "epoch": 1.940083525100145, + "grad_norm": 54.444457634884, + "learning_rate": 3.344710637194973e-06, + "loss": 3.0599, + "step": 22763 + }, + { + "epoch": 1.9401687547941702, + "grad_norm": 82.21804117871426, + "learning_rate": 3.344242758144463e-06, + "loss": 2.2376, + "step": 22764 + }, + { + "epoch": 1.9402539844881956, + "grad_norm": 42.31679871432273, + "learning_rate": 3.3437748953776693e-06, + "loss": 2.8291, + "step": 22765 + }, + { + "epoch": 1.940339214182221, + "grad_norm": 44.79181179927932, + "learning_rate": 3.3433070488991924e-06, + "loss": 4.106, + "step": 22766 + }, + { + "epoch": 1.9404244438762466, + "grad_norm": 21.38080588442218, + "learning_rate": 3.3428392187136305e-06, + "loss": 1.1582, + "step": 22767 + }, + { + "epoch": 1.940509673570272, + "grad_norm": 39.144707841707685, + "learning_rate": 3.3423714048255856e-06, + "loss": 2.9175, + "step": 22768 + }, + { + "epoch": 1.9405949032642973, + "grad_norm": 29.140868952985333, + "learning_rate": 3.3419036072396614e-06, + "loss": 2.051, + "step": 22769 + }, + { + "epoch": 1.9406801329583225, + "grad_norm": 43.804524947962754, + "learning_rate": 3.3414358259604563e-06, + "loss": 2.823, + "step": 22770 + }, + { + "epoch": 1.940765362652348, + "grad_norm": 95.58633384430316, + "learning_rate": 3.3409680609925698e-06, + "loss": 2.3767, + "step": 22771 + }, + { + "epoch": 1.9408505923463735, + "grad_norm": 68.0580478566052, + "learning_rate": 3.340500312340602e-06, + "loss": 4.0088, + "step": 22772 + }, + { + "epoch": 1.940935822040399, + "grad_norm": 65.12809210309125, + "learning_rate": 3.3400325800091567e-06, + "loss": 2.5012, + "step": 22773 + }, + { + "epoch": 1.9410210517344244, + "grad_norm": 59.9531078850331, + "learning_rate": 3.339564864002831e-06, + "loss": 3.5374, + "step": 22774 + }, + { + "epoch": 1.9411062814284497, + "grad_norm": 59.629820468290895, + "learning_rate": 3.339097164326224e-06, + "loss": 2.7114, + "step": 22775 + }, + { + "epoch": 1.9411915111224751, + "grad_norm": 13.294357434841286, + "learning_rate": 3.3386294809839375e-06, + "loss": 0.7577, + "step": 22776 + }, + { + "epoch": 1.9412767408165004, + "grad_norm": 35.616565787940154, + "learning_rate": 3.3381618139805685e-06, + "loss": 2.5825, + "step": 22777 + }, + { + "epoch": 1.9413619705105258, + "grad_norm": 53.43524760735984, + "learning_rate": 3.337694163320719e-06, + "loss": 3.0114, + "step": 22778 + }, + { + "epoch": 1.9414472002045513, + "grad_norm": 44.58420134166799, + "learning_rate": 3.3372265290089857e-06, + "loss": 2.7476, + "step": 22779 + }, + { + "epoch": 1.9415324298985768, + "grad_norm": 60.762848465880964, + "learning_rate": 3.33675891104997e-06, + "loss": 2.517, + "step": 22780 + }, + { + "epoch": 1.941617659592602, + "grad_norm": 69.17879919620805, + "learning_rate": 3.3362913094482675e-06, + "loss": 1.8405, + "step": 22781 + }, + { + "epoch": 1.9417028892866275, + "grad_norm": 22.295763728814546, + "learning_rate": 3.335823724208481e-06, + "loss": 1.5051, + "step": 22782 + }, + { + "epoch": 1.9417881189806527, + "grad_norm": 36.74559654635233, + "learning_rate": 3.335356155335206e-06, + "loss": 1.6992, + "step": 22783 + }, + { + "epoch": 1.9418733486746782, + "grad_norm": 44.01238607682294, + "learning_rate": 3.3348886028330425e-06, + "loss": 2.148, + "step": 22784 + }, + { + "epoch": 1.9419585783687037, + "grad_norm": 57.34909228938289, + "learning_rate": 3.3344210667065858e-06, + "loss": 3.0064, + "step": 22785 + }, + { + "epoch": 1.9420438080627291, + "grad_norm": 72.37111679957555, + "learning_rate": 3.333953546960439e-06, + "loss": 2.8229, + "step": 22786 + }, + { + "epoch": 1.9421290377567546, + "grad_norm": 84.0616576525727, + "learning_rate": 3.3334860435991943e-06, + "loss": 4.5143, + "step": 22787 + }, + { + "epoch": 1.9422142674507799, + "grad_norm": 41.38923192945111, + "learning_rate": 3.333018556627455e-06, + "loss": 2.2907, + "step": 22788 + }, + { + "epoch": 1.942299497144805, + "grad_norm": 47.56098104606255, + "learning_rate": 3.3325510860498143e-06, + "loss": 2.7675, + "step": 22789 + }, + { + "epoch": 1.9423847268388306, + "grad_norm": 44.298009988934474, + "learning_rate": 3.3320836318708704e-06, + "loss": 1.9343, + "step": 22790 + }, + { + "epoch": 1.942469956532856, + "grad_norm": 34.16805491536457, + "learning_rate": 3.331616194095222e-06, + "loss": 2.502, + "step": 22791 + }, + { + "epoch": 1.9425551862268815, + "grad_norm": 69.6983746636991, + "learning_rate": 3.3311487727274654e-06, + "loss": 3.1224, + "step": 22792 + }, + { + "epoch": 1.942640415920907, + "grad_norm": 40.12176504598425, + "learning_rate": 3.3306813677721984e-06, + "loss": 2.6003, + "step": 22793 + }, + { + "epoch": 1.9427256456149322, + "grad_norm": 50.90373252904132, + "learning_rate": 3.3302139792340143e-06, + "loss": 3.4406, + "step": 22794 + }, + { + "epoch": 1.9428108753089577, + "grad_norm": 33.392321324357816, + "learning_rate": 3.3297466071175134e-06, + "loss": 2.8871, + "step": 22795 + }, + { + "epoch": 1.942896105002983, + "grad_norm": 61.23900747065526, + "learning_rate": 3.329279251427292e-06, + "loss": 1.7789, + "step": 22796 + }, + { + "epoch": 1.9429813346970084, + "grad_norm": 52.48420033249406, + "learning_rate": 3.3288119121679453e-06, + "loss": 3.0159, + "step": 22797 + }, + { + "epoch": 1.9430665643910339, + "grad_norm": 56.71401118356223, + "learning_rate": 3.3283445893440667e-06, + "loss": 2.0276, + "step": 22798 + }, + { + "epoch": 1.9431517940850593, + "grad_norm": 45.26439775550618, + "learning_rate": 3.327877282960257e-06, + "loss": 2.8411, + "step": 22799 + }, + { + "epoch": 1.9432370237790846, + "grad_norm": 46.688300531450295, + "learning_rate": 3.32740999302111e-06, + "loss": 2.775, + "step": 22800 + }, + { + "epoch": 1.94332225347311, + "grad_norm": 45.53768662878467, + "learning_rate": 3.326942719531221e-06, + "loss": 3.0296, + "step": 22801 + }, + { + "epoch": 1.9434074831671353, + "grad_norm": 46.504929880643495, + "learning_rate": 3.326475462495184e-06, + "loss": 2.8095, + "step": 22802 + }, + { + "epoch": 1.9434927128611608, + "grad_norm": 43.13451822634161, + "learning_rate": 3.3260082219175972e-06, + "loss": 2.9792, + "step": 22803 + }, + { + "epoch": 1.9435779425551862, + "grad_norm": 37.72589484841855, + "learning_rate": 3.3255409978030547e-06, + "loss": 2.4934, + "step": 22804 + }, + { + "epoch": 1.9436631722492117, + "grad_norm": 31.65220420507119, + "learning_rate": 3.3250737901561507e-06, + "loss": 2.3214, + "step": 22805 + }, + { + "epoch": 1.9437484019432372, + "grad_norm": 48.94316703066301, + "learning_rate": 3.3246065989814803e-06, + "loss": 1.7762, + "step": 22806 + }, + { + "epoch": 1.9438336316372624, + "grad_norm": 76.01608335975962, + "learning_rate": 3.3241394242836365e-06, + "loss": 3.1514, + "step": 22807 + }, + { + "epoch": 1.9439188613312877, + "grad_norm": 55.64828176219026, + "learning_rate": 3.3236722660672184e-06, + "loss": 3.8619, + "step": 22808 + }, + { + "epoch": 1.9440040910253131, + "grad_norm": 41.692622729383984, + "learning_rate": 3.323205124336817e-06, + "loss": 2.421, + "step": 22809 + }, + { + "epoch": 1.9440893207193386, + "grad_norm": 50.78451477297827, + "learning_rate": 3.3227379990970258e-06, + "loss": 1.6293, + "step": 22810 + }, + { + "epoch": 1.944174550413364, + "grad_norm": 80.70623864295736, + "learning_rate": 3.3222708903524394e-06, + "loss": 3.4091, + "step": 22811 + }, + { + "epoch": 1.9442597801073895, + "grad_norm": 62.0250310868811, + "learning_rate": 3.3218037981076534e-06, + "loss": 2.8822, + "step": 22812 + }, + { + "epoch": 1.9443450098014148, + "grad_norm": 37.70267767187249, + "learning_rate": 3.321336722367261e-06, + "loss": 2.8407, + "step": 22813 + }, + { + "epoch": 1.9444302394954402, + "grad_norm": 38.22461591200244, + "learning_rate": 3.3208696631358527e-06, + "loss": 2.5916, + "step": 22814 + }, + { + "epoch": 1.9445154691894655, + "grad_norm": 71.83116896861476, + "learning_rate": 3.320402620418024e-06, + "loss": 3.07, + "step": 22815 + }, + { + "epoch": 1.944600698883491, + "grad_norm": 34.70234058098309, + "learning_rate": 3.3199355942183697e-06, + "loss": 2.8628, + "step": 22816 + }, + { + "epoch": 1.9446859285775164, + "grad_norm": 45.155495636900554, + "learning_rate": 3.319468584541482e-06, + "loss": 2.5205, + "step": 22817 + }, + { + "epoch": 1.944771158271542, + "grad_norm": 27.102322461576488, + "learning_rate": 3.3190015913919516e-06, + "loss": 1.9921, + "step": 22818 + }, + { + "epoch": 1.9448563879655674, + "grad_norm": 81.08794132184792, + "learning_rate": 3.3185346147743737e-06, + "loss": 2.8291, + "step": 22819 + }, + { + "epoch": 1.9449416176595926, + "grad_norm": 104.93565537498982, + "learning_rate": 3.3180676546933365e-06, + "loss": 3.434, + "step": 22820 + }, + { + "epoch": 1.9450268473536179, + "grad_norm": 76.39268707778434, + "learning_rate": 3.317600711153439e-06, + "loss": 3.242, + "step": 22821 + }, + { + "epoch": 1.9451120770476433, + "grad_norm": 86.36942138114422, + "learning_rate": 3.3171337841592686e-06, + "loss": 2.8979, + "step": 22822 + }, + { + "epoch": 1.9451973067416688, + "grad_norm": 38.99317208960289, + "learning_rate": 3.3166668737154196e-06, + "loss": 2.3509, + "step": 22823 + }, + { + "epoch": 1.9452825364356943, + "grad_norm": 37.37353611242122, + "learning_rate": 3.316199979826481e-06, + "loss": 2.4927, + "step": 22824 + }, + { + "epoch": 1.9453677661297197, + "grad_norm": 65.53125608331032, + "learning_rate": 3.3157331024970486e-06, + "loss": 2.9988, + "step": 22825 + }, + { + "epoch": 1.945452995823745, + "grad_norm": 50.59950102915899, + "learning_rate": 3.3152662417317103e-06, + "loss": 3.1, + "step": 22826 + }, + { + "epoch": 1.9455382255177702, + "grad_norm": 81.93383331042716, + "learning_rate": 3.3147993975350612e-06, + "loss": 2.8473, + "step": 22827 + }, + { + "epoch": 1.9456234552117957, + "grad_norm": 115.85014779312618, + "learning_rate": 3.314332569911688e-06, + "loss": 2.1525, + "step": 22828 + }, + { + "epoch": 1.9457086849058212, + "grad_norm": 41.57569059734624, + "learning_rate": 3.313865758866186e-06, + "loss": 1.9997, + "step": 22829 + }, + { + "epoch": 1.9457939145998466, + "grad_norm": 50.61103956911506, + "learning_rate": 3.3133989644031427e-06, + "loss": 2.451, + "step": 22830 + }, + { + "epoch": 1.945879144293872, + "grad_norm": 43.881561406230304, + "learning_rate": 3.3129321865271524e-06, + "loss": 2.6608, + "step": 22831 + }, + { + "epoch": 1.9459643739878973, + "grad_norm": 60.636176097764604, + "learning_rate": 3.312465425242801e-06, + "loss": 3.4032, + "step": 22832 + }, + { + "epoch": 1.9460496036819228, + "grad_norm": 56.056704570963284, + "learning_rate": 3.311998680554684e-06, + "loss": 2.9338, + "step": 22833 + }, + { + "epoch": 1.946134833375948, + "grad_norm": 23.83969526612794, + "learning_rate": 3.311531952467388e-06, + "loss": 1.9574, + "step": 22834 + }, + { + "epoch": 1.9462200630699735, + "grad_norm": 48.57425277973683, + "learning_rate": 3.3110652409855047e-06, + "loss": 2.9508, + "step": 22835 + }, + { + "epoch": 1.946305292763999, + "grad_norm": 63.666965372534456, + "learning_rate": 3.310598546113625e-06, + "loss": 4.0162, + "step": 22836 + }, + { + "epoch": 1.9463905224580245, + "grad_norm": 56.555339480872966, + "learning_rate": 3.310131867856334e-06, + "loss": 2.7912, + "step": 22837 + }, + { + "epoch": 1.94647575215205, + "grad_norm": 44.02236339842196, + "learning_rate": 3.3096652062182266e-06, + "loss": 2.9099, + "step": 22838 + }, + { + "epoch": 1.9465609818460752, + "grad_norm": 63.17875872963093, + "learning_rate": 3.3091985612038908e-06, + "loss": 3.3672, + "step": 22839 + }, + { + "epoch": 1.9466462115401004, + "grad_norm": 31.740065495501707, + "learning_rate": 3.3087319328179153e-06, + "loss": 2.1028, + "step": 22840 + }, + { + "epoch": 1.946731441234126, + "grad_norm": 49.36488646407733, + "learning_rate": 3.308265321064887e-06, + "loss": 2.5216, + "step": 22841 + }, + { + "epoch": 1.9468166709281514, + "grad_norm": 47.990090710337476, + "learning_rate": 3.3077987259493984e-06, + "loss": 2.2598, + "step": 22842 + }, + { + "epoch": 1.9469019006221768, + "grad_norm": 60.71404423328793, + "learning_rate": 3.307332147476038e-06, + "loss": 3.6834, + "step": 22843 + }, + { + "epoch": 1.9469871303162023, + "grad_norm": 44.754629292073275, + "learning_rate": 3.3068655856493927e-06, + "loss": 1.3811, + "step": 22844 + }, + { + "epoch": 1.9470723600102275, + "grad_norm": 69.62520568202585, + "learning_rate": 3.3063990404740497e-06, + "loss": 2.8985, + "step": 22845 + }, + { + "epoch": 1.947157589704253, + "grad_norm": 62.567624092774935, + "learning_rate": 3.3059325119546005e-06, + "loss": 2.9552, + "step": 22846 + }, + { + "epoch": 1.9472428193982783, + "grad_norm": 54.703461929699785, + "learning_rate": 3.305466000095633e-06, + "loss": 2.6281, + "step": 22847 + }, + { + "epoch": 1.9473280490923037, + "grad_norm": 59.16569345136891, + "learning_rate": 3.3049995049017335e-06, + "loss": 3.0555, + "step": 22848 + }, + { + "epoch": 1.9474132787863292, + "grad_norm": 43.88905408895477, + "learning_rate": 3.30453302637749e-06, + "loss": 3.0473, + "step": 22849 + }, + { + "epoch": 1.9474985084803547, + "grad_norm": 61.37979745629562, + "learning_rate": 3.304066564527489e-06, + "loss": 3.7971, + "step": 22850 + }, + { + "epoch": 1.94758373817438, + "grad_norm": 46.469765235108255, + "learning_rate": 3.3036001193563215e-06, + "loss": 2.0683, + "step": 22851 + }, + { + "epoch": 1.9476689678684054, + "grad_norm": 59.647499994476746, + "learning_rate": 3.3031336908685728e-06, + "loss": 2.6695, + "step": 22852 + }, + { + "epoch": 1.9477541975624306, + "grad_norm": 147.25356730139706, + "learning_rate": 3.3026672790688284e-06, + "loss": 4.385, + "step": 22853 + }, + { + "epoch": 1.947839427256456, + "grad_norm": 106.05330941405613, + "learning_rate": 3.302200883961677e-06, + "loss": 2.4727, + "step": 22854 + }, + { + "epoch": 1.9479246569504816, + "grad_norm": 23.598119066671856, + "learning_rate": 3.301734505551707e-06, + "loss": 2.2385, + "step": 22855 + }, + { + "epoch": 1.948009886644507, + "grad_norm": 48.593221137519365, + "learning_rate": 3.3012681438435027e-06, + "loss": 1.735, + "step": 22856 + }, + { + "epoch": 1.9480951163385325, + "grad_norm": 254.80342281108986, + "learning_rate": 3.3008017988416507e-06, + "loss": 3.4258, + "step": 22857 + }, + { + "epoch": 1.9481803460325577, + "grad_norm": 81.28634513698526, + "learning_rate": 3.3003354705507363e-06, + "loss": 3.7416, + "step": 22858 + }, + { + "epoch": 1.948265575726583, + "grad_norm": 51.649076712689684, + "learning_rate": 3.2998691589753496e-06, + "loss": 3.2269, + "step": 22859 + }, + { + "epoch": 1.9483508054206085, + "grad_norm": 46.34127978064622, + "learning_rate": 3.2994028641200745e-06, + "loss": 3.7666, + "step": 22860 + }, + { + "epoch": 1.948436035114634, + "grad_norm": 32.003634275446316, + "learning_rate": 3.298936585989495e-06, + "loss": 1.5567, + "step": 22861 + }, + { + "epoch": 1.9485212648086594, + "grad_norm": 36.31102543966831, + "learning_rate": 3.298470324588199e-06, + "loss": 2.0661, + "step": 22862 + }, + { + "epoch": 1.9486064945026849, + "grad_norm": 91.97594438446048, + "learning_rate": 3.29800407992077e-06, + "loss": 3.0691, + "step": 22863 + }, + { + "epoch": 1.94869172419671, + "grad_norm": 45.392450483640445, + "learning_rate": 3.2975378519917955e-06, + "loss": 2.3175, + "step": 22864 + }, + { + "epoch": 1.9487769538907356, + "grad_norm": 48.056099007497075, + "learning_rate": 3.2970716408058593e-06, + "loss": 2.8448, + "step": 22865 + }, + { + "epoch": 1.9488621835847608, + "grad_norm": 90.44039735046509, + "learning_rate": 3.296605446367548e-06, + "loss": 2.9665, + "step": 22866 + }, + { + "epoch": 1.9489474132787863, + "grad_norm": 47.79134936078339, + "learning_rate": 3.296139268681443e-06, + "loss": 1.9651, + "step": 22867 + }, + { + "epoch": 1.9490326429728118, + "grad_norm": 53.75055696696708, + "learning_rate": 3.2956731077521335e-06, + "loss": 2.5084, + "step": 22868 + }, + { + "epoch": 1.9491178726668372, + "grad_norm": 46.06091878887213, + "learning_rate": 3.295206963584201e-06, + "loss": 2.7457, + "step": 22869 + }, + { + "epoch": 1.9492031023608625, + "grad_norm": 48.62182460300268, + "learning_rate": 3.294740836182231e-06, + "loss": 2.9467, + "step": 22870 + }, + { + "epoch": 1.949288332054888, + "grad_norm": 51.94241327237678, + "learning_rate": 3.2942747255508057e-06, + "loss": 3.4386, + "step": 22871 + }, + { + "epoch": 1.9493735617489132, + "grad_norm": 69.37660815136765, + "learning_rate": 3.293808631694512e-06, + "loss": 2.6351, + "step": 22872 + }, + { + "epoch": 1.9494587914429387, + "grad_norm": 21.09581551330839, + "learning_rate": 3.293342554617932e-06, + "loss": 1.0943, + "step": 22873 + }, + { + "epoch": 1.9495440211369641, + "grad_norm": 50.562438018718055, + "learning_rate": 3.2928764943256506e-06, + "loss": 2.618, + "step": 22874 + }, + { + "epoch": 1.9496292508309896, + "grad_norm": 125.55240471282764, + "learning_rate": 3.2924104508222488e-06, + "loss": 4.8851, + "step": 22875 + }, + { + "epoch": 1.949714480525015, + "grad_norm": 41.99243164360256, + "learning_rate": 3.291944424112314e-06, + "loss": 2.7922, + "step": 22876 + }, + { + "epoch": 1.9497997102190403, + "grad_norm": 38.3399207193293, + "learning_rate": 3.2914784142004254e-06, + "loss": 2.5755, + "step": 22877 + }, + { + "epoch": 1.9498849399130656, + "grad_norm": 33.039447689341436, + "learning_rate": 3.2910124210911687e-06, + "loss": 2.6843, + "step": 22878 + }, + { + "epoch": 1.949970169607091, + "grad_norm": 37.08810326996115, + "learning_rate": 3.2905464447891255e-06, + "loss": 2.5098, + "step": 22879 + }, + { + "epoch": 1.9500553993011165, + "grad_norm": 49.48341495914335, + "learning_rate": 3.2900804852988776e-06, + "loss": 3.3065, + "step": 22880 + }, + { + "epoch": 1.950140628995142, + "grad_norm": 59.60972325255217, + "learning_rate": 3.2896145426250098e-06, + "loss": 2.2539, + "step": 22881 + }, + { + "epoch": 1.9502258586891674, + "grad_norm": 69.27702487869739, + "learning_rate": 3.2891486167721037e-06, + "loss": 2.9263, + "step": 22882 + }, + { + "epoch": 1.9503110883831927, + "grad_norm": 96.40916540101881, + "learning_rate": 3.28868270774474e-06, + "loss": 3.2954, + "step": 22883 + }, + { + "epoch": 1.9503963180772181, + "grad_norm": 69.26355354560175, + "learning_rate": 3.2882168155475024e-06, + "loss": 2.4864, + "step": 22884 + }, + { + "epoch": 1.9504815477712434, + "grad_norm": 41.78477560703266, + "learning_rate": 3.2877509401849717e-06, + "loss": 2.5737, + "step": 22885 + }, + { + "epoch": 1.9505667774652689, + "grad_norm": 59.322425058993666, + "learning_rate": 3.287285081661732e-06, + "loss": 1.9472, + "step": 22886 + }, + { + "epoch": 1.9506520071592943, + "grad_norm": 67.02267514899614, + "learning_rate": 3.2868192399823617e-06, + "loss": 2.4522, + "step": 22887 + }, + { + "epoch": 1.9507372368533198, + "grad_norm": 53.95416556157833, + "learning_rate": 3.286353415151442e-06, + "loss": 2.8536, + "step": 22888 + }, + { + "epoch": 1.9508224665473453, + "grad_norm": 48.390239586742005, + "learning_rate": 3.285887607173557e-06, + "loss": 3.1743, + "step": 22889 + }, + { + "epoch": 1.9509076962413705, + "grad_norm": 29.521544894153603, + "learning_rate": 3.285421816053287e-06, + "loss": 2.0473, + "step": 22890 + }, + { + "epoch": 1.9509929259353957, + "grad_norm": 58.98866572181727, + "learning_rate": 3.284956041795212e-06, + "loss": 2.0805, + "step": 22891 + }, + { + "epoch": 1.9510781556294212, + "grad_norm": 58.75846878183177, + "learning_rate": 3.284490284403912e-06, + "loss": 3.4463, + "step": 22892 + }, + { + "epoch": 1.9511633853234467, + "grad_norm": 84.6217896242539, + "learning_rate": 3.284024543883967e-06, + "loss": 3.1722, + "step": 22893 + }, + { + "epoch": 1.9512486150174722, + "grad_norm": 62.799305451766394, + "learning_rate": 3.2835588202399605e-06, + "loss": 2.6974, + "step": 22894 + }, + { + "epoch": 1.9513338447114976, + "grad_norm": 37.06831982038673, + "learning_rate": 3.2830931134764716e-06, + "loss": 2.1339, + "step": 22895 + }, + { + "epoch": 1.9514190744055229, + "grad_norm": 53.33783930459685, + "learning_rate": 3.2826274235980783e-06, + "loss": 2.8809, + "step": 22896 + }, + { + "epoch": 1.9515043040995483, + "grad_norm": 56.759858568499396, + "learning_rate": 3.282161750609361e-06, + "loss": 3.7288, + "step": 22897 + }, + { + "epoch": 1.9515895337935736, + "grad_norm": 28.563613104989724, + "learning_rate": 3.2816960945149023e-06, + "loss": 3.3746, + "step": 22898 + }, + { + "epoch": 1.951674763487599, + "grad_norm": 65.5947559726834, + "learning_rate": 3.28123045531928e-06, + "loss": 3.8888, + "step": 22899 + }, + { + "epoch": 1.9517599931816245, + "grad_norm": 84.28470589380835, + "learning_rate": 3.280764833027072e-06, + "loss": 2.9246, + "step": 22900 + }, + { + "epoch": 1.95184522287565, + "grad_norm": 40.171962602307154, + "learning_rate": 3.2802992276428574e-06, + "loss": 2.6839, + "step": 22901 + }, + { + "epoch": 1.9519304525696752, + "grad_norm": 29.412964129482923, + "learning_rate": 3.2798336391712193e-06, + "loss": 1.8351, + "step": 22902 + }, + { + "epoch": 1.9520156822637007, + "grad_norm": 33.78512343262057, + "learning_rate": 3.279368067616733e-06, + "loss": 2.9321, + "step": 22903 + }, + { + "epoch": 1.952100911957726, + "grad_norm": 78.07055062879145, + "learning_rate": 3.2789025129839773e-06, + "loss": 2.2738, + "step": 22904 + }, + { + "epoch": 1.9521861416517514, + "grad_norm": 44.46328043984455, + "learning_rate": 3.278436975277531e-06, + "loss": 3.7425, + "step": 22905 + }, + { + "epoch": 1.9522713713457769, + "grad_norm": 53.48194282895468, + "learning_rate": 3.2779714545019744e-06, + "loss": 2.6305, + "step": 22906 + }, + { + "epoch": 1.9523566010398024, + "grad_norm": 15.199915671186659, + "learning_rate": 3.2775059506618844e-06, + "loss": 0.9507, + "step": 22907 + }, + { + "epoch": 1.9524418307338278, + "grad_norm": 48.39842843125256, + "learning_rate": 3.277040463761838e-06, + "loss": 3.7854, + "step": 22908 + }, + { + "epoch": 1.952527060427853, + "grad_norm": 109.39009439831207, + "learning_rate": 3.276574993806415e-06, + "loss": 5.0184, + "step": 22909 + }, + { + "epoch": 1.9526122901218783, + "grad_norm": 28.93684618332098, + "learning_rate": 3.2761095408001896e-06, + "loss": 2.4722, + "step": 22910 + }, + { + "epoch": 1.9526975198159038, + "grad_norm": 35.03939422263365, + "learning_rate": 3.275644104747745e-06, + "loss": 2.9766, + "step": 22911 + }, + { + "epoch": 1.9527827495099292, + "grad_norm": 34.15565520528291, + "learning_rate": 3.2751786856536533e-06, + "loss": 2.3669, + "step": 22912 + }, + { + "epoch": 1.9528679792039547, + "grad_norm": 35.67518471734321, + "learning_rate": 3.2747132835224954e-06, + "loss": 2.895, + "step": 22913 + }, + { + "epoch": 1.9529532088979802, + "grad_norm": 46.06407150909237, + "learning_rate": 3.2742478983588445e-06, + "loss": 2.198, + "step": 22914 + }, + { + "epoch": 1.9530384385920054, + "grad_norm": 23.85414132038704, + "learning_rate": 3.273782530167282e-06, + "loss": 1.6826, + "step": 22915 + }, + { + "epoch": 1.953123668286031, + "grad_norm": 49.186040260434154, + "learning_rate": 3.2733171789523817e-06, + "loss": 2.921, + "step": 22916 + }, + { + "epoch": 1.9532088979800561, + "grad_norm": 68.26768615653562, + "learning_rate": 3.2728518447187213e-06, + "loss": 2.069, + "step": 22917 + }, + { + "epoch": 1.9532941276740816, + "grad_norm": 35.939869447092384, + "learning_rate": 3.272386527470875e-06, + "loss": 2.043, + "step": 22918 + }, + { + "epoch": 1.953379357368107, + "grad_norm": 37.949949784566954, + "learning_rate": 3.2719212272134226e-06, + "loss": 2.3029, + "step": 22919 + }, + { + "epoch": 1.9534645870621326, + "grad_norm": 27.734625757290143, + "learning_rate": 3.2714559439509375e-06, + "loss": 1.9108, + "step": 22920 + }, + { + "epoch": 1.9535498167561578, + "grad_norm": 74.60272756498354, + "learning_rate": 3.2709906776879966e-06, + "loss": 2.8277, + "step": 22921 + }, + { + "epoch": 1.9536350464501833, + "grad_norm": 41.74187505620814, + "learning_rate": 3.2705254284291753e-06, + "loss": 3.1992, + "step": 22922 + }, + { + "epoch": 1.9537202761442085, + "grad_norm": 39.621451883372835, + "learning_rate": 3.2700601961790486e-06, + "loss": 2.5334, + "step": 22923 + }, + { + "epoch": 1.953805505838234, + "grad_norm": 57.847048775326115, + "learning_rate": 3.2695949809421924e-06, + "loss": 2.8252, + "step": 22924 + }, + { + "epoch": 1.9538907355322594, + "grad_norm": 41.38189571798518, + "learning_rate": 3.269129782723183e-06, + "loss": 2.5869, + "step": 22925 + }, + { + "epoch": 1.953975965226285, + "grad_norm": 68.06565661503676, + "learning_rate": 3.268664601526594e-06, + "loss": 3.346, + "step": 22926 + }, + { + "epoch": 1.9540611949203104, + "grad_norm": 77.7481487266434, + "learning_rate": 3.268199437357e-06, + "loss": 3.8705, + "step": 22927 + }, + { + "epoch": 1.9541464246143356, + "grad_norm": 49.25971645512681, + "learning_rate": 3.267734290218977e-06, + "loss": 2.4215, + "step": 22928 + }, + { + "epoch": 1.9542316543083609, + "grad_norm": 54.45904280422345, + "learning_rate": 3.2672691601170993e-06, + "loss": 2.6553, + "step": 22929 + }, + { + "epoch": 1.9543168840023863, + "grad_norm": 40.547955398992016, + "learning_rate": 3.2668040470559404e-06, + "loss": 3.3826, + "step": 22930 + }, + { + "epoch": 1.9544021136964118, + "grad_norm": 38.739924758458095, + "learning_rate": 3.2663389510400743e-06, + "loss": 2.5258, + "step": 22931 + }, + { + "epoch": 1.9544873433904373, + "grad_norm": 63.21872157075125, + "learning_rate": 3.265873872074077e-06, + "loss": 3.114, + "step": 22932 + }, + { + "epoch": 1.9545725730844627, + "grad_norm": 124.03162645260404, + "learning_rate": 3.2654088101625203e-06, + "loss": 4.4574, + "step": 22933 + }, + { + "epoch": 1.954657802778488, + "grad_norm": 77.47220042988077, + "learning_rate": 3.2649437653099803e-06, + "loss": 3.5011, + "step": 22934 + }, + { + "epoch": 1.9547430324725135, + "grad_norm": 55.41397490463929, + "learning_rate": 3.2644787375210274e-06, + "loss": 3.0593, + "step": 22935 + }, + { + "epoch": 1.9548282621665387, + "grad_norm": 89.14405281328301, + "learning_rate": 3.264013726800235e-06, + "loss": 3.1542, + "step": 22936 + }, + { + "epoch": 1.9549134918605642, + "grad_norm": 97.22681304069562, + "learning_rate": 3.26354873315218e-06, + "loss": 3.6978, + "step": 22937 + }, + { + "epoch": 1.9549987215545896, + "grad_norm": 51.564676509798566, + "learning_rate": 3.263083756581433e-06, + "loss": 2.3763, + "step": 22938 + }, + { + "epoch": 1.9550839512486151, + "grad_norm": 54.20723583109015, + "learning_rate": 3.262618797092567e-06, + "loss": 2.819, + "step": 22939 + }, + { + "epoch": 1.9551691809426404, + "grad_norm": 32.34973242535646, + "learning_rate": 3.262153854690153e-06, + "loss": 2.0927, + "step": 22940 + }, + { + "epoch": 1.9552544106366658, + "grad_norm": 45.38303704623584, + "learning_rate": 3.2616889293787677e-06, + "loss": 3.493, + "step": 22941 + }, + { + "epoch": 1.955339640330691, + "grad_norm": 58.54813238867261, + "learning_rate": 3.261224021162981e-06, + "loss": 3.2174, + "step": 22942 + }, + { + "epoch": 1.9554248700247165, + "grad_norm": 47.99503627465919, + "learning_rate": 3.260759130047364e-06, + "loss": 2.9642, + "step": 22943 + }, + { + "epoch": 1.955510099718742, + "grad_norm": 56.297978233647775, + "learning_rate": 3.2602942560364888e-06, + "loss": 1.7454, + "step": 22944 + }, + { + "epoch": 1.9555953294127675, + "grad_norm": 38.912694026329056, + "learning_rate": 3.25982939913493e-06, + "loss": 1.9439, + "step": 22945 + }, + { + "epoch": 1.955680559106793, + "grad_norm": 40.86412884920737, + "learning_rate": 3.2593645593472576e-06, + "loss": 3.0365, + "step": 22946 + }, + { + "epoch": 1.9557657888008182, + "grad_norm": 39.5162321052098, + "learning_rate": 3.258899736678043e-06, + "loss": 3.0779, + "step": 22947 + }, + { + "epoch": 1.9558510184948434, + "grad_norm": 39.49607138623565, + "learning_rate": 3.2584349311318557e-06, + "loss": 2.7709, + "step": 22948 + }, + { + "epoch": 1.955936248188869, + "grad_norm": 46.2892538772192, + "learning_rate": 3.2579701427132715e-06, + "loss": 2.9904, + "step": 22949 + }, + { + "epoch": 1.9560214778828944, + "grad_norm": 67.24880019285338, + "learning_rate": 3.257505371426858e-06, + "loss": 2.6736, + "step": 22950 + }, + { + "epoch": 1.9561067075769198, + "grad_norm": 49.5452100249282, + "learning_rate": 3.257040617277186e-06, + "loss": 2.249, + "step": 22951 + }, + { + "epoch": 1.9561919372709453, + "grad_norm": 43.517331748796614, + "learning_rate": 3.2565758802688284e-06, + "loss": 2.8415, + "step": 22952 + }, + { + "epoch": 1.9562771669649706, + "grad_norm": 31.36563170372356, + "learning_rate": 3.256111160406352e-06, + "loss": 2.8533, + "step": 22953 + }, + { + "epoch": 1.956362396658996, + "grad_norm": 55.05205397131939, + "learning_rate": 3.2556464576943316e-06, + "loss": 1.7565, + "step": 22954 + }, + { + "epoch": 1.9564476263530213, + "grad_norm": 88.26588736678018, + "learning_rate": 3.2551817721373334e-06, + "loss": 2.6622, + "step": 22955 + }, + { + "epoch": 1.9565328560470467, + "grad_norm": 36.44817309690698, + "learning_rate": 3.2547171037399305e-06, + "loss": 1.8908, + "step": 22956 + }, + { + "epoch": 1.9566180857410722, + "grad_norm": 62.12379505471893, + "learning_rate": 3.25425245250669e-06, + "loss": 3.8195, + "step": 22957 + }, + { + "epoch": 1.9567033154350977, + "grad_norm": 44.7463950793699, + "learning_rate": 3.2537878184421844e-06, + "loss": 2.3644, + "step": 22958 + }, + { + "epoch": 1.9567885451291231, + "grad_norm": 49.425236497480604, + "learning_rate": 3.253323201550981e-06, + "loss": 2.9771, + "step": 22959 + }, + { + "epoch": 1.9568737748231484, + "grad_norm": 53.48294427271102, + "learning_rate": 3.2528586018376507e-06, + "loss": 3.2401, + "step": 22960 + }, + { + "epoch": 1.9569590045171736, + "grad_norm": 40.73279957827007, + "learning_rate": 3.2523940193067593e-06, + "loss": 3.3351, + "step": 22961 + }, + { + "epoch": 1.957044234211199, + "grad_norm": 61.01948381430217, + "learning_rate": 3.251929453962881e-06, + "loss": 2.2898, + "step": 22962 + }, + { + "epoch": 1.9571294639052246, + "grad_norm": 44.8117004578967, + "learning_rate": 3.2514649058105807e-06, + "loss": 4.0552, + "step": 22963 + }, + { + "epoch": 1.95721469359925, + "grad_norm": 55.00785558067664, + "learning_rate": 3.251000374854429e-06, + "loss": 2.814, + "step": 22964 + }, + { + "epoch": 1.9572999232932755, + "grad_norm": 51.807262327672596, + "learning_rate": 3.2505358610989923e-06, + "loss": 2.1378, + "step": 22965 + }, + { + "epoch": 1.9573851529873008, + "grad_norm": 33.47505293921316, + "learning_rate": 3.2500713645488413e-06, + "loss": 1.8702, + "step": 22966 + }, + { + "epoch": 1.9574703826813262, + "grad_norm": 31.446650116344728, + "learning_rate": 3.2496068852085426e-06, + "loss": 2.195, + "step": 22967 + }, + { + "epoch": 1.9575556123753515, + "grad_norm": 59.548749915587635, + "learning_rate": 3.249142423082665e-06, + "loss": 2.1815, + "step": 22968 + }, + { + "epoch": 1.957640842069377, + "grad_norm": 47.75957189567425, + "learning_rate": 3.248677978175776e-06, + "loss": 2.9334, + "step": 22969 + }, + { + "epoch": 1.9577260717634024, + "grad_norm": 55.90664961500201, + "learning_rate": 3.248213550492443e-06, + "loss": 2.8939, + "step": 22970 + }, + { + "epoch": 1.9578113014574279, + "grad_norm": 44.73580095036238, + "learning_rate": 3.247749140037233e-06, + "loss": 2.9017, + "step": 22971 + }, + { + "epoch": 1.9578965311514531, + "grad_norm": 26.275665571645938, + "learning_rate": 3.247284746814715e-06, + "loss": 1.3692, + "step": 22972 + }, + { + "epoch": 1.9579817608454786, + "grad_norm": 120.70629520839078, + "learning_rate": 3.2468203708294544e-06, + "loss": 3.1008, + "step": 22973 + }, + { + "epoch": 1.9580669905395038, + "grad_norm": 42.78398998236319, + "learning_rate": 3.246356012086019e-06, + "loss": 2.8219, + "step": 22974 + }, + { + "epoch": 1.9581522202335293, + "grad_norm": 39.690554675458245, + "learning_rate": 3.245891670588975e-06, + "loss": 2.5937, + "step": 22975 + }, + { + "epoch": 1.9582374499275548, + "grad_norm": 100.8777675322212, + "learning_rate": 3.2454273463428906e-06, + "loss": 2.4797, + "step": 22976 + }, + { + "epoch": 1.9583226796215802, + "grad_norm": 170.9692773894386, + "learning_rate": 3.2449630393523303e-06, + "loss": 3.4362, + "step": 22977 + }, + { + "epoch": 1.9584079093156057, + "grad_norm": 41.10728951296296, + "learning_rate": 3.2444987496218607e-06, + "loss": 2.4493, + "step": 22978 + }, + { + "epoch": 1.958493139009631, + "grad_norm": 31.664555058281728, + "learning_rate": 3.2440344771560483e-06, + "loss": 2.7002, + "step": 22979 + }, + { + "epoch": 1.9585783687036562, + "grad_norm": 66.50131229539096, + "learning_rate": 3.24357022195946e-06, + "loss": 2.3117, + "step": 22980 + }, + { + "epoch": 1.9586635983976817, + "grad_norm": 99.37883701485238, + "learning_rate": 3.2431059840366596e-06, + "loss": 4.7434, + "step": 22981 + }, + { + "epoch": 1.9587488280917071, + "grad_norm": 38.31503283202622, + "learning_rate": 3.2426417633922153e-06, + "loss": 2.2907, + "step": 22982 + }, + { + "epoch": 1.9588340577857326, + "grad_norm": 68.94302609620813, + "learning_rate": 3.242177560030688e-06, + "loss": 2.9391, + "step": 22983 + }, + { + "epoch": 1.958919287479758, + "grad_norm": 56.91079933383816, + "learning_rate": 3.2417133739566487e-06, + "loss": 2.7163, + "step": 22984 + }, + { + "epoch": 1.9590045171737833, + "grad_norm": 62.5811054590436, + "learning_rate": 3.2412492051746595e-06, + "loss": 2.3821, + "step": 22985 + }, + { + "epoch": 1.9590897468678088, + "grad_norm": 99.44530377408876, + "learning_rate": 3.2407850536892848e-06, + "loss": 3.7758, + "step": 22986 + }, + { + "epoch": 1.959174976561834, + "grad_norm": 36.90801103645586, + "learning_rate": 3.240320919505089e-06, + "loss": 2.8481, + "step": 22987 + }, + { + "epoch": 1.9592602062558595, + "grad_norm": 44.119601387665526, + "learning_rate": 3.239856802626639e-06, + "loss": 3.2262, + "step": 22988 + }, + { + "epoch": 1.959345435949885, + "grad_norm": 47.96600871110394, + "learning_rate": 3.239392703058499e-06, + "loss": 3.132, + "step": 22989 + }, + { + "epoch": 1.9594306656439104, + "grad_norm": 69.32362650001102, + "learning_rate": 3.2389286208052307e-06, + "loss": 2.9952, + "step": 22990 + }, + { + "epoch": 1.9595158953379357, + "grad_norm": 67.55435591820451, + "learning_rate": 3.2384645558713977e-06, + "loss": 2.9324, + "step": 22991 + }, + { + "epoch": 1.9596011250319612, + "grad_norm": 53.70512728375991, + "learning_rate": 3.238000508261569e-06, + "loss": 2.7111, + "step": 22992 + }, + { + "epoch": 1.9596863547259864, + "grad_norm": 23.99584778882609, + "learning_rate": 3.2375364779803047e-06, + "loss": 1.8404, + "step": 22993 + }, + { + "epoch": 1.9597715844200119, + "grad_norm": 161.7432134891841, + "learning_rate": 3.2370724650321685e-06, + "loss": 3.3135, + "step": 22994 + }, + { + "epoch": 1.9598568141140373, + "grad_norm": 67.5494854302954, + "learning_rate": 3.2366084694217243e-06, + "loss": 3.2476, + "step": 22995 + }, + { + "epoch": 1.9599420438080628, + "grad_norm": 34.07400683679069, + "learning_rate": 3.236144491153533e-06, + "loss": 1.7499, + "step": 22996 + }, + { + "epoch": 1.9600272735020883, + "grad_norm": 30.96668658592142, + "learning_rate": 3.2356805302321625e-06, + "loss": 2.7142, + "step": 22997 + }, + { + "epoch": 1.9601125031961135, + "grad_norm": 48.410759336231884, + "learning_rate": 3.235216586662171e-06, + "loss": 3.3128, + "step": 22998 + }, + { + "epoch": 1.9601977328901388, + "grad_norm": 50.51948036127289, + "learning_rate": 3.234752660448125e-06, + "loss": 2.7436, + "step": 22999 + }, + { + "epoch": 1.9602829625841642, + "grad_norm": 45.68334431873287, + "learning_rate": 3.234288751594582e-06, + "loss": 2.7651, + "step": 23000 + }, + { + "epoch": 1.9603681922781897, + "grad_norm": 64.20284450261683, + "learning_rate": 3.2338248601061106e-06, + "loss": 2.6386, + "step": 23001 + }, + { + "epoch": 1.9604534219722152, + "grad_norm": 81.41237351012745, + "learning_rate": 3.2333609859872682e-06, + "loss": 2.8309, + "step": 23002 + }, + { + "epoch": 1.9605386516662406, + "grad_norm": 27.447938146630037, + "learning_rate": 3.2328971292426204e-06, + "loss": 1.9811, + "step": 23003 + }, + { + "epoch": 1.9606238813602659, + "grad_norm": 39.58034548813923, + "learning_rate": 3.232433289876724e-06, + "loss": 2.9634, + "step": 23004 + }, + { + "epoch": 1.9607091110542914, + "grad_norm": 20.25752689877199, + "learning_rate": 3.231969467894147e-06, + "loss": 1.5261, + "step": 23005 + }, + { + "epoch": 1.9607943407483166, + "grad_norm": 42.84429581708537, + "learning_rate": 3.231505663299446e-06, + "loss": 2.4459, + "step": 23006 + }, + { + "epoch": 1.960879570442342, + "grad_norm": 54.07040875212486, + "learning_rate": 3.231041876097185e-06, + "loss": 1.9109, + "step": 23007 + }, + { + "epoch": 1.9609648001363675, + "grad_norm": 78.36930217688123, + "learning_rate": 3.230578106291924e-06, + "loss": 2.0677, + "step": 23008 + }, + { + "epoch": 1.961050029830393, + "grad_norm": 50.6103951750784, + "learning_rate": 3.230114353888223e-06, + "loss": 3.0981, + "step": 23009 + }, + { + "epoch": 1.9611352595244185, + "grad_norm": 86.00828409607108, + "learning_rate": 3.229650618890645e-06, + "loss": 3.937, + "step": 23010 + }, + { + "epoch": 1.9612204892184437, + "grad_norm": 80.23815891114442, + "learning_rate": 3.2291869013037504e-06, + "loss": 3.0364, + "step": 23011 + }, + { + "epoch": 1.961305718912469, + "grad_norm": 75.86466403288001, + "learning_rate": 3.228723201132098e-06, + "loss": 2.8781, + "step": 23012 + }, + { + "epoch": 1.9613909486064944, + "grad_norm": 43.728810901295425, + "learning_rate": 3.228259518380249e-06, + "loss": 3.1454, + "step": 23013 + }, + { + "epoch": 1.96147617830052, + "grad_norm": 60.990871345984296, + "learning_rate": 3.2277958530527637e-06, + "loss": 2.3918, + "step": 23014 + }, + { + "epoch": 1.9615614079945454, + "grad_norm": 34.52449367745189, + "learning_rate": 3.2273322051542034e-06, + "loss": 2.1475, + "step": 23015 + }, + { + "epoch": 1.9616466376885708, + "grad_norm": 92.84884231211724, + "learning_rate": 3.226868574689125e-06, + "loss": 2.3596, + "step": 23016 + }, + { + "epoch": 1.961731867382596, + "grad_norm": 104.949008382603, + "learning_rate": 3.2264049616620895e-06, + "loss": 2.9983, + "step": 23017 + }, + { + "epoch": 1.9618170970766213, + "grad_norm": 60.16205734367889, + "learning_rate": 3.2259413660776562e-06, + "loss": 2.8088, + "step": 23018 + }, + { + "epoch": 1.9619023267706468, + "grad_norm": 55.97714969420275, + "learning_rate": 3.2254777879403864e-06, + "loss": 2.469, + "step": 23019 + }, + { + "epoch": 1.9619875564646723, + "grad_norm": 50.41227717583741, + "learning_rate": 3.225014227254836e-06, + "loss": 3.0438, + "step": 23020 + }, + { + "epoch": 1.9620727861586977, + "grad_norm": 51.31214964492108, + "learning_rate": 3.2245506840255654e-06, + "loss": 1.9834, + "step": 23021 + }, + { + "epoch": 1.9621580158527232, + "grad_norm": 73.26931620290829, + "learning_rate": 3.2240871582571333e-06, + "loss": 2.3499, + "step": 23022 + }, + { + "epoch": 1.9622432455467484, + "grad_norm": 81.80543970886691, + "learning_rate": 3.2236236499540992e-06, + "loss": 3.64, + "step": 23023 + }, + { + "epoch": 1.962328475240774, + "grad_norm": 37.62075955960064, + "learning_rate": 3.2231601591210193e-06, + "loss": 2.0284, + "step": 23024 + }, + { + "epoch": 1.9624137049347992, + "grad_norm": 35.16835029648477, + "learning_rate": 3.2226966857624552e-06, + "loss": 3.9017, + "step": 23025 + }, + { + "epoch": 1.9624989346288246, + "grad_norm": 42.875312823991564, + "learning_rate": 3.2222332298829607e-06, + "loss": 2.9857, + "step": 23026 + }, + { + "epoch": 1.96258416432285, + "grad_norm": 77.5729176803078, + "learning_rate": 3.2217697914870973e-06, + "loss": 3.1162, + "step": 23027 + }, + { + "epoch": 1.9626693940168756, + "grad_norm": 111.84698955737915, + "learning_rate": 3.2213063705794214e-06, + "loss": 3.2319, + "step": 23028 + }, + { + "epoch": 1.962754623710901, + "grad_norm": 53.89558901068009, + "learning_rate": 3.2208429671644902e-06, + "loss": 2.357, + "step": 23029 + }, + { + "epoch": 1.9628398534049263, + "grad_norm": 97.12115165466236, + "learning_rate": 3.2203795812468606e-06, + "loss": 3.2637, + "step": 23030 + }, + { + "epoch": 1.9629250830989515, + "grad_norm": 76.4493686584235, + "learning_rate": 3.2199162128310924e-06, + "loss": 2.8851, + "step": 23031 + }, + { + "epoch": 1.963010312792977, + "grad_norm": 88.71753889139133, + "learning_rate": 3.2194528619217402e-06, + "loss": 2.2532, + "step": 23032 + }, + { + "epoch": 1.9630955424870025, + "grad_norm": 47.44869859314992, + "learning_rate": 3.218989528523362e-06, + "loss": 2.1696, + "step": 23033 + }, + { + "epoch": 1.963180772181028, + "grad_norm": 38.374676132821435, + "learning_rate": 3.2185262126405115e-06, + "loss": 1.8303, + "step": 23034 + }, + { + "epoch": 1.9632660018750534, + "grad_norm": 29.964951559852036, + "learning_rate": 3.2180629142777507e-06, + "loss": 1.8664, + "step": 23035 + }, + { + "epoch": 1.9633512315690786, + "grad_norm": 46.18471828071333, + "learning_rate": 3.217599633439633e-06, + "loss": 3.7129, + "step": 23036 + }, + { + "epoch": 1.9634364612631041, + "grad_norm": 60.53024956975356, + "learning_rate": 3.2171363701307134e-06, + "loss": 2.3702, + "step": 23037 + }, + { + "epoch": 1.9635216909571294, + "grad_norm": 25.096735439285265, + "learning_rate": 3.2166731243555505e-06, + "loss": 2.2118, + "step": 23038 + }, + { + "epoch": 1.9636069206511548, + "grad_norm": 75.47786032094567, + "learning_rate": 3.2162098961186965e-06, + "loss": 2.5488, + "step": 23039 + }, + { + "epoch": 1.9636921503451803, + "grad_norm": 58.20464045338748, + "learning_rate": 3.2157466854247123e-06, + "loss": 2.7398, + "step": 23040 + }, + { + "epoch": 1.9637773800392058, + "grad_norm": 49.89504886883176, + "learning_rate": 3.2152834922781485e-06, + "loss": 2.4612, + "step": 23041 + }, + { + "epoch": 1.963862609733231, + "grad_norm": 65.22368536785686, + "learning_rate": 3.2148203166835633e-06, + "loss": 3.4366, + "step": 23042 + }, + { + "epoch": 1.9639478394272565, + "grad_norm": 61.069768058707034, + "learning_rate": 3.2143571586455093e-06, + "loss": 2.2907, + "step": 23043 + }, + { + "epoch": 1.9640330691212817, + "grad_norm": 49.79193632805444, + "learning_rate": 3.2138940181685447e-06, + "loss": 3.2689, + "step": 23044 + }, + { + "epoch": 1.9641182988153072, + "grad_norm": 33.80140044419768, + "learning_rate": 3.213430895257222e-06, + "loss": 2.2598, + "step": 23045 + }, + { + "epoch": 1.9642035285093327, + "grad_norm": 44.30450157903345, + "learning_rate": 3.212967789916097e-06, + "loss": 3.1399, + "step": 23046 + }, + { + "epoch": 1.9642887582033581, + "grad_norm": 29.27416875136082, + "learning_rate": 3.2125047021497225e-06, + "loss": 2.0972, + "step": 23047 + }, + { + "epoch": 1.9643739878973836, + "grad_norm": 62.128638621170666, + "learning_rate": 3.2120416319626557e-06, + "loss": 3.0258, + "step": 23048 + }, + { + "epoch": 1.9644592175914088, + "grad_norm": 37.14791055637569, + "learning_rate": 3.211578579359448e-06, + "loss": 2.0525, + "step": 23049 + }, + { + "epoch": 1.964544447285434, + "grad_norm": 107.9250545713854, + "learning_rate": 3.2111155443446553e-06, + "loss": 3.7356, + "step": 23050 + }, + { + "epoch": 1.9646296769794596, + "grad_norm": 48.16993300669215, + "learning_rate": 3.2106525269228284e-06, + "loss": 2.506, + "step": 23051 + }, + { + "epoch": 1.964714906673485, + "grad_norm": 30.967958645468542, + "learning_rate": 3.210189527098525e-06, + "loss": 1.9404, + "step": 23052 + }, + { + "epoch": 1.9648001363675105, + "grad_norm": 27.48808453296988, + "learning_rate": 3.209726544876295e-06, + "loss": 1.7044, + "step": 23053 + }, + { + "epoch": 1.964885366061536, + "grad_norm": 56.53610585900581, + "learning_rate": 3.2092635802606942e-06, + "loss": 3.1698, + "step": 23054 + }, + { + "epoch": 1.9649705957555612, + "grad_norm": 122.42953389450516, + "learning_rate": 3.208800633256274e-06, + "loss": 4.8133, + "step": 23055 + }, + { + "epoch": 1.9650558254495867, + "grad_norm": 78.3405575645755, + "learning_rate": 3.2083377038675874e-06, + "loss": 3.1992, + "step": 23056 + }, + { + "epoch": 1.965141055143612, + "grad_norm": 35.996609380296405, + "learning_rate": 3.207874792099187e-06, + "loss": 2.5409, + "step": 23057 + }, + { + "epoch": 1.9652262848376374, + "grad_norm": 89.61783900138019, + "learning_rate": 3.207411897955628e-06, + "loss": 2.4959, + "step": 23058 + }, + { + "epoch": 1.9653115145316629, + "grad_norm": 51.62358171026554, + "learning_rate": 3.2069490214414588e-06, + "loss": 3.3329, + "step": 23059 + }, + { + "epoch": 1.9653967442256883, + "grad_norm": 40.58957624597216, + "learning_rate": 3.2064861625612344e-06, + "loss": 2.6285, + "step": 23060 + }, + { + "epoch": 1.9654819739197136, + "grad_norm": 47.52807535873067, + "learning_rate": 3.2060233213195047e-06, + "loss": 2.4414, + "step": 23061 + }, + { + "epoch": 1.965567203613739, + "grad_norm": 33.15259766246245, + "learning_rate": 3.2055604977208242e-06, + "loss": 2.3731, + "step": 23062 + }, + { + "epoch": 1.9656524333077643, + "grad_norm": 36.116550709596275, + "learning_rate": 3.205097691769743e-06, + "loss": 2.654, + "step": 23063 + }, + { + "epoch": 1.9657376630017898, + "grad_norm": 95.49052218189938, + "learning_rate": 3.2046349034708125e-06, + "loss": 2.4347, + "step": 23064 + }, + { + "epoch": 1.9658228926958152, + "grad_norm": 95.52374831884504, + "learning_rate": 3.2041721328285836e-06, + "loss": 2.8171, + "step": 23065 + }, + { + "epoch": 1.9659081223898407, + "grad_norm": 19.728480857557663, + "learning_rate": 3.2037093798476095e-06, + "loss": 1.3628, + "step": 23066 + }, + { + "epoch": 1.9659933520838662, + "grad_norm": 54.35363074118937, + "learning_rate": 3.203246644532439e-06, + "loss": 2.3864, + "step": 23067 + }, + { + "epoch": 1.9660785817778914, + "grad_norm": 46.94069615311674, + "learning_rate": 3.202783926887625e-06, + "loss": 3.2512, + "step": 23068 + }, + { + "epoch": 1.9661638114719167, + "grad_norm": 37.02112810004052, + "learning_rate": 3.202321226917714e-06, + "loss": 2.5459, + "step": 23069 + }, + { + "epoch": 1.9662490411659421, + "grad_norm": 37.3759474395967, + "learning_rate": 3.201858544627262e-06, + "loss": 2.9554, + "step": 23070 + }, + { + "epoch": 1.9663342708599676, + "grad_norm": 55.09617757574831, + "learning_rate": 3.201395880020816e-06, + "loss": 2.3954, + "step": 23071 + }, + { + "epoch": 1.966419500553993, + "grad_norm": 63.681916500799176, + "learning_rate": 3.2009332331029275e-06, + "loss": 3.8671, + "step": 23072 + }, + { + "epoch": 1.9665047302480185, + "grad_norm": 48.2629724933673, + "learning_rate": 3.200470603878144e-06, + "loss": 3.0336, + "step": 23073 + }, + { + "epoch": 1.9665899599420438, + "grad_norm": 35.781297003371215, + "learning_rate": 3.200007992351019e-06, + "loss": 2.8602, + "step": 23074 + }, + { + "epoch": 1.9666751896360692, + "grad_norm": 59.27855859515089, + "learning_rate": 3.1995453985260995e-06, + "loss": 2.5658, + "step": 23075 + }, + { + "epoch": 1.9667604193300945, + "grad_norm": 80.26906033439684, + "learning_rate": 3.1990828224079363e-06, + "loss": 3.0117, + "step": 23076 + }, + { + "epoch": 1.96684564902412, + "grad_norm": 81.36903651355028, + "learning_rate": 3.1986202640010753e-06, + "loss": 3.5816, + "step": 23077 + }, + { + "epoch": 1.9669308787181454, + "grad_norm": 67.5461070434657, + "learning_rate": 3.1981577233100706e-06, + "loss": 1.8579, + "step": 23078 + }, + { + "epoch": 1.967016108412171, + "grad_norm": 64.26446429614738, + "learning_rate": 3.1976952003394678e-06, + "loss": 1.9972, + "step": 23079 + }, + { + "epoch": 1.9671013381061964, + "grad_norm": 52.64982925535491, + "learning_rate": 3.1972326950938176e-06, + "loss": 2.0134, + "step": 23080 + }, + { + "epoch": 1.9671865678002216, + "grad_norm": 47.504863209390514, + "learning_rate": 3.1967702075776672e-06, + "loss": 3.7662, + "step": 23081 + }, + { + "epoch": 1.9672717974942469, + "grad_norm": 157.8419478189627, + "learning_rate": 3.1963077377955636e-06, + "loss": 3.7704, + "step": 23082 + }, + { + "epoch": 1.9673570271882723, + "grad_norm": 66.16129410555034, + "learning_rate": 3.1958452857520583e-06, + "loss": 1.8022, + "step": 23083 + }, + { + "epoch": 1.9674422568822978, + "grad_norm": 37.8873392043751, + "learning_rate": 3.195382851451697e-06, + "loss": 3.0351, + "step": 23084 + }, + { + "epoch": 1.9675274865763233, + "grad_norm": 34.722665976692134, + "learning_rate": 3.1949204348990297e-06, + "loss": 2.489, + "step": 23085 + }, + { + "epoch": 1.9676127162703487, + "grad_norm": 35.61367507255058, + "learning_rate": 3.1944580360986e-06, + "loss": 2.3452, + "step": 23086 + }, + { + "epoch": 1.967697945964374, + "grad_norm": 61.01919593612796, + "learning_rate": 3.1939956550549604e-06, + "loss": 2.9929, + "step": 23087 + }, + { + "epoch": 1.9677831756583994, + "grad_norm": 45.26380631659643, + "learning_rate": 3.1935332917726547e-06, + "loss": 2.673, + "step": 23088 + }, + { + "epoch": 1.9678684053524247, + "grad_norm": 58.063288850088256, + "learning_rate": 3.193070946256233e-06, + "loss": 2.9112, + "step": 23089 + }, + { + "epoch": 1.9679536350464502, + "grad_norm": 59.4220267543463, + "learning_rate": 3.1926086185102373e-06, + "loss": 3.643, + "step": 23090 + }, + { + "epoch": 1.9680388647404756, + "grad_norm": 82.94739211809502, + "learning_rate": 3.19214630853922e-06, + "loss": 2.1848, + "step": 23091 + }, + { + "epoch": 1.968124094434501, + "grad_norm": 91.31397267895727, + "learning_rate": 3.1916840163477247e-06, + "loss": 2.6978, + "step": 23092 + }, + { + "epoch": 1.9682093241285263, + "grad_norm": 42.61636174283332, + "learning_rate": 3.1912217419402998e-06, + "loss": 2.81, + "step": 23093 + }, + { + "epoch": 1.9682945538225518, + "grad_norm": 39.94625620331395, + "learning_rate": 3.190759485321488e-06, + "loss": 2.9036, + "step": 23094 + }, + { + "epoch": 1.968379783516577, + "grad_norm": 83.17970800686645, + "learning_rate": 3.1902972464958393e-06, + "loss": 3.4423, + "step": 23095 + }, + { + "epoch": 1.9684650132106025, + "grad_norm": 68.49580858466469, + "learning_rate": 3.189835025467898e-06, + "loss": 2.8125, + "step": 23096 + }, + { + "epoch": 1.968550242904628, + "grad_norm": 47.001295404737164, + "learning_rate": 3.1893728222422106e-06, + "loss": 3.0191, + "step": 23097 + }, + { + "epoch": 1.9686354725986535, + "grad_norm": 30.764883133385133, + "learning_rate": 3.1889106368233204e-06, + "loss": 2.2053, + "step": 23098 + }, + { + "epoch": 1.968720702292679, + "grad_norm": 37.8952523377724, + "learning_rate": 3.1884484692157747e-06, + "loss": 2.8475, + "step": 23099 + }, + { + "epoch": 1.9688059319867042, + "grad_norm": 48.283873412983404, + "learning_rate": 3.187986319424119e-06, + "loss": 1.8576, + "step": 23100 + }, + { + "epoch": 1.9688911616807294, + "grad_norm": 92.83674880664223, + "learning_rate": 3.1875241874528985e-06, + "loss": 4.1368, + "step": 23101 + }, + { + "epoch": 1.9689763913747549, + "grad_norm": 83.4093243719871, + "learning_rate": 3.187062073306656e-06, + "loss": 2.9821, + "step": 23102 + }, + { + "epoch": 1.9690616210687804, + "grad_norm": 83.71468343811253, + "learning_rate": 3.186599976989938e-06, + "loss": 2.9347, + "step": 23103 + }, + { + "epoch": 1.9691468507628058, + "grad_norm": 65.42401076906013, + "learning_rate": 3.186137898507289e-06, + "loss": 4.1855, + "step": 23104 + }, + { + "epoch": 1.9692320804568313, + "grad_norm": 33.87541915624607, + "learning_rate": 3.1856758378632534e-06, + "loss": 2.7259, + "step": 23105 + }, + { + "epoch": 1.9693173101508565, + "grad_norm": 40.51083428777608, + "learning_rate": 3.1852137950623744e-06, + "loss": 2.9729, + "step": 23106 + }, + { + "epoch": 1.969402539844882, + "grad_norm": 55.23002966995873, + "learning_rate": 3.1847517701091967e-06, + "loss": 2.7845, + "step": 23107 + }, + { + "epoch": 1.9694877695389073, + "grad_norm": 43.09064323802869, + "learning_rate": 3.1842897630082636e-06, + "loss": 2.897, + "step": 23108 + }, + { + "epoch": 1.9695729992329327, + "grad_norm": 51.24552728332085, + "learning_rate": 3.1838277737641205e-06, + "loss": 3.0927, + "step": 23109 + }, + { + "epoch": 1.9696582289269582, + "grad_norm": 40.148760153550015, + "learning_rate": 3.1833658023813085e-06, + "loss": 3.2188, + "step": 23110 + }, + { + "epoch": 1.9697434586209837, + "grad_norm": 49.12850883365253, + "learning_rate": 3.1829038488643726e-06, + "loss": 2.8478, + "step": 23111 + }, + { + "epoch": 1.969828688315009, + "grad_norm": 56.971012956644906, + "learning_rate": 3.182441913217854e-06, + "loss": 3.0335, + "step": 23112 + }, + { + "epoch": 1.9699139180090344, + "grad_norm": 39.13595648699427, + "learning_rate": 3.181979995446298e-06, + "loss": 2.0014, + "step": 23113 + }, + { + "epoch": 1.9699991477030596, + "grad_norm": 72.16348547488072, + "learning_rate": 3.1815180955542463e-06, + "loss": 3.744, + "step": 23114 + }, + { + "epoch": 1.970084377397085, + "grad_norm": 30.99275941957566, + "learning_rate": 3.181056213546242e-06, + "loss": 3.073, + "step": 23115 + }, + { + "epoch": 1.9701696070911106, + "grad_norm": 50.30312451581832, + "learning_rate": 3.1805943494268252e-06, + "loss": 1.9961, + "step": 23116 + }, + { + "epoch": 1.970254836785136, + "grad_norm": 100.06514120105685, + "learning_rate": 3.1801325032005424e-06, + "loss": 3.4152, + "step": 23117 + }, + { + "epoch": 1.9703400664791615, + "grad_norm": 54.288100156923726, + "learning_rate": 3.179670674871932e-06, + "loss": 2.0397, + "step": 23118 + }, + { + "epoch": 1.9704252961731867, + "grad_norm": 57.154099365660116, + "learning_rate": 3.1792088644455376e-06, + "loss": 2.3705, + "step": 23119 + }, + { + "epoch": 1.970510525867212, + "grad_norm": 34.57418798753192, + "learning_rate": 3.1787470719258997e-06, + "loss": 3.0815, + "step": 23120 + }, + { + "epoch": 1.9705957555612374, + "grad_norm": 19.38905875608003, + "learning_rate": 3.1782852973175616e-06, + "loss": 1.2551, + "step": 23121 + }, + { + "epoch": 1.970680985255263, + "grad_norm": 55.67875285248339, + "learning_rate": 3.177823540625063e-06, + "loss": 3.8258, + "step": 23122 + }, + { + "epoch": 1.9707662149492884, + "grad_norm": 30.623759219336904, + "learning_rate": 3.1773618018529474e-06, + "loss": 2.5244, + "step": 23123 + }, + { + "epoch": 1.9708514446433139, + "grad_norm": 41.414212089080074, + "learning_rate": 3.1769000810057516e-06, + "loss": 2.7783, + "step": 23124 + }, + { + "epoch": 1.970936674337339, + "grad_norm": 79.94541904518572, + "learning_rate": 3.176438378088021e-06, + "loss": 2.0823, + "step": 23125 + }, + { + "epoch": 1.9710219040313646, + "grad_norm": 49.23729206177993, + "learning_rate": 3.1759766931042933e-06, + "loss": 3.2342, + "step": 23126 + }, + { + "epoch": 1.9711071337253898, + "grad_norm": 39.820262174798835, + "learning_rate": 3.1755150260591115e-06, + "loss": 2.6403, + "step": 23127 + }, + { + "epoch": 1.9711923634194153, + "grad_norm": 43.114963829418244, + "learning_rate": 3.175053376957014e-06, + "loss": 2.7819, + "step": 23128 + }, + { + "epoch": 1.9712775931134408, + "grad_norm": 70.33657090061908, + "learning_rate": 3.17459174580254e-06, + "loss": 2.768, + "step": 23129 + }, + { + "epoch": 1.9713628228074662, + "grad_norm": 32.72935445928611, + "learning_rate": 3.174130132600232e-06, + "loss": 2.3788, + "step": 23130 + }, + { + "epoch": 1.9714480525014915, + "grad_norm": 111.27644430469769, + "learning_rate": 3.1736685373546285e-06, + "loss": 3.6024, + "step": 23131 + }, + { + "epoch": 1.971533282195517, + "grad_norm": 42.233702273020896, + "learning_rate": 3.1732069600702696e-06, + "loss": 1.7722, + "step": 23132 + }, + { + "epoch": 1.9716185118895422, + "grad_norm": 44.41797724734834, + "learning_rate": 3.1727454007516923e-06, + "loss": 0.9997, + "step": 23133 + }, + { + "epoch": 1.9717037415835676, + "grad_norm": 125.67871882547776, + "learning_rate": 3.1722838594034398e-06, + "loss": 2.4938, + "step": 23134 + }, + { + "epoch": 1.9717889712775931, + "grad_norm": 32.59760786603858, + "learning_rate": 3.1718223360300493e-06, + "loss": 2.9359, + "step": 23135 + }, + { + "epoch": 1.9718742009716186, + "grad_norm": 22.957228044798317, + "learning_rate": 3.1713608306360597e-06, + "loss": 1.6485, + "step": 23136 + }, + { + "epoch": 1.971959430665644, + "grad_norm": 60.790483203492585, + "learning_rate": 3.170899343226008e-06, + "loss": 2.9505, + "step": 23137 + }, + { + "epoch": 1.9720446603596693, + "grad_norm": 131.00378792934853, + "learning_rate": 3.1704378738044363e-06, + "loss": 4.1867, + "step": 23138 + }, + { + "epoch": 1.9721298900536945, + "grad_norm": 103.90871208994871, + "learning_rate": 3.16997642237588e-06, + "loss": 2.5491, + "step": 23139 + }, + { + "epoch": 1.97221511974772, + "grad_norm": 64.09613059588136, + "learning_rate": 3.16951498894488e-06, + "loss": 2.0484, + "step": 23140 + }, + { + "epoch": 1.9723003494417455, + "grad_norm": 71.2015615737233, + "learning_rate": 3.1690535735159713e-06, + "loss": 2.258, + "step": 23141 + }, + { + "epoch": 1.972385579135771, + "grad_norm": 68.57656986418104, + "learning_rate": 3.1685921760936933e-06, + "loss": 1.8492, + "step": 23142 + }, + { + "epoch": 1.9724708088297964, + "grad_norm": 39.49118418340021, + "learning_rate": 3.1681307966825837e-06, + "loss": 2.7544, + "step": 23143 + }, + { + "epoch": 1.9725560385238217, + "grad_norm": 53.52937242641437, + "learning_rate": 3.16766943528718e-06, + "loss": 3.1076, + "step": 23144 + }, + { + "epoch": 1.9726412682178471, + "grad_norm": 69.93689239887732, + "learning_rate": 3.167208091912019e-06, + "loss": 2.4986, + "step": 23145 + }, + { + "epoch": 1.9727264979118724, + "grad_norm": 103.68025095943912, + "learning_rate": 3.166746766561638e-06, + "loss": 3.5727, + "step": 23146 + }, + { + "epoch": 1.9728117276058978, + "grad_norm": 57.63587952175463, + "learning_rate": 3.166285459240574e-06, + "loss": 4.1906, + "step": 23147 + }, + { + "epoch": 1.9728969572999233, + "grad_norm": 88.73641447350234, + "learning_rate": 3.1658241699533654e-06, + "loss": 3.8413, + "step": 23148 + }, + { + "epoch": 1.9729821869939488, + "grad_norm": 27.41059254984815, + "learning_rate": 3.1653628987045456e-06, + "loss": 2.8592, + "step": 23149 + }, + { + "epoch": 1.9730674166879743, + "grad_norm": 70.37086321478199, + "learning_rate": 3.1649016454986524e-06, + "loss": 3.6737, + "step": 23150 + }, + { + "epoch": 1.9731526463819995, + "grad_norm": 66.08351645216831, + "learning_rate": 3.164440410340223e-06, + "loss": 3.1894, + "step": 23151 + }, + { + "epoch": 1.9732378760760247, + "grad_norm": 76.49764764321607, + "learning_rate": 3.1639791932337937e-06, + "loss": 2.1457, + "step": 23152 + }, + { + "epoch": 1.9733231057700502, + "grad_norm": 38.99065280229361, + "learning_rate": 3.163517994183898e-06, + "loss": 2.8829, + "step": 23153 + }, + { + "epoch": 1.9734083354640757, + "grad_norm": 38.338496317951304, + "learning_rate": 3.1630568131950736e-06, + "loss": 2.3479, + "step": 23154 + }, + { + "epoch": 1.9734935651581011, + "grad_norm": 64.39353097549814, + "learning_rate": 3.1625956502718558e-06, + "loss": 2.1861, + "step": 23155 + }, + { + "epoch": 1.9735787948521266, + "grad_norm": 49.079564150005936, + "learning_rate": 3.16213450541878e-06, + "loss": 2.4297, + "step": 23156 + }, + { + "epoch": 1.9736640245461519, + "grad_norm": 84.75044226495301, + "learning_rate": 3.1616733786403797e-06, + "loss": 3.348, + "step": 23157 + }, + { + "epoch": 1.9737492542401773, + "grad_norm": 37.21440756848419, + "learning_rate": 3.1612122699411928e-06, + "loss": 2.7107, + "step": 23158 + }, + { + "epoch": 1.9738344839342026, + "grad_norm": 42.36808956817993, + "learning_rate": 3.1607511793257506e-06, + "loss": 2.9198, + "step": 23159 + }, + { + "epoch": 1.973919713628228, + "grad_norm": 38.99825852330168, + "learning_rate": 3.160290106798592e-06, + "loss": 3.4371, + "step": 23160 + }, + { + "epoch": 1.9740049433222535, + "grad_norm": 62.94419874372942, + "learning_rate": 3.1598290523642476e-06, + "loss": 3.1501, + "step": 23161 + }, + { + "epoch": 1.974090173016279, + "grad_norm": 55.86869865440507, + "learning_rate": 3.1593680160272543e-06, + "loss": 2.5553, + "step": 23162 + }, + { + "epoch": 1.9741754027103042, + "grad_norm": 70.11853982651634, + "learning_rate": 3.1589069977921438e-06, + "loss": 3.3281, + "step": 23163 + }, + { + "epoch": 1.9742606324043297, + "grad_norm": 66.33481076920256, + "learning_rate": 3.158445997663453e-06, + "loss": 2.6606, + "step": 23164 + }, + { + "epoch": 1.974345862098355, + "grad_norm": 55.630776287764874, + "learning_rate": 3.1579850156457127e-06, + "loss": 2.4345, + "step": 23165 + }, + { + "epoch": 1.9744310917923804, + "grad_norm": 63.957591670425046, + "learning_rate": 3.157524051743459e-06, + "loss": 3.4541, + "step": 23166 + }, + { + "epoch": 1.9745163214864059, + "grad_norm": 57.23401949190594, + "learning_rate": 3.1570631059612224e-06, + "loss": 2.9671, + "step": 23167 + }, + { + "epoch": 1.9746015511804313, + "grad_norm": 45.82328355395042, + "learning_rate": 3.15660217830354e-06, + "loss": 2.4697, + "step": 23168 + }, + { + "epoch": 1.9746867808744568, + "grad_norm": 80.49730817872073, + "learning_rate": 3.1561412687749415e-06, + "loss": 3.7616, + "step": 23169 + }, + { + "epoch": 1.974772010568482, + "grad_norm": 73.40789558732881, + "learning_rate": 3.1556803773799616e-06, + "loss": 2.0531, + "step": 23170 + }, + { + "epoch": 1.9748572402625073, + "grad_norm": 58.17266548232829, + "learning_rate": 3.1552195041231327e-06, + "loss": 2.966, + "step": 23171 + }, + { + "epoch": 1.9749424699565328, + "grad_norm": 94.87689222776122, + "learning_rate": 3.154758649008984e-06, + "loss": 2.5307, + "step": 23172 + }, + { + "epoch": 1.9750276996505582, + "grad_norm": 96.00019434776188, + "learning_rate": 3.154297812042053e-06, + "loss": 3.763, + "step": 23173 + }, + { + "epoch": 1.9751129293445837, + "grad_norm": 61.449357659293796, + "learning_rate": 3.15383699322687e-06, + "loss": 3.253, + "step": 23174 + }, + { + "epoch": 1.9751981590386092, + "grad_norm": 29.67941039913069, + "learning_rate": 3.153376192567967e-06, + "loss": 2.4731, + "step": 23175 + }, + { + "epoch": 1.9752833887326344, + "grad_norm": 38.20150806389295, + "learning_rate": 3.152915410069873e-06, + "loss": 2.4624, + "step": 23176 + }, + { + "epoch": 1.97536861842666, + "grad_norm": 45.11094345611612, + "learning_rate": 3.152454645737123e-06, + "loss": 2.3383, + "step": 23177 + }, + { + "epoch": 1.9754538481206851, + "grad_norm": 34.42745825759337, + "learning_rate": 3.151993899574249e-06, + "loss": 2.8264, + "step": 23178 + }, + { + "epoch": 1.9755390778147106, + "grad_norm": 29.277820659312212, + "learning_rate": 3.15153317158578e-06, + "loss": 2.4196, + "step": 23179 + }, + { + "epoch": 1.975624307508736, + "grad_norm": 69.57136827760733, + "learning_rate": 3.1510724617762457e-06, + "loss": 3.2371, + "step": 23180 + }, + { + "epoch": 1.9757095372027615, + "grad_norm": 54.63757464418837, + "learning_rate": 3.1506117701501816e-06, + "loss": 2.4456, + "step": 23181 + }, + { + "epoch": 1.9757947668967868, + "grad_norm": 37.69593482692031, + "learning_rate": 3.150151096712114e-06, + "loss": 2.5079, + "step": 23182 + }, + { + "epoch": 1.9758799965908123, + "grad_norm": 73.33734194282874, + "learning_rate": 3.1496904414665776e-06, + "loss": 2.9951, + "step": 23183 + }, + { + "epoch": 1.9759652262848375, + "grad_norm": 38.95334822868872, + "learning_rate": 3.149229804418098e-06, + "loss": 2.9826, + "step": 23184 + }, + { + "epoch": 1.976050455978863, + "grad_norm": 55.17193352293171, + "learning_rate": 3.1487691855712087e-06, + "loss": 2.3804, + "step": 23185 + }, + { + "epoch": 1.9761356856728884, + "grad_norm": 33.07016674344254, + "learning_rate": 3.1483085849304384e-06, + "loss": 2.2403, + "step": 23186 + }, + { + "epoch": 1.976220915366914, + "grad_norm": 91.23606028870053, + "learning_rate": 3.1478480025003187e-06, + "loss": 3.0846, + "step": 23187 + }, + { + "epoch": 1.9763061450609394, + "grad_norm": 82.379989288405, + "learning_rate": 3.1473874382853774e-06, + "loss": 3.1407, + "step": 23188 + }, + { + "epoch": 1.9763913747549646, + "grad_norm": 48.004513908511974, + "learning_rate": 3.146926892290144e-06, + "loss": 2.8555, + "step": 23189 + }, + { + "epoch": 1.9764766044489899, + "grad_norm": 35.65257355525709, + "learning_rate": 3.1464663645191484e-06, + "loss": 2.4589, + "step": 23190 + }, + { + "epoch": 1.9765618341430153, + "grad_norm": 24.05769084080414, + "learning_rate": 3.1460058549769202e-06, + "loss": 2.4715, + "step": 23191 + }, + { + "epoch": 1.9766470638370408, + "grad_norm": 56.73871252448012, + "learning_rate": 3.145545363667987e-06, + "loss": 2.2603, + "step": 23192 + }, + { + "epoch": 1.9767322935310663, + "grad_norm": 33.79837595805924, + "learning_rate": 3.1450848905968785e-06, + "loss": 2.371, + "step": 23193 + }, + { + "epoch": 1.9768175232250917, + "grad_norm": 23.56386458984967, + "learning_rate": 3.1446244357681222e-06, + "loss": 2.4933, + "step": 23194 + }, + { + "epoch": 1.976902752919117, + "grad_norm": 98.26390490951653, + "learning_rate": 3.1441639991862493e-06, + "loss": 2.1109, + "step": 23195 + }, + { + "epoch": 1.9769879826131425, + "grad_norm": 58.83484849000896, + "learning_rate": 3.143703580855785e-06, + "loss": 2.121, + "step": 23196 + }, + { + "epoch": 1.9770732123071677, + "grad_norm": 75.6192275900023, + "learning_rate": 3.1432431807812576e-06, + "loss": 3.3587, + "step": 23197 + }, + { + "epoch": 1.9771584420011932, + "grad_norm": 34.18347226310406, + "learning_rate": 3.142782798967196e-06, + "loss": 2.4409, + "step": 23198 + }, + { + "epoch": 1.9772436716952186, + "grad_norm": 43.52661922087722, + "learning_rate": 3.1423224354181286e-06, + "loss": 3.2301, + "step": 23199 + }, + { + "epoch": 1.977328901389244, + "grad_norm": 93.87299709553207, + "learning_rate": 3.1418620901385812e-06, + "loss": 4.4065, + "step": 23200 + }, + { + "epoch": 1.9774141310832696, + "grad_norm": 33.39093703647878, + "learning_rate": 3.1414017631330823e-06, + "loss": 2.1853, + "step": 23201 + }, + { + "epoch": 1.9774993607772948, + "grad_norm": 61.62362812656439, + "learning_rate": 3.1409414544061566e-06, + "loss": 3.5085, + "step": 23202 + }, + { + "epoch": 1.97758459047132, + "grad_norm": 48.463848409321606, + "learning_rate": 3.1404811639623355e-06, + "loss": 3.0352, + "step": 23203 + }, + { + "epoch": 1.9776698201653455, + "grad_norm": 58.82664333325429, + "learning_rate": 3.1400208918061412e-06, + "loss": 2.5671, + "step": 23204 + }, + { + "epoch": 1.977755049859371, + "grad_norm": 26.125593451907033, + "learning_rate": 3.1395606379421046e-06, + "loss": 2.0749, + "step": 23205 + }, + { + "epoch": 1.9778402795533965, + "grad_norm": 76.02497742766433, + "learning_rate": 3.1391004023747474e-06, + "loss": 2.7051, + "step": 23206 + }, + { + "epoch": 1.977925509247422, + "grad_norm": 53.73967387899773, + "learning_rate": 3.1386401851086e-06, + "loss": 3.8178, + "step": 23207 + }, + { + "epoch": 1.9780107389414472, + "grad_norm": 50.38622251182793, + "learning_rate": 3.1381799861481858e-06, + "loss": 2.8796, + "step": 23208 + }, + { + "epoch": 1.9780959686354724, + "grad_norm": 89.18945441483248, + "learning_rate": 3.1377198054980327e-06, + "loss": 2.9215, + "step": 23209 + }, + { + "epoch": 1.978181198329498, + "grad_norm": 23.05434256478268, + "learning_rate": 3.137259643162663e-06, + "loss": 1.7577, + "step": 23210 + }, + { + "epoch": 1.9782664280235234, + "grad_norm": 35.179203549848275, + "learning_rate": 3.1367994991466076e-06, + "loss": 2.262, + "step": 23211 + }, + { + "epoch": 1.9783516577175488, + "grad_norm": 31.15841148663805, + "learning_rate": 3.136339373454387e-06, + "loss": 1.5998, + "step": 23212 + }, + { + "epoch": 1.9784368874115743, + "grad_norm": 40.89684147917684, + "learning_rate": 3.135879266090529e-06, + "loss": 2.7671, + "step": 23213 + }, + { + "epoch": 1.9785221171055996, + "grad_norm": 64.57132530028827, + "learning_rate": 3.1354191770595584e-06, + "loss": 2.6382, + "step": 23214 + }, + { + "epoch": 1.978607346799625, + "grad_norm": 70.20913549538108, + "learning_rate": 3.1349591063659967e-06, + "loss": 2.9374, + "step": 23215 + }, + { + "epoch": 1.9786925764936503, + "grad_norm": 53.14371763735061, + "learning_rate": 3.1344990540143716e-06, + "loss": 2.582, + "step": 23216 + }, + { + "epoch": 1.9787778061876757, + "grad_norm": 87.72389583291444, + "learning_rate": 3.1340390200092086e-06, + "loss": 2.6411, + "step": 23217 + }, + { + "epoch": 1.9788630358817012, + "grad_norm": 148.2512340863062, + "learning_rate": 3.133579004355031e-06, + "loss": 2.9174, + "step": 23218 + }, + { + "epoch": 1.9789482655757267, + "grad_norm": 39.16984557256265, + "learning_rate": 3.1331190070563588e-06, + "loss": 2.7646, + "step": 23219 + }, + { + "epoch": 1.9790334952697521, + "grad_norm": 53.51237976442566, + "learning_rate": 3.1326590281177216e-06, + "loss": 3.1172, + "step": 23220 + }, + { + "epoch": 1.9791187249637774, + "grad_norm": 64.12828042785956, + "learning_rate": 3.1321990675436414e-06, + "loss": 2.8756, + "step": 23221 + }, + { + "epoch": 1.9792039546578026, + "grad_norm": 41.514672245716, + "learning_rate": 3.1317391253386408e-06, + "loss": 2.0658, + "step": 23222 + }, + { + "epoch": 1.979289184351828, + "grad_norm": 60.70775611386336, + "learning_rate": 3.1312792015072425e-06, + "loss": 2.359, + "step": 23223 + }, + { + "epoch": 1.9793744140458536, + "grad_norm": 47.141568202658334, + "learning_rate": 3.1308192960539707e-06, + "loss": 2.054, + "step": 23224 + }, + { + "epoch": 1.979459643739879, + "grad_norm": 39.949039028057655, + "learning_rate": 3.13035940898335e-06, + "loss": 2.6931, + "step": 23225 + }, + { + "epoch": 1.9795448734339045, + "grad_norm": 40.291247547033045, + "learning_rate": 3.1298995402999012e-06, + "loss": 2.532, + "step": 23226 + }, + { + "epoch": 1.9796301031279298, + "grad_norm": 39.88677231956671, + "learning_rate": 3.1294396900081457e-06, + "loss": 2.3819, + "step": 23227 + }, + { + "epoch": 1.9797153328219552, + "grad_norm": 40.834004165583856, + "learning_rate": 3.1289798581126097e-06, + "loss": 2.7271, + "step": 23228 + }, + { + "epoch": 1.9798005625159805, + "grad_norm": 25.194349792751847, + "learning_rate": 3.1285200446178115e-06, + "loss": 1.7346, + "step": 23229 + }, + { + "epoch": 1.979885792210006, + "grad_norm": 31.241415130350845, + "learning_rate": 3.1280602495282765e-06, + "loss": 1.7121, + "step": 23230 + }, + { + "epoch": 1.9799710219040314, + "grad_norm": 50.62985337443207, + "learning_rate": 3.1276004728485245e-06, + "loss": 2.7844, + "step": 23231 + }, + { + "epoch": 1.9800562515980569, + "grad_norm": 181.79935025849608, + "learning_rate": 3.127140714583077e-06, + "loss": 3.6978, + "step": 23232 + }, + { + "epoch": 1.9801414812920821, + "grad_norm": 18.058640393287213, + "learning_rate": 3.126680974736457e-06, + "loss": 0.8847, + "step": 23233 + }, + { + "epoch": 1.9802267109861076, + "grad_norm": 35.29943910636581, + "learning_rate": 3.126221253313186e-06, + "loss": 3.2551, + "step": 23234 + }, + { + "epoch": 1.9803119406801328, + "grad_norm": 41.13943586418076, + "learning_rate": 3.125761550317783e-06, + "loss": 2.2709, + "step": 23235 + }, + { + "epoch": 1.9803971703741583, + "grad_norm": 46.069279775919156, + "learning_rate": 3.1253018657547714e-06, + "loss": 3.2174, + "step": 23236 + }, + { + "epoch": 1.9804824000681838, + "grad_norm": 46.21063141793698, + "learning_rate": 3.1248421996286704e-06, + "loss": 1.7285, + "step": 23237 + }, + { + "epoch": 1.9805676297622092, + "grad_norm": 43.898670865579525, + "learning_rate": 3.1243825519440017e-06, + "loss": 4.0299, + "step": 23238 + }, + { + "epoch": 1.9806528594562347, + "grad_norm": 32.03763304673255, + "learning_rate": 3.123922922705285e-06, + "loss": 3.0521, + "step": 23239 + }, + { + "epoch": 1.98073808915026, + "grad_norm": 56.02664748527049, + "learning_rate": 3.123463311917041e-06, + "loss": 2.1321, + "step": 23240 + }, + { + "epoch": 1.9808233188442852, + "grad_norm": 99.25577600297828, + "learning_rate": 3.123003719583789e-06, + "loss": 2.6979, + "step": 23241 + }, + { + "epoch": 1.9809085485383107, + "grad_norm": 63.696196912957326, + "learning_rate": 3.1225441457100513e-06, + "loss": 2.5348, + "step": 23242 + }, + { + "epoch": 1.9809937782323361, + "grad_norm": 45.79175448252787, + "learning_rate": 3.122084590300345e-06, + "loss": 2.5889, + "step": 23243 + }, + { + "epoch": 1.9810790079263616, + "grad_norm": 81.58590522286647, + "learning_rate": 3.1216250533591907e-06, + "loss": 2.4915, + "step": 23244 + }, + { + "epoch": 1.981164237620387, + "grad_norm": 37.643792934985214, + "learning_rate": 3.121165534891106e-06, + "loss": 2.7883, + "step": 23245 + }, + { + "epoch": 1.9812494673144123, + "grad_norm": 30.944917599687493, + "learning_rate": 3.1207060349006147e-06, + "loss": 2.8154, + "step": 23246 + }, + { + "epoch": 1.9813346970084378, + "grad_norm": 65.36490158428911, + "learning_rate": 3.1202465533922305e-06, + "loss": 2.6605, + "step": 23247 + }, + { + "epoch": 1.981419926702463, + "grad_norm": 26.812862776339276, + "learning_rate": 3.119787090370476e-06, + "loss": 2.18, + "step": 23248 + }, + { + "epoch": 1.9815051563964885, + "grad_norm": 41.23432877942753, + "learning_rate": 3.119327645839866e-06, + "loss": 2.913, + "step": 23249 + }, + { + "epoch": 1.981590386090514, + "grad_norm": 51.60884607443127, + "learning_rate": 3.1188682198049235e-06, + "loss": 1.5853, + "step": 23250 + }, + { + "epoch": 1.9816756157845394, + "grad_norm": 16.032844824353834, + "learning_rate": 3.118408812270164e-06, + "loss": 1.2034, + "step": 23251 + }, + { + "epoch": 1.9817608454785647, + "grad_norm": 46.43229711441521, + "learning_rate": 3.1179494232401064e-06, + "loss": 2.1747, + "step": 23252 + }, + { + "epoch": 1.9818460751725901, + "grad_norm": 53.10268528126932, + "learning_rate": 3.117490052719266e-06, + "loss": 3.1374, + "step": 23253 + }, + { + "epoch": 1.9819313048666154, + "grad_norm": 42.24006266092149, + "learning_rate": 3.1170307007121658e-06, + "loss": 3.1167, + "step": 23254 + }, + { + "epoch": 1.9820165345606409, + "grad_norm": 68.5837429922446, + "learning_rate": 3.1165713672233193e-06, + "loss": 3.0053, + "step": 23255 + }, + { + "epoch": 1.9821017642546663, + "grad_norm": 30.731119050796078, + "learning_rate": 3.1161120522572462e-06, + "loss": 2.1966, + "step": 23256 + }, + { + "epoch": 1.9821869939486918, + "grad_norm": 61.459811901965246, + "learning_rate": 3.1156527558184624e-06, + "loss": 3.3388, + "step": 23257 + }, + { + "epoch": 1.9822722236427173, + "grad_norm": 53.23524426112301, + "learning_rate": 3.115193477911482e-06, + "loss": 3.1516, + "step": 23258 + }, + { + "epoch": 1.9823574533367425, + "grad_norm": 36.50126962940393, + "learning_rate": 3.114734218540827e-06, + "loss": 2.9892, + "step": 23259 + }, + { + "epoch": 1.9824426830307678, + "grad_norm": 45.10908830999525, + "learning_rate": 3.1142749777110127e-06, + "loss": 2.8691, + "step": 23260 + }, + { + "epoch": 1.9825279127247932, + "grad_norm": 119.87566595397624, + "learning_rate": 3.1138157554265546e-06, + "loss": 3.5977, + "step": 23261 + }, + { + "epoch": 1.9826131424188187, + "grad_norm": 64.58063270651762, + "learning_rate": 3.113356551691967e-06, + "loss": 2.8494, + "step": 23262 + }, + { + "epoch": 1.9826983721128442, + "grad_norm": 40.47206677601553, + "learning_rate": 3.1128973665117683e-06, + "loss": 2.2934, + "step": 23263 + }, + { + "epoch": 1.9827836018068696, + "grad_norm": 77.66337887324705, + "learning_rate": 3.1124381998904762e-06, + "loss": 3.5212, + "step": 23264 + }, + { + "epoch": 1.9828688315008949, + "grad_norm": 63.90288521844097, + "learning_rate": 3.1119790518326037e-06, + "loss": 2.9893, + "step": 23265 + }, + { + "epoch": 1.9829540611949203, + "grad_norm": 37.02295819505021, + "learning_rate": 3.111519922342665e-06, + "loss": 2.6989, + "step": 23266 + }, + { + "epoch": 1.9830392908889456, + "grad_norm": 69.0910036902222, + "learning_rate": 3.1110608114251783e-06, + "loss": 2.5235, + "step": 23267 + }, + { + "epoch": 1.983124520582971, + "grad_norm": 66.76443806613709, + "learning_rate": 3.110601719084659e-06, + "loss": 2.7237, + "step": 23268 + }, + { + "epoch": 1.9832097502769965, + "grad_norm": 33.95839425646661, + "learning_rate": 3.1101426453256224e-06, + "loss": 2.0679, + "step": 23269 + }, + { + "epoch": 1.983294979971022, + "grad_norm": 70.49039390186093, + "learning_rate": 3.1096835901525786e-06, + "loss": 2.4641, + "step": 23270 + }, + { + "epoch": 1.9833802096650475, + "grad_norm": 70.08351191665353, + "learning_rate": 3.1092245535700467e-06, + "loss": 2.5853, + "step": 23271 + }, + { + "epoch": 1.9834654393590727, + "grad_norm": 44.33813110051344, + "learning_rate": 3.1087655355825408e-06, + "loss": 2.4511, + "step": 23272 + }, + { + "epoch": 1.983550669053098, + "grad_norm": 47.86250359533478, + "learning_rate": 3.1083065361945753e-06, + "loss": 2.0924, + "step": 23273 + }, + { + "epoch": 1.9836358987471234, + "grad_norm": 84.21709926648403, + "learning_rate": 3.1078475554106614e-06, + "loss": 3.2325, + "step": 23274 + }, + { + "epoch": 1.983721128441149, + "grad_norm": 57.76571909172865, + "learning_rate": 3.1073885932353143e-06, + "loss": 2.5373, + "step": 23275 + }, + { + "epoch": 1.9838063581351744, + "grad_norm": 58.013377386017964, + "learning_rate": 3.10692964967305e-06, + "loss": 2.1639, + "step": 23276 + }, + { + "epoch": 1.9838915878291998, + "grad_norm": 45.15496216465401, + "learning_rate": 3.1064707247283805e-06, + "loss": 3.0492, + "step": 23277 + }, + { + "epoch": 1.983976817523225, + "grad_norm": 54.96043605151079, + "learning_rate": 3.106011818405818e-06, + "loss": 2.2483, + "step": 23278 + }, + { + "epoch": 1.9840620472172503, + "grad_norm": 66.49706191573452, + "learning_rate": 3.1055529307098764e-06, + "loss": 2.547, + "step": 23279 + }, + { + "epoch": 1.9841472769112758, + "grad_norm": 86.23147140854819, + "learning_rate": 3.10509406164507e-06, + "loss": 4.2861, + "step": 23280 + }, + { + "epoch": 1.9842325066053013, + "grad_norm": 43.87105606902964, + "learning_rate": 3.1046352112159106e-06, + "loss": 3.0469, + "step": 23281 + }, + { + "epoch": 1.9843177362993267, + "grad_norm": 40.35804896522922, + "learning_rate": 3.10417637942691e-06, + "loss": 2.42, + "step": 23282 + }, + { + "epoch": 1.9844029659933522, + "grad_norm": 47.1645420340922, + "learning_rate": 3.1037175662825812e-06, + "loss": 3.2821, + "step": 23283 + }, + { + "epoch": 1.9844881956873774, + "grad_norm": 36.826326211216156, + "learning_rate": 3.103258771787437e-06, + "loss": 2.1025, + "step": 23284 + }, + { + "epoch": 1.984573425381403, + "grad_norm": 31.089906040519462, + "learning_rate": 3.10279999594599e-06, + "loss": 1.9079, + "step": 23285 + }, + { + "epoch": 1.9846586550754282, + "grad_norm": 39.698117019570624, + "learning_rate": 3.10234123876275e-06, + "loss": 2.5691, + "step": 23286 + }, + { + "epoch": 1.9847438847694536, + "grad_norm": 36.282240464382305, + "learning_rate": 3.101882500242231e-06, + "loss": 2.2281, + "step": 23287 + }, + { + "epoch": 1.984829114463479, + "grad_norm": 37.19604658604044, + "learning_rate": 3.1014237803889414e-06, + "loss": 2.6957, + "step": 23288 + }, + { + "epoch": 1.9849143441575046, + "grad_norm": 51.7925040449913, + "learning_rate": 3.1009650792073965e-06, + "loss": 3.0791, + "step": 23289 + }, + { + "epoch": 1.98499957385153, + "grad_norm": 45.35567390212294, + "learning_rate": 3.1005063967021045e-06, + "loss": 2.6233, + "step": 23290 + }, + { + "epoch": 1.9850848035455553, + "grad_norm": 44.424211018966716, + "learning_rate": 3.100047732877578e-06, + "loss": 2.779, + "step": 23291 + }, + { + "epoch": 1.9851700332395805, + "grad_norm": 64.8515125470404, + "learning_rate": 3.099589087738325e-06, + "loss": 3.5912, + "step": 23292 + }, + { + "epoch": 1.985255262933606, + "grad_norm": 58.427929758418266, + "learning_rate": 3.0991304612888602e-06, + "loss": 2.5859, + "step": 23293 + }, + { + "epoch": 1.9853404926276315, + "grad_norm": 44.989739015717, + "learning_rate": 3.0986718535336913e-06, + "loss": 3.4574, + "step": 23294 + }, + { + "epoch": 1.985425722321657, + "grad_norm": 79.9023588784905, + "learning_rate": 3.09821326447733e-06, + "loss": 3.2067, + "step": 23295 + }, + { + "epoch": 1.9855109520156824, + "grad_norm": 35.7571999236792, + "learning_rate": 3.0977546941242834e-06, + "loss": 2.151, + "step": 23296 + }, + { + "epoch": 1.9855961817097076, + "grad_norm": 36.33313709926269, + "learning_rate": 3.097296142479066e-06, + "loss": 2.6196, + "step": 23297 + }, + { + "epoch": 1.985681411403733, + "grad_norm": 40.32584139088609, + "learning_rate": 3.096837609546184e-06, + "loss": 2.7096, + "step": 23298 + }, + { + "epoch": 1.9857666410977584, + "grad_norm": 30.84014828415009, + "learning_rate": 3.0963790953301482e-06, + "loss": 2.0649, + "step": 23299 + }, + { + "epoch": 1.9858518707917838, + "grad_norm": 57.476592544872915, + "learning_rate": 3.0959205998354656e-06, + "loss": 2.9052, + "step": 23300 + }, + { + "epoch": 1.9859371004858093, + "grad_norm": 60.02049012368564, + "learning_rate": 3.0954621230666502e-06, + "loss": 3.8581, + "step": 23301 + }, + { + "epoch": 1.9860223301798348, + "grad_norm": 55.63514402774089, + "learning_rate": 3.0950036650282067e-06, + "loss": 3.1757, + "step": 23302 + }, + { + "epoch": 1.98610755987386, + "grad_norm": 49.17474079911882, + "learning_rate": 3.0945452257246454e-06, + "loss": 2.1221, + "step": 23303 + }, + { + "epoch": 1.9861927895678855, + "grad_norm": 54.430085248153254, + "learning_rate": 3.0940868051604755e-06, + "loss": 2.1883, + "step": 23304 + }, + { + "epoch": 1.9862780192619107, + "grad_norm": 36.350824655459334, + "learning_rate": 3.093628403340202e-06, + "loss": 2.6258, + "step": 23305 + }, + { + "epoch": 1.9863632489559362, + "grad_norm": 31.254438725649074, + "learning_rate": 3.0931700202683366e-06, + "loss": 2.7529, + "step": 23306 + }, + { + "epoch": 1.9864484786499617, + "grad_norm": 94.45207681069405, + "learning_rate": 3.092711655949388e-06, + "loss": 4.5993, + "step": 23307 + }, + { + "epoch": 1.9865337083439871, + "grad_norm": 97.71633724463874, + "learning_rate": 3.092253310387862e-06, + "loss": 3.4054, + "step": 23308 + }, + { + "epoch": 1.9866189380380126, + "grad_norm": 34.68179154614859, + "learning_rate": 3.0917949835882643e-06, + "loss": 1.8553, + "step": 23309 + }, + { + "epoch": 1.9867041677320378, + "grad_norm": 51.44881502840903, + "learning_rate": 3.0913366755551065e-06, + "loss": 3.0588, + "step": 23310 + }, + { + "epoch": 1.986789397426063, + "grad_norm": 58.23053605450481, + "learning_rate": 3.0908783862928948e-06, + "loss": 3.4878, + "step": 23311 + }, + { + "epoch": 1.9868746271200886, + "grad_norm": 32.07779728542285, + "learning_rate": 3.090420115806135e-06, + "loss": 1.782, + "step": 23312 + }, + { + "epoch": 1.986959856814114, + "grad_norm": 87.49700976301713, + "learning_rate": 3.0899618640993335e-06, + "loss": 3.3359, + "step": 23313 + }, + { + "epoch": 1.9870450865081395, + "grad_norm": 120.14214052742388, + "learning_rate": 3.0895036311769977e-06, + "loss": 3.0001, + "step": 23314 + }, + { + "epoch": 1.987130316202165, + "grad_norm": 49.55351593509615, + "learning_rate": 3.089045417043637e-06, + "loss": 2.9561, + "step": 23315 + }, + { + "epoch": 1.9872155458961902, + "grad_norm": 69.23566478421867, + "learning_rate": 3.0885872217037544e-06, + "loss": 2.2504, + "step": 23316 + }, + { + "epoch": 1.9873007755902157, + "grad_norm": 55.62327660227181, + "learning_rate": 3.088129045161856e-06, + "loss": 1.2103, + "step": 23317 + }, + { + "epoch": 1.987386005284241, + "grad_norm": 40.57052459100798, + "learning_rate": 3.087670887422448e-06, + "loss": 2.7444, + "step": 23318 + }, + { + "epoch": 1.9874712349782664, + "grad_norm": 44.62518143700017, + "learning_rate": 3.0872127484900394e-06, + "loss": 3.2848, + "step": 23319 + }, + { + "epoch": 1.9875564646722919, + "grad_norm": 53.92134812556917, + "learning_rate": 3.0867546283691323e-06, + "loss": 1.9492, + "step": 23320 + }, + { + "epoch": 1.9876416943663173, + "grad_norm": 50.174159555152556, + "learning_rate": 3.0862965270642333e-06, + "loss": 3.0962, + "step": 23321 + }, + { + "epoch": 1.9877269240603426, + "grad_norm": 36.89295191867207, + "learning_rate": 3.085838444579845e-06, + "loss": 2.2393, + "step": 23322 + }, + { + "epoch": 1.987812153754368, + "grad_norm": 34.51985415981923, + "learning_rate": 3.0853803809204784e-06, + "loss": 2.4608, + "step": 23323 + }, + { + "epoch": 1.9878973834483933, + "grad_norm": 55.080253777597676, + "learning_rate": 3.0849223360906342e-06, + "loss": 3.3966, + "step": 23324 + }, + { + "epoch": 1.9879826131424188, + "grad_norm": 22.67760103900347, + "learning_rate": 3.0844643100948175e-06, + "loss": 1.4039, + "step": 23325 + }, + { + "epoch": 1.9880678428364442, + "grad_norm": 33.50588347020788, + "learning_rate": 3.084006302937532e-06, + "loss": 3.2295, + "step": 23326 + }, + { + "epoch": 1.9881530725304697, + "grad_norm": 43.45166776387722, + "learning_rate": 3.083548314623285e-06, + "loss": 2.4572, + "step": 23327 + }, + { + "epoch": 1.9882383022244952, + "grad_norm": 61.60372429437437, + "learning_rate": 3.0830903451565785e-06, + "loss": 3.1631, + "step": 23328 + }, + { + "epoch": 1.9883235319185204, + "grad_norm": 97.8066389289644, + "learning_rate": 3.0826323945419155e-06, + "loss": 2.9547, + "step": 23329 + }, + { + "epoch": 1.9884087616125456, + "grad_norm": 49.52509642716241, + "learning_rate": 3.0821744627838026e-06, + "loss": 2.3469, + "step": 23330 + }, + { + "epoch": 1.9884939913065711, + "grad_norm": 65.41508558632746, + "learning_rate": 3.0817165498867396e-06, + "loss": 3.3729, + "step": 23331 + }, + { + "epoch": 1.9885792210005966, + "grad_norm": 46.43969995382225, + "learning_rate": 3.0812586558552344e-06, + "loss": 1.7406, + "step": 23332 + }, + { + "epoch": 1.988664450694622, + "grad_norm": 60.97016965458522, + "learning_rate": 3.0808007806937866e-06, + "loss": 2.4663, + "step": 23333 + }, + { + "epoch": 1.9887496803886475, + "grad_norm": 55.903953481529676, + "learning_rate": 3.0803429244069017e-06, + "loss": 2.4057, + "step": 23334 + }, + { + "epoch": 1.9888349100826728, + "grad_norm": 49.94109733057517, + "learning_rate": 3.0798850869990794e-06, + "loss": 2.1441, + "step": 23335 + }, + { + "epoch": 1.9889201397766982, + "grad_norm": 25.091731850509365, + "learning_rate": 3.0794272684748263e-06, + "loss": 1.9197, + "step": 23336 + }, + { + "epoch": 1.9890053694707235, + "grad_norm": 57.57680626238664, + "learning_rate": 3.0789694688386413e-06, + "loss": 3.3894, + "step": 23337 + }, + { + "epoch": 1.989090599164749, + "grad_norm": 44.6893771970379, + "learning_rate": 3.0785116880950294e-06, + "loss": 3.009, + "step": 23338 + }, + { + "epoch": 1.9891758288587744, + "grad_norm": 69.11148979427955, + "learning_rate": 3.07805392624849e-06, + "loss": 2.2827, + "step": 23339 + }, + { + "epoch": 1.9892610585527999, + "grad_norm": 69.91128563034017, + "learning_rate": 3.0775961833035285e-06, + "loss": 3.0318, + "step": 23340 + }, + { + "epoch": 1.9893462882468254, + "grad_norm": 97.08378262281745, + "learning_rate": 3.0771384592646423e-06, + "loss": 3.4432, + "step": 23341 + }, + { + "epoch": 1.9894315179408506, + "grad_norm": 37.29378069448026, + "learning_rate": 3.076680754136338e-06, + "loss": 2.7789, + "step": 23342 + }, + { + "epoch": 1.9895167476348758, + "grad_norm": 120.93248139098453, + "learning_rate": 3.0762230679231115e-06, + "loss": 4.5839, + "step": 23343 + }, + { + "epoch": 1.9896019773289013, + "grad_norm": 66.52100768284943, + "learning_rate": 3.075765400629469e-06, + "loss": 3.8015, + "step": 23344 + }, + { + "epoch": 1.9896872070229268, + "grad_norm": 37.85802422803807, + "learning_rate": 3.075307752259908e-06, + "loss": 3.0531, + "step": 23345 + }, + { + "epoch": 1.9897724367169523, + "grad_norm": 38.697428055701444, + "learning_rate": 3.074850122818931e-06, + "loss": 2.6962, + "step": 23346 + }, + { + "epoch": 1.9898576664109777, + "grad_norm": 39.46166777898273, + "learning_rate": 3.0743925123110386e-06, + "loss": 1.0971, + "step": 23347 + }, + { + "epoch": 1.989942896105003, + "grad_norm": 92.07388020446209, + "learning_rate": 3.0739349207407286e-06, + "loss": 3.7334, + "step": 23348 + }, + { + "epoch": 1.9900281257990284, + "grad_norm": 62.140711785477855, + "learning_rate": 3.0734773481125045e-06, + "loss": 3.425, + "step": 23349 + }, + { + "epoch": 1.9901133554930537, + "grad_norm": 36.41737040878581, + "learning_rate": 3.073019794430866e-06, + "loss": 2.3013, + "step": 23350 + }, + { + "epoch": 1.9901985851870791, + "grad_norm": 71.47648110097823, + "learning_rate": 3.072562259700312e-06, + "loss": 2.7268, + "step": 23351 + }, + { + "epoch": 1.9902838148811046, + "grad_norm": 43.395830279528575, + "learning_rate": 3.0721047439253404e-06, + "loss": 2.6774, + "step": 23352 + }, + { + "epoch": 1.99036904457513, + "grad_norm": 65.71092997123017, + "learning_rate": 3.071647247110453e-06, + "loss": 2.8172, + "step": 23353 + }, + { + "epoch": 1.9904542742691553, + "grad_norm": 69.13000146190147, + "learning_rate": 3.0711897692601505e-06, + "loss": 2.7284, + "step": 23354 + }, + { + "epoch": 1.9905395039631808, + "grad_norm": 46.07303192456861, + "learning_rate": 3.07073231037893e-06, + "loss": 3.0023, + "step": 23355 + }, + { + "epoch": 1.990624733657206, + "grad_norm": 58.37862985609769, + "learning_rate": 3.070274870471288e-06, + "loss": 2.9699, + "step": 23356 + }, + { + "epoch": 1.9907099633512315, + "grad_norm": 58.09580961333263, + "learning_rate": 3.069817449541728e-06, + "loss": 2.5292, + "step": 23357 + }, + { + "epoch": 1.990795193045257, + "grad_norm": 97.2712233180224, + "learning_rate": 3.069360047594746e-06, + "loss": 3.3387, + "step": 23358 + }, + { + "epoch": 1.9908804227392825, + "grad_norm": 79.52277560565528, + "learning_rate": 3.0689026646348418e-06, + "loss": 2.6402, + "step": 23359 + }, + { + "epoch": 1.990965652433308, + "grad_norm": 58.427737224327736, + "learning_rate": 3.0684453006665117e-06, + "loss": 2.9996, + "step": 23360 + }, + { + "epoch": 1.9910508821273332, + "grad_norm": 37.11630710712001, + "learning_rate": 3.0679879556942525e-06, + "loss": 2.5249, + "step": 23361 + }, + { + "epoch": 1.9911361118213584, + "grad_norm": 24.698552623404506, + "learning_rate": 3.0675306297225673e-06, + "loss": 2.0892, + "step": 23362 + }, + { + "epoch": 1.9912213415153839, + "grad_norm": 48.507788154773884, + "learning_rate": 3.067073322755949e-06, + "loss": 2.2341, + "step": 23363 + }, + { + "epoch": 1.9913065712094093, + "grad_norm": 52.36452251914291, + "learning_rate": 3.0666160347988964e-06, + "loss": 2.8787, + "step": 23364 + }, + { + "epoch": 1.9913918009034348, + "grad_norm": 60.49701407811058, + "learning_rate": 3.0661587658559062e-06, + "loss": 1.9398, + "step": 23365 + }, + { + "epoch": 1.9914770305974603, + "grad_norm": 40.64147241100755, + "learning_rate": 3.0657015159314773e-06, + "loss": 2.5073, + "step": 23366 + }, + { + "epoch": 1.9915622602914855, + "grad_norm": 61.338009039291016, + "learning_rate": 3.0652442850301056e-06, + "loss": 2.6243, + "step": 23367 + }, + { + "epoch": 1.991647489985511, + "grad_norm": 75.72260963688564, + "learning_rate": 3.0647870731562868e-06, + "loss": 3.2726, + "step": 23368 + }, + { + "epoch": 1.9917327196795362, + "grad_norm": 61.12650088992516, + "learning_rate": 3.0643298803145173e-06, + "loss": 2.7244, + "step": 23369 + }, + { + "epoch": 1.9918179493735617, + "grad_norm": 100.54173445732661, + "learning_rate": 3.0638727065092962e-06, + "loss": 4.1668, + "step": 23370 + }, + { + "epoch": 1.9919031790675872, + "grad_norm": 27.191294065569405, + "learning_rate": 3.063415551745117e-06, + "loss": 2.456, + "step": 23371 + }, + { + "epoch": 1.9919884087616127, + "grad_norm": 37.01983001499961, + "learning_rate": 3.0629584160264758e-06, + "loss": 2.8345, + "step": 23372 + }, + { + "epoch": 1.992073638455638, + "grad_norm": 60.671760301632986, + "learning_rate": 3.0625012993578685e-06, + "loss": 2.479, + "step": 23373 + }, + { + "epoch": 1.9921588681496634, + "grad_norm": 35.58590136484411, + "learning_rate": 3.0620442017437926e-06, + "loss": 2.4715, + "step": 23374 + }, + { + "epoch": 1.9922440978436886, + "grad_norm": 42.09525375720408, + "learning_rate": 3.061587123188742e-06, + "loss": 3.4503, + "step": 23375 + }, + { + "epoch": 1.992329327537714, + "grad_norm": 44.92339723191449, + "learning_rate": 3.06113006369721e-06, + "loss": 3.3256, + "step": 23376 + }, + { + "epoch": 1.9924145572317395, + "grad_norm": 59.22527066747636, + "learning_rate": 3.060673023273695e-06, + "loss": 2.0267, + "step": 23377 + }, + { + "epoch": 1.992499786925765, + "grad_norm": 76.19043845259536, + "learning_rate": 3.0602160019226877e-06, + "loss": 3.6974, + "step": 23378 + }, + { + "epoch": 1.9925850166197905, + "grad_norm": 31.114990874372687, + "learning_rate": 3.0597589996486874e-06, + "loss": 2.0249, + "step": 23379 + }, + { + "epoch": 1.9926702463138157, + "grad_norm": 74.50092211992286, + "learning_rate": 3.059302016456186e-06, + "loss": 2.6376, + "step": 23380 + }, + { + "epoch": 1.992755476007841, + "grad_norm": 26.02101651506473, + "learning_rate": 3.058845052349679e-06, + "loss": 1.8531, + "step": 23381 + }, + { + "epoch": 1.9928407057018664, + "grad_norm": 46.97973026558555, + "learning_rate": 3.0583881073336573e-06, + "loss": 2.7968, + "step": 23382 + }, + { + "epoch": 1.992925935395892, + "grad_norm": 59.741733909583296, + "learning_rate": 3.057931181412619e-06, + "loss": 0.7815, + "step": 23383 + }, + { + "epoch": 1.9930111650899174, + "grad_norm": 46.164488888132766, + "learning_rate": 3.0574742745910545e-06, + "loss": 2.1702, + "step": 23384 + }, + { + "epoch": 1.9930963947839428, + "grad_norm": 47.8000636159519, + "learning_rate": 3.0570173868734603e-06, + "loss": 2.571, + "step": 23385 + }, + { + "epoch": 1.993181624477968, + "grad_norm": 56.30848969707365, + "learning_rate": 3.056560518264326e-06, + "loss": 2.4757, + "step": 23386 + }, + { + "epoch": 1.9932668541719936, + "grad_norm": 42.163907896823076, + "learning_rate": 3.0561036687681484e-06, + "loss": 2.7014, + "step": 23387 + }, + { + "epoch": 1.9933520838660188, + "grad_norm": 127.84952819016821, + "learning_rate": 3.0556468383894178e-06, + "loss": 1.7672, + "step": 23388 + }, + { + "epoch": 1.9934373135600443, + "grad_norm": 45.228271545527456, + "learning_rate": 3.055190027132629e-06, + "loss": 2.7265, + "step": 23389 + }, + { + "epoch": 1.9935225432540697, + "grad_norm": 115.07749756242663, + "learning_rate": 3.054733235002274e-06, + "loss": 3.0348, + "step": 23390 + }, + { + "epoch": 1.9936077729480952, + "grad_norm": 67.66724256290715, + "learning_rate": 3.054276462002842e-06, + "loss": 3.2742, + "step": 23391 + }, + { + "epoch": 1.9936930026421205, + "grad_norm": 40.551206613939435, + "learning_rate": 3.053819708138829e-06, + "loss": 3.2265, + "step": 23392 + }, + { + "epoch": 1.993778232336146, + "grad_norm": 16.338689555095705, + "learning_rate": 3.0533629734147265e-06, + "loss": 1.2029, + "step": 23393 + }, + { + "epoch": 1.9938634620301712, + "grad_norm": 40.53037598212126, + "learning_rate": 3.0529062578350264e-06, + "loss": 2.16, + "step": 23394 + }, + { + "epoch": 1.9939486917241966, + "grad_norm": 70.45192080907454, + "learning_rate": 3.052449561404217e-06, + "loss": 1.8012, + "step": 23395 + }, + { + "epoch": 1.994033921418222, + "grad_norm": 55.946443236070785, + "learning_rate": 3.051992884126793e-06, + "loss": 2.7813, + "step": 23396 + }, + { + "epoch": 1.9941191511122476, + "grad_norm": 79.64008256348825, + "learning_rate": 3.0515362260072466e-06, + "loss": 3.6198, + "step": 23397 + }, + { + "epoch": 1.994204380806273, + "grad_norm": 55.21077468262977, + "learning_rate": 3.051079587050067e-06, + "loss": 3.2584, + "step": 23398 + }, + { + "epoch": 1.9942896105002983, + "grad_norm": 100.0464685212587, + "learning_rate": 3.050622967259743e-06, + "loss": 2.6767, + "step": 23399 + }, + { + "epoch": 1.9943748401943235, + "grad_norm": 17.533558867157588, + "learning_rate": 3.0501663666407688e-06, + "loss": 1.1443, + "step": 23400 + }, + { + "epoch": 1.994460069888349, + "grad_norm": 44.15109324279227, + "learning_rate": 3.0497097851976344e-06, + "loss": 2.8071, + "step": 23401 + }, + { + "epoch": 1.9945452995823745, + "grad_norm": 31.54690113109765, + "learning_rate": 3.0492532229348293e-06, + "loss": 1.8859, + "step": 23402 + }, + { + "epoch": 1.9946305292764, + "grad_norm": 38.00749329898735, + "learning_rate": 3.048796679856843e-06, + "loss": 2.6792, + "step": 23403 + }, + { + "epoch": 1.9947157589704254, + "grad_norm": 24.088474302023904, + "learning_rate": 3.0483401559681648e-06, + "loss": 1.2825, + "step": 23404 + }, + { + "epoch": 1.9948009886644507, + "grad_norm": 70.00169775003413, + "learning_rate": 3.0478836512732873e-06, + "loss": 2.8499, + "step": 23405 + }, + { + "epoch": 1.9948862183584761, + "grad_norm": 27.940213386634305, + "learning_rate": 3.0474271657766983e-06, + "loss": 1.7168, + "step": 23406 + }, + { + "epoch": 1.9949714480525014, + "grad_norm": 45.093794826419085, + "learning_rate": 3.0469706994828875e-06, + "loss": 2.8069, + "step": 23407 + }, + { + "epoch": 1.9950566777465268, + "grad_norm": 40.366247524976025, + "learning_rate": 3.0465142523963415e-06, + "loss": 3.0121, + "step": 23408 + }, + { + "epoch": 1.9951419074405523, + "grad_norm": 28.70020481648195, + "learning_rate": 3.046057824521554e-06, + "loss": 2.2482, + "step": 23409 + }, + { + "epoch": 1.9952271371345778, + "grad_norm": 26.545299926243445, + "learning_rate": 3.045601415863011e-06, + "loss": 1.3569, + "step": 23410 + }, + { + "epoch": 1.9953123668286032, + "grad_norm": 44.2844974928114, + "learning_rate": 3.0451450264252016e-06, + "loss": 2.4891, + "step": 23411 + }, + { + "epoch": 1.9953975965226285, + "grad_norm": 25.641858463013275, + "learning_rate": 3.0446886562126122e-06, + "loss": 2.5627, + "step": 23412 + }, + { + "epoch": 1.9954828262166537, + "grad_norm": 50.290119577492, + "learning_rate": 3.0442323052297346e-06, + "loss": 3.0072, + "step": 23413 + }, + { + "epoch": 1.9955680559106792, + "grad_norm": 28.748743462514984, + "learning_rate": 3.043775973481056e-06, + "loss": 2.1526, + "step": 23414 + }, + { + "epoch": 1.9956532856047047, + "grad_norm": 43.488353863072085, + "learning_rate": 3.0433196609710613e-06, + "loss": 2.6295, + "step": 23415 + }, + { + "epoch": 1.9957385152987301, + "grad_norm": 44.74330382410216, + "learning_rate": 3.0428633677042396e-06, + "loss": 2.6455, + "step": 23416 + }, + { + "epoch": 1.9958237449927556, + "grad_norm": 33.203333005134894, + "learning_rate": 3.042407093685081e-06, + "loss": 2.0018, + "step": 23417 + }, + { + "epoch": 1.9959089746867809, + "grad_norm": 35.60014998540319, + "learning_rate": 3.041950838918072e-06, + "loss": 2.1434, + "step": 23418 + }, + { + "epoch": 1.9959942043808063, + "grad_norm": 27.62870006422847, + "learning_rate": 3.0414946034076965e-06, + "loss": 2.1727, + "step": 23419 + }, + { + "epoch": 1.9960794340748316, + "grad_norm": 33.65111634705863, + "learning_rate": 3.0410383871584437e-06, + "loss": 2.1312, + "step": 23420 + }, + { + "epoch": 1.996164663768857, + "grad_norm": 43.77710970336774, + "learning_rate": 3.040582190174799e-06, + "loss": 1.5584, + "step": 23421 + }, + { + "epoch": 1.9962498934628825, + "grad_norm": 72.9856719287839, + "learning_rate": 3.0401260124612508e-06, + "loss": 3.0579, + "step": 23422 + }, + { + "epoch": 1.996335123156908, + "grad_norm": 52.13673956341767, + "learning_rate": 3.0396698540222846e-06, + "loss": 2.0488, + "step": 23423 + }, + { + "epoch": 1.9964203528509332, + "grad_norm": 46.30376137525895, + "learning_rate": 3.0392137148623867e-06, + "loss": 2.4115, + "step": 23424 + }, + { + "epoch": 1.9965055825449587, + "grad_norm": 29.47572153768925, + "learning_rate": 3.038757594986041e-06, + "loss": 1.848, + "step": 23425 + }, + { + "epoch": 1.996590812238984, + "grad_norm": 98.70042878292816, + "learning_rate": 3.0383014943977374e-06, + "loss": 3.4437, + "step": 23426 + }, + { + "epoch": 1.9966760419330094, + "grad_norm": 58.233057300216124, + "learning_rate": 3.0378454131019576e-06, + "loss": 3.7934, + "step": 23427 + }, + { + "epoch": 1.9967612716270349, + "grad_norm": 42.56609124400003, + "learning_rate": 3.0373893511031893e-06, + "loss": 2.2227, + "step": 23428 + }, + { + "epoch": 1.9968465013210603, + "grad_norm": 50.54201089823967, + "learning_rate": 3.036933308405915e-06, + "loss": 3.1244, + "step": 23429 + }, + { + "epoch": 1.9969317310150858, + "grad_norm": 29.236217351724566, + "learning_rate": 3.036477285014624e-06, + "loss": 2.7529, + "step": 23430 + }, + { + "epoch": 1.997016960709111, + "grad_norm": 71.27074435598163, + "learning_rate": 3.0360212809337974e-06, + "loss": 2.0838, + "step": 23431 + }, + { + "epoch": 1.9971021904031363, + "grad_norm": 30.05009754405209, + "learning_rate": 3.0355652961679223e-06, + "loss": 0.6633, + "step": 23432 + }, + { + "epoch": 1.9971874200971618, + "grad_norm": 35.91678255351004, + "learning_rate": 3.035109330721482e-06, + "loss": 2.3409, + "step": 23433 + }, + { + "epoch": 1.9972726497911872, + "grad_norm": 32.94571190597153, + "learning_rate": 3.034653384598958e-06, + "loss": 2.3621, + "step": 23434 + }, + { + "epoch": 1.9973578794852127, + "grad_norm": 31.705733054041527, + "learning_rate": 3.0341974578048385e-06, + "loss": 2.5312, + "step": 23435 + }, + { + "epoch": 1.9974431091792382, + "grad_norm": 34.09123801809371, + "learning_rate": 3.033741550343606e-06, + "loss": 2.5761, + "step": 23436 + }, + { + "epoch": 1.9975283388732634, + "grad_norm": 28.62856003611077, + "learning_rate": 3.0332856622197455e-06, + "loss": 1.6067, + "step": 23437 + }, + { + "epoch": 1.9976135685672889, + "grad_norm": 47.224350859830956, + "learning_rate": 3.032829793437736e-06, + "loss": 2.6918, + "step": 23438 + }, + { + "epoch": 1.9976987982613141, + "grad_norm": 36.01851428942779, + "learning_rate": 3.032373944002065e-06, + "loss": 2.5315, + "step": 23439 + }, + { + "epoch": 1.9977840279553396, + "grad_norm": 57.24345280051918, + "learning_rate": 3.031918113917216e-06, + "loss": 3.2977, + "step": 23440 + }, + { + "epoch": 1.997869257649365, + "grad_norm": 35.570605756479715, + "learning_rate": 3.03146230318767e-06, + "loss": 2.515, + "step": 23441 + }, + { + "epoch": 1.9979544873433905, + "grad_norm": 48.50448121013279, + "learning_rate": 3.0310065118179076e-06, + "loss": 2.7086, + "step": 23442 + }, + { + "epoch": 1.9980397170374158, + "grad_norm": 41.29285362491286, + "learning_rate": 3.0305507398124147e-06, + "loss": 2.6185, + "step": 23443 + }, + { + "epoch": 1.9981249467314413, + "grad_norm": 49.4704587230175, + "learning_rate": 3.0300949871756748e-06, + "loss": 3.0928, + "step": 23444 + }, + { + "epoch": 1.9982101764254665, + "grad_norm": 47.67424007753313, + "learning_rate": 3.0296392539121666e-06, + "loss": 2.2105, + "step": 23445 + }, + { + "epoch": 1.998295406119492, + "grad_norm": 40.68591130314455, + "learning_rate": 3.029183540026372e-06, + "loss": 2.5039, + "step": 23446 + }, + { + "epoch": 1.9983806358135174, + "grad_norm": 30.05315353816121, + "learning_rate": 3.028727845522775e-06, + "loss": 1.9066, + "step": 23447 + }, + { + "epoch": 1.998465865507543, + "grad_norm": 40.40539604845742, + "learning_rate": 3.028272170405857e-06, + "loss": 2.7977, + "step": 23448 + }, + { + "epoch": 1.9985510952015684, + "grad_norm": 73.6314931003967, + "learning_rate": 3.0278165146800997e-06, + "loss": 2.3763, + "step": 23449 + }, + { + "epoch": 1.9986363248955936, + "grad_norm": 71.87464088347816, + "learning_rate": 3.027360878349982e-06, + "loss": 2.8692, + "step": 23450 + }, + { + "epoch": 1.9987215545896189, + "grad_norm": 109.28453424683072, + "learning_rate": 3.0269052614199845e-06, + "loss": 1.1642, + "step": 23451 + }, + { + "epoch": 1.9988067842836443, + "grad_norm": 131.4555222152097, + "learning_rate": 3.0264496638945924e-06, + "loss": 2.7295, + "step": 23452 + }, + { + "epoch": 1.9988920139776698, + "grad_norm": 47.47455126959555, + "learning_rate": 3.025994085778283e-06, + "loss": 2.8249, + "step": 23453 + }, + { + "epoch": 1.9989772436716953, + "grad_norm": 74.52361996522183, + "learning_rate": 3.0255385270755373e-06, + "loss": 3.8856, + "step": 23454 + }, + { + "epoch": 1.9990624733657207, + "grad_norm": 59.81048426962377, + "learning_rate": 3.025082987790834e-06, + "loss": 3.5278, + "step": 23455 + }, + { + "epoch": 1.999147703059746, + "grad_norm": 64.22112319662133, + "learning_rate": 3.024627467928657e-06, + "loss": 2.7592, + "step": 23456 + }, + { + "epoch": 1.9992329327537715, + "grad_norm": 34.20205977765916, + "learning_rate": 3.0241719674934838e-06, + "loss": 3.0338, + "step": 23457 + }, + { + "epoch": 1.9993181624477967, + "grad_norm": 20.570844674519343, + "learning_rate": 3.023716486489793e-06, + "loss": 1.8429, + "step": 23458 + }, + { + "epoch": 1.9994033921418222, + "grad_norm": 33.833474013276046, + "learning_rate": 3.0232610249220644e-06, + "loss": 1.8505, + "step": 23459 + }, + { + "epoch": 1.9994886218358476, + "grad_norm": 34.735851764007016, + "learning_rate": 3.02280558279478e-06, + "loss": 2.3653, + "step": 23460 + }, + { + "epoch": 1.999573851529873, + "grad_norm": 51.803588908144135, + "learning_rate": 3.022350160112417e-06, + "loss": 2.3879, + "step": 23461 + }, + { + "epoch": 1.9996590812238986, + "grad_norm": 43.65522096529251, + "learning_rate": 3.0218947568794543e-06, + "loss": 2.8333, + "step": 23462 + }, + { + "epoch": 1.9997443109179238, + "grad_norm": 61.92829311507464, + "learning_rate": 3.021439373100371e-06, + "loss": 2.215, + "step": 23463 + }, + { + "epoch": 1.999829540611949, + "grad_norm": 44.134300600738996, + "learning_rate": 3.0209840087796426e-06, + "loss": 1.9242, + "step": 23464 + }, + { + "epoch": 1.9999147703059745, + "grad_norm": 51.380573248549595, + "learning_rate": 3.0205286639217527e-06, + "loss": 2.4155, + "step": 23465 + }, + { + "epoch": 2.0, + "grad_norm": 34.374635764183836, + "learning_rate": 3.020073338531176e-06, + "loss": 2.4877, + "step": 23466 + }, + { + "epoch": 2.0000852296940255, + "grad_norm": 48.10666624068603, + "learning_rate": 3.0196180326123917e-06, + "loss": 1.4831, + "step": 23467 + }, + { + "epoch": 2.000170459388051, + "grad_norm": 62.240147560126665, + "learning_rate": 3.019162746169876e-06, + "loss": 1.5712, + "step": 23468 + }, + { + "epoch": 2.0002556890820764, + "grad_norm": 35.43212688193314, + "learning_rate": 3.018707479208109e-06, + "loss": 1.8391, + "step": 23469 + }, + { + "epoch": 2.0003409187761014, + "grad_norm": 67.42008409951163, + "learning_rate": 3.0182522317315654e-06, + "loss": 2.5454, + "step": 23470 + }, + { + "epoch": 2.000426148470127, + "grad_norm": 60.674723231447125, + "learning_rate": 3.0177970037447253e-06, + "loss": 1.3737, + "step": 23471 + }, + { + "epoch": 2.0005113781641524, + "grad_norm": 32.348519461583074, + "learning_rate": 3.0173417952520613e-06, + "loss": 1.2512, + "step": 23472 + }, + { + "epoch": 2.000596607858178, + "grad_norm": 49.24515232388472, + "learning_rate": 3.0168866062580556e-06, + "loss": 2.2122, + "step": 23473 + }, + { + "epoch": 2.0006818375522033, + "grad_norm": 20.932002181054603, + "learning_rate": 3.0164314367671803e-06, + "loss": 1.4022, + "step": 23474 + }, + { + "epoch": 2.0007670672462288, + "grad_norm": 27.365908467832043, + "learning_rate": 3.0159762867839156e-06, + "loss": 1.9334, + "step": 23475 + }, + { + "epoch": 2.000852296940254, + "grad_norm": 28.785559410997195, + "learning_rate": 3.015521156312735e-06, + "loss": 1.2748, + "step": 23476 + }, + { + "epoch": 2.0009375266342793, + "grad_norm": 30.475953439750622, + "learning_rate": 3.015066045358114e-06, + "loss": 1.3723, + "step": 23477 + }, + { + "epoch": 2.0010227563283047, + "grad_norm": 45.73509823092063, + "learning_rate": 3.0146109539245294e-06, + "loss": 1.2515, + "step": 23478 + }, + { + "epoch": 2.00110798602233, + "grad_norm": 19.297964358244343, + "learning_rate": 3.0141558820164597e-06, + "loss": 1.0837, + "step": 23479 + }, + { + "epoch": 2.0011932157163557, + "grad_norm": 34.38921517638613, + "learning_rate": 3.013700829638377e-06, + "loss": 1.248, + "step": 23480 + }, + { + "epoch": 2.001278445410381, + "grad_norm": 48.44124046620597, + "learning_rate": 3.013245796794755e-06, + "loss": 2.5585, + "step": 23481 + }, + { + "epoch": 2.001363675104406, + "grad_norm": 44.63868049189865, + "learning_rate": 3.012790783490073e-06, + "loss": 1.3894, + "step": 23482 + }, + { + "epoch": 2.0014489047984316, + "grad_norm": 37.217981143330874, + "learning_rate": 3.012335789728804e-06, + "loss": 2.1649, + "step": 23483 + }, + { + "epoch": 2.001534134492457, + "grad_norm": 26.66437006663849, + "learning_rate": 3.0118808155154232e-06, + "loss": 1.4972, + "step": 23484 + }, + { + "epoch": 2.0016193641864826, + "grad_norm": 42.041317733898026, + "learning_rate": 3.0114258608544033e-06, + "loss": 1.7931, + "step": 23485 + }, + { + "epoch": 2.001704593880508, + "grad_norm": 45.42076659538391, + "learning_rate": 3.01097092575022e-06, + "loss": 2.2179, + "step": 23486 + }, + { + "epoch": 2.0017898235745335, + "grad_norm": 27.002158228483168, + "learning_rate": 3.0105160102073482e-06, + "loss": 1.5597, + "step": 23487 + }, + { + "epoch": 2.001875053268559, + "grad_norm": 27.644809693205858, + "learning_rate": 3.0100611142302616e-06, + "loss": 1.5697, + "step": 23488 + }, + { + "epoch": 2.001960282962584, + "grad_norm": 31.661463364502218, + "learning_rate": 3.009606237823431e-06, + "loss": 1.4842, + "step": 23489 + }, + { + "epoch": 2.0020455126566095, + "grad_norm": 42.93185697765033, + "learning_rate": 3.0091513809913332e-06, + "loss": 1.8013, + "step": 23490 + }, + { + "epoch": 2.002130742350635, + "grad_norm": 22.994152276121255, + "learning_rate": 3.0086965437384407e-06, + "loss": 0.9879, + "step": 23491 + }, + { + "epoch": 2.0022159720446604, + "grad_norm": 75.1032936488002, + "learning_rate": 3.008241726069227e-06, + "loss": 1.5353, + "step": 23492 + }, + { + "epoch": 2.002301201738686, + "grad_norm": 58.03811898353853, + "learning_rate": 3.0077869279881634e-06, + "loss": 1.9893, + "step": 23493 + }, + { + "epoch": 2.0023864314327113, + "grad_norm": 34.804577981393386, + "learning_rate": 3.0073321494997232e-06, + "loss": 1.4245, + "step": 23494 + }, + { + "epoch": 2.0024716611267364, + "grad_norm": 42.96883636131095, + "learning_rate": 3.006877390608381e-06, + "loss": 1.8145, + "step": 23495 + }, + { + "epoch": 2.002556890820762, + "grad_norm": 86.265595965491, + "learning_rate": 3.006422651318608e-06, + "loss": 2.3657, + "step": 23496 + }, + { + "epoch": 2.0026421205147873, + "grad_norm": 77.86488169391825, + "learning_rate": 3.0059679316348744e-06, + "loss": 2.3254, + "step": 23497 + }, + { + "epoch": 2.0027273502088128, + "grad_norm": 52.55317541802179, + "learning_rate": 3.0055132315616532e-06, + "loss": 1.7151, + "step": 23498 + }, + { + "epoch": 2.0028125799028382, + "grad_norm": 36.79205989066193, + "learning_rate": 3.0050585511034193e-06, + "loss": 1.2786, + "step": 23499 + }, + { + "epoch": 2.0028978095968637, + "grad_norm": 56.67935774377856, + "learning_rate": 3.0046038902646414e-06, + "loss": 2.4363, + "step": 23500 + }, + { + "epoch": 2.002983039290889, + "grad_norm": 28.925441241862274, + "learning_rate": 3.0041492490497903e-06, + "loss": 1.1003, + "step": 23501 + }, + { + "epoch": 2.003068268984914, + "grad_norm": 76.96005547716447, + "learning_rate": 3.003694627463337e-06, + "loss": 2.1525, + "step": 23502 + }, + { + "epoch": 2.0031534986789397, + "grad_norm": 46.36438970328281, + "learning_rate": 3.003240025509756e-06, + "loss": 1.0958, + "step": 23503 + }, + { + "epoch": 2.003238728372965, + "grad_norm": 43.59537599959479, + "learning_rate": 3.0027854431935166e-06, + "loss": 1.4268, + "step": 23504 + }, + { + "epoch": 2.0033239580669906, + "grad_norm": 30.7928555745383, + "learning_rate": 3.0023308805190866e-06, + "loss": 1.5273, + "step": 23505 + }, + { + "epoch": 2.003409187761016, + "grad_norm": 37.138401633007724, + "learning_rate": 3.0018763374909403e-06, + "loss": 1.4222, + "step": 23506 + }, + { + "epoch": 2.0034944174550415, + "grad_norm": 67.13200068418108, + "learning_rate": 3.0014218141135442e-06, + "loss": 2.2883, + "step": 23507 + }, + { + "epoch": 2.0035796471490666, + "grad_norm": 37.28840644769715, + "learning_rate": 3.0009673103913728e-06, + "loss": 1.2573, + "step": 23508 + }, + { + "epoch": 2.003664876843092, + "grad_norm": 26.164504496616225, + "learning_rate": 3.000512826328892e-06, + "loss": 1.2842, + "step": 23509 + }, + { + "epoch": 2.0037501065371175, + "grad_norm": 27.311751207849426, + "learning_rate": 3.000058361930574e-06, + "loss": 1.4173, + "step": 23510 + }, + { + "epoch": 2.003835336231143, + "grad_norm": 54.73282590628931, + "learning_rate": 2.999603917200885e-06, + "loss": 1.6837, + "step": 23511 + }, + { + "epoch": 2.0039205659251684, + "grad_norm": 105.41170507749078, + "learning_rate": 2.9991494921442994e-06, + "loss": 3.0365, + "step": 23512 + }, + { + "epoch": 2.004005795619194, + "grad_norm": 51.92867071947112, + "learning_rate": 2.998695086765282e-06, + "loss": 1.9464, + "step": 23513 + }, + { + "epoch": 2.004091025313219, + "grad_norm": 26.598925787228946, + "learning_rate": 2.9982407010683044e-06, + "loss": 1.3429, + "step": 23514 + }, + { + "epoch": 2.0041762550072444, + "grad_norm": 67.8520274723271, + "learning_rate": 2.9977863350578316e-06, + "loss": 1.9501, + "step": 23515 + }, + { + "epoch": 2.00426148470127, + "grad_norm": 36.842752414837086, + "learning_rate": 2.997331988738337e-06, + "loss": 1.308, + "step": 23516 + }, + { + "epoch": 2.0043467143952953, + "grad_norm": 39.65356609994722, + "learning_rate": 2.996877662114286e-06, + "loss": 1.4903, + "step": 23517 + }, + { + "epoch": 2.004431944089321, + "grad_norm": 21.299834412837317, + "learning_rate": 2.996423355190148e-06, + "loss": 1.0197, + "step": 23518 + }, + { + "epoch": 2.0045171737833463, + "grad_norm": 31.978425522482432, + "learning_rate": 2.995969067970388e-06, + "loss": 1.367, + "step": 23519 + }, + { + "epoch": 2.0046024034773717, + "grad_norm": 15.621984809709808, + "learning_rate": 2.995514800459477e-06, + "loss": 0.7432, + "step": 23520 + }, + { + "epoch": 2.0046876331713968, + "grad_norm": 64.25328504874692, + "learning_rate": 2.9950605526618813e-06, + "loss": 1.5029, + "step": 23521 + }, + { + "epoch": 2.0047728628654222, + "grad_norm": 61.879220069030005, + "learning_rate": 2.9946063245820693e-06, + "loss": 1.9418, + "step": 23522 + }, + { + "epoch": 2.0048580925594477, + "grad_norm": 58.099079197736664, + "learning_rate": 2.9941521162245074e-06, + "loss": 1.9027, + "step": 23523 + }, + { + "epoch": 2.004943322253473, + "grad_norm": 64.4791095129298, + "learning_rate": 2.9936979275936604e-06, + "loss": 2.703, + "step": 23524 + }, + { + "epoch": 2.0050285519474986, + "grad_norm": 37.71946495577925, + "learning_rate": 2.993243758693998e-06, + "loss": 1.698, + "step": 23525 + }, + { + "epoch": 2.005113781641524, + "grad_norm": 44.17627407577838, + "learning_rate": 2.9927896095299862e-06, + "loss": 1.3857, + "step": 23526 + }, + { + "epoch": 2.005199011335549, + "grad_norm": 34.92491977576998, + "learning_rate": 2.992335480106092e-06, + "loss": 1.7123, + "step": 23527 + }, + { + "epoch": 2.0052842410295746, + "grad_norm": 44.698775102627536, + "learning_rate": 2.991881370426778e-06, + "loss": 1.6749, + "step": 23528 + }, + { + "epoch": 2.0053694707236, + "grad_norm": 39.047892744562915, + "learning_rate": 2.991427280496514e-06, + "loss": 0.9274, + "step": 23529 + }, + { + "epoch": 2.0054547004176255, + "grad_norm": 56.27747226209061, + "learning_rate": 2.9909732103197654e-06, + "loss": 2.119, + "step": 23530 + }, + { + "epoch": 2.005539930111651, + "grad_norm": 60.086909188243105, + "learning_rate": 2.9905191599009963e-06, + "loss": 1.9059, + "step": 23531 + }, + { + "epoch": 2.0056251598056765, + "grad_norm": 50.22771997028579, + "learning_rate": 2.9900651292446714e-06, + "loss": 1.4178, + "step": 23532 + }, + { + "epoch": 2.0057103894997015, + "grad_norm": 22.562215389226125, + "learning_rate": 2.9896111183552588e-06, + "loss": 0.8669, + "step": 23533 + }, + { + "epoch": 2.005795619193727, + "grad_norm": 37.22874411859958, + "learning_rate": 2.9891571272372223e-06, + "loss": 1.3329, + "step": 23534 + }, + { + "epoch": 2.0058808488877524, + "grad_norm": 42.97497934472132, + "learning_rate": 2.988703155895027e-06, + "loss": 1.9876, + "step": 23535 + }, + { + "epoch": 2.005966078581778, + "grad_norm": 68.6178093892533, + "learning_rate": 2.9882492043331357e-06, + "loss": 2.0449, + "step": 23536 + }, + { + "epoch": 2.0060513082758034, + "grad_norm": 46.716254577064056, + "learning_rate": 2.9877952725560126e-06, + "loss": 1.5931, + "step": 23537 + }, + { + "epoch": 2.006136537969829, + "grad_norm": 26.703599569565217, + "learning_rate": 2.987341360568126e-06, + "loss": 1.1016, + "step": 23538 + }, + { + "epoch": 2.0062217676638543, + "grad_norm": 28.43892059548764, + "learning_rate": 2.9868874683739375e-06, + "loss": 1.3813, + "step": 23539 + }, + { + "epoch": 2.0063069973578793, + "grad_norm": 23.164519303139876, + "learning_rate": 2.9864335959779088e-06, + "loss": 0.9702, + "step": 23540 + }, + { + "epoch": 2.006392227051905, + "grad_norm": 47.51162609200966, + "learning_rate": 2.985979743384506e-06, + "loss": 1.4623, + "step": 23541 + }, + { + "epoch": 2.0064774567459303, + "grad_norm": 31.899496836901093, + "learning_rate": 2.985525910598193e-06, + "loss": 1.4008, + "step": 23542 + }, + { + "epoch": 2.0065626864399557, + "grad_norm": 53.21326311503193, + "learning_rate": 2.9850720976234327e-06, + "loss": 1.9455, + "step": 23543 + }, + { + "epoch": 2.006647916133981, + "grad_norm": 26.04361275314875, + "learning_rate": 2.984618304464687e-06, + "loss": 1.2524, + "step": 23544 + }, + { + "epoch": 2.0067331458280067, + "grad_norm": 57.54619447647641, + "learning_rate": 2.984164531126418e-06, + "loss": 1.6792, + "step": 23545 + }, + { + "epoch": 2.0068183755220317, + "grad_norm": 32.02403212511829, + "learning_rate": 2.9837107776130913e-06, + "loss": 1.7159, + "step": 23546 + }, + { + "epoch": 2.006903605216057, + "grad_norm": 77.88677322060818, + "learning_rate": 2.9832570439291685e-06, + "loss": 2.2237, + "step": 23547 + }, + { + "epoch": 2.0069888349100826, + "grad_norm": 43.47189662597899, + "learning_rate": 2.9828033300791104e-06, + "loss": 1.2057, + "step": 23548 + }, + { + "epoch": 2.007074064604108, + "grad_norm": 43.083198975493026, + "learning_rate": 2.982349636067381e-06, + "loss": 1.4214, + "step": 23549 + }, + { + "epoch": 2.0071592942981336, + "grad_norm": 25.86973916178384, + "learning_rate": 2.981895961898439e-06, + "loss": 1.4037, + "step": 23550 + }, + { + "epoch": 2.007244523992159, + "grad_norm": 68.74778260839051, + "learning_rate": 2.98144230757675e-06, + "loss": 2.2914, + "step": 23551 + }, + { + "epoch": 2.007329753686184, + "grad_norm": 40.419940380273665, + "learning_rate": 2.9809886731067734e-06, + "loss": 1.9906, + "step": 23552 + }, + { + "epoch": 2.0074149833802095, + "grad_norm": 44.71686338389973, + "learning_rate": 2.9805350584929715e-06, + "loss": 1.7719, + "step": 23553 + }, + { + "epoch": 2.007500213074235, + "grad_norm": 28.155527939726575, + "learning_rate": 2.9800814637398035e-06, + "loss": 1.1862, + "step": 23554 + }, + { + "epoch": 2.0075854427682605, + "grad_norm": 18.32977309192258, + "learning_rate": 2.979627888851733e-06, + "loss": 0.7582, + "step": 23555 + }, + { + "epoch": 2.007670672462286, + "grad_norm": 24.976796904673698, + "learning_rate": 2.9791743338332184e-06, + "loss": 1.0852, + "step": 23556 + }, + { + "epoch": 2.0077559021563114, + "grad_norm": 38.05123264068221, + "learning_rate": 2.9787207986887223e-06, + "loss": 0.9017, + "step": 23557 + }, + { + "epoch": 2.007841131850337, + "grad_norm": 26.95557994719523, + "learning_rate": 2.9782672834227024e-06, + "loss": 0.9862, + "step": 23558 + }, + { + "epoch": 2.007926361544362, + "grad_norm": 61.61993569760813, + "learning_rate": 2.977813788039622e-06, + "loss": 1.8285, + "step": 23559 + }, + { + "epoch": 2.0080115912383873, + "grad_norm": 55.44518405407654, + "learning_rate": 2.977360312543939e-06, + "loss": 1.6083, + "step": 23560 + }, + { + "epoch": 2.008096820932413, + "grad_norm": 51.255268173779314, + "learning_rate": 2.976906856940114e-06, + "loss": 1.7194, + "step": 23561 + }, + { + "epoch": 2.0081820506264383, + "grad_norm": 51.44076879281912, + "learning_rate": 2.9764534212326034e-06, + "loss": 1.9768, + "step": 23562 + }, + { + "epoch": 2.0082672803204638, + "grad_norm": 61.815863935419486, + "learning_rate": 2.9760000054258724e-06, + "loss": 2.4093, + "step": 23563 + }, + { + "epoch": 2.0083525100144892, + "grad_norm": 45.506212022745935, + "learning_rate": 2.975546609524376e-06, + "loss": 2.1231, + "step": 23564 + }, + { + "epoch": 2.0084377397085142, + "grad_norm": 37.534818673977874, + "learning_rate": 2.9750932335325754e-06, + "loss": 1.6375, + "step": 23565 + }, + { + "epoch": 2.0085229694025397, + "grad_norm": 19.08532312207353, + "learning_rate": 2.974639877454928e-06, + "loss": 0.6468, + "step": 23566 + }, + { + "epoch": 2.008608199096565, + "grad_norm": 52.522681706889756, + "learning_rate": 2.9741865412958902e-06, + "loss": 2.1231, + "step": 23567 + }, + { + "epoch": 2.0086934287905907, + "grad_norm": 45.76653297657269, + "learning_rate": 2.9737332250599248e-06, + "loss": 1.3667, + "step": 23568 + }, + { + "epoch": 2.008778658484616, + "grad_norm": 58.99351377050027, + "learning_rate": 2.973279928751488e-06, + "loss": 1.3439, + "step": 23569 + }, + { + "epoch": 2.0088638881786416, + "grad_norm": 47.310760509014884, + "learning_rate": 2.972826652375038e-06, + "loss": 1.6915, + "step": 23570 + }, + { + "epoch": 2.008949117872667, + "grad_norm": 43.61050442173306, + "learning_rate": 2.972373395935031e-06, + "loss": 1.2816, + "step": 23571 + }, + { + "epoch": 2.009034347566692, + "grad_norm": 29.676860754150454, + "learning_rate": 2.9719201594359258e-06, + "loss": 1.501, + "step": 23572 + }, + { + "epoch": 2.0091195772607175, + "grad_norm": 137.35483371997753, + "learning_rate": 2.971466942882182e-06, + "loss": 1.7024, + "step": 23573 + }, + { + "epoch": 2.009204806954743, + "grad_norm": 69.03410861584463, + "learning_rate": 2.971013746278254e-06, + "loss": 1.8314, + "step": 23574 + }, + { + "epoch": 2.0092900366487685, + "grad_norm": 37.60084214669053, + "learning_rate": 2.9705605696285984e-06, + "loss": 1.6983, + "step": 23575 + }, + { + "epoch": 2.009375266342794, + "grad_norm": 64.11306095974189, + "learning_rate": 2.970107412937674e-06, + "loss": 1.1161, + "step": 23576 + }, + { + "epoch": 2.0094604960368194, + "grad_norm": 24.910667655853622, + "learning_rate": 2.9696542762099366e-06, + "loss": 1.1239, + "step": 23577 + }, + { + "epoch": 2.0095457257308444, + "grad_norm": 26.647806593367385, + "learning_rate": 2.969201159449844e-06, + "loss": 1.4412, + "step": 23578 + }, + { + "epoch": 2.00963095542487, + "grad_norm": 47.010842334761705, + "learning_rate": 2.9687480626618492e-06, + "loss": 1.5057, + "step": 23579 + }, + { + "epoch": 2.0097161851188954, + "grad_norm": 37.07211017949078, + "learning_rate": 2.9682949858504083e-06, + "loss": 1.4516, + "step": 23580 + }, + { + "epoch": 2.009801414812921, + "grad_norm": 42.586095355974116, + "learning_rate": 2.967841929019982e-06, + "loss": 1.2216, + "step": 23581 + }, + { + "epoch": 2.0098866445069463, + "grad_norm": 63.61292077896896, + "learning_rate": 2.9673888921750225e-06, + "loss": 2.1761, + "step": 23582 + }, + { + "epoch": 2.009971874200972, + "grad_norm": 58.185399996416436, + "learning_rate": 2.966935875319985e-06, + "loss": 2.0491, + "step": 23583 + }, + { + "epoch": 2.010057103894997, + "grad_norm": 59.61330443085564, + "learning_rate": 2.966482878459324e-06, + "loss": 2.2196, + "step": 23584 + }, + { + "epoch": 2.0101423335890223, + "grad_norm": 38.529661645971835, + "learning_rate": 2.966029901597498e-06, + "loss": 1.4707, + "step": 23585 + }, + { + "epoch": 2.0102275632830477, + "grad_norm": 51.389184772964704, + "learning_rate": 2.9655769447389594e-06, + "loss": 2.754, + "step": 23586 + }, + { + "epoch": 2.010312792977073, + "grad_norm": 65.79887280435669, + "learning_rate": 2.9651240078881625e-06, + "loss": 1.5417, + "step": 23587 + }, + { + "epoch": 2.0103980226710987, + "grad_norm": 130.56375819093066, + "learning_rate": 2.964671091049561e-06, + "loss": 1.0176, + "step": 23588 + }, + { + "epoch": 2.010483252365124, + "grad_norm": 45.59440074918811, + "learning_rate": 2.9642181942276126e-06, + "loss": 1.684, + "step": 23589 + }, + { + "epoch": 2.0105684820591496, + "grad_norm": 37.66270490971538, + "learning_rate": 2.96376531742677e-06, + "loss": 1.6914, + "step": 23590 + }, + { + "epoch": 2.0106537117531746, + "grad_norm": 47.49864346162497, + "learning_rate": 2.9633124606514848e-06, + "loss": 1.7674, + "step": 23591 + }, + { + "epoch": 2.0107389414472, + "grad_norm": 39.19371379672114, + "learning_rate": 2.962859623906211e-06, + "loss": 1.7014, + "step": 23592 + }, + { + "epoch": 2.0108241711412256, + "grad_norm": 55.59013420194276, + "learning_rate": 2.962406807195406e-06, + "loss": 1.8584, + "step": 23593 + }, + { + "epoch": 2.010909400835251, + "grad_norm": 51.572082233964224, + "learning_rate": 2.9619540105235193e-06, + "loss": 1.5807, + "step": 23594 + }, + { + "epoch": 2.0109946305292765, + "grad_norm": 53.4897529513067, + "learning_rate": 2.961501233895005e-06, + "loss": 1.4044, + "step": 23595 + }, + { + "epoch": 2.011079860223302, + "grad_norm": 61.93521056675354, + "learning_rate": 2.9610484773143167e-06, + "loss": 2.1628, + "step": 23596 + }, + { + "epoch": 2.011165089917327, + "grad_norm": 32.40775707838338, + "learning_rate": 2.960595740785905e-06, + "loss": 1.0518, + "step": 23597 + }, + { + "epoch": 2.0112503196113525, + "grad_norm": 51.682707959637334, + "learning_rate": 2.9601430243142247e-06, + "loss": 1.7148, + "step": 23598 + }, + { + "epoch": 2.011335549305378, + "grad_norm": 40.43333110118523, + "learning_rate": 2.959690327903727e-06, + "loss": 1.7904, + "step": 23599 + }, + { + "epoch": 2.0114207789994034, + "grad_norm": 18.920905449101998, + "learning_rate": 2.959237651558865e-06, + "loss": 0.4026, + "step": 23600 + }, + { + "epoch": 2.011506008693429, + "grad_norm": 48.63502095948484, + "learning_rate": 2.958784995284088e-06, + "loss": 1.2149, + "step": 23601 + }, + { + "epoch": 2.0115912383874544, + "grad_norm": 22.449276232562838, + "learning_rate": 2.9583323590838515e-06, + "loss": 1.1618, + "step": 23602 + }, + { + "epoch": 2.0116764680814794, + "grad_norm": 22.803341296492377, + "learning_rate": 2.957879742962604e-06, + "loss": 1.6617, + "step": 23603 + }, + { + "epoch": 2.011761697775505, + "grad_norm": 64.51223012899297, + "learning_rate": 2.9574271469247985e-06, + "loss": 2.3699, + "step": 23604 + }, + { + "epoch": 2.0118469274695303, + "grad_norm": 34.05009976553714, + "learning_rate": 2.9569745709748834e-06, + "loss": 1.2221, + "step": 23605 + }, + { + "epoch": 2.011932157163556, + "grad_norm": 56.8569454962587, + "learning_rate": 2.9565220151173135e-06, + "loss": 1.5268, + "step": 23606 + }, + { + "epoch": 2.0120173868575812, + "grad_norm": 65.18200148959293, + "learning_rate": 2.9560694793565364e-06, + "loss": 2.3747, + "step": 23607 + }, + { + "epoch": 2.0121026165516067, + "grad_norm": 23.5602032281061, + "learning_rate": 2.955616963697005e-06, + "loss": 1.1293, + "step": 23608 + }, + { + "epoch": 2.012187846245632, + "grad_norm": 36.29262129199026, + "learning_rate": 2.9551644681431684e-06, + "loss": 0.8106, + "step": 23609 + }, + { + "epoch": 2.012273075939657, + "grad_norm": 41.45146447547868, + "learning_rate": 2.954711992699475e-06, + "loss": 1.1019, + "step": 23610 + }, + { + "epoch": 2.0123583056336827, + "grad_norm": 87.95434453245855, + "learning_rate": 2.954259537370377e-06, + "loss": 1.8777, + "step": 23611 + }, + { + "epoch": 2.012443535327708, + "grad_norm": 28.292610077330618, + "learning_rate": 2.953807102160325e-06, + "loss": 1.1413, + "step": 23612 + }, + { + "epoch": 2.0125287650217336, + "grad_norm": 49.172125627728114, + "learning_rate": 2.953354687073767e-06, + "loss": 1.8367, + "step": 23613 + }, + { + "epoch": 2.012613994715759, + "grad_norm": 55.715192305499045, + "learning_rate": 2.9529022921151495e-06, + "loss": 2.0068, + "step": 23614 + }, + { + "epoch": 2.0126992244097845, + "grad_norm": 58.98026845138425, + "learning_rate": 2.9524499172889263e-06, + "loss": 2.129, + "step": 23615 + }, + { + "epoch": 2.0127844541038096, + "grad_norm": 30.305180653078846, + "learning_rate": 2.9519975625995457e-06, + "loss": 1.1851, + "step": 23616 + }, + { + "epoch": 2.012869683797835, + "grad_norm": 44.25430210074729, + "learning_rate": 2.951545228051455e-06, + "loss": 1.6532, + "step": 23617 + }, + { + "epoch": 2.0129549134918605, + "grad_norm": 24.956879038751772, + "learning_rate": 2.9510929136491013e-06, + "loss": 0.9222, + "step": 23618 + }, + { + "epoch": 2.013040143185886, + "grad_norm": 28.805201148523384, + "learning_rate": 2.950640619396935e-06, + "loss": 0.8517, + "step": 23619 + }, + { + "epoch": 2.0131253728799114, + "grad_norm": 20.820546027778313, + "learning_rate": 2.9501883452994055e-06, + "loss": 0.9497, + "step": 23620 + }, + { + "epoch": 2.013210602573937, + "grad_norm": 34.86349641332882, + "learning_rate": 2.949736091360959e-06, + "loss": 1.4077, + "step": 23621 + }, + { + "epoch": 2.013295832267962, + "grad_norm": 52.23766263837412, + "learning_rate": 2.9492838575860423e-06, + "loss": 1.6663, + "step": 23622 + }, + { + "epoch": 2.0133810619619874, + "grad_norm": 52.53389411670649, + "learning_rate": 2.9488316439791033e-06, + "loss": 1.4358, + "step": 23623 + }, + { + "epoch": 2.013466291656013, + "grad_norm": 27.78482260783966, + "learning_rate": 2.948379450544591e-06, + "loss": 0.8951, + "step": 23624 + }, + { + "epoch": 2.0135515213500383, + "grad_norm": 38.18650317355531, + "learning_rate": 2.9479272772869527e-06, + "loss": 1.4549, + "step": 23625 + }, + { + "epoch": 2.013636751044064, + "grad_norm": 57.60172190085174, + "learning_rate": 2.9474751242106327e-06, + "loss": 1.9313, + "step": 23626 + }, + { + "epoch": 2.0137219807380893, + "grad_norm": 64.26917393025298, + "learning_rate": 2.9470229913200776e-06, + "loss": 1.6156, + "step": 23627 + }, + { + "epoch": 2.0138072104321147, + "grad_norm": 41.950300516717036, + "learning_rate": 2.9465708786197388e-06, + "loss": 1.6361, + "step": 23628 + }, + { + "epoch": 2.0138924401261398, + "grad_norm": 55.857120950498896, + "learning_rate": 2.9461187861140587e-06, + "loss": 1.6055, + "step": 23629 + }, + { + "epoch": 2.0139776698201652, + "grad_norm": 29.619254383723906, + "learning_rate": 2.9456667138074836e-06, + "loss": 1.3585, + "step": 23630 + }, + { + "epoch": 2.0140628995141907, + "grad_norm": 26.93450227604434, + "learning_rate": 2.9452146617044586e-06, + "loss": 1.0083, + "step": 23631 + }, + { + "epoch": 2.014148129208216, + "grad_norm": 34.867054594751664, + "learning_rate": 2.9447626298094336e-06, + "loss": 1.1938, + "step": 23632 + }, + { + "epoch": 2.0142333589022416, + "grad_norm": 61.19902576495617, + "learning_rate": 2.944310618126851e-06, + "loss": 1.1061, + "step": 23633 + }, + { + "epoch": 2.014318588596267, + "grad_norm": 81.29007166372413, + "learning_rate": 2.9438586266611557e-06, + "loss": 1.3101, + "step": 23634 + }, + { + "epoch": 2.014403818290292, + "grad_norm": 39.435830997570534, + "learning_rate": 2.943406655416793e-06, + "loss": 1.1238, + "step": 23635 + }, + { + "epoch": 2.0144890479843176, + "grad_norm": 37.81894951665707, + "learning_rate": 2.9429547043982106e-06, + "loss": 1.0136, + "step": 23636 + }, + { + "epoch": 2.014574277678343, + "grad_norm": 79.41016058873385, + "learning_rate": 2.942502773609851e-06, + "loss": 2.4091, + "step": 23637 + }, + { + "epoch": 2.0146595073723685, + "grad_norm": 53.021940093078406, + "learning_rate": 2.942050863056158e-06, + "loss": 2.3425, + "step": 23638 + }, + { + "epoch": 2.014744737066394, + "grad_norm": 44.26972806397628, + "learning_rate": 2.941598972741578e-06, + "loss": 1.2581, + "step": 23639 + }, + { + "epoch": 2.0148299667604195, + "grad_norm": 19.604932351407577, + "learning_rate": 2.941147102670553e-06, + "loss": 0.7981, + "step": 23640 + }, + { + "epoch": 2.014915196454445, + "grad_norm": 19.28082521926261, + "learning_rate": 2.940695252847529e-06, + "loss": 0.717, + "step": 23641 + }, + { + "epoch": 2.01500042614847, + "grad_norm": 62.36841597566629, + "learning_rate": 2.9402434232769483e-06, + "loss": 2.2507, + "step": 23642 + }, + { + "epoch": 2.0150856558424954, + "grad_norm": 55.513513432891486, + "learning_rate": 2.9397916139632566e-06, + "loss": 2.294, + "step": 23643 + }, + { + "epoch": 2.015170885536521, + "grad_norm": 46.24913058655575, + "learning_rate": 2.939339824910893e-06, + "loss": 1.4896, + "step": 23644 + }, + { + "epoch": 2.0152561152305464, + "grad_norm": 53.82233534545096, + "learning_rate": 2.938888056124306e-06, + "loss": 1.3651, + "step": 23645 + }, + { + "epoch": 2.015341344924572, + "grad_norm": 68.1524358660391, + "learning_rate": 2.9384363076079344e-06, + "loss": 1.8603, + "step": 23646 + }, + { + "epoch": 2.0154265746185973, + "grad_norm": 28.632288341795473, + "learning_rate": 2.9379845793662237e-06, + "loss": 1.4467, + "step": 23647 + }, + { + "epoch": 2.0155118043126223, + "grad_norm": 40.13459646372974, + "learning_rate": 2.9375328714036135e-06, + "loss": 1.8689, + "step": 23648 + }, + { + "epoch": 2.015597034006648, + "grad_norm": 52.0442948780303, + "learning_rate": 2.9370811837245493e-06, + "loss": 0.9582, + "step": 23649 + }, + { + "epoch": 2.0156822637006733, + "grad_norm": 59.21519629173016, + "learning_rate": 2.9366295163334717e-06, + "loss": 2.2427, + "step": 23650 + }, + { + "epoch": 2.0157674933946987, + "grad_norm": 52.654250678804836, + "learning_rate": 2.9361778692348242e-06, + "loss": 1.2848, + "step": 23651 + }, + { + "epoch": 2.015852723088724, + "grad_norm": 51.26324432902642, + "learning_rate": 2.9357262424330467e-06, + "loss": 1.6459, + "step": 23652 + }, + { + "epoch": 2.0159379527827497, + "grad_norm": 19.292468331896476, + "learning_rate": 2.935274635932579e-06, + "loss": 0.9913, + "step": 23653 + }, + { + "epoch": 2.0160231824767747, + "grad_norm": 72.06736314962288, + "learning_rate": 2.934823049737866e-06, + "loss": 1.8127, + "step": 23654 + }, + { + "epoch": 2.0161084121708, + "grad_norm": 75.40745812387584, + "learning_rate": 2.9343714838533495e-06, + "loss": 2.021, + "step": 23655 + }, + { + "epoch": 2.0161936418648256, + "grad_norm": 51.22030589550603, + "learning_rate": 2.933919938283468e-06, + "loss": 1.5646, + "step": 23656 + }, + { + "epoch": 2.016278871558851, + "grad_norm": 80.02194413706839, + "learning_rate": 2.9334684130326608e-06, + "loss": 1.5435, + "step": 23657 + }, + { + "epoch": 2.0163641012528766, + "grad_norm": 33.25523699685935, + "learning_rate": 2.933016908105372e-06, + "loss": 1.0515, + "step": 23658 + }, + { + "epoch": 2.016449330946902, + "grad_norm": 32.19002643517725, + "learning_rate": 2.932565423506041e-06, + "loss": 1.083, + "step": 23659 + }, + { + "epoch": 2.0165345606409275, + "grad_norm": 59.97319583800416, + "learning_rate": 2.932113959239108e-06, + "loss": 1.8784, + "step": 23660 + }, + { + "epoch": 2.0166197903349525, + "grad_norm": 28.686097659251363, + "learning_rate": 2.9316625153090106e-06, + "loss": 1.2561, + "step": 23661 + }, + { + "epoch": 2.016705020028978, + "grad_norm": 53.285236529572586, + "learning_rate": 2.9312110917201907e-06, + "loss": 1.5025, + "step": 23662 + }, + { + "epoch": 2.0167902497230035, + "grad_norm": 76.0837865987155, + "learning_rate": 2.9307596884770895e-06, + "loss": 1.7213, + "step": 23663 + }, + { + "epoch": 2.016875479417029, + "grad_norm": 47.73754984492966, + "learning_rate": 2.930308305584144e-06, + "loss": 1.6068, + "step": 23664 + }, + { + "epoch": 2.0169607091110544, + "grad_norm": 45.37108014950114, + "learning_rate": 2.929856943045792e-06, + "loss": 1.803, + "step": 23665 + }, + { + "epoch": 2.01704593880508, + "grad_norm": 33.45579681928772, + "learning_rate": 2.929405600866476e-06, + "loss": 1.2671, + "step": 23666 + }, + { + "epoch": 2.017131168499105, + "grad_norm": 35.29451700077348, + "learning_rate": 2.928954279050634e-06, + "loss": 1.1606, + "step": 23667 + }, + { + "epoch": 2.0172163981931304, + "grad_norm": 58.916573687371994, + "learning_rate": 2.928502977602703e-06, + "loss": 1.8435, + "step": 23668 + }, + { + "epoch": 2.017301627887156, + "grad_norm": 46.637344002890316, + "learning_rate": 2.928051696527122e-06, + "loss": 1.0464, + "step": 23669 + }, + { + "epoch": 2.0173868575811813, + "grad_norm": 65.46279497682842, + "learning_rate": 2.927600435828327e-06, + "loss": 1.7022, + "step": 23670 + }, + { + "epoch": 2.0174720872752068, + "grad_norm": 45.881528525149456, + "learning_rate": 2.927149195510761e-06, + "loss": 1.4232, + "step": 23671 + }, + { + "epoch": 2.0175573169692322, + "grad_norm": 40.234256582896634, + "learning_rate": 2.926697975578859e-06, + "loss": 1.5802, + "step": 23672 + }, + { + "epoch": 2.0176425466632573, + "grad_norm": 66.79014612669715, + "learning_rate": 2.9262467760370584e-06, + "loss": 1.5893, + "step": 23673 + }, + { + "epoch": 2.0177277763572827, + "grad_norm": 60.28223615139844, + "learning_rate": 2.9257955968897943e-06, + "loss": 1.6058, + "step": 23674 + }, + { + "epoch": 2.017813006051308, + "grad_norm": 18.640686360493667, + "learning_rate": 2.9253444381415085e-06, + "loss": 0.671, + "step": 23675 + }, + { + "epoch": 2.0178982357453337, + "grad_norm": 53.87264849276763, + "learning_rate": 2.9248932997966363e-06, + "loss": 1.7032, + "step": 23676 + }, + { + "epoch": 2.017983465439359, + "grad_norm": 51.527517357649856, + "learning_rate": 2.924442181859613e-06, + "loss": 1.497, + "step": 23677 + }, + { + "epoch": 2.0180686951333846, + "grad_norm": 50.56871449474446, + "learning_rate": 2.923991084334875e-06, + "loss": 2.0224, + "step": 23678 + }, + { + "epoch": 2.01815392482741, + "grad_norm": 59.595696566671506, + "learning_rate": 2.9235400072268617e-06, + "loss": 2.1647, + "step": 23679 + }, + { + "epoch": 2.018239154521435, + "grad_norm": 29.883584583465154, + "learning_rate": 2.9230889505400068e-06, + "loss": 1.208, + "step": 23680 + }, + { + "epoch": 2.0183243842154606, + "grad_norm": 29.245173501038185, + "learning_rate": 2.9226379142787466e-06, + "loss": 0.814, + "step": 23681 + }, + { + "epoch": 2.018409613909486, + "grad_norm": 21.045787500642373, + "learning_rate": 2.922186898447518e-06, + "loss": 0.7552, + "step": 23682 + }, + { + "epoch": 2.0184948436035115, + "grad_norm": 53.692501758615464, + "learning_rate": 2.9217359030507523e-06, + "loss": 1.6285, + "step": 23683 + }, + { + "epoch": 2.018580073297537, + "grad_norm": 39.03171424928945, + "learning_rate": 2.9212849280928914e-06, + "loss": 0.9211, + "step": 23684 + }, + { + "epoch": 2.0186653029915624, + "grad_norm": 40.709549938066125, + "learning_rate": 2.920833973578366e-06, + "loss": 1.3669, + "step": 23685 + }, + { + "epoch": 2.0187505326855875, + "grad_norm": 34.656323793667255, + "learning_rate": 2.920383039511613e-06, + "loss": 0.6809, + "step": 23686 + }, + { + "epoch": 2.018835762379613, + "grad_norm": 73.64615827428551, + "learning_rate": 2.919932125897065e-06, + "loss": 2.1403, + "step": 23687 + }, + { + "epoch": 2.0189209920736384, + "grad_norm": 28.557446305787014, + "learning_rate": 2.9194812327391598e-06, + "loss": 1.4694, + "step": 23688 + }, + { + "epoch": 2.019006221767664, + "grad_norm": 53.45917981179373, + "learning_rate": 2.9190303600423296e-06, + "loss": 1.6285, + "step": 23689 + }, + { + "epoch": 2.0190914514616893, + "grad_norm": 59.56118550455141, + "learning_rate": 2.9185795078110093e-06, + "loss": 2.5824, + "step": 23690 + }, + { + "epoch": 2.019176681155715, + "grad_norm": 60.296304511758855, + "learning_rate": 2.91812867604963e-06, + "loss": 2.1589, + "step": 23691 + }, + { + "epoch": 2.01926191084974, + "grad_norm": 70.55333204393698, + "learning_rate": 2.9176778647626284e-06, + "loss": 2.1916, + "step": 23692 + }, + { + "epoch": 2.0193471405437653, + "grad_norm": 23.988544419248093, + "learning_rate": 2.9172270739544393e-06, + "loss": 0.8036, + "step": 23693 + }, + { + "epoch": 2.0194323702377908, + "grad_norm": 57.958011291653115, + "learning_rate": 2.916776303629495e-06, + "loss": 1.8411, + "step": 23694 + }, + { + "epoch": 2.0195175999318162, + "grad_norm": 40.1718221606163, + "learning_rate": 2.9163255537922276e-06, + "loss": 1.348, + "step": 23695 + }, + { + "epoch": 2.0196028296258417, + "grad_norm": 35.376385245570255, + "learning_rate": 2.915874824447068e-06, + "loss": 1.2844, + "step": 23696 + }, + { + "epoch": 2.019688059319867, + "grad_norm": 79.72416384929772, + "learning_rate": 2.915424115598453e-06, + "loss": 1.8829, + "step": 23697 + }, + { + "epoch": 2.0197732890138926, + "grad_norm": 66.53804336876581, + "learning_rate": 2.9149734272508144e-06, + "loss": 1.305, + "step": 23698 + }, + { + "epoch": 2.0198585187079177, + "grad_norm": 29.918564378796766, + "learning_rate": 2.914522759408581e-06, + "loss": 1.0004, + "step": 23699 + }, + { + "epoch": 2.019943748401943, + "grad_norm": 32.84159815167691, + "learning_rate": 2.9140721120761883e-06, + "loss": 0.973, + "step": 23700 + }, + { + "epoch": 2.0200289780959686, + "grad_norm": 48.70791489502086, + "learning_rate": 2.913621485258069e-06, + "loss": 1.9363, + "step": 23701 + }, + { + "epoch": 2.020114207789994, + "grad_norm": 50.762830676331774, + "learning_rate": 2.913170878958653e-06, + "loss": 1.612, + "step": 23702 + }, + { + "epoch": 2.0201994374840195, + "grad_norm": 76.03335487895059, + "learning_rate": 2.912720293182373e-06, + "loss": 2.4827, + "step": 23703 + }, + { + "epoch": 2.020284667178045, + "grad_norm": 20.995881186621236, + "learning_rate": 2.912269727933657e-06, + "loss": 0.528, + "step": 23704 + }, + { + "epoch": 2.02036989687207, + "grad_norm": 28.881436349483014, + "learning_rate": 2.911819183216941e-06, + "loss": 1.1606, + "step": 23705 + }, + { + "epoch": 2.0204551265660955, + "grad_norm": 72.66337897744425, + "learning_rate": 2.911368659036653e-06, + "loss": 1.9401, + "step": 23706 + }, + { + "epoch": 2.020540356260121, + "grad_norm": 58.28112979531275, + "learning_rate": 2.910918155397223e-06, + "loss": 1.786, + "step": 23707 + }, + { + "epoch": 2.0206255859541464, + "grad_norm": 43.22433925581756, + "learning_rate": 2.9104676723030823e-06, + "loss": 2.1005, + "step": 23708 + }, + { + "epoch": 2.020710815648172, + "grad_norm": 50.10346407969717, + "learning_rate": 2.9100172097586643e-06, + "loss": 1.6326, + "step": 23709 + }, + { + "epoch": 2.0207960453421974, + "grad_norm": 26.205075262070306, + "learning_rate": 2.909566767768396e-06, + "loss": 1.0954, + "step": 23710 + }, + { + "epoch": 2.020881275036223, + "grad_norm": 26.65354762033517, + "learning_rate": 2.909116346336708e-06, + "loss": 1.0828, + "step": 23711 + }, + { + "epoch": 2.020966504730248, + "grad_norm": 30.458089787605484, + "learning_rate": 2.90866594546803e-06, + "loss": 1.2102, + "step": 23712 + }, + { + "epoch": 2.0210517344242733, + "grad_norm": 30.26433825946351, + "learning_rate": 2.9082155651667892e-06, + "loss": 1.5626, + "step": 23713 + }, + { + "epoch": 2.021136964118299, + "grad_norm": 39.419727948455616, + "learning_rate": 2.9077652054374195e-06, + "loss": 1.4712, + "step": 23714 + }, + { + "epoch": 2.0212221938123243, + "grad_norm": 40.314350637760185, + "learning_rate": 2.907314866284345e-06, + "loss": 1.4978, + "step": 23715 + }, + { + "epoch": 2.0213074235063497, + "grad_norm": 60.24166627093045, + "learning_rate": 2.9068645477119993e-06, + "loss": 2.0176, + "step": 23716 + }, + { + "epoch": 2.021392653200375, + "grad_norm": 44.76844318467713, + "learning_rate": 2.9064142497248084e-06, + "loss": 1.3528, + "step": 23717 + }, + { + "epoch": 2.0214778828944002, + "grad_norm": 23.115751552562436, + "learning_rate": 2.905963972327202e-06, + "loss": 0.8936, + "step": 23718 + }, + { + "epoch": 2.0215631125884257, + "grad_norm": 62.10497928325625, + "learning_rate": 2.905513715523608e-06, + "loss": 2.0656, + "step": 23719 + }, + { + "epoch": 2.021648342282451, + "grad_norm": 31.411146632704813, + "learning_rate": 2.9050634793184553e-06, + "loss": 1.0631, + "step": 23720 + }, + { + "epoch": 2.0217335719764766, + "grad_norm": 66.67750554087856, + "learning_rate": 2.9046132637161677e-06, + "loss": 1.6705, + "step": 23721 + }, + { + "epoch": 2.021818801670502, + "grad_norm": 60.34563744525295, + "learning_rate": 2.9041630687211786e-06, + "loss": 2.2563, + "step": 23722 + }, + { + "epoch": 2.0219040313645276, + "grad_norm": 48.39190477689055, + "learning_rate": 2.9037128943379105e-06, + "loss": 2.1473, + "step": 23723 + }, + { + "epoch": 2.0219892610585526, + "grad_norm": 46.847278017875965, + "learning_rate": 2.9032627405707958e-06, + "loss": 1.7927, + "step": 23724 + }, + { + "epoch": 2.022074490752578, + "grad_norm": 46.92594203758178, + "learning_rate": 2.902812607424258e-06, + "loss": 1.8365, + "step": 23725 + }, + { + "epoch": 2.0221597204466035, + "grad_norm": 44.01735569316725, + "learning_rate": 2.9023624949027233e-06, + "loss": 2.0805, + "step": 23726 + }, + { + "epoch": 2.022244950140629, + "grad_norm": 57.18025469415706, + "learning_rate": 2.9019124030106225e-06, + "loss": 1.9225, + "step": 23727 + }, + { + "epoch": 2.0223301798346545, + "grad_norm": 48.501300685239606, + "learning_rate": 2.9014623317523786e-06, + "loss": 1.3038, + "step": 23728 + }, + { + "epoch": 2.02241540952868, + "grad_norm": 31.452704194656985, + "learning_rate": 2.9010122811324194e-06, + "loss": 1.1859, + "step": 23729 + }, + { + "epoch": 2.0225006392227054, + "grad_norm": 24.25218746625927, + "learning_rate": 2.9005622511551677e-06, + "loss": 1.4351, + "step": 23730 + }, + { + "epoch": 2.0225858689167304, + "grad_norm": 30.39368368302549, + "learning_rate": 2.9001122418250527e-06, + "loss": 1.4276, + "step": 23731 + }, + { + "epoch": 2.022671098610756, + "grad_norm": 40.653807986865985, + "learning_rate": 2.8996622531465013e-06, + "loss": 1.6769, + "step": 23732 + }, + { + "epoch": 2.0227563283047814, + "grad_norm": 49.613012059824584, + "learning_rate": 2.8992122851239367e-06, + "loss": 1.7022, + "step": 23733 + }, + { + "epoch": 2.022841557998807, + "grad_norm": 24.902566841514794, + "learning_rate": 2.8987623377617835e-06, + "loss": 1.5963, + "step": 23734 + }, + { + "epoch": 2.0229267876928323, + "grad_norm": 26.788084492931347, + "learning_rate": 2.898312411064469e-06, + "loss": 0.5805, + "step": 23735 + }, + { + "epoch": 2.0230120173868578, + "grad_norm": 252.12763166452388, + "learning_rate": 2.897862505036416e-06, + "loss": 3.386, + "step": 23736 + }, + { + "epoch": 2.023097247080883, + "grad_norm": 27.371224594974137, + "learning_rate": 2.8974126196820514e-06, + "loss": 0.7207, + "step": 23737 + }, + { + "epoch": 2.0231824767749083, + "grad_norm": 42.189189138728466, + "learning_rate": 2.8969627550057955e-06, + "loss": 1.29, + "step": 23738 + }, + { + "epoch": 2.0232677064689337, + "grad_norm": 31.329104702762553, + "learning_rate": 2.896512911012075e-06, + "loss": 1.5398, + "step": 23739 + }, + { + "epoch": 2.023352936162959, + "grad_norm": 184.7448949397412, + "learning_rate": 2.8960630877053164e-06, + "loss": 1.9284, + "step": 23740 + }, + { + "epoch": 2.0234381658569847, + "grad_norm": 42.59936410220351, + "learning_rate": 2.8956132850899414e-06, + "loss": 1.3729, + "step": 23741 + }, + { + "epoch": 2.02352339555101, + "grad_norm": 67.3137499073488, + "learning_rate": 2.895163503170374e-06, + "loss": 1.5392, + "step": 23742 + }, + { + "epoch": 2.023608625245035, + "grad_norm": 39.87751130944194, + "learning_rate": 2.894713741951034e-06, + "loss": 1.1061, + "step": 23743 + }, + { + "epoch": 2.0236938549390606, + "grad_norm": 58.307319324588875, + "learning_rate": 2.894264001436351e-06, + "loss": 1.5684, + "step": 23744 + }, + { + "epoch": 2.023779084633086, + "grad_norm": 34.65038685803358, + "learning_rate": 2.893814281630744e-06, + "loss": 1.1085, + "step": 23745 + }, + { + "epoch": 2.0238643143271116, + "grad_norm": 88.18007228428083, + "learning_rate": 2.8933645825386348e-06, + "loss": 2.6374, + "step": 23746 + }, + { + "epoch": 2.023949544021137, + "grad_norm": 56.50634923801519, + "learning_rate": 2.8929149041644478e-06, + "loss": 1.7179, + "step": 23747 + }, + { + "epoch": 2.0240347737151625, + "grad_norm": 45.850948885367934, + "learning_rate": 2.892465246512608e-06, + "loss": 0.9823, + "step": 23748 + }, + { + "epoch": 2.024120003409188, + "grad_norm": 46.824116844546666, + "learning_rate": 2.8920156095875344e-06, + "loss": 1.7867, + "step": 23749 + }, + { + "epoch": 2.024205233103213, + "grad_norm": 57.475619329636515, + "learning_rate": 2.8915659933936503e-06, + "loss": 1.8466, + "step": 23750 + }, + { + "epoch": 2.0242904627972385, + "grad_norm": 70.72136896062095, + "learning_rate": 2.891116397935374e-06, + "loss": 2.1513, + "step": 23751 + }, + { + "epoch": 2.024375692491264, + "grad_norm": 53.69563838415972, + "learning_rate": 2.8906668232171324e-06, + "loss": 1.7834, + "step": 23752 + }, + { + "epoch": 2.0244609221852894, + "grad_norm": 56.93284918291647, + "learning_rate": 2.8902172692433442e-06, + "loss": 1.0746, + "step": 23753 + }, + { + "epoch": 2.024546151879315, + "grad_norm": 52.95856788625142, + "learning_rate": 2.8897677360184294e-06, + "loss": 1.6754, + "step": 23754 + }, + { + "epoch": 2.0246313815733403, + "grad_norm": 76.09563516187274, + "learning_rate": 2.889318223546812e-06, + "loss": 1.5454, + "step": 23755 + }, + { + "epoch": 2.0247166112673654, + "grad_norm": 58.530855709872334, + "learning_rate": 2.888868731832909e-06, + "loss": 1.2778, + "step": 23756 + }, + { + "epoch": 2.024801840961391, + "grad_norm": 29.897404293072235, + "learning_rate": 2.8884192608811457e-06, + "loss": 0.9463, + "step": 23757 + }, + { + "epoch": 2.0248870706554163, + "grad_norm": 58.930912177993896, + "learning_rate": 2.8879698106959393e-06, + "loss": 1.9646, + "step": 23758 + }, + { + "epoch": 2.0249723003494418, + "grad_norm": 37.445977535070064, + "learning_rate": 2.887520381281711e-06, + "loss": 1.5511, + "step": 23759 + }, + { + "epoch": 2.0250575300434672, + "grad_norm": 24.543697392169296, + "learning_rate": 2.887070972642878e-06, + "loss": 0.9688, + "step": 23760 + }, + { + "epoch": 2.0251427597374927, + "grad_norm": 56.385519621183946, + "learning_rate": 2.886621584783864e-06, + "loss": 1.5558, + "step": 23761 + }, + { + "epoch": 2.025227989431518, + "grad_norm": 52.83992276153292, + "learning_rate": 2.8861722177090855e-06, + "loss": 1.1363, + "step": 23762 + }, + { + "epoch": 2.025313219125543, + "grad_norm": 81.34265498652536, + "learning_rate": 2.8857228714229657e-06, + "loss": 2.1518, + "step": 23763 + }, + { + "epoch": 2.0253984488195687, + "grad_norm": 30.04417164981416, + "learning_rate": 2.8852735459299187e-06, + "loss": 1.6936, + "step": 23764 + }, + { + "epoch": 2.025483678513594, + "grad_norm": 67.3789594839471, + "learning_rate": 2.8848242412343686e-06, + "loss": 2.4025, + "step": 23765 + }, + { + "epoch": 2.0255689082076196, + "grad_norm": 31.11984324261882, + "learning_rate": 2.884374957340731e-06, + "loss": 0.6999, + "step": 23766 + }, + { + "epoch": 2.025654137901645, + "grad_norm": 7.534260237941609, + "learning_rate": 2.883925694253425e-06, + "loss": 0.2673, + "step": 23767 + }, + { + "epoch": 2.0257393675956705, + "grad_norm": 66.38307140229949, + "learning_rate": 2.8834764519768672e-06, + "loss": 1.5668, + "step": 23768 + }, + { + "epoch": 2.0258245972896956, + "grad_norm": 21.058400427745095, + "learning_rate": 2.8830272305154793e-06, + "loss": 0.913, + "step": 23769 + }, + { + "epoch": 2.025909826983721, + "grad_norm": 61.87921520594715, + "learning_rate": 2.882578029873676e-06, + "loss": 1.6704, + "step": 23770 + }, + { + "epoch": 2.0259950566777465, + "grad_norm": 63.04927291687737, + "learning_rate": 2.8821288500558776e-06, + "loss": 2.0994, + "step": 23771 + }, + { + "epoch": 2.026080286371772, + "grad_norm": 49.58967752430464, + "learning_rate": 2.8816796910665014e-06, + "loss": 2.0947, + "step": 23772 + }, + { + "epoch": 2.0261655160657974, + "grad_norm": 35.80522457145382, + "learning_rate": 2.8812305529099606e-06, + "loss": 1.3713, + "step": 23773 + }, + { + "epoch": 2.026250745759823, + "grad_norm": 40.203500933809295, + "learning_rate": 2.8807814355906777e-06, + "loss": 1.767, + "step": 23774 + }, + { + "epoch": 2.026335975453848, + "grad_norm": 42.76356115503579, + "learning_rate": 2.8803323391130678e-06, + "loss": 1.6185, + "step": 23775 + }, + { + "epoch": 2.0264212051478734, + "grad_norm": 54.560148581594696, + "learning_rate": 2.8798832634815464e-06, + "loss": 1.6156, + "step": 23776 + }, + { + "epoch": 2.026506434841899, + "grad_norm": 39.242807280970716, + "learning_rate": 2.8794342087005288e-06, + "loss": 2.1101, + "step": 23777 + }, + { + "epoch": 2.0265916645359243, + "grad_norm": 24.488379488467494, + "learning_rate": 2.8789851747744334e-06, + "loss": 1.2261, + "step": 23778 + }, + { + "epoch": 2.02667689422995, + "grad_norm": 37.67193910991718, + "learning_rate": 2.878536161707678e-06, + "loss": 1.6548, + "step": 23779 + }, + { + "epoch": 2.0267621239239753, + "grad_norm": 36.90084024207809, + "learning_rate": 2.878087169504676e-06, + "loss": 0.8527, + "step": 23780 + }, + { + "epoch": 2.0268473536180007, + "grad_norm": 64.05509858322263, + "learning_rate": 2.877638198169842e-06, + "loss": 2.2198, + "step": 23781 + }, + { + "epoch": 2.0269325833120257, + "grad_norm": 29.170446836784986, + "learning_rate": 2.8771892477075946e-06, + "loss": 1.5532, + "step": 23782 + }, + { + "epoch": 2.027017813006051, + "grad_norm": 53.951134664585716, + "learning_rate": 2.8767403181223487e-06, + "loss": 2.1591, + "step": 23783 + }, + { + "epoch": 2.0271030427000767, + "grad_norm": 37.9964472101973, + "learning_rate": 2.876291409418517e-06, + "loss": 1.9236, + "step": 23784 + }, + { + "epoch": 2.027188272394102, + "grad_norm": 36.72280030881946, + "learning_rate": 2.8758425216005125e-06, + "loss": 1.2521, + "step": 23785 + }, + { + "epoch": 2.0272735020881276, + "grad_norm": 58.52914544512538, + "learning_rate": 2.8753936546727547e-06, + "loss": 1.6368, + "step": 23786 + }, + { + "epoch": 2.027358731782153, + "grad_norm": 54.60450281753089, + "learning_rate": 2.8749448086396565e-06, + "loss": 1.2547, + "step": 23787 + }, + { + "epoch": 2.027443961476178, + "grad_norm": 44.84741548429827, + "learning_rate": 2.8744959835056326e-06, + "loss": 1.3288, + "step": 23788 + }, + { + "epoch": 2.0275291911702036, + "grad_norm": 41.393541301321584, + "learning_rate": 2.8740471792750956e-06, + "loss": 1.9954, + "step": 23789 + }, + { + "epoch": 2.027614420864229, + "grad_norm": 36.04726116778433, + "learning_rate": 2.873598395952458e-06, + "loss": 1.1597, + "step": 23790 + }, + { + "epoch": 2.0276996505582545, + "grad_norm": 31.541414704514754, + "learning_rate": 2.873149633542137e-06, + "loss": 1.8052, + "step": 23791 + }, + { + "epoch": 2.02778488025228, + "grad_norm": 54.40778357405163, + "learning_rate": 2.8727008920485446e-06, + "loss": 1.6627, + "step": 23792 + }, + { + "epoch": 2.0278701099463055, + "grad_norm": 33.75278658646436, + "learning_rate": 2.8722521714760916e-06, + "loss": 1.0103, + "step": 23793 + }, + { + "epoch": 2.0279553396403305, + "grad_norm": 93.10073988112583, + "learning_rate": 2.8718034718291925e-06, + "loss": 1.8669, + "step": 23794 + }, + { + "epoch": 2.028040569334356, + "grad_norm": 57.642041034370386, + "learning_rate": 2.871354793112263e-06, + "loss": 1.2302, + "step": 23795 + }, + { + "epoch": 2.0281257990283814, + "grad_norm": 45.41904590822052, + "learning_rate": 2.8709061353297133e-06, + "loss": 1.5156, + "step": 23796 + }, + { + "epoch": 2.028211028722407, + "grad_norm": 73.237925726137, + "learning_rate": 2.8704574984859556e-06, + "loss": 2.308, + "step": 23797 + }, + { + "epoch": 2.0282962584164324, + "grad_norm": 37.67510237215842, + "learning_rate": 2.870008882585402e-06, + "loss": 1.2023, + "step": 23798 + }, + { + "epoch": 2.028381488110458, + "grad_norm": 73.83153134085552, + "learning_rate": 2.8695602876324635e-06, + "loss": 2.5625, + "step": 23799 + }, + { + "epoch": 2.0284667178044833, + "grad_norm": 21.74680970766832, + "learning_rate": 2.8691117136315543e-06, + "loss": 1.1501, + "step": 23800 + }, + { + "epoch": 2.0285519474985083, + "grad_norm": 24.588743739246873, + "learning_rate": 2.868663160587083e-06, + "loss": 0.9826, + "step": 23801 + }, + { + "epoch": 2.028637177192534, + "grad_norm": 47.58887164922507, + "learning_rate": 2.868214628503464e-06, + "loss": 1.6663, + "step": 23802 + }, + { + "epoch": 2.0287224068865592, + "grad_norm": 88.47320042932652, + "learning_rate": 2.8677661173851058e-06, + "loss": 2.4002, + "step": 23803 + }, + { + "epoch": 2.0288076365805847, + "grad_norm": 81.07775721170447, + "learning_rate": 2.8673176272364223e-06, + "loss": 2.104, + "step": 23804 + }, + { + "epoch": 2.02889286627461, + "grad_norm": 43.209401282912744, + "learning_rate": 2.8668691580618226e-06, + "loss": 1.0845, + "step": 23805 + }, + { + "epoch": 2.0289780959686357, + "grad_norm": 46.11609407385068, + "learning_rate": 2.866420709865717e-06, + "loss": 2.0118, + "step": 23806 + }, + { + "epoch": 2.0290633256626607, + "grad_norm": 58.611426887428294, + "learning_rate": 2.8659722826525137e-06, + "loss": 2.5265, + "step": 23807 + }, + { + "epoch": 2.029148555356686, + "grad_norm": 63.2613434990139, + "learning_rate": 2.865523876426627e-06, + "loss": 1.8523, + "step": 23808 + }, + { + "epoch": 2.0292337850507116, + "grad_norm": 22.836244282808973, + "learning_rate": 2.865075491192463e-06, + "loss": 0.9448, + "step": 23809 + }, + { + "epoch": 2.029319014744737, + "grad_norm": 36.82339409456464, + "learning_rate": 2.864627126954436e-06, + "loss": 1.4868, + "step": 23810 + }, + { + "epoch": 2.0294042444387626, + "grad_norm": 70.16955206530145, + "learning_rate": 2.8641787837169493e-06, + "loss": 1.778, + "step": 23811 + }, + { + "epoch": 2.029489474132788, + "grad_norm": 43.140103207857926, + "learning_rate": 2.863730461484418e-06, + "loss": 1.5675, + "step": 23812 + }, + { + "epoch": 2.029574703826813, + "grad_norm": 62.28629372329168, + "learning_rate": 2.8632821602612494e-06, + "loss": 1.8752, + "step": 23813 + }, + { + "epoch": 2.0296599335208385, + "grad_norm": 38.312720460698024, + "learning_rate": 2.8628338800518507e-06, + "loss": 1.4772, + "step": 23814 + }, + { + "epoch": 2.029745163214864, + "grad_norm": 46.266998521098984, + "learning_rate": 2.8623856208606306e-06, + "loss": 1.4886, + "step": 23815 + }, + { + "epoch": 2.0298303929088894, + "grad_norm": 64.70436456583268, + "learning_rate": 2.861937382691999e-06, + "loss": 1.9226, + "step": 23816 + }, + { + "epoch": 2.029915622602915, + "grad_norm": 35.37871294180474, + "learning_rate": 2.861489165550363e-06, + "loss": 1.0902, + "step": 23817 + }, + { + "epoch": 2.0300008522969404, + "grad_norm": 76.1388470460516, + "learning_rate": 2.8610409694401332e-06, + "loss": 1.9943, + "step": 23818 + }, + { + "epoch": 2.030086081990966, + "grad_norm": 56.01666171182118, + "learning_rate": 2.860592794365715e-06, + "loss": 1.7501, + "step": 23819 + }, + { + "epoch": 2.030171311684991, + "grad_norm": 29.319186077801195, + "learning_rate": 2.860144640331515e-06, + "loss": 1.045, + "step": 23820 + }, + { + "epoch": 2.0302565413790163, + "grad_norm": 96.85654431733595, + "learning_rate": 2.8596965073419443e-06, + "loss": 1.7677, + "step": 23821 + }, + { + "epoch": 2.030341771073042, + "grad_norm": 61.990890310514146, + "learning_rate": 2.859248395401407e-06, + "loss": 1.6318, + "step": 23822 + }, + { + "epoch": 2.0304270007670673, + "grad_norm": 88.68148958280884, + "learning_rate": 2.858800304514312e-06, + "loss": 2.0745, + "step": 23823 + }, + { + "epoch": 2.0305122304610927, + "grad_norm": 26.032358792681265, + "learning_rate": 2.8583522346850635e-06, + "loss": 1.1102, + "step": 23824 + }, + { + "epoch": 2.030597460155118, + "grad_norm": 57.48291936730839, + "learning_rate": 2.857904185918069e-06, + "loss": 1.4796, + "step": 23825 + }, + { + "epoch": 2.0306826898491432, + "grad_norm": 60.183388216907126, + "learning_rate": 2.857456158217737e-06, + "loss": 1.5452, + "step": 23826 + }, + { + "epoch": 2.0307679195431687, + "grad_norm": 43.82487871407584, + "learning_rate": 2.857008151588474e-06, + "loss": 1.6234, + "step": 23827 + }, + { + "epoch": 2.030853149237194, + "grad_norm": 37.40216978006281, + "learning_rate": 2.856560166034683e-06, + "loss": 1.2788, + "step": 23828 + }, + { + "epoch": 2.0309383789312196, + "grad_norm": 66.24003059086822, + "learning_rate": 2.85611220156077e-06, + "loss": 1.8275, + "step": 23829 + }, + { + "epoch": 2.031023608625245, + "grad_norm": 46.50247686405032, + "learning_rate": 2.855664258171143e-06, + "loss": 1.3704, + "step": 23830 + }, + { + "epoch": 2.0311088383192706, + "grad_norm": 67.88389645521373, + "learning_rate": 2.8552163358702055e-06, + "loss": 1.6079, + "step": 23831 + }, + { + "epoch": 2.031194068013296, + "grad_norm": 29.673360397694978, + "learning_rate": 2.8547684346623618e-06, + "loss": 0.946, + "step": 23832 + }, + { + "epoch": 2.031279297707321, + "grad_norm": 47.37073948083027, + "learning_rate": 2.854320554552018e-06, + "loss": 1.4056, + "step": 23833 + }, + { + "epoch": 2.0313645274013465, + "grad_norm": 62.492027008839514, + "learning_rate": 2.8538726955435813e-06, + "loss": 1.5476, + "step": 23834 + }, + { + "epoch": 2.031449757095372, + "grad_norm": 25.871129337843406, + "learning_rate": 2.853424857641453e-06, + "loss": 0.9074, + "step": 23835 + }, + { + "epoch": 2.0315349867893975, + "grad_norm": 27.170098914231737, + "learning_rate": 2.852977040850039e-06, + "loss": 0.855, + "step": 23836 + }, + { + "epoch": 2.031620216483423, + "grad_norm": 29.588571768510356, + "learning_rate": 2.8525292451737403e-06, + "loss": 0.8999, + "step": 23837 + }, + { + "epoch": 2.0317054461774484, + "grad_norm": 57.84629012044477, + "learning_rate": 2.8520814706169653e-06, + "loss": 1.4529, + "step": 23838 + }, + { + "epoch": 2.0317906758714734, + "grad_norm": 22.716569104341207, + "learning_rate": 2.851633717184115e-06, + "loss": 0.9662, + "step": 23839 + }, + { + "epoch": 2.031875905565499, + "grad_norm": 151.26447772903856, + "learning_rate": 2.851185984879592e-06, + "loss": 1.4182, + "step": 23840 + }, + { + "epoch": 2.0319611352595244, + "grad_norm": 26.463222178710534, + "learning_rate": 2.850738273707801e-06, + "loss": 1.0698, + "step": 23841 + }, + { + "epoch": 2.03204636495355, + "grad_norm": 26.133457045195172, + "learning_rate": 2.8502905836731463e-06, + "loss": 0.9996, + "step": 23842 + }, + { + "epoch": 2.0321315946475753, + "grad_norm": 43.30463960553267, + "learning_rate": 2.8498429147800304e-06, + "loss": 1.3503, + "step": 23843 + }, + { + "epoch": 2.032216824341601, + "grad_norm": 102.43774603178974, + "learning_rate": 2.849395267032855e-06, + "loss": 1.7721, + "step": 23844 + }, + { + "epoch": 2.032302054035626, + "grad_norm": 31.028913089526768, + "learning_rate": 2.848947640436022e-06, + "loss": 1.4288, + "step": 23845 + }, + { + "epoch": 2.0323872837296513, + "grad_norm": 56.95176130697555, + "learning_rate": 2.848500034993933e-06, + "loss": 0.9211, + "step": 23846 + }, + { + "epoch": 2.0324725134236767, + "grad_norm": 35.91338310585491, + "learning_rate": 2.8480524507109927e-06, + "loss": 0.898, + "step": 23847 + }, + { + "epoch": 2.032557743117702, + "grad_norm": 24.52009621834469, + "learning_rate": 2.8476048875915997e-06, + "loss": 0.7848, + "step": 23848 + }, + { + "epoch": 2.0326429728117277, + "grad_norm": 15.263170073746219, + "learning_rate": 2.847157345640159e-06, + "loss": 0.7167, + "step": 23849 + }, + { + "epoch": 2.032728202505753, + "grad_norm": 37.670774936977736, + "learning_rate": 2.846709824861068e-06, + "loss": 0.9715, + "step": 23850 + }, + { + "epoch": 2.0328134321997786, + "grad_norm": 45.976641353014955, + "learning_rate": 2.8462623252587328e-06, + "loss": 1.413, + "step": 23851 + }, + { + "epoch": 2.0328986618938036, + "grad_norm": 46.25650223897452, + "learning_rate": 2.845814846837552e-06, + "loss": 1.1064, + "step": 23852 + }, + { + "epoch": 2.032983891587829, + "grad_norm": 69.25810739724105, + "learning_rate": 2.8453673896019263e-06, + "loss": 1.6913, + "step": 23853 + }, + { + "epoch": 2.0330691212818546, + "grad_norm": 52.09432983188166, + "learning_rate": 2.8449199535562532e-06, + "loss": 1.4596, + "step": 23854 + }, + { + "epoch": 2.03315435097588, + "grad_norm": 87.36329865177875, + "learning_rate": 2.8444725387049387e-06, + "loss": 1.4759, + "step": 23855 + }, + { + "epoch": 2.0332395806699055, + "grad_norm": 36.44426080003536, + "learning_rate": 2.8440251450523782e-06, + "loss": 1.5204, + "step": 23856 + }, + { + "epoch": 2.033324810363931, + "grad_norm": 22.829374810486637, + "learning_rate": 2.8435777726029757e-06, + "loss": 0.8973, + "step": 23857 + }, + { + "epoch": 2.033410040057956, + "grad_norm": 54.95263197834213, + "learning_rate": 2.8431304213611288e-06, + "loss": 0.9538, + "step": 23858 + }, + { + "epoch": 2.0334952697519815, + "grad_norm": 49.35492783221819, + "learning_rate": 2.8426830913312353e-06, + "loss": 1.2817, + "step": 23859 + }, + { + "epoch": 2.033580499446007, + "grad_norm": 57.88249730784598, + "learning_rate": 2.842235782517698e-06, + "loss": 1.923, + "step": 23860 + }, + { + "epoch": 2.0336657291400324, + "grad_norm": 71.07487850451933, + "learning_rate": 2.841788494924915e-06, + "loss": 2.1052, + "step": 23861 + }, + { + "epoch": 2.033750958834058, + "grad_norm": 24.671004260694, + "learning_rate": 2.8413412285572827e-06, + "loss": 1.441, + "step": 23862 + }, + { + "epoch": 2.0338361885280833, + "grad_norm": 48.71722133959216, + "learning_rate": 2.840893983419203e-06, + "loss": 1.8689, + "step": 23863 + }, + { + "epoch": 2.0339214182221084, + "grad_norm": 25.505342024807973, + "learning_rate": 2.840446759515071e-06, + "loss": 0.4997, + "step": 23864 + }, + { + "epoch": 2.034006647916134, + "grad_norm": 118.0193247293357, + "learning_rate": 2.8399995568492894e-06, + "loss": 1.7802, + "step": 23865 + }, + { + "epoch": 2.0340918776101593, + "grad_norm": 34.51301387322936, + "learning_rate": 2.8395523754262533e-06, + "loss": 0.8212, + "step": 23866 + }, + { + "epoch": 2.0341771073041848, + "grad_norm": 22.71495737029683, + "learning_rate": 2.8391052152503597e-06, + "loss": 0.8731, + "step": 23867 + }, + { + "epoch": 2.0342623369982102, + "grad_norm": 71.66698211869577, + "learning_rate": 2.83865807632601e-06, + "loss": 2.1653, + "step": 23868 + }, + { + "epoch": 2.0343475666922357, + "grad_norm": 45.129003378961436, + "learning_rate": 2.8382109586575978e-06, + "loss": 1.0667, + "step": 23869 + }, + { + "epoch": 2.034432796386261, + "grad_norm": 49.12230848016183, + "learning_rate": 2.837763862249523e-06, + "loss": 1.2695, + "step": 23870 + }, + { + "epoch": 2.034518026080286, + "grad_norm": 82.14305750211626, + "learning_rate": 2.837316787106179e-06, + "loss": 1.7019, + "step": 23871 + }, + { + "epoch": 2.0346032557743117, + "grad_norm": 69.79262848712688, + "learning_rate": 2.8368697332319656e-06, + "loss": 1.493, + "step": 23872 + }, + { + "epoch": 2.034688485468337, + "grad_norm": 40.44794430464674, + "learning_rate": 2.8364227006312804e-06, + "loss": 1.4164, + "step": 23873 + }, + { + "epoch": 2.0347737151623626, + "grad_norm": 15.94039246013073, + "learning_rate": 2.8359756893085185e-06, + "loss": 0.9179, + "step": 23874 + }, + { + "epoch": 2.034858944856388, + "grad_norm": 29.93224816045479, + "learning_rate": 2.835528699268075e-06, + "loss": 0.8064, + "step": 23875 + }, + { + "epoch": 2.0349441745504135, + "grad_norm": 58.82472435461143, + "learning_rate": 2.8350817305143453e-06, + "loss": 1.6428, + "step": 23876 + }, + { + "epoch": 2.0350294042444386, + "grad_norm": 55.36872391871936, + "learning_rate": 2.8346347830517283e-06, + "loss": 1.142, + "step": 23877 + }, + { + "epoch": 2.035114633938464, + "grad_norm": 58.314859167257076, + "learning_rate": 2.8341878568846182e-06, + "loss": 1.5945, + "step": 23878 + }, + { + "epoch": 2.0351998636324895, + "grad_norm": 56.81125806527361, + "learning_rate": 2.8337409520174073e-06, + "loss": 1.3647, + "step": 23879 + }, + { + "epoch": 2.035285093326515, + "grad_norm": 38.26795776699692, + "learning_rate": 2.833294068454494e-06, + "loss": 1.2863, + "step": 23880 + }, + { + "epoch": 2.0353703230205404, + "grad_norm": 49.27175023408348, + "learning_rate": 2.8328472062002743e-06, + "loss": 1.1786, + "step": 23881 + }, + { + "epoch": 2.035455552714566, + "grad_norm": 35.618671326215896, + "learning_rate": 2.8324003652591414e-06, + "loss": 0.8792, + "step": 23882 + }, + { + "epoch": 2.0355407824085914, + "grad_norm": 40.39523198223394, + "learning_rate": 2.831953545635489e-06, + "loss": 1.7437, + "step": 23883 + }, + { + "epoch": 2.0356260121026164, + "grad_norm": 43.45534518076851, + "learning_rate": 2.8315067473337106e-06, + "loss": 1.4021, + "step": 23884 + }, + { + "epoch": 2.035711241796642, + "grad_norm": 25.493101994956064, + "learning_rate": 2.831059970358203e-06, + "loss": 0.8528, + "step": 23885 + }, + { + "epoch": 2.0357964714906673, + "grad_norm": 24.048135324319627, + "learning_rate": 2.830613214713359e-06, + "loss": 0.7696, + "step": 23886 + }, + { + "epoch": 2.035881701184693, + "grad_norm": 47.81394167574565, + "learning_rate": 2.8301664804035705e-06, + "loss": 1.204, + "step": 23887 + }, + { + "epoch": 2.0359669308787183, + "grad_norm": 37.73851375951754, + "learning_rate": 2.8297197674332345e-06, + "loss": 0.9039, + "step": 23888 + }, + { + "epoch": 2.0360521605727437, + "grad_norm": 46.69900348163924, + "learning_rate": 2.829273075806739e-06, + "loss": 1.1493, + "step": 23889 + }, + { + "epoch": 2.0361373902667688, + "grad_norm": 44.99734713142632, + "learning_rate": 2.8288264055284832e-06, + "loss": 1.5001, + "step": 23890 + }, + { + "epoch": 2.0362226199607942, + "grad_norm": 60.1284019758959, + "learning_rate": 2.828379756602857e-06, + "loss": 2.5462, + "step": 23891 + }, + { + "epoch": 2.0363078496548197, + "grad_norm": 59.97709066578079, + "learning_rate": 2.8279331290342526e-06, + "loss": 1.8662, + "step": 23892 + }, + { + "epoch": 2.036393079348845, + "grad_norm": 47.20605564878411, + "learning_rate": 2.827486522827061e-06, + "loss": 1.3839, + "step": 23893 + }, + { + "epoch": 2.0364783090428706, + "grad_norm": 58.55820212902806, + "learning_rate": 2.8270399379856784e-06, + "loss": 1.3607, + "step": 23894 + }, + { + "epoch": 2.036563538736896, + "grad_norm": 50.823768390527114, + "learning_rate": 2.8265933745144925e-06, + "loss": 2.6487, + "step": 23895 + }, + { + "epoch": 2.036648768430921, + "grad_norm": 31.310053932385834, + "learning_rate": 2.8261468324178985e-06, + "loss": 1.2871, + "step": 23896 + }, + { + "epoch": 2.0367339981249466, + "grad_norm": 48.83444072272405, + "learning_rate": 2.8257003117002848e-06, + "loss": 1.4327, + "step": 23897 + }, + { + "epoch": 2.036819227818972, + "grad_norm": 50.46822421588003, + "learning_rate": 2.8252538123660466e-06, + "loss": 0.856, + "step": 23898 + }, + { + "epoch": 2.0369044575129975, + "grad_norm": 60.799466983349824, + "learning_rate": 2.8248073344195737e-06, + "loss": 1.7507, + "step": 23899 + }, + { + "epoch": 2.036989687207023, + "grad_norm": 29.307630986502875, + "learning_rate": 2.8243608778652555e-06, + "loss": 0.6276, + "step": 23900 + }, + { + "epoch": 2.0370749169010485, + "grad_norm": 26.230455205671344, + "learning_rate": 2.8239144427074816e-06, + "loss": 1.0333, + "step": 23901 + }, + { + "epoch": 2.037160146595074, + "grad_norm": 27.24397392200905, + "learning_rate": 2.8234680289506465e-06, + "loss": 0.864, + "step": 23902 + }, + { + "epoch": 2.037245376289099, + "grad_norm": 104.88574187465377, + "learning_rate": 2.823021636599137e-06, + "loss": 2.2567, + "step": 23903 + }, + { + "epoch": 2.0373306059831244, + "grad_norm": 16.576493291135485, + "learning_rate": 2.822575265657347e-06, + "loss": 0.6399, + "step": 23904 + }, + { + "epoch": 2.03741583567715, + "grad_norm": 58.77483814134716, + "learning_rate": 2.822128916129663e-06, + "loss": 1.2448, + "step": 23905 + }, + { + "epoch": 2.0375010653711754, + "grad_norm": 46.825814490572355, + "learning_rate": 2.821682588020474e-06, + "loss": 1.4264, + "step": 23906 + }, + { + "epoch": 2.037586295065201, + "grad_norm": 35.129455695658606, + "learning_rate": 2.821236281334173e-06, + "loss": 1.3065, + "step": 23907 + }, + { + "epoch": 2.0376715247592263, + "grad_norm": 56.40100558781514, + "learning_rate": 2.820789996075148e-06, + "loss": 0.9881, + "step": 23908 + }, + { + "epoch": 2.0377567544532513, + "grad_norm": 47.938471565022006, + "learning_rate": 2.820343732247785e-06, + "loss": 1.459, + "step": 23909 + }, + { + "epoch": 2.037841984147277, + "grad_norm": 27.997973365826198, + "learning_rate": 2.8198974898564774e-06, + "loss": 1.5856, + "step": 23910 + }, + { + "epoch": 2.0379272138413023, + "grad_norm": 24.318966796582362, + "learning_rate": 2.8194512689056096e-06, + "loss": 0.9452, + "step": 23911 + }, + { + "epoch": 2.0380124435353277, + "grad_norm": 48.47563024091419, + "learning_rate": 2.819005069399574e-06, + "loss": 1.633, + "step": 23912 + }, + { + "epoch": 2.038097673229353, + "grad_norm": 29.339009751057993, + "learning_rate": 2.8185588913427574e-06, + "loss": 1.0898, + "step": 23913 + }, + { + "epoch": 2.0381829029233787, + "grad_norm": 53.046461407414974, + "learning_rate": 2.818112734739545e-06, + "loss": 1.3765, + "step": 23914 + }, + { + "epoch": 2.0382681326174037, + "grad_norm": 52.10952159823793, + "learning_rate": 2.8176665995943285e-06, + "loss": 1.4976, + "step": 23915 + }, + { + "epoch": 2.038353362311429, + "grad_norm": 49.78116196480033, + "learning_rate": 2.8172204859114943e-06, + "loss": 1.4424, + "step": 23916 + }, + { + "epoch": 2.0384385920054546, + "grad_norm": 45.210812289971074, + "learning_rate": 2.8167743936954272e-06, + "loss": 2.1789, + "step": 23917 + }, + { + "epoch": 2.03852382169948, + "grad_norm": 59.30088670376991, + "learning_rate": 2.8163283229505184e-06, + "loss": 1.7118, + "step": 23918 + }, + { + "epoch": 2.0386090513935056, + "grad_norm": 47.45178871569693, + "learning_rate": 2.815882273681151e-06, + "loss": 2.1062, + "step": 23919 + }, + { + "epoch": 2.038694281087531, + "grad_norm": 58.0512941310971, + "learning_rate": 2.815436245891715e-06, + "loss": 1.4697, + "step": 23920 + }, + { + "epoch": 2.0387795107815565, + "grad_norm": 64.21012534217586, + "learning_rate": 2.8149902395865957e-06, + "loss": 1.3073, + "step": 23921 + }, + { + "epoch": 2.0388647404755815, + "grad_norm": 57.18480155033422, + "learning_rate": 2.8145442547701797e-06, + "loss": 1.8415, + "step": 23922 + }, + { + "epoch": 2.038949970169607, + "grad_norm": 97.27944160439695, + "learning_rate": 2.8140982914468494e-06, + "loss": 1.3204, + "step": 23923 + }, + { + "epoch": 2.0390351998636325, + "grad_norm": 47.12532773659237, + "learning_rate": 2.813652349620997e-06, + "loss": 1.4165, + "step": 23924 + }, + { + "epoch": 2.039120429557658, + "grad_norm": 39.56441305731427, + "learning_rate": 2.813206429297004e-06, + "loss": 1.3608, + "step": 23925 + }, + { + "epoch": 2.0392056592516834, + "grad_norm": 70.0443580341333, + "learning_rate": 2.812760530479256e-06, + "loss": 2.1263, + "step": 23926 + }, + { + "epoch": 2.039290888945709, + "grad_norm": 56.82475099591589, + "learning_rate": 2.812314653172138e-06, + "loss": 2.1931, + "step": 23927 + }, + { + "epoch": 2.039376118639734, + "grad_norm": 17.09114099658457, + "learning_rate": 2.8118687973800386e-06, + "loss": 0.5467, + "step": 23928 + }, + { + "epoch": 2.0394613483337594, + "grad_norm": 44.803556866274846, + "learning_rate": 2.8114229631073397e-06, + "loss": 1.6341, + "step": 23929 + }, + { + "epoch": 2.039546578027785, + "grad_norm": 83.85450560403874, + "learning_rate": 2.8109771503584273e-06, + "loss": 2.3009, + "step": 23930 + }, + { + "epoch": 2.0396318077218103, + "grad_norm": 59.32035783469721, + "learning_rate": 2.810531359137684e-06, + "loss": 2.2856, + "step": 23931 + }, + { + "epoch": 2.0397170374158358, + "grad_norm": 26.84142529002928, + "learning_rate": 2.810085589449492e-06, + "loss": 0.8281, + "step": 23932 + }, + { + "epoch": 2.0398022671098612, + "grad_norm": 56.79660038024526, + "learning_rate": 2.8096398412982413e-06, + "loss": 1.764, + "step": 23933 + }, + { + "epoch": 2.0398874968038863, + "grad_norm": 31.882345051480574, + "learning_rate": 2.8091941146883093e-06, + "loss": 0.8983, + "step": 23934 + }, + { + "epoch": 2.0399727264979117, + "grad_norm": 59.53260369004538, + "learning_rate": 2.808748409624086e-06, + "loss": 1.6203, + "step": 23935 + }, + { + "epoch": 2.040057956191937, + "grad_norm": 80.89270563468484, + "learning_rate": 2.808302726109948e-06, + "loss": 2.255, + "step": 23936 + }, + { + "epoch": 2.0401431858859627, + "grad_norm": 51.32937614408912, + "learning_rate": 2.8078570641502843e-06, + "loss": 1.8197, + "step": 23937 + }, + { + "epoch": 2.040228415579988, + "grad_norm": 31.83302366089354, + "learning_rate": 2.807411423749476e-06, + "loss": 1.2858, + "step": 23938 + }, + { + "epoch": 2.0403136452740136, + "grad_norm": 46.40682463096139, + "learning_rate": 2.8069658049119047e-06, + "loss": 2.1126, + "step": 23939 + }, + { + "epoch": 2.040398874968039, + "grad_norm": 22.21313484039731, + "learning_rate": 2.8065202076419505e-06, + "loss": 0.8021, + "step": 23940 + }, + { + "epoch": 2.040484104662064, + "grad_norm": 39.62556877563235, + "learning_rate": 2.8060746319440014e-06, + "loss": 0.7485, + "step": 23941 + }, + { + "epoch": 2.0405693343560896, + "grad_norm": 30.936691573341978, + "learning_rate": 2.8056290778224345e-06, + "loss": 1.52, + "step": 23942 + }, + { + "epoch": 2.040654564050115, + "grad_norm": 291.82215371381824, + "learning_rate": 2.8051835452816357e-06, + "loss": 2.5068, + "step": 23943 + }, + { + "epoch": 2.0407397937441405, + "grad_norm": 44.668675175734194, + "learning_rate": 2.8047380343259837e-06, + "loss": 1.6532, + "step": 23944 + }, + { + "epoch": 2.040825023438166, + "grad_norm": 42.03886779258365, + "learning_rate": 2.8042925449598595e-06, + "loss": 1.3237, + "step": 23945 + }, + { + "epoch": 2.0409102531321914, + "grad_norm": 76.19327973514457, + "learning_rate": 2.8038470771876476e-06, + "loss": 2.1975, + "step": 23946 + }, + { + "epoch": 2.0409954828262165, + "grad_norm": 119.29136368900505, + "learning_rate": 2.803401631013727e-06, + "loss": 1.8108, + "step": 23947 + }, + { + "epoch": 2.041080712520242, + "grad_norm": 15.314323021421355, + "learning_rate": 2.8029562064424763e-06, + "loss": 0.5101, + "step": 23948 + }, + { + "epoch": 2.0411659422142674, + "grad_norm": 59.83943672112573, + "learning_rate": 2.8025108034782807e-06, + "loss": 2.0277, + "step": 23949 + }, + { + "epoch": 2.041251171908293, + "grad_norm": 69.30913880474326, + "learning_rate": 2.8020654221255163e-06, + "loss": 1.6724, + "step": 23950 + }, + { + "epoch": 2.0413364016023183, + "grad_norm": 29.99075860990696, + "learning_rate": 2.801620062388566e-06, + "loss": 1.5199, + "step": 23951 + }, + { + "epoch": 2.041421631296344, + "grad_norm": 59.50650687907804, + "learning_rate": 2.8011747242718097e-06, + "loss": 1.9799, + "step": 23952 + }, + { + "epoch": 2.041506860990369, + "grad_norm": 74.3334466408357, + "learning_rate": 2.800729407779624e-06, + "loss": 1.872, + "step": 23953 + }, + { + "epoch": 2.0415920906843943, + "grad_norm": 40.29132900223117, + "learning_rate": 2.8002841129163927e-06, + "loss": 1.0094, + "step": 23954 + }, + { + "epoch": 2.0416773203784198, + "grad_norm": 26.344862405633418, + "learning_rate": 2.7998388396864933e-06, + "loss": 0.6683, + "step": 23955 + }, + { + "epoch": 2.0417625500724452, + "grad_norm": 39.22847109455423, + "learning_rate": 2.7993935880943026e-06, + "loss": 1.2927, + "step": 23956 + }, + { + "epoch": 2.0418477797664707, + "grad_norm": 63.91397865266352, + "learning_rate": 2.7989483581442034e-06, + "loss": 2.2939, + "step": 23957 + }, + { + "epoch": 2.041933009460496, + "grad_norm": 55.84522615367037, + "learning_rate": 2.79850314984057e-06, + "loss": 1.4114, + "step": 23958 + }, + { + "epoch": 2.0420182391545216, + "grad_norm": 26.97226857789557, + "learning_rate": 2.7980579631877864e-06, + "loss": 1.1571, + "step": 23959 + }, + { + "epoch": 2.0421034688485467, + "grad_norm": 27.747103017238892, + "learning_rate": 2.797612798190227e-06, + "loss": 0.9977, + "step": 23960 + }, + { + "epoch": 2.042188698542572, + "grad_norm": 31.26670812053647, + "learning_rate": 2.797167654852271e-06, + "loss": 1.3195, + "step": 23961 + }, + { + "epoch": 2.0422739282365976, + "grad_norm": 45.063301394497834, + "learning_rate": 2.7967225331782934e-06, + "loss": 1.4974, + "step": 23962 + }, + { + "epoch": 2.042359157930623, + "grad_norm": 36.12259022662439, + "learning_rate": 2.796277433172676e-06, + "loss": 1.3282, + "step": 23963 + }, + { + "epoch": 2.0424443876246485, + "grad_norm": 48.6924112512712, + "learning_rate": 2.795832354839793e-06, + "loss": 1.5255, + "step": 23964 + }, + { + "epoch": 2.042529617318674, + "grad_norm": 26.891074992281464, + "learning_rate": 2.7953872981840245e-06, + "loss": 0.9603, + "step": 23965 + }, + { + "epoch": 2.042614847012699, + "grad_norm": 38.925456613830946, + "learning_rate": 2.7949422632097446e-06, + "loss": 1.1035, + "step": 23966 + }, + { + "epoch": 2.0427000767067245, + "grad_norm": 48.70640895159654, + "learning_rate": 2.794497249921333e-06, + "loss": 1.7298, + "step": 23967 + }, + { + "epoch": 2.04278530640075, + "grad_norm": 41.53057811276833, + "learning_rate": 2.7940522583231646e-06, + "loss": 1.2722, + "step": 23968 + }, + { + "epoch": 2.0428705360947754, + "grad_norm": 35.652000933632344, + "learning_rate": 2.7936072884196155e-06, + "loss": 0.9556, + "step": 23969 + }, + { + "epoch": 2.042955765788801, + "grad_norm": 42.164024835476226, + "learning_rate": 2.79316234021506e-06, + "loss": 0.8583, + "step": 23970 + }, + { + "epoch": 2.0430409954828264, + "grad_norm": 39.301064584141585, + "learning_rate": 2.7927174137138776e-06, + "loss": 1.4376, + "step": 23971 + }, + { + "epoch": 2.043126225176852, + "grad_norm": 53.453347104143354, + "learning_rate": 2.792272508920443e-06, + "loss": 1.8199, + "step": 23972 + }, + { + "epoch": 2.043211454870877, + "grad_norm": 49.11493773155487, + "learning_rate": 2.791827625839128e-06, + "loss": 1.2477, + "step": 23973 + }, + { + "epoch": 2.0432966845649023, + "grad_norm": 32.463194026518806, + "learning_rate": 2.7913827644743137e-06, + "loss": 1.0786, + "step": 23974 + }, + { + "epoch": 2.043381914258928, + "grad_norm": 31.16675094913951, + "learning_rate": 2.7909379248303703e-06, + "loss": 1.0759, + "step": 23975 + }, + { + "epoch": 2.0434671439529533, + "grad_norm": 37.162653173538324, + "learning_rate": 2.790493106911676e-06, + "loss": 1.3943, + "step": 23976 + }, + { + "epoch": 2.0435523736469787, + "grad_norm": 50.58695508768076, + "learning_rate": 2.790048310722605e-06, + "loss": 2.077, + "step": 23977 + }, + { + "epoch": 2.043637603341004, + "grad_norm": 33.38461288930977, + "learning_rate": 2.78960353626753e-06, + "loss": 0.8287, + "step": 23978 + }, + { + "epoch": 2.043722833035029, + "grad_norm": 78.35050209573805, + "learning_rate": 2.789158783550824e-06, + "loss": 2.0129, + "step": 23979 + }, + { + "epoch": 2.0438080627290547, + "grad_norm": 41.817054703400586, + "learning_rate": 2.7887140525768652e-06, + "loss": 0.8241, + "step": 23980 + }, + { + "epoch": 2.04389329242308, + "grad_norm": 56.93987825664524, + "learning_rate": 2.7882693433500223e-06, + "loss": 1.4448, + "step": 23981 + }, + { + "epoch": 2.0439785221171056, + "grad_norm": 61.78831045909522, + "learning_rate": 2.787824655874674e-06, + "loss": 1.8307, + "step": 23982 + }, + { + "epoch": 2.044063751811131, + "grad_norm": 350.66562357457667, + "learning_rate": 2.787379990155189e-06, + "loss": 2.2181, + "step": 23983 + }, + { + "epoch": 2.0441489815051566, + "grad_norm": 35.85819817050827, + "learning_rate": 2.7869353461959446e-06, + "loss": 1.3733, + "step": 23984 + }, + { + "epoch": 2.0442342111991816, + "grad_norm": 37.801721816831304, + "learning_rate": 2.786490724001312e-06, + "loss": 0.8309, + "step": 23985 + }, + { + "epoch": 2.044319440893207, + "grad_norm": 65.9750728073184, + "learning_rate": 2.786046123575664e-06, + "loss": 2.1385, + "step": 23986 + }, + { + "epoch": 2.0444046705872325, + "grad_norm": 64.03272817267965, + "learning_rate": 2.78560154492337e-06, + "loss": 1.2112, + "step": 23987 + }, + { + "epoch": 2.044489900281258, + "grad_norm": 45.69831952442979, + "learning_rate": 2.785156988048807e-06, + "loss": 1.801, + "step": 23988 + }, + { + "epoch": 2.0445751299752835, + "grad_norm": 57.0373715246141, + "learning_rate": 2.784712452956343e-06, + "loss": 1.7242, + "step": 23989 + }, + { + "epoch": 2.044660359669309, + "grad_norm": 33.64407926603549, + "learning_rate": 2.7842679396503536e-06, + "loss": 1.517, + "step": 23990 + }, + { + "epoch": 2.0447455893633344, + "grad_norm": 44.018143361297476, + "learning_rate": 2.7838234481352083e-06, + "loss": 1.5574, + "step": 23991 + }, + { + "epoch": 2.0448308190573594, + "grad_norm": 84.32869452347329, + "learning_rate": 2.783378978415277e-06, + "loss": 2.4655, + "step": 23992 + }, + { + "epoch": 2.044916048751385, + "grad_norm": 49.29853319849405, + "learning_rate": 2.782934530494935e-06, + "loss": 1.5096, + "step": 23993 + }, + { + "epoch": 2.0450012784454104, + "grad_norm": 128.5761417798351, + "learning_rate": 2.78249010437855e-06, + "loss": 4.0185, + "step": 23994 + }, + { + "epoch": 2.045086508139436, + "grad_norm": 54.53275644041292, + "learning_rate": 2.7820457000704926e-06, + "loss": 1.5889, + "step": 23995 + }, + { + "epoch": 2.0451717378334613, + "grad_norm": 72.23395472597508, + "learning_rate": 2.781601317575137e-06, + "loss": 2.1534, + "step": 23996 + }, + { + "epoch": 2.0452569675274868, + "grad_norm": 51.89614172670407, + "learning_rate": 2.7811569568968477e-06, + "loss": 1.4915, + "step": 23997 + }, + { + "epoch": 2.045342197221512, + "grad_norm": 38.486022118399745, + "learning_rate": 2.78071261804e-06, + "loss": 1.2051, + "step": 23998 + }, + { + "epoch": 2.0454274269155373, + "grad_norm": 29.43394484617436, + "learning_rate": 2.7802683010089624e-06, + "loss": 1.3298, + "step": 23999 + }, + { + "epoch": 2.0455126566095627, + "grad_norm": 42.99068752467158, + "learning_rate": 2.779824005808102e-06, + "loss": 1.334, + "step": 24000 + }, + { + "epoch": 2.045597886303588, + "grad_norm": 43.08857322220313, + "learning_rate": 2.7793797324417922e-06, + "loss": 1.547, + "step": 24001 + }, + { + "epoch": 2.0456831159976137, + "grad_norm": 20.595937369305407, + "learning_rate": 2.7789354809143997e-06, + "loss": 0.671, + "step": 24002 + }, + { + "epoch": 2.045768345691639, + "grad_norm": 41.059697874497324, + "learning_rate": 2.778491251230293e-06, + "loss": 0.9773, + "step": 24003 + }, + { + "epoch": 2.045853575385664, + "grad_norm": 73.9853467721914, + "learning_rate": 2.7780470433938445e-06, + "loss": 1.9905, + "step": 24004 + }, + { + "epoch": 2.0459388050796896, + "grad_norm": 26.389487663698322, + "learning_rate": 2.777602857409417e-06, + "loss": 1.178, + "step": 24005 + }, + { + "epoch": 2.046024034773715, + "grad_norm": 27.547475003210927, + "learning_rate": 2.7771586932813854e-06, + "loss": 1.1104, + "step": 24006 + }, + { + "epoch": 2.0461092644677406, + "grad_norm": 55.49270408974651, + "learning_rate": 2.7767145510141145e-06, + "loss": 1.6108, + "step": 24007 + }, + { + "epoch": 2.046194494161766, + "grad_norm": 50.53708076782742, + "learning_rate": 2.7762704306119725e-06, + "loss": 1.4269, + "step": 24008 + }, + { + "epoch": 2.0462797238557915, + "grad_norm": 47.71248335833103, + "learning_rate": 2.7758263320793255e-06, + "loss": 1.7145, + "step": 24009 + }, + { + "epoch": 2.046364953549817, + "grad_norm": 63.72866069314374, + "learning_rate": 2.7753822554205445e-06, + "loss": 1.7937, + "step": 24010 + }, + { + "epoch": 2.046450183243842, + "grad_norm": 40.93818286313963, + "learning_rate": 2.7749382006399927e-06, + "loss": 1.5631, + "step": 24011 + }, + { + "epoch": 2.0465354129378674, + "grad_norm": 40.90333974031171, + "learning_rate": 2.7744941677420424e-06, + "loss": 1.2271, + "step": 24012 + }, + { + "epoch": 2.046620642631893, + "grad_norm": 76.13784796296162, + "learning_rate": 2.7740501567310545e-06, + "loss": 2.3901, + "step": 24013 + }, + { + "epoch": 2.0467058723259184, + "grad_norm": 13.062541906296339, + "learning_rate": 2.773606167611401e-06, + "loss": 0.2294, + "step": 24014 + }, + { + "epoch": 2.046791102019944, + "grad_norm": 31.955603659164762, + "learning_rate": 2.773162200387447e-06, + "loss": 1.2508, + "step": 24015 + }, + { + "epoch": 2.0468763317139693, + "grad_norm": 72.14515901524044, + "learning_rate": 2.772718255063557e-06, + "loss": 2.3598, + "step": 24016 + }, + { + "epoch": 2.0469615614079943, + "grad_norm": 40.3335990042328, + "learning_rate": 2.7722743316440985e-06, + "loss": 1.3042, + "step": 24017 + }, + { + "epoch": 2.04704679110202, + "grad_norm": 26.63628732837734, + "learning_rate": 2.7718304301334347e-06, + "loss": 1.0233, + "step": 24018 + }, + { + "epoch": 2.0471320207960453, + "grad_norm": 68.38572096136336, + "learning_rate": 2.771386550535935e-06, + "loss": 2.408, + "step": 24019 + }, + { + "epoch": 2.0472172504900708, + "grad_norm": 30.874913489437542, + "learning_rate": 2.7709426928559614e-06, + "loss": 1.0655, + "step": 24020 + }, + { + "epoch": 2.047302480184096, + "grad_norm": 19.749185870686834, + "learning_rate": 2.7704988570978827e-06, + "loss": 0.6871, + "step": 24021 + }, + { + "epoch": 2.0473877098781217, + "grad_norm": 23.204055233438254, + "learning_rate": 2.770055043266059e-06, + "loss": 1.0384, + "step": 24022 + }, + { + "epoch": 2.047472939572147, + "grad_norm": 35.36836569584163, + "learning_rate": 2.769611251364861e-06, + "loss": 1.0523, + "step": 24023 + }, + { + "epoch": 2.047558169266172, + "grad_norm": 32.357923133842206, + "learning_rate": 2.769167481398649e-06, + "loss": 0.9392, + "step": 24024 + }, + { + "epoch": 2.0476433989601976, + "grad_norm": 51.925676156746846, + "learning_rate": 2.768723733371789e-06, + "loss": 1.3192, + "step": 24025 + }, + { + "epoch": 2.047728628654223, + "grad_norm": 50.71258936363422, + "learning_rate": 2.768280007288642e-06, + "loss": 2.2493, + "step": 24026 + }, + { + "epoch": 2.0478138583482486, + "grad_norm": 49.93799415400472, + "learning_rate": 2.767836303153577e-06, + "loss": 1.8222, + "step": 24027 + }, + { + "epoch": 2.047899088042274, + "grad_norm": 66.14877391578568, + "learning_rate": 2.7673926209709525e-06, + "loss": 0.8914, + "step": 24028 + }, + { + "epoch": 2.0479843177362995, + "grad_norm": 45.15915450917999, + "learning_rate": 2.766948960745136e-06, + "loss": 1.6378, + "step": 24029 + }, + { + "epoch": 2.0480695474303245, + "grad_norm": 43.51055741670874, + "learning_rate": 2.7665053224804885e-06, + "loss": 1.2026, + "step": 24030 + }, + { + "epoch": 2.04815477712435, + "grad_norm": 35.99949140234796, + "learning_rate": 2.766061706181374e-06, + "loss": 1.1846, + "step": 24031 + }, + { + "epoch": 2.0482400068183755, + "grad_norm": 41.205159585908085, + "learning_rate": 2.7656181118521562e-06, + "loss": 1.1399, + "step": 24032 + }, + { + "epoch": 2.048325236512401, + "grad_norm": 30.02124796691679, + "learning_rate": 2.7651745394971964e-06, + "loss": 1.2418, + "step": 24033 + }, + { + "epoch": 2.0484104662064264, + "grad_norm": 40.4328993869254, + "learning_rate": 2.764730989120854e-06, + "loss": 1.2164, + "step": 24034 + }, + { + "epoch": 2.048495695900452, + "grad_norm": 26.196846097729097, + "learning_rate": 2.7642874607274965e-06, + "loss": 0.9357, + "step": 24035 + }, + { + "epoch": 2.048580925594477, + "grad_norm": 42.45258489418888, + "learning_rate": 2.7638439543214817e-06, + "loss": 0.9915, + "step": 24036 + }, + { + "epoch": 2.0486661552885024, + "grad_norm": 35.57689721330754, + "learning_rate": 2.763400469907175e-06, + "loss": 1.1161, + "step": 24037 + }, + { + "epoch": 2.048751384982528, + "grad_norm": 45.31200302056413, + "learning_rate": 2.7629570074889363e-06, + "loss": 1.6106, + "step": 24038 + }, + { + "epoch": 2.0488366146765533, + "grad_norm": 22.94784929253742, + "learning_rate": 2.762513567071124e-06, + "loss": 1.0991, + "step": 24039 + }, + { + "epoch": 2.048921844370579, + "grad_norm": 68.582862490569, + "learning_rate": 2.7620701486581034e-06, + "loss": 2.0208, + "step": 24040 + }, + { + "epoch": 2.0490070740646043, + "grad_norm": 46.03129009352543, + "learning_rate": 2.761626752254234e-06, + "loss": 0.9625, + "step": 24041 + }, + { + "epoch": 2.0490923037586297, + "grad_norm": 21.75979025829955, + "learning_rate": 2.761183377863874e-06, + "loss": 0.7282, + "step": 24042 + }, + { + "epoch": 2.0491775334526547, + "grad_norm": 42.74031778008546, + "learning_rate": 2.7607400254913887e-06, + "loss": 1.405, + "step": 24043 + }, + { + "epoch": 2.04926276314668, + "grad_norm": 39.57329192632245, + "learning_rate": 2.7602966951411327e-06, + "loss": 1.3988, + "step": 24044 + }, + { + "epoch": 2.0493479928407057, + "grad_norm": 50.32872819472476, + "learning_rate": 2.7598533868174704e-06, + "loss": 1.9929, + "step": 24045 + }, + { + "epoch": 2.049433222534731, + "grad_norm": 37.68986643408552, + "learning_rate": 2.7594101005247598e-06, + "loss": 0.945, + "step": 24046 + }, + { + "epoch": 2.0495184522287566, + "grad_norm": 19.234479719095052, + "learning_rate": 2.758966836267361e-06, + "loss": 0.5895, + "step": 24047 + }, + { + "epoch": 2.049603681922782, + "grad_norm": 52.09656789529212, + "learning_rate": 2.758523594049631e-06, + "loss": 1.3789, + "step": 24048 + }, + { + "epoch": 2.049688911616807, + "grad_norm": 49.821895306949585, + "learning_rate": 2.758080373875932e-06, + "loss": 1.9682, + "step": 24049 + }, + { + "epoch": 2.0497741413108326, + "grad_norm": 25.645114122590748, + "learning_rate": 2.75763717575062e-06, + "loss": 0.5971, + "step": 24050 + }, + { + "epoch": 2.049859371004858, + "grad_norm": 55.21288452185229, + "learning_rate": 2.7571939996780583e-06, + "loss": 1.8163, + "step": 24051 + }, + { + "epoch": 2.0499446006988835, + "grad_norm": 20.16308501002349, + "learning_rate": 2.7567508456625995e-06, + "loss": 1.5926, + "step": 24052 + }, + { + "epoch": 2.050029830392909, + "grad_norm": 31.834683896446727, + "learning_rate": 2.756307713708607e-06, + "loss": 0.8347, + "step": 24053 + }, + { + "epoch": 2.0501150600869344, + "grad_norm": 37.558580919700944, + "learning_rate": 2.7558646038204367e-06, + "loss": 1.162, + "step": 24054 + }, + { + "epoch": 2.0502002897809595, + "grad_norm": 65.85187425223228, + "learning_rate": 2.7554215160024463e-06, + "loss": 2.1988, + "step": 24055 + }, + { + "epoch": 2.050285519474985, + "grad_norm": 85.72127874327946, + "learning_rate": 2.754978450258991e-06, + "loss": 1.894, + "step": 24056 + }, + { + "epoch": 2.0503707491690104, + "grad_norm": 45.109104430860654, + "learning_rate": 2.754535406594433e-06, + "loss": 1.2613, + "step": 24057 + }, + { + "epoch": 2.050455978863036, + "grad_norm": 37.57993454078471, + "learning_rate": 2.7540923850131246e-06, + "loss": 1.0341, + "step": 24058 + }, + { + "epoch": 2.0505412085570613, + "grad_norm": 41.54598304178065, + "learning_rate": 2.7536493855194274e-06, + "loss": 1.2381, + "step": 24059 + }, + { + "epoch": 2.050626438251087, + "grad_norm": 39.53720138119212, + "learning_rate": 2.7532064081176937e-06, + "loss": 1.295, + "step": 24060 + }, + { + "epoch": 2.0507116679451123, + "grad_norm": 43.32268835128324, + "learning_rate": 2.752763452812285e-06, + "loss": 1.8461, + "step": 24061 + }, + { + "epoch": 2.0507968976391373, + "grad_norm": 61.08340940713433, + "learning_rate": 2.7523205196075538e-06, + "loss": 1.9411, + "step": 24062 + }, + { + "epoch": 2.0508821273331628, + "grad_norm": 32.86408053859619, + "learning_rate": 2.7518776085078568e-06, + "loss": 1.0474, + "step": 24063 + }, + { + "epoch": 2.0509673570271882, + "grad_norm": 48.28945388355038, + "learning_rate": 2.751434719517551e-06, + "loss": 1.7102, + "step": 24064 + }, + { + "epoch": 2.0510525867212137, + "grad_norm": 30.230200759886, + "learning_rate": 2.7509918526409897e-06, + "loss": 0.7254, + "step": 24065 + }, + { + "epoch": 2.051137816415239, + "grad_norm": 30.354768849389828, + "learning_rate": 2.750549007882531e-06, + "loss": 0.5793, + "step": 24066 + }, + { + "epoch": 2.0512230461092646, + "grad_norm": 33.98884736199912, + "learning_rate": 2.7501061852465273e-06, + "loss": 1.6029, + "step": 24067 + }, + { + "epoch": 2.0513082758032897, + "grad_norm": 50.01116782329969, + "learning_rate": 2.7496633847373365e-06, + "loss": 1.5187, + "step": 24068 + }, + { + "epoch": 2.051393505497315, + "grad_norm": 53.83083759285143, + "learning_rate": 2.7492206063593112e-06, + "loss": 1.6008, + "step": 24069 + }, + { + "epoch": 2.0514787351913406, + "grad_norm": 41.987384619053664, + "learning_rate": 2.748777850116808e-06, + "loss": 1.3347, + "step": 24070 + }, + { + "epoch": 2.051563964885366, + "grad_norm": 36.88125393967478, + "learning_rate": 2.7483351160141814e-06, + "loss": 1.1344, + "step": 24071 + }, + { + "epoch": 2.0516491945793915, + "grad_norm": 43.457040343537344, + "learning_rate": 2.747892404055783e-06, + "loss": 1.3742, + "step": 24072 + }, + { + "epoch": 2.051734424273417, + "grad_norm": 45.86840064269808, + "learning_rate": 2.747449714245966e-06, + "loss": 1.3649, + "step": 24073 + }, + { + "epoch": 2.051819653967442, + "grad_norm": 33.933686162363834, + "learning_rate": 2.7470070465890886e-06, + "loss": 1.3588, + "step": 24074 + }, + { + "epoch": 2.0519048836614675, + "grad_norm": 52.16853966298338, + "learning_rate": 2.746564401089499e-06, + "loss": 1.9299, + "step": 24075 + }, + { + "epoch": 2.051990113355493, + "grad_norm": 61.633300789307604, + "learning_rate": 2.7461217777515558e-06, + "loss": 2.5326, + "step": 24076 + }, + { + "epoch": 2.0520753430495184, + "grad_norm": 48.338392283862206, + "learning_rate": 2.7456791765796086e-06, + "loss": 1.6343, + "step": 24077 + }, + { + "epoch": 2.052160572743544, + "grad_norm": 25.349467893006864, + "learning_rate": 2.7452365975780094e-06, + "loss": 1.107, + "step": 24078 + }, + { + "epoch": 2.0522458024375694, + "grad_norm": 38.79817886979331, + "learning_rate": 2.744794040751114e-06, + "loss": 1.2919, + "step": 24079 + }, + { + "epoch": 2.052331032131595, + "grad_norm": 33.89529894007853, + "learning_rate": 2.7443515061032732e-06, + "loss": 1.2513, + "step": 24080 + }, + { + "epoch": 2.05241626182562, + "grad_norm": 45.17342526764685, + "learning_rate": 2.743908993638837e-06, + "loss": 2.0373, + "step": 24081 + }, + { + "epoch": 2.0525014915196453, + "grad_norm": 59.488302071667086, + "learning_rate": 2.743466503362161e-06, + "loss": 1.5539, + "step": 24082 + }, + { + "epoch": 2.052586721213671, + "grad_norm": 35.94967249456938, + "learning_rate": 2.7430240352775937e-06, + "loss": 1.1017, + "step": 24083 + }, + { + "epoch": 2.0526719509076963, + "grad_norm": 31.532926648662247, + "learning_rate": 2.74258158938949e-06, + "loss": 1.2028, + "step": 24084 + }, + { + "epoch": 2.0527571806017217, + "grad_norm": 34.003871960016035, + "learning_rate": 2.7421391657022e-06, + "loss": 1.3016, + "step": 24085 + }, + { + "epoch": 2.052842410295747, + "grad_norm": 31.887033405198913, + "learning_rate": 2.7416967642200715e-06, + "loss": 1.1477, + "step": 24086 + }, + { + "epoch": 2.0529276399897722, + "grad_norm": 65.70691435315973, + "learning_rate": 2.7412543849474606e-06, + "loss": 1.6814, + "step": 24087 + }, + { + "epoch": 2.0530128696837977, + "grad_norm": 78.5525301719671, + "learning_rate": 2.740812027888714e-06, + "loss": 2.0421, + "step": 24088 + }, + { + "epoch": 2.053098099377823, + "grad_norm": 58.74423637446738, + "learning_rate": 2.7403696930481827e-06, + "loss": 1.3797, + "step": 24089 + }, + { + "epoch": 2.0531833290718486, + "grad_norm": 33.92115095240214, + "learning_rate": 2.7399273804302186e-06, + "loss": 0.9651, + "step": 24090 + }, + { + "epoch": 2.053268558765874, + "grad_norm": 44.39372972784497, + "learning_rate": 2.739485090039169e-06, + "loss": 1.7394, + "step": 24091 + }, + { + "epoch": 2.0533537884598996, + "grad_norm": 65.9037113155023, + "learning_rate": 2.7390428218793872e-06, + "loss": 0.9435, + "step": 24092 + }, + { + "epoch": 2.053439018153925, + "grad_norm": 78.66173186713604, + "learning_rate": 2.7386005759552215e-06, + "loss": 1.0002, + "step": 24093 + }, + { + "epoch": 2.05352424784795, + "grad_norm": 52.12681594129067, + "learning_rate": 2.7381583522710196e-06, + "loss": 1.528, + "step": 24094 + }, + { + "epoch": 2.0536094775419755, + "grad_norm": 488.70126404109766, + "learning_rate": 2.73771615083113e-06, + "loss": 1.4914, + "step": 24095 + }, + { + "epoch": 2.053694707236001, + "grad_norm": 62.463389680861376, + "learning_rate": 2.7372739716399056e-06, + "loss": 2.1607, + "step": 24096 + }, + { + "epoch": 2.0537799369300265, + "grad_norm": 24.284610114575457, + "learning_rate": 2.7368318147016903e-06, + "loss": 0.9356, + "step": 24097 + }, + { + "epoch": 2.053865166624052, + "grad_norm": 39.4685931591202, + "learning_rate": 2.736389680020836e-06, + "loss": 1.2829, + "step": 24098 + }, + { + "epoch": 2.0539503963180774, + "grad_norm": 64.84895155787184, + "learning_rate": 2.735947567601689e-06, + "loss": 2.3469, + "step": 24099 + }, + { + "epoch": 2.0540356260121024, + "grad_norm": 24.935956207664645, + "learning_rate": 2.735505477448599e-06, + "loss": 0.9095, + "step": 24100 + }, + { + "epoch": 2.054120855706128, + "grad_norm": 125.20400716924432, + "learning_rate": 2.7350634095659133e-06, + "loss": 1.6722, + "step": 24101 + }, + { + "epoch": 2.0542060854001534, + "grad_norm": 58.70284843017981, + "learning_rate": 2.7346213639579787e-06, + "loss": 2.1368, + "step": 24102 + }, + { + "epoch": 2.054291315094179, + "grad_norm": 55.697724708480656, + "learning_rate": 2.7341793406291405e-06, + "loss": 1.1417, + "step": 24103 + }, + { + "epoch": 2.0543765447882043, + "grad_norm": 56.32916613946939, + "learning_rate": 2.7337373395837508e-06, + "loss": 1.5694, + "step": 24104 + }, + { + "epoch": 2.0544617744822298, + "grad_norm": 65.02391639599831, + "learning_rate": 2.7332953608261513e-06, + "loss": 1.6395, + "step": 24105 + }, + { + "epoch": 2.054547004176255, + "grad_norm": 51.56162914019961, + "learning_rate": 2.7328534043606934e-06, + "loss": 1.8378, + "step": 24106 + }, + { + "epoch": 2.0546322338702803, + "grad_norm": 67.22026220029058, + "learning_rate": 2.732411470191722e-06, + "loss": 1.8398, + "step": 24107 + }, + { + "epoch": 2.0547174635643057, + "grad_norm": 59.404885136013895, + "learning_rate": 2.7319695583235807e-06, + "loss": 1.5586, + "step": 24108 + }, + { + "epoch": 2.054802693258331, + "grad_norm": 25.15581921252327, + "learning_rate": 2.7315276687606187e-06, + "loss": 1.0663, + "step": 24109 + }, + { + "epoch": 2.0548879229523567, + "grad_norm": 59.49085946989631, + "learning_rate": 2.7310858015071814e-06, + "loss": 1.4571, + "step": 24110 + }, + { + "epoch": 2.054973152646382, + "grad_norm": 46.738657215064954, + "learning_rate": 2.7306439565676137e-06, + "loss": 1.2742, + "step": 24111 + }, + { + "epoch": 2.0550583823404076, + "grad_norm": 70.00856288778523, + "learning_rate": 2.730202133946258e-06, + "loss": 1.8502, + "step": 24112 + }, + { + "epoch": 2.0551436120344326, + "grad_norm": 25.334746466595416, + "learning_rate": 2.729760333647463e-06, + "loss": 1.0076, + "step": 24113 + }, + { + "epoch": 2.055228841728458, + "grad_norm": 24.27401199513801, + "learning_rate": 2.729318555675575e-06, + "loss": 0.643, + "step": 24114 + }, + { + "epoch": 2.0553140714224836, + "grad_norm": 46.440612890910764, + "learning_rate": 2.728876800034937e-06, + "loss": 1.6927, + "step": 24115 + }, + { + "epoch": 2.055399301116509, + "grad_norm": 43.00383413589939, + "learning_rate": 2.7284350667298908e-06, + "loss": 2.0529, + "step": 24116 + }, + { + "epoch": 2.0554845308105345, + "grad_norm": 83.48124587796644, + "learning_rate": 2.7279933557647853e-06, + "loss": 2.9873, + "step": 24117 + }, + { + "epoch": 2.05556976050456, + "grad_norm": 45.04186476479922, + "learning_rate": 2.727551667143962e-06, + "loss": 1.0424, + "step": 24118 + }, + { + "epoch": 2.055654990198585, + "grad_norm": 55.58875318230325, + "learning_rate": 2.7271100008717654e-06, + "loss": 2.0213, + "step": 24119 + }, + { + "epoch": 2.0557402198926105, + "grad_norm": 34.308477244372675, + "learning_rate": 2.7266683569525365e-06, + "loss": 1.4016, + "step": 24120 + }, + { + "epoch": 2.055825449586636, + "grad_norm": 68.60747351586146, + "learning_rate": 2.7262267353906236e-06, + "loss": 1.5688, + "step": 24121 + }, + { + "epoch": 2.0559106792806614, + "grad_norm": 51.29181469028475, + "learning_rate": 2.725785136190365e-06, + "loss": 2.305, + "step": 24122 + }, + { + "epoch": 2.055995908974687, + "grad_norm": 48.390731738981536, + "learning_rate": 2.725343559356107e-06, + "loss": 1.2071, + "step": 24123 + }, + { + "epoch": 2.0560811386687123, + "grad_norm": 39.72115339743933, + "learning_rate": 2.7249020048921925e-06, + "loss": 1.529, + "step": 24124 + }, + { + "epoch": 2.0561663683627374, + "grad_norm": 56.729050983313506, + "learning_rate": 2.72446047280296e-06, + "loss": 1.2649, + "step": 24125 + }, + { + "epoch": 2.056251598056763, + "grad_norm": 59.702888502165756, + "learning_rate": 2.7240189630927573e-06, + "loss": 1.8853, + "step": 24126 + }, + { + "epoch": 2.0563368277507883, + "grad_norm": 38.56338404824808, + "learning_rate": 2.723577475765923e-06, + "loss": 1.0753, + "step": 24127 + }, + { + "epoch": 2.0564220574448138, + "grad_norm": 68.35984835804211, + "learning_rate": 2.723136010826798e-06, + "loss": 2.0344, + "step": 24128 + }, + { + "epoch": 2.0565072871388392, + "grad_norm": 40.28259847076822, + "learning_rate": 2.7226945682797275e-06, + "loss": 0.9873, + "step": 24129 + }, + { + "epoch": 2.0565925168328647, + "grad_norm": 65.64249891758286, + "learning_rate": 2.722253148129049e-06, + "loss": 2.0367, + "step": 24130 + }, + { + "epoch": 2.05667774652689, + "grad_norm": 89.13450915433587, + "learning_rate": 2.7218117503791074e-06, + "loss": 2.2728, + "step": 24131 + }, + { + "epoch": 2.056762976220915, + "grad_norm": 53.4426221588817, + "learning_rate": 2.7213703750342423e-06, + "loss": 1.6818, + "step": 24132 + }, + { + "epoch": 2.0568482059149407, + "grad_norm": 53.265389818662925, + "learning_rate": 2.7209290220987917e-06, + "loss": 1.6126, + "step": 24133 + }, + { + "epoch": 2.056933435608966, + "grad_norm": 36.745206336565076, + "learning_rate": 2.7204876915771013e-06, + "loss": 1.1972, + "step": 24134 + }, + { + "epoch": 2.0570186653029916, + "grad_norm": 36.1296161646499, + "learning_rate": 2.720046383473508e-06, + "loss": 1.5865, + "step": 24135 + }, + { + "epoch": 2.057103894997017, + "grad_norm": 27.35853737704476, + "learning_rate": 2.719605097792351e-06, + "loss": 1.364, + "step": 24136 + }, + { + "epoch": 2.0571891246910425, + "grad_norm": 32.75054557853511, + "learning_rate": 2.7191638345379735e-06, + "loss": 0.8049, + "step": 24137 + }, + { + "epoch": 2.0572743543850676, + "grad_norm": 40.6253904768965, + "learning_rate": 2.718722593714711e-06, + "loss": 1.3899, + "step": 24138 + }, + { + "epoch": 2.057359584079093, + "grad_norm": 58.09405700151297, + "learning_rate": 2.718281375326908e-06, + "loss": 1.5419, + "step": 24139 + }, + { + "epoch": 2.0574448137731185, + "grad_norm": 40.65255873548466, + "learning_rate": 2.717840179378901e-06, + "loss": 1.7425, + "step": 24140 + }, + { + "epoch": 2.057530043467144, + "grad_norm": 44.49660192969661, + "learning_rate": 2.7173990058750288e-06, + "loss": 1.403, + "step": 24141 + }, + { + "epoch": 2.0576152731611694, + "grad_norm": 20.31572839475994, + "learning_rate": 2.716957854819628e-06, + "loss": 0.6798, + "step": 24142 + }, + { + "epoch": 2.057700502855195, + "grad_norm": 108.51460655592066, + "learning_rate": 2.716516726217042e-06, + "loss": 1.7233, + "step": 24143 + }, + { + "epoch": 2.0577857325492204, + "grad_norm": 25.53445137431705, + "learning_rate": 2.716075620071604e-06, + "loss": 0.9667, + "step": 24144 + }, + { + "epoch": 2.0578709622432454, + "grad_norm": 45.20197802418809, + "learning_rate": 2.7156345363876576e-06, + "loss": 1.5929, + "step": 24145 + }, + { + "epoch": 2.057956191937271, + "grad_norm": 46.75178770471561, + "learning_rate": 2.7151934751695353e-06, + "loss": 1.5338, + "step": 24146 + }, + { + "epoch": 2.0580414216312963, + "grad_norm": 83.28943992138626, + "learning_rate": 2.7147524364215794e-06, + "loss": 2.4692, + "step": 24147 + }, + { + "epoch": 2.058126651325322, + "grad_norm": 84.82844000078298, + "learning_rate": 2.714311420148125e-06, + "loss": 2.2849, + "step": 24148 + }, + { + "epoch": 2.0582118810193473, + "grad_norm": 43.072940035179656, + "learning_rate": 2.7138704263535097e-06, + "loss": 0.9085, + "step": 24149 + }, + { + "epoch": 2.0582971107133727, + "grad_norm": 42.795983509704286, + "learning_rate": 2.7134294550420704e-06, + "loss": 0.8869, + "step": 24150 + }, + { + "epoch": 2.0583823404073978, + "grad_norm": 31.52218033188378, + "learning_rate": 2.7129885062181416e-06, + "loss": 1.4435, + "step": 24151 + }, + { + "epoch": 2.0584675701014232, + "grad_norm": 97.67826035888918, + "learning_rate": 2.7125475798860623e-06, + "loss": 1.165, + "step": 24152 + }, + { + "epoch": 2.0585527997954487, + "grad_norm": 130.19223683296127, + "learning_rate": 2.71210667605017e-06, + "loss": 2.9116, + "step": 24153 + }, + { + "epoch": 2.058638029489474, + "grad_norm": 51.845013429980646, + "learning_rate": 2.7116657947148005e-06, + "loss": 1.474, + "step": 24154 + }, + { + "epoch": 2.0587232591834996, + "grad_norm": 27.433053567346985, + "learning_rate": 2.7112249358842858e-06, + "loss": 1.0623, + "step": 24155 + }, + { + "epoch": 2.058808488877525, + "grad_norm": 54.111662654271875, + "learning_rate": 2.710784099562967e-06, + "loss": 1.5422, + "step": 24156 + }, + { + "epoch": 2.05889371857155, + "grad_norm": 65.32427862336283, + "learning_rate": 2.7103432857551764e-06, + "loss": 2.1128, + "step": 24157 + }, + { + "epoch": 2.0589789482655756, + "grad_norm": 32.755639485767595, + "learning_rate": 2.70990249446525e-06, + "loss": 1.1661, + "step": 24158 + }, + { + "epoch": 2.059064177959601, + "grad_norm": 50.20071720110939, + "learning_rate": 2.709461725697521e-06, + "loss": 1.6265, + "step": 24159 + }, + { + "epoch": 2.0591494076536265, + "grad_norm": 63.530697477930346, + "learning_rate": 2.709020979456325e-06, + "loss": 1.9249, + "step": 24160 + }, + { + "epoch": 2.059234637347652, + "grad_norm": 44.97944431886842, + "learning_rate": 2.7085802557459995e-06, + "loss": 1.542, + "step": 24161 + }, + { + "epoch": 2.0593198670416775, + "grad_norm": 30.868853773754033, + "learning_rate": 2.7081395545708766e-06, + "loss": 1.1464, + "step": 24162 + }, + { + "epoch": 2.059405096735703, + "grad_norm": 65.97548506285699, + "learning_rate": 2.7076988759352906e-06, + "loss": 1.8509, + "step": 24163 + }, + { + "epoch": 2.059490326429728, + "grad_norm": 59.21172416045381, + "learning_rate": 2.7072582198435737e-06, + "loss": 1.6595, + "step": 24164 + }, + { + "epoch": 2.0595755561237534, + "grad_norm": 70.7048597176197, + "learning_rate": 2.7068175863000624e-06, + "loss": 2.4984, + "step": 24165 + }, + { + "epoch": 2.059660785817779, + "grad_norm": 50.28360096689731, + "learning_rate": 2.7063769753090896e-06, + "loss": 1.3562, + "step": 24166 + }, + { + "epoch": 2.0597460155118044, + "grad_norm": 55.33900749127138, + "learning_rate": 2.7059363868749853e-06, + "loss": 1.4802, + "step": 24167 + }, + { + "epoch": 2.05983124520583, + "grad_norm": 50.41682219951436, + "learning_rate": 2.7054958210020877e-06, + "loss": 1.6427, + "step": 24168 + }, + { + "epoch": 2.0599164748998553, + "grad_norm": 35.464488884943805, + "learning_rate": 2.7050552776947246e-06, + "loss": 1.5377, + "step": 24169 + }, + { + "epoch": 2.0600017045938803, + "grad_norm": 52.225474495002366, + "learning_rate": 2.704614756957233e-06, + "loss": 1.2581, + "step": 24170 + }, + { + "epoch": 2.060086934287906, + "grad_norm": 66.59809312528311, + "learning_rate": 2.704174258793943e-06, + "loss": 1.1681, + "step": 24171 + }, + { + "epoch": 2.0601721639819313, + "grad_norm": 30.27644293334844, + "learning_rate": 2.703733783209185e-06, + "loss": 0.8602, + "step": 24172 + }, + { + "epoch": 2.0602573936759567, + "grad_norm": 60.55828839497776, + "learning_rate": 2.703293330207295e-06, + "loss": 1.6084, + "step": 24173 + }, + { + "epoch": 2.060342623369982, + "grad_norm": 53.92115788746221, + "learning_rate": 2.7028528997926017e-06, + "loss": 1.5716, + "step": 24174 + }, + { + "epoch": 2.0604278530640077, + "grad_norm": 45.56818711131617, + "learning_rate": 2.7024124919694362e-06, + "loss": 1.3223, + "step": 24175 + }, + { + "epoch": 2.0605130827580327, + "grad_norm": 23.731545369992162, + "learning_rate": 2.701972106742132e-06, + "loss": 1.1183, + "step": 24176 + }, + { + "epoch": 2.060598312452058, + "grad_norm": 51.49550238765277, + "learning_rate": 2.701531744115018e-06, + "loss": 1.1987, + "step": 24177 + }, + { + "epoch": 2.0606835421460836, + "grad_norm": 64.77735589854012, + "learning_rate": 2.7010914040924274e-06, + "loss": 2.1262, + "step": 24178 + }, + { + "epoch": 2.060768771840109, + "grad_norm": 59.52514945093624, + "learning_rate": 2.7006510866786894e-06, + "loss": 1.4011, + "step": 24179 + }, + { + "epoch": 2.0608540015341346, + "grad_norm": 24.961433559573273, + "learning_rate": 2.7002107918781346e-06, + "loss": 0.9954, + "step": 24180 + }, + { + "epoch": 2.06093923122816, + "grad_norm": 34.50476599000839, + "learning_rate": 2.6997705196950908e-06, + "loss": 0.9279, + "step": 24181 + }, + { + "epoch": 2.0610244609221855, + "grad_norm": 67.28590019870286, + "learning_rate": 2.6993302701338914e-06, + "loss": 1.1823, + "step": 24182 + }, + { + "epoch": 2.0611096906162105, + "grad_norm": 42.098938597172015, + "learning_rate": 2.698890043198863e-06, + "loss": 1.7511, + "step": 24183 + }, + { + "epoch": 2.061194920310236, + "grad_norm": 50.618939052766535, + "learning_rate": 2.698449838894338e-06, + "loss": 1.7449, + "step": 24184 + }, + { + "epoch": 2.0612801500042615, + "grad_norm": 25.24131408780107, + "learning_rate": 2.698009657224643e-06, + "loss": 0.9831, + "step": 24185 + }, + { + "epoch": 2.061365379698287, + "grad_norm": 40.43250034662896, + "learning_rate": 2.6975694981941102e-06, + "loss": 0.9464, + "step": 24186 + }, + { + "epoch": 2.0614506093923124, + "grad_norm": 49.62983480465608, + "learning_rate": 2.6971293618070666e-06, + "loss": 1.4243, + "step": 24187 + }, + { + "epoch": 2.061535839086338, + "grad_norm": 82.97057003555176, + "learning_rate": 2.6966892480678405e-06, + "loss": 2.5216, + "step": 24188 + }, + { + "epoch": 2.061621068780363, + "grad_norm": 71.03761201263197, + "learning_rate": 2.696249156980758e-06, + "loss": 1.6711, + "step": 24189 + }, + { + "epoch": 2.0617062984743884, + "grad_norm": 66.48539325617105, + "learning_rate": 2.695809088550152e-06, + "loss": 1.9565, + "step": 24190 + }, + { + "epoch": 2.061791528168414, + "grad_norm": 57.54108022987499, + "learning_rate": 2.695369042780346e-06, + "loss": 1.7104, + "step": 24191 + }, + { + "epoch": 2.0618767578624393, + "grad_norm": 64.42508565896053, + "learning_rate": 2.6949290196756716e-06, + "loss": 1.5877, + "step": 24192 + }, + { + "epoch": 2.0619619875564648, + "grad_norm": 45.71611695556003, + "learning_rate": 2.6944890192404547e-06, + "loss": 1.6032, + "step": 24193 + }, + { + "epoch": 2.0620472172504902, + "grad_norm": 45.99391089665403, + "learning_rate": 2.69404904147902e-06, + "loss": 1.0371, + "step": 24194 + }, + { + "epoch": 2.0621324469445153, + "grad_norm": 35.19199038326114, + "learning_rate": 2.6936090863956987e-06, + "loss": 1.3404, + "step": 24195 + }, + { + "epoch": 2.0622176766385407, + "grad_norm": 28.759299333296973, + "learning_rate": 2.6931691539948153e-06, + "loss": 0.6871, + "step": 24196 + }, + { + "epoch": 2.062302906332566, + "grad_norm": 36.36679908332754, + "learning_rate": 2.6927292442806964e-06, + "loss": 1.846, + "step": 24197 + }, + { + "epoch": 2.0623881360265917, + "grad_norm": 33.060495005812456, + "learning_rate": 2.692289357257667e-06, + "loss": 1.2469, + "step": 24198 + }, + { + "epoch": 2.062473365720617, + "grad_norm": 50.9385140921884, + "learning_rate": 2.6918494929300555e-06, + "loss": 1.7527, + "step": 24199 + }, + { + "epoch": 2.0625585954146426, + "grad_norm": 32.645894601887015, + "learning_rate": 2.691409651302188e-06, + "loss": 0.9227, + "step": 24200 + }, + { + "epoch": 2.062643825108668, + "grad_norm": 52.79338030945566, + "learning_rate": 2.6909698323783895e-06, + "loss": 1.9555, + "step": 24201 + }, + { + "epoch": 2.062729054802693, + "grad_norm": 67.2914987084707, + "learning_rate": 2.6905300361629827e-06, + "loss": 1.9296, + "step": 24202 + }, + { + "epoch": 2.0628142844967186, + "grad_norm": 26.922822512406494, + "learning_rate": 2.690090262660298e-06, + "loss": 1.1069, + "step": 24203 + }, + { + "epoch": 2.062899514190744, + "grad_norm": 64.80228144879507, + "learning_rate": 2.6896505118746572e-06, + "loss": 1.9806, + "step": 24204 + }, + { + "epoch": 2.0629847438847695, + "grad_norm": 47.64706535351879, + "learning_rate": 2.6892107838103863e-06, + "loss": 1.5119, + "step": 24205 + }, + { + "epoch": 2.063069973578795, + "grad_norm": 65.4528316159013, + "learning_rate": 2.688771078471807e-06, + "loss": 1.8604, + "step": 24206 + }, + { + "epoch": 2.0631552032728204, + "grad_norm": 45.23975294280831, + "learning_rate": 2.688331395863245e-06, + "loss": 1.4478, + "step": 24207 + }, + { + "epoch": 2.0632404329668455, + "grad_norm": 51.60968884149706, + "learning_rate": 2.6878917359890278e-06, + "loss": 1.8166, + "step": 24208 + }, + { + "epoch": 2.063325662660871, + "grad_norm": 85.3787454174049, + "learning_rate": 2.6874520988534765e-06, + "loss": 2.2006, + "step": 24209 + }, + { + "epoch": 2.0634108923548964, + "grad_norm": 29.532222051015015, + "learning_rate": 2.6870124844609145e-06, + "loss": 0.7666, + "step": 24210 + }, + { + "epoch": 2.063496122048922, + "grad_norm": 26.95227348792084, + "learning_rate": 2.686572892815664e-06, + "loss": 1.228, + "step": 24211 + }, + { + "epoch": 2.0635813517429473, + "grad_norm": 31.69772789975971, + "learning_rate": 2.686133323922052e-06, + "loss": 1.7579, + "step": 24212 + }, + { + "epoch": 2.063666581436973, + "grad_norm": 31.359862152368205, + "learning_rate": 2.6856937777843993e-06, + "loss": 0.6146, + "step": 24213 + }, + { + "epoch": 2.063751811130998, + "grad_norm": 28.644252953358205, + "learning_rate": 2.685254254407026e-06, + "loss": 0.9897, + "step": 24214 + }, + { + "epoch": 2.0638370408250233, + "grad_norm": 22.97854347361175, + "learning_rate": 2.6848147537942604e-06, + "loss": 0.7152, + "step": 24215 + }, + { + "epoch": 2.0639222705190488, + "grad_norm": 42.72867556485096, + "learning_rate": 2.6843752759504187e-06, + "loss": 1.1035, + "step": 24216 + }, + { + "epoch": 2.064007500213074, + "grad_norm": 69.90264387476775, + "learning_rate": 2.683935820879828e-06, + "loss": 1.4546, + "step": 24217 + }, + { + "epoch": 2.0640927299070997, + "grad_norm": 48.50661522106891, + "learning_rate": 2.6834963885868094e-06, + "loss": 1.3644, + "step": 24218 + }, + { + "epoch": 2.064177959601125, + "grad_norm": 54.90532412969433, + "learning_rate": 2.6830569790756804e-06, + "loss": 1.908, + "step": 24219 + }, + { + "epoch": 2.0642631892951506, + "grad_norm": 49.7591829542069, + "learning_rate": 2.682617592350768e-06, + "loss": 1.4515, + "step": 24220 + }, + { + "epoch": 2.0643484189891756, + "grad_norm": 86.38533307900326, + "learning_rate": 2.68217822841639e-06, + "loss": 2.1384, + "step": 24221 + }, + { + "epoch": 2.064433648683201, + "grad_norm": 32.241279014471736, + "learning_rate": 2.681738887276866e-06, + "loss": 1.3465, + "step": 24222 + }, + { + "epoch": 2.0645188783772266, + "grad_norm": 45.98599515084084, + "learning_rate": 2.6812995689365217e-06, + "loss": 1.3484, + "step": 24223 + }, + { + "epoch": 2.064604108071252, + "grad_norm": 35.412196759122224, + "learning_rate": 2.6808602733996723e-06, + "loss": 1.2543, + "step": 24224 + }, + { + "epoch": 2.0646893377652775, + "grad_norm": 42.50049800222392, + "learning_rate": 2.6804210006706423e-06, + "loss": 0.9469, + "step": 24225 + }, + { + "epoch": 2.064774567459303, + "grad_norm": 36.4662212988762, + "learning_rate": 2.6799817507537507e-06, + "loss": 1.3027, + "step": 24226 + }, + { + "epoch": 2.064859797153328, + "grad_norm": 52.34786146267477, + "learning_rate": 2.6795425236533162e-06, + "loss": 1.8803, + "step": 24227 + }, + { + "epoch": 2.0649450268473535, + "grad_norm": 58.54926409559905, + "learning_rate": 2.679103319373657e-06, + "loss": 2.4058, + "step": 24228 + }, + { + "epoch": 2.065030256541379, + "grad_norm": 43.110094325251, + "learning_rate": 2.6786641379190965e-06, + "loss": 1.8514, + "step": 24229 + }, + { + "epoch": 2.0651154862354044, + "grad_norm": 48.59041222580656, + "learning_rate": 2.6782249792939498e-06, + "loss": 1.5409, + "step": 24230 + }, + { + "epoch": 2.06520071592943, + "grad_norm": 49.9134283718125, + "learning_rate": 2.6777858435025396e-06, + "loss": 1.8903, + "step": 24231 + }, + { + "epoch": 2.0652859456234554, + "grad_norm": 51.66435972768372, + "learning_rate": 2.677346730549181e-06, + "loss": 2.0546, + "step": 24232 + }, + { + "epoch": 2.065371175317481, + "grad_norm": 49.19104725818453, + "learning_rate": 2.6769076404381967e-06, + "loss": 1.3438, + "step": 24233 + }, + { + "epoch": 2.065456405011506, + "grad_norm": 54.37662022879694, + "learning_rate": 2.6764685731739026e-06, + "loss": 1.3707, + "step": 24234 + }, + { + "epoch": 2.0655416347055313, + "grad_norm": 61.75721412420607, + "learning_rate": 2.6760295287606164e-06, + "loss": 1.3629, + "step": 24235 + }, + { + "epoch": 2.065626864399557, + "grad_norm": 64.59234245969678, + "learning_rate": 2.6755905072026567e-06, + "loss": 1.4653, + "step": 24236 + }, + { + "epoch": 2.0657120940935823, + "grad_norm": 28.29273678908511, + "learning_rate": 2.6751515085043388e-06, + "loss": 0.8405, + "step": 24237 + }, + { + "epoch": 2.0657973237876077, + "grad_norm": 112.28753637728606, + "learning_rate": 2.674712532669982e-06, + "loss": 1.8579, + "step": 24238 + }, + { + "epoch": 2.065882553481633, + "grad_norm": 33.20026909529431, + "learning_rate": 2.674273579703905e-06, + "loss": 1.0348, + "step": 24239 + }, + { + "epoch": 2.065967783175658, + "grad_norm": 27.76486763100158, + "learning_rate": 2.6738346496104227e-06, + "loss": 0.4995, + "step": 24240 + }, + { + "epoch": 2.0660530128696837, + "grad_norm": 46.019538778310284, + "learning_rate": 2.673395742393852e-06, + "loss": 0.8262, + "step": 24241 + }, + { + "epoch": 2.066138242563709, + "grad_norm": 42.73152209267826, + "learning_rate": 2.67295685805851e-06, + "loss": 1.2536, + "step": 24242 + }, + { + "epoch": 2.0662234722577346, + "grad_norm": 56.67454981970813, + "learning_rate": 2.6725179966087133e-06, + "loss": 1.7216, + "step": 24243 + }, + { + "epoch": 2.06630870195176, + "grad_norm": 53.90709498513051, + "learning_rate": 2.6720791580487774e-06, + "loss": 1.2644, + "step": 24244 + }, + { + "epoch": 2.0663939316457856, + "grad_norm": 45.97312246136634, + "learning_rate": 2.671640342383016e-06, + "loss": 1.9351, + "step": 24245 + }, + { + "epoch": 2.0664791613398106, + "grad_norm": 51.62479770550283, + "learning_rate": 2.6712015496157463e-06, + "loss": 1.767, + "step": 24246 + }, + { + "epoch": 2.066564391033836, + "grad_norm": 56.56806665604615, + "learning_rate": 2.6707627797512866e-06, + "loss": 1.4758, + "step": 24247 + }, + { + "epoch": 2.0666496207278615, + "grad_norm": 40.424168174726134, + "learning_rate": 2.6703240327939486e-06, + "loss": 1.5674, + "step": 24248 + }, + { + "epoch": 2.066734850421887, + "grad_norm": 40.41778547399262, + "learning_rate": 2.669885308748047e-06, + "loss": 1.2422, + "step": 24249 + }, + { + "epoch": 2.0668200801159125, + "grad_norm": 80.7276313080305, + "learning_rate": 2.6694466076178982e-06, + "loss": 1.546, + "step": 24250 + }, + { + "epoch": 2.066905309809938, + "grad_norm": 75.18808425865558, + "learning_rate": 2.6690079294078168e-06, + "loss": 2.2311, + "step": 24251 + }, + { + "epoch": 2.0669905395039634, + "grad_norm": 54.576843237670175, + "learning_rate": 2.6685692741221158e-06, + "loss": 1.3873, + "step": 24252 + }, + { + "epoch": 2.0670757691979884, + "grad_norm": 22.46330310299271, + "learning_rate": 2.6681306417651075e-06, + "loss": 1.3522, + "step": 24253 + }, + { + "epoch": 2.067160998892014, + "grad_norm": 16.466673292000642, + "learning_rate": 2.667692032341108e-06, + "loss": 0.362, + "step": 24254 + }, + { + "epoch": 2.0672462285860393, + "grad_norm": 70.80400473437854, + "learning_rate": 2.667253445854433e-06, + "loss": 1.2061, + "step": 24255 + }, + { + "epoch": 2.067331458280065, + "grad_norm": 54.62107510147541, + "learning_rate": 2.666814882309393e-06, + "loss": 1.0587, + "step": 24256 + }, + { + "epoch": 2.0674166879740903, + "grad_norm": 75.42764816289103, + "learning_rate": 2.6663763417103015e-06, + "loss": 1.6887, + "step": 24257 + }, + { + "epoch": 2.0675019176681158, + "grad_norm": 22.23314607987196, + "learning_rate": 2.6659378240614698e-06, + "loss": 1.0931, + "step": 24258 + }, + { + "epoch": 2.0675871473621408, + "grad_norm": 57.56680690767643, + "learning_rate": 2.665499329367214e-06, + "loss": 1.7708, + "step": 24259 + }, + { + "epoch": 2.0676723770561662, + "grad_norm": 32.36255881407973, + "learning_rate": 2.6650608576318437e-06, + "loss": 1.6784, + "step": 24260 + }, + { + "epoch": 2.0677576067501917, + "grad_norm": 46.62576142945829, + "learning_rate": 2.6646224088596714e-06, + "loss": 1.9725, + "step": 24261 + }, + { + "epoch": 2.067842836444217, + "grad_norm": 51.54034903354564, + "learning_rate": 2.6641839830550087e-06, + "loss": 1.491, + "step": 24262 + }, + { + "epoch": 2.0679280661382426, + "grad_norm": 87.02486434313654, + "learning_rate": 2.6637455802221706e-06, + "loss": 2.3119, + "step": 24263 + }, + { + "epoch": 2.068013295832268, + "grad_norm": 60.67123380302267, + "learning_rate": 2.663307200365467e-06, + "loss": 2.2822, + "step": 24264 + }, + { + "epoch": 2.0680985255262936, + "grad_norm": 50.667242628182436, + "learning_rate": 2.6628688434892082e-06, + "loss": 2.336, + "step": 24265 + }, + { + "epoch": 2.0681837552203186, + "grad_norm": 28.577341509308468, + "learning_rate": 2.6624305095977056e-06, + "loss": 1.2674, + "step": 24266 + }, + { + "epoch": 2.068268984914344, + "grad_norm": 46.067799779822955, + "learning_rate": 2.661992198695268e-06, + "loss": 1.4517, + "step": 24267 + }, + { + "epoch": 2.0683542146083695, + "grad_norm": 58.22662073796894, + "learning_rate": 2.66155391078621e-06, + "loss": 2.1115, + "step": 24268 + }, + { + "epoch": 2.068439444302395, + "grad_norm": 25.68135154324874, + "learning_rate": 2.6611156458748378e-06, + "loss": 0.9557, + "step": 24269 + }, + { + "epoch": 2.0685246739964205, + "grad_norm": 9.222831961179294, + "learning_rate": 2.660677403965466e-06, + "loss": 0.3796, + "step": 24270 + }, + { + "epoch": 2.068609903690446, + "grad_norm": 64.6315601453335, + "learning_rate": 2.660239185062401e-06, + "loss": 2.3081, + "step": 24271 + }, + { + "epoch": 2.068695133384471, + "grad_norm": 36.37880997744355, + "learning_rate": 2.6598009891699555e-06, + "loss": 1.0626, + "step": 24272 + }, + { + "epoch": 2.0687803630784964, + "grad_norm": 31.855467838190734, + "learning_rate": 2.6593628162924374e-06, + "loss": 1.0099, + "step": 24273 + }, + { + "epoch": 2.068865592772522, + "grad_norm": 29.757756074132832, + "learning_rate": 2.658924666434156e-06, + "loss": 1.0546, + "step": 24274 + }, + { + "epoch": 2.0689508224665474, + "grad_norm": 65.31160249912831, + "learning_rate": 2.658486539599418e-06, + "loss": 2.5339, + "step": 24275 + }, + { + "epoch": 2.069036052160573, + "grad_norm": 16.611314984094427, + "learning_rate": 2.6580484357925364e-06, + "loss": 0.7644, + "step": 24276 + }, + { + "epoch": 2.0691212818545983, + "grad_norm": 90.92250838426021, + "learning_rate": 2.6576103550178162e-06, + "loss": 2.6886, + "step": 24277 + }, + { + "epoch": 2.0692065115486233, + "grad_norm": 53.686019069386795, + "learning_rate": 2.6571722972795687e-06, + "loss": 1.9576, + "step": 24278 + }, + { + "epoch": 2.069291741242649, + "grad_norm": 60.36094587378573, + "learning_rate": 2.6567342625820996e-06, + "loss": 2.1286, + "step": 24279 + }, + { + "epoch": 2.0693769709366743, + "grad_norm": 42.16686737888585, + "learning_rate": 2.6562962509297195e-06, + "loss": 1.6248, + "step": 24280 + }, + { + "epoch": 2.0694622006306997, + "grad_norm": 43.5711287258822, + "learning_rate": 2.6558582623267348e-06, + "loss": 0.9056, + "step": 24281 + }, + { + "epoch": 2.069547430324725, + "grad_norm": 31.193889490887617, + "learning_rate": 2.655420296777452e-06, + "loss": 1.6939, + "step": 24282 + }, + { + "epoch": 2.0696326600187507, + "grad_norm": 68.00125803230297, + "learning_rate": 2.6549823542861795e-06, + "loss": 1.5304, + "step": 24283 + }, + { + "epoch": 2.069717889712776, + "grad_norm": 42.51086648615538, + "learning_rate": 2.6545444348572213e-06, + "loss": 0.87, + "step": 24284 + }, + { + "epoch": 2.069803119406801, + "grad_norm": 79.40413255713406, + "learning_rate": 2.6541065384948874e-06, + "loss": 2.6625, + "step": 24285 + }, + { + "epoch": 2.0698883491008266, + "grad_norm": 70.036561685966, + "learning_rate": 2.6536686652034847e-06, + "loss": 2.0305, + "step": 24286 + }, + { + "epoch": 2.069973578794852, + "grad_norm": 111.14731698609712, + "learning_rate": 2.6532308149873186e-06, + "loss": 1.0616, + "step": 24287 + }, + { + "epoch": 2.0700588084888776, + "grad_norm": 69.04191374733001, + "learning_rate": 2.6527929878506925e-06, + "loss": 1.074, + "step": 24288 + }, + { + "epoch": 2.070144038182903, + "grad_norm": 42.940773806885524, + "learning_rate": 2.6523551837979173e-06, + "loss": 1.4596, + "step": 24289 + }, + { + "epoch": 2.0702292678769285, + "grad_norm": 26.70061861349225, + "learning_rate": 2.6519174028332955e-06, + "loss": 1.0697, + "step": 24290 + }, + { + "epoch": 2.0703144975709535, + "grad_norm": 46.84562789110622, + "learning_rate": 2.6514796449611336e-06, + "loss": 1.2124, + "step": 24291 + }, + { + "epoch": 2.070399727264979, + "grad_norm": 40.5327708621695, + "learning_rate": 2.651041910185733e-06, + "loss": 1.7679, + "step": 24292 + }, + { + "epoch": 2.0704849569590045, + "grad_norm": 56.6012234403316, + "learning_rate": 2.650604198511403e-06, + "loss": 2.0112, + "step": 24293 + }, + { + "epoch": 2.07057018665303, + "grad_norm": 38.423523271303445, + "learning_rate": 2.650166509942449e-06, + "loss": 1.8487, + "step": 24294 + }, + { + "epoch": 2.0706554163470554, + "grad_norm": 42.51589276561728, + "learning_rate": 2.6497288444831737e-06, + "loss": 1.8564, + "step": 24295 + }, + { + "epoch": 2.070740646041081, + "grad_norm": 52.6716461051756, + "learning_rate": 2.6492912021378807e-06, + "loss": 2.0385, + "step": 24296 + }, + { + "epoch": 2.070825875735106, + "grad_norm": 45.403321357099024, + "learning_rate": 2.648853582910873e-06, + "loss": 1.5503, + "step": 24297 + }, + { + "epoch": 2.0709111054291314, + "grad_norm": 58.24481135551399, + "learning_rate": 2.6484159868064585e-06, + "loss": 0.9647, + "step": 24298 + }, + { + "epoch": 2.070996335123157, + "grad_norm": 36.842318736800834, + "learning_rate": 2.6479784138289376e-06, + "loss": 1.5541, + "step": 24299 + }, + { + "epoch": 2.0710815648171823, + "grad_norm": 25.96788715239464, + "learning_rate": 2.6475408639826126e-06, + "loss": 1.2404, + "step": 24300 + }, + { + "epoch": 2.0711667945112078, + "grad_norm": 56.53839636384492, + "learning_rate": 2.6471033372717893e-06, + "loss": 1.6017, + "step": 24301 + }, + { + "epoch": 2.0712520242052332, + "grad_norm": 59.27387837432448, + "learning_rate": 2.64666583370077e-06, + "loss": 1.9314, + "step": 24302 + }, + { + "epoch": 2.0713372538992587, + "grad_norm": 41.6627198133056, + "learning_rate": 2.6462283532738585e-06, + "loss": 1.3023, + "step": 24303 + }, + { + "epoch": 2.0714224835932837, + "grad_norm": 69.89762193741, + "learning_rate": 2.6457908959953555e-06, + "loss": 1.7009, + "step": 24304 + }, + { + "epoch": 2.071507713287309, + "grad_norm": 35.98318932317528, + "learning_rate": 2.6453534618695612e-06, + "loss": 1.0155, + "step": 24305 + }, + { + "epoch": 2.0715929429813347, + "grad_norm": 47.40574463643722, + "learning_rate": 2.644916050900782e-06, + "loss": 1.4869, + "step": 24306 + }, + { + "epoch": 2.07167817267536, + "grad_norm": 70.56442675954177, + "learning_rate": 2.6444786630933185e-06, + "loss": 1.9198, + "step": 24307 + }, + { + "epoch": 2.0717634023693856, + "grad_norm": 63.62974193642181, + "learning_rate": 2.6440412984514687e-06, + "loss": 1.9212, + "step": 24308 + }, + { + "epoch": 2.071848632063411, + "grad_norm": 59.15755472213732, + "learning_rate": 2.643603956979538e-06, + "loss": 1.1664, + "step": 24309 + }, + { + "epoch": 2.071933861757436, + "grad_norm": 25.884574496715395, + "learning_rate": 2.643166638681825e-06, + "loss": 0.9405, + "step": 24310 + }, + { + "epoch": 2.0720190914514616, + "grad_norm": 69.19385721572253, + "learning_rate": 2.6427293435626335e-06, + "loss": 1.235, + "step": 24311 + }, + { + "epoch": 2.072104321145487, + "grad_norm": 53.12886549291731, + "learning_rate": 2.6422920716262617e-06, + "loss": 1.6649, + "step": 24312 + }, + { + "epoch": 2.0721895508395125, + "grad_norm": 36.04176243038576, + "learning_rate": 2.6418548228770106e-06, + "loss": 1.4865, + "step": 24313 + }, + { + "epoch": 2.072274780533538, + "grad_norm": 25.597081917132684, + "learning_rate": 2.6414175973191785e-06, + "loss": 0.7083, + "step": 24314 + }, + { + "epoch": 2.0723600102275634, + "grad_norm": 31.560823346048327, + "learning_rate": 2.640980394957069e-06, + "loss": 1.3696, + "step": 24315 + }, + { + "epoch": 2.0724452399215885, + "grad_norm": 52.73022983124277, + "learning_rate": 2.6405432157949782e-06, + "loss": 1.8181, + "step": 24316 + }, + { + "epoch": 2.072530469615614, + "grad_norm": 47.74485752096732, + "learning_rate": 2.640106059837209e-06, + "loss": 1.5919, + "step": 24317 + }, + { + "epoch": 2.0726156993096394, + "grad_norm": 48.20759888846627, + "learning_rate": 2.639668927088057e-06, + "loss": 1.7684, + "step": 24318 + }, + { + "epoch": 2.072700929003665, + "grad_norm": 42.489375997471775, + "learning_rate": 2.6392318175518252e-06, + "loss": 1.0241, + "step": 24319 + }, + { + "epoch": 2.0727861586976903, + "grad_norm": 39.77923195073415, + "learning_rate": 2.638794731232811e-06, + "loss": 1.0264, + "step": 24320 + }, + { + "epoch": 2.072871388391716, + "grad_norm": 27.68803290506444, + "learning_rate": 2.638357668135312e-06, + "loss": 1.3387, + "step": 24321 + }, + { + "epoch": 2.0729566180857413, + "grad_norm": 30.799894182704733, + "learning_rate": 2.6379206282636242e-06, + "loss": 1.4085, + "step": 24322 + }, + { + "epoch": 2.0730418477797663, + "grad_norm": 33.9706937262407, + "learning_rate": 2.637483611622051e-06, + "loss": 1.1211, + "step": 24323 + }, + { + "epoch": 2.0731270774737918, + "grad_norm": 48.78218092864243, + "learning_rate": 2.6370466182148857e-06, + "loss": 2.1816, + "step": 24324 + }, + { + "epoch": 2.0732123071678172, + "grad_norm": 24.871108627795625, + "learning_rate": 2.63660964804643e-06, + "loss": 0.9172, + "step": 24325 + }, + { + "epoch": 2.0732975368618427, + "grad_norm": 50.52135312695728, + "learning_rate": 2.6361727011209794e-06, + "loss": 1.4618, + "step": 24326 + }, + { + "epoch": 2.073382766555868, + "grad_norm": 46.635775386861276, + "learning_rate": 2.635735777442828e-06, + "loss": 1.8942, + "step": 24327 + }, + { + "epoch": 2.0734679962498936, + "grad_norm": 77.70310071585622, + "learning_rate": 2.635298877016279e-06, + "loss": 1.0964, + "step": 24328 + }, + { + "epoch": 2.0735532259439187, + "grad_norm": 80.51018931670315, + "learning_rate": 2.634861999845626e-06, + "loss": 2.0606, + "step": 24329 + }, + { + "epoch": 2.073638455637944, + "grad_norm": 43.12570451139774, + "learning_rate": 2.6344251459351644e-06, + "loss": 0.7441, + "step": 24330 + }, + { + "epoch": 2.0737236853319696, + "grad_norm": 44.78667563491304, + "learning_rate": 2.63398831528919e-06, + "loss": 1.6989, + "step": 24331 + }, + { + "epoch": 2.073808915025995, + "grad_norm": 51.72270940989362, + "learning_rate": 2.6335515079120004e-06, + "loss": 1.4014, + "step": 24332 + }, + { + "epoch": 2.0738941447200205, + "grad_norm": 20.415556443189942, + "learning_rate": 2.6331147238078937e-06, + "loss": 0.7207, + "step": 24333 + }, + { + "epoch": 2.073979374414046, + "grad_norm": 48.55278708536118, + "learning_rate": 2.632677962981163e-06, + "loss": 1.6543, + "step": 24334 + }, + { + "epoch": 2.074064604108071, + "grad_norm": 49.50450754185527, + "learning_rate": 2.632241225436102e-06, + "loss": 1.4052, + "step": 24335 + }, + { + "epoch": 2.0741498338020965, + "grad_norm": 47.62321290851554, + "learning_rate": 2.6318045111770086e-06, + "loss": 2.0211, + "step": 24336 + }, + { + "epoch": 2.074235063496122, + "grad_norm": 35.26934744475406, + "learning_rate": 2.631367820208178e-06, + "loss": 1.2816, + "step": 24337 + }, + { + "epoch": 2.0743202931901474, + "grad_norm": 24.942821199046275, + "learning_rate": 2.630931152533903e-06, + "loss": 1.3554, + "step": 24338 + }, + { + "epoch": 2.074405522884173, + "grad_norm": 30.211674398093237, + "learning_rate": 2.630494508158477e-06, + "loss": 1.0441, + "step": 24339 + }, + { + "epoch": 2.0744907525781984, + "grad_norm": 18.310544241197817, + "learning_rate": 2.630057887086196e-06, + "loss": 0.7121, + "step": 24340 + }, + { + "epoch": 2.074575982272224, + "grad_norm": 39.18791242702328, + "learning_rate": 2.6296212893213556e-06, + "loss": 1.1945, + "step": 24341 + }, + { + "epoch": 2.074661211966249, + "grad_norm": 41.05701766876643, + "learning_rate": 2.6291847148682486e-06, + "loss": 1.231, + "step": 24342 + }, + { + "epoch": 2.0747464416602743, + "grad_norm": 55.40585291391003, + "learning_rate": 2.6287481637311676e-06, + "loss": 1.7023, + "step": 24343 + }, + { + "epoch": 2.0748316713543, + "grad_norm": 54.03751176285718, + "learning_rate": 2.6283116359144045e-06, + "loss": 1.8408, + "step": 24344 + }, + { + "epoch": 2.0749169010483253, + "grad_norm": 42.74425549230095, + "learning_rate": 2.627875131422255e-06, + "loss": 0.7794, + "step": 24345 + }, + { + "epoch": 2.0750021307423507, + "grad_norm": 24.83239855538408, + "learning_rate": 2.627438650259012e-06, + "loss": 0.9974, + "step": 24346 + }, + { + "epoch": 2.075087360436376, + "grad_norm": 49.925723490010114, + "learning_rate": 2.6270021924289645e-06, + "loss": 1.5252, + "step": 24347 + }, + { + "epoch": 2.0751725901304012, + "grad_norm": 41.210160510873884, + "learning_rate": 2.626565757936408e-06, + "loss": 1.4885, + "step": 24348 + }, + { + "epoch": 2.0752578198244267, + "grad_norm": 37.32335871380889, + "learning_rate": 2.626129346785636e-06, + "loss": 1.1852, + "step": 24349 + }, + { + "epoch": 2.075343049518452, + "grad_norm": 63.528415690723484, + "learning_rate": 2.625692958980938e-06, + "loss": 1.7795, + "step": 24350 + }, + { + "epoch": 2.0754282792124776, + "grad_norm": 27.41023614040817, + "learning_rate": 2.625256594526607e-06, + "loss": 1.247, + "step": 24351 + }, + { + "epoch": 2.075513508906503, + "grad_norm": 25.034079140194617, + "learning_rate": 2.6248202534269316e-06, + "loss": 1.0989, + "step": 24352 + }, + { + "epoch": 2.0755987386005286, + "grad_norm": 27.782149722815706, + "learning_rate": 2.6243839356862073e-06, + "loss": 1.1177, + "step": 24353 + }, + { + "epoch": 2.0756839682945536, + "grad_norm": 71.92755850105105, + "learning_rate": 2.6239476413087227e-06, + "loss": 1.6507, + "step": 24354 + }, + { + "epoch": 2.075769197988579, + "grad_norm": 49.12663218456027, + "learning_rate": 2.623511370298767e-06, + "loss": 1.5898, + "step": 24355 + }, + { + "epoch": 2.0758544276826045, + "grad_norm": 48.492720886505545, + "learning_rate": 2.6230751226606346e-06, + "loss": 1.8967, + "step": 24356 + }, + { + "epoch": 2.07593965737663, + "grad_norm": 38.25156769249773, + "learning_rate": 2.6226388983986124e-06, + "loss": 1.5742, + "step": 24357 + }, + { + "epoch": 2.0760248870706555, + "grad_norm": 66.9330775100023, + "learning_rate": 2.6222026975169933e-06, + "loss": 1.9245, + "step": 24358 + }, + { + "epoch": 2.076110116764681, + "grad_norm": 33.04394922161132, + "learning_rate": 2.621766520020066e-06, + "loss": 0.777, + "step": 24359 + }, + { + "epoch": 2.0761953464587064, + "grad_norm": 57.786165232966695, + "learning_rate": 2.6213303659121204e-06, + "loss": 1.2663, + "step": 24360 + }, + { + "epoch": 2.0762805761527314, + "grad_norm": 29.761258811223907, + "learning_rate": 2.620894235197443e-06, + "loss": 1.0118, + "step": 24361 + }, + { + "epoch": 2.076365805846757, + "grad_norm": 79.61698543287368, + "learning_rate": 2.6204581278803277e-06, + "loss": 2.5721, + "step": 24362 + }, + { + "epoch": 2.0764510355407824, + "grad_norm": 32.632151991315304, + "learning_rate": 2.620022043965059e-06, + "loss": 0.9578, + "step": 24363 + }, + { + "epoch": 2.076536265234808, + "grad_norm": 57.48859887120035, + "learning_rate": 2.6195859834559294e-06, + "loss": 1.4694, + "step": 24364 + }, + { + "epoch": 2.0766214949288333, + "grad_norm": 42.581020127909234, + "learning_rate": 2.6191499463572245e-06, + "loss": 1.3978, + "step": 24365 + }, + { + "epoch": 2.0767067246228588, + "grad_norm": 43.384826660392825, + "learning_rate": 2.618713932673235e-06, + "loss": 1.2705, + "step": 24366 + }, + { + "epoch": 2.076791954316884, + "grad_norm": 41.52825310360521, + "learning_rate": 2.618277942408248e-06, + "loss": 1.2665, + "step": 24367 + }, + { + "epoch": 2.0768771840109093, + "grad_norm": 46.934931095504105, + "learning_rate": 2.617841975566552e-06, + "loss": 1.2753, + "step": 24368 + }, + { + "epoch": 2.0769624137049347, + "grad_norm": 25.986738270045326, + "learning_rate": 2.6174060321524325e-06, + "loss": 1.2293, + "step": 24369 + }, + { + "epoch": 2.07704764339896, + "grad_norm": 57.61903730634538, + "learning_rate": 2.6169701121701762e-06, + "loss": 1.9348, + "step": 24370 + }, + { + "epoch": 2.0771328730929857, + "grad_norm": 71.42575207282573, + "learning_rate": 2.616534215624072e-06, + "loss": 1.5204, + "step": 24371 + }, + { + "epoch": 2.077218102787011, + "grad_norm": 51.68519195685023, + "learning_rate": 2.6160983425184083e-06, + "loss": 1.4447, + "step": 24372 + }, + { + "epoch": 2.0773033324810366, + "grad_norm": 64.20912699571825, + "learning_rate": 2.615662492857471e-06, + "loss": 0.8965, + "step": 24373 + }, + { + "epoch": 2.0773885621750616, + "grad_norm": 37.891613839206386, + "learning_rate": 2.615226666645543e-06, + "loss": 1.6706, + "step": 24374 + }, + { + "epoch": 2.077473791869087, + "grad_norm": 55.98360192259854, + "learning_rate": 2.614790863886916e-06, + "loss": 1.2962, + "step": 24375 + }, + { + "epoch": 2.0775590215631126, + "grad_norm": 38.80048489969806, + "learning_rate": 2.6143550845858715e-06, + "loss": 1.2945, + "step": 24376 + }, + { + "epoch": 2.077644251257138, + "grad_norm": 32.48893317974689, + "learning_rate": 2.613919328746698e-06, + "loss": 1.3202, + "step": 24377 + }, + { + "epoch": 2.0777294809511635, + "grad_norm": 58.495397385848015, + "learning_rate": 2.613483596373677e-06, + "loss": 1.6347, + "step": 24378 + }, + { + "epoch": 2.077814710645189, + "grad_norm": 46.66014066219284, + "learning_rate": 2.6130478874710973e-06, + "loss": 0.7739, + "step": 24379 + }, + { + "epoch": 2.077899940339214, + "grad_norm": 49.13053491178404, + "learning_rate": 2.612612202043244e-06, + "loss": 1.5351, + "step": 24380 + }, + { + "epoch": 2.0779851700332395, + "grad_norm": 69.39168367235496, + "learning_rate": 2.612176540094402e-06, + "loss": 1.4868, + "step": 24381 + }, + { + "epoch": 2.078070399727265, + "grad_norm": 36.29679590743808, + "learning_rate": 2.611740901628853e-06, + "loss": 0.7511, + "step": 24382 + }, + { + "epoch": 2.0781556294212904, + "grad_norm": 54.897276643219925, + "learning_rate": 2.611305286650885e-06, + "loss": 1.1972, + "step": 24383 + }, + { + "epoch": 2.078240859115316, + "grad_norm": 46.10946492676426, + "learning_rate": 2.6108696951647807e-06, + "loss": 1.7924, + "step": 24384 + }, + { + "epoch": 2.0783260888093413, + "grad_norm": 47.53298644530735, + "learning_rate": 2.6104341271748224e-06, + "loss": 1.0371, + "step": 24385 + }, + { + "epoch": 2.078411318503367, + "grad_norm": 34.31614182563581, + "learning_rate": 2.6099985826852934e-06, + "loss": 1.6142, + "step": 24386 + }, + { + "epoch": 2.078496548197392, + "grad_norm": 28.523607821551433, + "learning_rate": 2.60956306170048e-06, + "loss": 1.3537, + "step": 24387 + }, + { + "epoch": 2.0785817778914173, + "grad_norm": 47.27641076354924, + "learning_rate": 2.6091275642246642e-06, + "loss": 1.6561, + "step": 24388 + }, + { + "epoch": 2.0786670075854428, + "grad_norm": 50.693480192918884, + "learning_rate": 2.608692090262129e-06, + "loss": 1.3242, + "step": 24389 + }, + { + "epoch": 2.0787522372794682, + "grad_norm": 39.93131553795777, + "learning_rate": 2.608256639817158e-06, + "loss": 1.4744, + "step": 24390 + }, + { + "epoch": 2.0788374669734937, + "grad_norm": 79.64138194023963, + "learning_rate": 2.60782121289403e-06, + "loss": 2.1495, + "step": 24391 + }, + { + "epoch": 2.078922696667519, + "grad_norm": 70.0571350446373, + "learning_rate": 2.6073858094970316e-06, + "loss": 1.6991, + "step": 24392 + }, + { + "epoch": 2.079007926361544, + "grad_norm": 55.289337736318856, + "learning_rate": 2.606950429630444e-06, + "loss": 1.1809, + "step": 24393 + }, + { + "epoch": 2.0790931560555697, + "grad_norm": 33.14315331650646, + "learning_rate": 2.6065150732985443e-06, + "loss": 1.3751, + "step": 24394 + }, + { + "epoch": 2.079178385749595, + "grad_norm": 73.63215051161305, + "learning_rate": 2.6060797405056195e-06, + "loss": 2.2928, + "step": 24395 + }, + { + "epoch": 2.0792636154436206, + "grad_norm": 42.8986514835841, + "learning_rate": 2.6056444312559505e-06, + "loss": 1.2377, + "step": 24396 + }, + { + "epoch": 2.079348845137646, + "grad_norm": 28.91355266099988, + "learning_rate": 2.6052091455538165e-06, + "loss": 1.4259, + "step": 24397 + }, + { + "epoch": 2.0794340748316715, + "grad_norm": 51.6448198029625, + "learning_rate": 2.6047738834034997e-06, + "loss": 1.0123, + "step": 24398 + }, + { + "epoch": 2.0795193045256966, + "grad_norm": 46.661143357403176, + "learning_rate": 2.6043386448092796e-06, + "loss": 1.4866, + "step": 24399 + }, + { + "epoch": 2.079604534219722, + "grad_norm": 37.64602390191965, + "learning_rate": 2.603903429775435e-06, + "loss": 0.8401, + "step": 24400 + }, + { + "epoch": 2.0796897639137475, + "grad_norm": 84.08066350421531, + "learning_rate": 2.6034682383062506e-06, + "loss": 2.538, + "step": 24401 + }, + { + "epoch": 2.079774993607773, + "grad_norm": 31.1527780419539, + "learning_rate": 2.6030330704060013e-06, + "loss": 0.9021, + "step": 24402 + }, + { + "epoch": 2.0798602233017984, + "grad_norm": 37.25303396994939, + "learning_rate": 2.602597926078971e-06, + "loss": 1.4679, + "step": 24403 + }, + { + "epoch": 2.079945452995824, + "grad_norm": 28.993315061560946, + "learning_rate": 2.6021628053294356e-06, + "loss": 0.684, + "step": 24404 + }, + { + "epoch": 2.0800306826898494, + "grad_norm": 36.8594925528651, + "learning_rate": 2.601727708161679e-06, + "loss": 1.005, + "step": 24405 + }, + { + "epoch": 2.0801159123838744, + "grad_norm": 50.92307931755272, + "learning_rate": 2.6012926345799767e-06, + "loss": 1.7895, + "step": 24406 + }, + { + "epoch": 2.0802011420779, + "grad_norm": 78.34548077007382, + "learning_rate": 2.6008575845886085e-06, + "loss": 2.2797, + "step": 24407 + }, + { + "epoch": 2.0802863717719253, + "grad_norm": 28.028462522217342, + "learning_rate": 2.60042255819185e-06, + "loss": 1.2611, + "step": 24408 + }, + { + "epoch": 2.080371601465951, + "grad_norm": 50.294599711612925, + "learning_rate": 2.5999875553939846e-06, + "loss": 1.6688, + "step": 24409 + }, + { + "epoch": 2.0804568311599763, + "grad_norm": 42.45352541404602, + "learning_rate": 2.599552576199286e-06, + "loss": 1.1819, + "step": 24410 + }, + { + "epoch": 2.0805420608540017, + "grad_norm": 53.988655806907815, + "learning_rate": 2.599117620612036e-06, + "loss": 1.438, + "step": 24411 + }, + { + "epoch": 2.0806272905480268, + "grad_norm": 55.30450957864244, + "learning_rate": 2.59868268863651e-06, + "loss": 0.8457, + "step": 24412 + }, + { + "epoch": 2.080712520242052, + "grad_norm": 58.38172898606302, + "learning_rate": 2.5982477802769832e-06, + "loss": 2.0719, + "step": 24413 + }, + { + "epoch": 2.0807977499360777, + "grad_norm": 36.67213674119189, + "learning_rate": 2.597812895537738e-06, + "loss": 1.3648, + "step": 24414 + }, + { + "epoch": 2.080882979630103, + "grad_norm": 30.359160904004337, + "learning_rate": 2.597378034423048e-06, + "loss": 0.9515, + "step": 24415 + }, + { + "epoch": 2.0809682093241286, + "grad_norm": 66.43730593399746, + "learning_rate": 2.5969431969371904e-06, + "loss": 0.9211, + "step": 24416 + }, + { + "epoch": 2.081053439018154, + "grad_norm": 184.3994467984451, + "learning_rate": 2.5965083830844395e-06, + "loss": 1.9651, + "step": 24417 + }, + { + "epoch": 2.081138668712179, + "grad_norm": 80.61165111840336, + "learning_rate": 2.596073592869074e-06, + "loss": 2.3303, + "step": 24418 + }, + { + "epoch": 2.0812238984062046, + "grad_norm": 51.47439599844276, + "learning_rate": 2.5956388262953708e-06, + "loss": 2.0679, + "step": 24419 + }, + { + "epoch": 2.08130912810023, + "grad_norm": 36.59127606087116, + "learning_rate": 2.5952040833676045e-06, + "loss": 1.0307, + "step": 24420 + }, + { + "epoch": 2.0813943577942555, + "grad_norm": 68.01697400165095, + "learning_rate": 2.5947693640900485e-06, + "loss": 2.3143, + "step": 24421 + }, + { + "epoch": 2.081479587488281, + "grad_norm": 34.117268931868466, + "learning_rate": 2.5943346684669822e-06, + "loss": 0.818, + "step": 24422 + }, + { + "epoch": 2.0815648171823065, + "grad_norm": 62.57932152136197, + "learning_rate": 2.5938999965026783e-06, + "loss": 2.0412, + "step": 24423 + }, + { + "epoch": 2.081650046876332, + "grad_norm": 41.99153426157612, + "learning_rate": 2.593465348201412e-06, + "loss": 1.2559, + "step": 24424 + }, + { + "epoch": 2.081735276570357, + "grad_norm": 89.71045142605618, + "learning_rate": 2.5930307235674555e-06, + "loss": 2.0062, + "step": 24425 + }, + { + "epoch": 2.0818205062643824, + "grad_norm": 43.87113465789374, + "learning_rate": 2.5925961226050865e-06, + "loss": 1.5855, + "step": 24426 + }, + { + "epoch": 2.081905735958408, + "grad_norm": 69.16046319635838, + "learning_rate": 2.592161545318579e-06, + "loss": 1.4252, + "step": 24427 + }, + { + "epoch": 2.0819909656524334, + "grad_norm": 23.503787072367736, + "learning_rate": 2.5917269917122055e-06, + "loss": 0.8433, + "step": 24428 + }, + { + "epoch": 2.082076195346459, + "grad_norm": 25.826106272806864, + "learning_rate": 2.5912924617902412e-06, + "loss": 1.2067, + "step": 24429 + }, + { + "epoch": 2.0821614250404843, + "grad_norm": 39.72812866662929, + "learning_rate": 2.5908579555569556e-06, + "loss": 1.6759, + "step": 24430 + }, + { + "epoch": 2.0822466547345093, + "grad_norm": 37.85004062690506, + "learning_rate": 2.5904234730166274e-06, + "loss": 0.767, + "step": 24431 + }, + { + "epoch": 2.082331884428535, + "grad_norm": 46.368089805388855, + "learning_rate": 2.589989014173526e-06, + "loss": 1.5819, + "step": 24432 + }, + { + "epoch": 2.0824171141225603, + "grad_norm": 87.78056074065344, + "learning_rate": 2.5895545790319237e-06, + "loss": 2.1848, + "step": 24433 + }, + { + "epoch": 2.0825023438165857, + "grad_norm": 32.379148209708916, + "learning_rate": 2.5891201675960943e-06, + "loss": 1.1356, + "step": 24434 + }, + { + "epoch": 2.082587573510611, + "grad_norm": 34.209706810943075, + "learning_rate": 2.588685779870311e-06, + "loss": 0.987, + "step": 24435 + }, + { + "epoch": 2.0826728032046367, + "grad_norm": 36.35681581923365, + "learning_rate": 2.588251415858847e-06, + "loss": 1.0452, + "step": 24436 + }, + { + "epoch": 2.0827580328986617, + "grad_norm": 82.55378278449123, + "learning_rate": 2.5878170755659703e-06, + "loss": 2.1268, + "step": 24437 + }, + { + "epoch": 2.082843262592687, + "grad_norm": 16.80258868510468, + "learning_rate": 2.5873827589959523e-06, + "loss": 0.7526, + "step": 24438 + }, + { + "epoch": 2.0829284922867126, + "grad_norm": 58.066285969411865, + "learning_rate": 2.5869484661530686e-06, + "loss": 1.8296, + "step": 24439 + }, + { + "epoch": 2.083013721980738, + "grad_norm": 59.405237050370616, + "learning_rate": 2.5865141970415875e-06, + "loss": 2.1598, + "step": 24440 + }, + { + "epoch": 2.0830989516747636, + "grad_norm": 26.629905887754564, + "learning_rate": 2.586079951665779e-06, + "loss": 1.1194, + "step": 24441 + }, + { + "epoch": 2.083184181368789, + "grad_norm": 48.59990992787105, + "learning_rate": 2.5856457300299163e-06, + "loss": 1.4734, + "step": 24442 + }, + { + "epoch": 2.0832694110628145, + "grad_norm": 126.74947291669193, + "learning_rate": 2.5852115321382664e-06, + "loss": 2.4087, + "step": 24443 + }, + { + "epoch": 2.0833546407568395, + "grad_norm": 51.50765239364502, + "learning_rate": 2.584777357995104e-06, + "loss": 1.8876, + "step": 24444 + }, + { + "epoch": 2.083439870450865, + "grad_norm": 87.9026802364564, + "learning_rate": 2.584343207604696e-06, + "loss": 3.1578, + "step": 24445 + }, + { + "epoch": 2.0835251001448905, + "grad_norm": 46.40604346914185, + "learning_rate": 2.583909080971313e-06, + "loss": 1.9377, + "step": 24446 + }, + { + "epoch": 2.083610329838916, + "grad_norm": 47.77437110523141, + "learning_rate": 2.5834749780992223e-06, + "loss": 1.404, + "step": 24447 + }, + { + "epoch": 2.0836955595329414, + "grad_norm": 46.53322679865261, + "learning_rate": 2.5830408989926968e-06, + "loss": 2.074, + "step": 24448 + }, + { + "epoch": 2.083780789226967, + "grad_norm": 46.51725464832443, + "learning_rate": 2.5826068436560016e-06, + "loss": 1.5671, + "step": 24449 + }, + { + "epoch": 2.083866018920992, + "grad_norm": 41.77404364279489, + "learning_rate": 2.5821728120934096e-06, + "loss": 1.0845, + "step": 24450 + }, + { + "epoch": 2.0839512486150173, + "grad_norm": 63.42428065295095, + "learning_rate": 2.581738804309185e-06, + "loss": 1.1581, + "step": 24451 + }, + { + "epoch": 2.084036478309043, + "grad_norm": 18.513853128395336, + "learning_rate": 2.5813048203076007e-06, + "loss": 0.7686, + "step": 24452 + }, + { + "epoch": 2.0841217080030683, + "grad_norm": 66.26534794109996, + "learning_rate": 2.5808708600929224e-06, + "loss": 2.0685, + "step": 24453 + }, + { + "epoch": 2.0842069376970938, + "grad_norm": 57.965577505970536, + "learning_rate": 2.580436923669418e-06, + "loss": 1.8491, + "step": 24454 + }, + { + "epoch": 2.084292167391119, + "grad_norm": 53.06820724930654, + "learning_rate": 2.5800030110413525e-06, + "loss": 1.5775, + "step": 24455 + }, + { + "epoch": 2.0843773970851442, + "grad_norm": 18.743080886758783, + "learning_rate": 2.579569122212998e-06, + "loss": 0.9242, + "step": 24456 + }, + { + "epoch": 2.0844626267791697, + "grad_norm": 26.748775770211115, + "learning_rate": 2.5791352571886176e-06, + "loss": 0.5568, + "step": 24457 + }, + { + "epoch": 2.084547856473195, + "grad_norm": 59.53082296093512, + "learning_rate": 2.578701415972482e-06, + "loss": 2.0215, + "step": 24458 + }, + { + "epoch": 2.0846330861672207, + "grad_norm": 47.29304302491964, + "learning_rate": 2.5782675985688554e-06, + "loss": 1.4798, + "step": 24459 + }, + { + "epoch": 2.084718315861246, + "grad_norm": 50.975839527063776, + "learning_rate": 2.577833804982003e-06, + "loss": 0.9967, + "step": 24460 + }, + { + "epoch": 2.0848035455552716, + "grad_norm": 42.74893700313686, + "learning_rate": 2.5774000352161945e-06, + "loss": 0.9602, + "step": 24461 + }, + { + "epoch": 2.084888775249297, + "grad_norm": 63.007329232609514, + "learning_rate": 2.576966289275694e-06, + "loss": 1.4571, + "step": 24462 + }, + { + "epoch": 2.084974004943322, + "grad_norm": 24.237755562387655, + "learning_rate": 2.5765325671647677e-06, + "loss": 1.2244, + "step": 24463 + }, + { + "epoch": 2.0850592346373475, + "grad_norm": 57.280907072935015, + "learning_rate": 2.576098868887678e-06, + "loss": 1.3863, + "step": 24464 + }, + { + "epoch": 2.085144464331373, + "grad_norm": 41.74002834405421, + "learning_rate": 2.575665194448693e-06, + "loss": 1.1545, + "step": 24465 + }, + { + "epoch": 2.0852296940253985, + "grad_norm": 78.02374642475979, + "learning_rate": 2.5752315438520797e-06, + "loss": 1.7734, + "step": 24466 + }, + { + "epoch": 2.085314923719424, + "grad_norm": 27.975212716131104, + "learning_rate": 2.5747979171021004e-06, + "loss": 0.8842, + "step": 24467 + }, + { + "epoch": 2.0854001534134494, + "grad_norm": 39.164130687463356, + "learning_rate": 2.5743643142030183e-06, + "loss": 1.3505, + "step": 24468 + }, + { + "epoch": 2.0854853831074744, + "grad_norm": 25.30377096936205, + "learning_rate": 2.5739307351591e-06, + "loss": 1.3843, + "step": 24469 + }, + { + "epoch": 2.0855706128015, + "grad_norm": 51.97719154884514, + "learning_rate": 2.5734971799746106e-06, + "loss": 1.3843, + "step": 24470 + }, + { + "epoch": 2.0856558424955254, + "grad_norm": 32.50766683600209, + "learning_rate": 2.5730636486538114e-06, + "loss": 1.3032, + "step": 24471 + }, + { + "epoch": 2.085741072189551, + "grad_norm": 25.861213767596563, + "learning_rate": 2.5726301412009646e-06, + "loss": 1.0525, + "step": 24472 + }, + { + "epoch": 2.0858263018835763, + "grad_norm": 59.14132294406248, + "learning_rate": 2.572196657620336e-06, + "loss": 1.0396, + "step": 24473 + }, + { + "epoch": 2.085911531577602, + "grad_norm": 34.4221520654185, + "learning_rate": 2.571763197916191e-06, + "loss": 1.321, + "step": 24474 + }, + { + "epoch": 2.085996761271627, + "grad_norm": 53.43840855986679, + "learning_rate": 2.5713297620927904e-06, + "loss": 1.8274, + "step": 24475 + }, + { + "epoch": 2.0860819909656523, + "grad_norm": 59.39664970182216, + "learning_rate": 2.5708963501543954e-06, + "loss": 1.9398, + "step": 24476 + }, + { + "epoch": 2.0861672206596777, + "grad_norm": 60.75642661951535, + "learning_rate": 2.570462962105268e-06, + "loss": 1.2699, + "step": 24477 + }, + { + "epoch": 2.086252450353703, + "grad_norm": 53.87136799639751, + "learning_rate": 2.5700295979496743e-06, + "loss": 1.1581, + "step": 24478 + }, + { + "epoch": 2.0863376800477287, + "grad_norm": 31.361578584944887, + "learning_rate": 2.5695962576918732e-06, + "loss": 1.2694, + "step": 24479 + }, + { + "epoch": 2.086422909741754, + "grad_norm": 44.94269066827196, + "learning_rate": 2.5691629413361264e-06, + "loss": 1.218, + "step": 24480 + }, + { + "epoch": 2.0865081394357796, + "grad_norm": 12.842563980688855, + "learning_rate": 2.5687296488866953e-06, + "loss": 0.5778, + "step": 24481 + }, + { + "epoch": 2.0865933691298046, + "grad_norm": 61.926673433318015, + "learning_rate": 2.568296380347844e-06, + "loss": 2.1064, + "step": 24482 + }, + { + "epoch": 2.08667859882383, + "grad_norm": 35.18810386553768, + "learning_rate": 2.5678631357238314e-06, + "loss": 0.8679, + "step": 24483 + }, + { + "epoch": 2.0867638285178556, + "grad_norm": 69.98161715794335, + "learning_rate": 2.5674299150189184e-06, + "loss": 1.7659, + "step": 24484 + }, + { + "epoch": 2.086849058211881, + "grad_norm": 40.734644635392655, + "learning_rate": 2.566996718237366e-06, + "loss": 1.2847, + "step": 24485 + }, + { + "epoch": 2.0869342879059065, + "grad_norm": 34.22817139047814, + "learning_rate": 2.566563545383432e-06, + "loss": 1.3443, + "step": 24486 + }, + { + "epoch": 2.087019517599932, + "grad_norm": 56.144619030756395, + "learning_rate": 2.5661303964613806e-06, + "loss": 1.6251, + "step": 24487 + }, + { + "epoch": 2.087104747293957, + "grad_norm": 39.17855757047763, + "learning_rate": 2.5656972714754673e-06, + "loss": 1.0236, + "step": 24488 + }, + { + "epoch": 2.0871899769879825, + "grad_norm": 55.403915481823404, + "learning_rate": 2.5652641704299565e-06, + "loss": 1.4335, + "step": 24489 + }, + { + "epoch": 2.087275206682008, + "grad_norm": 36.90464409073602, + "learning_rate": 2.5648310933291032e-06, + "loss": 1.6043, + "step": 24490 + }, + { + "epoch": 2.0873604363760334, + "grad_norm": 18.82772059796624, + "learning_rate": 2.5643980401771697e-06, + "loss": 0.8776, + "step": 24491 + }, + { + "epoch": 2.087445666070059, + "grad_norm": 62.07976462434195, + "learning_rate": 2.5639650109784144e-06, + "loss": 2.1405, + "step": 24492 + }, + { + "epoch": 2.0875308957640843, + "grad_norm": 96.39674593105492, + "learning_rate": 2.563532005737095e-06, + "loss": 2.9683, + "step": 24493 + }, + { + "epoch": 2.08761612545811, + "grad_norm": 35.04111687735716, + "learning_rate": 2.5630990244574676e-06, + "loss": 1.254, + "step": 24494 + }, + { + "epoch": 2.087701355152135, + "grad_norm": 40.29396514032935, + "learning_rate": 2.562666067143795e-06, + "loss": 1.2456, + "step": 24495 + }, + { + "epoch": 2.0877865848461603, + "grad_norm": 53.65215614295256, + "learning_rate": 2.5622331338003327e-06, + "loss": 1.8259, + "step": 24496 + }, + { + "epoch": 2.0878718145401858, + "grad_norm": 40.87510770939576, + "learning_rate": 2.561800224431339e-06, + "loss": 0.8027, + "step": 24497 + }, + { + "epoch": 2.0879570442342112, + "grad_norm": 58.5115630946242, + "learning_rate": 2.5613673390410705e-06, + "loss": 1.5594, + "step": 24498 + }, + { + "epoch": 2.0880422739282367, + "grad_norm": 32.48687957706183, + "learning_rate": 2.5609344776337864e-06, + "loss": 1.0718, + "step": 24499 + }, + { + "epoch": 2.088127503622262, + "grad_norm": 34.74792672880593, + "learning_rate": 2.5605016402137427e-06, + "loss": 0.9319, + "step": 24500 + }, + { + "epoch": 2.088212733316287, + "grad_norm": 64.20693887474592, + "learning_rate": 2.5600688267851958e-06, + "loss": 1.1831, + "step": 24501 + }, + { + "epoch": 2.0882979630103127, + "grad_norm": 47.93408163102522, + "learning_rate": 2.559636037352403e-06, + "loss": 1.6308, + "step": 24502 + }, + { + "epoch": 2.088383192704338, + "grad_norm": 27.126822558204747, + "learning_rate": 2.559203271919618e-06, + "loss": 0.8416, + "step": 24503 + }, + { + "epoch": 2.0884684223983636, + "grad_norm": 19.928716448426204, + "learning_rate": 2.558770530491099e-06, + "loss": 0.7342, + "step": 24504 + }, + { + "epoch": 2.088553652092389, + "grad_norm": 84.5850566342049, + "learning_rate": 2.5583378130711033e-06, + "loss": 1.1771, + "step": 24505 + }, + { + "epoch": 2.0886388817864145, + "grad_norm": 68.45605203546333, + "learning_rate": 2.5579051196638855e-06, + "loss": 2.0673, + "step": 24506 + }, + { + "epoch": 2.0887241114804396, + "grad_norm": 47.62843092396056, + "learning_rate": 2.5574724502736982e-06, + "loss": 0.9896, + "step": 24507 + }, + { + "epoch": 2.088809341174465, + "grad_norm": 48.755135156712214, + "learning_rate": 2.557039804904801e-06, + "loss": 1.7093, + "step": 24508 + }, + { + "epoch": 2.0888945708684905, + "grad_norm": 54.63752395524031, + "learning_rate": 2.5566071835614458e-06, + "loss": 1.9814, + "step": 24509 + }, + { + "epoch": 2.088979800562516, + "grad_norm": 69.8153500821161, + "learning_rate": 2.5561745862478894e-06, + "loss": 2.5135, + "step": 24510 + }, + { + "epoch": 2.0890650302565414, + "grad_norm": 58.32088857597975, + "learning_rate": 2.5557420129683817e-06, + "loss": 1.6124, + "step": 24511 + }, + { + "epoch": 2.089150259950567, + "grad_norm": 71.87298817282093, + "learning_rate": 2.5553094637271804e-06, + "loss": 2.1718, + "step": 24512 + }, + { + "epoch": 2.0892354896445924, + "grad_norm": 22.04168342315407, + "learning_rate": 2.5548769385285412e-06, + "loss": 1.2672, + "step": 24513 + }, + { + "epoch": 2.0893207193386174, + "grad_norm": 33.35431863478103, + "learning_rate": 2.554444437376715e-06, + "loss": 1.0263, + "step": 24514 + }, + { + "epoch": 2.089405949032643, + "grad_norm": 32.036413826913176, + "learning_rate": 2.554011960275957e-06, + "loss": 0.8864, + "step": 24515 + }, + { + "epoch": 2.0894911787266683, + "grad_norm": 37.816137228526124, + "learning_rate": 2.553579507230517e-06, + "loss": 1.1705, + "step": 24516 + }, + { + "epoch": 2.089576408420694, + "grad_norm": 24.131167967647638, + "learning_rate": 2.553147078244653e-06, + "loss": 1.0588, + "step": 24517 + }, + { + "epoch": 2.0896616381147193, + "grad_norm": 52.05184344217084, + "learning_rate": 2.5527146733226144e-06, + "loss": 1.2766, + "step": 24518 + }, + { + "epoch": 2.0897468678087447, + "grad_norm": 42.78258855732754, + "learning_rate": 2.552282292468653e-06, + "loss": 1.454, + "step": 24519 + }, + { + "epoch": 2.0898320975027698, + "grad_norm": 52.0164561914508, + "learning_rate": 2.5518499356870223e-06, + "loss": 2.0418, + "step": 24520 + }, + { + "epoch": 2.0899173271967952, + "grad_norm": 51.03300435425171, + "learning_rate": 2.551417602981977e-06, + "loss": 1.4065, + "step": 24521 + }, + { + "epoch": 2.0900025568908207, + "grad_norm": 63.65852049174253, + "learning_rate": 2.5509852943577664e-06, + "loss": 1.9176, + "step": 24522 + }, + { + "epoch": 2.090087786584846, + "grad_norm": 53.23201752363085, + "learning_rate": 2.550553009818643e-06, + "loss": 1.3015, + "step": 24523 + }, + { + "epoch": 2.0901730162788716, + "grad_norm": 58.73404186279134, + "learning_rate": 2.550120749368854e-06, + "loss": 2.1752, + "step": 24524 + }, + { + "epoch": 2.090258245972897, + "grad_norm": 48.36798181616651, + "learning_rate": 2.549688513012657e-06, + "loss": 1.3587, + "step": 24525 + }, + { + "epoch": 2.0903434756669226, + "grad_norm": 81.84256478998208, + "learning_rate": 2.549256300754299e-06, + "loss": 2.5472, + "step": 24526 + }, + { + "epoch": 2.0904287053609476, + "grad_norm": 45.68529138259924, + "learning_rate": 2.5488241125980305e-06, + "loss": 1.357, + "step": 24527 + }, + { + "epoch": 2.090513935054973, + "grad_norm": 80.65774224085017, + "learning_rate": 2.5483919485481024e-06, + "loss": 1.5273, + "step": 24528 + }, + { + "epoch": 2.0905991647489985, + "grad_norm": 29.698346747608543, + "learning_rate": 2.547959808608767e-06, + "loss": 0.8647, + "step": 24529 + }, + { + "epoch": 2.090684394443024, + "grad_norm": 21.40423976705108, + "learning_rate": 2.5475276927842727e-06, + "loss": 0.9463, + "step": 24530 + }, + { + "epoch": 2.0907696241370495, + "grad_norm": 50.581413649585436, + "learning_rate": 2.5470956010788695e-06, + "loss": 1.237, + "step": 24531 + }, + { + "epoch": 2.090854853831075, + "grad_norm": 65.6880778202369, + "learning_rate": 2.546663533496806e-06, + "loss": 1.9462, + "step": 24532 + }, + { + "epoch": 2.0909400835251, + "grad_norm": 28.455014163476623, + "learning_rate": 2.546231490042329e-06, + "loss": 1.2272, + "step": 24533 + }, + { + "epoch": 2.0910253132191254, + "grad_norm": 45.855207882893, + "learning_rate": 2.5457994707196932e-06, + "loss": 1.5235, + "step": 24534 + }, + { + "epoch": 2.091110542913151, + "grad_norm": 67.02153906724061, + "learning_rate": 2.5453674755331425e-06, + "loss": 1.1227, + "step": 24535 + }, + { + "epoch": 2.0911957726071764, + "grad_norm": 72.80805577325202, + "learning_rate": 2.5449355044869288e-06, + "loss": 1.7669, + "step": 24536 + }, + { + "epoch": 2.091281002301202, + "grad_norm": 20.83047965959244, + "learning_rate": 2.5445035575852973e-06, + "loss": 0.5576, + "step": 24537 + }, + { + "epoch": 2.0913662319952273, + "grad_norm": 26.422008367417135, + "learning_rate": 2.5440716348324997e-06, + "loss": 0.9876, + "step": 24538 + }, + { + "epoch": 2.0914514616892523, + "grad_norm": 36.44149716170368, + "learning_rate": 2.5436397362327814e-06, + "loss": 1.4585, + "step": 24539 + }, + { + "epoch": 2.091536691383278, + "grad_norm": 30.12469658290081, + "learning_rate": 2.54320786179039e-06, + "loss": 0.8502, + "step": 24540 + }, + { + "epoch": 2.0916219210773033, + "grad_norm": 53.274567583599286, + "learning_rate": 2.542776011509571e-06, + "loss": 2.1208, + "step": 24541 + }, + { + "epoch": 2.0917071507713287, + "grad_norm": 75.4686267315569, + "learning_rate": 2.542344185394576e-06, + "loss": 1.6507, + "step": 24542 + }, + { + "epoch": 2.091792380465354, + "grad_norm": 47.40043679822901, + "learning_rate": 2.5419123834496473e-06, + "loss": 1.1501, + "step": 24543 + }, + { + "epoch": 2.0918776101593797, + "grad_norm": 55.924984543072014, + "learning_rate": 2.5414806056790354e-06, + "loss": 1.6665, + "step": 24544 + }, + { + "epoch": 2.091962839853405, + "grad_norm": 42.26144905903023, + "learning_rate": 2.541048852086985e-06, + "loss": 1.5134, + "step": 24545 + }, + { + "epoch": 2.09204806954743, + "grad_norm": 48.596205118790145, + "learning_rate": 2.54061712267774e-06, + "loss": 1.3262, + "step": 24546 + }, + { + "epoch": 2.0921332992414556, + "grad_norm": 71.64335230812327, + "learning_rate": 2.5401854174555498e-06, + "loss": 1.9221, + "step": 24547 + }, + { + "epoch": 2.092218528935481, + "grad_norm": 67.13905639804372, + "learning_rate": 2.539753736424659e-06, + "loss": 1.6576, + "step": 24548 + }, + { + "epoch": 2.0923037586295066, + "grad_norm": 36.87228166116336, + "learning_rate": 2.5393220795893127e-06, + "loss": 1.8682, + "step": 24549 + }, + { + "epoch": 2.092388988323532, + "grad_norm": 40.10293219236336, + "learning_rate": 2.5388904469537534e-06, + "loss": 0.9258, + "step": 24550 + }, + { + "epoch": 2.0924742180175575, + "grad_norm": 30.544566888246745, + "learning_rate": 2.5384588385222296e-06, + "loss": 0.8601, + "step": 24551 + }, + { + "epoch": 2.0925594477115825, + "grad_norm": 68.45911919184023, + "learning_rate": 2.5380272542989858e-06, + "loss": 1.9795, + "step": 24552 + }, + { + "epoch": 2.092644677405608, + "grad_norm": 85.91762740117464, + "learning_rate": 2.537595694288267e-06, + "loss": 2.5228, + "step": 24553 + }, + { + "epoch": 2.0927299070996335, + "grad_norm": 46.440612854009174, + "learning_rate": 2.537164158494313e-06, + "loss": 1.6038, + "step": 24554 + }, + { + "epoch": 2.092815136793659, + "grad_norm": 45.41900415798398, + "learning_rate": 2.536732646921374e-06, + "loss": 1.7013, + "step": 24555 + }, + { + "epoch": 2.0929003664876844, + "grad_norm": 26.420702203256084, + "learning_rate": 2.5363011595736897e-06, + "loss": 1.1034, + "step": 24556 + }, + { + "epoch": 2.09298559618171, + "grad_norm": 59.1912427559955, + "learning_rate": 2.535869696455504e-06, + "loss": 1.6482, + "step": 24557 + }, + { + "epoch": 2.093070825875735, + "grad_norm": 64.21557619464328, + "learning_rate": 2.53543825757106e-06, + "loss": 1.296, + "step": 24558 + }, + { + "epoch": 2.0931560555697604, + "grad_norm": 62.044290016808915, + "learning_rate": 2.5350068429246003e-06, + "loss": 2.6017, + "step": 24559 + }, + { + "epoch": 2.093241285263786, + "grad_norm": 47.055225207583874, + "learning_rate": 2.5345754525203715e-06, + "loss": 1.5635, + "step": 24560 + }, + { + "epoch": 2.0933265149578113, + "grad_norm": 56.716276606412514, + "learning_rate": 2.534144086362613e-06, + "loss": 1.7911, + "step": 24561 + }, + { + "epoch": 2.0934117446518368, + "grad_norm": 66.48457732273333, + "learning_rate": 2.5337127444555682e-06, + "loss": 1.4657, + "step": 24562 + }, + { + "epoch": 2.0934969743458622, + "grad_norm": 35.7500779157301, + "learning_rate": 2.533281426803476e-06, + "loss": 1.3723, + "step": 24563 + }, + { + "epoch": 2.0935822040398877, + "grad_norm": 31.54781576491804, + "learning_rate": 2.5328501334105833e-06, + "loss": 1.0331, + "step": 24564 + }, + { + "epoch": 2.0936674337339127, + "grad_norm": 50.622675161285144, + "learning_rate": 2.532418864281129e-06, + "loss": 1.8107, + "step": 24565 + }, + { + "epoch": 2.093752663427938, + "grad_norm": 50.647317694107514, + "learning_rate": 2.5319876194193526e-06, + "loss": 1.8481, + "step": 24566 + }, + { + "epoch": 2.0938378931219637, + "grad_norm": 33.46455310618302, + "learning_rate": 2.5315563988294965e-06, + "loss": 1.3276, + "step": 24567 + }, + { + "epoch": 2.093923122815989, + "grad_norm": 44.7286965550445, + "learning_rate": 2.5311252025158057e-06, + "loss": 1.5299, + "step": 24568 + }, + { + "epoch": 2.0940083525100146, + "grad_norm": 69.4656090602899, + "learning_rate": 2.530694030482517e-06, + "loss": 1.0765, + "step": 24569 + }, + { + "epoch": 2.09409358220404, + "grad_norm": 51.80005528726464, + "learning_rate": 2.5302628827338706e-06, + "loss": 1.8055, + "step": 24570 + }, + { + "epoch": 2.094178811898065, + "grad_norm": 31.768741239051053, + "learning_rate": 2.5298317592741062e-06, + "loss": 0.6869, + "step": 24571 + }, + { + "epoch": 2.0942640415920906, + "grad_norm": 63.67986273237699, + "learning_rate": 2.5294006601074666e-06, + "loss": 2.1533, + "step": 24572 + }, + { + "epoch": 2.094349271286116, + "grad_norm": 55.16186608022586, + "learning_rate": 2.5289695852381903e-06, + "loss": 1.3667, + "step": 24573 + }, + { + "epoch": 2.0944345009801415, + "grad_norm": 51.43963476525313, + "learning_rate": 2.5285385346705132e-06, + "loss": 1.6334, + "step": 24574 + }, + { + "epoch": 2.094519730674167, + "grad_norm": 35.62841534881346, + "learning_rate": 2.5281075084086804e-06, + "loss": 1.2527, + "step": 24575 + }, + { + "epoch": 2.0946049603681924, + "grad_norm": 61.39463756355308, + "learning_rate": 2.5276765064569252e-06, + "loss": 1.4636, + "step": 24576 + }, + { + "epoch": 2.0946901900622175, + "grad_norm": 55.964758463529115, + "learning_rate": 2.5272455288194918e-06, + "loss": 1.4711, + "step": 24577 + }, + { + "epoch": 2.094775419756243, + "grad_norm": 47.10966464212273, + "learning_rate": 2.5268145755006157e-06, + "loss": 0.9891, + "step": 24578 + }, + { + "epoch": 2.0948606494502684, + "grad_norm": 60.12740305914837, + "learning_rate": 2.526383646504536e-06, + "loss": 1.4536, + "step": 24579 + }, + { + "epoch": 2.094945879144294, + "grad_norm": 29.439938494679463, + "learning_rate": 2.525952741835488e-06, + "loss": 1.4672, + "step": 24580 + }, + { + "epoch": 2.0950311088383193, + "grad_norm": 57.52677084653969, + "learning_rate": 2.525521861497714e-06, + "loss": 2.1046, + "step": 24581 + }, + { + "epoch": 2.095116338532345, + "grad_norm": 57.56455647615876, + "learning_rate": 2.525091005495447e-06, + "loss": 1.494, + "step": 24582 + }, + { + "epoch": 2.0952015682263703, + "grad_norm": 72.61559304393958, + "learning_rate": 2.524660173832928e-06, + "loss": 1.2241, + "step": 24583 + }, + { + "epoch": 2.0952867979203953, + "grad_norm": 28.891710703328442, + "learning_rate": 2.524229366514391e-06, + "loss": 1.1366, + "step": 24584 + }, + { + "epoch": 2.0953720276144208, + "grad_norm": 47.29771982959775, + "learning_rate": 2.5237985835440763e-06, + "loss": 1.4305, + "step": 24585 + }, + { + "epoch": 2.0954572573084462, + "grad_norm": 54.275792247148026, + "learning_rate": 2.5233678249262183e-06, + "loss": 1.7629, + "step": 24586 + }, + { + "epoch": 2.0955424870024717, + "grad_norm": 77.42167975448412, + "learning_rate": 2.522937090665054e-06, + "loss": 2.0273, + "step": 24587 + }, + { + "epoch": 2.095627716696497, + "grad_norm": 46.31532298807526, + "learning_rate": 2.5225063807648198e-06, + "loss": 1.9517, + "step": 24588 + }, + { + "epoch": 2.0957129463905226, + "grad_norm": 64.34229227305705, + "learning_rate": 2.5220756952297475e-06, + "loss": 2.0295, + "step": 24589 + }, + { + "epoch": 2.0957981760845477, + "grad_norm": 36.08290831284929, + "learning_rate": 2.5216450340640774e-06, + "loss": 1.3792, + "step": 24590 + }, + { + "epoch": 2.095883405778573, + "grad_norm": 71.73847998812235, + "learning_rate": 2.5212143972720447e-06, + "loss": 1.792, + "step": 24591 + }, + { + "epoch": 2.0959686354725986, + "grad_norm": 33.28418809492231, + "learning_rate": 2.5207837848578844e-06, + "loss": 1.2607, + "step": 24592 + }, + { + "epoch": 2.096053865166624, + "grad_norm": 25.205173352021262, + "learning_rate": 2.5203531968258276e-06, + "loss": 1.6424, + "step": 24593 + }, + { + "epoch": 2.0961390948606495, + "grad_norm": 44.8263741768168, + "learning_rate": 2.519922633180114e-06, + "loss": 1.7619, + "step": 24594 + }, + { + "epoch": 2.096224324554675, + "grad_norm": 69.9973688611497, + "learning_rate": 2.5194920939249754e-06, + "loss": 1.5865, + "step": 24595 + }, + { + "epoch": 2.0963095542487, + "grad_norm": 41.15958197644422, + "learning_rate": 2.5190615790646466e-06, + "loss": 1.0597, + "step": 24596 + }, + { + "epoch": 2.0963947839427255, + "grad_norm": 77.60755963806442, + "learning_rate": 2.51863108860336e-06, + "loss": 1.9305, + "step": 24597 + }, + { + "epoch": 2.096480013636751, + "grad_norm": 75.9129336982751, + "learning_rate": 2.5182006225453505e-06, + "loss": 1.2708, + "step": 24598 + }, + { + "epoch": 2.0965652433307764, + "grad_norm": 47.96611685013229, + "learning_rate": 2.5177701808948527e-06, + "loss": 0.8547, + "step": 24599 + }, + { + "epoch": 2.096650473024802, + "grad_norm": 45.209509496882454, + "learning_rate": 2.5173397636561e-06, + "loss": 1.7132, + "step": 24600 + }, + { + "epoch": 2.0967357027188274, + "grad_norm": 23.730670853714418, + "learning_rate": 2.5169093708333215e-06, + "loss": 1.0443, + "step": 24601 + }, + { + "epoch": 2.096820932412853, + "grad_norm": 51.67809131775776, + "learning_rate": 2.516479002430755e-06, + "loss": 1.8879, + "step": 24602 + }, + { + "epoch": 2.096906162106878, + "grad_norm": 44.71356776114171, + "learning_rate": 2.51604865845263e-06, + "loss": 1.1498, + "step": 24603 + }, + { + "epoch": 2.0969913918009033, + "grad_norm": 89.34197987968348, + "learning_rate": 2.51561833890318e-06, + "loss": 1.5955, + "step": 24604 + }, + { + "epoch": 2.097076621494929, + "grad_norm": 57.025859716868375, + "learning_rate": 2.515188043786635e-06, + "loss": 1.5348, + "step": 24605 + }, + { + "epoch": 2.0971618511889543, + "grad_norm": 36.33250797610631, + "learning_rate": 2.514757773107227e-06, + "loss": 1.4163, + "step": 24606 + }, + { + "epoch": 2.0972470808829797, + "grad_norm": 55.3120403995965, + "learning_rate": 2.5143275268691912e-06, + "loss": 1.5968, + "step": 24607 + }, + { + "epoch": 2.097332310577005, + "grad_norm": 34.14247250373388, + "learning_rate": 2.513897305076757e-06, + "loss": 0.7447, + "step": 24608 + }, + { + "epoch": 2.0974175402710302, + "grad_norm": 50.53705121417978, + "learning_rate": 2.5134671077341543e-06, + "loss": 1.4501, + "step": 24609 + }, + { + "epoch": 2.0975027699650557, + "grad_norm": 24.95817343081069, + "learning_rate": 2.5130369348456127e-06, + "loss": 0.7813, + "step": 24610 + }, + { + "epoch": 2.097587999659081, + "grad_norm": 57.95619504022327, + "learning_rate": 2.512606786415367e-06, + "loss": 2.0804, + "step": 24611 + }, + { + "epoch": 2.0976732293531066, + "grad_norm": 61.968932372302106, + "learning_rate": 2.512176662447645e-06, + "loss": 1.6955, + "step": 24612 + }, + { + "epoch": 2.097758459047132, + "grad_norm": 69.80584636797026, + "learning_rate": 2.511746562946675e-06, + "loss": 2.0444, + "step": 24613 + }, + { + "epoch": 2.0978436887411576, + "grad_norm": 30.513242151117332, + "learning_rate": 2.5113164879166885e-06, + "loss": 0.9705, + "step": 24614 + }, + { + "epoch": 2.097928918435183, + "grad_norm": 49.585883302700246, + "learning_rate": 2.510886437361918e-06, + "loss": 1.5756, + "step": 24615 + }, + { + "epoch": 2.098014148129208, + "grad_norm": 19.915757341810654, + "learning_rate": 2.5104564112865893e-06, + "loss": 1.1802, + "step": 24616 + }, + { + "epoch": 2.0980993778232335, + "grad_norm": 64.02449882965655, + "learning_rate": 2.5100264096949334e-06, + "loss": 1.7722, + "step": 24617 + }, + { + "epoch": 2.098184607517259, + "grad_norm": 48.2091570668836, + "learning_rate": 2.509596432591178e-06, + "loss": 1.908, + "step": 24618 + }, + { + "epoch": 2.0982698372112845, + "grad_norm": 29.49015612739709, + "learning_rate": 2.50916647997955e-06, + "loss": 1.6242, + "step": 24619 + }, + { + "epoch": 2.09835506690531, + "grad_norm": 77.02399282671036, + "learning_rate": 2.5087365518642827e-06, + "loss": 2.3915, + "step": 24620 + }, + { + "epoch": 2.0984402965993354, + "grad_norm": 54.31137174948541, + "learning_rate": 2.5083066482495988e-06, + "loss": 1.0653, + "step": 24621 + }, + { + "epoch": 2.0985255262933604, + "grad_norm": 63.69158251261039, + "learning_rate": 2.507876769139731e-06, + "loss": 1.6837, + "step": 24622 + }, + { + "epoch": 2.098610755987386, + "grad_norm": 39.150465349633855, + "learning_rate": 2.507446914538903e-06, + "loss": 1.2916, + "step": 24623 + }, + { + "epoch": 2.0986959856814114, + "grad_norm": 55.69155515503951, + "learning_rate": 2.5070170844513458e-06, + "loss": 1.7453, + "step": 24624 + }, + { + "epoch": 2.098781215375437, + "grad_norm": 65.54474973134316, + "learning_rate": 2.506587278881285e-06, + "loss": 2.3374, + "step": 24625 + }, + { + "epoch": 2.0988664450694623, + "grad_norm": 55.001344384345245, + "learning_rate": 2.506157497832948e-06, + "loss": 1.6856, + "step": 24626 + }, + { + "epoch": 2.0989516747634878, + "grad_norm": 47.91805399784356, + "learning_rate": 2.5057277413105584e-06, + "loss": 1.9408, + "step": 24627 + }, + { + "epoch": 2.099036904457513, + "grad_norm": 146.42838432461213, + "learning_rate": 2.5052980093183483e-06, + "loss": 4.0651, + "step": 24628 + }, + { + "epoch": 2.0991221341515383, + "grad_norm": 36.985045476834664, + "learning_rate": 2.504868301860538e-06, + "loss": 0.9802, + "step": 24629 + }, + { + "epoch": 2.0992073638455637, + "grad_norm": 56.908867154470215, + "learning_rate": 2.5044386189413596e-06, + "loss": 1.731, + "step": 24630 + }, + { + "epoch": 2.099292593539589, + "grad_norm": 61.299608617918906, + "learning_rate": 2.504008960565035e-06, + "loss": 1.3322, + "step": 24631 + }, + { + "epoch": 2.0993778232336147, + "grad_norm": 56.40983365746965, + "learning_rate": 2.503579326735788e-06, + "loss": 1.5219, + "step": 24632 + }, + { + "epoch": 2.09946305292764, + "grad_norm": 42.22852669977444, + "learning_rate": 2.503149717457849e-06, + "loss": 1.712, + "step": 24633 + }, + { + "epoch": 2.0995482826216656, + "grad_norm": 47.643281677860664, + "learning_rate": 2.50272013273544e-06, + "loss": 1.0807, + "step": 24634 + }, + { + "epoch": 2.0996335123156906, + "grad_norm": 50.01851069966976, + "learning_rate": 2.5022905725727864e-06, + "loss": 2.2115, + "step": 24635 + }, + { + "epoch": 2.099718742009716, + "grad_norm": 21.942207847365182, + "learning_rate": 2.501861036974111e-06, + "loss": 0.6632, + "step": 24636 + }, + { + "epoch": 2.0998039717037416, + "grad_norm": 58.45134378472193, + "learning_rate": 2.501431525943639e-06, + "loss": 1.5388, + "step": 24637 + }, + { + "epoch": 2.099889201397767, + "grad_norm": 49.582377244416776, + "learning_rate": 2.5010020394855974e-06, + "loss": 2.1499, + "step": 24638 + }, + { + "epoch": 2.0999744310917925, + "grad_norm": 83.22768620755834, + "learning_rate": 2.500572577604208e-06, + "loss": 1.8998, + "step": 24639 + }, + { + "epoch": 2.100059660785818, + "grad_norm": 113.45847616216719, + "learning_rate": 2.500143140303691e-06, + "loss": 1.8781, + "step": 24640 + }, + { + "epoch": 2.100144890479843, + "grad_norm": 28.74012069229304, + "learning_rate": 2.499713727588276e-06, + "loss": 0.8709, + "step": 24641 + }, + { + "epoch": 2.1002301201738685, + "grad_norm": 90.27744381686952, + "learning_rate": 2.4992843394621822e-06, + "loss": 2.059, + "step": 24642 + }, + { + "epoch": 2.100315349867894, + "grad_norm": 97.55070503867373, + "learning_rate": 2.4988549759296336e-06, + "loss": 2.3141, + "step": 24643 + }, + { + "epoch": 2.1004005795619194, + "grad_norm": 70.11615495529882, + "learning_rate": 2.4984256369948507e-06, + "loss": 1.96, + "step": 24644 + }, + { + "epoch": 2.100485809255945, + "grad_norm": 29.255709721125946, + "learning_rate": 2.4979963226620575e-06, + "loss": 1.4542, + "step": 24645 + }, + { + "epoch": 2.1005710389499703, + "grad_norm": 53.24725054648712, + "learning_rate": 2.497567032935478e-06, + "loss": 1.6325, + "step": 24646 + }, + { + "epoch": 2.100656268643996, + "grad_norm": 24.990961857070538, + "learning_rate": 2.4971377678193325e-06, + "loss": 1.0326, + "step": 24647 + }, + { + "epoch": 2.100741498338021, + "grad_norm": 64.81794373490409, + "learning_rate": 2.4967085273178426e-06, + "loss": 1.1274, + "step": 24648 + }, + { + "epoch": 2.1008267280320463, + "grad_norm": 67.97492792806771, + "learning_rate": 2.4962793114352273e-06, + "loss": 1.8349, + "step": 24649 + }, + { + "epoch": 2.1009119577260718, + "grad_norm": 78.12897454920842, + "learning_rate": 2.4958501201757128e-06, + "loss": 1.3315, + "step": 24650 + }, + { + "epoch": 2.1009971874200972, + "grad_norm": 45.932738046211874, + "learning_rate": 2.495420953543517e-06, + "loss": 0.9358, + "step": 24651 + }, + { + "epoch": 2.1010824171141227, + "grad_norm": 42.24656727984062, + "learning_rate": 2.494991811542858e-06, + "loss": 1.0717, + "step": 24652 + }, + { + "epoch": 2.101167646808148, + "grad_norm": 45.415783844139874, + "learning_rate": 2.4945626941779606e-06, + "loss": 1.2734, + "step": 24653 + }, + { + "epoch": 2.101252876502173, + "grad_norm": 66.3679158705404, + "learning_rate": 2.4941336014530447e-06, + "loss": 1.4319, + "step": 24654 + }, + { + "epoch": 2.1013381061961987, + "grad_norm": 46.1056423413364, + "learning_rate": 2.493704533372329e-06, + "loss": 1.7586, + "step": 24655 + }, + { + "epoch": 2.101423335890224, + "grad_norm": 35.777092727624876, + "learning_rate": 2.4932754899400336e-06, + "loss": 1.0133, + "step": 24656 + }, + { + "epoch": 2.1015085655842496, + "grad_norm": 32.5363068312025, + "learning_rate": 2.492846471160376e-06, + "loss": 0.9431, + "step": 24657 + }, + { + "epoch": 2.101593795278275, + "grad_norm": 48.08193190391596, + "learning_rate": 2.4924174770375784e-06, + "loss": 1.464, + "step": 24658 + }, + { + "epoch": 2.1016790249723005, + "grad_norm": 24.60587660710838, + "learning_rate": 2.491988507575859e-06, + "loss": 0.9401, + "step": 24659 + }, + { + "epoch": 2.1017642546663255, + "grad_norm": 40.290850209432215, + "learning_rate": 2.4915595627794343e-06, + "loss": 0.7348, + "step": 24660 + }, + { + "epoch": 2.101849484360351, + "grad_norm": 47.63470142898709, + "learning_rate": 2.4911306426525255e-06, + "loss": 1.5085, + "step": 24661 + }, + { + "epoch": 2.1019347140543765, + "grad_norm": 85.66082519722661, + "learning_rate": 2.490701747199349e-06, + "loss": 2.3423, + "step": 24662 + }, + { + "epoch": 2.102019943748402, + "grad_norm": 36.57735857986586, + "learning_rate": 2.490272876424125e-06, + "loss": 1.0289, + "step": 24663 + }, + { + "epoch": 2.1021051734424274, + "grad_norm": 28.80698024546869, + "learning_rate": 2.4898440303310706e-06, + "loss": 1.2215, + "step": 24664 + }, + { + "epoch": 2.102190403136453, + "grad_norm": 63.648197305516966, + "learning_rate": 2.489415208924402e-06, + "loss": 1.8443, + "step": 24665 + }, + { + "epoch": 2.1022756328304784, + "grad_norm": 50.536934596544555, + "learning_rate": 2.488986412208336e-06, + "loss": 2.2601, + "step": 24666 + }, + { + "epoch": 2.1023608625245034, + "grad_norm": 50.61139364590621, + "learning_rate": 2.488557640187092e-06, + "loss": 1.872, + "step": 24667 + }, + { + "epoch": 2.102446092218529, + "grad_norm": 63.337449058580745, + "learning_rate": 2.4881288928648843e-06, + "loss": 1.7484, + "step": 24668 + }, + { + "epoch": 2.1025313219125543, + "grad_norm": 37.6980561793004, + "learning_rate": 2.4877001702459325e-06, + "loss": 1.178, + "step": 24669 + }, + { + "epoch": 2.10261655160658, + "grad_norm": 79.16056245688593, + "learning_rate": 2.4872714723344494e-06, + "loss": 1.7893, + "step": 24670 + }, + { + "epoch": 2.1027017813006053, + "grad_norm": 85.8856820308746, + "learning_rate": 2.486842799134655e-06, + "loss": 2.3281, + "step": 24671 + }, + { + "epoch": 2.1027870109946307, + "grad_norm": 33.315224695010265, + "learning_rate": 2.4864141506507626e-06, + "loss": 1.5723, + "step": 24672 + }, + { + "epoch": 2.1028722406886557, + "grad_norm": 66.04785235651309, + "learning_rate": 2.4859855268869886e-06, + "loss": 2.0378, + "step": 24673 + }, + { + "epoch": 2.102957470382681, + "grad_norm": 62.24865723590931, + "learning_rate": 2.485556927847546e-06, + "loss": 1.7605, + "step": 24674 + }, + { + "epoch": 2.1030427000767067, + "grad_norm": 49.742377898504785, + "learning_rate": 2.485128353536654e-06, + "loss": 1.8246, + "step": 24675 + }, + { + "epoch": 2.103127929770732, + "grad_norm": 33.254777036278064, + "learning_rate": 2.4846998039585227e-06, + "loss": 1.2167, + "step": 24676 + }, + { + "epoch": 2.1032131594647576, + "grad_norm": 53.11675211078985, + "learning_rate": 2.4842712791173723e-06, + "loss": 1.727, + "step": 24677 + }, + { + "epoch": 2.103298389158783, + "grad_norm": 43.33684090315331, + "learning_rate": 2.4838427790174134e-06, + "loss": 1.5489, + "step": 24678 + }, + { + "epoch": 2.103383618852808, + "grad_norm": 41.37197734666781, + "learning_rate": 2.4834143036628593e-06, + "loss": 1.2035, + "step": 24679 + }, + { + "epoch": 2.1034688485468336, + "grad_norm": 51.036121178022206, + "learning_rate": 2.4829858530579278e-06, + "loss": 1.8354, + "step": 24680 + }, + { + "epoch": 2.103554078240859, + "grad_norm": 36.839498644907685, + "learning_rate": 2.4825574272068298e-06, + "loss": 1.1743, + "step": 24681 + }, + { + "epoch": 2.1036393079348845, + "grad_norm": 56.688544581204255, + "learning_rate": 2.4821290261137794e-06, + "loss": 1.274, + "step": 24682 + }, + { + "epoch": 2.10372453762891, + "grad_norm": 46.56561026526224, + "learning_rate": 2.481700649782987e-06, + "loss": 1.465, + "step": 24683 + }, + { + "epoch": 2.1038097673229355, + "grad_norm": 91.21514484857896, + "learning_rate": 2.4812722982186687e-06, + "loss": 1.5189, + "step": 24684 + }, + { + "epoch": 2.103894997016961, + "grad_norm": 44.254463991280716, + "learning_rate": 2.480843971425038e-06, + "loss": 1.3844, + "step": 24685 + }, + { + "epoch": 2.103980226710986, + "grad_norm": 52.503046891827175, + "learning_rate": 2.4804156694063064e-06, + "loss": 1.0263, + "step": 24686 + }, + { + "epoch": 2.1040654564050114, + "grad_norm": 53.676274616026824, + "learning_rate": 2.479987392166683e-06, + "loss": 2.1272, + "step": 24687 + }, + { + "epoch": 2.104150686099037, + "grad_norm": 40.987822058375656, + "learning_rate": 2.4795591397103846e-06, + "loss": 1.7515, + "step": 24688 + }, + { + "epoch": 2.1042359157930624, + "grad_norm": 47.3755064102939, + "learning_rate": 2.4791309120416203e-06, + "loss": 1.3895, + "step": 24689 + }, + { + "epoch": 2.104321145487088, + "grad_norm": 53.801433213573304, + "learning_rate": 2.4787027091646014e-06, + "loss": 2.0156, + "step": 24690 + }, + { + "epoch": 2.1044063751811133, + "grad_norm": 36.58827264571654, + "learning_rate": 2.4782745310835383e-06, + "loss": 1.5836, + "step": 24691 + }, + { + "epoch": 2.1044916048751383, + "grad_norm": 27.777891853482846, + "learning_rate": 2.4778463778026427e-06, + "loss": 1.1208, + "step": 24692 + }, + { + "epoch": 2.104576834569164, + "grad_norm": 51.74804020492596, + "learning_rate": 2.477418249326128e-06, + "loss": 1.8844, + "step": 24693 + }, + { + "epoch": 2.1046620642631892, + "grad_norm": 26.99733073872224, + "learning_rate": 2.4769901456582025e-06, + "loss": 0.9774, + "step": 24694 + }, + { + "epoch": 2.1047472939572147, + "grad_norm": 58.39273319708895, + "learning_rate": 2.476562066803076e-06, + "loss": 2.1999, + "step": 24695 + }, + { + "epoch": 2.10483252365124, + "grad_norm": 39.572008829980255, + "learning_rate": 2.4761340127649568e-06, + "loss": 1.8663, + "step": 24696 + }, + { + "epoch": 2.1049177533452657, + "grad_norm": 21.609889431735347, + "learning_rate": 2.475705983548059e-06, + "loss": 0.5388, + "step": 24697 + }, + { + "epoch": 2.1050029830392907, + "grad_norm": 49.45876475977659, + "learning_rate": 2.4752779791565902e-06, + "loss": 1.8107, + "step": 24698 + }, + { + "epoch": 2.105088212733316, + "grad_norm": 33.345802618988, + "learning_rate": 2.4748499995947567e-06, + "loss": 0.9162, + "step": 24699 + }, + { + "epoch": 2.1051734424273416, + "grad_norm": 55.4439357424659, + "learning_rate": 2.47442204486677e-06, + "loss": 2.2789, + "step": 24700 + }, + { + "epoch": 2.105258672121367, + "grad_norm": 32.285949765148466, + "learning_rate": 2.473994114976841e-06, + "loss": 1.297, + "step": 24701 + }, + { + "epoch": 2.1053439018153925, + "grad_norm": 22.31904223818635, + "learning_rate": 2.473566209929176e-06, + "loss": 0.7798, + "step": 24702 + }, + { + "epoch": 2.105429131509418, + "grad_norm": 35.59078346531205, + "learning_rate": 2.473138329727984e-06, + "loss": 1.0141, + "step": 24703 + }, + { + "epoch": 2.1055143612034435, + "grad_norm": 34.52154268183764, + "learning_rate": 2.4727104743774715e-06, + "loss": 1.5793, + "step": 24704 + }, + { + "epoch": 2.1055995908974685, + "grad_norm": 21.71310016688149, + "learning_rate": 2.472282643881846e-06, + "loss": 0.8855, + "step": 24705 + }, + { + "epoch": 2.105684820591494, + "grad_norm": 53.271459388892374, + "learning_rate": 2.4718548382453184e-06, + "loss": 1.2181, + "step": 24706 + }, + { + "epoch": 2.1057700502855194, + "grad_norm": 17.942866200595834, + "learning_rate": 2.4714270574720917e-06, + "loss": 1.0037, + "step": 24707 + }, + { + "epoch": 2.105855279979545, + "grad_norm": 83.87871390099504, + "learning_rate": 2.470999301566377e-06, + "loss": 2.1588, + "step": 24708 + }, + { + "epoch": 2.1059405096735704, + "grad_norm": 32.68560545281613, + "learning_rate": 2.470571570532377e-06, + "loss": 1.7087, + "step": 24709 + }, + { + "epoch": 2.106025739367596, + "grad_norm": 55.15630481647521, + "learning_rate": 2.4701438643743023e-06, + "loss": 1.9746, + "step": 24710 + }, + { + "epoch": 2.106110969061621, + "grad_norm": 29.09833847514921, + "learning_rate": 2.4697161830963582e-06, + "loss": 0.7443, + "step": 24711 + }, + { + "epoch": 2.1061961987556463, + "grad_norm": 45.957209571834085, + "learning_rate": 2.46928852670275e-06, + "loss": 1.9708, + "step": 24712 + }, + { + "epoch": 2.106281428449672, + "grad_norm": 44.13475520520131, + "learning_rate": 2.4688608951976813e-06, + "loss": 1.476, + "step": 24713 + }, + { + "epoch": 2.1063666581436973, + "grad_norm": 89.6734895823385, + "learning_rate": 2.468433288585361e-06, + "loss": 2.3512, + "step": 24714 + }, + { + "epoch": 2.1064518878377227, + "grad_norm": 47.06816862109832, + "learning_rate": 2.4680057068699927e-06, + "loss": 1.6765, + "step": 24715 + }, + { + "epoch": 2.106537117531748, + "grad_norm": 33.371268496712275, + "learning_rate": 2.467578150055784e-06, + "loss": 1.5021, + "step": 24716 + }, + { + "epoch": 2.1066223472257732, + "grad_norm": 37.24707986236889, + "learning_rate": 2.4671506181469353e-06, + "loss": 0.8175, + "step": 24717 + }, + { + "epoch": 2.1067075769197987, + "grad_norm": 36.339930179121964, + "learning_rate": 2.4667231111476557e-06, + "loss": 1.3318, + "step": 24718 + }, + { + "epoch": 2.106792806613824, + "grad_norm": 67.4021017093105, + "learning_rate": 2.4662956290621482e-06, + "loss": 2.2532, + "step": 24719 + }, + { + "epoch": 2.1068780363078496, + "grad_norm": 26.383634869195767, + "learning_rate": 2.4658681718946165e-06, + "loss": 0.8876, + "step": 24720 + }, + { + "epoch": 2.106963266001875, + "grad_norm": 30.913612681155772, + "learning_rate": 2.465440739649264e-06, + "loss": 0.9758, + "step": 24721 + }, + { + "epoch": 2.1070484956959006, + "grad_norm": 40.33051280008256, + "learning_rate": 2.4650133323302926e-06, + "loss": 1.079, + "step": 24722 + }, + { + "epoch": 2.107133725389926, + "grad_norm": 21.248646389758193, + "learning_rate": 2.4645859499419085e-06, + "loss": 0.6149, + "step": 24723 + }, + { + "epoch": 2.107218955083951, + "grad_norm": 31.474165682063077, + "learning_rate": 2.4641585924883155e-06, + "loss": 1.3411, + "step": 24724 + }, + { + "epoch": 2.1073041847779765, + "grad_norm": 59.61393521216628, + "learning_rate": 2.463731259973715e-06, + "loss": 2.2067, + "step": 24725 + }, + { + "epoch": 2.107389414472002, + "grad_norm": 34.23311874521355, + "learning_rate": 2.4633039524023078e-06, + "loss": 1.673, + "step": 24726 + }, + { + "epoch": 2.1074746441660275, + "grad_norm": 100.1175789614173, + "learning_rate": 2.4628766697783006e-06, + "loss": 1.4537, + "step": 24727 + }, + { + "epoch": 2.107559873860053, + "grad_norm": 41.64490260575922, + "learning_rate": 2.4624494121058927e-06, + "loss": 1.7006, + "step": 24728 + }, + { + "epoch": 2.1076451035540784, + "grad_norm": 43.741911392651346, + "learning_rate": 2.4620221793892866e-06, + "loss": 1.4146, + "step": 24729 + }, + { + "epoch": 2.1077303332481034, + "grad_norm": 72.12189252310819, + "learning_rate": 2.4615949716326827e-06, + "loss": 1.2448, + "step": 24730 + }, + { + "epoch": 2.107815562942129, + "grad_norm": 69.69606825309971, + "learning_rate": 2.461167788840283e-06, + "loss": 2.3959, + "step": 24731 + }, + { + "epoch": 2.1079007926361544, + "grad_norm": 47.134178766571694, + "learning_rate": 2.4607406310162907e-06, + "loss": 1.6455, + "step": 24732 + }, + { + "epoch": 2.10798602233018, + "grad_norm": 64.26312634112656, + "learning_rate": 2.460313498164906e-06, + "loss": 2.0721, + "step": 24733 + }, + { + "epoch": 2.1080712520242053, + "grad_norm": 32.18459159276323, + "learning_rate": 2.4598863902903285e-06, + "loss": 1.0812, + "step": 24734 + }, + { + "epoch": 2.108156481718231, + "grad_norm": 64.25621248929713, + "learning_rate": 2.4594593073967572e-06, + "loss": 1.3298, + "step": 24735 + }, + { + "epoch": 2.108241711412256, + "grad_norm": 54.427673004801605, + "learning_rate": 2.4590322494883963e-06, + "loss": 1.1891, + "step": 24736 + }, + { + "epoch": 2.1083269411062813, + "grad_norm": 42.55106745698149, + "learning_rate": 2.458605216569443e-06, + "loss": 1.5382, + "step": 24737 + }, + { + "epoch": 2.1084121708003067, + "grad_norm": 44.111039694474854, + "learning_rate": 2.458178208644096e-06, + "loss": 1.2882, + "step": 24738 + }, + { + "epoch": 2.108497400494332, + "grad_norm": 33.98774966537068, + "learning_rate": 2.4577512257165564e-06, + "loss": 1.3504, + "step": 24739 + }, + { + "epoch": 2.1085826301883577, + "grad_norm": 63.59511081942382, + "learning_rate": 2.457324267791025e-06, + "loss": 2.5834, + "step": 24740 + }, + { + "epoch": 2.108667859882383, + "grad_norm": 82.05533807823113, + "learning_rate": 2.456897334871699e-06, + "loss": 1.5451, + "step": 24741 + }, + { + "epoch": 2.1087530895764086, + "grad_norm": 66.64166414739998, + "learning_rate": 2.4564704269627777e-06, + "loss": 2.0984, + "step": 24742 + }, + { + "epoch": 2.1088383192704336, + "grad_norm": 40.86560390930327, + "learning_rate": 2.4560435440684566e-06, + "loss": 1.6077, + "step": 24743 + }, + { + "epoch": 2.108923548964459, + "grad_norm": 30.313830179382485, + "learning_rate": 2.455616686192938e-06, + "loss": 1.3647, + "step": 24744 + }, + { + "epoch": 2.1090087786584846, + "grad_norm": 37.59214192790226, + "learning_rate": 2.455189853340419e-06, + "loss": 1.0163, + "step": 24745 + }, + { + "epoch": 2.10909400835251, + "grad_norm": 69.71485533149188, + "learning_rate": 2.4547630455150937e-06, + "loss": 1.8761, + "step": 24746 + }, + { + "epoch": 2.1091792380465355, + "grad_norm": 24.87178418853143, + "learning_rate": 2.454336262721164e-06, + "loss": 0.9065, + "step": 24747 + }, + { + "epoch": 2.109264467740561, + "grad_norm": 29.583902273438984, + "learning_rate": 2.4539095049628263e-06, + "loss": 1.3636, + "step": 24748 + }, + { + "epoch": 2.109349697434586, + "grad_norm": 23.53322315941705, + "learning_rate": 2.453482772244277e-06, + "loss": 0.9219, + "step": 24749 + }, + { + "epoch": 2.1094349271286115, + "grad_norm": 70.60238849735678, + "learning_rate": 2.4530560645697132e-06, + "loss": 2.4103, + "step": 24750 + }, + { + "epoch": 2.109520156822637, + "grad_norm": 21.63659514175168, + "learning_rate": 2.4526293819433306e-06, + "loss": 0.7823, + "step": 24751 + }, + { + "epoch": 2.1096053865166624, + "grad_norm": 18.56580465062032, + "learning_rate": 2.4522027243693243e-06, + "loss": 0.7498, + "step": 24752 + }, + { + "epoch": 2.109690616210688, + "grad_norm": 39.66119529968273, + "learning_rate": 2.451776091851893e-06, + "loss": 1.2049, + "step": 24753 + }, + { + "epoch": 2.1097758459047133, + "grad_norm": 54.329920563282265, + "learning_rate": 2.4513494843952303e-06, + "loss": 1.4912, + "step": 24754 + }, + { + "epoch": 2.109861075598739, + "grad_norm": 56.77120542471559, + "learning_rate": 2.4509229020035334e-06, + "loss": 1.6157, + "step": 24755 + }, + { + "epoch": 2.109946305292764, + "grad_norm": 59.6103288927425, + "learning_rate": 2.450496344680996e-06, + "loss": 1.6775, + "step": 24756 + }, + { + "epoch": 2.1100315349867893, + "grad_norm": 42.48589998699736, + "learning_rate": 2.4500698124318155e-06, + "loss": 1.0476, + "step": 24757 + }, + { + "epoch": 2.1101167646808148, + "grad_norm": 39.17005707035793, + "learning_rate": 2.4496433052601853e-06, + "loss": 1.4373, + "step": 24758 + }, + { + "epoch": 2.1102019943748402, + "grad_norm": 23.2551105850935, + "learning_rate": 2.4492168231702997e-06, + "loss": 0.7053, + "step": 24759 + }, + { + "epoch": 2.1102872240688657, + "grad_norm": 101.52945187149396, + "learning_rate": 2.448790366166351e-06, + "loss": 2.5429, + "step": 24760 + }, + { + "epoch": 2.110372453762891, + "grad_norm": 43.71283902435734, + "learning_rate": 2.4483639342525373e-06, + "loss": 1.2228, + "step": 24761 + }, + { + "epoch": 2.110457683456916, + "grad_norm": 53.91130279434824, + "learning_rate": 2.4479375274330487e-06, + "loss": 1.4175, + "step": 24762 + }, + { + "epoch": 2.1105429131509417, + "grad_norm": 23.49014822197637, + "learning_rate": 2.447511145712082e-06, + "loss": 1.1672, + "step": 24763 + }, + { + "epoch": 2.110628142844967, + "grad_norm": 61.492741345166735, + "learning_rate": 2.447084789093829e-06, + "loss": 1.5316, + "step": 24764 + }, + { + "epoch": 2.1107133725389926, + "grad_norm": 53.372321336009, + "learning_rate": 2.446658457582481e-06, + "loss": 1.4947, + "step": 24765 + }, + { + "epoch": 2.110798602233018, + "grad_norm": 27.42462250707674, + "learning_rate": 2.446232151182234e-06, + "loss": 0.6315, + "step": 24766 + }, + { + "epoch": 2.1108838319270435, + "grad_norm": 36.85333040010558, + "learning_rate": 2.4458058698972793e-06, + "loss": 1.7252, + "step": 24767 + }, + { + "epoch": 2.1109690616210686, + "grad_norm": 40.16072092188884, + "learning_rate": 2.4453796137318085e-06, + "loss": 1.0757, + "step": 24768 + }, + { + "epoch": 2.111054291315094, + "grad_norm": 72.17269381000955, + "learning_rate": 2.444953382690013e-06, + "loss": 2.1041, + "step": 24769 + }, + { + "epoch": 2.1111395210091195, + "grad_norm": 50.1669561153045, + "learning_rate": 2.4445271767760854e-06, + "loss": 1.3648, + "step": 24770 + }, + { + "epoch": 2.111224750703145, + "grad_norm": 23.40629963349687, + "learning_rate": 2.4441009959942193e-06, + "loss": 0.7569, + "step": 24771 + }, + { + "epoch": 2.1113099803971704, + "grad_norm": 32.47855672985014, + "learning_rate": 2.443674840348604e-06, + "loss": 1.1673, + "step": 24772 + }, + { + "epoch": 2.111395210091196, + "grad_norm": 47.4682060800149, + "learning_rate": 2.4432487098434294e-06, + "loss": 1.7819, + "step": 24773 + }, + { + "epoch": 2.1114804397852214, + "grad_norm": 24.05470861273227, + "learning_rate": 2.4428226044828896e-06, + "loss": 0.5093, + "step": 24774 + }, + { + "epoch": 2.1115656694792464, + "grad_norm": 61.8334170833979, + "learning_rate": 2.442396524271173e-06, + "loss": 1.6511, + "step": 24775 + }, + { + "epoch": 2.111650899173272, + "grad_norm": 67.35964415843011, + "learning_rate": 2.4419704692124706e-06, + "loss": 1.6499, + "step": 24776 + }, + { + "epoch": 2.1117361288672973, + "grad_norm": 32.93629031168128, + "learning_rate": 2.4415444393109706e-06, + "loss": 1.3773, + "step": 24777 + }, + { + "epoch": 2.111821358561323, + "grad_norm": 57.387693821626556, + "learning_rate": 2.4411184345708638e-06, + "loss": 1.5945, + "step": 24778 + }, + { + "epoch": 2.1119065882553483, + "grad_norm": 40.89759792732243, + "learning_rate": 2.440692454996343e-06, + "loss": 1.2065, + "step": 24779 + }, + { + "epoch": 2.1119918179493737, + "grad_norm": 64.03784809241164, + "learning_rate": 2.4402665005915943e-06, + "loss": 2.13, + "step": 24780 + }, + { + "epoch": 2.1120770476433988, + "grad_norm": 54.47126745857981, + "learning_rate": 2.4398405713608075e-06, + "loss": 1.7375, + "step": 24781 + }, + { + "epoch": 2.1121622773374242, + "grad_norm": 30.41112917640843, + "learning_rate": 2.43941466730817e-06, + "loss": 0.7837, + "step": 24782 + }, + { + "epoch": 2.1122475070314497, + "grad_norm": 68.24401392799773, + "learning_rate": 2.438988788437873e-06, + "loss": 1.9073, + "step": 24783 + }, + { + "epoch": 2.112332736725475, + "grad_norm": 45.62728503580637, + "learning_rate": 2.438562934754104e-06, + "loss": 0.9566, + "step": 24784 + }, + { + "epoch": 2.1124179664195006, + "grad_norm": 20.73349957494338, + "learning_rate": 2.438137106261049e-06, + "loss": 0.6388, + "step": 24785 + }, + { + "epoch": 2.112503196113526, + "grad_norm": 37.59817292151824, + "learning_rate": 2.437711302962897e-06, + "loss": 1.4506, + "step": 24786 + }, + { + "epoch": 2.1125884258075516, + "grad_norm": 84.44775402073374, + "learning_rate": 2.4372855248638384e-06, + "loss": 2.1202, + "step": 24787 + }, + { + "epoch": 2.1126736555015766, + "grad_norm": 26.767816194591166, + "learning_rate": 2.436859771968058e-06, + "loss": 0.9453, + "step": 24788 + }, + { + "epoch": 2.112758885195602, + "grad_norm": 52.11217835115807, + "learning_rate": 2.436434044279744e-06, + "loss": 1.9997, + "step": 24789 + }, + { + "epoch": 2.1128441148896275, + "grad_norm": 40.99134462850356, + "learning_rate": 2.43600834180308e-06, + "loss": 0.8854, + "step": 24790 + }, + { + "epoch": 2.112929344583653, + "grad_norm": 69.55005041135963, + "learning_rate": 2.4355826645422576e-06, + "loss": 1.9929, + "step": 24791 + }, + { + "epoch": 2.1130145742776785, + "grad_norm": 51.89865048823266, + "learning_rate": 2.43515701250146e-06, + "loss": 1.5679, + "step": 24792 + }, + { + "epoch": 2.113099803971704, + "grad_norm": 64.67293908293993, + "learning_rate": 2.4347313856848727e-06, + "loss": 2.4499, + "step": 24793 + }, + { + "epoch": 2.113185033665729, + "grad_norm": 67.04943509334454, + "learning_rate": 2.4343057840966845e-06, + "loss": 1.6303, + "step": 24794 + }, + { + "epoch": 2.1132702633597544, + "grad_norm": 39.673423155912154, + "learning_rate": 2.433880207741077e-06, + "loss": 1.4267, + "step": 24795 + }, + { + "epoch": 2.11335549305378, + "grad_norm": 36.735705590704335, + "learning_rate": 2.4334546566222405e-06, + "loss": 1.3345, + "step": 24796 + }, + { + "epoch": 2.1134407227478054, + "grad_norm": 63.71841705517585, + "learning_rate": 2.4330291307443576e-06, + "loss": 1.1477, + "step": 24797 + }, + { + "epoch": 2.113525952441831, + "grad_norm": 46.636014657510245, + "learning_rate": 2.4326036301116136e-06, + "loss": 1.2627, + "step": 24798 + }, + { + "epoch": 2.1136111821358563, + "grad_norm": 19.563564075959064, + "learning_rate": 2.43217815472819e-06, + "loss": 0.8281, + "step": 24799 + }, + { + "epoch": 2.1136964118298813, + "grad_norm": 26.564711411712427, + "learning_rate": 2.4317527045982764e-06, + "loss": 1.0941, + "step": 24800 + }, + { + "epoch": 2.113781641523907, + "grad_norm": 71.51754011312622, + "learning_rate": 2.431327279726052e-06, + "loss": 2.127, + "step": 24801 + }, + { + "epoch": 2.1138668712179323, + "grad_norm": 51.667118584215, + "learning_rate": 2.4309018801157055e-06, + "loss": 1.2918, + "step": 24802 + }, + { + "epoch": 2.1139521009119577, + "grad_norm": 55.137456511932854, + "learning_rate": 2.430476505771416e-06, + "loss": 1.8893, + "step": 24803 + }, + { + "epoch": 2.114037330605983, + "grad_norm": 53.28395681371379, + "learning_rate": 2.4300511566973704e-06, + "loss": 1.7861, + "step": 24804 + }, + { + "epoch": 2.1141225603000087, + "grad_norm": 33.02421246578293, + "learning_rate": 2.4296258328977513e-06, + "loss": 1.2897, + "step": 24805 + }, + { + "epoch": 2.114207789994034, + "grad_norm": 28.128518294060484, + "learning_rate": 2.4292005343767403e-06, + "loss": 0.9242, + "step": 24806 + }, + { + "epoch": 2.114293019688059, + "grad_norm": 44.85668956822272, + "learning_rate": 2.4287752611385207e-06, + "loss": 1.1382, + "step": 24807 + }, + { + "epoch": 2.1143782493820846, + "grad_norm": 107.3959990993451, + "learning_rate": 2.428350013187273e-06, + "loss": 1.6241, + "step": 24808 + }, + { + "epoch": 2.11446347907611, + "grad_norm": 63.955486224439916, + "learning_rate": 2.4279247905271806e-06, + "loss": 1.461, + "step": 24809 + }, + { + "epoch": 2.1145487087701356, + "grad_norm": 58.69802865960285, + "learning_rate": 2.4274995931624285e-06, + "loss": 1.534, + "step": 24810 + }, + { + "epoch": 2.114633938464161, + "grad_norm": 56.965568286414076, + "learning_rate": 2.427074421097195e-06, + "loss": 1.679, + "step": 24811 + }, + { + "epoch": 2.1147191681581865, + "grad_norm": 37.051840432666225, + "learning_rate": 2.42664927433566e-06, + "loss": 0.9631, + "step": 24812 + }, + { + "epoch": 2.1148043978522115, + "grad_norm": 54.62699272644018, + "learning_rate": 2.4262241528820096e-06, + "loss": 1.9823, + "step": 24813 + }, + { + "epoch": 2.114889627546237, + "grad_norm": 45.11535310316292, + "learning_rate": 2.4257990567404217e-06, + "loss": 1.3827, + "step": 24814 + }, + { + "epoch": 2.1149748572402625, + "grad_norm": 39.31001764092901, + "learning_rate": 2.4253739859150764e-06, + "loss": 1.2278, + "step": 24815 + }, + { + "epoch": 2.115060086934288, + "grad_norm": 53.661460194316156, + "learning_rate": 2.424948940410154e-06, + "loss": 1.3384, + "step": 24816 + }, + { + "epoch": 2.1151453166283134, + "grad_norm": 59.52530985402745, + "learning_rate": 2.4245239202298354e-06, + "loss": 1.5238, + "step": 24817 + }, + { + "epoch": 2.115230546322339, + "grad_norm": 64.61556920171657, + "learning_rate": 2.424098925378302e-06, + "loss": 1.6753, + "step": 24818 + }, + { + "epoch": 2.115315776016364, + "grad_norm": 73.08276493366033, + "learning_rate": 2.4236739558597328e-06, + "loss": 1.9849, + "step": 24819 + }, + { + "epoch": 2.1154010057103894, + "grad_norm": 44.1305989690992, + "learning_rate": 2.4232490116783044e-06, + "loss": 1.5627, + "step": 24820 + }, + { + "epoch": 2.115486235404415, + "grad_norm": 52.056247868895895, + "learning_rate": 2.422824092838199e-06, + "loss": 2.0526, + "step": 24821 + }, + { + "epoch": 2.1155714650984403, + "grad_norm": 82.71721613922008, + "learning_rate": 2.422399199343596e-06, + "loss": 2.5802, + "step": 24822 + }, + { + "epoch": 2.1156566947924658, + "grad_norm": 31.779499590392668, + "learning_rate": 2.4219743311986716e-06, + "loss": 1.1252, + "step": 24823 + }, + { + "epoch": 2.1157419244864912, + "grad_norm": 14.16113098929812, + "learning_rate": 2.421549488407604e-06, + "loss": 0.5579, + "step": 24824 + }, + { + "epoch": 2.1158271541805167, + "grad_norm": 48.09767708629152, + "learning_rate": 2.4211246709745723e-06, + "loss": 0.9942, + "step": 24825 + }, + { + "epoch": 2.1159123838745417, + "grad_norm": 42.755910541724354, + "learning_rate": 2.4206998789037568e-06, + "loss": 1.2524, + "step": 24826 + }, + { + "epoch": 2.115997613568567, + "grad_norm": 94.95251798451136, + "learning_rate": 2.420275112199333e-06, + "loss": 1.7395, + "step": 24827 + }, + { + "epoch": 2.1160828432625927, + "grad_norm": 34.339663099952226, + "learning_rate": 2.419850370865478e-06, + "loss": 0.816, + "step": 24828 + }, + { + "epoch": 2.116168072956618, + "grad_norm": 37.10929722000814, + "learning_rate": 2.419425654906368e-06, + "loss": 0.8609, + "step": 24829 + }, + { + "epoch": 2.1162533026506436, + "grad_norm": 62.154216082567906, + "learning_rate": 2.4190009643261832e-06, + "loss": 2.3441, + "step": 24830 + }, + { + "epoch": 2.116338532344669, + "grad_norm": 63.9695909649446, + "learning_rate": 2.4185762991290977e-06, + "loss": 2.3641, + "step": 24831 + }, + { + "epoch": 2.116423762038694, + "grad_norm": 29.500529657512132, + "learning_rate": 2.418151659319287e-06, + "loss": 0.8637, + "step": 24832 + }, + { + "epoch": 2.1165089917327196, + "grad_norm": 44.5828703513216, + "learning_rate": 2.4177270449009292e-06, + "loss": 1.3505, + "step": 24833 + }, + { + "epoch": 2.116594221426745, + "grad_norm": 44.05330617643724, + "learning_rate": 2.417302455878201e-06, + "loss": 1.3953, + "step": 24834 + }, + { + "epoch": 2.1166794511207705, + "grad_norm": 91.44727928634016, + "learning_rate": 2.416877892255278e-06, + "loss": 2.3248, + "step": 24835 + }, + { + "epoch": 2.116764680814796, + "grad_norm": 44.743341752315345, + "learning_rate": 2.416453354036333e-06, + "loss": 1.6956, + "step": 24836 + }, + { + "epoch": 2.1168499105088214, + "grad_norm": 46.5751997994716, + "learning_rate": 2.4160288412255434e-06, + "loss": 1.3326, + "step": 24837 + }, + { + "epoch": 2.1169351402028465, + "grad_norm": 34.462209401493055, + "learning_rate": 2.4156043538270817e-06, + "loss": 1.1678, + "step": 24838 + }, + { + "epoch": 2.117020369896872, + "grad_norm": 49.24696025914408, + "learning_rate": 2.4151798918451256e-06, + "loss": 1.5542, + "step": 24839 + }, + { + "epoch": 2.1171055995908974, + "grad_norm": 50.31055195703881, + "learning_rate": 2.414755455283847e-06, + "loss": 1.4833, + "step": 24840 + }, + { + "epoch": 2.117190829284923, + "grad_norm": 53.61376799562318, + "learning_rate": 2.4143310441474226e-06, + "loss": 1.8229, + "step": 24841 + }, + { + "epoch": 2.1172760589789483, + "grad_norm": 39.86677384263529, + "learning_rate": 2.4139066584400236e-06, + "loss": 1.0229, + "step": 24842 + }, + { + "epoch": 2.117361288672974, + "grad_norm": 43.0957823923678, + "learning_rate": 2.4134822981658264e-06, + "loss": 1.6772, + "step": 24843 + }, + { + "epoch": 2.1174465183669993, + "grad_norm": 59.34376631494764, + "learning_rate": 2.4130579633290023e-06, + "loss": 1.7931, + "step": 24844 + }, + { + "epoch": 2.1175317480610243, + "grad_norm": 88.82961659242802, + "learning_rate": 2.4126336539337263e-06, + "loss": 2.2554, + "step": 24845 + }, + { + "epoch": 2.1176169777550498, + "grad_norm": 49.329183608326375, + "learning_rate": 2.412209369984168e-06, + "loss": 1.7824, + "step": 24846 + }, + { + "epoch": 2.1177022074490752, + "grad_norm": 62.641364325718634, + "learning_rate": 2.4117851114845044e-06, + "loss": 2.1152, + "step": 24847 + }, + { + "epoch": 2.1177874371431007, + "grad_norm": 36.047917708677396, + "learning_rate": 2.411360878438903e-06, + "loss": 1.0352, + "step": 24848 + }, + { + "epoch": 2.117872666837126, + "grad_norm": 16.664873270672658, + "learning_rate": 2.410936670851541e-06, + "loss": 0.5193, + "step": 24849 + }, + { + "epoch": 2.1179578965311516, + "grad_norm": 35.90704442956175, + "learning_rate": 2.4105124887265887e-06, + "loss": 0.8725, + "step": 24850 + }, + { + "epoch": 2.1180431262251767, + "grad_norm": 39.570239698881274, + "learning_rate": 2.410088332068215e-06, + "loss": 1.5359, + "step": 24851 + }, + { + "epoch": 2.118128355919202, + "grad_norm": 21.055318199028147, + "learning_rate": 2.4096642008805947e-06, + "loss": 0.8198, + "step": 24852 + }, + { + "epoch": 2.1182135856132276, + "grad_norm": 20.11922066009275, + "learning_rate": 2.4092400951678985e-06, + "loss": 0.6755, + "step": 24853 + }, + { + "epoch": 2.118298815307253, + "grad_norm": 52.0755336333844, + "learning_rate": 2.408816014934296e-06, + "loss": 1.6583, + "step": 24854 + }, + { + "epoch": 2.1183840450012785, + "grad_norm": 31.84662144215932, + "learning_rate": 2.4083919601839562e-06, + "loss": 0.9503, + "step": 24855 + }, + { + "epoch": 2.118469274695304, + "grad_norm": 5.603384135427408, + "learning_rate": 2.4079679309210518e-06, + "loss": 0.1252, + "step": 24856 + }, + { + "epoch": 2.118554504389329, + "grad_norm": 51.97454202084681, + "learning_rate": 2.4075439271497555e-06, + "loss": 1.7347, + "step": 24857 + }, + { + "epoch": 2.1186397340833545, + "grad_norm": 60.27867035679432, + "learning_rate": 2.4071199488742334e-06, + "loss": 2.5519, + "step": 24858 + }, + { + "epoch": 2.11872496377738, + "grad_norm": 34.88608997945882, + "learning_rate": 2.4066959960986553e-06, + "loss": 1.1837, + "step": 24859 + }, + { + "epoch": 2.1188101934714054, + "grad_norm": 59.86150106596191, + "learning_rate": 2.406272068827193e-06, + "loss": 1.6061, + "step": 24860 + }, + { + "epoch": 2.118895423165431, + "grad_norm": 50.17425573077103, + "learning_rate": 2.4058481670640143e-06, + "loss": 1.7126, + "step": 24861 + }, + { + "epoch": 2.1189806528594564, + "grad_norm": 49.636070987095316, + "learning_rate": 2.4054242908132885e-06, + "loss": 1.6623, + "step": 24862 + }, + { + "epoch": 2.119065882553482, + "grad_norm": 89.87129682202077, + "learning_rate": 2.4050004400791817e-06, + "loss": 2.8484, + "step": 24863 + }, + { + "epoch": 2.119151112247507, + "grad_norm": 31.079160085854387, + "learning_rate": 2.404576614865864e-06, + "loss": 1.0416, + "step": 24864 + }, + { + "epoch": 2.1192363419415323, + "grad_norm": 35.95322784664453, + "learning_rate": 2.404152815177506e-06, + "loss": 1.2577, + "step": 24865 + }, + { + "epoch": 2.119321571635558, + "grad_norm": 79.72604989007685, + "learning_rate": 2.403729041018274e-06, + "loss": 2.1164, + "step": 24866 + }, + { + "epoch": 2.1194068013295833, + "grad_norm": 34.84556283876491, + "learning_rate": 2.403305292392335e-06, + "loss": 1.0752, + "step": 24867 + }, + { + "epoch": 2.1194920310236087, + "grad_norm": 58.76971942320944, + "learning_rate": 2.4028815693038547e-06, + "loss": 2.3821, + "step": 24868 + }, + { + "epoch": 2.119577260717634, + "grad_norm": 44.893537927190366, + "learning_rate": 2.402457871757004e-06, + "loss": 1.4828, + "step": 24869 + }, + { + "epoch": 2.119662490411659, + "grad_norm": 36.938868810178114, + "learning_rate": 2.4020341997559482e-06, + "loss": 1.5345, + "step": 24870 + }, + { + "epoch": 2.1197477201056847, + "grad_norm": 32.853269833248845, + "learning_rate": 2.401610553304852e-06, + "loss": 0.8758, + "step": 24871 + }, + { + "epoch": 2.11983294979971, + "grad_norm": 45.09568804698605, + "learning_rate": 2.401186932407883e-06, + "loss": 1.2286, + "step": 24872 + }, + { + "epoch": 2.1199181794937356, + "grad_norm": 23.544471386944313, + "learning_rate": 2.4007633370692096e-06, + "loss": 1.3643, + "step": 24873 + }, + { + "epoch": 2.120003409187761, + "grad_norm": 194.70971343824937, + "learning_rate": 2.4003397672929964e-06, + "loss": 1.8644, + "step": 24874 + }, + { + "epoch": 2.1200886388817866, + "grad_norm": 53.34381553651168, + "learning_rate": 2.3999162230834084e-06, + "loss": 1.594, + "step": 24875 + }, + { + "epoch": 2.120173868575812, + "grad_norm": 95.18335851630597, + "learning_rate": 2.399492704444609e-06, + "loss": 1.4132, + "step": 24876 + }, + { + "epoch": 2.120259098269837, + "grad_norm": 55.868921125911314, + "learning_rate": 2.399069211380768e-06, + "loss": 2.1185, + "step": 24877 + }, + { + "epoch": 2.1203443279638625, + "grad_norm": 38.69200091544679, + "learning_rate": 2.3986457438960476e-06, + "loss": 1.2082, + "step": 24878 + }, + { + "epoch": 2.120429557657888, + "grad_norm": 29.738355678368123, + "learning_rate": 2.3982223019946103e-06, + "loss": 1.1834, + "step": 24879 + }, + { + "epoch": 2.1205147873519135, + "grad_norm": 36.65655898611804, + "learning_rate": 2.3977988856806255e-06, + "loss": 1.7875, + "step": 24880 + }, + { + "epoch": 2.120600017045939, + "grad_norm": 39.72826026736952, + "learning_rate": 2.397375494958252e-06, + "loss": 1.0154, + "step": 24881 + }, + { + "epoch": 2.1206852467399644, + "grad_norm": 55.24207153380777, + "learning_rate": 2.396952129831659e-06, + "loss": 1.2699, + "step": 24882 + }, + { + "epoch": 2.1207704764339894, + "grad_norm": 17.249415860425067, + "learning_rate": 2.3965287903050067e-06, + "loss": 0.6414, + "step": 24883 + }, + { + "epoch": 2.120855706128015, + "grad_norm": 32.41948684405916, + "learning_rate": 2.3961054763824597e-06, + "loss": 0.6767, + "step": 24884 + }, + { + "epoch": 2.1209409358220404, + "grad_norm": 34.79259638764022, + "learning_rate": 2.3956821880681785e-06, + "loss": 1.3077, + "step": 24885 + }, + { + "epoch": 2.121026165516066, + "grad_norm": 47.105982108642166, + "learning_rate": 2.39525892536633e-06, + "loss": 2.0788, + "step": 24886 + }, + { + "epoch": 2.1211113952100913, + "grad_norm": 25.836062257414774, + "learning_rate": 2.3948356882810737e-06, + "loss": 1.0434, + "step": 24887 + }, + { + "epoch": 2.1211966249041168, + "grad_norm": 31.65939370278061, + "learning_rate": 2.3944124768165743e-06, + "loss": 1.0658, + "step": 24888 + }, + { + "epoch": 2.121281854598142, + "grad_norm": 36.60534213451757, + "learning_rate": 2.3939892909769913e-06, + "loss": 1.6062, + "step": 24889 + }, + { + "epoch": 2.1213670842921672, + "grad_norm": 37.46000335388388, + "learning_rate": 2.39356613076649e-06, + "loss": 0.8759, + "step": 24890 + }, + { + "epoch": 2.1214523139861927, + "grad_norm": 37.610233968162746, + "learning_rate": 2.39314299618923e-06, + "loss": 1.2647, + "step": 24891 + }, + { + "epoch": 2.121537543680218, + "grad_norm": 95.32935198114448, + "learning_rate": 2.3927198872493722e-06, + "loss": 1.6067, + "step": 24892 + }, + { + "epoch": 2.1216227733742437, + "grad_norm": 47.17691956544083, + "learning_rate": 2.3922968039510773e-06, + "loss": 1.3532, + "step": 24893 + }, + { + "epoch": 2.121708003068269, + "grad_norm": 45.747528932565054, + "learning_rate": 2.3918737462985086e-06, + "loss": 2.0937, + "step": 24894 + }, + { + "epoch": 2.1217932327622946, + "grad_norm": 44.02656341990194, + "learning_rate": 2.3914507142958238e-06, + "loss": 1.9506, + "step": 24895 + }, + { + "epoch": 2.1218784624563196, + "grad_norm": 34.87499226042191, + "learning_rate": 2.391027707947186e-06, + "loss": 0.7563, + "step": 24896 + }, + { + "epoch": 2.121963692150345, + "grad_norm": 40.037840455731214, + "learning_rate": 2.390604727256754e-06, + "loss": 1.3403, + "step": 24897 + }, + { + "epoch": 2.1220489218443706, + "grad_norm": 64.68485582410194, + "learning_rate": 2.390181772228686e-06, + "loss": 1.8309, + "step": 24898 + }, + { + "epoch": 2.122134151538396, + "grad_norm": 33.267733434545285, + "learning_rate": 2.3897588428671453e-06, + "loss": 1.4297, + "step": 24899 + }, + { + "epoch": 2.1222193812324215, + "grad_norm": 24.995018048275544, + "learning_rate": 2.389335939176289e-06, + "loss": 0.9765, + "step": 24900 + }, + { + "epoch": 2.122304610926447, + "grad_norm": 29.294358432261824, + "learning_rate": 2.3889130611602764e-06, + "loss": 1.006, + "step": 24901 + }, + { + "epoch": 2.122389840620472, + "grad_norm": 70.29742618028335, + "learning_rate": 2.388490208823264e-06, + "loss": 1.1686, + "step": 24902 + }, + { + "epoch": 2.1224750703144974, + "grad_norm": 58.425009799068036, + "learning_rate": 2.3880673821694135e-06, + "loss": 2.1498, + "step": 24903 + }, + { + "epoch": 2.122560300008523, + "grad_norm": 70.1860845205378, + "learning_rate": 2.3876445812028835e-06, + "loss": 1.7025, + "step": 24904 + }, + { + "epoch": 2.1226455297025484, + "grad_norm": 67.29779154161406, + "learning_rate": 2.3872218059278314e-06, + "loss": 2.1063, + "step": 24905 + }, + { + "epoch": 2.122730759396574, + "grad_norm": 69.03882811332576, + "learning_rate": 2.3867990563484123e-06, + "loss": 1.6486, + "step": 24906 + }, + { + "epoch": 2.1228159890905993, + "grad_norm": 97.20687987372352, + "learning_rate": 2.386376332468788e-06, + "loss": 2.4462, + "step": 24907 + }, + { + "epoch": 2.122901218784625, + "grad_norm": 45.77095474171122, + "learning_rate": 2.3859536342931143e-06, + "loss": 1.5098, + "step": 24908 + }, + { + "epoch": 2.12298644847865, + "grad_norm": 49.666964288530146, + "learning_rate": 2.3855309618255486e-06, + "loss": 1.3475, + "step": 24909 + }, + { + "epoch": 2.1230716781726753, + "grad_norm": 36.94829033096979, + "learning_rate": 2.385108315070244e-06, + "loss": 1.4629, + "step": 24910 + }, + { + "epoch": 2.1231569078667007, + "grad_norm": 28.548892464564798, + "learning_rate": 2.3846856940313608e-06, + "loss": 1.2038, + "step": 24911 + }, + { + "epoch": 2.123242137560726, + "grad_norm": 65.9314701437204, + "learning_rate": 2.3842630987130556e-06, + "loss": 2.3653, + "step": 24912 + }, + { + "epoch": 2.1233273672547517, + "grad_norm": 60.917079881696814, + "learning_rate": 2.3838405291194836e-06, + "loss": 1.9345, + "step": 24913 + }, + { + "epoch": 2.123412596948777, + "grad_norm": 35.98814011017639, + "learning_rate": 2.3834179852548013e-06, + "loss": 1.3285, + "step": 24914 + }, + { + "epoch": 2.123497826642802, + "grad_norm": 40.47801349534914, + "learning_rate": 2.3829954671231608e-06, + "loss": 1.2513, + "step": 24915 + }, + { + "epoch": 2.1235830563368276, + "grad_norm": 23.159412233472978, + "learning_rate": 2.382572974728722e-06, + "loss": 0.8787, + "step": 24916 + }, + { + "epoch": 2.123668286030853, + "grad_norm": 41.046923692286406, + "learning_rate": 2.382150508075638e-06, + "loss": 1.1714, + "step": 24917 + }, + { + "epoch": 2.1237535157248786, + "grad_norm": 80.14131000194278, + "learning_rate": 2.3817280671680614e-06, + "loss": 2.4291, + "step": 24918 + }, + { + "epoch": 2.123838745418904, + "grad_norm": 80.64696881818449, + "learning_rate": 2.381305652010149e-06, + "loss": 1.6545, + "step": 24919 + }, + { + "epoch": 2.1239239751129295, + "grad_norm": 71.16362192863163, + "learning_rate": 2.3808832626060567e-06, + "loss": 1.4523, + "step": 24920 + }, + { + "epoch": 2.1240092048069545, + "grad_norm": 64.15179004450599, + "learning_rate": 2.380460898959937e-06, + "loss": 1.9666, + "step": 24921 + }, + { + "epoch": 2.12409443450098, + "grad_norm": 61.514693387651654, + "learning_rate": 2.3800385610759434e-06, + "loss": 1.5849, + "step": 24922 + }, + { + "epoch": 2.1241796641950055, + "grad_norm": 39.40036705574097, + "learning_rate": 2.37961624895823e-06, + "loss": 1.4903, + "step": 24923 + }, + { + "epoch": 2.124264893889031, + "grad_norm": 58.12508524945693, + "learning_rate": 2.3791939626109466e-06, + "loss": 1.5312, + "step": 24924 + }, + { + "epoch": 2.1243501235830564, + "grad_norm": 36.939763463575964, + "learning_rate": 2.3787717020382516e-06, + "loss": 1.6744, + "step": 24925 + }, + { + "epoch": 2.124435353277082, + "grad_norm": 33.26083187013473, + "learning_rate": 2.3783494672442938e-06, + "loss": 1.0475, + "step": 24926 + }, + { + "epoch": 2.1245205829711074, + "grad_norm": 30.117786567415394, + "learning_rate": 2.3779272582332285e-06, + "loss": 1.207, + "step": 24927 + }, + { + "epoch": 2.1246058126651324, + "grad_norm": 32.150852589595324, + "learning_rate": 2.3775050750092055e-06, + "loss": 1.0874, + "step": 24928 + }, + { + "epoch": 2.124691042359158, + "grad_norm": 41.485507484317765, + "learning_rate": 2.3770829175763795e-06, + "loss": 1.6932, + "step": 24929 + }, + { + "epoch": 2.1247762720531833, + "grad_norm": 54.12210640785553, + "learning_rate": 2.376660785938901e-06, + "loss": 1.818, + "step": 24930 + }, + { + "epoch": 2.124861501747209, + "grad_norm": 37.3322014076411, + "learning_rate": 2.376238680100921e-06, + "loss": 1.2066, + "step": 24931 + }, + { + "epoch": 2.1249467314412342, + "grad_norm": 46.20201309873835, + "learning_rate": 2.375816600066589e-06, + "loss": 1.277, + "step": 24932 + }, + { + "epoch": 2.1250319611352597, + "grad_norm": 27.62334033024453, + "learning_rate": 2.3753945458400595e-06, + "loss": 0.8495, + "step": 24933 + }, + { + "epoch": 2.1251171908292847, + "grad_norm": 33.506628387374654, + "learning_rate": 2.3749725174254806e-06, + "loss": 1.2658, + "step": 24934 + }, + { + "epoch": 2.12520242052331, + "grad_norm": 53.531756277331986, + "learning_rate": 2.3745505148270052e-06, + "loss": 0.8232, + "step": 24935 + }, + { + "epoch": 2.1252876502173357, + "grad_norm": 51.900063865709726, + "learning_rate": 2.374128538048781e-06, + "loss": 1.6035, + "step": 24936 + }, + { + "epoch": 2.125372879911361, + "grad_norm": 72.07030172180576, + "learning_rate": 2.373706587094961e-06, + "loss": 1.7954, + "step": 24937 + }, + { + "epoch": 2.1254581096053866, + "grad_norm": 30.251276897701548, + "learning_rate": 2.3732846619696926e-06, + "loss": 1.1942, + "step": 24938 + }, + { + "epoch": 2.125543339299412, + "grad_norm": 54.44951725984976, + "learning_rate": 2.372862762677126e-06, + "loss": 1.8311, + "step": 24939 + }, + { + "epoch": 2.125628568993437, + "grad_norm": 39.85549431541029, + "learning_rate": 2.3724408892214105e-06, + "loss": 1.2725, + "step": 24940 + }, + { + "epoch": 2.1257137986874626, + "grad_norm": 62.96427282577359, + "learning_rate": 2.372019041606693e-06, + "loss": 1.3677, + "step": 24941 + }, + { + "epoch": 2.125799028381488, + "grad_norm": 57.8223345991118, + "learning_rate": 2.3715972198371236e-06, + "loss": 1.0145, + "step": 24942 + }, + { + "epoch": 2.1258842580755135, + "grad_norm": 32.863472837013234, + "learning_rate": 2.3711754239168528e-06, + "loss": 1.6479, + "step": 24943 + }, + { + "epoch": 2.125969487769539, + "grad_norm": 37.29732136078323, + "learning_rate": 2.3707536538500274e-06, + "loss": 1.499, + "step": 24944 + }, + { + "epoch": 2.1260547174635644, + "grad_norm": 36.67407576952421, + "learning_rate": 2.3703319096407936e-06, + "loss": 1.0658, + "step": 24945 + }, + { + "epoch": 2.12613994715759, + "grad_norm": 67.03165826513649, + "learning_rate": 2.3699101912933014e-06, + "loss": 2.3273, + "step": 24946 + }, + { + "epoch": 2.126225176851615, + "grad_norm": 36.41050565914582, + "learning_rate": 2.3694884988116985e-06, + "loss": 1.1979, + "step": 24947 + }, + { + "epoch": 2.1263104065456404, + "grad_norm": 48.47745554521447, + "learning_rate": 2.3690668322001305e-06, + "loss": 1.3254, + "step": 24948 + }, + { + "epoch": 2.126395636239666, + "grad_norm": 48.703135003173315, + "learning_rate": 2.3686451914627424e-06, + "loss": 1.365, + "step": 24949 + }, + { + "epoch": 2.1264808659336913, + "grad_norm": 54.87660710797976, + "learning_rate": 2.3682235766036838e-06, + "loss": 1.9976, + "step": 24950 + }, + { + "epoch": 2.126566095627717, + "grad_norm": 43.60528321541694, + "learning_rate": 2.3678019876271022e-06, + "loss": 1.5662, + "step": 24951 + }, + { + "epoch": 2.1266513253217423, + "grad_norm": 61.513798640932535, + "learning_rate": 2.3673804245371423e-06, + "loss": 1.484, + "step": 24952 + }, + { + "epoch": 2.1267365550157673, + "grad_norm": 32.806350899898256, + "learning_rate": 2.3669588873379496e-06, + "loss": 1.1606, + "step": 24953 + }, + { + "epoch": 2.1268217847097928, + "grad_norm": 45.1385370408027, + "learning_rate": 2.3665373760336685e-06, + "loss": 1.3383, + "step": 24954 + }, + { + "epoch": 2.1269070144038182, + "grad_norm": 45.00303412903597, + "learning_rate": 2.366115890628447e-06, + "loss": 1.1594, + "step": 24955 + }, + { + "epoch": 2.1269922440978437, + "grad_norm": 27.934290272925907, + "learning_rate": 2.3656944311264297e-06, + "loss": 1.2267, + "step": 24956 + }, + { + "epoch": 2.127077473791869, + "grad_norm": 34.67175221445954, + "learning_rate": 2.365272997531759e-06, + "loss": 1.1129, + "step": 24957 + }, + { + "epoch": 2.1271627034858946, + "grad_norm": 39.06425657516259, + "learning_rate": 2.3648515898485814e-06, + "loss": 1.7124, + "step": 24958 + }, + { + "epoch": 2.1272479331799197, + "grad_norm": 36.3886569470816, + "learning_rate": 2.3644302080810433e-06, + "loss": 1.5353, + "step": 24959 + }, + { + "epoch": 2.127333162873945, + "grad_norm": 39.3661548664921, + "learning_rate": 2.3640088522332865e-06, + "loss": 0.9684, + "step": 24960 + }, + { + "epoch": 2.1274183925679706, + "grad_norm": 26.984800195580082, + "learning_rate": 2.363587522309455e-06, + "loss": 1.0662, + "step": 24961 + }, + { + "epoch": 2.127503622261996, + "grad_norm": 64.7455397752912, + "learning_rate": 2.3631662183136916e-06, + "loss": 1.8424, + "step": 24962 + }, + { + "epoch": 2.1275888519560215, + "grad_norm": 26.13611425777863, + "learning_rate": 2.3627449402501418e-06, + "loss": 0.928, + "step": 24963 + }, + { + "epoch": 2.127674081650047, + "grad_norm": 35.473272622932235, + "learning_rate": 2.362323688122948e-06, + "loss": 1.2659, + "step": 24964 + }, + { + "epoch": 2.1277593113440725, + "grad_norm": 72.42436815924425, + "learning_rate": 2.3619024619362502e-06, + "loss": 2.0341, + "step": 24965 + }, + { + "epoch": 2.1278445410380975, + "grad_norm": 51.68454206814676, + "learning_rate": 2.361481261694194e-06, + "loss": 1.6109, + "step": 24966 + }, + { + "epoch": 2.127929770732123, + "grad_norm": 57.39932805331061, + "learning_rate": 2.3610600874009226e-06, + "loss": 1.7069, + "step": 24967 + }, + { + "epoch": 2.1280150004261484, + "grad_norm": 58.49441473292141, + "learning_rate": 2.360638939060577e-06, + "loss": 1.8302, + "step": 24968 + }, + { + "epoch": 2.128100230120174, + "grad_norm": 45.06852026323568, + "learning_rate": 2.3602178166772983e-06, + "loss": 1.366, + "step": 24969 + }, + { + "epoch": 2.1281854598141994, + "grad_norm": 63.90760872091922, + "learning_rate": 2.3597967202552286e-06, + "loss": 1.9238, + "step": 24970 + }, + { + "epoch": 2.128270689508225, + "grad_norm": 59.09537352660766, + "learning_rate": 2.359375649798507e-06, + "loss": 1.4481, + "step": 24971 + }, + { + "epoch": 2.12835591920225, + "grad_norm": 40.843633311824505, + "learning_rate": 2.358954605311279e-06, + "loss": 1.0598, + "step": 24972 + }, + { + "epoch": 2.1284411488962753, + "grad_norm": 38.429118736875196, + "learning_rate": 2.3585335867976806e-06, + "loss": 0.8966, + "step": 24973 + }, + { + "epoch": 2.128526378590301, + "grad_norm": 34.21434592943652, + "learning_rate": 2.358112594261857e-06, + "loss": 1.4345, + "step": 24974 + }, + { + "epoch": 2.1286116082843263, + "grad_norm": 51.98262934395439, + "learning_rate": 2.3576916277079443e-06, + "loss": 1.237, + "step": 24975 + }, + { + "epoch": 2.1286968379783517, + "grad_norm": 36.82158432149769, + "learning_rate": 2.3572706871400856e-06, + "loss": 1.4171, + "step": 24976 + }, + { + "epoch": 2.128782067672377, + "grad_norm": 34.05504735227389, + "learning_rate": 2.3568497725624207e-06, + "loss": 1.0946, + "step": 24977 + }, + { + "epoch": 2.1288672973664022, + "grad_norm": 47.499084814291, + "learning_rate": 2.3564288839790874e-06, + "loss": 1.8306, + "step": 24978 + }, + { + "epoch": 2.1289525270604277, + "grad_norm": 67.94257693902955, + "learning_rate": 2.3560080213942237e-06, + "loss": 1.6049, + "step": 24979 + }, + { + "epoch": 2.129037756754453, + "grad_norm": 44.256433285059074, + "learning_rate": 2.3555871848119726e-06, + "loss": 1.1418, + "step": 24980 + }, + { + "epoch": 2.1291229864484786, + "grad_norm": 55.22716088872787, + "learning_rate": 2.355166374236469e-06, + "loss": 1.2788, + "step": 24981 + }, + { + "epoch": 2.129208216142504, + "grad_norm": 63.16042647087869, + "learning_rate": 2.3547455896718544e-06, + "loss": 1.1796, + "step": 24982 + }, + { + "epoch": 2.1292934458365296, + "grad_norm": 31.951952139877147, + "learning_rate": 2.3543248311222667e-06, + "loss": 1.4537, + "step": 24983 + }, + { + "epoch": 2.129378675530555, + "grad_norm": 55.72867049370034, + "learning_rate": 2.3539040985918404e-06, + "loss": 1.7113, + "step": 24984 + }, + { + "epoch": 2.12946390522458, + "grad_norm": 123.57977993561167, + "learning_rate": 2.3534833920847176e-06, + "loss": 2.0448, + "step": 24985 + }, + { + "epoch": 2.1295491349186055, + "grad_norm": 38.56263619897552, + "learning_rate": 2.3530627116050347e-06, + "loss": 1.7757, + "step": 24986 + }, + { + "epoch": 2.129634364612631, + "grad_norm": 43.79623059103653, + "learning_rate": 2.3526420571569282e-06, + "loss": 1.8612, + "step": 24987 + }, + { + "epoch": 2.1297195943066565, + "grad_norm": 74.27552870196799, + "learning_rate": 2.3522214287445326e-06, + "loss": 1.3697, + "step": 24988 + }, + { + "epoch": 2.129804824000682, + "grad_norm": 40.028267062022145, + "learning_rate": 2.351800826371987e-06, + "loss": 1.1766, + "step": 24989 + }, + { + "epoch": 2.1298900536947074, + "grad_norm": 40.596086276162296, + "learning_rate": 2.3513802500434305e-06, + "loss": 1.5166, + "step": 24990 + }, + { + "epoch": 2.1299752833887324, + "grad_norm": 69.37399607454451, + "learning_rate": 2.3509596997629967e-06, + "loss": 1.9301, + "step": 24991 + }, + { + "epoch": 2.130060513082758, + "grad_norm": 59.48611450083412, + "learning_rate": 2.3505391755348197e-06, + "loss": 1.7436, + "step": 24992 + }, + { + "epoch": 2.1301457427767834, + "grad_norm": 54.018543226452, + "learning_rate": 2.3501186773630392e-06, + "loss": 1.7585, + "step": 24993 + }, + { + "epoch": 2.130230972470809, + "grad_norm": 27.59948633703605, + "learning_rate": 2.349698205251788e-06, + "loss": 0.9603, + "step": 24994 + }, + { + "epoch": 2.1303162021648343, + "grad_norm": 30.46609939691625, + "learning_rate": 2.349277759205202e-06, + "loss": 0.9335, + "step": 24995 + }, + { + "epoch": 2.1304014318588598, + "grad_norm": 29.481672421011016, + "learning_rate": 2.3488573392274144e-06, + "loss": 1.5166, + "step": 24996 + }, + { + "epoch": 2.130486661552885, + "grad_norm": 61.04047203965949, + "learning_rate": 2.3484369453225606e-06, + "loss": 1.8389, + "step": 24997 + }, + { + "epoch": 2.1305718912469103, + "grad_norm": 35.00482513898669, + "learning_rate": 2.3480165774947787e-06, + "loss": 0.9341, + "step": 24998 + }, + { + "epoch": 2.1306571209409357, + "grad_norm": 31.587119947640595, + "learning_rate": 2.3475962357481996e-06, + "loss": 1.8867, + "step": 24999 + }, + { + "epoch": 2.130742350634961, + "grad_norm": 39.70224467065666, + "learning_rate": 2.3471759200869577e-06, + "loss": 1.5256, + "step": 25000 + }, + { + "epoch": 2.1308275803289867, + "grad_norm": 34.471453877953685, + "learning_rate": 2.346755630515184e-06, + "loss": 1.2139, + "step": 25001 + }, + { + "epoch": 2.130912810023012, + "grad_norm": 24.909783295863445, + "learning_rate": 2.346335367037017e-06, + "loss": 0.9064, + "step": 25002 + }, + { + "epoch": 2.1309980397170376, + "grad_norm": 49.41437408797841, + "learning_rate": 2.345915129656587e-06, + "loss": 1.2988, + "step": 25003 + }, + { + "epoch": 2.1310832694110626, + "grad_norm": 68.65401220765641, + "learning_rate": 2.3454949183780247e-06, + "loss": 1.8178, + "step": 25004 + }, + { + "epoch": 2.131168499105088, + "grad_norm": 70.97219860329969, + "learning_rate": 2.3450747332054656e-06, + "loss": 2.3753, + "step": 25005 + }, + { + "epoch": 2.1312537287991136, + "grad_norm": 35.48228011474754, + "learning_rate": 2.344654574143043e-06, + "loss": 1.07, + "step": 25006 + }, + { + "epoch": 2.131338958493139, + "grad_norm": 46.56648554093961, + "learning_rate": 2.344234441194888e-06, + "loss": 1.5269, + "step": 25007 + }, + { + "epoch": 2.1314241881871645, + "grad_norm": 56.377095893303824, + "learning_rate": 2.343814334365132e-06, + "loss": 1.9981, + "step": 25008 + }, + { + "epoch": 2.13150941788119, + "grad_norm": 87.65773986692253, + "learning_rate": 2.343394253657904e-06, + "loss": 1.6302, + "step": 25009 + }, + { + "epoch": 2.1315946475752154, + "grad_norm": 55.58046378382151, + "learning_rate": 2.3429741990773404e-06, + "loss": 2.2208, + "step": 25010 + }, + { + "epoch": 2.1316798772692405, + "grad_norm": 23.199655124240145, + "learning_rate": 2.3425541706275695e-06, + "loss": 0.8561, + "step": 25011 + }, + { + "epoch": 2.131765106963266, + "grad_norm": 37.75083035020076, + "learning_rate": 2.3421341683127206e-06, + "loss": 1.0816, + "step": 25012 + }, + { + "epoch": 2.1318503366572914, + "grad_norm": 44.062532728144944, + "learning_rate": 2.341714192136928e-06, + "loss": 2.062, + "step": 25013 + }, + { + "epoch": 2.131935566351317, + "grad_norm": 29.578595712607235, + "learning_rate": 2.3412942421043176e-06, + "loss": 0.7727, + "step": 25014 + }, + { + "epoch": 2.1320207960453423, + "grad_norm": 30.524858735905184, + "learning_rate": 2.3408743182190246e-06, + "loss": 0.7727, + "step": 25015 + }, + { + "epoch": 2.1321060257393674, + "grad_norm": 74.31403395902576, + "learning_rate": 2.3404544204851755e-06, + "loss": 2.1695, + "step": 25016 + }, + { + "epoch": 2.132191255433393, + "grad_norm": 39.500799934632084, + "learning_rate": 2.3400345489069004e-06, + "loss": 1.2733, + "step": 25017 + }, + { + "epoch": 2.1322764851274183, + "grad_norm": 41.83583002505379, + "learning_rate": 2.3396147034883264e-06, + "loss": 1.4753, + "step": 25018 + }, + { + "epoch": 2.1323617148214438, + "grad_norm": 51.58921161844535, + "learning_rate": 2.339194884233587e-06, + "loss": 1.2298, + "step": 25019 + }, + { + "epoch": 2.1324469445154692, + "grad_norm": 40.74203709921202, + "learning_rate": 2.3387750911468064e-06, + "loss": 1.2241, + "step": 25020 + }, + { + "epoch": 2.1325321742094947, + "grad_norm": 70.97620202689497, + "learning_rate": 2.3383553242321176e-06, + "loss": 2.4937, + "step": 25021 + }, + { + "epoch": 2.13261740390352, + "grad_norm": 26.599590320971174, + "learning_rate": 2.3379355834936446e-06, + "loss": 1.2861, + "step": 25022 + }, + { + "epoch": 2.132702633597545, + "grad_norm": 43.745946951703395, + "learning_rate": 2.337515868935519e-06, + "loss": 0.753, + "step": 25023 + }, + { + "epoch": 2.1327878632915707, + "grad_norm": 43.56039139483997, + "learning_rate": 2.3370961805618668e-06, + "loss": 1.4282, + "step": 25024 + }, + { + "epoch": 2.132873092985596, + "grad_norm": 45.10466297974985, + "learning_rate": 2.336676518376816e-06, + "loss": 1.1909, + "step": 25025 + }, + { + "epoch": 2.1329583226796216, + "grad_norm": 63.97305270184909, + "learning_rate": 2.336256882384493e-06, + "loss": 2.3003, + "step": 25026 + }, + { + "epoch": 2.133043552373647, + "grad_norm": 49.520167283024556, + "learning_rate": 2.335837272589023e-06, + "loss": 1.5211, + "step": 25027 + }, + { + "epoch": 2.1331287820676725, + "grad_norm": 26.039374906697073, + "learning_rate": 2.335417688994535e-06, + "loss": 0.7126, + "step": 25028 + }, + { + "epoch": 2.133214011761698, + "grad_norm": 37.34043265552762, + "learning_rate": 2.3349981316051574e-06, + "loss": 0.9643, + "step": 25029 + }, + { + "epoch": 2.133299241455723, + "grad_norm": 41.82738817660475, + "learning_rate": 2.3345786004250135e-06, + "loss": 1.5531, + "step": 25030 + }, + { + "epoch": 2.1333844711497485, + "grad_norm": 35.00644519325649, + "learning_rate": 2.3341590954582287e-06, + "loss": 1.2911, + "step": 25031 + }, + { + "epoch": 2.133469700843774, + "grad_norm": 37.220360695605805, + "learning_rate": 2.3337396167089318e-06, + "loss": 0.9222, + "step": 25032 + }, + { + "epoch": 2.1335549305377994, + "grad_norm": 72.47428726007614, + "learning_rate": 2.333320164181246e-06, + "loss": 1.1678, + "step": 25033 + }, + { + "epoch": 2.133640160231825, + "grad_norm": 42.714899654019824, + "learning_rate": 2.3329007378792966e-06, + "loss": 1.3062, + "step": 25034 + }, + { + "epoch": 2.1337253899258504, + "grad_norm": 40.17623358347998, + "learning_rate": 2.332481337807207e-06, + "loss": 1.3435, + "step": 25035 + }, + { + "epoch": 2.1338106196198754, + "grad_norm": 49.7832820213169, + "learning_rate": 2.3320619639691034e-06, + "loss": 1.7353, + "step": 25036 + }, + { + "epoch": 2.133895849313901, + "grad_norm": 65.78894178312831, + "learning_rate": 2.3316426163691127e-06, + "loss": 1.7975, + "step": 25037 + }, + { + "epoch": 2.1339810790079263, + "grad_norm": 85.71238650842038, + "learning_rate": 2.3312232950113555e-06, + "loss": 2.1882, + "step": 25038 + }, + { + "epoch": 2.134066308701952, + "grad_norm": 30.666480042617355, + "learning_rate": 2.3308039998999556e-06, + "loss": 0.9988, + "step": 25039 + }, + { + "epoch": 2.1341515383959773, + "grad_norm": 59.780245192554574, + "learning_rate": 2.3303847310390393e-06, + "loss": 1.6378, + "step": 25040 + }, + { + "epoch": 2.1342367680900027, + "grad_norm": 53.3951390601064, + "learning_rate": 2.3299654884327283e-06, + "loss": 1.5714, + "step": 25041 + }, + { + "epoch": 2.1343219977840278, + "grad_norm": 35.499397408193914, + "learning_rate": 2.329546272085146e-06, + "loss": 1.2695, + "step": 25042 + }, + { + "epoch": 2.1344072274780532, + "grad_norm": 29.744247275529652, + "learning_rate": 2.329127082000413e-06, + "loss": 1.1214, + "step": 25043 + }, + { + "epoch": 2.1344924571720787, + "grad_norm": 33.12979952985172, + "learning_rate": 2.3287079181826537e-06, + "loss": 1.1635, + "step": 25044 + }, + { + "epoch": 2.134577686866104, + "grad_norm": 35.74415717573379, + "learning_rate": 2.3282887806359928e-06, + "loss": 1.1002, + "step": 25045 + }, + { + "epoch": 2.1346629165601296, + "grad_norm": 31.738831018202124, + "learning_rate": 2.3278696693645496e-06, + "loss": 1.0564, + "step": 25046 + }, + { + "epoch": 2.134748146254155, + "grad_norm": 36.89994656042701, + "learning_rate": 2.3274505843724472e-06, + "loss": 1.5578, + "step": 25047 + }, + { + "epoch": 2.1348333759481806, + "grad_norm": 36.34214558196718, + "learning_rate": 2.327031525663804e-06, + "loss": 1.6122, + "step": 25048 + }, + { + "epoch": 2.1349186056422056, + "grad_norm": 36.459816169661046, + "learning_rate": 2.3266124932427454e-06, + "loss": 0.8917, + "step": 25049 + }, + { + "epoch": 2.135003835336231, + "grad_norm": 34.11906211243253, + "learning_rate": 2.3261934871133907e-06, + "loss": 0.6118, + "step": 25050 + }, + { + "epoch": 2.1350890650302565, + "grad_norm": 32.80759066831928, + "learning_rate": 2.325774507279858e-06, + "loss": 1.0873, + "step": 25051 + }, + { + "epoch": 2.135174294724282, + "grad_norm": 30.02415096549253, + "learning_rate": 2.3253555537462714e-06, + "loss": 0.9442, + "step": 25052 + }, + { + "epoch": 2.1352595244183075, + "grad_norm": 46.763896450245454, + "learning_rate": 2.3249366265167516e-06, + "loss": 1.2791, + "step": 25053 + }, + { + "epoch": 2.135344754112333, + "grad_norm": 64.88272878737824, + "learning_rate": 2.3245177255954178e-06, + "loss": 1.8372, + "step": 25054 + }, + { + "epoch": 2.135429983806358, + "grad_norm": 51.235066215135376, + "learning_rate": 2.3240988509863883e-06, + "loss": 1.5854, + "step": 25055 + }, + { + "epoch": 2.1355152135003834, + "grad_norm": 44.09597267325198, + "learning_rate": 2.3236800026937835e-06, + "loss": 1.4573, + "step": 25056 + }, + { + "epoch": 2.135600443194409, + "grad_norm": 34.499638291090044, + "learning_rate": 2.3232611807217204e-06, + "loss": 1.362, + "step": 25057 + }, + { + "epoch": 2.1356856728884344, + "grad_norm": 33.234914247436485, + "learning_rate": 2.3228423850743216e-06, + "loss": 1.2352, + "step": 25058 + }, + { + "epoch": 2.13577090258246, + "grad_norm": 24.900699846279554, + "learning_rate": 2.3224236157557027e-06, + "loss": 0.9248, + "step": 25059 + }, + { + "epoch": 2.1358561322764853, + "grad_norm": 69.96895478840241, + "learning_rate": 2.322004872769985e-06, + "loss": 2.1666, + "step": 25060 + }, + { + "epoch": 2.1359413619705103, + "grad_norm": 55.27904895035512, + "learning_rate": 2.321586156121284e-06, + "loss": 1.1216, + "step": 25061 + }, + { + "epoch": 2.136026591664536, + "grad_norm": 22.369918191778794, + "learning_rate": 2.32116746581372e-06, + "loss": 0.8974, + "step": 25062 + }, + { + "epoch": 2.1361118213585613, + "grad_norm": 45.47151156849211, + "learning_rate": 2.32074880185141e-06, + "loss": 1.6188, + "step": 25063 + }, + { + "epoch": 2.1361970510525867, + "grad_norm": 68.66280934108666, + "learning_rate": 2.3203301642384713e-06, + "loss": 1.9505, + "step": 25064 + }, + { + "epoch": 2.136282280746612, + "grad_norm": 50.191417356949, + "learning_rate": 2.3199115529790185e-06, + "loss": 1.5879, + "step": 25065 + }, + { + "epoch": 2.1363675104406377, + "grad_norm": 27.278510640786955, + "learning_rate": 2.319492968077172e-06, + "loss": 0.6337, + "step": 25066 + }, + { + "epoch": 2.136452740134663, + "grad_norm": 53.9952590115279, + "learning_rate": 2.3190744095370453e-06, + "loss": 1.5341, + "step": 25067 + }, + { + "epoch": 2.136537969828688, + "grad_norm": 83.33608658299761, + "learning_rate": 2.3186558773627587e-06, + "loss": 1.6475, + "step": 25068 + }, + { + "epoch": 2.1366231995227136, + "grad_norm": 64.65599792623624, + "learning_rate": 2.318237371558424e-06, + "loss": 2.374, + "step": 25069 + }, + { + "epoch": 2.136708429216739, + "grad_norm": 62.79073738814047, + "learning_rate": 2.3178188921281607e-06, + "loss": 1.5643, + "step": 25070 + }, + { + "epoch": 2.1367936589107646, + "grad_norm": 41.38112056150194, + "learning_rate": 2.3174004390760835e-06, + "loss": 0.932, + "step": 25071 + }, + { + "epoch": 2.13687888860479, + "grad_norm": 44.58491196028539, + "learning_rate": 2.3169820124063064e-06, + "loss": 1.1274, + "step": 25072 + }, + { + "epoch": 2.1369641182988155, + "grad_norm": 52.515979763468444, + "learning_rate": 2.3165636121229453e-06, + "loss": 1.848, + "step": 25073 + }, + { + "epoch": 2.1370493479928405, + "grad_norm": 35.681923387155976, + "learning_rate": 2.316145238230113e-06, + "loss": 0.8765, + "step": 25074 + }, + { + "epoch": 2.137134577686866, + "grad_norm": 99.4833199066121, + "learning_rate": 2.315726890731926e-06, + "loss": 2.3793, + "step": 25075 + }, + { + "epoch": 2.1372198073808915, + "grad_norm": 78.4151334601345, + "learning_rate": 2.3153085696324996e-06, + "loss": 2.1392, + "step": 25076 + }, + { + "epoch": 2.137305037074917, + "grad_norm": 22.336423838186537, + "learning_rate": 2.314890274935947e-06, + "loss": 0.7863, + "step": 25077 + }, + { + "epoch": 2.1373902667689424, + "grad_norm": 37.14141656632054, + "learning_rate": 2.31447200664638e-06, + "loss": 1.1064, + "step": 25078 + }, + { + "epoch": 2.137475496462968, + "grad_norm": 33.47912350417538, + "learning_rate": 2.314053764767915e-06, + "loss": 1.5317, + "step": 25079 + }, + { + "epoch": 2.137560726156993, + "grad_norm": 45.09058013200484, + "learning_rate": 2.313635549304665e-06, + "loss": 1.6479, + "step": 25080 + }, + { + "epoch": 2.1376459558510184, + "grad_norm": 34.88517777899879, + "learning_rate": 2.3132173602607405e-06, + "loss": 1.2314, + "step": 25081 + }, + { + "epoch": 2.137731185545044, + "grad_norm": 32.44569898764607, + "learning_rate": 2.312799197640255e-06, + "loss": 1.2994, + "step": 25082 + }, + { + "epoch": 2.1378164152390693, + "grad_norm": 78.66628035596591, + "learning_rate": 2.312381061447321e-06, + "loss": 2.082, + "step": 25083 + }, + { + "epoch": 2.1379016449330948, + "grad_norm": 62.03864361213036, + "learning_rate": 2.311962951686053e-06, + "loss": 1.6832, + "step": 25084 + }, + { + "epoch": 2.1379868746271202, + "grad_norm": 63.011398360178354, + "learning_rate": 2.311544868360561e-06, + "loss": 1.4987, + "step": 25085 + }, + { + "epoch": 2.1380721043211457, + "grad_norm": 45.168099600750814, + "learning_rate": 2.3111268114749578e-06, + "loss": 1.7093, + "step": 25086 + }, + { + "epoch": 2.1381573340151707, + "grad_norm": 59.03566047468053, + "learning_rate": 2.3107087810333517e-06, + "loss": 1.7829, + "step": 25087 + }, + { + "epoch": 2.138242563709196, + "grad_norm": 49.0261882029684, + "learning_rate": 2.310290777039858e-06, + "loss": 1.2646, + "step": 25088 + }, + { + "epoch": 2.1383277934032217, + "grad_norm": 41.87258907163612, + "learning_rate": 2.309872799498586e-06, + "loss": 0.7395, + "step": 25089 + }, + { + "epoch": 2.138413023097247, + "grad_norm": 93.33478027961351, + "learning_rate": 2.3094548484136437e-06, + "loss": 2.0518, + "step": 25090 + }, + { + "epoch": 2.1384982527912726, + "grad_norm": 41.784485811178044, + "learning_rate": 2.3090369237891442e-06, + "loss": 1.3197, + "step": 25091 + }, + { + "epoch": 2.138583482485298, + "grad_norm": 34.13684320891781, + "learning_rate": 2.3086190256291986e-06, + "loss": 1.1266, + "step": 25092 + }, + { + "epoch": 2.138668712179323, + "grad_norm": 55.62531095308953, + "learning_rate": 2.3082011539379162e-06, + "loss": 1.7766, + "step": 25093 + }, + { + "epoch": 2.1387539418733486, + "grad_norm": 34.6149223166359, + "learning_rate": 2.307783308719405e-06, + "loss": 0.9474, + "step": 25094 + }, + { + "epoch": 2.138839171567374, + "grad_norm": 66.32583539232658, + "learning_rate": 2.3073654899777744e-06, + "loss": 2.324, + "step": 25095 + }, + { + "epoch": 2.1389244012613995, + "grad_norm": 29.491627401088248, + "learning_rate": 2.306947697717135e-06, + "loss": 0.9275, + "step": 25096 + }, + { + "epoch": 2.139009630955425, + "grad_norm": 46.22403259325597, + "learning_rate": 2.306529931941596e-06, + "loss": 1.6896, + "step": 25097 + }, + { + "epoch": 2.1390948606494504, + "grad_norm": 49.18007108332412, + "learning_rate": 2.3061121926552617e-06, + "loss": 1.2512, + "step": 25098 + }, + { + "epoch": 2.1391800903434754, + "grad_norm": 28.808248972638452, + "learning_rate": 2.305694479862246e-06, + "loss": 0.8262, + "step": 25099 + }, + { + "epoch": 2.139265320037501, + "grad_norm": 42.971250029022116, + "learning_rate": 2.3052767935666525e-06, + "loss": 1.107, + "step": 25100 + }, + { + "epoch": 2.1393505497315264, + "grad_norm": 49.227734372568094, + "learning_rate": 2.3048591337725936e-06, + "loss": 1.5069, + "step": 25101 + }, + { + "epoch": 2.139435779425552, + "grad_norm": 29.29712656754526, + "learning_rate": 2.304441500484173e-06, + "loss": 0.4864, + "step": 25102 + }, + { + "epoch": 2.1395210091195773, + "grad_norm": 54.84939341211309, + "learning_rate": 2.3040238937054997e-06, + "loss": 2.2639, + "step": 25103 + }, + { + "epoch": 2.139606238813603, + "grad_norm": 32.51244847523586, + "learning_rate": 2.3036063134406784e-06, + "loss": 0.8648, + "step": 25104 + }, + { + "epoch": 2.1396914685076283, + "grad_norm": 34.65985292548534, + "learning_rate": 2.3031887596938197e-06, + "loss": 0.8507, + "step": 25105 + }, + { + "epoch": 2.1397766982016533, + "grad_norm": 77.07104102966251, + "learning_rate": 2.3027712324690255e-06, + "loss": 2.2202, + "step": 25106 + }, + { + "epoch": 2.1398619278956788, + "grad_norm": 48.65787873675443, + "learning_rate": 2.3023537317704063e-06, + "loss": 1.2108, + "step": 25107 + }, + { + "epoch": 2.139947157589704, + "grad_norm": 61.300998762798216, + "learning_rate": 2.3019362576020643e-06, + "loss": 1.9784, + "step": 25108 + }, + { + "epoch": 2.1400323872837297, + "grad_norm": 247.1032981874864, + "learning_rate": 2.30151880996811e-06, + "loss": 2.1083, + "step": 25109 + }, + { + "epoch": 2.140117616977755, + "grad_norm": 77.86522475004776, + "learning_rate": 2.3011013888726445e-06, + "loss": 2.3119, + "step": 25110 + }, + { + "epoch": 2.1402028466717806, + "grad_norm": 58.713790706954306, + "learning_rate": 2.3006839943197753e-06, + "loss": 1.5308, + "step": 25111 + }, + { + "epoch": 2.1402880763658056, + "grad_norm": 37.990029096403816, + "learning_rate": 2.3002666263136043e-06, + "loss": 1.0464, + "step": 25112 + }, + { + "epoch": 2.140373306059831, + "grad_norm": 36.66870214223842, + "learning_rate": 2.29984928485824e-06, + "loss": 0.9094, + "step": 25113 + }, + { + "epoch": 2.1404585357538566, + "grad_norm": 59.48302860015722, + "learning_rate": 2.299431969957783e-06, + "loss": 1.2109, + "step": 25114 + }, + { + "epoch": 2.140543765447882, + "grad_norm": 67.6286894133101, + "learning_rate": 2.2990146816163415e-06, + "loss": 1.5952, + "step": 25115 + }, + { + "epoch": 2.1406289951419075, + "grad_norm": 47.302835081196186, + "learning_rate": 2.298597419838018e-06, + "loss": 1.6756, + "step": 25116 + }, + { + "epoch": 2.140714224835933, + "grad_norm": 42.183297211114564, + "learning_rate": 2.298180184626913e-06, + "loss": 1.5768, + "step": 25117 + }, + { + "epoch": 2.140799454529958, + "grad_norm": 39.2737519570709, + "learning_rate": 2.2977629759871338e-06, + "loss": 0.8725, + "step": 25118 + }, + { + "epoch": 2.1408846842239835, + "grad_norm": 79.31414170817627, + "learning_rate": 2.2973457939227827e-06, + "loss": 2.5959, + "step": 25119 + }, + { + "epoch": 2.140969913918009, + "grad_norm": 57.232315520212296, + "learning_rate": 2.2969286384379612e-06, + "loss": 1.8021, + "step": 25120 + }, + { + "epoch": 2.1410551436120344, + "grad_norm": 61.11646363468754, + "learning_rate": 2.296511509536771e-06, + "loss": 1.7426, + "step": 25121 + }, + { + "epoch": 2.14114037330606, + "grad_norm": 29.899174685229248, + "learning_rate": 2.2960944072233156e-06, + "loss": 0.729, + "step": 25122 + }, + { + "epoch": 2.1412256030000854, + "grad_norm": 24.87720785597089, + "learning_rate": 2.2956773315016997e-06, + "loss": 0.8952, + "step": 25123 + }, + { + "epoch": 2.141310832694111, + "grad_norm": 50.690101091926756, + "learning_rate": 2.2952602823760216e-06, + "loss": 1.8086, + "step": 25124 + }, + { + "epoch": 2.141396062388136, + "grad_norm": 55.804527899624865, + "learning_rate": 2.294843259850383e-06, + "loss": 1.3936, + "step": 25125 + }, + { + "epoch": 2.1414812920821613, + "grad_norm": 40.38808091946938, + "learning_rate": 2.2944262639288873e-06, + "loss": 0.9347, + "step": 25126 + }, + { + "epoch": 2.141566521776187, + "grad_norm": 32.76818304458958, + "learning_rate": 2.2940092946156337e-06, + "loss": 1.283, + "step": 25127 + }, + { + "epoch": 2.1416517514702123, + "grad_norm": 60.031317781571765, + "learning_rate": 2.2935923519147247e-06, + "loss": 1.7439, + "step": 25128 + }, + { + "epoch": 2.1417369811642377, + "grad_norm": 33.27992347960136, + "learning_rate": 2.2931754358302573e-06, + "loss": 1.0754, + "step": 25129 + }, + { + "epoch": 2.141822210858263, + "grad_norm": 59.58395468429406, + "learning_rate": 2.2927585463663333e-06, + "loss": 1.6865, + "step": 25130 + }, + { + "epoch": 2.141907440552288, + "grad_norm": 67.45924732763437, + "learning_rate": 2.292341683527055e-06, + "loss": 2.2405, + "step": 25131 + }, + { + "epoch": 2.1419926702463137, + "grad_norm": 58.53567899400964, + "learning_rate": 2.291924847316521e-06, + "loss": 1.9443, + "step": 25132 + }, + { + "epoch": 2.142077899940339, + "grad_norm": 54.58416656418823, + "learning_rate": 2.29150803773883e-06, + "loss": 2.3417, + "step": 25133 + }, + { + "epoch": 2.1421631296343646, + "grad_norm": 59.603597828553795, + "learning_rate": 2.291091254798079e-06, + "loss": 1.8237, + "step": 25134 + }, + { + "epoch": 2.14224835932839, + "grad_norm": 43.90770623782047, + "learning_rate": 2.290674498498371e-06, + "loss": 1.2464, + "step": 25135 + }, + { + "epoch": 2.1423335890224156, + "grad_norm": 42.431273279945614, + "learning_rate": 2.290257768843803e-06, + "loss": 0.9256, + "step": 25136 + }, + { + "epoch": 2.1424188187164406, + "grad_norm": 62.9583211813389, + "learning_rate": 2.289841065838471e-06, + "loss": 2.5307, + "step": 25137 + }, + { + "epoch": 2.142504048410466, + "grad_norm": 80.64927775555327, + "learning_rate": 2.2894243894864755e-06, + "loss": 1.8974, + "step": 25138 + }, + { + "epoch": 2.1425892781044915, + "grad_norm": 56.155104578808746, + "learning_rate": 2.289007739791915e-06, + "loss": 1.0974, + "step": 25139 + }, + { + "epoch": 2.142674507798517, + "grad_norm": 51.351777917418914, + "learning_rate": 2.288591116758887e-06, + "loss": 1.3373, + "step": 25140 + }, + { + "epoch": 2.1427597374925424, + "grad_norm": 67.6515400065844, + "learning_rate": 2.288174520391488e-06, + "loss": 2.1831, + "step": 25141 + }, + { + "epoch": 2.142844967186568, + "grad_norm": 46.8143616781487, + "learning_rate": 2.287757950693813e-06, + "loss": 1.4192, + "step": 25142 + }, + { + "epoch": 2.1429301968805934, + "grad_norm": 19.423896302073192, + "learning_rate": 2.2873414076699625e-06, + "loss": 0.8972, + "step": 25143 + }, + { + "epoch": 2.1430154265746184, + "grad_norm": 52.924981411037315, + "learning_rate": 2.2869248913240314e-06, + "loss": 0.7106, + "step": 25144 + }, + { + "epoch": 2.143100656268644, + "grad_norm": 45.398162495721344, + "learning_rate": 2.2865084016601143e-06, + "loss": 1.4037, + "step": 25145 + }, + { + "epoch": 2.1431858859626693, + "grad_norm": 56.74469695918975, + "learning_rate": 2.2860919386823107e-06, + "loss": 1.4151, + "step": 25146 + }, + { + "epoch": 2.143271115656695, + "grad_norm": 29.545404667837374, + "learning_rate": 2.285675502394712e-06, + "loss": 1.0506, + "step": 25147 + }, + { + "epoch": 2.1433563453507203, + "grad_norm": 51.215009201253615, + "learning_rate": 2.2852590928014183e-06, + "loss": 1.6215, + "step": 25148 + }, + { + "epoch": 2.1434415750447458, + "grad_norm": 45.42611084393216, + "learning_rate": 2.2848427099065234e-06, + "loss": 1.1399, + "step": 25149 + }, + { + "epoch": 2.143526804738771, + "grad_norm": 81.18797274819858, + "learning_rate": 2.284426353714121e-06, + "loss": 1.8548, + "step": 25150 + }, + { + "epoch": 2.1436120344327962, + "grad_norm": 41.77220420482036, + "learning_rate": 2.284010024228304e-06, + "loss": 1.4654, + "step": 25151 + }, + { + "epoch": 2.1436972641268217, + "grad_norm": 72.97302692774444, + "learning_rate": 2.2835937214531715e-06, + "loss": 2.0272, + "step": 25152 + }, + { + "epoch": 2.143782493820847, + "grad_norm": 68.34791272934262, + "learning_rate": 2.283177445392814e-06, + "loss": 1.3324, + "step": 25153 + }, + { + "epoch": 2.1438677235148726, + "grad_norm": 71.87149309643459, + "learning_rate": 2.282761196051328e-06, + "loss": 2.1544, + "step": 25154 + }, + { + "epoch": 2.143952953208898, + "grad_norm": 73.57254224721467, + "learning_rate": 2.282344973432804e-06, + "loss": 2.8628, + "step": 25155 + }, + { + "epoch": 2.1440381829029236, + "grad_norm": 44.852545451223634, + "learning_rate": 2.2819287775413394e-06, + "loss": 1.0027, + "step": 25156 + }, + { + "epoch": 2.1441234125969486, + "grad_norm": 35.42021195651605, + "learning_rate": 2.281512608381025e-06, + "loss": 1.1562, + "step": 25157 + }, + { + "epoch": 2.144208642290974, + "grad_norm": 40.61162074491344, + "learning_rate": 2.2810964659559544e-06, + "loss": 1.3958, + "step": 25158 + }, + { + "epoch": 2.1442938719849995, + "grad_norm": 43.660240956294025, + "learning_rate": 2.280680350270219e-06, + "loss": 1.5479, + "step": 25159 + }, + { + "epoch": 2.144379101679025, + "grad_norm": 53.98008113150252, + "learning_rate": 2.280264261327911e-06, + "loss": 1.551, + "step": 25160 + }, + { + "epoch": 2.1444643313730505, + "grad_norm": 31.426937698651447, + "learning_rate": 2.2798481991331225e-06, + "loss": 0.9629, + "step": 25161 + }, + { + "epoch": 2.144549561067076, + "grad_norm": 56.554743841805305, + "learning_rate": 2.2794321636899473e-06, + "loss": 1.8605, + "step": 25162 + }, + { + "epoch": 2.144634790761101, + "grad_norm": 108.88658460904726, + "learning_rate": 2.279016155002477e-06, + "loss": 2.4453, + "step": 25163 + }, + { + "epoch": 2.1447200204551264, + "grad_norm": 51.508179889588675, + "learning_rate": 2.2786001730747987e-06, + "loss": 1.6244, + "step": 25164 + }, + { + "epoch": 2.144805250149152, + "grad_norm": 81.10670790964257, + "learning_rate": 2.278184217911009e-06, + "loss": 1.4651, + "step": 25165 + }, + { + "epoch": 2.1448904798431774, + "grad_norm": 66.43269952678533, + "learning_rate": 2.277768289515195e-06, + "loss": 1.6214, + "step": 25166 + }, + { + "epoch": 2.144975709537203, + "grad_norm": 71.64352124169325, + "learning_rate": 2.2773523878914494e-06, + "loss": 1.8925, + "step": 25167 + }, + { + "epoch": 2.1450609392312283, + "grad_norm": 31.741031362670576, + "learning_rate": 2.2769365130438585e-06, + "loss": 1.236, + "step": 25168 + }, + { + "epoch": 2.145146168925254, + "grad_norm": 49.0991322278946, + "learning_rate": 2.276520664976515e-06, + "loss": 1.8584, + "step": 25169 + }, + { + "epoch": 2.145231398619279, + "grad_norm": 69.36447315890852, + "learning_rate": 2.2761048436935108e-06, + "loss": 1.7742, + "step": 25170 + }, + { + "epoch": 2.1453166283133043, + "grad_norm": 64.60596548338191, + "learning_rate": 2.2756890491989325e-06, + "loss": 1.7788, + "step": 25171 + }, + { + "epoch": 2.1454018580073297, + "grad_norm": 35.389524153793616, + "learning_rate": 2.2752732814968704e-06, + "loss": 1.0198, + "step": 25172 + }, + { + "epoch": 2.145487087701355, + "grad_norm": 34.50849804431709, + "learning_rate": 2.274857540591411e-06, + "loss": 0.9659, + "step": 25173 + }, + { + "epoch": 2.1455723173953807, + "grad_norm": 39.26227436873362, + "learning_rate": 2.2744418264866464e-06, + "loss": 1.6318, + "step": 25174 + }, + { + "epoch": 2.145657547089406, + "grad_norm": 53.63230676106079, + "learning_rate": 2.2740261391866634e-06, + "loss": 1.6791, + "step": 25175 + }, + { + "epoch": 2.145742776783431, + "grad_norm": 48.98025414708992, + "learning_rate": 2.2736104786955485e-06, + "loss": 1.5359, + "step": 25176 + }, + { + "epoch": 2.1458280064774566, + "grad_norm": 33.039853672796845, + "learning_rate": 2.2731948450173912e-06, + "loss": 0.9993, + "step": 25177 + }, + { + "epoch": 2.145913236171482, + "grad_norm": 54.542197724724815, + "learning_rate": 2.272779238156281e-06, + "loss": 1.7226, + "step": 25178 + }, + { + "epoch": 2.1459984658655076, + "grad_norm": 70.78669173694118, + "learning_rate": 2.272363658116303e-06, + "loss": 2.0803, + "step": 25179 + }, + { + "epoch": 2.146083695559533, + "grad_norm": 86.18036242499939, + "learning_rate": 2.2719481049015444e-06, + "loss": 2.1741, + "step": 25180 + }, + { + "epoch": 2.1461689252535585, + "grad_norm": 19.50945216466981, + "learning_rate": 2.271532578516091e-06, + "loss": 0.9483, + "step": 25181 + }, + { + "epoch": 2.1462541549475835, + "grad_norm": 35.39104372091857, + "learning_rate": 2.271117078964032e-06, + "loss": 0.971, + "step": 25182 + }, + { + "epoch": 2.146339384641609, + "grad_norm": 35.469286674756034, + "learning_rate": 2.270701606249452e-06, + "loss": 1.4464, + "step": 25183 + }, + { + "epoch": 2.1464246143356345, + "grad_norm": 30.654693739429316, + "learning_rate": 2.2702861603764353e-06, + "loss": 0.8647, + "step": 25184 + }, + { + "epoch": 2.14650984402966, + "grad_norm": 21.68224031647609, + "learning_rate": 2.2698707413490696e-06, + "loss": 0.6677, + "step": 25185 + }, + { + "epoch": 2.1465950737236854, + "grad_norm": 44.391439984888926, + "learning_rate": 2.269455349171442e-06, + "loss": 1.3141, + "step": 25186 + }, + { + "epoch": 2.146680303417711, + "grad_norm": 25.09491621703481, + "learning_rate": 2.269039983847637e-06, + "loss": 0.7043, + "step": 25187 + }, + { + "epoch": 2.1467655331117363, + "grad_norm": 35.849828667936954, + "learning_rate": 2.268624645381737e-06, + "loss": 1.6192, + "step": 25188 + }, + { + "epoch": 2.1468507628057614, + "grad_norm": 54.79174752073205, + "learning_rate": 2.2682093337778295e-06, + "loss": 1.2619, + "step": 25189 + }, + { + "epoch": 2.146935992499787, + "grad_norm": 53.97028916518649, + "learning_rate": 2.2677940490399957e-06, + "loss": 1.6183, + "step": 25190 + }, + { + "epoch": 2.1470212221938123, + "grad_norm": 42.050293983522046, + "learning_rate": 2.2673787911723233e-06, + "loss": 1.5202, + "step": 25191 + }, + { + "epoch": 2.1471064518878378, + "grad_norm": 46.570224885401274, + "learning_rate": 2.2669635601788925e-06, + "loss": 1.3774, + "step": 25192 + }, + { + "epoch": 2.1471916815818632, + "grad_norm": 47.38393443382689, + "learning_rate": 2.266548356063791e-06, + "loss": 1.3548, + "step": 25193 + }, + { + "epoch": 2.1472769112758887, + "grad_norm": 58.51014974264085, + "learning_rate": 2.266133178831099e-06, + "loss": 1.9649, + "step": 25194 + }, + { + "epoch": 2.1473621409699137, + "grad_norm": 46.6837502541533, + "learning_rate": 2.265718028484903e-06, + "loss": 1.8874, + "step": 25195 + }, + { + "epoch": 2.147447370663939, + "grad_norm": 63.69303559717781, + "learning_rate": 2.2653029050292828e-06, + "loss": 2.1032, + "step": 25196 + }, + { + "epoch": 2.1475326003579647, + "grad_norm": 58.234232165992026, + "learning_rate": 2.2648878084683223e-06, + "loss": 1.4111, + "step": 25197 + }, + { + "epoch": 2.14761783005199, + "grad_norm": 34.593367788318595, + "learning_rate": 2.264472738806101e-06, + "loss": 1.4001, + "step": 25198 + }, + { + "epoch": 2.1477030597460156, + "grad_norm": 30.108000153873018, + "learning_rate": 2.2640576960467057e-06, + "loss": 1.0751, + "step": 25199 + }, + { + "epoch": 2.147788289440041, + "grad_norm": 43.941798280503676, + "learning_rate": 2.263642680194214e-06, + "loss": 1.6127, + "step": 25200 + }, + { + "epoch": 2.147873519134066, + "grad_norm": 64.04980185395473, + "learning_rate": 2.2632276912527105e-06, + "loss": 2.021, + "step": 25201 + }, + { + "epoch": 2.1479587488280916, + "grad_norm": 71.39702502912655, + "learning_rate": 2.2628127292262756e-06, + "loss": 1.7401, + "step": 25202 + }, + { + "epoch": 2.148043978522117, + "grad_norm": 72.9426030496767, + "learning_rate": 2.2623977941189886e-06, + "loss": 1.8079, + "step": 25203 + }, + { + "epoch": 2.1481292082161425, + "grad_norm": 46.993809540595066, + "learning_rate": 2.2619828859349323e-06, + "loss": 1.5454, + "step": 25204 + }, + { + "epoch": 2.148214437910168, + "grad_norm": 48.895427115049294, + "learning_rate": 2.2615680046781867e-06, + "loss": 1.2972, + "step": 25205 + }, + { + "epoch": 2.1482996676041934, + "grad_norm": 38.566375194571386, + "learning_rate": 2.2611531503528317e-06, + "loss": 1.0709, + "step": 25206 + }, + { + "epoch": 2.148384897298219, + "grad_norm": 36.59663211074046, + "learning_rate": 2.2607383229629453e-06, + "loss": 0.7227, + "step": 25207 + }, + { + "epoch": 2.148470126992244, + "grad_norm": 25.120001688945322, + "learning_rate": 2.2603235225126093e-06, + "loss": 0.8974, + "step": 25208 + }, + { + "epoch": 2.1485553566862694, + "grad_norm": 66.03298301293617, + "learning_rate": 2.2599087490059043e-06, + "loss": 1.6048, + "step": 25209 + }, + { + "epoch": 2.148640586380295, + "grad_norm": 62.387161518333016, + "learning_rate": 2.259494002446908e-06, + "loss": 1.6357, + "step": 25210 + }, + { + "epoch": 2.1487258160743203, + "grad_norm": 50.52787111505378, + "learning_rate": 2.2590792828396973e-06, + "loss": 2.2075, + "step": 25211 + }, + { + "epoch": 2.148811045768346, + "grad_norm": 66.48952004386305, + "learning_rate": 2.258664590188355e-06, + "loss": 1.671, + "step": 25212 + }, + { + "epoch": 2.1488962754623713, + "grad_norm": 10.971046162423681, + "learning_rate": 2.2582499244969564e-06, + "loss": 0.2525, + "step": 25213 + }, + { + "epoch": 2.1489815051563963, + "grad_norm": 28.14679854817596, + "learning_rate": 2.2578352857695805e-06, + "loss": 0.7901, + "step": 25214 + }, + { + "epoch": 2.1490667348504218, + "grad_norm": 65.10108504169749, + "learning_rate": 2.2574206740103037e-06, + "loss": 1.8984, + "step": 25215 + }, + { + "epoch": 2.1491519645444472, + "grad_norm": 50.619893487202816, + "learning_rate": 2.2570060892232043e-06, + "loss": 0.9833, + "step": 25216 + }, + { + "epoch": 2.1492371942384727, + "grad_norm": 66.28975699218127, + "learning_rate": 2.256591531412362e-06, + "loss": 1.9227, + "step": 25217 + }, + { + "epoch": 2.149322423932498, + "grad_norm": 46.789517264140514, + "learning_rate": 2.2561770005818516e-06, + "loss": 1.8959, + "step": 25218 + }, + { + "epoch": 2.1494076536265236, + "grad_norm": 43.427431964091966, + "learning_rate": 2.2557624967357506e-06, + "loss": 1.2803, + "step": 25219 + }, + { + "epoch": 2.1494928833205487, + "grad_norm": 29.372036007734664, + "learning_rate": 2.255348019878133e-06, + "loss": 0.8497, + "step": 25220 + }, + { + "epoch": 2.149578113014574, + "grad_norm": 62.42858803371541, + "learning_rate": 2.254933570013079e-06, + "loss": 1.5195, + "step": 25221 + }, + { + "epoch": 2.1496633427085996, + "grad_norm": 55.114847618764074, + "learning_rate": 2.254519147144662e-06, + "loss": 1.7235, + "step": 25222 + }, + { + "epoch": 2.149748572402625, + "grad_norm": 75.70984448655508, + "learning_rate": 2.2541047512769566e-06, + "loss": 2.2838, + "step": 25223 + }, + { + "epoch": 2.1498338020966505, + "grad_norm": 52.73724280282297, + "learning_rate": 2.25369038241404e-06, + "loss": 1.6063, + "step": 25224 + }, + { + "epoch": 2.149919031790676, + "grad_norm": 74.5339317026414, + "learning_rate": 2.253276040559989e-06, + "loss": 1.9979, + "step": 25225 + }, + { + "epoch": 2.1500042614847015, + "grad_norm": 45.262413179301284, + "learning_rate": 2.252861725718877e-06, + "loss": 1.2067, + "step": 25226 + }, + { + "epoch": 2.1500894911787265, + "grad_norm": 29.88125856741623, + "learning_rate": 2.252447437894778e-06, + "loss": 1.4918, + "step": 25227 + }, + { + "epoch": 2.150174720872752, + "grad_norm": 36.16770971917006, + "learning_rate": 2.2520331770917648e-06, + "loss": 0.9948, + "step": 25228 + }, + { + "epoch": 2.1502599505667774, + "grad_norm": 60.4589772785431, + "learning_rate": 2.251618943313915e-06, + "loss": 1.6433, + "step": 25229 + }, + { + "epoch": 2.150345180260803, + "grad_norm": 44.483529633374886, + "learning_rate": 2.2512047365653013e-06, + "loss": 1.4917, + "step": 25230 + }, + { + "epoch": 2.1504304099548284, + "grad_norm": 25.906534161779646, + "learning_rate": 2.2507905568499948e-06, + "loss": 1.0333, + "step": 25231 + }, + { + "epoch": 2.150515639648854, + "grad_norm": 56.828483166461595, + "learning_rate": 2.2503764041720727e-06, + "loss": 1.1401, + "step": 25232 + }, + { + "epoch": 2.150600869342879, + "grad_norm": 38.102274640951926, + "learning_rate": 2.249962278535604e-06, + "loss": 0.862, + "step": 25233 + }, + { + "epoch": 2.1506860990369043, + "grad_norm": 67.59343042987916, + "learning_rate": 2.249548179944665e-06, + "loss": 1.931, + "step": 25234 + }, + { + "epoch": 2.15077132873093, + "grad_norm": 47.62088875415169, + "learning_rate": 2.2491341084033273e-06, + "loss": 1.2319, + "step": 25235 + }, + { + "epoch": 2.1508565584249553, + "grad_norm": 58.496523545923644, + "learning_rate": 2.2487200639156627e-06, + "loss": 1.3199, + "step": 25236 + }, + { + "epoch": 2.1509417881189807, + "grad_norm": 42.16224090603555, + "learning_rate": 2.24830604648574e-06, + "loss": 1.0234, + "step": 25237 + }, + { + "epoch": 2.151027017813006, + "grad_norm": 24.785965028210242, + "learning_rate": 2.247892056117637e-06, + "loss": 1.1531, + "step": 25238 + }, + { + "epoch": 2.1511122475070312, + "grad_norm": 38.83002596347345, + "learning_rate": 2.24747809281542e-06, + "loss": 1.3614, + "step": 25239 + }, + { + "epoch": 2.1511974772010567, + "grad_norm": 42.60104730448578, + "learning_rate": 2.2470641565831636e-06, + "loss": 0.9561, + "step": 25240 + }, + { + "epoch": 2.151282706895082, + "grad_norm": 53.3312758499419, + "learning_rate": 2.2466502474249357e-06, + "loss": 1.7282, + "step": 25241 + }, + { + "epoch": 2.1513679365891076, + "grad_norm": 56.81835492072887, + "learning_rate": 2.2462363653448105e-06, + "loss": 2.2066, + "step": 25242 + }, + { + "epoch": 2.151453166283133, + "grad_norm": 36.6416945646588, + "learning_rate": 2.245822510346857e-06, + "loss": 1.5352, + "step": 25243 + }, + { + "epoch": 2.1515383959771586, + "grad_norm": 108.5941469145604, + "learning_rate": 2.2454086824351433e-06, + "loss": 1.3389, + "step": 25244 + }, + { + "epoch": 2.151623625671184, + "grad_norm": 23.875132492965648, + "learning_rate": 2.244994881613742e-06, + "loss": 0.8757, + "step": 25245 + }, + { + "epoch": 2.151708855365209, + "grad_norm": 55.441038638039196, + "learning_rate": 2.2445811078867184e-06, + "loss": 1.4168, + "step": 25246 + }, + { + "epoch": 2.1517940850592345, + "grad_norm": 45.04524781980958, + "learning_rate": 2.2441673612581457e-06, + "loss": 1.197, + "step": 25247 + }, + { + "epoch": 2.15187931475326, + "grad_norm": 89.04084851932711, + "learning_rate": 2.2437536417320926e-06, + "loss": 2.0229, + "step": 25248 + }, + { + "epoch": 2.1519645444472855, + "grad_norm": 45.30747835444854, + "learning_rate": 2.2433399493126277e-06, + "loss": 1.5435, + "step": 25249 + }, + { + "epoch": 2.152049774141311, + "grad_norm": 39.124161405578505, + "learning_rate": 2.242926284003818e-06, + "loss": 1.71, + "step": 25250 + }, + { + "epoch": 2.1521350038353364, + "grad_norm": 38.12000751564272, + "learning_rate": 2.2425126458097336e-06, + "loss": 1.1571, + "step": 25251 + }, + { + "epoch": 2.1522202335293614, + "grad_norm": 60.432247802746524, + "learning_rate": 2.2420990347344417e-06, + "loss": 1.623, + "step": 25252 + }, + { + "epoch": 2.152305463223387, + "grad_norm": 23.42547649769452, + "learning_rate": 2.24168545078201e-06, + "loss": 1.3646, + "step": 25253 + }, + { + "epoch": 2.1523906929174124, + "grad_norm": 29.11935747612383, + "learning_rate": 2.2412718939565044e-06, + "loss": 0.967, + "step": 25254 + }, + { + "epoch": 2.152475922611438, + "grad_norm": 57.30939147403891, + "learning_rate": 2.2408583642619934e-06, + "loss": 2.2744, + "step": 25255 + }, + { + "epoch": 2.1525611523054633, + "grad_norm": 55.00876682615817, + "learning_rate": 2.2404448617025454e-06, + "loss": 1.8029, + "step": 25256 + }, + { + "epoch": 2.1526463819994888, + "grad_norm": 42.977450270680954, + "learning_rate": 2.2400313862822257e-06, + "loss": 1.0079, + "step": 25257 + }, + { + "epoch": 2.152731611693514, + "grad_norm": 42.279429893026474, + "learning_rate": 2.2396179380050993e-06, + "loss": 1.2922, + "step": 25258 + }, + { + "epoch": 2.1528168413875393, + "grad_norm": 76.40645665523797, + "learning_rate": 2.2392045168752353e-06, + "loss": 1.747, + "step": 25259 + }, + { + "epoch": 2.1529020710815647, + "grad_norm": 53.7244442776864, + "learning_rate": 2.238791122896698e-06, + "loss": 1.3176, + "step": 25260 + }, + { + "epoch": 2.15298730077559, + "grad_norm": 68.30800547302013, + "learning_rate": 2.238377756073552e-06, + "loss": 2.2147, + "step": 25261 + }, + { + "epoch": 2.1530725304696157, + "grad_norm": 48.80824464359986, + "learning_rate": 2.2379644164098624e-06, + "loss": 1.2548, + "step": 25262 + }, + { + "epoch": 2.153157760163641, + "grad_norm": 19.039633620366043, + "learning_rate": 2.237551103909695e-06, + "loss": 0.6049, + "step": 25263 + }, + { + "epoch": 2.1532429898576666, + "grad_norm": 49.855761105445474, + "learning_rate": 2.2371378185771165e-06, + "loss": 1.2928, + "step": 25264 + }, + { + "epoch": 2.1533282195516916, + "grad_norm": 35.75684199142327, + "learning_rate": 2.2367245604161895e-06, + "loss": 1.1082, + "step": 25265 + }, + { + "epoch": 2.153413449245717, + "grad_norm": 91.02545740929749, + "learning_rate": 2.2363113294309793e-06, + "loss": 1.7277, + "step": 25266 + }, + { + "epoch": 2.1534986789397426, + "grad_norm": 56.039358371055684, + "learning_rate": 2.2358981256255465e-06, + "loss": 1.7695, + "step": 25267 + }, + { + "epoch": 2.153583908633768, + "grad_norm": 68.13566323161983, + "learning_rate": 2.2354849490039586e-06, + "loss": 1.8347, + "step": 25268 + }, + { + "epoch": 2.1536691383277935, + "grad_norm": 50.845059147224816, + "learning_rate": 2.2350717995702788e-06, + "loss": 1.2938, + "step": 25269 + }, + { + "epoch": 2.153754368021819, + "grad_norm": 31.59617330917828, + "learning_rate": 2.2346586773285665e-06, + "loss": 1.2635, + "step": 25270 + }, + { + "epoch": 2.1538395977158444, + "grad_norm": 35.968654976585526, + "learning_rate": 2.234245582282888e-06, + "loss": 1.2808, + "step": 25271 + }, + { + "epoch": 2.1539248274098695, + "grad_norm": 40.222222457095064, + "learning_rate": 2.2338325144373063e-06, + "loss": 1.9422, + "step": 25272 + }, + { + "epoch": 2.154010057103895, + "grad_norm": 35.51388433931673, + "learning_rate": 2.233419473795883e-06, + "loss": 1.1906, + "step": 25273 + }, + { + "epoch": 2.1540952867979204, + "grad_norm": 58.51832260616212, + "learning_rate": 2.23300646036268e-06, + "loss": 1.7598, + "step": 25274 + }, + { + "epoch": 2.154180516491946, + "grad_norm": 29.243943153416495, + "learning_rate": 2.232593474141759e-06, + "loss": 1.2711, + "step": 25275 + }, + { + "epoch": 2.1542657461859713, + "grad_norm": 24.089162749119172, + "learning_rate": 2.232180515137179e-06, + "loss": 0.7022, + "step": 25276 + }, + { + "epoch": 2.1543509758799964, + "grad_norm": 45.39814506876979, + "learning_rate": 2.2317675833530063e-06, + "loss": 1.1362, + "step": 25277 + }, + { + "epoch": 2.154436205574022, + "grad_norm": 33.88041741015332, + "learning_rate": 2.2313546787932972e-06, + "loss": 1.1919, + "step": 25278 + }, + { + "epoch": 2.1545214352680473, + "grad_norm": 36.1412748914385, + "learning_rate": 2.230941801462117e-06, + "loss": 1.7174, + "step": 25279 + }, + { + "epoch": 2.1546066649620728, + "grad_norm": 108.32555112638394, + "learning_rate": 2.2305289513635213e-06, + "loss": 2.6306, + "step": 25280 + }, + { + "epoch": 2.1546918946560982, + "grad_norm": 54.911680352479216, + "learning_rate": 2.2301161285015753e-06, + "loss": 2.1518, + "step": 25281 + }, + { + "epoch": 2.1547771243501237, + "grad_norm": 42.350749292837314, + "learning_rate": 2.229703332880337e-06, + "loss": 1.1173, + "step": 25282 + }, + { + "epoch": 2.154862354044149, + "grad_norm": 51.88175950173174, + "learning_rate": 2.229290564503865e-06, + "loss": 1.2608, + "step": 25283 + }, + { + "epoch": 2.154947583738174, + "grad_norm": 79.08556989473948, + "learning_rate": 2.228877823376217e-06, + "loss": 1.058, + "step": 25284 + }, + { + "epoch": 2.1550328134321997, + "grad_norm": 66.13380259570835, + "learning_rate": 2.228465109501457e-06, + "loss": 2.1806, + "step": 25285 + }, + { + "epoch": 2.155118043126225, + "grad_norm": 24.141761439911136, + "learning_rate": 2.228052422883639e-06, + "loss": 1.1114, + "step": 25286 + }, + { + "epoch": 2.1552032728202506, + "grad_norm": 41.23703218820035, + "learning_rate": 2.2276397635268265e-06, + "loss": 1.5196, + "step": 25287 + }, + { + "epoch": 2.155288502514276, + "grad_norm": 44.75636657110513, + "learning_rate": 2.227227131435073e-06, + "loss": 1.5854, + "step": 25288 + }, + { + "epoch": 2.1553737322083015, + "grad_norm": 55.41058599858421, + "learning_rate": 2.226814526612441e-06, + "loss": 1.7915, + "step": 25289 + }, + { + "epoch": 2.155458961902327, + "grad_norm": 118.54636752192914, + "learning_rate": 2.2264019490629864e-06, + "loss": 3.174, + "step": 25290 + }, + { + "epoch": 2.155544191596352, + "grad_norm": 47.47000401304955, + "learning_rate": 2.2259893987907656e-06, + "loss": 2.0074, + "step": 25291 + }, + { + "epoch": 2.1556294212903775, + "grad_norm": 37.94548689978467, + "learning_rate": 2.2255768757998375e-06, + "loss": 1.3668, + "step": 25292 + }, + { + "epoch": 2.155714650984403, + "grad_norm": 34.945683496750675, + "learning_rate": 2.2251643800942564e-06, + "loss": 1.0453, + "step": 25293 + }, + { + "epoch": 2.1557998806784284, + "grad_norm": 52.90450679754423, + "learning_rate": 2.224751911678081e-06, + "loss": 1.5261, + "step": 25294 + }, + { + "epoch": 2.155885110372454, + "grad_norm": 50.41838441755422, + "learning_rate": 2.2243394705553693e-06, + "loss": 1.8917, + "step": 25295 + }, + { + "epoch": 2.1559703400664794, + "grad_norm": 57.290611639645256, + "learning_rate": 2.223927056730176e-06, + "loss": 1.6453, + "step": 25296 + }, + { + "epoch": 2.1560555697605044, + "grad_norm": 21.055198150125968, + "learning_rate": 2.223514670206556e-06, + "loss": 0.6735, + "step": 25297 + }, + { + "epoch": 2.15614079945453, + "grad_norm": 59.59549131968415, + "learning_rate": 2.2231023109885667e-06, + "loss": 2.0006, + "step": 25298 + }, + { + "epoch": 2.1562260291485553, + "grad_norm": 52.04641188226362, + "learning_rate": 2.2226899790802632e-06, + "loss": 1.4355, + "step": 25299 + }, + { + "epoch": 2.156311258842581, + "grad_norm": 34.77189016299031, + "learning_rate": 2.2222776744857e-06, + "loss": 1.5238, + "step": 25300 + }, + { + "epoch": 2.1563964885366063, + "grad_norm": 58.63495171470432, + "learning_rate": 2.2218653972089304e-06, + "loss": 1.6681, + "step": 25301 + }, + { + "epoch": 2.1564817182306317, + "grad_norm": 62.98328417104664, + "learning_rate": 2.2214531472540106e-06, + "loss": 1.7858, + "step": 25302 + }, + { + "epoch": 2.1565669479246568, + "grad_norm": 56.80252079537915, + "learning_rate": 2.2210409246249975e-06, + "loss": 0.9868, + "step": 25303 + }, + { + "epoch": 2.156652177618682, + "grad_norm": 39.85201102118886, + "learning_rate": 2.220628729325942e-06, + "loss": 1.5015, + "step": 25304 + }, + { + "epoch": 2.1567374073127077, + "grad_norm": 50.610639790209085, + "learning_rate": 2.2202165613608985e-06, + "loss": 1.8827, + "step": 25305 + }, + { + "epoch": 2.156822637006733, + "grad_norm": 79.08578127989794, + "learning_rate": 2.219804420733919e-06, + "loss": 1.9234, + "step": 25306 + }, + { + "epoch": 2.1569078667007586, + "grad_norm": 59.28249998132794, + "learning_rate": 2.2193923074490596e-06, + "loss": 1.604, + "step": 25307 + }, + { + "epoch": 2.156993096394784, + "grad_norm": 44.98287033789302, + "learning_rate": 2.218980221510373e-06, + "loss": 1.8537, + "step": 25308 + }, + { + "epoch": 2.1570783260888096, + "grad_norm": 61.95851283228302, + "learning_rate": 2.218568162921908e-06, + "loss": 2.0944, + "step": 25309 + }, + { + "epoch": 2.1571635557828346, + "grad_norm": 38.352768127231634, + "learning_rate": 2.2181561316877204e-06, + "loss": 1.5133, + "step": 25310 + }, + { + "epoch": 2.15724878547686, + "grad_norm": 45.67547638213209, + "learning_rate": 2.2177441278118633e-06, + "loss": 1.7716, + "step": 25311 + }, + { + "epoch": 2.1573340151708855, + "grad_norm": 45.22790946762994, + "learning_rate": 2.2173321512983875e-06, + "loss": 1.442, + "step": 25312 + }, + { + "epoch": 2.157419244864911, + "grad_norm": 21.36409077252788, + "learning_rate": 2.216920202151344e-06, + "loss": 1.1687, + "step": 25313 + }, + { + "epoch": 2.1575044745589365, + "grad_norm": 43.734943347073994, + "learning_rate": 2.2165082803747827e-06, + "loss": 1.4319, + "step": 25314 + }, + { + "epoch": 2.157589704252962, + "grad_norm": 65.12427125710367, + "learning_rate": 2.2160963859727586e-06, + "loss": 2.0897, + "step": 25315 + }, + { + "epoch": 2.157674933946987, + "grad_norm": 28.054455691275194, + "learning_rate": 2.21568451894932e-06, + "loss": 1.1025, + "step": 25316 + }, + { + "epoch": 2.1577601636410124, + "grad_norm": 38.32549247978632, + "learning_rate": 2.2152726793085155e-06, + "loss": 1.3271, + "step": 25317 + }, + { + "epoch": 2.157845393335038, + "grad_norm": 37.99280294125558, + "learning_rate": 2.2148608670544005e-06, + "loss": 1.0408, + "step": 25318 + }, + { + "epoch": 2.1579306230290634, + "grad_norm": 28.841730855109457, + "learning_rate": 2.2144490821910194e-06, + "loss": 1.206, + "step": 25319 + }, + { + "epoch": 2.158015852723089, + "grad_norm": 44.68157391493522, + "learning_rate": 2.2140373247224265e-06, + "loss": 1.4858, + "step": 25320 + }, + { + "epoch": 2.1581010824171143, + "grad_norm": 48.780360053587465, + "learning_rate": 2.2136255946526704e-06, + "loss": 2.0981, + "step": 25321 + }, + { + "epoch": 2.1581863121111393, + "grad_norm": 31.46688151795876, + "learning_rate": 2.213213891985799e-06, + "loss": 1.333, + "step": 25322 + }, + { + "epoch": 2.158271541805165, + "grad_norm": 40.602325782570034, + "learning_rate": 2.2128022167258596e-06, + "loss": 1.4803, + "step": 25323 + }, + { + "epoch": 2.1583567714991903, + "grad_norm": 37.155392583406766, + "learning_rate": 2.2123905688769047e-06, + "loss": 0.8936, + "step": 25324 + }, + { + "epoch": 2.1584420011932157, + "grad_norm": 50.80091833223745, + "learning_rate": 2.2119789484429788e-06, + "loss": 1.2939, + "step": 25325 + }, + { + "epoch": 2.158527230887241, + "grad_norm": 57.02268751314241, + "learning_rate": 2.211567355428134e-06, + "loss": 1.3705, + "step": 25326 + }, + { + "epoch": 2.1586124605812667, + "grad_norm": 23.558402396920005, + "learning_rate": 2.211155789836415e-06, + "loss": 1.1449, + "step": 25327 + }, + { + "epoch": 2.158697690275292, + "grad_norm": 83.85967177533287, + "learning_rate": 2.210744251671872e-06, + "loss": 1.6771, + "step": 25328 + }, + { + "epoch": 2.158782919969317, + "grad_norm": 28.1215571625898, + "learning_rate": 2.210332740938551e-06, + "loss": 1.0448, + "step": 25329 + }, + { + "epoch": 2.1588681496633426, + "grad_norm": 36.196051685340535, + "learning_rate": 2.2099212576404994e-06, + "loss": 1.0879, + "step": 25330 + }, + { + "epoch": 2.158953379357368, + "grad_norm": 37.32310154274945, + "learning_rate": 2.209509801781761e-06, + "loss": 1.353, + "step": 25331 + }, + { + "epoch": 2.1590386090513936, + "grad_norm": 41.82729998874973, + "learning_rate": 2.209098373366387e-06, + "loss": 1.3253, + "step": 25332 + }, + { + "epoch": 2.159123838745419, + "grad_norm": 44.824069842621455, + "learning_rate": 2.20868697239842e-06, + "loss": 1.3042, + "step": 25333 + }, + { + "epoch": 2.1592090684394445, + "grad_norm": 38.94070884468295, + "learning_rate": 2.208275598881909e-06, + "loss": 1.5314, + "step": 25334 + }, + { + "epoch": 2.1592942981334695, + "grad_norm": 58.78976573413326, + "learning_rate": 2.207864252820898e-06, + "loss": 1.6454, + "step": 25335 + }, + { + "epoch": 2.159379527827495, + "grad_norm": 61.415916139158156, + "learning_rate": 2.2074529342194313e-06, + "loss": 2.3066, + "step": 25336 + }, + { + "epoch": 2.1594647575215205, + "grad_norm": 68.89382600304356, + "learning_rate": 2.2070416430815568e-06, + "loss": 1.6481, + "step": 25337 + }, + { + "epoch": 2.159549987215546, + "grad_norm": 54.333913770656004, + "learning_rate": 2.2066303794113175e-06, + "loss": 1.2906, + "step": 25338 + }, + { + "epoch": 2.1596352169095714, + "grad_norm": 78.72152788479677, + "learning_rate": 2.206219143212759e-06, + "loss": 2.6471, + "step": 25339 + }, + { + "epoch": 2.159720446603597, + "grad_norm": 50.49122922262456, + "learning_rate": 2.205807934489923e-06, + "loss": 1.3612, + "step": 25340 + }, + { + "epoch": 2.159805676297622, + "grad_norm": 68.37131952031326, + "learning_rate": 2.2053967532468564e-06, + "loss": 2.6683, + "step": 25341 + }, + { + "epoch": 2.1598909059916473, + "grad_norm": 74.90703046051814, + "learning_rate": 2.204985599487603e-06, + "loss": 2.2873, + "step": 25342 + }, + { + "epoch": 2.159976135685673, + "grad_norm": 25.270375612530486, + "learning_rate": 2.204574473216206e-06, + "loss": 0.9158, + "step": 25343 + }, + { + "epoch": 2.1600613653796983, + "grad_norm": 40.43688704731249, + "learning_rate": 2.204163374436707e-06, + "loss": 0.9119, + "step": 25344 + }, + { + "epoch": 2.1601465950737238, + "grad_norm": 22.06094223701648, + "learning_rate": 2.203752303153151e-06, + "loss": 0.9724, + "step": 25345 + }, + { + "epoch": 2.160231824767749, + "grad_norm": 43.1288910460843, + "learning_rate": 2.203341259369581e-06, + "loss": 1.8058, + "step": 25346 + }, + { + "epoch": 2.1603170544617747, + "grad_norm": 41.970301549529864, + "learning_rate": 2.2029302430900377e-06, + "loss": 1.2136, + "step": 25347 + }, + { + "epoch": 2.1604022841557997, + "grad_norm": 58.91122894593979, + "learning_rate": 2.2025192543185626e-06, + "loss": 1.5705, + "step": 25348 + }, + { + "epoch": 2.160487513849825, + "grad_norm": 56.37090203221807, + "learning_rate": 2.202108293059199e-06, + "loss": 1.5672, + "step": 25349 + }, + { + "epoch": 2.1605727435438506, + "grad_norm": 55.89625519663909, + "learning_rate": 2.20169735931599e-06, + "loss": 1.6993, + "step": 25350 + }, + { + "epoch": 2.160657973237876, + "grad_norm": 77.31682414218906, + "learning_rate": 2.201286453092976e-06, + "loss": 1.7074, + "step": 25351 + }, + { + "epoch": 2.1607432029319016, + "grad_norm": 54.79837919952247, + "learning_rate": 2.200875574394198e-06, + "loss": 1.4319, + "step": 25352 + }, + { + "epoch": 2.160828432625927, + "grad_norm": 56.58018538564082, + "learning_rate": 2.2004647232236938e-06, + "loss": 1.0788, + "step": 25353 + }, + { + "epoch": 2.160913662319952, + "grad_norm": 33.32971805602923, + "learning_rate": 2.2000538995855086e-06, + "loss": 1.3105, + "step": 25354 + }, + { + "epoch": 2.1609988920139775, + "grad_norm": 26.785912968663123, + "learning_rate": 2.199643103483681e-06, + "loss": 0.6765, + "step": 25355 + }, + { + "epoch": 2.161084121708003, + "grad_norm": 84.09954760821535, + "learning_rate": 2.199232334922249e-06, + "loss": 1.3086, + "step": 25356 + }, + { + "epoch": 2.1611693514020285, + "grad_norm": 47.357709922636644, + "learning_rate": 2.1988215939052537e-06, + "loss": 1.6397, + "step": 25357 + }, + { + "epoch": 2.161254581096054, + "grad_norm": 111.85368235409075, + "learning_rate": 2.198410880436737e-06, + "loss": 1.2104, + "step": 25358 + }, + { + "epoch": 2.1613398107900794, + "grad_norm": 70.44867280644127, + "learning_rate": 2.1980001945207356e-06, + "loss": 1.8064, + "step": 25359 + }, + { + "epoch": 2.1614250404841044, + "grad_norm": 25.770440206815515, + "learning_rate": 2.19758953616129e-06, + "loss": 0.902, + "step": 25360 + }, + { + "epoch": 2.16151027017813, + "grad_norm": 40.66573687626452, + "learning_rate": 2.1971789053624347e-06, + "loss": 1.2435, + "step": 25361 + }, + { + "epoch": 2.1615954998721554, + "grad_norm": 24.729509324967704, + "learning_rate": 2.196768302128213e-06, + "loss": 0.8945, + "step": 25362 + }, + { + "epoch": 2.161680729566181, + "grad_norm": 55.43593251176335, + "learning_rate": 2.196357726462662e-06, + "loss": 1.6405, + "step": 25363 + }, + { + "epoch": 2.1617659592602063, + "grad_norm": 29.51669660969142, + "learning_rate": 2.1959471783698165e-06, + "loss": 1.1143, + "step": 25364 + }, + { + "epoch": 2.161851188954232, + "grad_norm": 73.58963688707996, + "learning_rate": 2.195536657853718e-06, + "loss": 2.168, + "step": 25365 + }, + { + "epoch": 2.1619364186482573, + "grad_norm": 33.89115290284093, + "learning_rate": 2.1951261649184e-06, + "loss": 1.1454, + "step": 25366 + }, + { + "epoch": 2.1620216483422823, + "grad_norm": 42.33197514567843, + "learning_rate": 2.194715699567904e-06, + "loss": 0.9689, + "step": 25367 + }, + { + "epoch": 2.1621068780363077, + "grad_norm": 47.76129654941262, + "learning_rate": 2.1943052618062642e-06, + "loss": 1.3507, + "step": 25368 + }, + { + "epoch": 2.162192107730333, + "grad_norm": 36.55190425312822, + "learning_rate": 2.1938948516375176e-06, + "loss": 1.029, + "step": 25369 + }, + { + "epoch": 2.1622773374243587, + "grad_norm": 69.39934871338319, + "learning_rate": 2.1934844690656977e-06, + "loss": 1.2611, + "step": 25370 + }, + { + "epoch": 2.162362567118384, + "grad_norm": 35.74740055104506, + "learning_rate": 2.1930741140948445e-06, + "loss": 1.2458, + "step": 25371 + }, + { + "epoch": 2.1624477968124096, + "grad_norm": 42.01885159775701, + "learning_rate": 2.1926637867289908e-06, + "loss": 1.371, + "step": 25372 + }, + { + "epoch": 2.1625330265064346, + "grad_norm": 29.504040031103937, + "learning_rate": 2.192253486972175e-06, + "loss": 1.0329, + "step": 25373 + }, + { + "epoch": 2.16261825620046, + "grad_norm": 46.688840264936566, + "learning_rate": 2.1918432148284285e-06, + "loss": 1.2682, + "step": 25374 + }, + { + "epoch": 2.1627034858944856, + "grad_norm": 30.108894548764795, + "learning_rate": 2.19143297030179e-06, + "loss": 1.1389, + "step": 25375 + }, + { + "epoch": 2.162788715588511, + "grad_norm": 33.85114439233238, + "learning_rate": 2.1910227533962917e-06, + "loss": 0.9147, + "step": 25376 + }, + { + "epoch": 2.1628739452825365, + "grad_norm": 38.0444636763971, + "learning_rate": 2.190612564115969e-06, + "loss": 1.6738, + "step": 25377 + }, + { + "epoch": 2.162959174976562, + "grad_norm": 47.29865305299712, + "learning_rate": 2.1902024024648553e-06, + "loss": 1.1637, + "step": 25378 + }, + { + "epoch": 2.163044404670587, + "grad_norm": 38.52021029243509, + "learning_rate": 2.189792268446983e-06, + "loss": 1.348, + "step": 25379 + }, + { + "epoch": 2.1631296343646125, + "grad_norm": 70.29969470273173, + "learning_rate": 2.1893821620663862e-06, + "loss": 2.1324, + "step": 25380 + }, + { + "epoch": 2.163214864058638, + "grad_norm": 27.50015234610317, + "learning_rate": 2.1889720833271014e-06, + "loss": 0.9454, + "step": 25381 + }, + { + "epoch": 2.1633000937526634, + "grad_norm": 60.61245548286012, + "learning_rate": 2.188562032233159e-06, + "loss": 2.0897, + "step": 25382 + }, + { + "epoch": 2.163385323446689, + "grad_norm": 41.99689977636456, + "learning_rate": 2.1881520087885903e-06, + "loss": 1.1265, + "step": 25383 + }, + { + "epoch": 2.1634705531407143, + "grad_norm": 42.60347969451462, + "learning_rate": 2.187742012997431e-06, + "loss": 1.4029, + "step": 25384 + }, + { + "epoch": 2.16355578283474, + "grad_norm": 33.09846958282232, + "learning_rate": 2.1873320448637114e-06, + "loss": 1.2875, + "step": 25385 + }, + { + "epoch": 2.163641012528765, + "grad_norm": 63.156583451687176, + "learning_rate": 2.1869221043914634e-06, + "loss": 1.9769, + "step": 25386 + }, + { + "epoch": 2.1637262422227903, + "grad_norm": 30.577508201748348, + "learning_rate": 2.186512191584717e-06, + "loss": 1.0598, + "step": 25387 + }, + { + "epoch": 2.1638114719168158, + "grad_norm": 57.01445345182296, + "learning_rate": 2.1861023064475055e-06, + "loss": 2.1899, + "step": 25388 + }, + { + "epoch": 2.1638967016108412, + "grad_norm": 31.051812651591643, + "learning_rate": 2.185692448983861e-06, + "loss": 0.9693, + "step": 25389 + }, + { + "epoch": 2.1639819313048667, + "grad_norm": 45.19448717402388, + "learning_rate": 2.1852826191978133e-06, + "loss": 1.7531, + "step": 25390 + }, + { + "epoch": 2.164067160998892, + "grad_norm": 65.39464064399131, + "learning_rate": 2.1848728170933926e-06, + "loss": 1.3628, + "step": 25391 + }, + { + "epoch": 2.164152390692917, + "grad_norm": 18.137083113765737, + "learning_rate": 2.184463042674627e-06, + "loss": 0.5499, + "step": 25392 + }, + { + "epoch": 2.1642376203869427, + "grad_norm": 65.36318070661079, + "learning_rate": 2.1840532959455506e-06, + "loss": 1.8113, + "step": 25393 + }, + { + "epoch": 2.164322850080968, + "grad_norm": 25.775812749719513, + "learning_rate": 2.1836435769101918e-06, + "loss": 1.3228, + "step": 25394 + }, + { + "epoch": 2.1644080797749936, + "grad_norm": 39.62452218436713, + "learning_rate": 2.1832338855725766e-06, + "loss": 1.2622, + "step": 25395 + }, + { + "epoch": 2.164493309469019, + "grad_norm": 35.178760585442056, + "learning_rate": 2.182824221936737e-06, + "loss": 1.5495, + "step": 25396 + }, + { + "epoch": 2.1645785391630445, + "grad_norm": 37.813631905000875, + "learning_rate": 2.1824145860067037e-06, + "loss": 1.5673, + "step": 25397 + }, + { + "epoch": 2.1646637688570696, + "grad_norm": 28.64367223963317, + "learning_rate": 2.1820049777865033e-06, + "loss": 1.2068, + "step": 25398 + }, + { + "epoch": 2.164748998551095, + "grad_norm": 31.739373800129666, + "learning_rate": 2.181595397280164e-06, + "loss": 1.2112, + "step": 25399 + }, + { + "epoch": 2.1648342282451205, + "grad_norm": 62.89530240272781, + "learning_rate": 2.1811858444917122e-06, + "loss": 1.8054, + "step": 25400 + }, + { + "epoch": 2.164919457939146, + "grad_norm": 41.913646623793404, + "learning_rate": 2.1807763194251797e-06, + "loss": 1.4631, + "step": 25401 + }, + { + "epoch": 2.1650046876331714, + "grad_norm": 102.13410517404026, + "learning_rate": 2.1803668220845915e-06, + "loss": 2.0203, + "step": 25402 + }, + { + "epoch": 2.165089917327197, + "grad_norm": 33.06684660303011, + "learning_rate": 2.1799573524739736e-06, + "loss": 1.1723, + "step": 25403 + }, + { + "epoch": 2.1651751470212224, + "grad_norm": 67.7446785620348, + "learning_rate": 2.1795479105973543e-06, + "loss": 2.2492, + "step": 25404 + }, + { + "epoch": 2.1652603767152474, + "grad_norm": 51.521971998056124, + "learning_rate": 2.1791384964587627e-06, + "loss": 1.5474, + "step": 25405 + }, + { + "epoch": 2.165345606409273, + "grad_norm": 42.012768832844095, + "learning_rate": 2.1787291100622234e-06, + "loss": 1.2746, + "step": 25406 + }, + { + "epoch": 2.1654308361032983, + "grad_norm": 116.45831667469403, + "learning_rate": 2.1783197514117615e-06, + "loss": 1.8278, + "step": 25407 + }, + { + "epoch": 2.165516065797324, + "grad_norm": 33.101734720275886, + "learning_rate": 2.177910420511404e-06, + "loss": 1.4577, + "step": 25408 + }, + { + "epoch": 2.1656012954913493, + "grad_norm": 40.45359905821517, + "learning_rate": 2.1775011173651737e-06, + "loss": 1.332, + "step": 25409 + }, + { + "epoch": 2.1656865251853747, + "grad_norm": 17.71772229728435, + "learning_rate": 2.177091841977101e-06, + "loss": 0.5291, + "step": 25410 + }, + { + "epoch": 2.1657717548794, + "grad_norm": 55.65109193009878, + "learning_rate": 2.176682594351206e-06, + "loss": 1.373, + "step": 25411 + }, + { + "epoch": 2.1658569845734252, + "grad_norm": 32.20637445971844, + "learning_rate": 2.1762733744915184e-06, + "loss": 1.2528, + "step": 25412 + }, + { + "epoch": 2.1659422142674507, + "grad_norm": 50.13764554094186, + "learning_rate": 2.175864182402058e-06, + "loss": 1.6559, + "step": 25413 + }, + { + "epoch": 2.166027443961476, + "grad_norm": 40.68566627661899, + "learning_rate": 2.175455018086852e-06, + "loss": 1.7891, + "step": 25414 + }, + { + "epoch": 2.1661126736555016, + "grad_norm": 22.396489214179095, + "learning_rate": 2.175045881549924e-06, + "loss": 1.01, + "step": 25415 + }, + { + "epoch": 2.166197903349527, + "grad_norm": 58.32213280761234, + "learning_rate": 2.1746367727952977e-06, + "loss": 1.4596, + "step": 25416 + }, + { + "epoch": 2.1662831330435526, + "grad_norm": 50.073469706982294, + "learning_rate": 2.1742276918269943e-06, + "loss": 1.1278, + "step": 25417 + }, + { + "epoch": 2.1663683627375776, + "grad_norm": 41.11948162185898, + "learning_rate": 2.17381863864904e-06, + "loss": 0.9364, + "step": 25418 + }, + { + "epoch": 2.166453592431603, + "grad_norm": 52.82408696499893, + "learning_rate": 2.173409613265455e-06, + "loss": 2.1633, + "step": 25419 + }, + { + "epoch": 2.1665388221256285, + "grad_norm": 28.047548600380203, + "learning_rate": 2.173000615680265e-06, + "loss": 0.9826, + "step": 25420 + }, + { + "epoch": 2.166624051819654, + "grad_norm": 36.95156613781873, + "learning_rate": 2.1725916458974905e-06, + "loss": 0.95, + "step": 25421 + }, + { + "epoch": 2.1667092815136795, + "grad_norm": 47.22092898028994, + "learning_rate": 2.172182703921152e-06, + "loss": 1.2998, + "step": 25422 + }, + { + "epoch": 2.166794511207705, + "grad_norm": 48.71749093518384, + "learning_rate": 2.1717737897552745e-06, + "loss": 1.2814, + "step": 25423 + }, + { + "epoch": 2.16687974090173, + "grad_norm": 46.56652742932204, + "learning_rate": 2.171364903403878e-06, + "loss": 1.5567, + "step": 25424 + }, + { + "epoch": 2.1669649705957554, + "grad_norm": 46.52049704536543, + "learning_rate": 2.170956044870984e-06, + "loss": 1.4487, + "step": 25425 + }, + { + "epoch": 2.167050200289781, + "grad_norm": 51.75469834460487, + "learning_rate": 2.1705472141606103e-06, + "loss": 1.5103, + "step": 25426 + }, + { + "epoch": 2.1671354299838064, + "grad_norm": 37.279310685670396, + "learning_rate": 2.1701384112767814e-06, + "loss": 1.5096, + "step": 25427 + }, + { + "epoch": 2.167220659677832, + "grad_norm": 64.56145001581474, + "learning_rate": 2.1697296362235185e-06, + "loss": 1.8442, + "step": 25428 + }, + { + "epoch": 2.1673058893718573, + "grad_norm": 51.91809629910499, + "learning_rate": 2.16932088900484e-06, + "loss": 1.533, + "step": 25429 + }, + { + "epoch": 2.1673911190658828, + "grad_norm": 54.15085106459599, + "learning_rate": 2.168912169624764e-06, + "loss": 1.6281, + "step": 25430 + }, + { + "epoch": 2.167476348759908, + "grad_norm": 46.73325677890172, + "learning_rate": 2.168503478087313e-06, + "loss": 1.7446, + "step": 25431 + }, + { + "epoch": 2.1675615784539333, + "grad_norm": 49.27387590737248, + "learning_rate": 2.168094814396506e-06, + "loss": 1.939, + "step": 25432 + }, + { + "epoch": 2.1676468081479587, + "grad_norm": 60.50321284841279, + "learning_rate": 2.167686178556361e-06, + "loss": 1.1456, + "step": 25433 + }, + { + "epoch": 2.167732037841984, + "grad_norm": 41.200689870460785, + "learning_rate": 2.167277570570895e-06, + "loss": 1.4146, + "step": 25434 + }, + { + "epoch": 2.1678172675360097, + "grad_norm": 26.400669925994634, + "learning_rate": 2.166868990444128e-06, + "loss": 0.858, + "step": 25435 + }, + { + "epoch": 2.167902497230035, + "grad_norm": 34.35748603776969, + "learning_rate": 2.1664604381800815e-06, + "loss": 1.074, + "step": 25436 + }, + { + "epoch": 2.16798772692406, + "grad_norm": 48.395917609693555, + "learning_rate": 2.1660519137827697e-06, + "loss": 1.3039, + "step": 25437 + }, + { + "epoch": 2.1680729566180856, + "grad_norm": 27.649169471879436, + "learning_rate": 2.1656434172562117e-06, + "loss": 1.4027, + "step": 25438 + }, + { + "epoch": 2.168158186312111, + "grad_norm": 63.54548517649834, + "learning_rate": 2.1652349486044226e-06, + "loss": 1.5655, + "step": 25439 + }, + { + "epoch": 2.1682434160061366, + "grad_norm": 62.260824621601316, + "learning_rate": 2.1648265078314235e-06, + "loss": 1.5444, + "step": 25440 + }, + { + "epoch": 2.168328645700162, + "grad_norm": 33.540499132636185, + "learning_rate": 2.1644180949412287e-06, + "loss": 1.1281, + "step": 25441 + }, + { + "epoch": 2.1684138753941875, + "grad_norm": 50.70994112685087, + "learning_rate": 2.164009709937853e-06, + "loss": 1.6547, + "step": 25442 + }, + { + "epoch": 2.1684991050882125, + "grad_norm": 59.10320789660421, + "learning_rate": 2.163601352825315e-06, + "loss": 1.9752, + "step": 25443 + }, + { + "epoch": 2.168584334782238, + "grad_norm": 67.14776667715583, + "learning_rate": 2.163193023607633e-06, + "loss": 1.3261, + "step": 25444 + }, + { + "epoch": 2.1686695644762635, + "grad_norm": 38.37479459241262, + "learning_rate": 2.1627847222888206e-06, + "loss": 1.2002, + "step": 25445 + }, + { + "epoch": 2.168754794170289, + "grad_norm": 44.5152478713377, + "learning_rate": 2.162376448872893e-06, + "loss": 1.823, + "step": 25446 + }, + { + "epoch": 2.1688400238643144, + "grad_norm": 34.05248472839417, + "learning_rate": 2.1619682033638627e-06, + "loss": 1.1661, + "step": 25447 + }, + { + "epoch": 2.16892525355834, + "grad_norm": 76.1145347731651, + "learning_rate": 2.1615599857657497e-06, + "loss": 1.1734, + "step": 25448 + }, + { + "epoch": 2.1690104832523653, + "grad_norm": 52.62470054901122, + "learning_rate": 2.161151796082567e-06, + "loss": 1.4981, + "step": 25449 + }, + { + "epoch": 2.1690957129463904, + "grad_norm": 41.2814963163953, + "learning_rate": 2.160743634318325e-06, + "loss": 1.3592, + "step": 25450 + }, + { + "epoch": 2.169180942640416, + "grad_norm": 70.58289529435226, + "learning_rate": 2.160335500477044e-06, + "loss": 1.7837, + "step": 25451 + }, + { + "epoch": 2.1692661723344413, + "grad_norm": 64.38888432584858, + "learning_rate": 2.1599273945627334e-06, + "loss": 2.1091, + "step": 25452 + }, + { + "epoch": 2.1693514020284668, + "grad_norm": 33.40102865642122, + "learning_rate": 2.15951931657941e-06, + "loss": 1.2413, + "step": 25453 + }, + { + "epoch": 2.1694366317224922, + "grad_norm": 52.2622812079955, + "learning_rate": 2.1591112665310852e-06, + "loss": 1.3959, + "step": 25454 + }, + { + "epoch": 2.1695218614165177, + "grad_norm": 72.01114496219851, + "learning_rate": 2.158703244421772e-06, + "loss": 2.4235, + "step": 25455 + }, + { + "epoch": 2.1696070911105427, + "grad_norm": 49.60422774930711, + "learning_rate": 2.158295250255482e-06, + "loss": 0.8885, + "step": 25456 + }, + { + "epoch": 2.169692320804568, + "grad_norm": 54.08804168355249, + "learning_rate": 2.1578872840362307e-06, + "loss": 1.9896, + "step": 25457 + }, + { + "epoch": 2.1697775504985937, + "grad_norm": 41.27900879212351, + "learning_rate": 2.1574793457680264e-06, + "loss": 1.157, + "step": 25458 + }, + { + "epoch": 2.169862780192619, + "grad_norm": 53.40325318943564, + "learning_rate": 2.1570714354548862e-06, + "loss": 1.7623, + "step": 25459 + }, + { + "epoch": 2.1699480098866446, + "grad_norm": 28.901755086719852, + "learning_rate": 2.156663553100816e-06, + "loss": 1.3122, + "step": 25460 + }, + { + "epoch": 2.17003323958067, + "grad_norm": 44.16266531910799, + "learning_rate": 2.1562556987098314e-06, + "loss": 1.4764, + "step": 25461 + }, + { + "epoch": 2.170118469274695, + "grad_norm": 42.166445685429444, + "learning_rate": 2.155847872285943e-06, + "loss": 1.6417, + "step": 25462 + }, + { + "epoch": 2.1702036989687206, + "grad_norm": 40.25543367242774, + "learning_rate": 2.15544007383316e-06, + "loss": 1.559, + "step": 25463 + }, + { + "epoch": 2.170288928662746, + "grad_norm": 73.10166020126199, + "learning_rate": 2.155032303355494e-06, + "loss": 2.6797, + "step": 25464 + }, + { + "epoch": 2.1703741583567715, + "grad_norm": 26.624112279658693, + "learning_rate": 2.154624560856953e-06, + "loss": 1.1838, + "step": 25465 + }, + { + "epoch": 2.170459388050797, + "grad_norm": 141.91054070530103, + "learning_rate": 2.1542168463415482e-06, + "loss": 3.2282, + "step": 25466 + }, + { + "epoch": 2.1705446177448224, + "grad_norm": 34.6478005396777, + "learning_rate": 2.153809159813292e-06, + "loss": 1.4893, + "step": 25467 + }, + { + "epoch": 2.170629847438848, + "grad_norm": 42.007974024569755, + "learning_rate": 2.1534015012761914e-06, + "loss": 1.1995, + "step": 25468 + }, + { + "epoch": 2.170715077132873, + "grad_norm": 53.31593907991436, + "learning_rate": 2.1529938707342547e-06, + "loss": 1.3478, + "step": 25469 + }, + { + "epoch": 2.1708003068268984, + "grad_norm": 34.30418187142156, + "learning_rate": 2.1525862681914935e-06, + "loss": 1.1213, + "step": 25470 + }, + { + "epoch": 2.170885536520924, + "grad_norm": 59.86658717608159, + "learning_rate": 2.152178693651915e-06, + "loss": 1.7231, + "step": 25471 + }, + { + "epoch": 2.1709707662149493, + "grad_norm": 74.31980658544654, + "learning_rate": 2.1517711471195273e-06, + "loss": 2.0694, + "step": 25472 + }, + { + "epoch": 2.171055995908975, + "grad_norm": 18.13295992204555, + "learning_rate": 2.151363628598337e-06, + "loss": 0.7243, + "step": 25473 + }, + { + "epoch": 2.1711412256030003, + "grad_norm": 55.063604727255544, + "learning_rate": 2.1509561380923534e-06, + "loss": 1.4076, + "step": 25474 + }, + { + "epoch": 2.1712264552970253, + "grad_norm": 20.866015574309415, + "learning_rate": 2.150548675605586e-06, + "loss": 0.4597, + "step": 25475 + }, + { + "epoch": 2.1713116849910508, + "grad_norm": 65.43208117911968, + "learning_rate": 2.1501412411420398e-06, + "loss": 2.0743, + "step": 25476 + }, + { + "epoch": 2.1713969146850762, + "grad_norm": 64.08442495247321, + "learning_rate": 2.149733834705721e-06, + "loss": 1.9386, + "step": 25477 + }, + { + "epoch": 2.1714821443791017, + "grad_norm": 47.028826671047476, + "learning_rate": 2.1493264563006384e-06, + "loss": 1.4193, + "step": 25478 + }, + { + "epoch": 2.171567374073127, + "grad_norm": 59.85264327770478, + "learning_rate": 2.1489191059307973e-06, + "loss": 2.0647, + "step": 25479 + }, + { + "epoch": 2.1716526037671526, + "grad_norm": 55.53269520235118, + "learning_rate": 2.148511783600204e-06, + "loss": 2.0061, + "step": 25480 + }, + { + "epoch": 2.1717378334611777, + "grad_norm": 29.243350999635076, + "learning_rate": 2.148104489312863e-06, + "loss": 1.629, + "step": 25481 + }, + { + "epoch": 2.171823063155203, + "grad_norm": 79.77472765882239, + "learning_rate": 2.1476972230727804e-06, + "loss": 2.4415, + "step": 25482 + }, + { + "epoch": 2.1719082928492286, + "grad_norm": 50.35454993010755, + "learning_rate": 2.1472899848839647e-06, + "loss": 1.1552, + "step": 25483 + }, + { + "epoch": 2.171993522543254, + "grad_norm": 32.96831159061562, + "learning_rate": 2.146882774750418e-06, + "loss": 1.5183, + "step": 25484 + }, + { + "epoch": 2.1720787522372795, + "grad_norm": 56.56645544539848, + "learning_rate": 2.1464755926761454e-06, + "loss": 1.4941, + "step": 25485 + }, + { + "epoch": 2.172163981931305, + "grad_norm": 41.59062267708359, + "learning_rate": 2.1460684386651497e-06, + "loss": 1.1887, + "step": 25486 + }, + { + "epoch": 2.1722492116253305, + "grad_norm": 36.56767619938749, + "learning_rate": 2.1456613127214386e-06, + "loss": 1.0126, + "step": 25487 + }, + { + "epoch": 2.1723344413193555, + "grad_norm": 45.46423568228576, + "learning_rate": 2.1452542148490147e-06, + "loss": 0.9965, + "step": 25488 + }, + { + "epoch": 2.172419671013381, + "grad_norm": 40.08587314758867, + "learning_rate": 2.1448471450518795e-06, + "loss": 1.525, + "step": 25489 + }, + { + "epoch": 2.1725049007074064, + "grad_norm": 46.3265357099367, + "learning_rate": 2.1444401033340372e-06, + "loss": 2.1538, + "step": 25490 + }, + { + "epoch": 2.172590130401432, + "grad_norm": 48.56515750490397, + "learning_rate": 2.1440330896994944e-06, + "loss": 1.9447, + "step": 25491 + }, + { + "epoch": 2.1726753600954574, + "grad_norm": 41.57507625272799, + "learning_rate": 2.143626104152251e-06, + "loss": 1.2957, + "step": 25492 + }, + { + "epoch": 2.172760589789483, + "grad_norm": 25.863699678357698, + "learning_rate": 2.1432191466963105e-06, + "loss": 1.2258, + "step": 25493 + }, + { + "epoch": 2.172845819483508, + "grad_norm": 95.79830845077515, + "learning_rate": 2.1428122173356734e-06, + "loss": 1.7473, + "step": 25494 + }, + { + "epoch": 2.1729310491775333, + "grad_norm": 29.148827766138233, + "learning_rate": 2.1424053160743418e-06, + "loss": 1.2776, + "step": 25495 + }, + { + "epoch": 2.173016278871559, + "grad_norm": 32.044377409238976, + "learning_rate": 2.14199844291632e-06, + "loss": 0.9493, + "step": 25496 + }, + { + "epoch": 2.1731015085655843, + "grad_norm": 61.82302686704644, + "learning_rate": 2.141591597865606e-06, + "loss": 1.8643, + "step": 25497 + }, + { + "epoch": 2.1731867382596097, + "grad_norm": 33.30442345948343, + "learning_rate": 2.141184780926205e-06, + "loss": 1.3878, + "step": 25498 + }, + { + "epoch": 2.173271967953635, + "grad_norm": 76.01539659996781, + "learning_rate": 2.1407779921021133e-06, + "loss": 1.8313, + "step": 25499 + }, + { + "epoch": 2.17335719764766, + "grad_norm": 26.04792698525377, + "learning_rate": 2.140371231397336e-06, + "loss": 1.8265, + "step": 25500 + }, + { + "epoch": 2.1734424273416857, + "grad_norm": 50.65074936093748, + "learning_rate": 2.1399644988158715e-06, + "loss": 2.2463, + "step": 25501 + }, + { + "epoch": 2.173527657035711, + "grad_norm": 58.79120553393706, + "learning_rate": 2.1395577943617197e-06, + "loss": 1.6036, + "step": 25502 + }, + { + "epoch": 2.1736128867297366, + "grad_norm": 37.02162834463173, + "learning_rate": 2.139151118038878e-06, + "loss": 1.5072, + "step": 25503 + }, + { + "epoch": 2.173698116423762, + "grad_norm": 24.36981447357793, + "learning_rate": 2.1387444698513505e-06, + "loss": 0.5941, + "step": 25504 + }, + { + "epoch": 2.1737833461177876, + "grad_norm": 54.69044691642046, + "learning_rate": 2.138337849803133e-06, + "loss": 1.6417, + "step": 25505 + }, + { + "epoch": 2.173868575811813, + "grad_norm": 45.479993800623234, + "learning_rate": 2.1379312578982264e-06, + "loss": 2.0622, + "step": 25506 + }, + { + "epoch": 2.173953805505838, + "grad_norm": 24.194644685717243, + "learning_rate": 2.137524694140627e-06, + "loss": 0.7256, + "step": 25507 + }, + { + "epoch": 2.1740390351998635, + "grad_norm": 27.94246917866616, + "learning_rate": 2.1371181585343366e-06, + "loss": 0.5156, + "step": 25508 + }, + { + "epoch": 2.174124264893889, + "grad_norm": 67.06538471643712, + "learning_rate": 2.1367116510833517e-06, + "loss": 1.424, + "step": 25509 + }, + { + "epoch": 2.1742094945879145, + "grad_norm": 34.86426258135313, + "learning_rate": 2.13630517179167e-06, + "loss": 1.1779, + "step": 25510 + }, + { + "epoch": 2.17429472428194, + "grad_norm": 67.89828941808345, + "learning_rate": 2.1358987206632892e-06, + "loss": 2.0083, + "step": 25511 + }, + { + "epoch": 2.1743799539759654, + "grad_norm": 59.987397642388274, + "learning_rate": 2.1354922977022043e-06, + "loss": 1.9114, + "step": 25512 + }, + { + "epoch": 2.1744651836699904, + "grad_norm": 31.192642686105533, + "learning_rate": 2.135085902912414e-06, + "loss": 1.0377, + "step": 25513 + }, + { + "epoch": 2.174550413364016, + "grad_norm": 28.67273439250984, + "learning_rate": 2.134679536297918e-06, + "loss": 1.0249, + "step": 25514 + }, + { + "epoch": 2.1746356430580414, + "grad_norm": 23.213424187374255, + "learning_rate": 2.1342731978627095e-06, + "loss": 1.1061, + "step": 25515 + }, + { + "epoch": 2.174720872752067, + "grad_norm": 63.98158689649897, + "learning_rate": 2.1338668876107843e-06, + "loss": 1.8178, + "step": 25516 + }, + { + "epoch": 2.1748061024460923, + "grad_norm": 29.65976072534218, + "learning_rate": 2.133460605546141e-06, + "loss": 0.7495, + "step": 25517 + }, + { + "epoch": 2.1748913321401178, + "grad_norm": 19.720556030662625, + "learning_rate": 2.133054351672773e-06, + "loss": 0.8641, + "step": 25518 + }, + { + "epoch": 2.174976561834143, + "grad_norm": 50.99376165291151, + "learning_rate": 2.132648125994677e-06, + "loss": 2.1809, + "step": 25519 + }, + { + "epoch": 2.1750617915281683, + "grad_norm": 45.325845525769815, + "learning_rate": 2.1322419285158453e-06, + "loss": 1.3746, + "step": 25520 + }, + { + "epoch": 2.1751470212221937, + "grad_norm": 26.291334865005602, + "learning_rate": 2.1318357592402743e-06, + "loss": 1.2847, + "step": 25521 + }, + { + "epoch": 2.175232250916219, + "grad_norm": 23.229529144374233, + "learning_rate": 2.1314296181719605e-06, + "loss": 1.0108, + "step": 25522 + }, + { + "epoch": 2.1753174806102447, + "grad_norm": 42.17691072011893, + "learning_rate": 2.1310235053148977e-06, + "loss": 0.8671, + "step": 25523 + }, + { + "epoch": 2.17540271030427, + "grad_norm": 69.4338820593687, + "learning_rate": 2.130617420673078e-06, + "loss": 1.3767, + "step": 25524 + }, + { + "epoch": 2.1754879399982956, + "grad_norm": 47.83904768981346, + "learning_rate": 2.130211364250494e-06, + "loss": 2.047, + "step": 25525 + }, + { + "epoch": 2.1755731696923206, + "grad_norm": 67.90861696521574, + "learning_rate": 2.129805336051143e-06, + "loss": 2.0899, + "step": 25526 + }, + { + "epoch": 2.175658399386346, + "grad_norm": 57.688103759536716, + "learning_rate": 2.1293993360790155e-06, + "loss": 1.6665, + "step": 25527 + }, + { + "epoch": 2.1757436290803716, + "grad_norm": 28.616480809806937, + "learning_rate": 2.128993364338103e-06, + "loss": 0.8389, + "step": 25528 + }, + { + "epoch": 2.175828858774397, + "grad_norm": 47.24902136060869, + "learning_rate": 2.1285874208324004e-06, + "loss": 1.2472, + "step": 25529 + }, + { + "epoch": 2.1759140884684225, + "grad_norm": 43.267786109258545, + "learning_rate": 2.128181505565901e-06, + "loss": 0.8925, + "step": 25530 + }, + { + "epoch": 2.175999318162448, + "grad_norm": 75.6685405949597, + "learning_rate": 2.127775618542596e-06, + "loss": 1.5599, + "step": 25531 + }, + { + "epoch": 2.1760845478564734, + "grad_norm": 56.1644534950646, + "learning_rate": 2.1273697597664757e-06, + "loss": 1.2316, + "step": 25532 + }, + { + "epoch": 2.1761697775504985, + "grad_norm": 29.044995677453425, + "learning_rate": 2.1269639292415306e-06, + "loss": 1.621, + "step": 25533 + }, + { + "epoch": 2.176255007244524, + "grad_norm": 58.82482376033566, + "learning_rate": 2.1265581269717556e-06, + "loss": 1.8476, + "step": 25534 + }, + { + "epoch": 2.1763402369385494, + "grad_norm": 64.63955268808856, + "learning_rate": 2.12615235296114e-06, + "loss": 2.3208, + "step": 25535 + }, + { + "epoch": 2.176425466632575, + "grad_norm": 45.239876799872356, + "learning_rate": 2.1257466072136717e-06, + "loss": 2.0859, + "step": 25536 + }, + { + "epoch": 2.1765106963266003, + "grad_norm": 34.18962036741712, + "learning_rate": 2.1253408897333457e-06, + "loss": 1.0148, + "step": 25537 + }, + { + "epoch": 2.176595926020626, + "grad_norm": 52.0610656705354, + "learning_rate": 2.1249352005241473e-06, + "loss": 2.3033, + "step": 25538 + }, + { + "epoch": 2.176681155714651, + "grad_norm": 39.84459830685258, + "learning_rate": 2.124529539590071e-06, + "loss": 1.3636, + "step": 25539 + }, + { + "epoch": 2.1767663854086763, + "grad_norm": 47.63249357087974, + "learning_rate": 2.124123906935104e-06, + "loss": 1.2668, + "step": 25540 + }, + { + "epoch": 2.1768516151027018, + "grad_norm": 43.328166901186776, + "learning_rate": 2.1237183025632356e-06, + "loss": 1.557, + "step": 25541 + }, + { + "epoch": 2.176936844796727, + "grad_norm": 54.94243587441924, + "learning_rate": 2.123312726478453e-06, + "loss": 1.8104, + "step": 25542 + }, + { + "epoch": 2.1770220744907527, + "grad_norm": 65.86794932925406, + "learning_rate": 2.1229071786847487e-06, + "loss": 2.0911, + "step": 25543 + }, + { + "epoch": 2.177107304184778, + "grad_norm": 43.18994904029961, + "learning_rate": 2.122501659186107e-06, + "loss": 1.5326, + "step": 25544 + }, + { + "epoch": 2.177192533878803, + "grad_norm": 64.63653881961297, + "learning_rate": 2.12209616798652e-06, + "loss": 2.216, + "step": 25545 + }, + { + "epoch": 2.1772777635728287, + "grad_norm": 47.39717232972086, + "learning_rate": 2.1216907050899714e-06, + "loss": 1.6378, + "step": 25546 + }, + { + "epoch": 2.177362993266854, + "grad_norm": 36.25312176933943, + "learning_rate": 2.1212852705004534e-06, + "loss": 0.8634, + "step": 25547 + }, + { + "epoch": 2.1774482229608796, + "grad_norm": 40.07582553188703, + "learning_rate": 2.1208798642219503e-06, + "loss": 1.2203, + "step": 25548 + }, + { + "epoch": 2.177533452654905, + "grad_norm": 35.50653166289421, + "learning_rate": 2.1204744862584503e-06, + "loss": 1.2275, + "step": 25549 + }, + { + "epoch": 2.1776186823489305, + "grad_norm": 34.238594451840186, + "learning_rate": 2.1200691366139368e-06, + "loss": 1.0749, + "step": 25550 + }, + { + "epoch": 2.177703912042956, + "grad_norm": 38.55184779503128, + "learning_rate": 2.119663815292402e-06, + "loss": 1.6256, + "step": 25551 + }, + { + "epoch": 2.177789141736981, + "grad_norm": 55.35904259506775, + "learning_rate": 2.1192585222978264e-06, + "loss": 1.8385, + "step": 25552 + }, + { + "epoch": 2.1778743714310065, + "grad_norm": 73.91484260565849, + "learning_rate": 2.118853257634201e-06, + "loss": 1.7948, + "step": 25553 + }, + { + "epoch": 2.177959601125032, + "grad_norm": 39.20837827486567, + "learning_rate": 2.118448021305508e-06, + "loss": 1.1246, + "step": 25554 + }, + { + "epoch": 2.1780448308190574, + "grad_norm": 26.46385850669345, + "learning_rate": 2.118042813315733e-06, + "loss": 1.2296, + "step": 25555 + }, + { + "epoch": 2.178130060513083, + "grad_norm": 50.49317449892575, + "learning_rate": 2.1176376336688624e-06, + "loss": 2.1783, + "step": 25556 + }, + { + "epoch": 2.1782152902071084, + "grad_norm": 50.97857413438227, + "learning_rate": 2.1172324823688813e-06, + "loss": 1.4393, + "step": 25557 + }, + { + "epoch": 2.1783005199011334, + "grad_norm": 52.227548495793904, + "learning_rate": 2.1168273594197726e-06, + "loss": 1.6803, + "step": 25558 + }, + { + "epoch": 2.178385749595159, + "grad_norm": 64.61381455663728, + "learning_rate": 2.1164222648255196e-06, + "loss": 2.1648, + "step": 25559 + }, + { + "epoch": 2.1784709792891843, + "grad_norm": 29.491544085997646, + "learning_rate": 2.116017198590107e-06, + "loss": 1.444, + "step": 25560 + }, + { + "epoch": 2.17855620898321, + "grad_norm": 6.293755716650682, + "learning_rate": 2.1156121607175222e-06, + "loss": 0.2301, + "step": 25561 + }, + { + "epoch": 2.1786414386772353, + "grad_norm": 43.463463114724895, + "learning_rate": 2.1152071512117454e-06, + "loss": 1.5779, + "step": 25562 + }, + { + "epoch": 2.1787266683712607, + "grad_norm": 70.65648700959672, + "learning_rate": 2.114802170076758e-06, + "loss": 2.1176, + "step": 25563 + }, + { + "epoch": 2.1788118980652857, + "grad_norm": 64.39690386648631, + "learning_rate": 2.114397217316547e-06, + "loss": 1.5395, + "step": 25564 + }, + { + "epoch": 2.178897127759311, + "grad_norm": 27.61715764330251, + "learning_rate": 2.113992292935092e-06, + "loss": 0.8709, + "step": 25565 + }, + { + "epoch": 2.1789823574533367, + "grad_norm": 42.922139931793936, + "learning_rate": 2.1135873969363767e-06, + "loss": 1.1604, + "step": 25566 + }, + { + "epoch": 2.179067587147362, + "grad_norm": 69.86716521119202, + "learning_rate": 2.11318252932438e-06, + "loss": 1.6379, + "step": 25567 + }, + { + "epoch": 2.1791528168413876, + "grad_norm": 55.73940820061791, + "learning_rate": 2.1127776901030865e-06, + "loss": 1.9526, + "step": 25568 + }, + { + "epoch": 2.179238046535413, + "grad_norm": 39.16248049736488, + "learning_rate": 2.1123728792764786e-06, + "loss": 1.5038, + "step": 25569 + }, + { + "epoch": 2.1793232762294386, + "grad_norm": 136.41723466521154, + "learning_rate": 2.111968096848536e-06, + "loss": 1.8726, + "step": 25570 + }, + { + "epoch": 2.1794085059234636, + "grad_norm": 37.87678677597396, + "learning_rate": 2.1115633428232403e-06, + "loss": 1.3535, + "step": 25571 + }, + { + "epoch": 2.179493735617489, + "grad_norm": 44.56409977574755, + "learning_rate": 2.111158617204569e-06, + "loss": 0.9787, + "step": 25572 + }, + { + "epoch": 2.1795789653115145, + "grad_norm": 51.21051984473942, + "learning_rate": 2.1107539199965067e-06, + "loss": 1.9278, + "step": 25573 + }, + { + "epoch": 2.17966419500554, + "grad_norm": 35.419755547716875, + "learning_rate": 2.1103492512030317e-06, + "loss": 1.2099, + "step": 25574 + }, + { + "epoch": 2.1797494246995655, + "grad_norm": 35.09745756282499, + "learning_rate": 2.1099446108281214e-06, + "loss": 0.9215, + "step": 25575 + }, + { + "epoch": 2.179834654393591, + "grad_norm": 65.7153738257695, + "learning_rate": 2.1095399988757574e-06, + "loss": 1.2922, + "step": 25576 + }, + { + "epoch": 2.179919884087616, + "grad_norm": 62.18046108666404, + "learning_rate": 2.1091354153499213e-06, + "loss": 1.3781, + "step": 25577 + }, + { + "epoch": 2.1800051137816414, + "grad_norm": 50.50365711926032, + "learning_rate": 2.1087308602545884e-06, + "loss": 1.7135, + "step": 25578 + }, + { + "epoch": 2.180090343475667, + "grad_norm": 36.871246845685945, + "learning_rate": 2.10832633359374e-06, + "loss": 1.3765, + "step": 25579 + }, + { + "epoch": 2.1801755731696923, + "grad_norm": 75.883120214478, + "learning_rate": 2.107921835371351e-06, + "loss": 2.2218, + "step": 25580 + }, + { + "epoch": 2.180260802863718, + "grad_norm": 19.240263619559155, + "learning_rate": 2.1075173655914026e-06, + "loss": 0.7441, + "step": 25581 + }, + { + "epoch": 2.1803460325577433, + "grad_norm": 103.85164913182588, + "learning_rate": 2.1071129242578724e-06, + "loss": 1.9796, + "step": 25582 + }, + { + "epoch": 2.1804312622517683, + "grad_norm": 42.300298350552545, + "learning_rate": 2.1067085113747347e-06, + "loss": 1.6522, + "step": 25583 + }, + { + "epoch": 2.1805164919457938, + "grad_norm": 62.63218930306959, + "learning_rate": 2.1063041269459715e-06, + "loss": 1.3302, + "step": 25584 + }, + { + "epoch": 2.1806017216398192, + "grad_norm": 56.33172828207676, + "learning_rate": 2.105899770975555e-06, + "loss": 1.0403, + "step": 25585 + }, + { + "epoch": 2.1806869513338447, + "grad_norm": 39.22174548375206, + "learning_rate": 2.105495443467466e-06, + "loss": 0.7515, + "step": 25586 + }, + { + "epoch": 2.18077218102787, + "grad_norm": 47.794236582019415, + "learning_rate": 2.10509114442568e-06, + "loss": 1.5284, + "step": 25587 + }, + { + "epoch": 2.1808574107218957, + "grad_norm": 35.50414602851657, + "learning_rate": 2.1046868738541714e-06, + "loss": 1.3019, + "step": 25588 + }, + { + "epoch": 2.180942640415921, + "grad_norm": 61.42853833108465, + "learning_rate": 2.1042826317569155e-06, + "loss": 1.4873, + "step": 25589 + }, + { + "epoch": 2.181027870109946, + "grad_norm": 59.80755270470689, + "learning_rate": 2.103878418137891e-06, + "loss": 1.8397, + "step": 25590 + }, + { + "epoch": 2.1811130998039716, + "grad_norm": 46.11091833784481, + "learning_rate": 2.10347423300107e-06, + "loss": 1.074, + "step": 25591 + }, + { + "epoch": 2.181198329497997, + "grad_norm": 20.641472709922382, + "learning_rate": 2.1030700763504304e-06, + "loss": 0.6533, + "step": 25592 + }, + { + "epoch": 2.1812835591920225, + "grad_norm": 82.47845477322839, + "learning_rate": 2.1026659481899437e-06, + "loss": 2.6245, + "step": 25593 + }, + { + "epoch": 2.181368788886048, + "grad_norm": 58.585503354814634, + "learning_rate": 2.1022618485235873e-06, + "loss": 1.6894, + "step": 25594 + }, + { + "epoch": 2.1814540185800735, + "grad_norm": 35.76646092971362, + "learning_rate": 2.101857777355335e-06, + "loss": 1.3248, + "step": 25595 + }, + { + "epoch": 2.1815392482740985, + "grad_norm": 49.41679205114938, + "learning_rate": 2.1014537346891594e-06, + "loss": 1.5377, + "step": 25596 + }, + { + "epoch": 2.181624477968124, + "grad_norm": 43.043500205800214, + "learning_rate": 2.1010497205290346e-06, + "loss": 1.5329, + "step": 25597 + }, + { + "epoch": 2.1817097076621494, + "grad_norm": 47.03356995727037, + "learning_rate": 2.1006457348789326e-06, + "loss": 1.9473, + "step": 25598 + }, + { + "epoch": 2.181794937356175, + "grad_norm": 41.940415152526754, + "learning_rate": 2.1002417777428276e-06, + "loss": 1.3635, + "step": 25599 + }, + { + "epoch": 2.1818801670502004, + "grad_norm": 19.08464911496764, + "learning_rate": 2.099837849124694e-06, + "loss": 0.8012, + "step": 25600 + }, + { + "epoch": 2.181965396744226, + "grad_norm": 56.78991504112857, + "learning_rate": 2.0994339490285026e-06, + "loss": 2.0891, + "step": 25601 + }, + { + "epoch": 2.182050626438251, + "grad_norm": 81.74726556567605, + "learning_rate": 2.099030077458224e-06, + "loss": 1.7362, + "step": 25602 + }, + { + "epoch": 2.1821358561322763, + "grad_norm": 39.439364126485515, + "learning_rate": 2.098626234417834e-06, + "loss": 1.353, + "step": 25603 + }, + { + "epoch": 2.182221085826302, + "grad_norm": 38.1590910808112, + "learning_rate": 2.0982224199113018e-06, + "loss": 1.3688, + "step": 25604 + }, + { + "epoch": 2.1823063155203273, + "grad_norm": 30.209537632381554, + "learning_rate": 2.097818633942599e-06, + "loss": 1.1527, + "step": 25605 + }, + { + "epoch": 2.1823915452143527, + "grad_norm": 109.92599221497363, + "learning_rate": 2.097414876515695e-06, + "loss": 1.846, + "step": 25606 + }, + { + "epoch": 2.182476774908378, + "grad_norm": 40.42795192997868, + "learning_rate": 2.0970111476345622e-06, + "loss": 1.6575, + "step": 25607 + }, + { + "epoch": 2.1825620046024037, + "grad_norm": 37.83967840655869, + "learning_rate": 2.096607447303173e-06, + "loss": 1.2636, + "step": 25608 + }, + { + "epoch": 2.1826472342964287, + "grad_norm": 43.14732090184973, + "learning_rate": 2.096203775525496e-06, + "loss": 1.0995, + "step": 25609 + }, + { + "epoch": 2.182732463990454, + "grad_norm": 37.442565300733875, + "learning_rate": 2.0958001323054993e-06, + "loss": 1.6186, + "step": 25610 + }, + { + "epoch": 2.1828176936844796, + "grad_norm": 37.509478944830796, + "learning_rate": 2.095396517647156e-06, + "loss": 1.3811, + "step": 25611 + }, + { + "epoch": 2.182902923378505, + "grad_norm": 47.628248075530074, + "learning_rate": 2.0949929315544336e-06, + "loss": 1.5379, + "step": 25612 + }, + { + "epoch": 2.1829881530725306, + "grad_norm": 55.8953856729598, + "learning_rate": 2.0945893740313017e-06, + "loss": 1.8616, + "step": 25613 + }, + { + "epoch": 2.183073382766556, + "grad_norm": 32.75331314120594, + "learning_rate": 2.0941858450817264e-06, + "loss": 0.7238, + "step": 25614 + }, + { + "epoch": 2.183158612460581, + "grad_norm": 45.922518493008134, + "learning_rate": 2.0937823447096794e-06, + "loss": 0.9466, + "step": 25615 + }, + { + "epoch": 2.1832438421546065, + "grad_norm": 26.21715456202369, + "learning_rate": 2.0933788729191296e-06, + "loss": 1.0751, + "step": 25616 + }, + { + "epoch": 2.183329071848632, + "grad_norm": 26.56295082767564, + "learning_rate": 2.0929754297140444e-06, + "loss": 0.7263, + "step": 25617 + }, + { + "epoch": 2.1834143015426575, + "grad_norm": 58.399141515708905, + "learning_rate": 2.0925720150983896e-06, + "loss": 2.0294, + "step": 25618 + }, + { + "epoch": 2.183499531236683, + "grad_norm": 45.66763842131528, + "learning_rate": 2.0921686290761323e-06, + "loss": 2.1362, + "step": 25619 + }, + { + "epoch": 2.1835847609307084, + "grad_norm": 33.653559053964266, + "learning_rate": 2.0917652716512433e-06, + "loss": 1.0387, + "step": 25620 + }, + { + "epoch": 2.1836699906247334, + "grad_norm": 23.935001884428303, + "learning_rate": 2.0913619428276865e-06, + "loss": 0.9701, + "step": 25621 + }, + { + "epoch": 2.183755220318759, + "grad_norm": 46.664456922956006, + "learning_rate": 2.090958642609428e-06, + "loss": 1.3866, + "step": 25622 + }, + { + "epoch": 2.1838404500127844, + "grad_norm": 57.110960442634585, + "learning_rate": 2.090555371000435e-06, + "loss": 2.1989, + "step": 25623 + }, + { + "epoch": 2.18392567970681, + "grad_norm": 37.25689622316755, + "learning_rate": 2.090152128004676e-06, + "loss": 1.1508, + "step": 25624 + }, + { + "epoch": 2.1840109094008353, + "grad_norm": 51.89091448203377, + "learning_rate": 2.089748913626114e-06, + "loss": 1.79, + "step": 25625 + }, + { + "epoch": 2.1840961390948608, + "grad_norm": 55.315152648774095, + "learning_rate": 2.0893457278687157e-06, + "loss": 1.4081, + "step": 25626 + }, + { + "epoch": 2.1841813687888862, + "grad_norm": 74.19800822015736, + "learning_rate": 2.088942570736445e-06, + "loss": 1.893, + "step": 25627 + }, + { + "epoch": 2.1842665984829113, + "grad_norm": 70.92096685715491, + "learning_rate": 2.0885394422332654e-06, + "loss": 2.1806, + "step": 25628 + }, + { + "epoch": 2.1843518281769367, + "grad_norm": 43.4529068718658, + "learning_rate": 2.088136342363146e-06, + "loss": 1.8744, + "step": 25629 + }, + { + "epoch": 2.184437057870962, + "grad_norm": 50.16089473962495, + "learning_rate": 2.0877332711300463e-06, + "loss": 1.5567, + "step": 25630 + }, + { + "epoch": 2.1845222875649877, + "grad_norm": 51.64473034665361, + "learning_rate": 2.087330228537935e-06, + "loss": 1.9892, + "step": 25631 + }, + { + "epoch": 2.184607517259013, + "grad_norm": 48.907142009453466, + "learning_rate": 2.086927214590771e-06, + "loss": 1.1644, + "step": 25632 + }, + { + "epoch": 2.1846927469530386, + "grad_norm": 64.56159480991603, + "learning_rate": 2.0865242292925224e-06, + "loss": 1.5513, + "step": 25633 + }, + { + "epoch": 2.1847779766470636, + "grad_norm": 20.08430640101625, + "learning_rate": 2.086121272647151e-06, + "loss": 0.9617, + "step": 25634 + }, + { + "epoch": 2.184863206341089, + "grad_norm": 72.49003525557093, + "learning_rate": 2.0857183446586175e-06, + "loss": 1.9935, + "step": 25635 + }, + { + "epoch": 2.1849484360351146, + "grad_norm": 40.61898363033352, + "learning_rate": 2.0853154453308856e-06, + "loss": 0.9874, + "step": 25636 + }, + { + "epoch": 2.18503366572914, + "grad_norm": 22.373453116347175, + "learning_rate": 2.0849125746679196e-06, + "loss": 0.7308, + "step": 25637 + }, + { + "epoch": 2.1851188954231655, + "grad_norm": 31.806421341062478, + "learning_rate": 2.0845097326736778e-06, + "loss": 1.2806, + "step": 25638 + }, + { + "epoch": 2.185204125117191, + "grad_norm": 75.55266643101442, + "learning_rate": 2.0841069193521267e-06, + "loss": 1.4427, + "step": 25639 + }, + { + "epoch": 2.185289354811216, + "grad_norm": 54.115926310309206, + "learning_rate": 2.083704134707225e-06, + "loss": 2.0983, + "step": 25640 + }, + { + "epoch": 2.1853745845052415, + "grad_norm": 39.16432919934066, + "learning_rate": 2.0833013787429323e-06, + "loss": 0.9126, + "step": 25641 + }, + { + "epoch": 2.185459814199267, + "grad_norm": 24.664594594639443, + "learning_rate": 2.082898651463214e-06, + "loss": 0.6939, + "step": 25642 + }, + { + "epoch": 2.1855450438932924, + "grad_norm": 43.593476316925, + "learning_rate": 2.0824959528720277e-06, + "loss": 1.8235, + "step": 25643 + }, + { + "epoch": 2.185630273587318, + "grad_norm": 38.36385815397086, + "learning_rate": 2.082093282973335e-06, + "loss": 1.271, + "step": 25644 + }, + { + "epoch": 2.1857155032813433, + "grad_norm": 75.17631064845472, + "learning_rate": 2.081690641771093e-06, + "loss": 2.0116, + "step": 25645 + }, + { + "epoch": 2.185800732975369, + "grad_norm": 48.99649355022185, + "learning_rate": 2.0812880292692645e-06, + "loss": 1.9116, + "step": 25646 + }, + { + "epoch": 2.185885962669394, + "grad_norm": 29.39846924252373, + "learning_rate": 2.0808854454718095e-06, + "loss": 1.0158, + "step": 25647 + }, + { + "epoch": 2.1859711923634193, + "grad_norm": 26.882000962436734, + "learning_rate": 2.0804828903826867e-06, + "loss": 1.523, + "step": 25648 + }, + { + "epoch": 2.1860564220574448, + "grad_norm": 56.74714747843057, + "learning_rate": 2.080080364005853e-06, + "loss": 1.6345, + "step": 25649 + }, + { + "epoch": 2.1861416517514702, + "grad_norm": 74.64804787373441, + "learning_rate": 2.0796778663452707e-06, + "loss": 1.9336, + "step": 25650 + }, + { + "epoch": 2.1862268814454957, + "grad_norm": 45.80646799599082, + "learning_rate": 2.079275397404895e-06, + "loss": 1.2, + "step": 25651 + }, + { + "epoch": 2.186312111139521, + "grad_norm": 52.35833219870386, + "learning_rate": 2.078872957188686e-06, + "loss": 1.369, + "step": 25652 + }, + { + "epoch": 2.1863973408335466, + "grad_norm": 48.25962927293198, + "learning_rate": 2.0784705457005986e-06, + "loss": 1.0594, + "step": 25653 + }, + { + "epoch": 2.1864825705275717, + "grad_norm": 31.05125904156087, + "learning_rate": 2.0780681629445933e-06, + "loss": 1.0433, + "step": 25654 + }, + { + "epoch": 2.186567800221597, + "grad_norm": 36.543649095926284, + "learning_rate": 2.077665808924627e-06, + "loss": 1.4858, + "step": 25655 + }, + { + "epoch": 2.1866530299156226, + "grad_norm": 47.88547359619465, + "learning_rate": 2.077263483644657e-06, + "loss": 1.1548, + "step": 25656 + }, + { + "epoch": 2.186738259609648, + "grad_norm": 37.351454911823055, + "learning_rate": 2.07686118710864e-06, + "loss": 0.9939, + "step": 25657 + }, + { + "epoch": 2.1868234893036735, + "grad_norm": 80.14851115176228, + "learning_rate": 2.076458919320529e-06, + "loss": 2.556, + "step": 25658 + }, + { + "epoch": 2.1869087189976986, + "grad_norm": 47.73779872123407, + "learning_rate": 2.076056680284284e-06, + "loss": 1.7315, + "step": 25659 + }, + { + "epoch": 2.186993948691724, + "grad_norm": 46.21720012020737, + "learning_rate": 2.0756544700038605e-06, + "loss": 1.2233, + "step": 25660 + }, + { + "epoch": 2.1870791783857495, + "grad_norm": 62.8556563537323, + "learning_rate": 2.0752522884832115e-06, + "loss": 1.865, + "step": 25661 + }, + { + "epoch": 2.187164408079775, + "grad_norm": 52.40202981153146, + "learning_rate": 2.074850135726294e-06, + "loss": 1.426, + "step": 25662 + }, + { + "epoch": 2.1872496377738004, + "grad_norm": 22.655098575253568, + "learning_rate": 2.0744480117370645e-06, + "loss": 0.9912, + "step": 25663 + }, + { + "epoch": 2.187334867467826, + "grad_norm": 44.908507188369924, + "learning_rate": 2.074045916519477e-06, + "loss": 1.6941, + "step": 25664 + }, + { + "epoch": 2.1874200971618514, + "grad_norm": 65.33201943367003, + "learning_rate": 2.073643850077485e-06, + "loss": 1.4641, + "step": 25665 + }, + { + "epoch": 2.1875053268558764, + "grad_norm": 67.14886453509224, + "learning_rate": 2.0732418124150406e-06, + "loss": 1.6979, + "step": 25666 + }, + { + "epoch": 2.187590556549902, + "grad_norm": 36.655722151755455, + "learning_rate": 2.0728398035361023e-06, + "loss": 1.0182, + "step": 25667 + }, + { + "epoch": 2.1876757862439273, + "grad_norm": 29.518253413956607, + "learning_rate": 2.0724378234446215e-06, + "loss": 1.4318, + "step": 25668 + }, + { + "epoch": 2.187761015937953, + "grad_norm": 44.03367498995062, + "learning_rate": 2.0720358721445486e-06, + "loss": 1.6061, + "step": 25669 + }, + { + "epoch": 2.1878462456319783, + "grad_norm": 41.29043016319549, + "learning_rate": 2.0716339496398423e-06, + "loss": 1.3746, + "step": 25670 + }, + { + "epoch": 2.1879314753260037, + "grad_norm": 24.94860124115688, + "learning_rate": 2.0712320559344507e-06, + "loss": 0.7783, + "step": 25671 + }, + { + "epoch": 2.188016705020029, + "grad_norm": 71.49162049297615, + "learning_rate": 2.070830191032329e-06, + "loss": 1.2932, + "step": 25672 + }, + { + "epoch": 2.1881019347140542, + "grad_norm": 43.42495504028684, + "learning_rate": 2.0704283549374294e-06, + "loss": 0.9789, + "step": 25673 + }, + { + "epoch": 2.1881871644080797, + "grad_norm": 37.23315762477336, + "learning_rate": 2.0700265476537023e-06, + "loss": 0.7535, + "step": 25674 + }, + { + "epoch": 2.188272394102105, + "grad_norm": 26.32743780348751, + "learning_rate": 2.0696247691850983e-06, + "loss": 1.2095, + "step": 25675 + }, + { + "epoch": 2.1883576237961306, + "grad_norm": 41.14391528706254, + "learning_rate": 2.0692230195355715e-06, + "loss": 1.1641, + "step": 25676 + }, + { + "epoch": 2.188442853490156, + "grad_norm": 23.544940767132186, + "learning_rate": 2.0688212987090707e-06, + "loss": 1.0191, + "step": 25677 + }, + { + "epoch": 2.1885280831841816, + "grad_norm": 18.22238925419265, + "learning_rate": 2.068419606709549e-06, + "loss": 1.03, + "step": 25678 + }, + { + "epoch": 2.1886133128782066, + "grad_norm": 34.198187126685845, + "learning_rate": 2.068017943540954e-06, + "loss": 1.2675, + "step": 25679 + }, + { + "epoch": 2.188698542572232, + "grad_norm": 54.34726598998393, + "learning_rate": 2.0676163092072394e-06, + "loss": 1.6753, + "step": 25680 + }, + { + "epoch": 2.1887837722662575, + "grad_norm": 62.08995671902603, + "learning_rate": 2.067214703712353e-06, + "loss": 1.724, + "step": 25681 + }, + { + "epoch": 2.188869001960283, + "grad_norm": 84.96294004016116, + "learning_rate": 2.066813127060244e-06, + "loss": 2.678, + "step": 25682 + }, + { + "epoch": 2.1889542316543085, + "grad_norm": 30.495542248665753, + "learning_rate": 2.0664115792548616e-06, + "loss": 1.4034, + "step": 25683 + }, + { + "epoch": 2.189039461348334, + "grad_norm": 45.505182153570786, + "learning_rate": 2.066010060300157e-06, + "loss": 1.4196, + "step": 25684 + }, + { + "epoch": 2.189124691042359, + "grad_norm": 42.820501003444264, + "learning_rate": 2.065608570200076e-06, + "loss": 1.0313, + "step": 25685 + }, + { + "epoch": 2.1892099207363844, + "grad_norm": 37.64069266805289, + "learning_rate": 2.06520710895857e-06, + "loss": 1.5829, + "step": 25686 + }, + { + "epoch": 2.18929515043041, + "grad_norm": 53.45120870781575, + "learning_rate": 2.0648056765795865e-06, + "loss": 1.4319, + "step": 25687 + }, + { + "epoch": 2.1893803801244354, + "grad_norm": 17.40279966317365, + "learning_rate": 2.0644042730670704e-06, + "loss": 0.724, + "step": 25688 + }, + { + "epoch": 2.189465609818461, + "grad_norm": 71.23594235968005, + "learning_rate": 2.064002898424974e-06, + "loss": 1.9685, + "step": 25689 + }, + { + "epoch": 2.1895508395124863, + "grad_norm": 57.51259950076394, + "learning_rate": 2.0636015526572422e-06, + "loss": 1.9286, + "step": 25690 + }, + { + "epoch": 2.1896360692065118, + "grad_norm": 69.69021240898685, + "learning_rate": 2.063200235767821e-06, + "loss": 1.6278, + "step": 25691 + }, + { + "epoch": 2.189721298900537, + "grad_norm": 56.463985060752286, + "learning_rate": 2.06279894776066e-06, + "loss": 2.172, + "step": 25692 + }, + { + "epoch": 2.1898065285945623, + "grad_norm": 62.56856530810735, + "learning_rate": 2.062397688639702e-06, + "loss": 1.9827, + "step": 25693 + }, + { + "epoch": 2.1898917582885877, + "grad_norm": 72.32205525086744, + "learning_rate": 2.061996458408898e-06, + "loss": 2.6038, + "step": 25694 + }, + { + "epoch": 2.189976987982613, + "grad_norm": 59.827731870710124, + "learning_rate": 2.061595257072191e-06, + "loss": 1.6786, + "step": 25695 + }, + { + "epoch": 2.1900622176766387, + "grad_norm": 84.32922698422338, + "learning_rate": 2.0611940846335256e-06, + "loss": 2.4748, + "step": 25696 + }, + { + "epoch": 2.190147447370664, + "grad_norm": 60.20918954105243, + "learning_rate": 2.0607929410968497e-06, + "loss": 1.717, + "step": 25697 + }, + { + "epoch": 2.190232677064689, + "grad_norm": 25.599752418085565, + "learning_rate": 2.060391826466108e-06, + "loss": 0.8666, + "step": 25698 + }, + { + "epoch": 2.1903179067587146, + "grad_norm": 67.56105735114936, + "learning_rate": 2.059990740745244e-06, + "loss": 1.9321, + "step": 25699 + }, + { + "epoch": 2.19040313645274, + "grad_norm": 73.95704502256783, + "learning_rate": 2.0595896839382013e-06, + "loss": 1.7259, + "step": 25700 + }, + { + "epoch": 2.1904883661467656, + "grad_norm": 52.83098551896159, + "learning_rate": 2.0591886560489255e-06, + "loss": 1.7528, + "step": 25701 + }, + { + "epoch": 2.190573595840791, + "grad_norm": 59.962742751355435, + "learning_rate": 2.058787657081363e-06, + "loss": 1.841, + "step": 25702 + }, + { + "epoch": 2.1906588255348165, + "grad_norm": 39.39438649091067, + "learning_rate": 2.058386687039455e-06, + "loss": 1.6567, + "step": 25703 + }, + { + "epoch": 2.1907440552288415, + "grad_norm": 66.50585484358774, + "learning_rate": 2.0579857459271448e-06, + "loss": 2.4923, + "step": 25704 + }, + { + "epoch": 2.190829284922867, + "grad_norm": 58.99208377139517, + "learning_rate": 2.057584833748374e-06, + "loss": 1.7748, + "step": 25705 + }, + { + "epoch": 2.1909145146168925, + "grad_norm": 67.40822103639421, + "learning_rate": 2.057183950507089e-06, + "loss": 1.4109, + "step": 25706 + }, + { + "epoch": 2.190999744310918, + "grad_norm": 47.6897978979226, + "learning_rate": 2.0567830962072305e-06, + "loss": 1.551, + "step": 25707 + }, + { + "epoch": 2.1910849740049434, + "grad_norm": 23.472646705093062, + "learning_rate": 2.0563822708527387e-06, + "loss": 0.8159, + "step": 25708 + }, + { + "epoch": 2.191170203698969, + "grad_norm": 36.65262238603326, + "learning_rate": 2.0559814744475576e-06, + "loss": 1.2278, + "step": 25709 + }, + { + "epoch": 2.1912554333929943, + "grad_norm": 57.943934301499404, + "learning_rate": 2.0555807069956313e-06, + "loss": 1.2624, + "step": 25710 + }, + { + "epoch": 2.1913406630870194, + "grad_norm": 41.01396412731843, + "learning_rate": 2.0551799685008984e-06, + "loss": 1.0882, + "step": 25711 + }, + { + "epoch": 2.191425892781045, + "grad_norm": 64.54463345732556, + "learning_rate": 2.0547792589673e-06, + "loss": 1.5275, + "step": 25712 + }, + { + "epoch": 2.1915111224750703, + "grad_norm": 30.52598082825907, + "learning_rate": 2.0543785783987774e-06, + "loss": 1.2485, + "step": 25713 + }, + { + "epoch": 2.1915963521690958, + "grad_norm": 66.3284284965959, + "learning_rate": 2.0539779267992695e-06, + "loss": 1.1088, + "step": 25714 + }, + { + "epoch": 2.1916815818631212, + "grad_norm": 74.66043780487611, + "learning_rate": 2.0535773041727197e-06, + "loss": 2.3426, + "step": 25715 + }, + { + "epoch": 2.1917668115571467, + "grad_norm": 35.89803609949174, + "learning_rate": 2.053176710523064e-06, + "loss": 1.4738, + "step": 25716 + }, + { + "epoch": 2.1918520412511717, + "grad_norm": 57.04521890252591, + "learning_rate": 2.0527761458542474e-06, + "loss": 2.0284, + "step": 25717 + }, + { + "epoch": 2.191937270945197, + "grad_norm": 52.451703729974966, + "learning_rate": 2.0523756101702035e-06, + "loss": 1.7693, + "step": 25718 + }, + { + "epoch": 2.1920225006392227, + "grad_norm": 24.023509590478817, + "learning_rate": 2.0519751034748763e-06, + "loss": 1.0928, + "step": 25719 + }, + { + "epoch": 2.192107730333248, + "grad_norm": 54.875984107789016, + "learning_rate": 2.0515746257722024e-06, + "loss": 1.1565, + "step": 25720 + }, + { + "epoch": 2.1921929600272736, + "grad_norm": 29.321354633247644, + "learning_rate": 2.051174177066121e-06, + "loss": 1.2186, + "step": 25721 + }, + { + "epoch": 2.192278189721299, + "grad_norm": 39.29224362712424, + "learning_rate": 2.0507737573605674e-06, + "loss": 0.8609, + "step": 25722 + }, + { + "epoch": 2.192363419415324, + "grad_norm": 45.143379260205656, + "learning_rate": 2.050373366659484e-06, + "loss": 1.872, + "step": 25723 + }, + { + "epoch": 2.1924486491093496, + "grad_norm": 31.455342236951033, + "learning_rate": 2.049973004966805e-06, + "loss": 0.9275, + "step": 25724 + }, + { + "epoch": 2.192533878803375, + "grad_norm": 64.2732710474481, + "learning_rate": 2.0495726722864706e-06, + "loss": 1.6405, + "step": 25725 + }, + { + "epoch": 2.1926191084974005, + "grad_norm": 24.089953520597184, + "learning_rate": 2.0491723686224146e-06, + "loss": 0.714, + "step": 25726 + }, + { + "epoch": 2.192704338191426, + "grad_norm": 30.57853560537605, + "learning_rate": 2.048772093978578e-06, + "loss": 0.8387, + "step": 25727 + }, + { + "epoch": 2.1927895678854514, + "grad_norm": 62.032319296555535, + "learning_rate": 2.0483718483588947e-06, + "loss": 1.2849, + "step": 25728 + }, + { + "epoch": 2.192874797579477, + "grad_norm": 27.903036756870968, + "learning_rate": 2.047971631767302e-06, + "loss": 1.2387, + "step": 25729 + }, + { + "epoch": 2.192960027273502, + "grad_norm": 28.49461827399412, + "learning_rate": 2.047571444207734e-06, + "loss": 1.0485, + "step": 25730 + }, + { + "epoch": 2.1930452569675274, + "grad_norm": 51.915801188870745, + "learning_rate": 2.047171285684128e-06, + "loss": 1.6224, + "step": 25731 + }, + { + "epoch": 2.193130486661553, + "grad_norm": 30.326515609735537, + "learning_rate": 2.046771156200418e-06, + "loss": 0.9923, + "step": 25732 + }, + { + "epoch": 2.1932157163555783, + "grad_norm": 39.55654175309678, + "learning_rate": 2.046371055760542e-06, + "loss": 0.5967, + "step": 25733 + }, + { + "epoch": 2.193300946049604, + "grad_norm": 56.73246849072449, + "learning_rate": 2.0459709843684333e-06, + "loss": 2.4524, + "step": 25734 + }, + { + "epoch": 2.1933861757436293, + "grad_norm": 63.07405390660994, + "learning_rate": 2.045570942028024e-06, + "loss": 1.454, + "step": 25735 + }, + { + "epoch": 2.1934714054376543, + "grad_norm": 33.56504714014022, + "learning_rate": 2.0451709287432527e-06, + "loss": 1.4058, + "step": 25736 + }, + { + "epoch": 2.1935566351316798, + "grad_norm": 54.89071522000634, + "learning_rate": 2.044770944518051e-06, + "loss": 1.5793, + "step": 25737 + }, + { + "epoch": 2.1936418648257052, + "grad_norm": 21.402313346893784, + "learning_rate": 2.0443709893563505e-06, + "loss": 0.7122, + "step": 25738 + }, + { + "epoch": 2.1937270945197307, + "grad_norm": 42.4234998953731, + "learning_rate": 2.043971063262089e-06, + "loss": 1.4626, + "step": 25739 + }, + { + "epoch": 2.193812324213756, + "grad_norm": 49.48509322724354, + "learning_rate": 2.0435711662391954e-06, + "loss": 1.3159, + "step": 25740 + }, + { + "epoch": 2.1938975539077816, + "grad_norm": 48.837529010997244, + "learning_rate": 2.043171298291607e-06, + "loss": 1.4485, + "step": 25741 + }, + { + "epoch": 2.1939827836018067, + "grad_norm": 49.21452631596739, + "learning_rate": 2.0427714594232535e-06, + "loss": 1.2395, + "step": 25742 + }, + { + "epoch": 2.194068013295832, + "grad_norm": 31.995043470896803, + "learning_rate": 2.042371649638068e-06, + "loss": 1.1662, + "step": 25743 + }, + { + "epoch": 2.1941532429898576, + "grad_norm": 50.48926093513091, + "learning_rate": 2.0419718689399797e-06, + "loss": 1.6133, + "step": 25744 + }, + { + "epoch": 2.194238472683883, + "grad_norm": 60.89976482193857, + "learning_rate": 2.041572117332925e-06, + "loss": 1.5082, + "step": 25745 + }, + { + "epoch": 2.1943237023779085, + "grad_norm": 48.082318207757375, + "learning_rate": 2.041172394820833e-06, + "loss": 1.6327, + "step": 25746 + }, + { + "epoch": 2.194408932071934, + "grad_norm": 49.99274169840013, + "learning_rate": 2.0407727014076325e-06, + "loss": 1.2406, + "step": 25747 + }, + { + "epoch": 2.1944941617659595, + "grad_norm": 35.87505380077227, + "learning_rate": 2.040373037097257e-06, + "loss": 1.23, + "step": 25748 + }, + { + "epoch": 2.1945793914599845, + "grad_norm": 39.65822082309085, + "learning_rate": 2.039973401893638e-06, + "loss": 0.7839, + "step": 25749 + }, + { + "epoch": 2.19466462115401, + "grad_norm": 78.81880774121291, + "learning_rate": 2.0395737958007043e-06, + "loss": 2.5234, + "step": 25750 + }, + { + "epoch": 2.1947498508480354, + "grad_norm": 36.343375810723664, + "learning_rate": 2.0391742188223866e-06, + "loss": 0.9648, + "step": 25751 + }, + { + "epoch": 2.194835080542061, + "grad_norm": 34.74749054724927, + "learning_rate": 2.0387746709626124e-06, + "loss": 0.9371, + "step": 25752 + }, + { + "epoch": 2.1949203102360864, + "grad_norm": 52.17097217942792, + "learning_rate": 2.038375152225313e-06, + "loss": 1.8663, + "step": 25753 + }, + { + "epoch": 2.195005539930112, + "grad_norm": 39.61854209379313, + "learning_rate": 2.037975662614418e-06, + "loss": 1.281, + "step": 25754 + }, + { + "epoch": 2.195090769624137, + "grad_norm": 43.679137835221134, + "learning_rate": 2.037576202133854e-06, + "loss": 1.9242, + "step": 25755 + }, + { + "epoch": 2.1951759993181623, + "grad_norm": 35.99899877487898, + "learning_rate": 2.0371767707875504e-06, + "loss": 1.2848, + "step": 25756 + }, + { + "epoch": 2.195261229012188, + "grad_norm": 35.1882321662579, + "learning_rate": 2.036777368579438e-06, + "loss": 1.2091, + "step": 25757 + }, + { + "epoch": 2.1953464587062133, + "grad_norm": 43.95162168635261, + "learning_rate": 2.0363779955134426e-06, + "loss": 1.1892, + "step": 25758 + }, + { + "epoch": 2.1954316884002387, + "grad_norm": 46.236781737342504, + "learning_rate": 2.035978651593492e-06, + "loss": 1.2043, + "step": 25759 + }, + { + "epoch": 2.195516918094264, + "grad_norm": 47.011492750086724, + "learning_rate": 2.0355793368235134e-06, + "loss": 1.5414, + "step": 25760 + }, + { + "epoch": 2.195602147788289, + "grad_norm": 59.117984329284795, + "learning_rate": 2.0351800512074326e-06, + "loss": 1.5207, + "step": 25761 + }, + { + "epoch": 2.1956873774823147, + "grad_norm": 39.73569175376125, + "learning_rate": 2.0347807947491794e-06, + "loss": 1.3527, + "step": 25762 + }, + { + "epoch": 2.19577260717634, + "grad_norm": 47.961381180950475, + "learning_rate": 2.034381567452677e-06, + "loss": 1.2057, + "step": 25763 + }, + { + "epoch": 2.1958578368703656, + "grad_norm": 71.08158177728531, + "learning_rate": 2.0339823693218554e-06, + "loss": 1.7281, + "step": 25764 + }, + { + "epoch": 2.195943066564391, + "grad_norm": 27.200052095752486, + "learning_rate": 2.033583200360637e-06, + "loss": 1.1012, + "step": 25765 + }, + { + "epoch": 2.1960282962584166, + "grad_norm": 34.13919704089138, + "learning_rate": 2.03318406057295e-06, + "loss": 1.4282, + "step": 25766 + }, + { + "epoch": 2.196113525952442, + "grad_norm": 31.775311037333033, + "learning_rate": 2.03278494996272e-06, + "loss": 0.9579, + "step": 25767 + }, + { + "epoch": 2.196198755646467, + "grad_norm": 50.47708745380164, + "learning_rate": 2.0323858685338703e-06, + "loss": 1.317, + "step": 25768 + }, + { + "epoch": 2.1962839853404925, + "grad_norm": 57.10116847681625, + "learning_rate": 2.031986816290325e-06, + "loss": 1.3407, + "step": 25769 + }, + { + "epoch": 2.196369215034518, + "grad_norm": 88.15633415188042, + "learning_rate": 2.0315877932360116e-06, + "loss": 2.5877, + "step": 25770 + }, + { + "epoch": 2.1964544447285435, + "grad_norm": 114.52307293787281, + "learning_rate": 2.031188799374851e-06, + "loss": 0.6116, + "step": 25771 + }, + { + "epoch": 2.196539674422569, + "grad_norm": 65.25027351231829, + "learning_rate": 2.0307898347107706e-06, + "loss": 2.1161, + "step": 25772 + }, + { + "epoch": 2.1966249041165944, + "grad_norm": 38.59173631875341, + "learning_rate": 2.030390899247692e-06, + "loss": 1.5633, + "step": 25773 + }, + { + "epoch": 2.1967101338106194, + "grad_norm": 42.254664372860645, + "learning_rate": 2.029991992989537e-06, + "loss": 1.7874, + "step": 25774 + }, + { + "epoch": 2.196795363504645, + "grad_norm": 24.013898057088323, + "learning_rate": 2.0295931159402334e-06, + "loss": 0.8052, + "step": 25775 + }, + { + "epoch": 2.1968805931986704, + "grad_norm": 57.201053527204785, + "learning_rate": 2.0291942681037e-06, + "loss": 1.686, + "step": 25776 + }, + { + "epoch": 2.196965822892696, + "grad_norm": 39.62912050966538, + "learning_rate": 2.02879544948386e-06, + "loss": 1.2363, + "step": 25777 + }, + { + "epoch": 2.1970510525867213, + "grad_norm": 62.09049495721961, + "learning_rate": 2.0283966600846365e-06, + "loss": 1.735, + "step": 25778 + }, + { + "epoch": 2.1971362822807468, + "grad_norm": 40.57450816936709, + "learning_rate": 2.02799789990995e-06, + "loss": 1.5164, + "step": 25779 + }, + { + "epoch": 2.197221511974772, + "grad_norm": 30.17588047392981, + "learning_rate": 2.0275991689637255e-06, + "loss": 0.9785, + "step": 25780 + }, + { + "epoch": 2.1973067416687972, + "grad_norm": 33.123159724712266, + "learning_rate": 2.0272004672498814e-06, + "loss": 0.9897, + "step": 25781 + }, + { + "epoch": 2.1973919713628227, + "grad_norm": 62.5142226087922, + "learning_rate": 2.026801794772337e-06, + "loss": 1.3966, + "step": 25782 + }, + { + "epoch": 2.197477201056848, + "grad_norm": 57.63250621122502, + "learning_rate": 2.0264031515350185e-06, + "loss": 1.1207, + "step": 25783 + }, + { + "epoch": 2.1975624307508737, + "grad_norm": 19.34665358735281, + "learning_rate": 2.026004537541843e-06, + "loss": 0.77, + "step": 25784 + }, + { + "epoch": 2.197647660444899, + "grad_norm": 23.765666454618458, + "learning_rate": 2.025605952796729e-06, + "loss": 1.1033, + "step": 25785 + }, + { + "epoch": 2.1977328901389246, + "grad_norm": 55.56844232612237, + "learning_rate": 2.025207397303601e-06, + "loss": 1.7359, + "step": 25786 + }, + { + "epoch": 2.1978181198329496, + "grad_norm": 53.873774294103555, + "learning_rate": 2.0248088710663737e-06, + "loss": 1.4702, + "step": 25787 + }, + { + "epoch": 2.197903349526975, + "grad_norm": 54.32457513309742, + "learning_rate": 2.024410374088971e-06, + "loss": 1.6662, + "step": 25788 + }, + { + "epoch": 2.1979885792210005, + "grad_norm": 37.160828337481476, + "learning_rate": 2.02401190637531e-06, + "loss": 1.6319, + "step": 25789 + }, + { + "epoch": 2.198073808915026, + "grad_norm": 68.41732401534624, + "learning_rate": 2.023613467929309e-06, + "loss": 1.4889, + "step": 25790 + }, + { + "epoch": 2.1981590386090515, + "grad_norm": 70.16418770235026, + "learning_rate": 2.0232150587548852e-06, + "loss": 2.1364, + "step": 25791 + }, + { + "epoch": 2.198244268303077, + "grad_norm": 82.19811396655439, + "learning_rate": 2.0228166788559608e-06, + "loss": 1.6756, + "step": 25792 + }, + { + "epoch": 2.1983294979971024, + "grad_norm": 50.562652788814944, + "learning_rate": 2.022418328236451e-06, + "loss": 1.7481, + "step": 25793 + }, + { + "epoch": 2.1984147276911274, + "grad_norm": 47.71437766732199, + "learning_rate": 2.0220200069002717e-06, + "loss": 1.4665, + "step": 25794 + }, + { + "epoch": 2.198499957385153, + "grad_norm": 47.708636972448865, + "learning_rate": 2.0216217148513423e-06, + "loss": 1.2551, + "step": 25795 + }, + { + "epoch": 2.1985851870791784, + "grad_norm": 56.99356174045554, + "learning_rate": 2.0212234520935812e-06, + "loss": 1.7226, + "step": 25796 + }, + { + "epoch": 2.198670416773204, + "grad_norm": 129.75125453211265, + "learning_rate": 2.0208252186309044e-06, + "loss": 4.2725, + "step": 25797 + }, + { + "epoch": 2.1987556464672293, + "grad_norm": 43.7355418917494, + "learning_rate": 2.020427014467228e-06, + "loss": 0.7913, + "step": 25798 + }, + { + "epoch": 2.198840876161255, + "grad_norm": 29.213367215457257, + "learning_rate": 2.020028839606466e-06, + "loss": 1.7225, + "step": 25799 + }, + { + "epoch": 2.19892610585528, + "grad_norm": 29.551823417921096, + "learning_rate": 2.019630694052538e-06, + "loss": 1.5875, + "step": 25800 + }, + { + "epoch": 2.1990113355493053, + "grad_norm": 54.9464908112077, + "learning_rate": 2.019232577809358e-06, + "loss": 1.2504, + "step": 25801 + }, + { + "epoch": 2.1990965652433307, + "grad_norm": 51.06904410876965, + "learning_rate": 2.018834490880839e-06, + "loss": 1.8691, + "step": 25802 + }, + { + "epoch": 2.199181794937356, + "grad_norm": 40.29705379672018, + "learning_rate": 2.0184364332709e-06, + "loss": 1.2435, + "step": 25803 + }, + { + "epoch": 2.1992670246313817, + "grad_norm": 54.55279566133729, + "learning_rate": 2.0180384049834522e-06, + "loss": 1.5064, + "step": 25804 + }, + { + "epoch": 2.199352254325407, + "grad_norm": 59.94327150048126, + "learning_rate": 2.0176404060224135e-06, + "loss": 2.0952, + "step": 25805 + }, + { + "epoch": 2.199437484019432, + "grad_norm": 58.50956142192704, + "learning_rate": 2.0172424363916965e-06, + "loss": 1.9407, + "step": 25806 + }, + { + "epoch": 2.1995227137134576, + "grad_norm": 41.87765458796554, + "learning_rate": 2.016844496095215e-06, + "loss": 1.2701, + "step": 25807 + }, + { + "epoch": 2.199607943407483, + "grad_norm": 69.65802484875512, + "learning_rate": 2.0164465851368803e-06, + "loss": 2.3908, + "step": 25808 + }, + { + "epoch": 2.1996931731015086, + "grad_norm": 89.3614790330043, + "learning_rate": 2.01604870352061e-06, + "loss": 1.5821, + "step": 25809 + }, + { + "epoch": 2.199778402795534, + "grad_norm": 26.17797273146789, + "learning_rate": 2.015650851250313e-06, + "loss": 0.6796, + "step": 25810 + }, + { + "epoch": 2.1998636324895595, + "grad_norm": 88.11308689368973, + "learning_rate": 2.015253028329906e-06, + "loss": 1.4251, + "step": 25811 + }, + { + "epoch": 2.199948862183585, + "grad_norm": 49.732503616311575, + "learning_rate": 2.014855234763298e-06, + "loss": 1.0863, + "step": 25812 + }, + { + "epoch": 2.20003409187761, + "grad_norm": 47.84098342512314, + "learning_rate": 2.0144574705544034e-06, + "loss": 1.0789, + "step": 25813 + }, + { + "epoch": 2.2001193215716355, + "grad_norm": 32.933602699256234, + "learning_rate": 2.014059735707134e-06, + "loss": 0.8538, + "step": 25814 + }, + { + "epoch": 2.200204551265661, + "grad_norm": 34.434981378154795, + "learning_rate": 2.0136620302254005e-06, + "loss": 0.8938, + "step": 25815 + }, + { + "epoch": 2.2002897809596864, + "grad_norm": 29.237614788530994, + "learning_rate": 2.0132643541131123e-06, + "loss": 0.9088, + "step": 25816 + }, + { + "epoch": 2.200375010653712, + "grad_norm": 35.103951732239786, + "learning_rate": 2.0128667073741843e-06, + "loss": 1.2264, + "step": 25817 + }, + { + "epoch": 2.2004602403477374, + "grad_norm": 35.85943421561854, + "learning_rate": 2.012469090012523e-06, + "loss": 1.0739, + "step": 25818 + }, + { + "epoch": 2.2005454700417624, + "grad_norm": 62.092705499425406, + "learning_rate": 2.0120715020320436e-06, + "loss": 1.567, + "step": 25819 + }, + { + "epoch": 2.200630699735788, + "grad_norm": 26.5298561575469, + "learning_rate": 2.0116739434366533e-06, + "loss": 0.6175, + "step": 25820 + }, + { + "epoch": 2.2007159294298133, + "grad_norm": 55.608070742334775, + "learning_rate": 2.011276414230261e-06, + "loss": 1.5736, + "step": 25821 + }, + { + "epoch": 2.200801159123839, + "grad_norm": 65.21303419095743, + "learning_rate": 2.010878914416779e-06, + "loss": 1.3208, + "step": 25822 + }, + { + "epoch": 2.2008863888178642, + "grad_norm": 53.51749038587309, + "learning_rate": 2.010481444000115e-06, + "loss": 1.6204, + "step": 25823 + }, + { + "epoch": 2.2009716185118897, + "grad_norm": 58.35912616601466, + "learning_rate": 2.0100840029841763e-06, + "loss": 1.6433, + "step": 25824 + }, + { + "epoch": 2.2010568482059147, + "grad_norm": 31.883173892266186, + "learning_rate": 2.009686591372875e-06, + "loss": 1.1447, + "step": 25825 + }, + { + "epoch": 2.20114207789994, + "grad_norm": 65.76433893692125, + "learning_rate": 2.0092892091701165e-06, + "loss": 2.3644, + "step": 25826 + }, + { + "epoch": 2.2012273075939657, + "grad_norm": 34.288134266283784, + "learning_rate": 2.0088918563798117e-06, + "loss": 1.4565, + "step": 25827 + }, + { + "epoch": 2.201312537287991, + "grad_norm": 58.41362889926928, + "learning_rate": 2.008494533005867e-06, + "loss": 1.3766, + "step": 25828 + }, + { + "epoch": 2.2013977669820166, + "grad_norm": 45.99706364485857, + "learning_rate": 2.008097239052188e-06, + "loss": 1.0651, + "step": 25829 + }, + { + "epoch": 2.201482996676042, + "grad_norm": 46.73593214323738, + "learning_rate": 2.0076999745226854e-06, + "loss": 1.3603, + "step": 25830 + }, + { + "epoch": 2.2015682263700675, + "grad_norm": 47.90703787284533, + "learning_rate": 2.007302739421265e-06, + "loss": 1.4083, + "step": 25831 + }, + { + "epoch": 2.2016534560640926, + "grad_norm": 50.777514443339825, + "learning_rate": 2.0069055337518314e-06, + "loss": 1.46, + "step": 25832 + }, + { + "epoch": 2.201738685758118, + "grad_norm": 75.4260042683944, + "learning_rate": 2.006508357518294e-06, + "loss": 2.2253, + "step": 25833 + }, + { + "epoch": 2.2018239154521435, + "grad_norm": 35.80599523719653, + "learning_rate": 2.006111210724555e-06, + "loss": 1.4668, + "step": 25834 + }, + { + "epoch": 2.201909145146169, + "grad_norm": 71.02168596722352, + "learning_rate": 2.0057140933745253e-06, + "loss": 1.7796, + "step": 25835 + }, + { + "epoch": 2.2019943748401944, + "grad_norm": 64.64152302730689, + "learning_rate": 2.005317005472107e-06, + "loss": 1.4649, + "step": 25836 + }, + { + "epoch": 2.20207960453422, + "grad_norm": 89.66214693195793, + "learning_rate": 2.004919947021206e-06, + "loss": 1.7549, + "step": 25837 + }, + { + "epoch": 2.202164834228245, + "grad_norm": 69.47715852240404, + "learning_rate": 2.004522918025726e-06, + "loss": 1.9236, + "step": 25838 + }, + { + "epoch": 2.2022500639222704, + "grad_norm": 26.42157351122193, + "learning_rate": 2.0041259184895747e-06, + "loss": 0.9414, + "step": 25839 + }, + { + "epoch": 2.202335293616296, + "grad_norm": 20.429834712228782, + "learning_rate": 2.0037289484166538e-06, + "loss": 0.707, + "step": 25840 + }, + { + "epoch": 2.2024205233103213, + "grad_norm": 32.61550099131031, + "learning_rate": 2.0033320078108664e-06, + "loss": 1.1441, + "step": 25841 + }, + { + "epoch": 2.202505753004347, + "grad_norm": 46.76518418964927, + "learning_rate": 2.0029350966761184e-06, + "loss": 1.6474, + "step": 25842 + }, + { + "epoch": 2.2025909826983723, + "grad_norm": 55.07906789867757, + "learning_rate": 2.002538215016314e-06, + "loss": 1.8977, + "step": 25843 + }, + { + "epoch": 2.2026762123923973, + "grad_norm": 56.74655195659106, + "learning_rate": 2.002141362835356e-06, + "loss": 2.3485, + "step": 25844 + }, + { + "epoch": 2.2027614420864228, + "grad_norm": 27.91326178228092, + "learning_rate": 2.001744540137146e-06, + "loss": 0.8036, + "step": 25845 + }, + { + "epoch": 2.2028466717804482, + "grad_norm": 69.95786815265221, + "learning_rate": 2.001347746925587e-06, + "loss": 1.916, + "step": 25846 + }, + { + "epoch": 2.2029319014744737, + "grad_norm": 65.51800802261013, + "learning_rate": 2.00095098320458e-06, + "loss": 1.9439, + "step": 25847 + }, + { + "epoch": 2.203017131168499, + "grad_norm": 43.69622373128471, + "learning_rate": 2.0005542489780296e-06, + "loss": 1.3116, + "step": 25848 + }, + { + "epoch": 2.2031023608625246, + "grad_norm": 63.62470052272561, + "learning_rate": 2.0001575442498346e-06, + "loss": 1.909, + "step": 25849 + }, + { + "epoch": 2.20318759055655, + "grad_norm": 57.108851926775095, + "learning_rate": 1.9997608690239e-06, + "loss": 1.5916, + "step": 25850 + }, + { + "epoch": 2.203272820250575, + "grad_norm": 44.85913257721733, + "learning_rate": 1.999364223304123e-06, + "loss": 1.5381, + "step": 25851 + }, + { + "epoch": 2.2033580499446006, + "grad_norm": 25.807087633954453, + "learning_rate": 1.998967607094409e-06, + "loss": 0.9544, + "step": 25852 + }, + { + "epoch": 2.203443279638626, + "grad_norm": 53.38455408663368, + "learning_rate": 1.9985710203986558e-06, + "loss": 1.8267, + "step": 25853 + }, + { + "epoch": 2.2035285093326515, + "grad_norm": 42.46391703443993, + "learning_rate": 1.998174463220764e-06, + "loss": 1.3979, + "step": 25854 + }, + { + "epoch": 2.203613739026677, + "grad_norm": 40.71764858904257, + "learning_rate": 1.9977779355646316e-06, + "loss": 1.3141, + "step": 25855 + }, + { + "epoch": 2.2036989687207025, + "grad_norm": 70.68354796827157, + "learning_rate": 1.997381437434162e-06, + "loss": 1.8928, + "step": 25856 + }, + { + "epoch": 2.2037841984147275, + "grad_norm": 42.83721554839597, + "learning_rate": 1.996984968833251e-06, + "loss": 1.4037, + "step": 25857 + }, + { + "epoch": 2.203869428108753, + "grad_norm": 31.089081919165217, + "learning_rate": 1.9965885297658015e-06, + "loss": 1.2017, + "step": 25858 + }, + { + "epoch": 2.2039546578027784, + "grad_norm": 66.44296416878797, + "learning_rate": 1.9961921202357105e-06, + "loss": 2.1264, + "step": 25859 + }, + { + "epoch": 2.204039887496804, + "grad_norm": 18.313026017692632, + "learning_rate": 1.9957957402468743e-06, + "loss": 0.7055, + "step": 25860 + }, + { + "epoch": 2.2041251171908294, + "grad_norm": 25.807814082965493, + "learning_rate": 1.9953993898031954e-06, + "loss": 0.4351, + "step": 25861 + }, + { + "epoch": 2.204210346884855, + "grad_norm": 37.92842429692337, + "learning_rate": 1.995003068908569e-06, + "loss": 1.6649, + "step": 25862 + }, + { + "epoch": 2.20429557657888, + "grad_norm": 42.11433161843827, + "learning_rate": 1.994606777566892e-06, + "loss": 1.5971, + "step": 25863 + }, + { + "epoch": 2.2043808062729053, + "grad_norm": 50.68148236491951, + "learning_rate": 1.994210515782065e-06, + "loss": 1.1942, + "step": 25864 + }, + { + "epoch": 2.204466035966931, + "grad_norm": 35.40951742899012, + "learning_rate": 1.993814283557981e-06, + "loss": 1.1495, + "step": 25865 + }, + { + "epoch": 2.2045512656609563, + "grad_norm": 58.70912268268559, + "learning_rate": 1.993418080898541e-06, + "loss": 1.4468, + "step": 25866 + }, + { + "epoch": 2.2046364953549817, + "grad_norm": 79.91746652575499, + "learning_rate": 1.9930219078076397e-06, + "loss": 1.6601, + "step": 25867 + }, + { + "epoch": 2.204721725049007, + "grad_norm": 36.35092657525846, + "learning_rate": 1.992625764289171e-06, + "loss": 1.5529, + "step": 25868 + }, + { + "epoch": 2.2048069547430327, + "grad_norm": 37.04270930409899, + "learning_rate": 1.9922296503470344e-06, + "loss": 1.3935, + "step": 25869 + }, + { + "epoch": 2.2048921844370577, + "grad_norm": 68.18916316460796, + "learning_rate": 1.9918335659851245e-06, + "loss": 1.3618, + "step": 25870 + }, + { + "epoch": 2.204977414131083, + "grad_norm": 31.489218958558716, + "learning_rate": 1.991437511207335e-06, + "loss": 0.9173, + "step": 25871 + }, + { + "epoch": 2.2050626438251086, + "grad_norm": 36.826885488251165, + "learning_rate": 1.9910414860175624e-06, + "loss": 1.2288, + "step": 25872 + }, + { + "epoch": 2.205147873519134, + "grad_norm": 48.76260678095709, + "learning_rate": 1.9906454904197003e-06, + "loss": 1.3076, + "step": 25873 + }, + { + "epoch": 2.2052331032131596, + "grad_norm": 52.37458399497533, + "learning_rate": 1.9902495244176458e-06, + "loss": 0.9645, + "step": 25874 + }, + { + "epoch": 2.205318332907185, + "grad_norm": 34.36918181878392, + "learning_rate": 1.9898535880152906e-06, + "loss": 1.6605, + "step": 25875 + }, + { + "epoch": 2.20540356260121, + "grad_norm": 28.654317721867045, + "learning_rate": 1.98945768121653e-06, + "loss": 1.1062, + "step": 25876 + }, + { + "epoch": 2.2054887922952355, + "grad_norm": 56.112311192202284, + "learning_rate": 1.989061804025255e-06, + "loss": 1.3833, + "step": 25877 + }, + { + "epoch": 2.205574021989261, + "grad_norm": 56.2297095728024, + "learning_rate": 1.988665956445362e-06, + "loss": 1.9383, + "step": 25878 + }, + { + "epoch": 2.2056592516832865, + "grad_norm": 53.83897500521691, + "learning_rate": 1.988270138480741e-06, + "loss": 1.5342, + "step": 25879 + }, + { + "epoch": 2.205744481377312, + "grad_norm": 43.92909330696406, + "learning_rate": 1.987874350135289e-06, + "loss": 1.3259, + "step": 25880 + }, + { + "epoch": 2.2058297110713374, + "grad_norm": 26.45601427441206, + "learning_rate": 1.987478591412893e-06, + "loss": 1.4286, + "step": 25881 + }, + { + "epoch": 2.2059149407653624, + "grad_norm": 35.71489442833, + "learning_rate": 1.9870828623174505e-06, + "loss": 1.5191, + "step": 25882 + }, + { + "epoch": 2.206000170459388, + "grad_norm": 34.340767297683584, + "learning_rate": 1.9866871628528506e-06, + "loss": 1.395, + "step": 25883 + }, + { + "epoch": 2.2060854001534134, + "grad_norm": 36.570639040852754, + "learning_rate": 1.986291493022985e-06, + "loss": 1.3903, + "step": 25884 + }, + { + "epoch": 2.206170629847439, + "grad_norm": 17.397797038489607, + "learning_rate": 1.9858958528317433e-06, + "loss": 0.6587, + "step": 25885 + }, + { + "epoch": 2.2062558595414643, + "grad_norm": 37.849732000547974, + "learning_rate": 1.98550024228302e-06, + "loss": 1.2085, + "step": 25886 + }, + { + "epoch": 2.2063410892354898, + "grad_norm": 17.848911564765, + "learning_rate": 1.9851046613807025e-06, + "loss": 1.1086, + "step": 25887 + }, + { + "epoch": 2.2064263189295152, + "grad_norm": 57.5110572880417, + "learning_rate": 1.9847091101286834e-06, + "loss": 1.5754, + "step": 25888 + }, + { + "epoch": 2.2065115486235403, + "grad_norm": 37.88050846882728, + "learning_rate": 1.9843135885308533e-06, + "loss": 0.5713, + "step": 25889 + }, + { + "epoch": 2.2065967783175657, + "grad_norm": 39.143518330320575, + "learning_rate": 1.9839180965910987e-06, + "loss": 1.5196, + "step": 25890 + }, + { + "epoch": 2.206682008011591, + "grad_norm": 73.4998692285604, + "learning_rate": 1.9835226343133123e-06, + "loss": 2.2807, + "step": 25891 + }, + { + "epoch": 2.2067672377056167, + "grad_norm": 145.48456060894824, + "learning_rate": 1.9831272017013827e-06, + "loss": 2.1944, + "step": 25892 + }, + { + "epoch": 2.206852467399642, + "grad_norm": 33.436224598298175, + "learning_rate": 1.982731798759198e-06, + "loss": 1.4971, + "step": 25893 + }, + { + "epoch": 2.2069376970936676, + "grad_norm": 60.467085651864856, + "learning_rate": 1.982336425490646e-06, + "loss": 1.266, + "step": 25894 + }, + { + "epoch": 2.2070229267876926, + "grad_norm": 59.41310258587663, + "learning_rate": 1.981941081899617e-06, + "loss": 1.7685, + "step": 25895 + }, + { + "epoch": 2.207108156481718, + "grad_norm": 29.33500406552643, + "learning_rate": 1.981545767989997e-06, + "loss": 1.3782, + "step": 25896 + }, + { + "epoch": 2.2071933861757436, + "grad_norm": 69.88531837381484, + "learning_rate": 1.9811504837656765e-06, + "loss": 1.9691, + "step": 25897 + }, + { + "epoch": 2.207278615869769, + "grad_norm": 70.33130185984784, + "learning_rate": 1.9807552292305394e-06, + "loss": 2.4361, + "step": 25898 + }, + { + "epoch": 2.2073638455637945, + "grad_norm": 42.12233159374234, + "learning_rate": 1.9803600043884767e-06, + "loss": 1.2618, + "step": 25899 + }, + { + "epoch": 2.20744907525782, + "grad_norm": 41.32987566733409, + "learning_rate": 1.9799648092433744e-06, + "loss": 1.4507, + "step": 25900 + }, + { + "epoch": 2.207534304951845, + "grad_norm": 49.155707726434876, + "learning_rate": 1.9795696437991173e-06, + "loss": 1.8715, + "step": 25901 + }, + { + "epoch": 2.2076195346458705, + "grad_norm": 43.08665628176861, + "learning_rate": 1.9791745080595908e-06, + "loss": 1.5068, + "step": 25902 + }, + { + "epoch": 2.207704764339896, + "grad_norm": 64.52183055034473, + "learning_rate": 1.978779402028685e-06, + "loss": 1.1415, + "step": 25903 + }, + { + "epoch": 2.2077899940339214, + "grad_norm": 78.70564105536937, + "learning_rate": 1.978384325710281e-06, + "loss": 1.9379, + "step": 25904 + }, + { + "epoch": 2.207875223727947, + "grad_norm": 31.792073362387168, + "learning_rate": 1.9779892791082683e-06, + "loss": 1.1663, + "step": 25905 + }, + { + "epoch": 2.2079604534219723, + "grad_norm": 49.447896435189065, + "learning_rate": 1.9775942622265297e-06, + "loss": 1.6369, + "step": 25906 + }, + { + "epoch": 2.208045683115998, + "grad_norm": 24.95339164219244, + "learning_rate": 1.9771992750689497e-06, + "loss": 0.942, + "step": 25907 + }, + { + "epoch": 2.208130912810023, + "grad_norm": 31.827383473574056, + "learning_rate": 1.976804317639415e-06, + "loss": 1.071, + "step": 25908 + }, + { + "epoch": 2.2082161425040483, + "grad_norm": 62.76764988013711, + "learning_rate": 1.9764093899418086e-06, + "loss": 1.7511, + "step": 25909 + }, + { + "epoch": 2.2083013721980738, + "grad_norm": 28.62928997654505, + "learning_rate": 1.976014491980013e-06, + "loss": 1.2857, + "step": 25910 + }, + { + "epoch": 2.2083866018920992, + "grad_norm": 30.067494115813435, + "learning_rate": 1.9756196237579144e-06, + "loss": 0.9477, + "step": 25911 + }, + { + "epoch": 2.2084718315861247, + "grad_norm": 34.989955073455555, + "learning_rate": 1.9752247852793936e-06, + "loss": 1.0292, + "step": 25912 + }, + { + "epoch": 2.20855706128015, + "grad_norm": 17.86872049979693, + "learning_rate": 1.974829976548337e-06, + "loss": 0.6452, + "step": 25913 + }, + { + "epoch": 2.2086422909741756, + "grad_norm": 44.829891700285835, + "learning_rate": 1.974435197568625e-06, + "loss": 1.4411, + "step": 25914 + }, + { + "epoch": 2.2087275206682007, + "grad_norm": 57.35726670052664, + "learning_rate": 1.9740404483441387e-06, + "loss": 1.6263, + "step": 25915 + }, + { + "epoch": 2.208812750362226, + "grad_norm": 83.64544369987307, + "learning_rate": 1.973645728878765e-06, + "loss": 1.7105, + "step": 25916 + }, + { + "epoch": 2.2088979800562516, + "grad_norm": 59.859091343174015, + "learning_rate": 1.973251039176382e-06, + "loss": 1.9076, + "step": 25917 + }, + { + "epoch": 2.208983209750277, + "grad_norm": 22.183851852405272, + "learning_rate": 1.9728563792408712e-06, + "loss": 0.8458, + "step": 25918 + }, + { + "epoch": 2.2090684394443025, + "grad_norm": 47.621085676099895, + "learning_rate": 1.9724617490761167e-06, + "loss": 1.5006, + "step": 25919 + }, + { + "epoch": 2.2091536691383276, + "grad_norm": 54.22046145217609, + "learning_rate": 1.972067148685996e-06, + "loss": 1.879, + "step": 25920 + }, + { + "epoch": 2.209238898832353, + "grad_norm": 63.51200072515193, + "learning_rate": 1.9716725780743936e-06, + "loss": 1.9945, + "step": 25921 + }, + { + "epoch": 2.2093241285263785, + "grad_norm": 22.49796583705499, + "learning_rate": 1.971278037245187e-06, + "loss": 0.9455, + "step": 25922 + }, + { + "epoch": 2.209409358220404, + "grad_norm": 59.42933686658931, + "learning_rate": 1.9708835262022586e-06, + "loss": 1.1088, + "step": 25923 + }, + { + "epoch": 2.2094945879144294, + "grad_norm": 39.21055754718037, + "learning_rate": 1.970489044949485e-06, + "loss": 1.3047, + "step": 25924 + }, + { + "epoch": 2.209579817608455, + "grad_norm": 53.121522307411986, + "learning_rate": 1.9700945934907495e-06, + "loss": 1.7899, + "step": 25925 + }, + { + "epoch": 2.2096650473024804, + "grad_norm": 29.525397931971487, + "learning_rate": 1.9697001718299287e-06, + "loss": 1.0603, + "step": 25926 + }, + { + "epoch": 2.2097502769965054, + "grad_norm": 95.45138488257324, + "learning_rate": 1.9693057799709033e-06, + "loss": 2.2836, + "step": 25927 + }, + { + "epoch": 2.209835506690531, + "grad_norm": 65.01551464582361, + "learning_rate": 1.96891141791755e-06, + "loss": 1.5081, + "step": 25928 + }, + { + "epoch": 2.2099207363845563, + "grad_norm": 49.22291842088325, + "learning_rate": 1.9685170856737502e-06, + "loss": 2.1076, + "step": 25929 + }, + { + "epoch": 2.210005966078582, + "grad_norm": 51.74077187747654, + "learning_rate": 1.96812278324338e-06, + "loss": 1.2891, + "step": 25930 + }, + { + "epoch": 2.2100911957726073, + "grad_norm": 50.86336288004566, + "learning_rate": 1.967728510630318e-06, + "loss": 1.4517, + "step": 25931 + }, + { + "epoch": 2.2101764254666327, + "grad_norm": 55.182963262795596, + "learning_rate": 1.9673342678384403e-06, + "loss": 1.5638, + "step": 25932 + }, + { + "epoch": 2.210261655160658, + "grad_norm": 60.31537463933841, + "learning_rate": 1.966940054871624e-06, + "loss": 1.6711, + "step": 25933 + }, + { + "epoch": 2.2103468848546832, + "grad_norm": 71.28337629187052, + "learning_rate": 1.966545871733747e-06, + "loss": 2.1831, + "step": 25934 + }, + { + "epoch": 2.2104321145487087, + "grad_norm": 25.923231690196005, + "learning_rate": 1.9661517184286875e-06, + "loss": 0.5397, + "step": 25935 + }, + { + "epoch": 2.210517344242734, + "grad_norm": 59.412003964397385, + "learning_rate": 1.9657575949603202e-06, + "loss": 1.9556, + "step": 25936 + }, + { + "epoch": 2.2106025739367596, + "grad_norm": 35.781606965179904, + "learning_rate": 1.96536350133252e-06, + "loss": 1.1833, + "step": 25937 + }, + { + "epoch": 2.210687803630785, + "grad_norm": 81.51748316424248, + "learning_rate": 1.964969437549165e-06, + "loss": 1.9963, + "step": 25938 + }, + { + "epoch": 2.2107730333248106, + "grad_norm": 56.543753719308405, + "learning_rate": 1.9645754036141303e-06, + "loss": 2.0839, + "step": 25939 + }, + { + "epoch": 2.2108582630188356, + "grad_norm": 35.37834029335361, + "learning_rate": 1.96418139953129e-06, + "loss": 1.3756, + "step": 25940 + }, + { + "epoch": 2.210943492712861, + "grad_norm": 45.906624746807324, + "learning_rate": 1.9637874253045174e-06, + "loss": 1.0979, + "step": 25941 + }, + { + "epoch": 2.2110287224068865, + "grad_norm": 32.38809093439863, + "learning_rate": 1.963393480937691e-06, + "loss": 1.312, + "step": 25942 + }, + { + "epoch": 2.211113952100912, + "grad_norm": 34.356459449220104, + "learning_rate": 1.9629995664346812e-06, + "loss": 0.9569, + "step": 25943 + }, + { + "epoch": 2.2111991817949375, + "grad_norm": 25.669135611657254, + "learning_rate": 1.962605681799366e-06, + "loss": 1.0042, + "step": 25944 + }, + { + "epoch": 2.211284411488963, + "grad_norm": 36.847283325640646, + "learning_rate": 1.962211827035615e-06, + "loss": 1.2065, + "step": 25945 + }, + { + "epoch": 2.211369641182988, + "grad_norm": 44.694022714620125, + "learning_rate": 1.961818002147305e-06, + "loss": 1.5358, + "step": 25946 + }, + { + "epoch": 2.2114548708770134, + "grad_norm": 44.66405194049858, + "learning_rate": 1.9614242071383084e-06, + "loss": 1.0483, + "step": 25947 + }, + { + "epoch": 2.211540100571039, + "grad_norm": 40.42522853969597, + "learning_rate": 1.9610304420124963e-06, + "loss": 1.4371, + "step": 25948 + }, + { + "epoch": 2.2116253302650644, + "grad_norm": 76.9133812125759, + "learning_rate": 1.9606367067737416e-06, + "loss": 2.0042, + "step": 25949 + }, + { + "epoch": 2.21171055995909, + "grad_norm": 63.4917711571107, + "learning_rate": 1.960243001425918e-06, + "loss": 2.191, + "step": 25950 + }, + { + "epoch": 2.2117957896531153, + "grad_norm": 34.65952066310663, + "learning_rate": 1.9598493259728952e-06, + "loss": 1.439, + "step": 25951 + }, + { + "epoch": 2.2118810193471408, + "grad_norm": 60.33921613926963, + "learning_rate": 1.9594556804185484e-06, + "loss": 1.695, + "step": 25952 + }, + { + "epoch": 2.211966249041166, + "grad_norm": 68.11713574230224, + "learning_rate": 1.9590620647667457e-06, + "loss": 2.1787, + "step": 25953 + }, + { + "epoch": 2.2120514787351913, + "grad_norm": 22.358514522431093, + "learning_rate": 1.9586684790213585e-06, + "loss": 0.8747, + "step": 25954 + }, + { + "epoch": 2.2121367084292167, + "grad_norm": 67.30104554784516, + "learning_rate": 1.9582749231862597e-06, + "loss": 1.5354, + "step": 25955 + }, + { + "epoch": 2.212221938123242, + "grad_norm": 56.068255342848495, + "learning_rate": 1.9578813972653183e-06, + "loss": 1.6222, + "step": 25956 + }, + { + "epoch": 2.2123071678172677, + "grad_norm": 36.70483595043624, + "learning_rate": 1.957487901262403e-06, + "loss": 1.1912, + "step": 25957 + }, + { + "epoch": 2.212392397511293, + "grad_norm": 59.056306258575695, + "learning_rate": 1.957094435181387e-06, + "loss": 2.1594, + "step": 25958 + }, + { + "epoch": 2.212477627205318, + "grad_norm": 28.91121328674473, + "learning_rate": 1.956700999026136e-06, + "loss": 1.1369, + "step": 25959 + }, + { + "epoch": 2.2125628568993436, + "grad_norm": 38.62285351502342, + "learning_rate": 1.956307592800523e-06, + "loss": 1.0488, + "step": 25960 + }, + { + "epoch": 2.212648086593369, + "grad_norm": 65.19700632838223, + "learning_rate": 1.955914216508416e-06, + "loss": 1.7714, + "step": 25961 + }, + { + "epoch": 2.2127333162873946, + "grad_norm": 74.31099115298464, + "learning_rate": 1.9555208701536834e-06, + "loss": 2.2005, + "step": 25962 + }, + { + "epoch": 2.21281854598142, + "grad_norm": 67.21539035846642, + "learning_rate": 1.9551275537401904e-06, + "loss": 1.626, + "step": 25963 + }, + { + "epoch": 2.2129037756754455, + "grad_norm": 43.049348088807406, + "learning_rate": 1.9547342672718108e-06, + "loss": 1.3968, + "step": 25964 + }, + { + "epoch": 2.2129890053694705, + "grad_norm": 52.06062724782695, + "learning_rate": 1.954341010752407e-06, + "loss": 1.4578, + "step": 25965 + }, + { + "epoch": 2.213074235063496, + "grad_norm": 51.2561354392265, + "learning_rate": 1.953947784185851e-06, + "loss": 1.4285, + "step": 25966 + }, + { + "epoch": 2.2131594647575215, + "grad_norm": 37.343946817931176, + "learning_rate": 1.953554587576007e-06, + "loss": 1.0762, + "step": 25967 + }, + { + "epoch": 2.213244694451547, + "grad_norm": 37.581800428064405, + "learning_rate": 1.9531614209267443e-06, + "loss": 0.9869, + "step": 25968 + }, + { + "epoch": 2.2133299241455724, + "grad_norm": 35.407923911069986, + "learning_rate": 1.9527682842419286e-06, + "loss": 1.0818, + "step": 25969 + }, + { + "epoch": 2.213415153839598, + "grad_norm": 29.97900376622981, + "learning_rate": 1.9523751775254258e-06, + "loss": 1.1597, + "step": 25970 + }, + { + "epoch": 2.2135003835336233, + "grad_norm": 27.22642757078881, + "learning_rate": 1.9519821007811003e-06, + "loss": 0.7345, + "step": 25971 + }, + { + "epoch": 2.2135856132276484, + "grad_norm": 30.375147659769567, + "learning_rate": 1.951589054012822e-06, + "loss": 0.9748, + "step": 25972 + }, + { + "epoch": 2.213670842921674, + "grad_norm": 38.61235323667851, + "learning_rate": 1.9511960372244514e-06, + "loss": 1.2401, + "step": 25973 + }, + { + "epoch": 2.2137560726156993, + "grad_norm": 36.26475961831471, + "learning_rate": 1.950803050419859e-06, + "loss": 0.9285, + "step": 25974 + }, + { + "epoch": 2.2138413023097248, + "grad_norm": 122.13878566972663, + "learning_rate": 1.9504100936029046e-06, + "loss": 1.9373, + "step": 25975 + }, + { + "epoch": 2.2139265320037502, + "grad_norm": 54.00828469679313, + "learning_rate": 1.950017166777457e-06, + "loss": 1.0987, + "step": 25976 + }, + { + "epoch": 2.2140117616977757, + "grad_norm": 25.383881976303112, + "learning_rate": 1.949624269947378e-06, + "loss": 0.722, + "step": 25977 + }, + { + "epoch": 2.2140969913918007, + "grad_norm": 58.88837828821428, + "learning_rate": 1.949231403116533e-06, + "loss": 1.5238, + "step": 25978 + }, + { + "epoch": 2.214182221085826, + "grad_norm": 35.49034502673005, + "learning_rate": 1.9488385662887847e-06, + "loss": 1.2489, + "step": 25979 + }, + { + "epoch": 2.2142674507798517, + "grad_norm": 33.77363701790564, + "learning_rate": 1.948445759467996e-06, + "loss": 1.0633, + "step": 25980 + }, + { + "epoch": 2.214352680473877, + "grad_norm": 25.663716231366983, + "learning_rate": 1.948052982658029e-06, + "loss": 0.8666, + "step": 25981 + }, + { + "epoch": 2.2144379101679026, + "grad_norm": 33.34327712358082, + "learning_rate": 1.9476602358627503e-06, + "loss": 1.443, + "step": 25982 + }, + { + "epoch": 2.214523139861928, + "grad_norm": 48.87204970901963, + "learning_rate": 1.947267519086021e-06, + "loss": 1.3142, + "step": 25983 + }, + { + "epoch": 2.214608369555953, + "grad_norm": 100.2496600729053, + "learning_rate": 1.9468748323317003e-06, + "loss": 2.6888, + "step": 25984 + }, + { + "epoch": 2.2146935992499786, + "grad_norm": 54.13879419220498, + "learning_rate": 1.9464821756036543e-06, + "loss": 1.1617, + "step": 25985 + }, + { + "epoch": 2.214778828944004, + "grad_norm": 66.0695421808354, + "learning_rate": 1.946089548905743e-06, + "loss": 1.6777, + "step": 25986 + }, + { + "epoch": 2.2148640586380295, + "grad_norm": 62.45278526367923, + "learning_rate": 1.9456969522418273e-06, + "loss": 1.7741, + "step": 25987 + }, + { + "epoch": 2.214949288332055, + "grad_norm": 53.76791151944747, + "learning_rate": 1.9453043856157666e-06, + "loss": 1.9731, + "step": 25988 + }, + { + "epoch": 2.2150345180260804, + "grad_norm": 42.52011689986212, + "learning_rate": 1.9449118490314252e-06, + "loss": 0.8379, + "step": 25989 + }, + { + "epoch": 2.215119747720106, + "grad_norm": 30.689676256741176, + "learning_rate": 1.94451934249266e-06, + "loss": 1.493, + "step": 25990 + }, + { + "epoch": 2.215204977414131, + "grad_norm": 49.363651331067764, + "learning_rate": 1.9441268660033348e-06, + "loss": 1.2339, + "step": 25991 + }, + { + "epoch": 2.2152902071081564, + "grad_norm": 30.444403713413564, + "learning_rate": 1.9437344195673077e-06, + "loss": 1.532, + "step": 25992 + }, + { + "epoch": 2.215375436802182, + "grad_norm": 85.76301821000395, + "learning_rate": 1.9433420031884366e-06, + "loss": 1.5381, + "step": 25993 + }, + { + "epoch": 2.2154606664962073, + "grad_norm": 38.656326786877365, + "learning_rate": 1.9429496168705835e-06, + "loss": 1.5335, + "step": 25994 + }, + { + "epoch": 2.215545896190233, + "grad_norm": 58.189106917396664, + "learning_rate": 1.9425572606176063e-06, + "loss": 1.794, + "step": 25995 + }, + { + "epoch": 2.2156311258842583, + "grad_norm": 46.32656912865338, + "learning_rate": 1.9421649344333623e-06, + "loss": 1.225, + "step": 25996 + }, + { + "epoch": 2.2157163555782833, + "grad_norm": 63.039916797050466, + "learning_rate": 1.941772638321712e-06, + "loss": 1.5324, + "step": 25997 + }, + { + "epoch": 2.2158015852723087, + "grad_norm": 32.52131658513385, + "learning_rate": 1.941380372286511e-06, + "loss": 1.3151, + "step": 25998 + }, + { + "epoch": 2.215886814966334, + "grad_norm": 34.511236490035735, + "learning_rate": 1.940988136331621e-06, + "loss": 1.1615, + "step": 25999 + }, + { + "epoch": 2.2159720446603597, + "grad_norm": 37.19485546793285, + "learning_rate": 1.940595930460897e-06, + "loss": 1.198, + "step": 26000 + }, + { + "epoch": 2.216057274354385, + "grad_norm": 56.26894360668975, + "learning_rate": 1.940203754678194e-06, + "loss": 1.3514, + "step": 26001 + }, + { + "epoch": 2.2161425040484106, + "grad_norm": 56.88465149010573, + "learning_rate": 1.939811608987374e-06, + "loss": 1.5221, + "step": 26002 + }, + { + "epoch": 2.2162277337424356, + "grad_norm": 58.694858925877135, + "learning_rate": 1.9394194933922892e-06, + "loss": 1.2243, + "step": 26003 + }, + { + "epoch": 2.216312963436461, + "grad_norm": 33.50555441558678, + "learning_rate": 1.9390274078967974e-06, + "loss": 1.152, + "step": 26004 + }, + { + "epoch": 2.2163981931304866, + "grad_norm": 53.57609756626259, + "learning_rate": 1.9386353525047557e-06, + "loss": 1.3798, + "step": 26005 + }, + { + "epoch": 2.216483422824512, + "grad_norm": 22.111452272313272, + "learning_rate": 1.9382433272200173e-06, + "loss": 0.817, + "step": 26006 + }, + { + "epoch": 2.2165686525185375, + "grad_norm": 25.982816396054364, + "learning_rate": 1.937851332046441e-06, + "loss": 0.5697, + "step": 26007 + }, + { + "epoch": 2.216653882212563, + "grad_norm": 28.156918712316045, + "learning_rate": 1.93745936698788e-06, + "loss": 1.3132, + "step": 26008 + }, + { + "epoch": 2.2167391119065885, + "grad_norm": 61.77417036098523, + "learning_rate": 1.9370674320481886e-06, + "loss": 2.134, + "step": 26009 + }, + { + "epoch": 2.2168243416006135, + "grad_norm": 48.87779692239588, + "learning_rate": 1.936675527231221e-06, + "loss": 1.4133, + "step": 26010 + }, + { + "epoch": 2.216909571294639, + "grad_norm": 37.358074749452264, + "learning_rate": 1.9362836525408335e-06, + "loss": 1.0826, + "step": 26011 + }, + { + "epoch": 2.2169948009886644, + "grad_norm": 56.05299674183943, + "learning_rate": 1.9358918079808777e-06, + "loss": 1.9471, + "step": 26012 + }, + { + "epoch": 2.21708003068269, + "grad_norm": 65.7752223392002, + "learning_rate": 1.93549999355521e-06, + "loss": 1.5208, + "step": 26013 + }, + { + "epoch": 2.2171652603767154, + "grad_norm": 43.76125347131648, + "learning_rate": 1.9351082092676797e-06, + "loss": 1.6339, + "step": 26014 + }, + { + "epoch": 2.217250490070741, + "grad_norm": 40.65787664698066, + "learning_rate": 1.9347164551221446e-06, + "loss": 1.5648, + "step": 26015 + }, + { + "epoch": 2.217335719764766, + "grad_norm": 34.22668158160238, + "learning_rate": 1.9343247311224554e-06, + "loss": 1.0917, + "step": 26016 + }, + { + "epoch": 2.2174209494587913, + "grad_norm": 43.704259509147406, + "learning_rate": 1.933933037272464e-06, + "loss": 1.3149, + "step": 26017 + }, + { + "epoch": 2.217506179152817, + "grad_norm": 48.134199375619026, + "learning_rate": 1.933541373576021e-06, + "loss": 1.527, + "step": 26018 + }, + { + "epoch": 2.2175914088468422, + "grad_norm": 29.71251195950342, + "learning_rate": 1.933149740036982e-06, + "loss": 0.7336, + "step": 26019 + }, + { + "epoch": 2.2176766385408677, + "grad_norm": 43.467544240649154, + "learning_rate": 1.932758136659195e-06, + "loss": 1.4265, + "step": 26020 + }, + { + "epoch": 2.217761868234893, + "grad_norm": 69.50281702364116, + "learning_rate": 1.9323665634465145e-06, + "loss": 2.1842, + "step": 26021 + }, + { + "epoch": 2.217847097928918, + "grad_norm": 63.694806510303806, + "learning_rate": 1.93197502040279e-06, + "loss": 2.2945, + "step": 26022 + }, + { + "epoch": 2.2179323276229437, + "grad_norm": 34.20090736771371, + "learning_rate": 1.9315835075318705e-06, + "loss": 1.5048, + "step": 26023 + }, + { + "epoch": 2.218017557316969, + "grad_norm": 19.306988426100645, + "learning_rate": 1.9311920248376094e-06, + "loss": 0.6652, + "step": 26024 + }, + { + "epoch": 2.2181027870109946, + "grad_norm": 64.95055824829682, + "learning_rate": 1.9308005723238556e-06, + "loss": 2.0197, + "step": 26025 + }, + { + "epoch": 2.21818801670502, + "grad_norm": 81.2622485744817, + "learning_rate": 1.9304091499944584e-06, + "loss": 2.0402, + "step": 26026 + }, + { + "epoch": 2.2182732463990456, + "grad_norm": 49.64300883582378, + "learning_rate": 1.930017757853265e-06, + "loss": 1.0025, + "step": 26027 + }, + { + "epoch": 2.218358476093071, + "grad_norm": 34.263336579774574, + "learning_rate": 1.929626395904128e-06, + "loss": 1.1163, + "step": 26028 + }, + { + "epoch": 2.218443705787096, + "grad_norm": 61.02752790879319, + "learning_rate": 1.9292350641508966e-06, + "loss": 1.8836, + "step": 26029 + }, + { + "epoch": 2.2185289354811215, + "grad_norm": 30.90322926048261, + "learning_rate": 1.928843762597419e-06, + "loss": 1.0308, + "step": 26030 + }, + { + "epoch": 2.218614165175147, + "grad_norm": 43.99317332255562, + "learning_rate": 1.92845249124754e-06, + "loss": 1.7205, + "step": 26031 + }, + { + "epoch": 2.2186993948691724, + "grad_norm": 31.734816797298755, + "learning_rate": 1.928061250105112e-06, + "loss": 1.3878, + "step": 26032 + }, + { + "epoch": 2.218784624563198, + "grad_norm": 33.89923711416106, + "learning_rate": 1.927670039173981e-06, + "loss": 0.9041, + "step": 26033 + }, + { + "epoch": 2.2188698542572234, + "grad_norm": 54.48766480415638, + "learning_rate": 1.927278858457995e-06, + "loss": 1.3696, + "step": 26034 + }, + { + "epoch": 2.2189550839512484, + "grad_norm": 36.49828479715154, + "learning_rate": 1.9268877079609984e-06, + "loss": 1.3846, + "step": 26035 + }, + { + "epoch": 2.219040313645274, + "grad_norm": 44.30042784125766, + "learning_rate": 1.9264965876868396e-06, + "loss": 1.6391, + "step": 26036 + }, + { + "epoch": 2.2191255433392993, + "grad_norm": 44.104322206510965, + "learning_rate": 1.926105497639368e-06, + "loss": 1.3098, + "step": 26037 + }, + { + "epoch": 2.219210773033325, + "grad_norm": 36.69355428923339, + "learning_rate": 1.9257144378224264e-06, + "loss": 1.119, + "step": 26038 + }, + { + "epoch": 2.2192960027273503, + "grad_norm": 54.9813635816858, + "learning_rate": 1.9253234082398626e-06, + "loss": 2.3177, + "step": 26039 + }, + { + "epoch": 2.2193812324213757, + "grad_norm": 30.282777931837565, + "learning_rate": 1.924932408895519e-06, + "loss": 0.7987, + "step": 26040 + }, + { + "epoch": 2.2194664621154008, + "grad_norm": 46.27579637066567, + "learning_rate": 1.9245414397932455e-06, + "loss": 1.5592, + "step": 26041 + }, + { + "epoch": 2.2195516918094262, + "grad_norm": 30.367092537788675, + "learning_rate": 1.9241505009368842e-06, + "loss": 1.0464, + "step": 26042 + }, + { + "epoch": 2.2196369215034517, + "grad_norm": 53.01378800912197, + "learning_rate": 1.9237595923302794e-06, + "loss": 2.0324, + "step": 26043 + }, + { + "epoch": 2.219722151197477, + "grad_norm": 25.07891022872535, + "learning_rate": 1.9233687139772782e-06, + "loss": 1.2609, + "step": 26044 + }, + { + "epoch": 2.2198073808915026, + "grad_norm": 36.161374552790555, + "learning_rate": 1.9229778658817216e-06, + "loss": 0.6996, + "step": 26045 + }, + { + "epoch": 2.219892610585528, + "grad_norm": 31.43416594378068, + "learning_rate": 1.9225870480474563e-06, + "loss": 0.7919, + "step": 26046 + }, + { + "epoch": 2.2199778402795536, + "grad_norm": 82.0471345473377, + "learning_rate": 1.9221962604783246e-06, + "loss": 2.2365, + "step": 26047 + }, + { + "epoch": 2.2200630699735786, + "grad_norm": 38.62494300401508, + "learning_rate": 1.9218055031781684e-06, + "loss": 1.1275, + "step": 26048 + }, + { + "epoch": 2.220148299667604, + "grad_norm": 65.92932257334053, + "learning_rate": 1.9214147761508334e-06, + "loss": 2.3444, + "step": 26049 + }, + { + "epoch": 2.2202335293616295, + "grad_norm": 35.941727130807685, + "learning_rate": 1.9210240794001605e-06, + "loss": 1.6661, + "step": 26050 + }, + { + "epoch": 2.220318759055655, + "grad_norm": 40.0359614253436, + "learning_rate": 1.920633412929991e-06, + "loss": 1.6952, + "step": 26051 + }, + { + "epoch": 2.2204039887496805, + "grad_norm": 45.94053131366847, + "learning_rate": 1.9202427767441693e-06, + "loss": 2.2158, + "step": 26052 + }, + { + "epoch": 2.220489218443706, + "grad_norm": 34.38180318374832, + "learning_rate": 1.9198521708465353e-06, + "loss": 1.1999, + "step": 26053 + }, + { + "epoch": 2.2205744481377314, + "grad_norm": 54.24615028097466, + "learning_rate": 1.9194615952409323e-06, + "loss": 1.7463, + "step": 26054 + }, + { + "epoch": 2.2206596778317564, + "grad_norm": 30.56134499930036, + "learning_rate": 1.919071049931201e-06, + "loss": 1.0527, + "step": 26055 + }, + { + "epoch": 2.220744907525782, + "grad_norm": 28.875814841052115, + "learning_rate": 1.918680534921181e-06, + "loss": 1.5785, + "step": 26056 + }, + { + "epoch": 2.2208301372198074, + "grad_norm": 39.85896976102663, + "learning_rate": 1.9182900502147127e-06, + "loss": 0.7565, + "step": 26057 + }, + { + "epoch": 2.220915366913833, + "grad_norm": 35.177990749350954, + "learning_rate": 1.9178995958156383e-06, + "loss": 1.2139, + "step": 26058 + }, + { + "epoch": 2.2210005966078583, + "grad_norm": 55.714041851186046, + "learning_rate": 1.917509171727795e-06, + "loss": 2.1217, + "step": 26059 + }, + { + "epoch": 2.221085826301884, + "grad_norm": 18.57456659150614, + "learning_rate": 1.917118777955026e-06, + "loss": 0.7176, + "step": 26060 + }, + { + "epoch": 2.221171055995909, + "grad_norm": 34.26957444272022, + "learning_rate": 1.916728414501167e-06, + "loss": 1.0848, + "step": 26061 + }, + { + "epoch": 2.2212562856899343, + "grad_norm": 42.62128837803939, + "learning_rate": 1.9163380813700604e-06, + "loss": 1.3596, + "step": 26062 + }, + { + "epoch": 2.2213415153839597, + "grad_norm": 45.94492833969246, + "learning_rate": 1.915947778565544e-06, + "loss": 1.4621, + "step": 26063 + }, + { + "epoch": 2.221426745077985, + "grad_norm": 40.88934142751878, + "learning_rate": 1.9155575060914556e-06, + "loss": 1.33, + "step": 26064 + }, + { + "epoch": 2.2215119747720107, + "grad_norm": 55.1251350592657, + "learning_rate": 1.915167263951633e-06, + "loss": 2.2483, + "step": 26065 + }, + { + "epoch": 2.221597204466036, + "grad_norm": 69.24240187948892, + "learning_rate": 1.914777052149913e-06, + "loss": 1.8786, + "step": 26066 + }, + { + "epoch": 2.221682434160061, + "grad_norm": 76.02165134792442, + "learning_rate": 1.9143868706901348e-06, + "loss": 1.5823, + "step": 26067 + }, + { + "epoch": 2.2217676638540866, + "grad_norm": 71.80744400421116, + "learning_rate": 1.9139967195761377e-06, + "loss": 2.1701, + "step": 26068 + }, + { + "epoch": 2.221852893548112, + "grad_norm": 29.543700600358935, + "learning_rate": 1.9136065988117563e-06, + "loss": 0.79, + "step": 26069 + }, + { + "epoch": 2.2219381232421376, + "grad_norm": 29.77147118372842, + "learning_rate": 1.9132165084008256e-06, + "loss": 0.9881, + "step": 26070 + }, + { + "epoch": 2.222023352936163, + "grad_norm": 60.71320891411656, + "learning_rate": 1.912826448347186e-06, + "loss": 1.8385, + "step": 26071 + }, + { + "epoch": 2.2221085826301885, + "grad_norm": 62.211109671360205, + "learning_rate": 1.9124364186546716e-06, + "loss": 1.5208, + "step": 26072 + }, + { + "epoch": 2.222193812324214, + "grad_norm": 36.249856200244224, + "learning_rate": 1.912046419327118e-06, + "loss": 1.2596, + "step": 26073 + }, + { + "epoch": 2.222279042018239, + "grad_norm": 56.895230092839405, + "learning_rate": 1.9116564503683594e-06, + "loss": 1.7744, + "step": 26074 + }, + { + "epoch": 2.2223642717122645, + "grad_norm": 54.349061777881765, + "learning_rate": 1.911266511782232e-06, + "loss": 1.1967, + "step": 26075 + }, + { + "epoch": 2.22244950140629, + "grad_norm": 57.96436488353175, + "learning_rate": 1.9108766035725732e-06, + "loss": 2.2167, + "step": 26076 + }, + { + "epoch": 2.2225347311003154, + "grad_norm": 57.27320235259774, + "learning_rate": 1.9104867257432153e-06, + "loss": 1.9234, + "step": 26077 + }, + { + "epoch": 2.222619960794341, + "grad_norm": 55.906491946174945, + "learning_rate": 1.910096878297993e-06, + "loss": 1.4096, + "step": 26078 + }, + { + "epoch": 2.2227051904883663, + "grad_norm": 54.60793299261818, + "learning_rate": 1.909707061240738e-06, + "loss": 1.4443, + "step": 26079 + }, + { + "epoch": 2.2227904201823914, + "grad_norm": 64.20604703615405, + "learning_rate": 1.909317274575288e-06, + "loss": 2.107, + "step": 26080 + }, + { + "epoch": 2.222875649876417, + "grad_norm": 31.862001874435915, + "learning_rate": 1.9089275183054745e-06, + "loss": 1.2231, + "step": 26081 + }, + { + "epoch": 2.2229608795704423, + "grad_norm": 63.72658128026064, + "learning_rate": 1.9085377924351288e-06, + "loss": 1.5838, + "step": 26082 + }, + { + "epoch": 2.2230461092644678, + "grad_norm": 60.75244197687894, + "learning_rate": 1.908148096968086e-06, + "loss": 2.0717, + "step": 26083 + }, + { + "epoch": 2.2231313389584932, + "grad_norm": 31.405916449184698, + "learning_rate": 1.9077584319081793e-06, + "loss": 0.7009, + "step": 26084 + }, + { + "epoch": 2.2232165686525187, + "grad_norm": 61.82066646959606, + "learning_rate": 1.90736879725924e-06, + "loss": 2.0692, + "step": 26085 + }, + { + "epoch": 2.2233017983465437, + "grad_norm": 71.3508598752331, + "learning_rate": 1.9069791930250992e-06, + "loss": 2.0616, + "step": 26086 + }, + { + "epoch": 2.223387028040569, + "grad_norm": 19.313294238876466, + "learning_rate": 1.9065896192095873e-06, + "loss": 0.9986, + "step": 26087 + }, + { + "epoch": 2.2234722577345947, + "grad_norm": 60.38696052190835, + "learning_rate": 1.9062000758165394e-06, + "loss": 1.8198, + "step": 26088 + }, + { + "epoch": 2.22355748742862, + "grad_norm": 51.36385155124465, + "learning_rate": 1.9058105628497841e-06, + "loss": 1.2527, + "step": 26089 + }, + { + "epoch": 2.2236427171226456, + "grad_norm": 52.92409928883718, + "learning_rate": 1.9054210803131506e-06, + "loss": 1.6519, + "step": 26090 + }, + { + "epoch": 2.223727946816671, + "grad_norm": 67.57047726091045, + "learning_rate": 1.9050316282104736e-06, + "loss": 2.0044, + "step": 26091 + }, + { + "epoch": 2.2238131765106965, + "grad_norm": 44.47799703130648, + "learning_rate": 1.9046422065455783e-06, + "loss": 1.2914, + "step": 26092 + }, + { + "epoch": 2.2238984062047216, + "grad_norm": 54.71756365596322, + "learning_rate": 1.904252815322299e-06, + "loss": 1.5, + "step": 26093 + }, + { + "epoch": 2.223983635898747, + "grad_norm": 36.97420330263413, + "learning_rate": 1.903863454544463e-06, + "loss": 1.0762, + "step": 26094 + }, + { + "epoch": 2.2240688655927725, + "grad_norm": 41.968651068798884, + "learning_rate": 1.9034741242158994e-06, + "loss": 1.3499, + "step": 26095 + }, + { + "epoch": 2.224154095286798, + "grad_norm": 53.03776582805565, + "learning_rate": 1.9030848243404353e-06, + "loss": 1.56, + "step": 26096 + }, + { + "epoch": 2.2242393249808234, + "grad_norm": 66.88136484758027, + "learning_rate": 1.9026955549219039e-06, + "loss": 2.0454, + "step": 26097 + }, + { + "epoch": 2.224324554674849, + "grad_norm": 44.27423622174551, + "learning_rate": 1.9023063159641286e-06, + "loss": 0.8275, + "step": 26098 + }, + { + "epoch": 2.224409784368874, + "grad_norm": 57.667278548905976, + "learning_rate": 1.901917107470942e-06, + "loss": 1.9319, + "step": 26099 + }, + { + "epoch": 2.2244950140628994, + "grad_norm": 28.468627540276756, + "learning_rate": 1.901527929446168e-06, + "loss": 1.4525, + "step": 26100 + }, + { + "epoch": 2.224580243756925, + "grad_norm": 38.71405442919005, + "learning_rate": 1.9011387818936366e-06, + "loss": 1.2166, + "step": 26101 + }, + { + "epoch": 2.2246654734509503, + "grad_norm": 44.92626066465669, + "learning_rate": 1.9007496648171746e-06, + "loss": 1.7127, + "step": 26102 + }, + { + "epoch": 2.224750703144976, + "grad_norm": 31.162518441626478, + "learning_rate": 1.900360578220608e-06, + "loss": 1.2028, + "step": 26103 + }, + { + "epoch": 2.2248359328390013, + "grad_norm": 41.11156064747761, + "learning_rate": 1.8999715221077613e-06, + "loss": 1.296, + "step": 26104 + }, + { + "epoch": 2.2249211625330263, + "grad_norm": 50.60704811274177, + "learning_rate": 1.899582496482465e-06, + "loss": 1.2789, + "step": 26105 + }, + { + "epoch": 2.2250063922270518, + "grad_norm": 55.26515050485439, + "learning_rate": 1.8991935013485407e-06, + "loss": 1.3854, + "step": 26106 + }, + { + "epoch": 2.2250916219210772, + "grad_norm": 23.306624127322298, + "learning_rate": 1.8988045367098186e-06, + "loss": 1.134, + "step": 26107 + }, + { + "epoch": 2.2251768516151027, + "grad_norm": 48.14601576582743, + "learning_rate": 1.898415602570121e-06, + "loss": 1.4035, + "step": 26108 + }, + { + "epoch": 2.225262081309128, + "grad_norm": 57.39082063114402, + "learning_rate": 1.8980266989332719e-06, + "loss": 1.4332, + "step": 26109 + }, + { + "epoch": 2.2253473110031536, + "grad_norm": 33.613467936667654, + "learning_rate": 1.8976378258030992e-06, + "loss": 1.0078, + "step": 26110 + }, + { + "epoch": 2.225432540697179, + "grad_norm": 79.18515344606148, + "learning_rate": 1.8972489831834262e-06, + "loss": 1.7167, + "step": 26111 + }, + { + "epoch": 2.225517770391204, + "grad_norm": 60.83858937305566, + "learning_rate": 1.896860171078076e-06, + "loss": 1.9505, + "step": 26112 + }, + { + "epoch": 2.2256030000852296, + "grad_norm": 32.77994825902455, + "learning_rate": 1.8964713894908715e-06, + "loss": 1.2061, + "step": 26113 + }, + { + "epoch": 2.225688229779255, + "grad_norm": 38.94202971087985, + "learning_rate": 1.8960826384256375e-06, + "loss": 1.5457, + "step": 26114 + }, + { + "epoch": 2.2257734594732805, + "grad_norm": 62.99071441742492, + "learning_rate": 1.8956939178861993e-06, + "loss": 1.5041, + "step": 26115 + }, + { + "epoch": 2.225858689167306, + "grad_norm": 47.81685734073607, + "learning_rate": 1.8953052278763779e-06, + "loss": 1.6233, + "step": 26116 + }, + { + "epoch": 2.2259439188613315, + "grad_norm": 110.2246254316602, + "learning_rate": 1.8949165683999942e-06, + "loss": 2.6673, + "step": 26117 + }, + { + "epoch": 2.2260291485553565, + "grad_norm": 32.69594460557853, + "learning_rate": 1.894527939460874e-06, + "loss": 1.1556, + "step": 26118 + }, + { + "epoch": 2.226114378249382, + "grad_norm": 24.892619602624983, + "learning_rate": 1.8941393410628372e-06, + "loss": 1.1237, + "step": 26119 + }, + { + "epoch": 2.2261996079434074, + "grad_norm": 44.40673163137088, + "learning_rate": 1.8937507732097061e-06, + "loss": 1.9757, + "step": 26120 + }, + { + "epoch": 2.226284837637433, + "grad_norm": 62.38400921615627, + "learning_rate": 1.8933622359053e-06, + "loss": 2.1197, + "step": 26121 + }, + { + "epoch": 2.2263700673314584, + "grad_norm": 62.85926356064878, + "learning_rate": 1.8929737291534417e-06, + "loss": 2.0152, + "step": 26122 + }, + { + "epoch": 2.226455297025484, + "grad_norm": 59.15059919403023, + "learning_rate": 1.8925852529579542e-06, + "loss": 2.0625, + "step": 26123 + }, + { + "epoch": 2.226540526719509, + "grad_norm": 21.819152725575258, + "learning_rate": 1.8921968073226554e-06, + "loss": 1.3004, + "step": 26124 + }, + { + "epoch": 2.2266257564135343, + "grad_norm": 52.16148249329274, + "learning_rate": 1.891808392251367e-06, + "loss": 1.3793, + "step": 26125 + }, + { + "epoch": 2.22671098610756, + "grad_norm": 34.94648635332637, + "learning_rate": 1.8914200077479056e-06, + "loss": 1.1216, + "step": 26126 + }, + { + "epoch": 2.2267962158015853, + "grad_norm": 80.24707348939323, + "learning_rate": 1.891031653816095e-06, + "loss": 1.5053, + "step": 26127 + }, + { + "epoch": 2.2268814454956107, + "grad_norm": 46.06044802452398, + "learning_rate": 1.8906433304597528e-06, + "loss": 1.6815, + "step": 26128 + }, + { + "epoch": 2.226966675189636, + "grad_norm": 46.694223215168776, + "learning_rate": 1.890255037682696e-06, + "loss": 1.1543, + "step": 26129 + }, + { + "epoch": 2.2270519048836617, + "grad_norm": 65.80705867232086, + "learning_rate": 1.8898667754887456e-06, + "loss": 1.9301, + "step": 26130 + }, + { + "epoch": 2.2271371345776867, + "grad_norm": 40.755881331066064, + "learning_rate": 1.889478543881721e-06, + "loss": 1.273, + "step": 26131 + }, + { + "epoch": 2.227222364271712, + "grad_norm": 68.8747753444666, + "learning_rate": 1.889090342865439e-06, + "loss": 1.7153, + "step": 26132 + }, + { + "epoch": 2.2273075939657376, + "grad_norm": 64.5378815272256, + "learning_rate": 1.8887021724437172e-06, + "loss": 1.4971, + "step": 26133 + }, + { + "epoch": 2.227392823659763, + "grad_norm": 57.3568422744415, + "learning_rate": 1.8883140326203719e-06, + "loss": 1.8408, + "step": 26134 + }, + { + "epoch": 2.2274780533537886, + "grad_norm": 39.67040781861203, + "learning_rate": 1.887925923399223e-06, + "loss": 1.6237, + "step": 26135 + }, + { + "epoch": 2.227563283047814, + "grad_norm": 51.485319424074454, + "learning_rate": 1.8875378447840863e-06, + "loss": 1.815, + "step": 26136 + }, + { + "epoch": 2.227648512741839, + "grad_norm": 70.50450295764644, + "learning_rate": 1.8871497967787762e-06, + "loss": 2.1896, + "step": 26137 + }, + { + "epoch": 2.2277337424358645, + "grad_norm": 38.644894431820404, + "learning_rate": 1.886761779387113e-06, + "loss": 1.3264, + "step": 26138 + }, + { + "epoch": 2.22781897212989, + "grad_norm": 48.41998927968029, + "learning_rate": 1.8863737926129082e-06, + "loss": 1.1508, + "step": 26139 + }, + { + "epoch": 2.2279042018239155, + "grad_norm": 37.593694676610305, + "learning_rate": 1.885985836459982e-06, + "loss": 1.0225, + "step": 26140 + }, + { + "epoch": 2.227989431517941, + "grad_norm": 59.609477509916424, + "learning_rate": 1.8855979109321475e-06, + "loss": 1.2258, + "step": 26141 + }, + { + "epoch": 2.2280746612119664, + "grad_norm": 53.64035640968768, + "learning_rate": 1.8852100160332203e-06, + "loss": 1.6063, + "step": 26142 + }, + { + "epoch": 2.2281598909059914, + "grad_norm": 46.30222652842965, + "learning_rate": 1.8848221517670135e-06, + "loss": 1.2484, + "step": 26143 + }, + { + "epoch": 2.228245120600017, + "grad_norm": 25.688807575704015, + "learning_rate": 1.8844343181373442e-06, + "loss": 1.0251, + "step": 26144 + }, + { + "epoch": 2.2283303502940424, + "grad_norm": 56.41544703509123, + "learning_rate": 1.8840465151480236e-06, + "loss": 1.7423, + "step": 26145 + }, + { + "epoch": 2.228415579988068, + "grad_norm": 60.63461407301456, + "learning_rate": 1.8836587428028696e-06, + "loss": 1.3958, + "step": 26146 + }, + { + "epoch": 2.2285008096820933, + "grad_norm": 79.41535047051674, + "learning_rate": 1.8832710011056915e-06, + "loss": 1.3334, + "step": 26147 + }, + { + "epoch": 2.2285860393761188, + "grad_norm": 46.00292071497061, + "learning_rate": 1.8828832900603062e-06, + "loss": 1.4078, + "step": 26148 + }, + { + "epoch": 2.2286712690701442, + "grad_norm": 69.11485075988301, + "learning_rate": 1.8824956096705255e-06, + "loss": 1.9152, + "step": 26149 + }, + { + "epoch": 2.2287564987641693, + "grad_norm": 30.8221708507519, + "learning_rate": 1.8821079599401616e-06, + "loss": 0.7433, + "step": 26150 + }, + { + "epoch": 2.2288417284581947, + "grad_norm": 43.89703157431108, + "learning_rate": 1.8817203408730267e-06, + "loss": 1.1861, + "step": 26151 + }, + { + "epoch": 2.22892695815222, + "grad_norm": 67.48751734796582, + "learning_rate": 1.8813327524729314e-06, + "loss": 2.7411, + "step": 26152 + }, + { + "epoch": 2.2290121878462457, + "grad_norm": 83.84311610177487, + "learning_rate": 1.8809451947436897e-06, + "loss": 2.6652, + "step": 26153 + }, + { + "epoch": 2.229097417540271, + "grad_norm": 117.49515395903744, + "learning_rate": 1.8805576676891141e-06, + "loss": 3.1574, + "step": 26154 + }, + { + "epoch": 2.2291826472342966, + "grad_norm": 46.07052476943847, + "learning_rate": 1.8801701713130137e-06, + "loss": 1.9559, + "step": 26155 + }, + { + "epoch": 2.2292678769283216, + "grad_norm": 73.00090684428253, + "learning_rate": 1.8797827056191991e-06, + "loss": 2.0105, + "step": 26156 + }, + { + "epoch": 2.229353106622347, + "grad_norm": 39.96980210480201, + "learning_rate": 1.8793952706114832e-06, + "loss": 1.3325, + "step": 26157 + }, + { + "epoch": 2.2294383363163726, + "grad_norm": 29.56043052534276, + "learning_rate": 1.879007866293675e-06, + "loss": 0.8679, + "step": 26158 + }, + { + "epoch": 2.229523566010398, + "grad_norm": 39.64703449466842, + "learning_rate": 1.8786204926695839e-06, + "loss": 1.3092, + "step": 26159 + }, + { + "epoch": 2.2296087957044235, + "grad_norm": 33.02186036194229, + "learning_rate": 1.8782331497430183e-06, + "loss": 1.0288, + "step": 26160 + }, + { + "epoch": 2.229694025398449, + "grad_norm": 62.51633364757637, + "learning_rate": 1.8778458375177893e-06, + "loss": 2.4337, + "step": 26161 + }, + { + "epoch": 2.229779255092474, + "grad_norm": 45.369426256388806, + "learning_rate": 1.8774585559977078e-06, + "loss": 1.8288, + "step": 26162 + }, + { + "epoch": 2.2298644847864995, + "grad_norm": 34.95585092809853, + "learning_rate": 1.8770713051865802e-06, + "loss": 1.1494, + "step": 26163 + }, + { + "epoch": 2.229949714480525, + "grad_norm": 38.48580749519638, + "learning_rate": 1.876684085088214e-06, + "loss": 1.3369, + "step": 26164 + }, + { + "epoch": 2.2300349441745504, + "grad_norm": 31.118980592821888, + "learning_rate": 1.8762968957064204e-06, + "loss": 1.8131, + "step": 26165 + }, + { + "epoch": 2.230120173868576, + "grad_norm": 51.540069787135714, + "learning_rate": 1.8759097370450052e-06, + "loss": 1.0436, + "step": 26166 + }, + { + "epoch": 2.2302054035626013, + "grad_norm": 33.1315177540512, + "learning_rate": 1.8755226091077765e-06, + "loss": 1.1469, + "step": 26167 + }, + { + "epoch": 2.230290633256627, + "grad_norm": 49.0788640098382, + "learning_rate": 1.8751355118985404e-06, + "loss": 1.7351, + "step": 26168 + }, + { + "epoch": 2.230375862950652, + "grad_norm": 35.831011094655715, + "learning_rate": 1.874748445421104e-06, + "loss": 1.4323, + "step": 26169 + }, + { + "epoch": 2.2304610926446773, + "grad_norm": 34.50702814407862, + "learning_rate": 1.8743614096792767e-06, + "loss": 1.0521, + "step": 26170 + }, + { + "epoch": 2.2305463223387028, + "grad_norm": 60.867890733351956, + "learning_rate": 1.8739744046768633e-06, + "loss": 1.8876, + "step": 26171 + }, + { + "epoch": 2.2306315520327282, + "grad_norm": 51.644852513923, + "learning_rate": 1.8735874304176693e-06, + "loss": 1.7554, + "step": 26172 + }, + { + "epoch": 2.2307167817267537, + "grad_norm": 17.50956607333804, + "learning_rate": 1.8732004869054987e-06, + "loss": 0.6423, + "step": 26173 + }, + { + "epoch": 2.230802011420779, + "grad_norm": 39.30450573198843, + "learning_rate": 1.8728135741441606e-06, + "loss": 1.1168, + "step": 26174 + }, + { + "epoch": 2.2308872411148046, + "grad_norm": 58.107051232427665, + "learning_rate": 1.8724266921374585e-06, + "loss": 2.0097, + "step": 26175 + }, + { + "epoch": 2.2309724708088297, + "grad_norm": 31.849052669024278, + "learning_rate": 1.8720398408891948e-06, + "loss": 0.6913, + "step": 26176 + }, + { + "epoch": 2.231057700502855, + "grad_norm": 44.39608740331496, + "learning_rate": 1.871653020403177e-06, + "loss": 0.9571, + "step": 26177 + }, + { + "epoch": 2.2311429301968806, + "grad_norm": 45.85266727260929, + "learning_rate": 1.8712662306832103e-06, + "loss": 1.4787, + "step": 26178 + }, + { + "epoch": 2.231228159890906, + "grad_norm": 62.24681498034332, + "learning_rate": 1.870879471733097e-06, + "loss": 2.1561, + "step": 26179 + }, + { + "epoch": 2.2313133895849315, + "grad_norm": 32.87983196640344, + "learning_rate": 1.8704927435566406e-06, + "loss": 0.9708, + "step": 26180 + }, + { + "epoch": 2.231398619278957, + "grad_norm": 85.22626895503502, + "learning_rate": 1.8701060461576443e-06, + "loss": 1.6733, + "step": 26181 + }, + { + "epoch": 2.231483848972982, + "grad_norm": 28.838102001685137, + "learning_rate": 1.86971937953991e-06, + "loss": 0.9648, + "step": 26182 + }, + { + "epoch": 2.2315690786670075, + "grad_norm": 43.26842449310975, + "learning_rate": 1.8693327437072434e-06, + "loss": 1.1641, + "step": 26183 + }, + { + "epoch": 2.231654308361033, + "grad_norm": 45.85016674977878, + "learning_rate": 1.8689461386634433e-06, + "loss": 1.4691, + "step": 26184 + }, + { + "epoch": 2.2317395380550584, + "grad_norm": 20.477516649174056, + "learning_rate": 1.8685595644123156e-06, + "loss": 0.691, + "step": 26185 + }, + { + "epoch": 2.231824767749084, + "grad_norm": 67.63837922717049, + "learning_rate": 1.8681730209576588e-06, + "loss": 1.7174, + "step": 26186 + }, + { + "epoch": 2.2319099974431094, + "grad_norm": 64.74387633964298, + "learning_rate": 1.8677865083032775e-06, + "loss": 1.9475, + "step": 26187 + }, + { + "epoch": 2.2319952271371344, + "grad_norm": 72.0296585533509, + "learning_rate": 1.8674000264529708e-06, + "loss": 1.7245, + "step": 26188 + }, + { + "epoch": 2.23208045683116, + "grad_norm": 53.36374035846478, + "learning_rate": 1.8670135754105407e-06, + "loss": 2.0523, + "step": 26189 + }, + { + "epoch": 2.2321656865251853, + "grad_norm": 69.86372602797478, + "learning_rate": 1.8666271551797848e-06, + "loss": 1.9465, + "step": 26190 + }, + { + "epoch": 2.232250916219211, + "grad_norm": 49.60093201138855, + "learning_rate": 1.8662407657645082e-06, + "loss": 1.1983, + "step": 26191 + }, + { + "epoch": 2.2323361459132363, + "grad_norm": 17.07110902911981, + "learning_rate": 1.8658544071685065e-06, + "loss": 0.6824, + "step": 26192 + }, + { + "epoch": 2.2324213756072617, + "grad_norm": 41.30232119923022, + "learning_rate": 1.8654680793955827e-06, + "loss": 1.7229, + "step": 26193 + }, + { + "epoch": 2.232506605301287, + "grad_norm": 41.67966041400629, + "learning_rate": 1.8650817824495332e-06, + "loss": 1.9191, + "step": 26194 + }, + { + "epoch": 2.232591834995312, + "grad_norm": 72.29359970488814, + "learning_rate": 1.8646955163341606e-06, + "loss": 2.3488, + "step": 26195 + }, + { + "epoch": 2.2326770646893377, + "grad_norm": 44.60252898938023, + "learning_rate": 1.8643092810532614e-06, + "loss": 1.5238, + "step": 26196 + }, + { + "epoch": 2.232762294383363, + "grad_norm": 49.757722611898075, + "learning_rate": 1.8639230766106349e-06, + "loss": 1.7157, + "step": 26197 + }, + { + "epoch": 2.2328475240773886, + "grad_norm": 61.49418158332052, + "learning_rate": 1.863536903010078e-06, + "loss": 2.0148, + "step": 26198 + }, + { + "epoch": 2.232932753771414, + "grad_norm": 62.32330473865472, + "learning_rate": 1.8631507602553884e-06, + "loss": 2.4947, + "step": 26199 + }, + { + "epoch": 2.2330179834654396, + "grad_norm": 60.95458028864782, + "learning_rate": 1.8627646483503642e-06, + "loss": 1.6631, + "step": 26200 + }, + { + "epoch": 2.2331032131594646, + "grad_norm": 24.809448809986026, + "learning_rate": 1.8623785672988053e-06, + "loss": 0.8659, + "step": 26201 + }, + { + "epoch": 2.23318844285349, + "grad_norm": 55.94135832031657, + "learning_rate": 1.861992517104506e-06, + "loss": 1.5772, + "step": 26202 + }, + { + "epoch": 2.2332736725475155, + "grad_norm": 75.96973635956802, + "learning_rate": 1.8616064977712617e-06, + "loss": 1.7869, + "step": 26203 + }, + { + "epoch": 2.233358902241541, + "grad_norm": 34.92119212710329, + "learning_rate": 1.8612205093028723e-06, + "loss": 1.3619, + "step": 26204 + }, + { + "epoch": 2.2334441319355665, + "grad_norm": 62.392899706639575, + "learning_rate": 1.8608345517031317e-06, + "loss": 1.3433, + "step": 26205 + }, + { + "epoch": 2.233529361629592, + "grad_norm": 45.262894792897406, + "learning_rate": 1.8604486249758357e-06, + "loss": 0.7676, + "step": 26206 + }, + { + "epoch": 2.233614591323617, + "grad_norm": 28.63892782965911, + "learning_rate": 1.8600627291247785e-06, + "loss": 1.263, + "step": 26207 + }, + { + "epoch": 2.2336998210176424, + "grad_norm": 60.70914250000948, + "learning_rate": 1.8596768641537571e-06, + "loss": 0.649, + "step": 26208 + }, + { + "epoch": 2.233785050711668, + "grad_norm": 34.46227961934349, + "learning_rate": 1.8592910300665673e-06, + "loss": 1.5573, + "step": 26209 + }, + { + "epoch": 2.2338702804056934, + "grad_norm": 53.770166621819484, + "learning_rate": 1.8589052268670022e-06, + "loss": 1.8094, + "step": 26210 + }, + { + "epoch": 2.233955510099719, + "grad_norm": 43.81312747469614, + "learning_rate": 1.858519454558856e-06, + "loss": 1.582, + "step": 26211 + }, + { + "epoch": 2.2340407397937443, + "grad_norm": 39.73949964725385, + "learning_rate": 1.858133713145921e-06, + "loss": 1.3828, + "step": 26212 + }, + { + "epoch": 2.2341259694877698, + "grad_norm": 45.93089947256036, + "learning_rate": 1.8577480026319938e-06, + "loss": 1.9539, + "step": 26213 + }, + { + "epoch": 2.234211199181795, + "grad_norm": 46.82752681563462, + "learning_rate": 1.857362323020867e-06, + "loss": 1.3213, + "step": 26214 + }, + { + "epoch": 2.2342964288758203, + "grad_norm": 46.543282545197556, + "learning_rate": 1.8569766743163309e-06, + "loss": 1.0263, + "step": 26215 + }, + { + "epoch": 2.2343816585698457, + "grad_norm": 84.17388601322716, + "learning_rate": 1.85659105652218e-06, + "loss": 1.4882, + "step": 26216 + }, + { + "epoch": 2.234466888263871, + "grad_norm": 39.7204718231258, + "learning_rate": 1.8562054696422093e-06, + "loss": 1.0331, + "step": 26217 + }, + { + "epoch": 2.2345521179578967, + "grad_norm": 24.75022728584555, + "learning_rate": 1.8558199136802085e-06, + "loss": 0.7487, + "step": 26218 + }, + { + "epoch": 2.234637347651922, + "grad_norm": 44.92806943387842, + "learning_rate": 1.8554343886399695e-06, + "loss": 2.2407, + "step": 26219 + }, + { + "epoch": 2.234722577345947, + "grad_norm": 90.83655692042277, + "learning_rate": 1.8550488945252821e-06, + "loss": 2.4862, + "step": 26220 + }, + { + "epoch": 2.2348078070399726, + "grad_norm": 36.59067850924907, + "learning_rate": 1.8546634313399404e-06, + "loss": 0.8189, + "step": 26221 + }, + { + "epoch": 2.234893036733998, + "grad_norm": 60.441688117512186, + "learning_rate": 1.854277999087734e-06, + "loss": 1.3813, + "step": 26222 + }, + { + "epoch": 2.2349782664280236, + "grad_norm": 61.77397856264613, + "learning_rate": 1.8538925977724526e-06, + "loss": 1.5361, + "step": 26223 + }, + { + "epoch": 2.235063496122049, + "grad_norm": 27.99995793900437, + "learning_rate": 1.8535072273978865e-06, + "loss": 1.0624, + "step": 26224 + }, + { + "epoch": 2.2351487258160745, + "grad_norm": 46.11613247118693, + "learning_rate": 1.8531218879678287e-06, + "loss": 1.438, + "step": 26225 + }, + { + "epoch": 2.2352339555100995, + "grad_norm": 59.56894765488523, + "learning_rate": 1.8527365794860663e-06, + "loss": 1.7841, + "step": 26226 + }, + { + "epoch": 2.235319185204125, + "grad_norm": 36.211396201753026, + "learning_rate": 1.8523513019563894e-06, + "loss": 1.7618, + "step": 26227 + }, + { + "epoch": 2.2354044148981504, + "grad_norm": 39.62067699469358, + "learning_rate": 1.8519660553825864e-06, + "loss": 1.4703, + "step": 26228 + }, + { + "epoch": 2.235489644592176, + "grad_norm": 40.23735969221582, + "learning_rate": 1.8515808397684448e-06, + "loss": 1.2495, + "step": 26229 + }, + { + "epoch": 2.2355748742862014, + "grad_norm": 52.46964103368128, + "learning_rate": 1.8511956551177562e-06, + "loss": 1.6214, + "step": 26230 + }, + { + "epoch": 2.235660103980227, + "grad_norm": 47.18582675985872, + "learning_rate": 1.8508105014343058e-06, + "loss": 1.4677, + "step": 26231 + }, + { + "epoch": 2.2357453336742523, + "grad_norm": 78.69836658377227, + "learning_rate": 1.850425378721884e-06, + "loss": 2.4525, + "step": 26232 + }, + { + "epoch": 2.2358305633682773, + "grad_norm": 36.96403961028567, + "learning_rate": 1.8500402869842755e-06, + "loss": 1.0124, + "step": 26233 + }, + { + "epoch": 2.235915793062303, + "grad_norm": 94.6519682078147, + "learning_rate": 1.849655226225271e-06, + "loss": 2.7784, + "step": 26234 + }, + { + "epoch": 2.2360010227563283, + "grad_norm": 34.75039645155076, + "learning_rate": 1.8492701964486553e-06, + "loss": 0.9793, + "step": 26235 + }, + { + "epoch": 2.2360862524503538, + "grad_norm": 68.60808013444706, + "learning_rate": 1.8488851976582155e-06, + "loss": 1.9274, + "step": 26236 + }, + { + "epoch": 2.236171482144379, + "grad_norm": 70.13506658034936, + "learning_rate": 1.848500229857736e-06, + "loss": 2.1537, + "step": 26237 + }, + { + "epoch": 2.2362567118384047, + "grad_norm": 29.439492616110105, + "learning_rate": 1.8481152930510054e-06, + "loss": 1.2351, + "step": 26238 + }, + { + "epoch": 2.2363419415324297, + "grad_norm": 101.48039109946481, + "learning_rate": 1.8477303872418073e-06, + "loss": 2.1606, + "step": 26239 + }, + { + "epoch": 2.236427171226455, + "grad_norm": 51.59477277357908, + "learning_rate": 1.8473455124339301e-06, + "loss": 1.1571, + "step": 26240 + }, + { + "epoch": 2.2365124009204806, + "grad_norm": 39.63145583418766, + "learning_rate": 1.846960668631157e-06, + "loss": 1.4557, + "step": 26241 + }, + { + "epoch": 2.236597630614506, + "grad_norm": 56.51664562368422, + "learning_rate": 1.846575855837271e-06, + "loss": 1.9536, + "step": 26242 + }, + { + "epoch": 2.2366828603085316, + "grad_norm": 24.29906864958812, + "learning_rate": 1.84619107405606e-06, + "loss": 0.9411, + "step": 26243 + }, + { + "epoch": 2.236768090002557, + "grad_norm": 41.653297053215105, + "learning_rate": 1.845806323291307e-06, + "loss": 1.7594, + "step": 26244 + }, + { + "epoch": 2.236853319696582, + "grad_norm": 48.66081037315247, + "learning_rate": 1.8454216035467948e-06, + "loss": 1.5255, + "step": 26245 + }, + { + "epoch": 2.2369385493906075, + "grad_norm": 49.52063690066186, + "learning_rate": 1.8450369148263065e-06, + "loss": 1.3601, + "step": 26246 + }, + { + "epoch": 2.237023779084633, + "grad_norm": 60.027527449223086, + "learning_rate": 1.8446522571336262e-06, + "loss": 1.2492, + "step": 26247 + }, + { + "epoch": 2.2371090087786585, + "grad_norm": 71.46637193312817, + "learning_rate": 1.8442676304725387e-06, + "loss": 2.0745, + "step": 26248 + }, + { + "epoch": 2.237194238472684, + "grad_norm": 39.09939625510028, + "learning_rate": 1.8438830348468255e-06, + "loss": 1.9189, + "step": 26249 + }, + { + "epoch": 2.2372794681667094, + "grad_norm": 28.657174700667543, + "learning_rate": 1.8434984702602665e-06, + "loss": 1.0614, + "step": 26250 + }, + { + "epoch": 2.237364697860735, + "grad_norm": 57.02065371083391, + "learning_rate": 1.8431139367166478e-06, + "loss": 1.9672, + "step": 26251 + }, + { + "epoch": 2.23744992755476, + "grad_norm": 35.55323810380632, + "learning_rate": 1.8427294342197487e-06, + "loss": 1.1274, + "step": 26252 + }, + { + "epoch": 2.2375351572487854, + "grad_norm": 54.89373256026416, + "learning_rate": 1.8423449627733514e-06, + "loss": 2.2861, + "step": 26253 + }, + { + "epoch": 2.237620386942811, + "grad_norm": 36.472645907238054, + "learning_rate": 1.8419605223812348e-06, + "loss": 0.7847, + "step": 26254 + }, + { + "epoch": 2.2377056166368363, + "grad_norm": 36.21343706410687, + "learning_rate": 1.8415761130471816e-06, + "loss": 1.3185, + "step": 26255 + }, + { + "epoch": 2.237790846330862, + "grad_norm": 25.63111903136748, + "learning_rate": 1.841191734774974e-06, + "loss": 1.3323, + "step": 26256 + }, + { + "epoch": 2.2378760760248873, + "grad_norm": 14.253550408737848, + "learning_rate": 1.8408073875683907e-06, + "loss": 0.6214, + "step": 26257 + }, + { + "epoch": 2.2379613057189123, + "grad_norm": 21.585227472848057, + "learning_rate": 1.840423071431211e-06, + "loss": 0.9424, + "step": 26258 + }, + { + "epoch": 2.2380465354129377, + "grad_norm": 33.12108043630316, + "learning_rate": 1.8400387863672132e-06, + "loss": 1.1155, + "step": 26259 + }, + { + "epoch": 2.238131765106963, + "grad_norm": 54.59004063454511, + "learning_rate": 1.8396545323801801e-06, + "loss": 1.2633, + "step": 26260 + }, + { + "epoch": 2.2382169948009887, + "grad_norm": 47.31270924255049, + "learning_rate": 1.839270309473889e-06, + "loss": 1.6742, + "step": 26261 + }, + { + "epoch": 2.238302224495014, + "grad_norm": 72.60241558976442, + "learning_rate": 1.8388861176521167e-06, + "loss": 2.2189, + "step": 26262 + }, + { + "epoch": 2.2383874541890396, + "grad_norm": 43.40971485308867, + "learning_rate": 1.8385019569186436e-06, + "loss": 1.6352, + "step": 26263 + }, + { + "epoch": 2.2384726838830646, + "grad_norm": 64.12358579440883, + "learning_rate": 1.8381178272772487e-06, + "loss": 1.0266, + "step": 26264 + }, + { + "epoch": 2.23855791357709, + "grad_norm": 45.89253562323599, + "learning_rate": 1.8377337287317087e-06, + "loss": 1.4813, + "step": 26265 + }, + { + "epoch": 2.2386431432711156, + "grad_norm": 56.50440093263089, + "learning_rate": 1.837349661285801e-06, + "loss": 1.4884, + "step": 26266 + }, + { + "epoch": 2.238728372965141, + "grad_norm": 46.33513190648589, + "learning_rate": 1.8369656249433016e-06, + "loss": 1.2032, + "step": 26267 + }, + { + "epoch": 2.2388136026591665, + "grad_norm": 82.66239036135197, + "learning_rate": 1.8365816197079894e-06, + "loss": 1.8148, + "step": 26268 + }, + { + "epoch": 2.238898832353192, + "grad_norm": 66.71535366802284, + "learning_rate": 1.83619764558364e-06, + "loss": 1.9816, + "step": 26269 + }, + { + "epoch": 2.2389840620472174, + "grad_norm": 53.44502083576761, + "learning_rate": 1.835813702574028e-06, + "loss": 1.5915, + "step": 26270 + }, + { + "epoch": 2.2390692917412425, + "grad_norm": 53.47758285982227, + "learning_rate": 1.835429790682932e-06, + "loss": 1.4842, + "step": 26271 + }, + { + "epoch": 2.239154521435268, + "grad_norm": 76.0996610352911, + "learning_rate": 1.8350459099141254e-06, + "loss": 0.9873, + "step": 26272 + }, + { + "epoch": 2.2392397511292934, + "grad_norm": 60.37787828328221, + "learning_rate": 1.8346620602713866e-06, + "loss": 1.8219, + "step": 26273 + }, + { + "epoch": 2.239324980823319, + "grad_norm": 40.97357600070949, + "learning_rate": 1.834278241758488e-06, + "loss": 1.5369, + "step": 26274 + }, + { + "epoch": 2.2394102105173443, + "grad_norm": 60.52227601088549, + "learning_rate": 1.8338944543792058e-06, + "loss": 1.5405, + "step": 26275 + }, + { + "epoch": 2.23949544021137, + "grad_norm": 21.726403061003275, + "learning_rate": 1.8335106981373114e-06, + "loss": 0.8595, + "step": 26276 + }, + { + "epoch": 2.239580669905395, + "grad_norm": 34.393839914773494, + "learning_rate": 1.8331269730365825e-06, + "loss": 0.6764, + "step": 26277 + }, + { + "epoch": 2.2396658995994203, + "grad_norm": 31.949530550974554, + "learning_rate": 1.8327432790807902e-06, + "loss": 1.2163, + "step": 26278 + }, + { + "epoch": 2.2397511292934458, + "grad_norm": 50.77324958321428, + "learning_rate": 1.8323596162737107e-06, + "loss": 0.876, + "step": 26279 + }, + { + "epoch": 2.2398363589874712, + "grad_norm": 32.33221645324754, + "learning_rate": 1.8319759846191137e-06, + "loss": 0.9477, + "step": 26280 + }, + { + "epoch": 2.2399215886814967, + "grad_norm": 11.013746484167044, + "learning_rate": 1.8315923841207762e-06, + "loss": 0.4915, + "step": 26281 + }, + { + "epoch": 2.240006818375522, + "grad_norm": 64.73886690099339, + "learning_rate": 1.831208814782468e-06, + "loss": 1.7766, + "step": 26282 + }, + { + "epoch": 2.240092048069547, + "grad_norm": 18.584957369388448, + "learning_rate": 1.8308252766079627e-06, + "loss": 0.5695, + "step": 26283 + }, + { + "epoch": 2.2401772777635727, + "grad_norm": 24.838115479243573, + "learning_rate": 1.8304417696010312e-06, + "loss": 0.8819, + "step": 26284 + }, + { + "epoch": 2.240262507457598, + "grad_norm": 37.17049365917786, + "learning_rate": 1.8300582937654438e-06, + "loss": 1.2111, + "step": 26285 + }, + { + "epoch": 2.2403477371516236, + "grad_norm": 62.27320177597339, + "learning_rate": 1.8296748491049732e-06, + "loss": 2.3529, + "step": 26286 + }, + { + "epoch": 2.240432966845649, + "grad_norm": 37.887209470364986, + "learning_rate": 1.8292914356233927e-06, + "loss": 1.0685, + "step": 26287 + }, + { + "epoch": 2.2405181965396745, + "grad_norm": 40.4660841525133, + "learning_rate": 1.8289080533244706e-06, + "loss": 1.5361, + "step": 26288 + }, + { + "epoch": 2.2406034262337, + "grad_norm": 56.06409834274547, + "learning_rate": 1.8285247022119761e-06, + "loss": 1.2249, + "step": 26289 + }, + { + "epoch": 2.240688655927725, + "grad_norm": 22.973053211602316, + "learning_rate": 1.8281413822896833e-06, + "loss": 0.951, + "step": 26290 + }, + { + "epoch": 2.2407738856217505, + "grad_norm": 49.308068274703764, + "learning_rate": 1.8277580935613586e-06, + "loss": 1.4853, + "step": 26291 + }, + { + "epoch": 2.240859115315776, + "grad_norm": 33.90796208702431, + "learning_rate": 1.8273748360307736e-06, + "loss": 1.0941, + "step": 26292 + }, + { + "epoch": 2.2409443450098014, + "grad_norm": 46.88879489033464, + "learning_rate": 1.8269916097016937e-06, + "loss": 1.3032, + "step": 26293 + }, + { + "epoch": 2.241029574703827, + "grad_norm": 42.58503212330044, + "learning_rate": 1.8266084145778912e-06, + "loss": 1.491, + "step": 26294 + }, + { + "epoch": 2.2411148043978524, + "grad_norm": 61.982870809041, + "learning_rate": 1.8262252506631356e-06, + "loss": 1.2307, + "step": 26295 + }, + { + "epoch": 2.2412000340918774, + "grad_norm": 65.05546445590414, + "learning_rate": 1.825842117961193e-06, + "loss": 1.7174, + "step": 26296 + }, + { + "epoch": 2.241285263785903, + "grad_norm": 59.8531529123981, + "learning_rate": 1.8254590164758306e-06, + "loss": 1.8727, + "step": 26297 + }, + { + "epoch": 2.2413704934799283, + "grad_norm": 28.064991568281684, + "learning_rate": 1.8250759462108192e-06, + "loss": 0.9228, + "step": 26298 + }, + { + "epoch": 2.241455723173954, + "grad_norm": 32.74809744210581, + "learning_rate": 1.8246929071699238e-06, + "loss": 1.4007, + "step": 26299 + }, + { + "epoch": 2.2415409528679793, + "grad_norm": 43.25777484070208, + "learning_rate": 1.8243098993569124e-06, + "loss": 0.9911, + "step": 26300 + }, + { + "epoch": 2.2416261825620047, + "grad_norm": 51.97502526178298, + "learning_rate": 1.8239269227755491e-06, + "loss": 1.3024, + "step": 26301 + }, + { + "epoch": 2.2417114122560298, + "grad_norm": 82.54208593359958, + "learning_rate": 1.8235439774296026e-06, + "loss": 2.0522, + "step": 26302 + }, + { + "epoch": 2.2417966419500552, + "grad_norm": 38.83674986803992, + "learning_rate": 1.8231610633228408e-06, + "loss": 1.7574, + "step": 26303 + }, + { + "epoch": 2.2418818716440807, + "grad_norm": 58.243722567959054, + "learning_rate": 1.8227781804590277e-06, + "loss": 1.7366, + "step": 26304 + }, + { + "epoch": 2.241967101338106, + "grad_norm": 69.02334134266933, + "learning_rate": 1.8223953288419282e-06, + "loss": 1.5178, + "step": 26305 + }, + { + "epoch": 2.2420523310321316, + "grad_norm": 61.16628837156996, + "learning_rate": 1.8220125084753065e-06, + "loss": 1.8012, + "step": 26306 + }, + { + "epoch": 2.242137560726157, + "grad_norm": 54.488145957574794, + "learning_rate": 1.8216297193629307e-06, + "loss": 1.4802, + "step": 26307 + }, + { + "epoch": 2.2422227904201826, + "grad_norm": 47.66923203931493, + "learning_rate": 1.8212469615085637e-06, + "loss": 1.6932, + "step": 26308 + }, + { + "epoch": 2.2423080201142076, + "grad_norm": 46.93184214659165, + "learning_rate": 1.8208642349159677e-06, + "loss": 1.4051, + "step": 26309 + }, + { + "epoch": 2.242393249808233, + "grad_norm": 48.008266704837695, + "learning_rate": 1.8204815395889092e-06, + "loss": 1.4686, + "step": 26310 + }, + { + "epoch": 2.2424784795022585, + "grad_norm": 78.34995426989298, + "learning_rate": 1.8200988755311528e-06, + "loss": 2.2095, + "step": 26311 + }, + { + "epoch": 2.242563709196284, + "grad_norm": 31.912073079796404, + "learning_rate": 1.8197162427464605e-06, + "loss": 1.2943, + "step": 26312 + }, + { + "epoch": 2.2426489388903095, + "grad_norm": 62.89169126581041, + "learning_rate": 1.8193336412385948e-06, + "loss": 1.6874, + "step": 26313 + }, + { + "epoch": 2.242734168584335, + "grad_norm": 54.3192776017862, + "learning_rate": 1.8189510710113195e-06, + "loss": 1.6597, + "step": 26314 + }, + { + "epoch": 2.2428193982783604, + "grad_norm": 90.61691345550048, + "learning_rate": 1.8185685320683943e-06, + "loss": 2.9129, + "step": 26315 + }, + { + "epoch": 2.2429046279723854, + "grad_norm": 102.51877390126015, + "learning_rate": 1.8181860244135852e-06, + "loss": 1.1914, + "step": 26316 + }, + { + "epoch": 2.242989857666411, + "grad_norm": 60.842907243642436, + "learning_rate": 1.8178035480506506e-06, + "loss": 1.8616, + "step": 26317 + }, + { + "epoch": 2.2430750873604364, + "grad_norm": 38.36447286230703, + "learning_rate": 1.817421102983355e-06, + "loss": 1.2002, + "step": 26318 + }, + { + "epoch": 2.243160317054462, + "grad_norm": 37.664361238616465, + "learning_rate": 1.8170386892154568e-06, + "loss": 1.6186, + "step": 26319 + }, + { + "epoch": 2.2432455467484873, + "grad_norm": 43.787458568766965, + "learning_rate": 1.8166563067507198e-06, + "loss": 1.7313, + "step": 26320 + }, + { + "epoch": 2.2433307764425128, + "grad_norm": 60.33833880847572, + "learning_rate": 1.816273955592903e-06, + "loss": 1.4333, + "step": 26321 + }, + { + "epoch": 2.243416006136538, + "grad_norm": 45.75944758772495, + "learning_rate": 1.8158916357457673e-06, + "loss": 1.8197, + "step": 26322 + }, + { + "epoch": 2.2435012358305633, + "grad_norm": 75.49753255729038, + "learning_rate": 1.8155093472130702e-06, + "loss": 2.0111, + "step": 26323 + }, + { + "epoch": 2.2435864655245887, + "grad_norm": 49.36173284875118, + "learning_rate": 1.815127089998575e-06, + "loss": 2.0457, + "step": 26324 + }, + { + "epoch": 2.243671695218614, + "grad_norm": 32.44214875709155, + "learning_rate": 1.8147448641060378e-06, + "loss": 0.961, + "step": 26325 + }, + { + "epoch": 2.2437569249126397, + "grad_norm": 108.01365169142049, + "learning_rate": 1.8143626695392203e-06, + "loss": 1.8666, + "step": 26326 + }, + { + "epoch": 2.243842154606665, + "grad_norm": 47.53904954136975, + "learning_rate": 1.8139805063018806e-06, + "loss": 1.0627, + "step": 26327 + }, + { + "epoch": 2.24392738430069, + "grad_norm": 55.61167279787052, + "learning_rate": 1.8135983743977748e-06, + "loss": 1.4525, + "step": 26328 + }, + { + "epoch": 2.2440126139947156, + "grad_norm": 44.98691796136222, + "learning_rate": 1.8132162738306642e-06, + "loss": 1.2398, + "step": 26329 + }, + { + "epoch": 2.244097843688741, + "grad_norm": 47.72869297340654, + "learning_rate": 1.8128342046043057e-06, + "loss": 1.4992, + "step": 26330 + }, + { + "epoch": 2.2441830733827666, + "grad_norm": 25.929509250521615, + "learning_rate": 1.812452166722456e-06, + "loss": 0.9978, + "step": 26331 + }, + { + "epoch": 2.244268303076792, + "grad_norm": 46.96757491952791, + "learning_rate": 1.8120701601888707e-06, + "loss": 1.3565, + "step": 26332 + }, + { + "epoch": 2.2443535327708175, + "grad_norm": 44.27799423477287, + "learning_rate": 1.8116881850073092e-06, + "loss": 1.5781, + "step": 26333 + }, + { + "epoch": 2.244438762464843, + "grad_norm": 62.90517760601348, + "learning_rate": 1.8113062411815291e-06, + "loss": 1.6682, + "step": 26334 + }, + { + "epoch": 2.244523992158868, + "grad_norm": 36.20781593566394, + "learning_rate": 1.810924328715285e-06, + "loss": 0.8287, + "step": 26335 + }, + { + "epoch": 2.2446092218528935, + "grad_norm": 40.51213626381241, + "learning_rate": 1.8105424476123311e-06, + "loss": 1.115, + "step": 26336 + }, + { + "epoch": 2.244694451546919, + "grad_norm": 37.417136623101534, + "learning_rate": 1.8101605978764264e-06, + "loss": 1.2893, + "step": 26337 + }, + { + "epoch": 2.2447796812409444, + "grad_norm": 60.43476731398299, + "learning_rate": 1.8097787795113248e-06, + "loss": 1.8926, + "step": 26338 + }, + { + "epoch": 2.24486491093497, + "grad_norm": 61.563428802241525, + "learning_rate": 1.8093969925207816e-06, + "loss": 1.7897, + "step": 26339 + }, + { + "epoch": 2.2449501406289953, + "grad_norm": 38.102576217643005, + "learning_rate": 1.8090152369085489e-06, + "loss": 1.5776, + "step": 26340 + }, + { + "epoch": 2.2450353703230204, + "grad_norm": 48.438324576337834, + "learning_rate": 1.8086335126783838e-06, + "loss": 1.4246, + "step": 26341 + }, + { + "epoch": 2.245120600017046, + "grad_norm": 36.24002154448283, + "learning_rate": 1.8082518198340415e-06, + "loss": 0.9397, + "step": 26342 + }, + { + "epoch": 2.2452058297110713, + "grad_norm": 64.36365081062789, + "learning_rate": 1.8078701583792745e-06, + "loss": 2.0008, + "step": 26343 + }, + { + "epoch": 2.2452910594050968, + "grad_norm": 45.33801168790399, + "learning_rate": 1.8074885283178355e-06, + "loss": 1.8851, + "step": 26344 + }, + { + "epoch": 2.2453762890991222, + "grad_norm": 33.21443927671218, + "learning_rate": 1.807106929653477e-06, + "loss": 0.9782, + "step": 26345 + }, + { + "epoch": 2.2454615187931477, + "grad_norm": 33.054981423205135, + "learning_rate": 1.806725362389955e-06, + "loss": 1.2462, + "step": 26346 + }, + { + "epoch": 2.2455467484871727, + "grad_norm": 28.066677721354147, + "learning_rate": 1.8063438265310201e-06, + "loss": 1.3787, + "step": 26347 + }, + { + "epoch": 2.245631978181198, + "grad_norm": 21.11912288474579, + "learning_rate": 1.8059623220804222e-06, + "loss": 1.0121, + "step": 26348 + }, + { + "epoch": 2.2457172078752237, + "grad_norm": 29.393018322335, + "learning_rate": 1.8055808490419163e-06, + "loss": 0.7132, + "step": 26349 + }, + { + "epoch": 2.245802437569249, + "grad_norm": 36.0700712781855, + "learning_rate": 1.8051994074192552e-06, + "loss": 1.5959, + "step": 26350 + }, + { + "epoch": 2.2458876672632746, + "grad_norm": 34.32455787239157, + "learning_rate": 1.8048179972161882e-06, + "loss": 1.1805, + "step": 26351 + }, + { + "epoch": 2.2459728969573, + "grad_norm": 68.02295910266093, + "learning_rate": 1.804436618436467e-06, + "loss": 2.0426, + "step": 26352 + }, + { + "epoch": 2.2460581266513255, + "grad_norm": 49.27640801228006, + "learning_rate": 1.8040552710838394e-06, + "loss": 1.207, + "step": 26353 + }, + { + "epoch": 2.2461433563453506, + "grad_norm": 35.056748507191834, + "learning_rate": 1.8036739551620608e-06, + "loss": 0.6627, + "step": 26354 + }, + { + "epoch": 2.246228586039376, + "grad_norm": 24.467529028943225, + "learning_rate": 1.8032926706748782e-06, + "loss": 1.2013, + "step": 26355 + }, + { + "epoch": 2.2463138157334015, + "grad_norm": 51.105891876205504, + "learning_rate": 1.8029114176260403e-06, + "loss": 1.2856, + "step": 26356 + }, + { + "epoch": 2.246399045427427, + "grad_norm": 51.64943194536466, + "learning_rate": 1.8025301960192998e-06, + "loss": 1.6214, + "step": 26357 + }, + { + "epoch": 2.2464842751214524, + "grad_norm": 39.29848141166675, + "learning_rate": 1.802149005858403e-06, + "loss": 1.0341, + "step": 26358 + }, + { + "epoch": 2.246569504815478, + "grad_norm": 38.10839505679889, + "learning_rate": 1.8017678471471012e-06, + "loss": 0.7197, + "step": 26359 + }, + { + "epoch": 2.246654734509503, + "grad_norm": 75.15124861688199, + "learning_rate": 1.8013867198891422e-06, + "loss": 2.0643, + "step": 26360 + }, + { + "epoch": 2.2467399642035284, + "grad_norm": 55.43936483649997, + "learning_rate": 1.8010056240882734e-06, + "loss": 1.5232, + "step": 26361 + }, + { + "epoch": 2.246825193897554, + "grad_norm": 55.810338255331786, + "learning_rate": 1.8006245597482418e-06, + "loss": 1.9544, + "step": 26362 + }, + { + "epoch": 2.2469104235915793, + "grad_norm": 40.364833269832666, + "learning_rate": 1.800243526872798e-06, + "loss": 1.6076, + "step": 26363 + }, + { + "epoch": 2.246995653285605, + "grad_norm": 41.14140370099503, + "learning_rate": 1.799862525465686e-06, + "loss": 1.6355, + "step": 26364 + }, + { + "epoch": 2.2470808829796303, + "grad_norm": 60.323176983487585, + "learning_rate": 1.799481555530656e-06, + "loss": 1.9056, + "step": 26365 + }, + { + "epoch": 2.2471661126736553, + "grad_norm": 46.66508690203123, + "learning_rate": 1.7991006170714515e-06, + "loss": 1.2233, + "step": 26366 + }, + { + "epoch": 2.2472513423676808, + "grad_norm": 51.54638090153139, + "learning_rate": 1.798719710091822e-06, + "loss": 1.0955, + "step": 26367 + }, + { + "epoch": 2.2473365720617062, + "grad_norm": 55.65642040046924, + "learning_rate": 1.798338834595512e-06, + "loss": 1.4342, + "step": 26368 + }, + { + "epoch": 2.2474218017557317, + "grad_norm": 26.46804926274743, + "learning_rate": 1.7979579905862682e-06, + "loss": 0.9243, + "step": 26369 + }, + { + "epoch": 2.247507031449757, + "grad_norm": 57.538211851019554, + "learning_rate": 1.7975771780678325e-06, + "loss": 1.7579, + "step": 26370 + }, + { + "epoch": 2.2475922611437826, + "grad_norm": 48.84130599779518, + "learning_rate": 1.797196397043955e-06, + "loss": 1.7325, + "step": 26371 + }, + { + "epoch": 2.247677490837808, + "grad_norm": 77.67415100228793, + "learning_rate": 1.7968156475183763e-06, + "loss": 2.3283, + "step": 26372 + }, + { + "epoch": 2.247762720531833, + "grad_norm": 62.01462256839316, + "learning_rate": 1.7964349294948447e-06, + "loss": 2.2077, + "step": 26373 + }, + { + "epoch": 2.2478479502258586, + "grad_norm": 116.92005094768437, + "learning_rate": 1.7960542429771028e-06, + "loss": 2.1944, + "step": 26374 + }, + { + "epoch": 2.247933179919884, + "grad_norm": 50.13846794001958, + "learning_rate": 1.7956735879688924e-06, + "loss": 1.5467, + "step": 26375 + }, + { + "epoch": 2.2480184096139095, + "grad_norm": 42.073965221279664, + "learning_rate": 1.7952929644739608e-06, + "loss": 1.7815, + "step": 26376 + }, + { + "epoch": 2.248103639307935, + "grad_norm": 41.44492355717277, + "learning_rate": 1.7949123724960493e-06, + "loss": 1.4691, + "step": 26377 + }, + { + "epoch": 2.2481888690019605, + "grad_norm": 41.06387801701036, + "learning_rate": 1.794531812038901e-06, + "loss": 1.0471, + "step": 26378 + }, + { + "epoch": 2.2482740986959855, + "grad_norm": 49.29661695448932, + "learning_rate": 1.7941512831062568e-06, + "loss": 1.6374, + "step": 26379 + }, + { + "epoch": 2.248359328390011, + "grad_norm": 27.837391846561278, + "learning_rate": 1.7937707857018611e-06, + "loss": 0.8991, + "step": 26380 + }, + { + "epoch": 2.2484445580840364, + "grad_norm": 55.78802729304483, + "learning_rate": 1.7933903198294578e-06, + "loss": 2.5369, + "step": 26381 + }, + { + "epoch": 2.248529787778062, + "grad_norm": 20.00190732277861, + "learning_rate": 1.7930098854927858e-06, + "loss": 0.7837, + "step": 26382 + }, + { + "epoch": 2.2486150174720874, + "grad_norm": 67.82134126772488, + "learning_rate": 1.792629482695586e-06, + "loss": 2.1958, + "step": 26383 + }, + { + "epoch": 2.248700247166113, + "grad_norm": 40.819932853349, + "learning_rate": 1.7922491114416018e-06, + "loss": 1.3075, + "step": 26384 + }, + { + "epoch": 2.248785476860138, + "grad_norm": 68.03551842539244, + "learning_rate": 1.7918687717345735e-06, + "loss": 1.5582, + "step": 26385 + }, + { + "epoch": 2.2488707065541633, + "grad_norm": 51.12080751758508, + "learning_rate": 1.791488463578241e-06, + "loss": 1.3223, + "step": 26386 + }, + { + "epoch": 2.248955936248189, + "grad_norm": 70.94468795701489, + "learning_rate": 1.791108186976343e-06, + "loss": 1.6223, + "step": 26387 + }, + { + "epoch": 2.2490411659422143, + "grad_norm": 61.44217048784602, + "learning_rate": 1.7907279419326206e-06, + "loss": 2.4424, + "step": 26388 + }, + { + "epoch": 2.2491263956362397, + "grad_norm": 60.371531467098805, + "learning_rate": 1.7903477284508152e-06, + "loss": 2.3175, + "step": 26389 + }, + { + "epoch": 2.249211625330265, + "grad_norm": 53.725465764293226, + "learning_rate": 1.789967546534665e-06, + "loss": 1.2655, + "step": 26390 + }, + { + "epoch": 2.2492968550242907, + "grad_norm": 48.091572013030806, + "learning_rate": 1.789587396187908e-06, + "loss": 1.1198, + "step": 26391 + }, + { + "epoch": 2.2493820847183157, + "grad_norm": 59.448586876788276, + "learning_rate": 1.7892072774142815e-06, + "loss": 1.9519, + "step": 26392 + }, + { + "epoch": 2.249467314412341, + "grad_norm": 54.897928633569755, + "learning_rate": 1.788827190217528e-06, + "loss": 2.4264, + "step": 26393 + }, + { + "epoch": 2.2495525441063666, + "grad_norm": 42.306699186720465, + "learning_rate": 1.7884471346013826e-06, + "loss": 2.1259, + "step": 26394 + }, + { + "epoch": 2.249637773800392, + "grad_norm": 21.052508633418526, + "learning_rate": 1.7880671105695818e-06, + "loss": 0.7133, + "step": 26395 + }, + { + "epoch": 2.2497230034944176, + "grad_norm": 35.15264105336811, + "learning_rate": 1.7876871181258643e-06, + "loss": 1.5705, + "step": 26396 + }, + { + "epoch": 2.249808233188443, + "grad_norm": 56.4838797018832, + "learning_rate": 1.7873071572739697e-06, + "loss": 1.9986, + "step": 26397 + }, + { + "epoch": 2.249893462882468, + "grad_norm": 50.27251928040016, + "learning_rate": 1.7869272280176326e-06, + "loss": 0.9771, + "step": 26398 + }, + { + "epoch": 2.2499786925764935, + "grad_norm": 25.95386298420679, + "learning_rate": 1.7865473303605897e-06, + "loss": 1.0719, + "step": 26399 + }, + { + "epoch": 2.250063922270519, + "grad_norm": 34.10894645567061, + "learning_rate": 1.7861674643065762e-06, + "loss": 1.4352, + "step": 26400 + }, + { + "epoch": 2.2501491519645445, + "grad_norm": 38.53867755928619, + "learning_rate": 1.7857876298593275e-06, + "loss": 1.0533, + "step": 26401 + }, + { + "epoch": 2.25023438165857, + "grad_norm": 59.67124799274031, + "learning_rate": 1.7854078270225817e-06, + "loss": 1.7701, + "step": 26402 + }, + { + "epoch": 2.2503196113525954, + "grad_norm": 35.66623440320526, + "learning_rate": 1.785028055800071e-06, + "loss": 1.192, + "step": 26403 + }, + { + "epoch": 2.2504048410466204, + "grad_norm": 34.003557191858846, + "learning_rate": 1.7846483161955335e-06, + "loss": 1.4503, + "step": 26404 + }, + { + "epoch": 2.250490070740646, + "grad_norm": 31.39748974047892, + "learning_rate": 1.7842686082127004e-06, + "loss": 1.0734, + "step": 26405 + }, + { + "epoch": 2.2505753004346714, + "grad_norm": 37.106056209295375, + "learning_rate": 1.7838889318553093e-06, + "loss": 0.6992, + "step": 26406 + }, + { + "epoch": 2.250660530128697, + "grad_norm": 29.976832281740837, + "learning_rate": 1.7835092871270931e-06, + "loss": 0.877, + "step": 26407 + }, + { + "epoch": 2.2507457598227223, + "grad_norm": 38.385693678280035, + "learning_rate": 1.7831296740317845e-06, + "loss": 0.9861, + "step": 26408 + }, + { + "epoch": 2.2508309895167478, + "grad_norm": 30.47521237977044, + "learning_rate": 1.7827500925731155e-06, + "loss": 0.735, + "step": 26409 + }, + { + "epoch": 2.2509162192107732, + "grad_norm": 32.83872112091688, + "learning_rate": 1.7823705427548232e-06, + "loss": 1.768, + "step": 26410 + }, + { + "epoch": 2.2510014489047983, + "grad_norm": 45.93322153083855, + "learning_rate": 1.781991024580636e-06, + "loss": 1.0585, + "step": 26411 + }, + { + "epoch": 2.2510866785988237, + "grad_norm": 47.38119239814736, + "learning_rate": 1.7816115380542897e-06, + "loss": 1.8644, + "step": 26412 + }, + { + "epoch": 2.251171908292849, + "grad_norm": 48.13693147632444, + "learning_rate": 1.7812320831795138e-06, + "loss": 1.2977, + "step": 26413 + }, + { + "epoch": 2.2512571379868747, + "grad_norm": 59.67770307176886, + "learning_rate": 1.7808526599600433e-06, + "loss": 1.3663, + "step": 26414 + }, + { + "epoch": 2.2513423676809, + "grad_norm": 35.042699730728856, + "learning_rate": 1.7804732683996072e-06, + "loss": 1.112, + "step": 26415 + }, + { + "epoch": 2.2514275973749256, + "grad_norm": 47.6843787439648, + "learning_rate": 1.7800939085019376e-06, + "loss": 1.3675, + "step": 26416 + }, + { + "epoch": 2.251512827068951, + "grad_norm": 199.26852230408582, + "learning_rate": 1.779714580270765e-06, + "loss": 2.3105, + "step": 26417 + }, + { + "epoch": 2.251598056762976, + "grad_norm": 26.686587968336962, + "learning_rate": 1.7793352837098177e-06, + "loss": 0.9344, + "step": 26418 + }, + { + "epoch": 2.2516832864570016, + "grad_norm": 48.886348415088285, + "learning_rate": 1.7789560188228283e-06, + "loss": 1.4928, + "step": 26419 + }, + { + "epoch": 2.251768516151027, + "grad_norm": 39.783586770250885, + "learning_rate": 1.7785767856135284e-06, + "loss": 0.848, + "step": 26420 + }, + { + "epoch": 2.2518537458450525, + "grad_norm": 58.41698209002301, + "learning_rate": 1.7781975840856457e-06, + "loss": 1.696, + "step": 26421 + }, + { + "epoch": 2.251938975539078, + "grad_norm": 34.36975764175979, + "learning_rate": 1.7778184142429077e-06, + "loss": 1.3743, + "step": 26422 + }, + { + "epoch": 2.252024205233103, + "grad_norm": 23.78354509169798, + "learning_rate": 1.777439276089047e-06, + "loss": 0.5974, + "step": 26423 + }, + { + "epoch": 2.2521094349271285, + "grad_norm": 62.93150406153467, + "learning_rate": 1.7770601696277907e-06, + "loss": 2.1904, + "step": 26424 + }, + { + "epoch": 2.252194664621154, + "grad_norm": 63.99519504618801, + "learning_rate": 1.7766810948628666e-06, + "loss": 1.6947, + "step": 26425 + }, + { + "epoch": 2.2522798943151794, + "grad_norm": 37.7715003531448, + "learning_rate": 1.7763020517980018e-06, + "loss": 0.7967, + "step": 26426 + }, + { + "epoch": 2.252365124009205, + "grad_norm": 39.08950146427838, + "learning_rate": 1.775923040436925e-06, + "loss": 1.3787, + "step": 26427 + }, + { + "epoch": 2.2524503537032303, + "grad_norm": 37.164980004527635, + "learning_rate": 1.7755440607833656e-06, + "loss": 1.176, + "step": 26428 + }, + { + "epoch": 2.252535583397256, + "grad_norm": 46.260577096083864, + "learning_rate": 1.7751651128410495e-06, + "loss": 1.4078, + "step": 26429 + }, + { + "epoch": 2.252620813091281, + "grad_norm": 70.03191196570087, + "learning_rate": 1.7747861966137025e-06, + "loss": 2.154, + "step": 26430 + }, + { + "epoch": 2.2527060427853063, + "grad_norm": 24.39230618075566, + "learning_rate": 1.7744073121050509e-06, + "loss": 0.8059, + "step": 26431 + }, + { + "epoch": 2.2527912724793318, + "grad_norm": 39.37102972972288, + "learning_rate": 1.7740284593188229e-06, + "loss": 1.5055, + "step": 26432 + }, + { + "epoch": 2.252876502173357, + "grad_norm": 72.56810352078327, + "learning_rate": 1.7736496382587426e-06, + "loss": 2.2948, + "step": 26433 + }, + { + "epoch": 2.2529617318673827, + "grad_norm": 60.169823802434166, + "learning_rate": 1.773270848928535e-06, + "loss": 1.6394, + "step": 26434 + }, + { + "epoch": 2.253046961561408, + "grad_norm": 61.0083081208479, + "learning_rate": 1.7728920913319264e-06, + "loss": 1.5981, + "step": 26435 + }, + { + "epoch": 2.2531321912554336, + "grad_norm": 59.11043202582474, + "learning_rate": 1.7725133654726434e-06, + "loss": 1.6225, + "step": 26436 + }, + { + "epoch": 2.2532174209494586, + "grad_norm": 46.48956255253628, + "learning_rate": 1.7721346713544086e-06, + "loss": 1.6155, + "step": 26437 + }, + { + "epoch": 2.253302650643484, + "grad_norm": 84.86315957487562, + "learning_rate": 1.7717560089809472e-06, + "loss": 1.9195, + "step": 26438 + }, + { + "epoch": 2.2533878803375096, + "grad_norm": 27.062031038611185, + "learning_rate": 1.7713773783559806e-06, + "loss": 1.1094, + "step": 26439 + }, + { + "epoch": 2.253473110031535, + "grad_norm": 63.73021834949449, + "learning_rate": 1.770998779483236e-06, + "loss": 2.1326, + "step": 26440 + }, + { + "epoch": 2.2535583397255605, + "grad_norm": 50.628720355838205, + "learning_rate": 1.7706202123664363e-06, + "loss": 1.8848, + "step": 26441 + }, + { + "epoch": 2.2536435694195855, + "grad_norm": 23.71081731683268, + "learning_rate": 1.7702416770093012e-06, + "loss": 0.9046, + "step": 26442 + }, + { + "epoch": 2.253728799113611, + "grad_norm": 49.239627515949465, + "learning_rate": 1.7698631734155558e-06, + "loss": 1.7159, + "step": 26443 + }, + { + "epoch": 2.2538140288076365, + "grad_norm": 65.44356338311407, + "learning_rate": 1.7694847015889245e-06, + "loss": 1.7823, + "step": 26444 + }, + { + "epoch": 2.253899258501662, + "grad_norm": 76.68143522669963, + "learning_rate": 1.7691062615331273e-06, + "loss": 1.733, + "step": 26445 + }, + { + "epoch": 2.2539844881956874, + "grad_norm": 61.41432059231419, + "learning_rate": 1.7687278532518865e-06, + "loss": 1.8408, + "step": 26446 + }, + { + "epoch": 2.254069717889713, + "grad_norm": 32.92639721260506, + "learning_rate": 1.7683494767489228e-06, + "loss": 0.7579, + "step": 26447 + }, + { + "epoch": 2.2541549475837384, + "grad_norm": 50.0948108383737, + "learning_rate": 1.7679711320279563e-06, + "loss": 1.2977, + "step": 26448 + }, + { + "epoch": 2.2542401772777634, + "grad_norm": 19.727132552858258, + "learning_rate": 1.7675928190927117e-06, + "loss": 0.7406, + "step": 26449 + }, + { + "epoch": 2.254325406971789, + "grad_norm": 48.65761554674347, + "learning_rate": 1.7672145379469046e-06, + "loss": 2.3023, + "step": 26450 + }, + { + "epoch": 2.2544106366658143, + "grad_norm": 35.74732897643044, + "learning_rate": 1.7668362885942602e-06, + "loss": 1.1401, + "step": 26451 + }, + { + "epoch": 2.25449586635984, + "grad_norm": 47.27015355624533, + "learning_rate": 1.7664580710384949e-06, + "loss": 1.4726, + "step": 26452 + }, + { + "epoch": 2.2545810960538653, + "grad_norm": 88.27031544284412, + "learning_rate": 1.7660798852833305e-06, + "loss": 2.0586, + "step": 26453 + }, + { + "epoch": 2.2546663257478907, + "grad_norm": 74.69048475076217, + "learning_rate": 1.765701731332486e-06, + "loss": 1.8411, + "step": 26454 + }, + { + "epoch": 2.254751555441916, + "grad_norm": 65.26149708737303, + "learning_rate": 1.765323609189679e-06, + "loss": 1.4602, + "step": 26455 + }, + { + "epoch": 2.254836785135941, + "grad_norm": 107.01327178451992, + "learning_rate": 1.764945518858628e-06, + "loss": 2.2009, + "step": 26456 + }, + { + "epoch": 2.2549220148299667, + "grad_norm": 65.69911240417943, + "learning_rate": 1.764567460343054e-06, + "loss": 1.8413, + "step": 26457 + }, + { + "epoch": 2.255007244523992, + "grad_norm": 55.138683229329175, + "learning_rate": 1.7641894336466714e-06, + "loss": 1.6268, + "step": 26458 + }, + { + "epoch": 2.2550924742180176, + "grad_norm": 51.28010528415284, + "learning_rate": 1.7638114387732019e-06, + "loss": 1.4091, + "step": 26459 + }, + { + "epoch": 2.255177703912043, + "grad_norm": 27.014197544715127, + "learning_rate": 1.7634334757263605e-06, + "loss": 1.1198, + "step": 26460 + }, + { + "epoch": 2.255262933606068, + "grad_norm": 37.58356819374922, + "learning_rate": 1.7630555445098634e-06, + "loss": 1.0711, + "step": 26461 + }, + { + "epoch": 2.2553481633000936, + "grad_norm": 43.24079867084937, + "learning_rate": 1.7626776451274308e-06, + "loss": 1.5195, + "step": 26462 + }, + { + "epoch": 2.255433392994119, + "grad_norm": 26.810814251592923, + "learning_rate": 1.7622997775827767e-06, + "loss": 0.8498, + "step": 26463 + }, + { + "epoch": 2.2555186226881445, + "grad_norm": 36.92807058476514, + "learning_rate": 1.761921941879618e-06, + "loss": 1.3732, + "step": 26464 + }, + { + "epoch": 2.25560385238217, + "grad_norm": 64.76856465963871, + "learning_rate": 1.7615441380216686e-06, + "loss": 1.3225, + "step": 26465 + }, + { + "epoch": 2.2556890820761955, + "grad_norm": 25.761775239521945, + "learning_rate": 1.7611663660126455e-06, + "loss": 0.4722, + "step": 26466 + }, + { + "epoch": 2.255774311770221, + "grad_norm": 34.41856267828645, + "learning_rate": 1.760788625856266e-06, + "loss": 1.3512, + "step": 26467 + }, + { + "epoch": 2.255859541464246, + "grad_norm": 50.654162268398146, + "learning_rate": 1.7604109175562434e-06, + "loss": 1.1506, + "step": 26468 + }, + { + "epoch": 2.2559447711582714, + "grad_norm": 82.05073725861648, + "learning_rate": 1.76003324111629e-06, + "loss": 2.2717, + "step": 26469 + }, + { + "epoch": 2.256030000852297, + "grad_norm": 72.85130292727713, + "learning_rate": 1.7596555965401247e-06, + "loss": 2.2425, + "step": 26470 + }, + { + "epoch": 2.2561152305463223, + "grad_norm": 60.03531495320505, + "learning_rate": 1.7592779838314582e-06, + "loss": 1.8846, + "step": 26471 + }, + { + "epoch": 2.256200460240348, + "grad_norm": 73.18018986413877, + "learning_rate": 1.7589004029940054e-06, + "loss": 2.5207, + "step": 26472 + }, + { + "epoch": 2.2562856899343733, + "grad_norm": 30.23932409375206, + "learning_rate": 1.758522854031478e-06, + "loss": 1.2451, + "step": 26473 + }, + { + "epoch": 2.2563709196283988, + "grad_norm": 43.97930750990208, + "learning_rate": 1.7581453369475898e-06, + "loss": 1.5004, + "step": 26474 + }, + { + "epoch": 2.2564561493224238, + "grad_norm": 41.219185619405806, + "learning_rate": 1.7577678517460555e-06, + "loss": 0.9033, + "step": 26475 + }, + { + "epoch": 2.2565413790164492, + "grad_norm": 47.89568863098868, + "learning_rate": 1.7573903984305862e-06, + "loss": 1.6617, + "step": 26476 + }, + { + "epoch": 2.2566266087104747, + "grad_norm": 29.58439492570002, + "learning_rate": 1.7570129770048944e-06, + "loss": 0.9145, + "step": 26477 + }, + { + "epoch": 2.2567118384045, + "grad_norm": 54.30208127459395, + "learning_rate": 1.7566355874726893e-06, + "loss": 1.4681, + "step": 26478 + }, + { + "epoch": 2.2567970680985256, + "grad_norm": 73.23432583064647, + "learning_rate": 1.7562582298376857e-06, + "loss": 2.0275, + "step": 26479 + }, + { + "epoch": 2.256882297792551, + "grad_norm": 57.91351169262651, + "learning_rate": 1.755880904103594e-06, + "loss": 1.0948, + "step": 26480 + }, + { + "epoch": 2.256967527486576, + "grad_norm": 61.73548597331041, + "learning_rate": 1.7555036102741224e-06, + "loss": 1.8073, + "step": 26481 + }, + { + "epoch": 2.2570527571806016, + "grad_norm": 45.6806256939768, + "learning_rate": 1.755126348352984e-06, + "loss": 1.0237, + "step": 26482 + }, + { + "epoch": 2.257137986874627, + "grad_norm": 25.064873007745135, + "learning_rate": 1.7547491183438896e-06, + "loss": 0.7178, + "step": 26483 + }, + { + "epoch": 2.2572232165686525, + "grad_norm": 44.50963573028566, + "learning_rate": 1.7543719202505488e-06, + "loss": 1.7391, + "step": 26484 + }, + { + "epoch": 2.257308446262678, + "grad_norm": 38.236723647135896, + "learning_rate": 1.75399475407667e-06, + "loss": 1.275, + "step": 26485 + }, + { + "epoch": 2.2573936759567035, + "grad_norm": 34.669054947564206, + "learning_rate": 1.7536176198259614e-06, + "loss": 1.2142, + "step": 26486 + }, + { + "epoch": 2.2574789056507285, + "grad_norm": 24.138435590379316, + "learning_rate": 1.7532405175021354e-06, + "loss": 0.9511, + "step": 26487 + }, + { + "epoch": 2.257564135344754, + "grad_norm": 39.92534430386584, + "learning_rate": 1.7528634471088984e-06, + "loss": 1.4313, + "step": 26488 + }, + { + "epoch": 2.2576493650387794, + "grad_norm": 58.57566443288692, + "learning_rate": 1.7524864086499576e-06, + "loss": 1.9736, + "step": 26489 + }, + { + "epoch": 2.257734594732805, + "grad_norm": 48.882835218540826, + "learning_rate": 1.7521094021290246e-06, + "loss": 1.6326, + "step": 26490 + }, + { + "epoch": 2.2578198244268304, + "grad_norm": 55.01904123082381, + "learning_rate": 1.7517324275498033e-06, + "loss": 1.8202, + "step": 26491 + }, + { + "epoch": 2.257905054120856, + "grad_norm": 42.41249720654764, + "learning_rate": 1.7513554849160043e-06, + "loss": 1.4795, + "step": 26492 + }, + { + "epoch": 2.2579902838148813, + "grad_norm": 57.803008007487165, + "learning_rate": 1.7509785742313328e-06, + "loss": 1.1792, + "step": 26493 + }, + { + "epoch": 2.2580755135089063, + "grad_norm": 52.10237315413315, + "learning_rate": 1.7506016954994965e-06, + "loss": 1.7957, + "step": 26494 + }, + { + "epoch": 2.258160743202932, + "grad_norm": 21.672922098310668, + "learning_rate": 1.7502248487241995e-06, + "loss": 0.842, + "step": 26495 + }, + { + "epoch": 2.2582459728969573, + "grad_norm": 57.049311639925676, + "learning_rate": 1.7498480339091516e-06, + "loss": 2.0479, + "step": 26496 + }, + { + "epoch": 2.2583312025909827, + "grad_norm": 37.6637813718647, + "learning_rate": 1.7494712510580554e-06, + "loss": 0.9333, + "step": 26497 + }, + { + "epoch": 2.258416432285008, + "grad_norm": 69.03569895653871, + "learning_rate": 1.7490945001746185e-06, + "loss": 2.0174, + "step": 26498 + }, + { + "epoch": 2.2585016619790337, + "grad_norm": 58.76050765867124, + "learning_rate": 1.7487177812625445e-06, + "loss": 1.6619, + "step": 26499 + }, + { + "epoch": 2.2585868916730587, + "grad_norm": 37.41839854841527, + "learning_rate": 1.7483410943255409e-06, + "loss": 0.8448, + "step": 26500 + }, + { + "epoch": 2.258672121367084, + "grad_norm": 31.758644161638188, + "learning_rate": 1.74796443936731e-06, + "loss": 0.8168, + "step": 26501 + }, + { + "epoch": 2.2587573510611096, + "grad_norm": 40.12420998744577, + "learning_rate": 1.7475878163915571e-06, + "loss": 1.6581, + "step": 26502 + }, + { + "epoch": 2.258842580755135, + "grad_norm": 34.455860727208815, + "learning_rate": 1.7472112254019857e-06, + "loss": 1.2749, + "step": 26503 + }, + { + "epoch": 2.2589278104491606, + "grad_norm": 20.074926606190104, + "learning_rate": 1.7468346664022974e-06, + "loss": 0.7821, + "step": 26504 + }, + { + "epoch": 2.259013040143186, + "grad_norm": 59.9516109134677, + "learning_rate": 1.7464581393961978e-06, + "loss": 2.0883, + "step": 26505 + }, + { + "epoch": 2.259098269837211, + "grad_norm": 39.678863336148176, + "learning_rate": 1.7460816443873907e-06, + "loss": 1.0492, + "step": 26506 + }, + { + "epoch": 2.2591834995312365, + "grad_norm": 37.394932606964154, + "learning_rate": 1.7457051813795782e-06, + "loss": 1.0635, + "step": 26507 + }, + { + "epoch": 2.259268729225262, + "grad_norm": 43.56292129349237, + "learning_rate": 1.7453287503764603e-06, + "loss": 1.3598, + "step": 26508 + }, + { + "epoch": 2.2593539589192875, + "grad_norm": 62.55112903997578, + "learning_rate": 1.7449523513817424e-06, + "loss": 1.3646, + "step": 26509 + }, + { + "epoch": 2.259439188613313, + "grad_norm": 38.9997748839039, + "learning_rate": 1.7445759843991244e-06, + "loss": 1.5876, + "step": 26510 + }, + { + "epoch": 2.2595244183073384, + "grad_norm": 56.01097474375522, + "learning_rate": 1.7441996494323087e-06, + "loss": 1.3989, + "step": 26511 + }, + { + "epoch": 2.259609648001364, + "grad_norm": 26.789090982626824, + "learning_rate": 1.7438233464849936e-06, + "loss": 0.9562, + "step": 26512 + }, + { + "epoch": 2.259694877695389, + "grad_norm": 57.40520349512622, + "learning_rate": 1.7434470755608822e-06, + "loss": 1.4807, + "step": 26513 + }, + { + "epoch": 2.2597801073894144, + "grad_norm": 67.53117829093324, + "learning_rate": 1.7430708366636761e-06, + "loss": 1.5886, + "step": 26514 + }, + { + "epoch": 2.25986533708344, + "grad_norm": 116.35306763258862, + "learning_rate": 1.742694629797075e-06, + "loss": 3.6329, + "step": 26515 + }, + { + "epoch": 2.2599505667774653, + "grad_norm": 73.30117194040194, + "learning_rate": 1.7423184549647755e-06, + "loss": 1.253, + "step": 26516 + }, + { + "epoch": 2.2600357964714908, + "grad_norm": 66.75876303403834, + "learning_rate": 1.741942312170481e-06, + "loss": 1.9131, + "step": 26517 + }, + { + "epoch": 2.2601210261655162, + "grad_norm": 53.96471694042204, + "learning_rate": 1.7415662014178897e-06, + "loss": 1.6416, + "step": 26518 + }, + { + "epoch": 2.2602062558595417, + "grad_norm": 82.18041362569599, + "learning_rate": 1.7411901227106997e-06, + "loss": 1.5792, + "step": 26519 + }, + { + "epoch": 2.2602914855535667, + "grad_norm": 30.91728274381412, + "learning_rate": 1.7408140760526087e-06, + "loss": 0.8314, + "step": 26520 + }, + { + "epoch": 2.260376715247592, + "grad_norm": 54.739875698666026, + "learning_rate": 1.740438061447316e-06, + "loss": 1.578, + "step": 26521 + }, + { + "epoch": 2.2604619449416177, + "grad_norm": 47.24535902839358, + "learning_rate": 1.7400620788985213e-06, + "loss": 1.0847, + "step": 26522 + }, + { + "epoch": 2.260547174635643, + "grad_norm": 53.63062729429297, + "learning_rate": 1.7396861284099204e-06, + "loss": 1.6242, + "step": 26523 + }, + { + "epoch": 2.2606324043296686, + "grad_norm": 37.012282688540466, + "learning_rate": 1.7393102099852116e-06, + "loss": 0.9064, + "step": 26524 + }, + { + "epoch": 2.2607176340236936, + "grad_norm": 39.4098478844676, + "learning_rate": 1.7389343236280892e-06, + "loss": 1.2152, + "step": 26525 + }, + { + "epoch": 2.260802863717719, + "grad_norm": 42.241484082604586, + "learning_rate": 1.7385584693422536e-06, + "loss": 1.2755, + "step": 26526 + }, + { + "epoch": 2.2608880934117446, + "grad_norm": 52.2314948623275, + "learning_rate": 1.738182647131399e-06, + "loss": 1.531, + "step": 26527 + }, + { + "epoch": 2.26097332310577, + "grad_norm": 44.497558095641025, + "learning_rate": 1.7378068569992206e-06, + "loss": 1.4508, + "step": 26528 + }, + { + "epoch": 2.2610585527997955, + "grad_norm": 93.64676151304175, + "learning_rate": 1.737431098949416e-06, + "loss": 2.9739, + "step": 26529 + }, + { + "epoch": 2.261143782493821, + "grad_norm": 37.32441921961621, + "learning_rate": 1.7370553729856814e-06, + "loss": 0.9291, + "step": 26530 + }, + { + "epoch": 2.2612290121878464, + "grad_norm": 47.48853300683835, + "learning_rate": 1.7366796791117108e-06, + "loss": 1.2807, + "step": 26531 + }, + { + "epoch": 2.2613142418818715, + "grad_norm": 22.79624375946187, + "learning_rate": 1.7363040173311984e-06, + "loss": 0.918, + "step": 26532 + }, + { + "epoch": 2.261399471575897, + "grad_norm": 29.924022853374534, + "learning_rate": 1.7359283876478395e-06, + "loss": 0.9732, + "step": 26533 + }, + { + "epoch": 2.2614847012699224, + "grad_norm": 70.4250382093724, + "learning_rate": 1.7355527900653264e-06, + "loss": 2.4648, + "step": 26534 + }, + { + "epoch": 2.261569930963948, + "grad_norm": 65.08143883970445, + "learning_rate": 1.7351772245873554e-06, + "loss": 1.792, + "step": 26535 + }, + { + "epoch": 2.2616551606579733, + "grad_norm": 52.96124377862941, + "learning_rate": 1.734801691217618e-06, + "loss": 1.2615, + "step": 26536 + }, + { + "epoch": 2.261740390351999, + "grad_norm": 48.62386706185358, + "learning_rate": 1.7344261899598102e-06, + "loss": 1.1278, + "step": 26537 + }, + { + "epoch": 2.2618256200460243, + "grad_norm": 28.223159078056497, + "learning_rate": 1.734050720817621e-06, + "loss": 1.7024, + "step": 26538 + }, + { + "epoch": 2.2619108497400493, + "grad_norm": 52.89732055341162, + "learning_rate": 1.7336752837947473e-06, + "loss": 1.8537, + "step": 26539 + }, + { + "epoch": 2.2619960794340748, + "grad_norm": 81.11957307702491, + "learning_rate": 1.7332998788948797e-06, + "loss": 2.4359, + "step": 26540 + }, + { + "epoch": 2.2620813091281002, + "grad_norm": 30.53761069222928, + "learning_rate": 1.7329245061217088e-06, + "loss": 0.8302, + "step": 26541 + }, + { + "epoch": 2.2621665388221257, + "grad_norm": 47.18899723716997, + "learning_rate": 1.732549165478926e-06, + "loss": 1.7509, + "step": 26542 + }, + { + "epoch": 2.262251768516151, + "grad_norm": 46.20629233122535, + "learning_rate": 1.7321738569702256e-06, + "loss": 1.2449, + "step": 26543 + }, + { + "epoch": 2.262336998210176, + "grad_norm": 23.37021030695726, + "learning_rate": 1.7317985805992949e-06, + "loss": 1.7957, + "step": 26544 + }, + { + "epoch": 2.2624222279042017, + "grad_norm": 42.29108285309917, + "learning_rate": 1.731423336369828e-06, + "loss": 1.4241, + "step": 26545 + }, + { + "epoch": 2.262507457598227, + "grad_norm": 68.52109803726611, + "learning_rate": 1.7310481242855144e-06, + "loss": 2.1927, + "step": 26546 + }, + { + "epoch": 2.2625926872922526, + "grad_norm": 30.60539520400172, + "learning_rate": 1.7306729443500414e-06, + "loss": 1.4574, + "step": 26547 + }, + { + "epoch": 2.262677916986278, + "grad_norm": 48.40830666768301, + "learning_rate": 1.7302977965671026e-06, + "loss": 1.2817, + "step": 26548 + }, + { + "epoch": 2.2627631466803035, + "grad_norm": 46.6523700629378, + "learning_rate": 1.7299226809403858e-06, + "loss": 1.4476, + "step": 26549 + }, + { + "epoch": 2.262848376374329, + "grad_norm": 40.109285219996245, + "learning_rate": 1.7295475974735798e-06, + "loss": 1.1672, + "step": 26550 + }, + { + "epoch": 2.262933606068354, + "grad_norm": 51.8173864741012, + "learning_rate": 1.7291725461703722e-06, + "loss": 1.4697, + "step": 26551 + }, + { + "epoch": 2.2630188357623795, + "grad_norm": 79.40511055854715, + "learning_rate": 1.7287975270344532e-06, + "loss": 2.2742, + "step": 26552 + }, + { + "epoch": 2.263104065456405, + "grad_norm": 81.12102174828104, + "learning_rate": 1.7284225400695114e-06, + "loss": 1.5265, + "step": 26553 + }, + { + "epoch": 2.2631892951504304, + "grad_norm": 31.194759322086075, + "learning_rate": 1.728047585279235e-06, + "loss": 1.3241, + "step": 26554 + }, + { + "epoch": 2.263274524844456, + "grad_norm": 31.24142853086463, + "learning_rate": 1.727672662667308e-06, + "loss": 0.7664, + "step": 26555 + }, + { + "epoch": 2.2633597545384814, + "grad_norm": 39.39760895194818, + "learning_rate": 1.7272977722374224e-06, + "loss": 1.1665, + "step": 26556 + }, + { + "epoch": 2.263444984232507, + "grad_norm": 76.6466377693826, + "learning_rate": 1.7269229139932625e-06, + "loss": 1.9619, + "step": 26557 + }, + { + "epoch": 2.263530213926532, + "grad_norm": 34.86732662308787, + "learning_rate": 1.726548087938515e-06, + "loss": 1.1036, + "step": 26558 + }, + { + "epoch": 2.2636154436205573, + "grad_norm": 46.69239269650707, + "learning_rate": 1.7261732940768644e-06, + "loss": 1.9823, + "step": 26559 + }, + { + "epoch": 2.263700673314583, + "grad_norm": 72.47716994414039, + "learning_rate": 1.7257985324119992e-06, + "loss": 1.4913, + "step": 26560 + }, + { + "epoch": 2.2637859030086083, + "grad_norm": 63.452334451163026, + "learning_rate": 1.7254238029476056e-06, + "loss": 2.0434, + "step": 26561 + }, + { + "epoch": 2.2638711327026337, + "grad_norm": 59.10435258326243, + "learning_rate": 1.7250491056873676e-06, + "loss": 1.6988, + "step": 26562 + }, + { + "epoch": 2.2639563623966588, + "grad_norm": 60.398340001623104, + "learning_rate": 1.7246744406349708e-06, + "loss": 2.0924, + "step": 26563 + }, + { + "epoch": 2.2640415920906842, + "grad_norm": 32.33879710128329, + "learning_rate": 1.7242998077940976e-06, + "loss": 0.5796, + "step": 26564 + }, + { + "epoch": 2.2641268217847097, + "grad_norm": 51.44080719859163, + "learning_rate": 1.7239252071684354e-06, + "loss": 1.9027, + "step": 26565 + }, + { + "epoch": 2.264212051478735, + "grad_norm": 133.03768336192374, + "learning_rate": 1.7235506387616675e-06, + "loss": 2.3495, + "step": 26566 + }, + { + "epoch": 2.2642972811727606, + "grad_norm": 71.34417962939114, + "learning_rate": 1.723176102577475e-06, + "loss": 1.6496, + "step": 26567 + }, + { + "epoch": 2.264382510866786, + "grad_norm": 30.911012898584698, + "learning_rate": 1.722801598619544e-06, + "loss": 1.4255, + "step": 26568 + }, + { + "epoch": 2.2644677405608116, + "grad_norm": 46.31520680679719, + "learning_rate": 1.7224271268915582e-06, + "loss": 1.3426, + "step": 26569 + }, + { + "epoch": 2.2645529702548366, + "grad_norm": 28.89283682308478, + "learning_rate": 1.7220526873971994e-06, + "loss": 1.0915, + "step": 26570 + }, + { + "epoch": 2.264638199948862, + "grad_norm": 28.346025492322962, + "learning_rate": 1.72167828014015e-06, + "loss": 1.1782, + "step": 26571 + }, + { + "epoch": 2.2647234296428875, + "grad_norm": 35.10537337692101, + "learning_rate": 1.7213039051240903e-06, + "loss": 1.0862, + "step": 26572 + }, + { + "epoch": 2.264808659336913, + "grad_norm": 38.036266516915404, + "learning_rate": 1.7209295623527056e-06, + "loss": 1.0807, + "step": 26573 + }, + { + "epoch": 2.2648938890309385, + "grad_norm": 54.35044952533585, + "learning_rate": 1.7205552518296758e-06, + "loss": 1.4133, + "step": 26574 + }, + { + "epoch": 2.264979118724964, + "grad_norm": 30.926599286278297, + "learning_rate": 1.72018097355868e-06, + "loss": 1.0776, + "step": 26575 + }, + { + "epoch": 2.2650643484189894, + "grad_norm": 66.27492947398866, + "learning_rate": 1.7198067275434027e-06, + "loss": 1.8224, + "step": 26576 + }, + { + "epoch": 2.2651495781130144, + "grad_norm": 38.83694655639401, + "learning_rate": 1.7194325137875211e-06, + "loss": 1.4051, + "step": 26577 + }, + { + "epoch": 2.26523480780704, + "grad_norm": 41.988574344465704, + "learning_rate": 1.7190583322947196e-06, + "loss": 1.2905, + "step": 26578 + }, + { + "epoch": 2.2653200375010654, + "grad_norm": 79.7071925847014, + "learning_rate": 1.7186841830686752e-06, + "loss": 1.6897, + "step": 26579 + }, + { + "epoch": 2.265405267195091, + "grad_norm": 42.936022295712796, + "learning_rate": 1.7183100661130675e-06, + "loss": 1.303, + "step": 26580 + }, + { + "epoch": 2.2654904968891163, + "grad_norm": 40.269028780970984, + "learning_rate": 1.717935981431575e-06, + "loss": 1.6645, + "step": 26581 + }, + { + "epoch": 2.2655757265831413, + "grad_norm": 58.95703512844478, + "learning_rate": 1.71756192902788e-06, + "loss": 1.5497, + "step": 26582 + }, + { + "epoch": 2.265660956277167, + "grad_norm": 42.90549858074439, + "learning_rate": 1.717187908905657e-06, + "loss": 1.0211, + "step": 26583 + }, + { + "epoch": 2.2657461859711923, + "grad_norm": 66.64035292270303, + "learning_rate": 1.716813921068588e-06, + "loss": 1.9709, + "step": 26584 + }, + { + "epoch": 2.2658314156652177, + "grad_norm": 33.17630786110671, + "learning_rate": 1.7164399655203478e-06, + "loss": 1.3723, + "step": 26585 + }, + { + "epoch": 2.265916645359243, + "grad_norm": 29.32338351622098, + "learning_rate": 1.716066042264618e-06, + "loss": 0.5935, + "step": 26586 + }, + { + "epoch": 2.2660018750532687, + "grad_norm": 42.02237620270053, + "learning_rate": 1.7156921513050735e-06, + "loss": 1.4137, + "step": 26587 + }, + { + "epoch": 2.266087104747294, + "grad_norm": 65.93544918263684, + "learning_rate": 1.7153182926453914e-06, + "loss": 1.4956, + "step": 26588 + }, + { + "epoch": 2.266172334441319, + "grad_norm": 16.011759282129688, + "learning_rate": 1.7149444662892472e-06, + "loss": 0.6634, + "step": 26589 + }, + { + "epoch": 2.2662575641353446, + "grad_norm": 28.209813355895246, + "learning_rate": 1.7145706722403204e-06, + "loss": 0.8596, + "step": 26590 + }, + { + "epoch": 2.26634279382937, + "grad_norm": 37.147136160953885, + "learning_rate": 1.7141969105022842e-06, + "loss": 1.3464, + "step": 26591 + }, + { + "epoch": 2.2664280235233956, + "grad_norm": 26.67784054335109, + "learning_rate": 1.713823181078817e-06, + "loss": 0.9152, + "step": 26592 + }, + { + "epoch": 2.266513253217421, + "grad_norm": 78.98679067330377, + "learning_rate": 1.7134494839735932e-06, + "loss": 2.569, + "step": 26593 + }, + { + "epoch": 2.2665984829114465, + "grad_norm": 95.62010035955308, + "learning_rate": 1.7130758191902864e-06, + "loss": 2.2991, + "step": 26594 + }, + { + "epoch": 2.266683712605472, + "grad_norm": 22.178819755512254, + "learning_rate": 1.7127021867325738e-06, + "loss": 0.7817, + "step": 26595 + }, + { + "epoch": 2.266768942299497, + "grad_norm": 55.84924865052496, + "learning_rate": 1.7123285866041296e-06, + "loss": 1.2809, + "step": 26596 + }, + { + "epoch": 2.2668541719935225, + "grad_norm": 56.226824822310235, + "learning_rate": 1.711955018808627e-06, + "loss": 1.7638, + "step": 26597 + }, + { + "epoch": 2.266939401687548, + "grad_norm": 38.725335439018046, + "learning_rate": 1.7115814833497385e-06, + "loss": 1.6875, + "step": 26598 + }, + { + "epoch": 2.2670246313815734, + "grad_norm": 51.37256908957952, + "learning_rate": 1.7112079802311398e-06, + "loss": 1.195, + "step": 26599 + }, + { + "epoch": 2.267109861075599, + "grad_norm": 53.78071868033987, + "learning_rate": 1.710834509456505e-06, + "loss": 1.6014, + "step": 26600 + }, + { + "epoch": 2.267195090769624, + "grad_norm": 57.75972871713589, + "learning_rate": 1.710461071029506e-06, + "loss": 1.6279, + "step": 26601 + }, + { + "epoch": 2.2672803204636494, + "grad_norm": 82.93032772934679, + "learning_rate": 1.710087664953814e-06, + "loss": 1.961, + "step": 26602 + }, + { + "epoch": 2.267365550157675, + "grad_norm": 41.05352439171559, + "learning_rate": 1.7097142912331032e-06, + "loss": 1.3404, + "step": 26603 + }, + { + "epoch": 2.2674507798517003, + "grad_norm": 57.49714639266191, + "learning_rate": 1.7093409498710456e-06, + "loss": 1.3926, + "step": 26604 + }, + { + "epoch": 2.2675360095457258, + "grad_norm": 27.598306481500153, + "learning_rate": 1.7089676408713123e-06, + "loss": 0.7759, + "step": 26605 + }, + { + "epoch": 2.2676212392397512, + "grad_norm": 83.22766345780552, + "learning_rate": 1.708594364237573e-06, + "loss": 1.65, + "step": 26606 + }, + { + "epoch": 2.2677064689337767, + "grad_norm": 59.21743256176577, + "learning_rate": 1.7082211199734993e-06, + "loss": 1.9938, + "step": 26607 + }, + { + "epoch": 2.2677916986278017, + "grad_norm": 67.85526387327478, + "learning_rate": 1.7078479080827654e-06, + "loss": 1.4694, + "step": 26608 + }, + { + "epoch": 2.267876928321827, + "grad_norm": 45.731660925772395, + "learning_rate": 1.707474728569039e-06, + "loss": 1.6857, + "step": 26609 + }, + { + "epoch": 2.2679621580158527, + "grad_norm": 64.3594774817813, + "learning_rate": 1.7071015814359904e-06, + "loss": 1.8415, + "step": 26610 + }, + { + "epoch": 2.268047387709878, + "grad_norm": 52.92146632942658, + "learning_rate": 1.7067284666872874e-06, + "loss": 1.4293, + "step": 26611 + }, + { + "epoch": 2.2681326174039036, + "grad_norm": 28.581817883580857, + "learning_rate": 1.7063553843266028e-06, + "loss": 0.924, + "step": 26612 + }, + { + "epoch": 2.268217847097929, + "grad_norm": 57.39633084521717, + "learning_rate": 1.7059823343576043e-06, + "loss": 1.0912, + "step": 26613 + }, + { + "epoch": 2.2683030767919545, + "grad_norm": 38.68751574592193, + "learning_rate": 1.7056093167839594e-06, + "loss": 2.02, + "step": 26614 + }, + { + "epoch": 2.2683883064859796, + "grad_norm": 56.69078655073037, + "learning_rate": 1.7052363316093373e-06, + "loss": 1.6563, + "step": 26615 + }, + { + "epoch": 2.268473536180005, + "grad_norm": 58.8539465485562, + "learning_rate": 1.7048633788374086e-06, + "loss": 1.9644, + "step": 26616 + }, + { + "epoch": 2.2685587658740305, + "grad_norm": 42.04069595080502, + "learning_rate": 1.7044904584718391e-06, + "loss": 1.0645, + "step": 26617 + }, + { + "epoch": 2.268643995568056, + "grad_norm": 29.60276016671933, + "learning_rate": 1.7041175705162966e-06, + "loss": 0.9337, + "step": 26618 + }, + { + "epoch": 2.2687292252620814, + "grad_norm": 59.32332014407429, + "learning_rate": 1.703744714974448e-06, + "loss": 1.2727, + "step": 26619 + }, + { + "epoch": 2.268814454956107, + "grad_norm": 31.700671089910678, + "learning_rate": 1.7033718918499587e-06, + "loss": 1.1688, + "step": 26620 + }, + { + "epoch": 2.268899684650132, + "grad_norm": 66.64775175023244, + "learning_rate": 1.7029991011464986e-06, + "loss": 0.9647, + "step": 26621 + }, + { + "epoch": 2.2689849143441574, + "grad_norm": 68.76958757425258, + "learning_rate": 1.7026263428677303e-06, + "loss": 2.0951, + "step": 26622 + }, + { + "epoch": 2.269070144038183, + "grad_norm": 62.13186533372437, + "learning_rate": 1.7022536170173237e-06, + "loss": 1.3506, + "step": 26623 + }, + { + "epoch": 2.2691553737322083, + "grad_norm": 20.417811703137836, + "learning_rate": 1.7018809235989403e-06, + "loss": 0.9528, + "step": 26624 + }, + { + "epoch": 2.269240603426234, + "grad_norm": 36.97879034200072, + "learning_rate": 1.7015082626162493e-06, + "loss": 1.3147, + "step": 26625 + }, + { + "epoch": 2.2693258331202593, + "grad_norm": 24.854172589832984, + "learning_rate": 1.701135634072914e-06, + "loss": 1.1169, + "step": 26626 + }, + { + "epoch": 2.2694110628142843, + "grad_norm": 105.84302725912288, + "learning_rate": 1.7007630379725987e-06, + "loss": 2.4097, + "step": 26627 + }, + { + "epoch": 2.2694962925083098, + "grad_norm": 66.944850240499, + "learning_rate": 1.7003904743189664e-06, + "loss": 1.8655, + "step": 26628 + }, + { + "epoch": 2.269581522202335, + "grad_norm": 23.23441518412986, + "learning_rate": 1.7000179431156844e-06, + "loss": 0.666, + "step": 26629 + }, + { + "epoch": 2.2696667518963607, + "grad_norm": 28.011726588029905, + "learning_rate": 1.6996454443664128e-06, + "loss": 0.9583, + "step": 26630 + }, + { + "epoch": 2.269751981590386, + "grad_norm": 50.643839756310506, + "learning_rate": 1.699272978074819e-06, + "loss": 1.8848, + "step": 26631 + }, + { + "epoch": 2.2698372112844116, + "grad_norm": 27.29999835939635, + "learning_rate": 1.6989005442445617e-06, + "loss": 0.5743, + "step": 26632 + }, + { + "epoch": 2.269922440978437, + "grad_norm": 52.42795861127742, + "learning_rate": 1.6985281428793076e-06, + "loss": 1.5597, + "step": 26633 + }, + { + "epoch": 2.270007670672462, + "grad_norm": 46.6898981848797, + "learning_rate": 1.6981557739827171e-06, + "loss": 1.2604, + "step": 26634 + }, + { + "epoch": 2.2700929003664876, + "grad_norm": 25.965234494742, + "learning_rate": 1.6977834375584529e-06, + "loss": 0.8609, + "step": 26635 + }, + { + "epoch": 2.270178130060513, + "grad_norm": 38.18798732754991, + "learning_rate": 1.6974111336101762e-06, + "loss": 0.9307, + "step": 26636 + }, + { + "epoch": 2.2702633597545385, + "grad_norm": 54.927310413365035, + "learning_rate": 1.6970388621415474e-06, + "loss": 1.2561, + "step": 26637 + }, + { + "epoch": 2.270348589448564, + "grad_norm": 81.0260639706746, + "learning_rate": 1.6966666231562284e-06, + "loss": 1.9319, + "step": 26638 + }, + { + "epoch": 2.2704338191425895, + "grad_norm": 43.466849573922104, + "learning_rate": 1.6962944166578827e-06, + "loss": 1.1184, + "step": 26639 + }, + { + "epoch": 2.2705190488366145, + "grad_norm": 25.67605437533586, + "learning_rate": 1.6959222426501682e-06, + "loss": 0.7896, + "step": 26640 + }, + { + "epoch": 2.27060427853064, + "grad_norm": 41.460925961387055, + "learning_rate": 1.6955501011367442e-06, + "loss": 1.2176, + "step": 26641 + }, + { + "epoch": 2.2706895082246654, + "grad_norm": 36.15591226314625, + "learning_rate": 1.695177992121273e-06, + "loss": 1.2188, + "step": 26642 + }, + { + "epoch": 2.270774737918691, + "grad_norm": 43.92043359953537, + "learning_rate": 1.6948059156074137e-06, + "loss": 1.4901, + "step": 26643 + }, + { + "epoch": 2.2708599676127164, + "grad_norm": 61.7691763763442, + "learning_rate": 1.6944338715988245e-06, + "loss": 1.5229, + "step": 26644 + }, + { + "epoch": 2.270945197306742, + "grad_norm": 47.80475557511559, + "learning_rate": 1.6940618600991631e-06, + "loss": 1.577, + "step": 26645 + }, + { + "epoch": 2.271030427000767, + "grad_norm": 105.75462647281132, + "learning_rate": 1.69368988111209e-06, + "loss": 2.5612, + "step": 26646 + }, + { + "epoch": 2.2711156566947923, + "grad_norm": 35.59575527269845, + "learning_rate": 1.6933179346412643e-06, + "loss": 1.0599, + "step": 26647 + }, + { + "epoch": 2.271200886388818, + "grad_norm": 35.75351474463685, + "learning_rate": 1.6929460206903424e-06, + "loss": 1.2352, + "step": 26648 + }, + { + "epoch": 2.2712861160828433, + "grad_norm": 25.65646620261647, + "learning_rate": 1.692574139262983e-06, + "loss": 0.9088, + "step": 26649 + }, + { + "epoch": 2.2713713457768687, + "grad_norm": 54.26851163817395, + "learning_rate": 1.692202290362841e-06, + "loss": 1.5949, + "step": 26650 + }, + { + "epoch": 2.271456575470894, + "grad_norm": 49.9778649091622, + "learning_rate": 1.6918304739935764e-06, + "loss": 1.8441, + "step": 26651 + }, + { + "epoch": 2.2715418051649197, + "grad_norm": 51.29284832189405, + "learning_rate": 1.6914586901588448e-06, + "loss": 1.4181, + "step": 26652 + }, + { + "epoch": 2.2716270348589447, + "grad_norm": 62.10761172992062, + "learning_rate": 1.6910869388623008e-06, + "loss": 2.0562, + "step": 26653 + }, + { + "epoch": 2.27171226455297, + "grad_norm": 14.078584959109468, + "learning_rate": 1.6907152201076017e-06, + "loss": 0.6324, + "step": 26654 + }, + { + "epoch": 2.2717974942469956, + "grad_norm": 61.628726561698876, + "learning_rate": 1.6903435338984053e-06, + "loss": 2.2579, + "step": 26655 + }, + { + "epoch": 2.271882723941021, + "grad_norm": 45.39720920031166, + "learning_rate": 1.6899718802383646e-06, + "loss": 1.3725, + "step": 26656 + }, + { + "epoch": 2.2719679536350466, + "grad_norm": 52.89974140127826, + "learning_rate": 1.6896002591311356e-06, + "loss": 1.6664, + "step": 26657 + }, + { + "epoch": 2.272053183329072, + "grad_norm": 50.85486523638874, + "learning_rate": 1.6892286705803705e-06, + "loss": 1.4077, + "step": 26658 + }, + { + "epoch": 2.2721384130230975, + "grad_norm": 52.888364637881224, + "learning_rate": 1.6888571145897282e-06, + "loss": 1.1684, + "step": 26659 + }, + { + "epoch": 2.2722236427171225, + "grad_norm": 38.93219408206318, + "learning_rate": 1.68848559116286e-06, + "loss": 1.2005, + "step": 26660 + }, + { + "epoch": 2.272308872411148, + "grad_norm": 81.21170233758015, + "learning_rate": 1.688114100303419e-06, + "loss": 2.0778, + "step": 26661 + }, + { + "epoch": 2.2723941021051735, + "grad_norm": 35.96198865125829, + "learning_rate": 1.6877426420150595e-06, + "loss": 0.7084, + "step": 26662 + }, + { + "epoch": 2.272479331799199, + "grad_norm": 64.42162331685604, + "learning_rate": 1.687371216301436e-06, + "loss": 1.4507, + "step": 26663 + }, + { + "epoch": 2.2725645614932244, + "grad_norm": 34.03554148356523, + "learning_rate": 1.686999823166201e-06, + "loss": 1.5712, + "step": 26664 + }, + { + "epoch": 2.2726497911872494, + "grad_norm": 69.61100910194774, + "learning_rate": 1.6866284626130064e-06, + "loss": 2.1524, + "step": 26665 + }, + { + "epoch": 2.272735020881275, + "grad_norm": 38.79751055468496, + "learning_rate": 1.6862571346455037e-06, + "loss": 1.5432, + "step": 26666 + }, + { + "epoch": 2.2728202505753003, + "grad_norm": 32.98619249218716, + "learning_rate": 1.685885839267344e-06, + "loss": 1.2068, + "step": 26667 + }, + { + "epoch": 2.272905480269326, + "grad_norm": 72.33924229014437, + "learning_rate": 1.6855145764821817e-06, + "loss": 1.5086, + "step": 26668 + }, + { + "epoch": 2.2729907099633513, + "grad_norm": 38.671768183370254, + "learning_rate": 1.6851433462936651e-06, + "loss": 0.9534, + "step": 26669 + }, + { + "epoch": 2.2730759396573768, + "grad_norm": 33.0964538702873, + "learning_rate": 1.6847721487054481e-06, + "loss": 1.4194, + "step": 26670 + }, + { + "epoch": 2.2731611693514022, + "grad_norm": 44.48381788356436, + "learning_rate": 1.6844009837211778e-06, + "loss": 1.3144, + "step": 26671 + }, + { + "epoch": 2.2732463990454272, + "grad_norm": 36.39023879183813, + "learning_rate": 1.684029851344508e-06, + "loss": 1.2432, + "step": 26672 + }, + { + "epoch": 2.2733316287394527, + "grad_norm": 71.92057128495543, + "learning_rate": 1.683658751579087e-06, + "loss": 1.6242, + "step": 26673 + }, + { + "epoch": 2.273416858433478, + "grad_norm": 40.375172541595404, + "learning_rate": 1.6832876844285644e-06, + "loss": 1.2274, + "step": 26674 + }, + { + "epoch": 2.2735020881275037, + "grad_norm": 69.52352830846827, + "learning_rate": 1.6829166498965877e-06, + "loss": 1.6899, + "step": 26675 + }, + { + "epoch": 2.273587317821529, + "grad_norm": 43.49897736761073, + "learning_rate": 1.6825456479868096e-06, + "loss": 1.1699, + "step": 26676 + }, + { + "epoch": 2.2736725475155546, + "grad_norm": 32.76358038932018, + "learning_rate": 1.6821746787028754e-06, + "loss": 0.8501, + "step": 26677 + }, + { + "epoch": 2.27375777720958, + "grad_norm": 53.19345336680932, + "learning_rate": 1.6818037420484356e-06, + "loss": 1.7486, + "step": 26678 + }, + { + "epoch": 2.273843006903605, + "grad_norm": 42.170689016817455, + "learning_rate": 1.6814328380271383e-06, + "loss": 1.3544, + "step": 26679 + }, + { + "epoch": 2.2739282365976305, + "grad_norm": 39.571701897916256, + "learning_rate": 1.6810619666426286e-06, + "loss": 1.4368, + "step": 26680 + }, + { + "epoch": 2.274013466291656, + "grad_norm": 33.23828831540925, + "learning_rate": 1.680691127898557e-06, + "loss": 0.863, + "step": 26681 + }, + { + "epoch": 2.2740986959856815, + "grad_norm": 56.28616684031979, + "learning_rate": 1.6803203217985693e-06, + "loss": 1.302, + "step": 26682 + }, + { + "epoch": 2.274183925679707, + "grad_norm": 32.4828034876586, + "learning_rate": 1.6799495483463123e-06, + "loss": 1.2722, + "step": 26683 + }, + { + "epoch": 2.274269155373732, + "grad_norm": 86.75354133537707, + "learning_rate": 1.6795788075454306e-06, + "loss": 2.0806, + "step": 26684 + }, + { + "epoch": 2.2743543850677574, + "grad_norm": 22.350745602470255, + "learning_rate": 1.679208099399572e-06, + "loss": 1.0133, + "step": 26685 + }, + { + "epoch": 2.274439614761783, + "grad_norm": 32.209801786030674, + "learning_rate": 1.6788374239123833e-06, + "loss": 0.8914, + "step": 26686 + }, + { + "epoch": 2.2745248444558084, + "grad_norm": 47.100201863289385, + "learning_rate": 1.6784667810875093e-06, + "loss": 1.2376, + "step": 26687 + }, + { + "epoch": 2.274610074149834, + "grad_norm": 49.33516526606506, + "learning_rate": 1.6780961709285926e-06, + "loss": 1.1832, + "step": 26688 + }, + { + "epoch": 2.2746953038438593, + "grad_norm": 31.45942146920498, + "learning_rate": 1.6777255934392821e-06, + "loss": 1.1156, + "step": 26689 + }, + { + "epoch": 2.274780533537885, + "grad_norm": 57.63039745800598, + "learning_rate": 1.6773550486232198e-06, + "loss": 1.6032, + "step": 26690 + }, + { + "epoch": 2.27486576323191, + "grad_norm": 40.48255212794638, + "learning_rate": 1.676984536484051e-06, + "loss": 1.1899, + "step": 26691 + }, + { + "epoch": 2.2749509929259353, + "grad_norm": 68.20043978577456, + "learning_rate": 1.676614057025417e-06, + "loss": 1.4688, + "step": 26692 + }, + { + "epoch": 2.2750362226199607, + "grad_norm": 44.264033764239656, + "learning_rate": 1.6762436102509627e-06, + "loss": 0.9235, + "step": 26693 + }, + { + "epoch": 2.275121452313986, + "grad_norm": 29.459817577656118, + "learning_rate": 1.6758731961643338e-06, + "loss": 0.8924, + "step": 26694 + }, + { + "epoch": 2.2752066820080117, + "grad_norm": 60.23267361678932, + "learning_rate": 1.675502814769171e-06, + "loss": 1.7378, + "step": 26695 + }, + { + "epoch": 2.275291911702037, + "grad_norm": 31.160388795791533, + "learning_rate": 1.6751324660691171e-06, + "loss": 1.0335, + "step": 26696 + }, + { + "epoch": 2.2753771413960626, + "grad_norm": 23.215458068656016, + "learning_rate": 1.6747621500678124e-06, + "loss": 0.4657, + "step": 26697 + }, + { + "epoch": 2.2754623710900876, + "grad_norm": 31.593760733789775, + "learning_rate": 1.6743918667689024e-06, + "loss": 1.1859, + "step": 26698 + }, + { + "epoch": 2.275547600784113, + "grad_norm": 31.351788147053522, + "learning_rate": 1.674021616176027e-06, + "loss": 1.1627, + "step": 26699 + }, + { + "epoch": 2.2756328304781386, + "grad_norm": 39.769704541493134, + "learning_rate": 1.6736513982928255e-06, + "loss": 1.1834, + "step": 26700 + }, + { + "epoch": 2.275718060172164, + "grad_norm": 19.664468923472896, + "learning_rate": 1.673281213122941e-06, + "loss": 0.8904, + "step": 26701 + }, + { + "epoch": 2.2758032898661895, + "grad_norm": 23.868459636310103, + "learning_rate": 1.6729110606700156e-06, + "loss": 0.6974, + "step": 26702 + }, + { + "epoch": 2.2758885195602145, + "grad_norm": 20.849731711042185, + "learning_rate": 1.6725409409376874e-06, + "loss": 0.8044, + "step": 26703 + }, + { + "epoch": 2.27597374925424, + "grad_norm": 25.822331118778617, + "learning_rate": 1.6721708539295972e-06, + "loss": 1.0344, + "step": 26704 + }, + { + "epoch": 2.2760589789482655, + "grad_norm": 21.92434467866152, + "learning_rate": 1.671800799649383e-06, + "loss": 0.8145, + "step": 26705 + }, + { + "epoch": 2.276144208642291, + "grad_norm": 71.82016792076625, + "learning_rate": 1.671430778100686e-06, + "loss": 1.1407, + "step": 26706 + }, + { + "epoch": 2.2762294383363164, + "grad_norm": 61.13950119219431, + "learning_rate": 1.6710607892871456e-06, + "loss": 1.8162, + "step": 26707 + }, + { + "epoch": 2.276314668030342, + "grad_norm": 38.63110075096363, + "learning_rate": 1.6706908332123978e-06, + "loss": 1.1132, + "step": 26708 + }, + { + "epoch": 2.2763998977243673, + "grad_norm": 72.26370855431546, + "learning_rate": 1.6703209098800838e-06, + "loss": 2.2193, + "step": 26709 + }, + { + "epoch": 2.2764851274183924, + "grad_norm": 36.20520908192837, + "learning_rate": 1.6699510192938395e-06, + "loss": 1.2486, + "step": 26710 + }, + { + "epoch": 2.276570357112418, + "grad_norm": 68.83987581276725, + "learning_rate": 1.6695811614573049e-06, + "loss": 2.5367, + "step": 26711 + }, + { + "epoch": 2.2766555868064433, + "grad_norm": 36.51722526992757, + "learning_rate": 1.669211336374117e-06, + "loss": 1.3438, + "step": 26712 + }, + { + "epoch": 2.276740816500469, + "grad_norm": 63.321612528809894, + "learning_rate": 1.668841544047911e-06, + "loss": 2.3018, + "step": 26713 + }, + { + "epoch": 2.2768260461944942, + "grad_norm": 50.03003016983614, + "learning_rate": 1.668471784482324e-06, + "loss": 1.6482, + "step": 26714 + }, + { + "epoch": 2.2769112758885197, + "grad_norm": 41.567271531109384, + "learning_rate": 1.6681020576809943e-06, + "loss": 1.4631, + "step": 26715 + }, + { + "epoch": 2.276996505582545, + "grad_norm": 51.96087000335379, + "learning_rate": 1.6677323636475551e-06, + "loss": 1.4079, + "step": 26716 + }, + { + "epoch": 2.27708173527657, + "grad_norm": 40.34243500619396, + "learning_rate": 1.667362702385646e-06, + "loss": 1.4609, + "step": 26717 + }, + { + "epoch": 2.2771669649705957, + "grad_norm": 41.696836066852576, + "learning_rate": 1.6669930738988987e-06, + "loss": 1.2529, + "step": 26718 + }, + { + "epoch": 2.277252194664621, + "grad_norm": 58.17397955385339, + "learning_rate": 1.6666234781909518e-06, + "loss": 2.0276, + "step": 26719 + }, + { + "epoch": 2.2773374243586466, + "grad_norm": 55.90811033597443, + "learning_rate": 1.6662539152654384e-06, + "loss": 2.0608, + "step": 26720 + }, + { + "epoch": 2.277422654052672, + "grad_norm": 58.37215199436437, + "learning_rate": 1.6658843851259932e-06, + "loss": 1.9443, + "step": 26721 + }, + { + "epoch": 2.277507883746697, + "grad_norm": 62.29779970934488, + "learning_rate": 1.66551488777625e-06, + "loss": 2.2337, + "step": 26722 + }, + { + "epoch": 2.2775931134407226, + "grad_norm": 52.90670238584342, + "learning_rate": 1.6651454232198415e-06, + "loss": 1.5776, + "step": 26723 + }, + { + "epoch": 2.277678343134748, + "grad_norm": 52.59718859408336, + "learning_rate": 1.6647759914604022e-06, + "loss": 1.623, + "step": 26724 + }, + { + "epoch": 2.2777635728287735, + "grad_norm": 48.98650421574988, + "learning_rate": 1.664406592501568e-06, + "loss": 1.2802, + "step": 26725 + }, + { + "epoch": 2.277848802522799, + "grad_norm": 24.704403311652694, + "learning_rate": 1.664037226346969e-06, + "loss": 1.0387, + "step": 26726 + }, + { + "epoch": 2.2779340322168244, + "grad_norm": 81.28029507187868, + "learning_rate": 1.663667893000237e-06, + "loss": 2.1198, + "step": 26727 + }, + { + "epoch": 2.27801926191085, + "grad_norm": 58.895700933666156, + "learning_rate": 1.663298592465007e-06, + "loss": 2.2523, + "step": 26728 + }, + { + "epoch": 2.278104491604875, + "grad_norm": 36.495126437023586, + "learning_rate": 1.6629293247449096e-06, + "loss": 1.6136, + "step": 26729 + }, + { + "epoch": 2.2781897212989004, + "grad_norm": 42.04031332314047, + "learning_rate": 1.6625600898435757e-06, + "loss": 1.2156, + "step": 26730 + }, + { + "epoch": 2.278274950992926, + "grad_norm": 27.105751787664126, + "learning_rate": 1.6621908877646364e-06, + "loss": 1.1275, + "step": 26731 + }, + { + "epoch": 2.2783601806869513, + "grad_norm": 52.038219083130365, + "learning_rate": 1.6618217185117224e-06, + "loss": 1.2914, + "step": 26732 + }, + { + "epoch": 2.278445410380977, + "grad_norm": 41.65008149534722, + "learning_rate": 1.661452582088468e-06, + "loss": 1.7544, + "step": 26733 + }, + { + "epoch": 2.2785306400750023, + "grad_norm": 71.73515443301201, + "learning_rate": 1.6610834784985003e-06, + "loss": 2.6924, + "step": 26734 + }, + { + "epoch": 2.2786158697690277, + "grad_norm": 60.62425602789488, + "learning_rate": 1.6607144077454484e-06, + "loss": 1.2279, + "step": 26735 + }, + { + "epoch": 2.2787010994630528, + "grad_norm": 66.59925190064152, + "learning_rate": 1.6603453698329446e-06, + "loss": 1.4187, + "step": 26736 + }, + { + "epoch": 2.2787863291570782, + "grad_norm": 65.80610490462544, + "learning_rate": 1.6599763647646173e-06, + "loss": 1.304, + "step": 26737 + }, + { + "epoch": 2.2788715588511037, + "grad_norm": 56.49804557301295, + "learning_rate": 1.6596073925440953e-06, + "loss": 1.3288, + "step": 26738 + }, + { + "epoch": 2.278956788545129, + "grad_norm": 63.04945911916454, + "learning_rate": 1.6592384531750055e-06, + "loss": 1.7291, + "step": 26739 + }, + { + "epoch": 2.2790420182391546, + "grad_norm": 49.94400189446566, + "learning_rate": 1.6588695466609783e-06, + "loss": 2.0058, + "step": 26740 + }, + { + "epoch": 2.27912724793318, + "grad_norm": 48.30436220766406, + "learning_rate": 1.658500673005643e-06, + "loss": 1.5824, + "step": 26741 + }, + { + "epoch": 2.279212477627205, + "grad_norm": 25.989253314059567, + "learning_rate": 1.6581318322126261e-06, + "loss": 0.9457, + "step": 26742 + }, + { + "epoch": 2.2792977073212306, + "grad_norm": 53.17915999615902, + "learning_rate": 1.6577630242855542e-06, + "loss": 1.4108, + "step": 26743 + }, + { + "epoch": 2.279382937015256, + "grad_norm": 27.672794234359344, + "learning_rate": 1.6573942492280532e-06, + "loss": 0.6574, + "step": 26744 + }, + { + "epoch": 2.2794681667092815, + "grad_norm": 31.539975651561146, + "learning_rate": 1.6570255070437536e-06, + "loss": 1.5239, + "step": 26745 + }, + { + "epoch": 2.279553396403307, + "grad_norm": 104.15720460250095, + "learning_rate": 1.6566567977362796e-06, + "loss": 1.3185, + "step": 26746 + }, + { + "epoch": 2.2796386260973325, + "grad_norm": 82.44593696246547, + "learning_rate": 1.6562881213092557e-06, + "loss": 2.8392, + "step": 26747 + }, + { + "epoch": 2.2797238557913575, + "grad_norm": 20.852318250801773, + "learning_rate": 1.6559194777663096e-06, + "loss": 0.7954, + "step": 26748 + }, + { + "epoch": 2.279809085485383, + "grad_norm": 22.27844896559725, + "learning_rate": 1.655550867111068e-06, + "loss": 1.2223, + "step": 26749 + }, + { + "epoch": 2.2798943151794084, + "grad_norm": 24.78509318652656, + "learning_rate": 1.6551822893471553e-06, + "loss": 1.2169, + "step": 26750 + }, + { + "epoch": 2.279979544873434, + "grad_norm": 18.726924870401024, + "learning_rate": 1.6548137444781947e-06, + "loss": 0.8088, + "step": 26751 + }, + { + "epoch": 2.2800647745674594, + "grad_norm": 41.46420729676747, + "learning_rate": 1.6544452325078125e-06, + "loss": 0.9265, + "step": 26752 + }, + { + "epoch": 2.280150004261485, + "grad_norm": 28.620024946667304, + "learning_rate": 1.6540767534396302e-06, + "loss": 0.7694, + "step": 26753 + }, + { + "epoch": 2.2802352339555103, + "grad_norm": 50.21924531862669, + "learning_rate": 1.6537083072772753e-06, + "loss": 1.4522, + "step": 26754 + }, + { + "epoch": 2.2803204636495353, + "grad_norm": 49.8135765344774, + "learning_rate": 1.6533398940243672e-06, + "loss": 1.5763, + "step": 26755 + }, + { + "epoch": 2.280405693343561, + "grad_norm": 71.8035148526492, + "learning_rate": 1.6529715136845337e-06, + "loss": 1.521, + "step": 26756 + }, + { + "epoch": 2.2804909230375863, + "grad_norm": 57.856804866698944, + "learning_rate": 1.6526031662613934e-06, + "loss": 1.7617, + "step": 26757 + }, + { + "epoch": 2.2805761527316117, + "grad_norm": 41.18006207632388, + "learning_rate": 1.652234851758573e-06, + "loss": 1.4464, + "step": 26758 + }, + { + "epoch": 2.280661382425637, + "grad_norm": 39.90055054352282, + "learning_rate": 1.6518665701796922e-06, + "loss": 1.2072, + "step": 26759 + }, + { + "epoch": 2.2807466121196627, + "grad_norm": 23.29470812663668, + "learning_rate": 1.6514983215283732e-06, + "loss": 1.0357, + "step": 26760 + }, + { + "epoch": 2.2808318418136877, + "grad_norm": 63.56617396050157, + "learning_rate": 1.6511301058082357e-06, + "loss": 1.2058, + "step": 26761 + }, + { + "epoch": 2.280917071507713, + "grad_norm": 18.249523411646965, + "learning_rate": 1.650761923022905e-06, + "loss": 1.074, + "step": 26762 + }, + { + "epoch": 2.2810023012017386, + "grad_norm": 55.95337330799033, + "learning_rate": 1.6503937731759983e-06, + "loss": 1.9036, + "step": 26763 + }, + { + "epoch": 2.281087530895764, + "grad_norm": 44.30782884879605, + "learning_rate": 1.6500256562711392e-06, + "loss": 1.1084, + "step": 26764 + }, + { + "epoch": 2.2811727605897896, + "grad_norm": 65.997531823546, + "learning_rate": 1.6496575723119469e-06, + "loss": 1.5227, + "step": 26765 + }, + { + "epoch": 2.281257990283815, + "grad_norm": 36.425951630170104, + "learning_rate": 1.6492895213020393e-06, + "loss": 1.2278, + "step": 26766 + }, + { + "epoch": 2.28134321997784, + "grad_norm": 31.667562094217416, + "learning_rate": 1.6489215032450396e-06, + "loss": 1.2781, + "step": 26767 + }, + { + "epoch": 2.2814284496718655, + "grad_norm": 85.51534832316797, + "learning_rate": 1.6485535181445656e-06, + "loss": 2.0902, + "step": 26768 + }, + { + "epoch": 2.281513679365891, + "grad_norm": 43.36740364378382, + "learning_rate": 1.648185566004235e-06, + "loss": 1.2624, + "step": 26769 + }, + { + "epoch": 2.2815989090599165, + "grad_norm": 58.378660423734054, + "learning_rate": 1.6478176468276668e-06, + "loss": 1.2894, + "step": 26770 + }, + { + "epoch": 2.281684138753942, + "grad_norm": 36.62096923819259, + "learning_rate": 1.6474497606184797e-06, + "loss": 1.2856, + "step": 26771 + }, + { + "epoch": 2.2817693684479674, + "grad_norm": 42.53432736072946, + "learning_rate": 1.6470819073802934e-06, + "loss": 1.4975, + "step": 26772 + }, + { + "epoch": 2.281854598141993, + "grad_norm": 22.705528831866236, + "learning_rate": 1.6467140871167247e-06, + "loss": 1.3213, + "step": 26773 + }, + { + "epoch": 2.281939827836018, + "grad_norm": 40.52193671957097, + "learning_rate": 1.6463462998313884e-06, + "loss": 0.9951, + "step": 26774 + }, + { + "epoch": 2.2820250575300434, + "grad_norm": 57.008729063619796, + "learning_rate": 1.6459785455279059e-06, + "loss": 1.4512, + "step": 26775 + }, + { + "epoch": 2.282110287224069, + "grad_norm": 31.750636832341662, + "learning_rate": 1.6456108242098912e-06, + "loss": 1.0879, + "step": 26776 + }, + { + "epoch": 2.2821955169180943, + "grad_norm": 36.15983407176227, + "learning_rate": 1.645243135880961e-06, + "loss": 1.0267, + "step": 26777 + }, + { + "epoch": 2.2822807466121198, + "grad_norm": 47.71939111726638, + "learning_rate": 1.6448754805447304e-06, + "loss": 1.2932, + "step": 26778 + }, + { + "epoch": 2.2823659763061452, + "grad_norm": 46.12607006481641, + "learning_rate": 1.6445078582048158e-06, + "loss": 1.1656, + "step": 26779 + }, + { + "epoch": 2.2824512060001707, + "grad_norm": 42.14760896841628, + "learning_rate": 1.6441402688648344e-06, + "loss": 0.769, + "step": 26780 + }, + { + "epoch": 2.2825364356941957, + "grad_norm": 50.07569799490888, + "learning_rate": 1.643772712528401e-06, + "loss": 1.5272, + "step": 26781 + }, + { + "epoch": 2.282621665388221, + "grad_norm": 21.13134024753842, + "learning_rate": 1.6434051891991282e-06, + "loss": 1.2871, + "step": 26782 + }, + { + "epoch": 2.2827068950822467, + "grad_norm": 34.16559456724306, + "learning_rate": 1.6430376988806307e-06, + "loss": 0.8054, + "step": 26783 + }, + { + "epoch": 2.282792124776272, + "grad_norm": 58.91560296594263, + "learning_rate": 1.642670241576525e-06, + "loss": 1.2343, + "step": 26784 + }, + { + "epoch": 2.2828773544702976, + "grad_norm": 20.657996505551132, + "learning_rate": 1.6423028172904231e-06, + "loss": 0.7734, + "step": 26785 + }, + { + "epoch": 2.2829625841643226, + "grad_norm": 43.56965729039004, + "learning_rate": 1.6419354260259373e-06, + "loss": 1.6867, + "step": 26786 + }, + { + "epoch": 2.283047813858348, + "grad_norm": 47.613183374138885, + "learning_rate": 1.6415680677866825e-06, + "loss": 1.2527, + "step": 26787 + }, + { + "epoch": 2.2831330435523736, + "grad_norm": 37.65485247548994, + "learning_rate": 1.6412007425762727e-06, + "loss": 0.8502, + "step": 26788 + }, + { + "epoch": 2.283218273246399, + "grad_norm": 49.59525575534542, + "learning_rate": 1.6408334503983185e-06, + "loss": 1.7912, + "step": 26789 + }, + { + "epoch": 2.2833035029404245, + "grad_norm": 50.230781713398855, + "learning_rate": 1.640466191256433e-06, + "loss": 1.2003, + "step": 26790 + }, + { + "epoch": 2.28338873263445, + "grad_norm": 41.48278488145914, + "learning_rate": 1.640098965154226e-06, + "loss": 1.368, + "step": 26791 + }, + { + "epoch": 2.2834739623284754, + "grad_norm": 42.73542918662257, + "learning_rate": 1.6397317720953122e-06, + "loss": 1.4765, + "step": 26792 + }, + { + "epoch": 2.2835591920225005, + "grad_norm": 31.47856826233571, + "learning_rate": 1.6393646120833007e-06, + "loss": 1.5017, + "step": 26793 + }, + { + "epoch": 2.283644421716526, + "grad_norm": 44.16562404620393, + "learning_rate": 1.6389974851218016e-06, + "loss": 0.8808, + "step": 26794 + }, + { + "epoch": 2.2837296514105514, + "grad_norm": 67.26872536295927, + "learning_rate": 1.6386303912144286e-06, + "loss": 1.5823, + "step": 26795 + }, + { + "epoch": 2.283814881104577, + "grad_norm": 35.335372630419876, + "learning_rate": 1.6382633303647877e-06, + "loss": 1.2594, + "step": 26796 + }, + { + "epoch": 2.2839001107986023, + "grad_norm": 43.55296280015199, + "learning_rate": 1.6378963025764933e-06, + "loss": 1.9078, + "step": 26797 + }, + { + "epoch": 2.283985340492628, + "grad_norm": 62.864129749816, + "learning_rate": 1.6375293078531529e-06, + "loss": 1.5515, + "step": 26798 + }, + { + "epoch": 2.2840705701866533, + "grad_norm": 39.972717777115975, + "learning_rate": 1.6371623461983755e-06, + "loss": 1.6241, + "step": 26799 + }, + { + "epoch": 2.2841557998806783, + "grad_norm": 39.48376641076476, + "learning_rate": 1.636795417615768e-06, + "loss": 0.8586, + "step": 26800 + }, + { + "epoch": 2.2842410295747038, + "grad_norm": 50.34947334590392, + "learning_rate": 1.6364285221089433e-06, + "loss": 1.5988, + "step": 26801 + }, + { + "epoch": 2.2843262592687292, + "grad_norm": 52.25447128946416, + "learning_rate": 1.636061659681506e-06, + "loss": 1.6217, + "step": 26802 + }, + { + "epoch": 2.2844114889627547, + "grad_norm": 86.24709837550591, + "learning_rate": 1.6356948303370668e-06, + "loss": 1.9319, + "step": 26803 + }, + { + "epoch": 2.28449671865678, + "grad_norm": 43.74622691707163, + "learning_rate": 1.6353280340792305e-06, + "loss": 1.1513, + "step": 26804 + }, + { + "epoch": 2.284581948350805, + "grad_norm": 31.011214671141996, + "learning_rate": 1.6349612709116081e-06, + "loss": 1.3973, + "step": 26805 + }, + { + "epoch": 2.2846671780448307, + "grad_norm": 76.17466042262606, + "learning_rate": 1.634594540837804e-06, + "loss": 1.8196, + "step": 26806 + }, + { + "epoch": 2.284752407738856, + "grad_norm": 35.50820844499832, + "learning_rate": 1.6342278438614256e-06, + "loss": 1.602, + "step": 26807 + }, + { + "epoch": 2.2848376374328816, + "grad_norm": 59.03137012362029, + "learning_rate": 1.633861179986077e-06, + "loss": 1.4878, + "step": 26808 + }, + { + "epoch": 2.284922867126907, + "grad_norm": 44.13877568221111, + "learning_rate": 1.633494549215368e-06, + "loss": 1.5298, + "step": 26809 + }, + { + "epoch": 2.2850080968209325, + "grad_norm": 28.101841567327906, + "learning_rate": 1.633127951552901e-06, + "loss": 0.8766, + "step": 26810 + }, + { + "epoch": 2.285093326514958, + "grad_norm": 37.64610342209649, + "learning_rate": 1.632761387002284e-06, + "loss": 1.252, + "step": 26811 + }, + { + "epoch": 2.285178556208983, + "grad_norm": 72.99823370116646, + "learning_rate": 1.6323948555671204e-06, + "loss": 1.6855, + "step": 26812 + }, + { + "epoch": 2.2852637859030085, + "grad_norm": 39.895145951616485, + "learning_rate": 1.6320283572510142e-06, + "loss": 1.4947, + "step": 26813 + }, + { + "epoch": 2.285349015597034, + "grad_norm": 57.18343877232371, + "learning_rate": 1.6316618920575722e-06, + "loss": 1.8332, + "step": 26814 + }, + { + "epoch": 2.2854342452910594, + "grad_norm": 17.44775738378969, + "learning_rate": 1.6312954599903974e-06, + "loss": 0.5615, + "step": 26815 + }, + { + "epoch": 2.285519474985085, + "grad_norm": 27.81650618420665, + "learning_rate": 1.6309290610530926e-06, + "loss": 1.0059, + "step": 26816 + }, + { + "epoch": 2.2856047046791104, + "grad_norm": 23.214807909700113, + "learning_rate": 1.6305626952492604e-06, + "loss": 0.9956, + "step": 26817 + }, + { + "epoch": 2.285689934373136, + "grad_norm": 79.18490297985807, + "learning_rate": 1.6301963625825052e-06, + "loss": 0.9857, + "step": 26818 + }, + { + "epoch": 2.285775164067161, + "grad_norm": 40.331252733409244, + "learning_rate": 1.6298300630564312e-06, + "loss": 1.2729, + "step": 26819 + }, + { + "epoch": 2.2858603937611863, + "grad_norm": 59.25249876380314, + "learning_rate": 1.6294637966746397e-06, + "loss": 1.1721, + "step": 26820 + }, + { + "epoch": 2.285945623455212, + "grad_norm": 62.202565876754086, + "learning_rate": 1.6290975634407309e-06, + "loss": 2.0992, + "step": 26821 + }, + { + "epoch": 2.2860308531492373, + "grad_norm": 34.4263749157105, + "learning_rate": 1.628731363358309e-06, + "loss": 1.1713, + "step": 26822 + }, + { + "epoch": 2.2861160828432627, + "grad_norm": 54.496103030511875, + "learning_rate": 1.6283651964309756e-06, + "loss": 1.7101, + "step": 26823 + }, + { + "epoch": 2.2862013125372878, + "grad_norm": 60.2197988663626, + "learning_rate": 1.62799906266233e-06, + "loss": 2.1246, + "step": 26824 + }, + { + "epoch": 2.2862865422313132, + "grad_norm": 56.7754684309835, + "learning_rate": 1.6276329620559722e-06, + "loss": 1.4076, + "step": 26825 + }, + { + "epoch": 2.2863717719253387, + "grad_norm": 55.22428951132557, + "learning_rate": 1.627266894615504e-06, + "loss": 1.635, + "step": 26826 + }, + { + "epoch": 2.286457001619364, + "grad_norm": 66.20485134057134, + "learning_rate": 1.6269008603445275e-06, + "loss": 2.2052, + "step": 26827 + }, + { + "epoch": 2.2865422313133896, + "grad_norm": 26.54160313588341, + "learning_rate": 1.6265348592466407e-06, + "loss": 0.6921, + "step": 26828 + }, + { + "epoch": 2.286627461007415, + "grad_norm": 32.68180472621013, + "learning_rate": 1.626168891325443e-06, + "loss": 0.5697, + "step": 26829 + }, + { + "epoch": 2.2867126907014406, + "grad_norm": 54.774320110639486, + "learning_rate": 1.6258029565845324e-06, + "loss": 1.0994, + "step": 26830 + }, + { + "epoch": 2.2867979203954656, + "grad_norm": 43.408412784665686, + "learning_rate": 1.6254370550275101e-06, + "loss": 1.2085, + "step": 26831 + }, + { + "epoch": 2.286883150089491, + "grad_norm": 22.365523807294625, + "learning_rate": 1.6250711866579733e-06, + "loss": 0.7928, + "step": 26832 + }, + { + "epoch": 2.2869683797835165, + "grad_norm": 39.83384388079558, + "learning_rate": 1.624705351479519e-06, + "loss": 1.1638, + "step": 26833 + }, + { + "epoch": 2.287053609477542, + "grad_norm": 53.33014476321109, + "learning_rate": 1.6243395494957464e-06, + "loss": 1.5534, + "step": 26834 + }, + { + "epoch": 2.2871388391715675, + "grad_norm": 63.98010654701854, + "learning_rate": 1.6239737807102545e-06, + "loss": 1.4564, + "step": 26835 + }, + { + "epoch": 2.287224068865593, + "grad_norm": 40.52973579900196, + "learning_rate": 1.623608045126639e-06, + "loss": 1.0452, + "step": 26836 + }, + { + "epoch": 2.2873092985596184, + "grad_norm": 47.627948991840555, + "learning_rate": 1.6232423427484966e-06, + "loss": 0.4543, + "step": 26837 + }, + { + "epoch": 2.2873945282536434, + "grad_norm": 26.858140863066705, + "learning_rate": 1.6228766735794228e-06, + "loss": 0.7558, + "step": 26838 + }, + { + "epoch": 2.287479757947669, + "grad_norm": 46.51445151012435, + "learning_rate": 1.6225110376230163e-06, + "loss": 1.7685, + "step": 26839 + }, + { + "epoch": 2.2875649876416944, + "grad_norm": 54.979976194796535, + "learning_rate": 1.622145434882872e-06, + "loss": 1.2311, + "step": 26840 + }, + { + "epoch": 2.28765021733572, + "grad_norm": 83.93829781028174, + "learning_rate": 1.6217798653625826e-06, + "loss": 2.355, + "step": 26841 + }, + { + "epoch": 2.2877354470297453, + "grad_norm": 50.86632222073862, + "learning_rate": 1.621414329065748e-06, + "loss": 1.949, + "step": 26842 + }, + { + "epoch": 2.2878206767237703, + "grad_norm": 31.348191445607714, + "learning_rate": 1.6210488259959596e-06, + "loss": 1.1301, + "step": 26843 + }, + { + "epoch": 2.287905906417796, + "grad_norm": 50.61723888732361, + "learning_rate": 1.6206833561568148e-06, + "loss": 1.2866, + "step": 26844 + }, + { + "epoch": 2.2879911361118213, + "grad_norm": 19.013884095435955, + "learning_rate": 1.6203179195519064e-06, + "loss": 0.8081, + "step": 26845 + }, + { + "epoch": 2.2880763658058467, + "grad_norm": 76.02589468530535, + "learning_rate": 1.619952516184828e-06, + "loss": 1.7927, + "step": 26846 + }, + { + "epoch": 2.288161595499872, + "grad_norm": 39.78810868449416, + "learning_rate": 1.6195871460591721e-06, + "loss": 0.8302, + "step": 26847 + }, + { + "epoch": 2.2882468251938977, + "grad_norm": 28.7069194753311, + "learning_rate": 1.619221809178535e-06, + "loss": 0.8339, + "step": 26848 + }, + { + "epoch": 2.288332054887923, + "grad_norm": 37.0959779399546, + "learning_rate": 1.618856505546506e-06, + "loss": 1.2784, + "step": 26849 + }, + { + "epoch": 2.288417284581948, + "grad_norm": 49.6524068986586, + "learning_rate": 1.618491235166682e-06, + "loss": 1.8993, + "step": 26850 + }, + { + "epoch": 2.2885025142759736, + "grad_norm": 22.413823308232047, + "learning_rate": 1.6181259980426512e-06, + "loss": 0.9456, + "step": 26851 + }, + { + "epoch": 2.288587743969999, + "grad_norm": 36.01818190864679, + "learning_rate": 1.6177607941780087e-06, + "loss": 1.0805, + "step": 26852 + }, + { + "epoch": 2.2886729736640246, + "grad_norm": 60.49965455912171, + "learning_rate": 1.6173956235763454e-06, + "loss": 1.7982, + "step": 26853 + }, + { + "epoch": 2.28875820335805, + "grad_norm": 50.051098134024066, + "learning_rate": 1.6170304862412517e-06, + "loss": 1.0599, + "step": 26854 + }, + { + "epoch": 2.2888434330520755, + "grad_norm": 26.476711657723644, + "learning_rate": 1.616665382176319e-06, + "loss": 1.1074, + "step": 26855 + }, + { + "epoch": 2.288928662746101, + "grad_norm": 45.52268459553487, + "learning_rate": 1.6163003113851362e-06, + "loss": 1.0281, + "step": 26856 + }, + { + "epoch": 2.289013892440126, + "grad_norm": 22.78175402545651, + "learning_rate": 1.6159352738712947e-06, + "loss": 0.828, + "step": 26857 + }, + { + "epoch": 2.2890991221341515, + "grad_norm": 41.35952791795608, + "learning_rate": 1.6155702696383874e-06, + "loss": 1.5341, + "step": 26858 + }, + { + "epoch": 2.289184351828177, + "grad_norm": 60.03960325340173, + "learning_rate": 1.6152052986900013e-06, + "loss": 1.7329, + "step": 26859 + }, + { + "epoch": 2.2892695815222024, + "grad_norm": 49.259332835305685, + "learning_rate": 1.6148403610297247e-06, + "loss": 1.5138, + "step": 26860 + }, + { + "epoch": 2.289354811216228, + "grad_norm": 32.317735662094876, + "learning_rate": 1.6144754566611497e-06, + "loss": 0.888, + "step": 26861 + }, + { + "epoch": 2.289440040910253, + "grad_norm": 53.7409811455762, + "learning_rate": 1.6141105855878636e-06, + "loss": 1.296, + "step": 26862 + }, + { + "epoch": 2.2895252706042784, + "grad_norm": 80.15316716303948, + "learning_rate": 1.6137457478134537e-06, + "loss": 1.9132, + "step": 26863 + }, + { + "epoch": 2.289610500298304, + "grad_norm": 24.52213744050148, + "learning_rate": 1.6133809433415077e-06, + "loss": 1.1173, + "step": 26864 + }, + { + "epoch": 2.2896957299923293, + "grad_norm": 97.80630913205061, + "learning_rate": 1.6130161721756143e-06, + "loss": 1.9054, + "step": 26865 + }, + { + "epoch": 2.2897809596863548, + "grad_norm": 41.41482545519003, + "learning_rate": 1.6126514343193628e-06, + "loss": 0.9373, + "step": 26866 + }, + { + "epoch": 2.2898661893803802, + "grad_norm": 52.74593519501155, + "learning_rate": 1.612286729776338e-06, + "loss": 1.956, + "step": 26867 + }, + { + "epoch": 2.2899514190744057, + "grad_norm": 34.030248240447655, + "learning_rate": 1.611922058550127e-06, + "loss": 1.0599, + "step": 26868 + }, + { + "epoch": 2.2900366487684307, + "grad_norm": 44.126808495123605, + "learning_rate": 1.6115574206443152e-06, + "loss": 1.5425, + "step": 26869 + }, + { + "epoch": 2.290121878462456, + "grad_norm": 46.73908146915999, + "learning_rate": 1.6111928160624912e-06, + "loss": 1.2833, + "step": 26870 + }, + { + "epoch": 2.2902071081564817, + "grad_norm": 30.775468203035853, + "learning_rate": 1.6108282448082391e-06, + "loss": 1.5823, + "step": 26871 + }, + { + "epoch": 2.290292337850507, + "grad_norm": 46.88643480169887, + "learning_rate": 1.610463706885143e-06, + "loss": 1.7164, + "step": 26872 + }, + { + "epoch": 2.2903775675445326, + "grad_norm": 44.32913271937778, + "learning_rate": 1.6100992022967892e-06, + "loss": 1.4299, + "step": 26873 + }, + { + "epoch": 2.290462797238558, + "grad_norm": 73.65534653020123, + "learning_rate": 1.6097347310467648e-06, + "loss": 1.447, + "step": 26874 + }, + { + "epoch": 2.2905480269325835, + "grad_norm": 57.880292926232194, + "learning_rate": 1.6093702931386518e-06, + "loss": 1.7517, + "step": 26875 + }, + { + "epoch": 2.2906332566266085, + "grad_norm": 42.775388704227296, + "learning_rate": 1.6090058885760346e-06, + "loss": 1.0433, + "step": 26876 + }, + { + "epoch": 2.290718486320634, + "grad_norm": 60.8149119011053, + "learning_rate": 1.6086415173624959e-06, + "loss": 1.278, + "step": 26877 + }, + { + "epoch": 2.2908037160146595, + "grad_norm": 36.94846780558401, + "learning_rate": 1.6082771795016216e-06, + "loss": 1.1535, + "step": 26878 + }, + { + "epoch": 2.290888945708685, + "grad_norm": 29.69754378288202, + "learning_rate": 1.6079128749969936e-06, + "loss": 0.8992, + "step": 26879 + }, + { + "epoch": 2.2909741754027104, + "grad_norm": 45.731467688593995, + "learning_rate": 1.6075486038521925e-06, + "loss": 1.6482, + "step": 26880 + }, + { + "epoch": 2.291059405096736, + "grad_norm": 69.4293201100632, + "learning_rate": 1.6071843660708036e-06, + "loss": 2.1384, + "step": 26881 + }, + { + "epoch": 2.291144634790761, + "grad_norm": 29.53379009841389, + "learning_rate": 1.6068201616564094e-06, + "loss": 1.3812, + "step": 26882 + }, + { + "epoch": 2.2912298644847864, + "grad_norm": 74.02157375328802, + "learning_rate": 1.606455990612591e-06, + "loss": 1.1009, + "step": 26883 + }, + { + "epoch": 2.291315094178812, + "grad_norm": 40.121873247917996, + "learning_rate": 1.6060918529429288e-06, + "loss": 1.1503, + "step": 26884 + }, + { + "epoch": 2.2914003238728373, + "grad_norm": 55.10363700961399, + "learning_rate": 1.6057277486510054e-06, + "loss": 2.2843, + "step": 26885 + }, + { + "epoch": 2.291485553566863, + "grad_norm": 47.33346472475119, + "learning_rate": 1.6053636777403987e-06, + "loss": 1.4992, + "step": 26886 + }, + { + "epoch": 2.2915707832608883, + "grad_norm": 72.588531347468, + "learning_rate": 1.6049996402146928e-06, + "loss": 2.2974, + "step": 26887 + }, + { + "epoch": 2.2916560129549133, + "grad_norm": 57.10819930377042, + "learning_rate": 1.6046356360774651e-06, + "loss": 1.683, + "step": 26888 + }, + { + "epoch": 2.2917412426489387, + "grad_norm": 42.56118454566309, + "learning_rate": 1.6042716653322987e-06, + "loss": 1.2678, + "step": 26889 + }, + { + "epoch": 2.291826472342964, + "grad_norm": 38.88503284035407, + "learning_rate": 1.6039077279827691e-06, + "loss": 1.0391, + "step": 26890 + }, + { + "epoch": 2.2919117020369897, + "grad_norm": 145.91818979232676, + "learning_rate": 1.603543824032459e-06, + "loss": 1.7031, + "step": 26891 + }, + { + "epoch": 2.291996931731015, + "grad_norm": 69.05876374066005, + "learning_rate": 1.6031799534849462e-06, + "loss": 1.515, + "step": 26892 + }, + { + "epoch": 2.2920821614250406, + "grad_norm": 64.32560632728867, + "learning_rate": 1.6028161163438083e-06, + "loss": 0.7552, + "step": 26893 + }, + { + "epoch": 2.292167391119066, + "grad_norm": 36.96891777943536, + "learning_rate": 1.6024523126126229e-06, + "loss": 1.1675, + "step": 26894 + }, + { + "epoch": 2.292252620813091, + "grad_norm": 39.438892971783176, + "learning_rate": 1.60208854229497e-06, + "loss": 1.6486, + "step": 26895 + }, + { + "epoch": 2.2923378505071166, + "grad_norm": 30.358289572336297, + "learning_rate": 1.601724805394425e-06, + "loss": 0.9428, + "step": 26896 + }, + { + "epoch": 2.292423080201142, + "grad_norm": 59.50352289822054, + "learning_rate": 1.6013611019145676e-06, + "loss": 1.8571, + "step": 26897 + }, + { + "epoch": 2.2925083098951675, + "grad_norm": 59.29885790621166, + "learning_rate": 1.6009974318589733e-06, + "loss": 1.6301, + "step": 26898 + }, + { + "epoch": 2.292593539589193, + "grad_norm": 44.51657267674527, + "learning_rate": 1.6006337952312168e-06, + "loss": 1.0865, + "step": 26899 + }, + { + "epoch": 2.2926787692832185, + "grad_norm": 35.60145868376009, + "learning_rate": 1.6002701920348779e-06, + "loss": 0.9783, + "step": 26900 + }, + { + "epoch": 2.2927639989772435, + "grad_norm": 27.930662212821733, + "learning_rate": 1.599906622273531e-06, + "loss": 1.2307, + "step": 26901 + }, + { + "epoch": 2.292849228671269, + "grad_norm": 55.56977843994153, + "learning_rate": 1.599543085950751e-06, + "loss": 1.6737, + "step": 26902 + }, + { + "epoch": 2.2929344583652944, + "grad_norm": 112.4949334772489, + "learning_rate": 1.5991795830701117e-06, + "loss": 2.272, + "step": 26903 + }, + { + "epoch": 2.29301968805932, + "grad_norm": 54.70749195184746, + "learning_rate": 1.5988161136351904e-06, + "loss": 1.5207, + "step": 26904 + }, + { + "epoch": 2.2931049177533454, + "grad_norm": 71.51254627012085, + "learning_rate": 1.5984526776495629e-06, + "loss": 2.1108, + "step": 26905 + }, + { + "epoch": 2.293190147447371, + "grad_norm": 73.73412033095904, + "learning_rate": 1.5980892751168016e-06, + "loss": 2.393, + "step": 26906 + }, + { + "epoch": 2.293275377141396, + "grad_norm": 33.39018547945344, + "learning_rate": 1.5977259060404781e-06, + "loss": 1.0192, + "step": 26907 + }, + { + "epoch": 2.2933606068354213, + "grad_norm": 54.640141867726626, + "learning_rate": 1.5973625704241707e-06, + "loss": 1.833, + "step": 26908 + }, + { + "epoch": 2.293445836529447, + "grad_norm": 41.585826350751404, + "learning_rate": 1.5969992682714501e-06, + "loss": 1.3694, + "step": 26909 + }, + { + "epoch": 2.2935310662234722, + "grad_norm": 66.36106451839879, + "learning_rate": 1.5966359995858899e-06, + "loss": 1.647, + "step": 26910 + }, + { + "epoch": 2.2936162959174977, + "grad_norm": 58.46944420041184, + "learning_rate": 1.59627276437106e-06, + "loss": 1.9604, + "step": 26911 + }, + { + "epoch": 2.293701525611523, + "grad_norm": 42.80807143133591, + "learning_rate": 1.5959095626305354e-06, + "loss": 1.3956, + "step": 26912 + }, + { + "epoch": 2.2937867553055487, + "grad_norm": 56.212136428943886, + "learning_rate": 1.595546394367889e-06, + "loss": 1.6237, + "step": 26913 + }, + { + "epoch": 2.2938719849995737, + "grad_norm": 97.21096147308023, + "learning_rate": 1.5951832595866912e-06, + "loss": 2.9101, + "step": 26914 + }, + { + "epoch": 2.293957214693599, + "grad_norm": 47.963574042723714, + "learning_rate": 1.5948201582905133e-06, + "loss": 1.759, + "step": 26915 + }, + { + "epoch": 2.2940424443876246, + "grad_norm": 51.549059642924696, + "learning_rate": 1.5944570904829237e-06, + "loss": 1.1157, + "step": 26916 + }, + { + "epoch": 2.29412767408165, + "grad_norm": 32.71740515513911, + "learning_rate": 1.5940940561674977e-06, + "loss": 1.1687, + "step": 26917 + }, + { + "epoch": 2.2942129037756755, + "grad_norm": 24.352471261802723, + "learning_rate": 1.593731055347803e-06, + "loss": 1.1128, + "step": 26918 + }, + { + "epoch": 2.294298133469701, + "grad_norm": 31.606614351365437, + "learning_rate": 1.5933680880274082e-06, + "loss": 1.4126, + "step": 26919 + }, + { + "epoch": 2.2943833631637265, + "grad_norm": 31.525086328893348, + "learning_rate": 1.5930051542098845e-06, + "loss": 0.9298, + "step": 26920 + }, + { + "epoch": 2.2944685928577515, + "grad_norm": 29.71259017252049, + "learning_rate": 1.5926422538988035e-06, + "loss": 1.0621, + "step": 26921 + }, + { + "epoch": 2.294553822551777, + "grad_norm": 47.50098514962757, + "learning_rate": 1.592279387097731e-06, + "loss": 1.3998, + "step": 26922 + }, + { + "epoch": 2.2946390522458024, + "grad_norm": 50.4457241669786, + "learning_rate": 1.5919165538102376e-06, + "loss": 1.4927, + "step": 26923 + }, + { + "epoch": 2.294724281939828, + "grad_norm": 68.6637657737793, + "learning_rate": 1.5915537540398884e-06, + "loss": 2.1018, + "step": 26924 + }, + { + "epoch": 2.2948095116338534, + "grad_norm": 45.56597604814534, + "learning_rate": 1.5911909877902554e-06, + "loss": 1.6942, + "step": 26925 + }, + { + "epoch": 2.2948947413278784, + "grad_norm": 84.28435934252796, + "learning_rate": 1.5908282550649045e-06, + "loss": 1.9862, + "step": 26926 + }, + { + "epoch": 2.294979971021904, + "grad_norm": 79.94780541495528, + "learning_rate": 1.5904655558674021e-06, + "loss": 1.7805, + "step": 26927 + }, + { + "epoch": 2.2950652007159293, + "grad_norm": 73.54337544966172, + "learning_rate": 1.5901028902013172e-06, + "loss": 2.0094, + "step": 26928 + }, + { + "epoch": 2.295150430409955, + "grad_norm": 62.12619740278609, + "learning_rate": 1.5897402580702137e-06, + "loss": 2.0259, + "step": 26929 + }, + { + "epoch": 2.2952356601039803, + "grad_norm": 34.986075328338934, + "learning_rate": 1.589377659477661e-06, + "loss": 1.1959, + "step": 26930 + }, + { + "epoch": 2.2953208897980057, + "grad_norm": 45.24969201377392, + "learning_rate": 1.5890150944272242e-06, + "loss": 1.4757, + "step": 26931 + }, + { + "epoch": 2.295406119492031, + "grad_norm": 67.21420841248835, + "learning_rate": 1.5886525629224686e-06, + "loss": 2.0267, + "step": 26932 + }, + { + "epoch": 2.2954913491860562, + "grad_norm": 64.08715648557146, + "learning_rate": 1.5882900649669574e-06, + "loss": 2.1595, + "step": 26933 + }, + { + "epoch": 2.2955765788800817, + "grad_norm": 56.287447790615914, + "learning_rate": 1.5879276005642601e-06, + "loss": 1.3588, + "step": 26934 + }, + { + "epoch": 2.295661808574107, + "grad_norm": 49.05943383966674, + "learning_rate": 1.5875651697179367e-06, + "loss": 1.0973, + "step": 26935 + }, + { + "epoch": 2.2957470382681326, + "grad_norm": 48.42859049293191, + "learning_rate": 1.5872027724315558e-06, + "loss": 1.7667, + "step": 26936 + }, + { + "epoch": 2.295832267962158, + "grad_norm": 26.83466554638862, + "learning_rate": 1.586840408708678e-06, + "loss": 1.3792, + "step": 26937 + }, + { + "epoch": 2.2959174976561836, + "grad_norm": 67.3432879520787, + "learning_rate": 1.5864780785528705e-06, + "loss": 1.8696, + "step": 26938 + }, + { + "epoch": 2.296002727350209, + "grad_norm": 32.77131623074469, + "learning_rate": 1.5861157819676942e-06, + "loss": 1.0448, + "step": 26939 + }, + { + "epoch": 2.296087957044234, + "grad_norm": 72.04075096619785, + "learning_rate": 1.5857535189567126e-06, + "loss": 1.8181, + "step": 26940 + }, + { + "epoch": 2.2961731867382595, + "grad_norm": 60.784293462738795, + "learning_rate": 1.5853912895234885e-06, + "loss": 1.3453, + "step": 26941 + }, + { + "epoch": 2.296258416432285, + "grad_norm": 51.68951731785809, + "learning_rate": 1.5850290936715828e-06, + "loss": 1.3427, + "step": 26942 + }, + { + "epoch": 2.2963436461263105, + "grad_norm": 76.81859288456168, + "learning_rate": 1.5846669314045587e-06, + "loss": 1.3361, + "step": 26943 + }, + { + "epoch": 2.296428875820336, + "grad_norm": 33.34555616907575, + "learning_rate": 1.58430480272598e-06, + "loss": 1.4557, + "step": 26944 + }, + { + "epoch": 2.296514105514361, + "grad_norm": 58.554421486589355, + "learning_rate": 1.5839427076394065e-06, + "loss": 1.754, + "step": 26945 + }, + { + "epoch": 2.2965993352083864, + "grad_norm": 54.83668438195182, + "learning_rate": 1.583580646148397e-06, + "loss": 1.9928, + "step": 26946 + }, + { + "epoch": 2.296684564902412, + "grad_norm": 71.6349253099588, + "learning_rate": 1.5832186182565162e-06, + "loss": 1.616, + "step": 26947 + }, + { + "epoch": 2.2967697945964374, + "grad_norm": 55.941969617232125, + "learning_rate": 1.5828566239673226e-06, + "loss": 1.5441, + "step": 26948 + }, + { + "epoch": 2.296855024290463, + "grad_norm": 35.12692816505719, + "learning_rate": 1.5824946632843764e-06, + "loss": 1.2135, + "step": 26949 + }, + { + "epoch": 2.2969402539844883, + "grad_norm": 45.52625420298977, + "learning_rate": 1.5821327362112354e-06, + "loss": 1.3171, + "step": 26950 + }, + { + "epoch": 2.297025483678514, + "grad_norm": 51.23076073683151, + "learning_rate": 1.5817708427514605e-06, + "loss": 1.4251, + "step": 26951 + }, + { + "epoch": 2.297110713372539, + "grad_norm": 50.1947857703078, + "learning_rate": 1.5814089829086133e-06, + "loss": 1.0168, + "step": 26952 + }, + { + "epoch": 2.2971959430665643, + "grad_norm": 26.036801333763485, + "learning_rate": 1.5810471566862502e-06, + "loss": 0.7955, + "step": 26953 + }, + { + "epoch": 2.2972811727605897, + "grad_norm": 44.70112180074335, + "learning_rate": 1.5806853640879278e-06, + "loss": 1.8619, + "step": 26954 + }, + { + "epoch": 2.297366402454615, + "grad_norm": 35.36673959717274, + "learning_rate": 1.5803236051172077e-06, + "loss": 1.0582, + "step": 26955 + }, + { + "epoch": 2.2974516321486407, + "grad_norm": 28.078431817848294, + "learning_rate": 1.5799618797776461e-06, + "loss": 1.048, + "step": 26956 + }, + { + "epoch": 2.297536861842666, + "grad_norm": 32.37491219185653, + "learning_rate": 1.5796001880728013e-06, + "loss": 0.9688, + "step": 26957 + }, + { + "epoch": 2.2976220915366916, + "grad_norm": 61.45078460062548, + "learning_rate": 1.5792385300062268e-06, + "loss": 1.5798, + "step": 26958 + }, + { + "epoch": 2.2977073212307166, + "grad_norm": 32.2778172599119, + "learning_rate": 1.5788769055814828e-06, + "loss": 1.0175, + "step": 26959 + }, + { + "epoch": 2.297792550924742, + "grad_norm": 71.00740042320615, + "learning_rate": 1.5785153148021265e-06, + "loss": 1.9372, + "step": 26960 + }, + { + "epoch": 2.2978777806187676, + "grad_norm": 43.69854057169643, + "learning_rate": 1.578153757671712e-06, + "loss": 1.493, + "step": 26961 + }, + { + "epoch": 2.297963010312793, + "grad_norm": 57.43932184422734, + "learning_rate": 1.5777922341937962e-06, + "loss": 1.3119, + "step": 26962 + }, + { + "epoch": 2.2980482400068185, + "grad_norm": 80.88008455623202, + "learning_rate": 1.5774307443719322e-06, + "loss": 2.1054, + "step": 26963 + }, + { + "epoch": 2.2981334697008435, + "grad_norm": 18.0728898492485, + "learning_rate": 1.5770692882096782e-06, + "loss": 0.5372, + "step": 26964 + }, + { + "epoch": 2.298218699394869, + "grad_norm": 54.340601462657645, + "learning_rate": 1.5767078657105877e-06, + "loss": 1.8327, + "step": 26965 + }, + { + "epoch": 2.2983039290888945, + "grad_norm": 27.71650995217497, + "learning_rate": 1.5763464768782134e-06, + "loss": 0.9937, + "step": 26966 + }, + { + "epoch": 2.29838915878292, + "grad_norm": 28.953368834864264, + "learning_rate": 1.5759851217161115e-06, + "loss": 0.9445, + "step": 26967 + }, + { + "epoch": 2.2984743884769454, + "grad_norm": 35.196981964760106, + "learning_rate": 1.5756238002278361e-06, + "loss": 0.899, + "step": 26968 + }, + { + "epoch": 2.298559618170971, + "grad_norm": 49.02719143453086, + "learning_rate": 1.5752625124169407e-06, + "loss": 1.3176, + "step": 26969 + }, + { + "epoch": 2.2986448478649963, + "grad_norm": 27.058860050906624, + "learning_rate": 1.5749012582869778e-06, + "loss": 0.5865, + "step": 26970 + }, + { + "epoch": 2.2987300775590214, + "grad_norm": 22.464082249289138, + "learning_rate": 1.5745400378414993e-06, + "loss": 0.8394, + "step": 26971 + }, + { + "epoch": 2.298815307253047, + "grad_norm": 30.757219389391818, + "learning_rate": 1.5741788510840572e-06, + "loss": 0.7508, + "step": 26972 + }, + { + "epoch": 2.2989005369470723, + "grad_norm": 55.77330432270866, + "learning_rate": 1.5738176980182062e-06, + "loss": 1.5567, + "step": 26973 + }, + { + "epoch": 2.2989857666410978, + "grad_norm": 61.305258394954876, + "learning_rate": 1.5734565786474954e-06, + "loss": 1.6075, + "step": 26974 + }, + { + "epoch": 2.2990709963351232, + "grad_norm": 42.28302349119785, + "learning_rate": 1.573095492975479e-06, + "loss": 1.1175, + "step": 26975 + }, + { + "epoch": 2.2991562260291487, + "grad_norm": 45.91277316671874, + "learning_rate": 1.5727344410057049e-06, + "loss": 1.6202, + "step": 26976 + }, + { + "epoch": 2.299241455723174, + "grad_norm": 97.04093999917974, + "learning_rate": 1.5723734227417276e-06, + "loss": 1.8901, + "step": 26977 + }, + { + "epoch": 2.299326685417199, + "grad_norm": 30.306179914103595, + "learning_rate": 1.5720124381870956e-06, + "loss": 0.9421, + "step": 26978 + }, + { + "epoch": 2.2994119151112247, + "grad_norm": 65.60047809126186, + "learning_rate": 1.5716514873453586e-06, + "loss": 1.5479, + "step": 26979 + }, + { + "epoch": 2.29949714480525, + "grad_norm": 22.431181682465727, + "learning_rate": 1.571290570220066e-06, + "loss": 0.5956, + "step": 26980 + }, + { + "epoch": 2.2995823744992756, + "grad_norm": 65.38261997162631, + "learning_rate": 1.57092968681477e-06, + "loss": 1.726, + "step": 26981 + }, + { + "epoch": 2.299667604193301, + "grad_norm": 41.64675263990873, + "learning_rate": 1.5705688371330157e-06, + "loss": 0.8442, + "step": 26982 + }, + { + "epoch": 2.299752833887326, + "grad_norm": 51.036031033572556, + "learning_rate": 1.5702080211783561e-06, + "loss": 1.6264, + "step": 26983 + }, + { + "epoch": 2.2998380635813516, + "grad_norm": 28.155032512609147, + "learning_rate": 1.5698472389543362e-06, + "loss": 1.0822, + "step": 26984 + }, + { + "epoch": 2.299923293275377, + "grad_norm": 35.7755706465196, + "learning_rate": 1.5694864904645068e-06, + "loss": 1.3135, + "step": 26985 + }, + { + "epoch": 2.3000085229694025, + "grad_norm": 13.740496585604896, + "learning_rate": 1.569125775712415e-06, + "loss": 0.388, + "step": 26986 + }, + { + "epoch": 2.300093752663428, + "grad_norm": 44.095106487867746, + "learning_rate": 1.5687650947016081e-06, + "loss": 0.9908, + "step": 26987 + }, + { + "epoch": 2.3001789823574534, + "grad_norm": 71.636090586942, + "learning_rate": 1.568404447435633e-06, + "loss": 1.7009, + "step": 26988 + }, + { + "epoch": 2.300264212051479, + "grad_norm": 45.082135607866874, + "learning_rate": 1.5680438339180348e-06, + "loss": 1.415, + "step": 26989 + }, + { + "epoch": 2.300349441745504, + "grad_norm": 29.17775999376315, + "learning_rate": 1.5676832541523618e-06, + "loss": 1.0275, + "step": 26990 + }, + { + "epoch": 2.3004346714395294, + "grad_norm": 31.150770798332253, + "learning_rate": 1.5673227081421616e-06, + "loss": 1.1353, + "step": 26991 + }, + { + "epoch": 2.300519901133555, + "grad_norm": 56.394708828835924, + "learning_rate": 1.5669621958909787e-06, + "loss": 1.7192, + "step": 26992 + }, + { + "epoch": 2.3006051308275803, + "grad_norm": 72.1832328960537, + "learning_rate": 1.5666017174023567e-06, + "loss": 2.5838, + "step": 26993 + }, + { + "epoch": 2.300690360521606, + "grad_norm": 41.9831870079973, + "learning_rate": 1.5662412726798442e-06, + "loss": 1.7994, + "step": 26994 + }, + { + "epoch": 2.3007755902156313, + "grad_norm": 37.212525616013956, + "learning_rate": 1.565880861726985e-06, + "loss": 1.3958, + "step": 26995 + }, + { + "epoch": 2.3008608199096567, + "grad_norm": 31.31779095233955, + "learning_rate": 1.565520484547322e-06, + "loss": 0.9647, + "step": 26996 + }, + { + "epoch": 2.3009460496036818, + "grad_norm": 27.180818889320683, + "learning_rate": 1.565160141144399e-06, + "loss": 1.2018, + "step": 26997 + }, + { + "epoch": 2.3010312792977072, + "grad_norm": 52.944042458673984, + "learning_rate": 1.564799831521761e-06, + "loss": 1.8292, + "step": 26998 + }, + { + "epoch": 2.3011165089917327, + "grad_norm": 28.834321759010816, + "learning_rate": 1.5644395556829534e-06, + "loss": 1.0951, + "step": 26999 + }, + { + "epoch": 2.301201738685758, + "grad_norm": 46.86583259269661, + "learning_rate": 1.564079313631518e-06, + "loss": 1.4453, + "step": 27000 + }, + { + "epoch": 2.3012869683797836, + "grad_norm": 71.02839807615358, + "learning_rate": 1.5637191053709965e-06, + "loss": 1.6792, + "step": 27001 + }, + { + "epoch": 2.301372198073809, + "grad_norm": 45.94852213909431, + "learning_rate": 1.5633589309049313e-06, + "loss": 1.5602, + "step": 27002 + }, + { + "epoch": 2.301457427767834, + "grad_norm": 39.860650156592534, + "learning_rate": 1.5629987902368666e-06, + "loss": 0.7952, + "step": 27003 + }, + { + "epoch": 2.3015426574618596, + "grad_norm": 27.8000337964321, + "learning_rate": 1.5626386833703432e-06, + "loss": 0.68, + "step": 27004 + }, + { + "epoch": 2.301627887155885, + "grad_norm": 64.4209893653372, + "learning_rate": 1.5622786103089004e-06, + "loss": 2.1585, + "step": 27005 + }, + { + "epoch": 2.3017131168499105, + "grad_norm": 40.101496294882075, + "learning_rate": 1.561918571056082e-06, + "loss": 1.8234, + "step": 27006 + }, + { + "epoch": 2.301798346543936, + "grad_norm": 24.93966147718615, + "learning_rate": 1.5615585656154297e-06, + "loss": 0.7845, + "step": 27007 + }, + { + "epoch": 2.3018835762379615, + "grad_norm": 51.5768298577422, + "learning_rate": 1.5611985939904827e-06, + "loss": 1.3351, + "step": 27008 + }, + { + "epoch": 2.3019688059319865, + "grad_norm": 38.71200743046991, + "learning_rate": 1.560838656184781e-06, + "loss": 0.8108, + "step": 27009 + }, + { + "epoch": 2.302054035626012, + "grad_norm": 31.42682046419701, + "learning_rate": 1.5604787522018633e-06, + "loss": 0.7473, + "step": 27010 + }, + { + "epoch": 2.3021392653200374, + "grad_norm": 73.33930085812929, + "learning_rate": 1.5601188820452711e-06, + "loss": 1.2707, + "step": 27011 + }, + { + "epoch": 2.302224495014063, + "grad_norm": 54.386181899651795, + "learning_rate": 1.5597590457185436e-06, + "loss": 1.7468, + "step": 27012 + }, + { + "epoch": 2.3023097247080884, + "grad_norm": 22.74850376701689, + "learning_rate": 1.559399243225217e-06, + "loss": 1.2294, + "step": 27013 + }, + { + "epoch": 2.302394954402114, + "grad_norm": 44.80652066275932, + "learning_rate": 1.559039474568833e-06, + "loss": 1.8883, + "step": 27014 + }, + { + "epoch": 2.3024801840961393, + "grad_norm": 41.115382809738726, + "learning_rate": 1.5586797397529275e-06, + "loss": 1.3588, + "step": 27015 + }, + { + "epoch": 2.3025654137901643, + "grad_norm": 34.40488705861286, + "learning_rate": 1.5583200387810404e-06, + "loss": 0.9195, + "step": 27016 + }, + { + "epoch": 2.30265064348419, + "grad_norm": 46.298806356037254, + "learning_rate": 1.5579603716567081e-06, + "loss": 1.6015, + "step": 27017 + }, + { + "epoch": 2.3027358731782153, + "grad_norm": 63.54993649199241, + "learning_rate": 1.5576007383834684e-06, + "loss": 0.8891, + "step": 27018 + }, + { + "epoch": 2.3028211028722407, + "grad_norm": 54.352279012770964, + "learning_rate": 1.5572411389648557e-06, + "loss": 2.22, + "step": 27019 + }, + { + "epoch": 2.302906332566266, + "grad_norm": 24.655578539808996, + "learning_rate": 1.5568815734044096e-06, + "loss": 0.9266, + "step": 27020 + }, + { + "epoch": 2.3029915622602917, + "grad_norm": 53.32244960385464, + "learning_rate": 1.5565220417056636e-06, + "loss": 1.3091, + "step": 27021 + }, + { + "epoch": 2.3030767919543167, + "grad_norm": 43.83245897358395, + "learning_rate": 1.5561625438721568e-06, + "loss": 1.76, + "step": 27022 + }, + { + "epoch": 2.303162021648342, + "grad_norm": 70.52460467383824, + "learning_rate": 1.5558030799074214e-06, + "loss": 1.1788, + "step": 27023 + }, + { + "epoch": 2.3032472513423676, + "grad_norm": 57.57019952281368, + "learning_rate": 1.5554436498149954e-06, + "loss": 1.4861, + "step": 27024 + }, + { + "epoch": 2.303332481036393, + "grad_norm": 28.932097808348505, + "learning_rate": 1.5550842535984128e-06, + "loss": 1.4626, + "step": 27025 + }, + { + "epoch": 2.3034177107304186, + "grad_norm": 22.573545300061614, + "learning_rate": 1.5547248912612073e-06, + "loss": 1.4743, + "step": 27026 + }, + { + "epoch": 2.303502940424444, + "grad_norm": 81.98939983215537, + "learning_rate": 1.554365562806912e-06, + "loss": 1.7434, + "step": 27027 + }, + { + "epoch": 2.303588170118469, + "grad_norm": 27.946457542698926, + "learning_rate": 1.5540062682390638e-06, + "loss": 1.2637, + "step": 27028 + }, + { + "epoch": 2.3036733998124945, + "grad_norm": 42.555433567989176, + "learning_rate": 1.5536470075611932e-06, + "loss": 0.9001, + "step": 27029 + }, + { + "epoch": 2.30375862950652, + "grad_norm": 51.88991092984379, + "learning_rate": 1.5532877807768359e-06, + "loss": 1.7518, + "step": 27030 + }, + { + "epoch": 2.3038438592005455, + "grad_norm": 65.75444967707092, + "learning_rate": 1.5529285878895246e-06, + "loss": 1.8251, + "step": 27031 + }, + { + "epoch": 2.303929088894571, + "grad_norm": 53.215498686697494, + "learning_rate": 1.5525694289027887e-06, + "loss": 1.8732, + "step": 27032 + }, + { + "epoch": 2.3040143185885964, + "grad_norm": 36.09801710526935, + "learning_rate": 1.5522103038201642e-06, + "loss": 0.9636, + "step": 27033 + }, + { + "epoch": 2.304099548282622, + "grad_norm": 36.172160477765715, + "learning_rate": 1.5518512126451811e-06, + "loss": 1.3377, + "step": 27034 + }, + { + "epoch": 2.304184777976647, + "grad_norm": 41.54318423136122, + "learning_rate": 1.5514921553813716e-06, + "loss": 1.612, + "step": 27035 + }, + { + "epoch": 2.3042700076706724, + "grad_norm": 57.639112987974904, + "learning_rate": 1.5511331320322647e-06, + "loss": 1.9595, + "step": 27036 + }, + { + "epoch": 2.304355237364698, + "grad_norm": 43.84748868841842, + "learning_rate": 1.5507741426013934e-06, + "loss": 1.4462, + "step": 27037 + }, + { + "epoch": 2.3044404670587233, + "grad_norm": 53.170485830901484, + "learning_rate": 1.550415187092289e-06, + "loss": 1.3321, + "step": 27038 + }, + { + "epoch": 2.3045256967527488, + "grad_norm": 48.00654093319899, + "learning_rate": 1.5500562655084805e-06, + "loss": 1.4282, + "step": 27039 + }, + { + "epoch": 2.3046109264467742, + "grad_norm": 53.926976619264885, + "learning_rate": 1.5496973778534963e-06, + "loss": 1.3577, + "step": 27040 + }, + { + "epoch": 2.3046961561407997, + "grad_norm": 52.77554026776075, + "learning_rate": 1.5493385241308688e-06, + "loss": 2.1369, + "step": 27041 + }, + { + "epoch": 2.3047813858348247, + "grad_norm": 37.910775391788654, + "learning_rate": 1.5489797043441262e-06, + "loss": 1.2112, + "step": 27042 + }, + { + "epoch": 2.30486661552885, + "grad_norm": 91.86419737230895, + "learning_rate": 1.5486209184967964e-06, + "loss": 1.9813, + "step": 27043 + }, + { + "epoch": 2.3049518452228757, + "grad_norm": 84.1965875482868, + "learning_rate": 1.5482621665924068e-06, + "loss": 1.9733, + "step": 27044 + }, + { + "epoch": 2.305037074916901, + "grad_norm": 49.933311499914794, + "learning_rate": 1.5479034486344879e-06, + "loss": 1.5478, + "step": 27045 + }, + { + "epoch": 2.3051223046109266, + "grad_norm": 51.69330890699928, + "learning_rate": 1.5475447646265678e-06, + "loss": 1.4, + "step": 27046 + }, + { + "epoch": 2.3052075343049516, + "grad_norm": 49.12107557749011, + "learning_rate": 1.5471861145721733e-06, + "loss": 1.6266, + "step": 27047 + }, + { + "epoch": 2.305292763998977, + "grad_norm": 24.330274549089435, + "learning_rate": 1.546827498474831e-06, + "loss": 1.0746, + "step": 27048 + }, + { + "epoch": 2.3053779936930026, + "grad_norm": 58.97769976925928, + "learning_rate": 1.546468916338067e-06, + "loss": 2.2381, + "step": 27049 + }, + { + "epoch": 2.305463223387028, + "grad_norm": 53.484670688146444, + "learning_rate": 1.5461103681654105e-06, + "loss": 1.651, + "step": 27050 + }, + { + "epoch": 2.3055484530810535, + "grad_norm": 57.41770367470811, + "learning_rate": 1.5457518539603855e-06, + "loss": 1.8097, + "step": 27051 + }, + { + "epoch": 2.305633682775079, + "grad_norm": 25.96705799346068, + "learning_rate": 1.5453933737265176e-06, + "loss": 0.7455, + "step": 27052 + }, + { + "epoch": 2.3057189124691044, + "grad_norm": 26.708630746964758, + "learning_rate": 1.5450349274673326e-06, + "loss": 0.8718, + "step": 27053 + }, + { + "epoch": 2.3058041421631295, + "grad_norm": 30.95227348617833, + "learning_rate": 1.5446765151863579e-06, + "loss": 1.299, + "step": 27054 + }, + { + "epoch": 2.305889371857155, + "grad_norm": 33.06195575495278, + "learning_rate": 1.5443181368871168e-06, + "loss": 1.0705, + "step": 27055 + }, + { + "epoch": 2.3059746015511804, + "grad_norm": 58.13068877526056, + "learning_rate": 1.5439597925731336e-06, + "loss": 1.3691, + "step": 27056 + }, + { + "epoch": 2.306059831245206, + "grad_norm": 65.26272267109422, + "learning_rate": 1.5436014822479312e-06, + "loss": 1.7279, + "step": 27057 + }, + { + "epoch": 2.3061450609392313, + "grad_norm": 60.317322611765036, + "learning_rate": 1.543243205915036e-06, + "loss": 1.4857, + "step": 27058 + }, + { + "epoch": 2.306230290633257, + "grad_norm": 57.59427378864239, + "learning_rate": 1.5428849635779702e-06, + "loss": 1.2056, + "step": 27059 + }, + { + "epoch": 2.3063155203272823, + "grad_norm": 84.7213001080176, + "learning_rate": 1.5425267552402556e-06, + "loss": 1.713, + "step": 27060 + }, + { + "epoch": 2.3064007500213073, + "grad_norm": 70.13986114679601, + "learning_rate": 1.5421685809054183e-06, + "loss": 1.8417, + "step": 27061 + }, + { + "epoch": 2.3064859797153328, + "grad_norm": 61.79918966814807, + "learning_rate": 1.5418104405769774e-06, + "loss": 1.8287, + "step": 27062 + }, + { + "epoch": 2.3065712094093582, + "grad_norm": 67.10983121724034, + "learning_rate": 1.541452334258458e-06, + "loss": 1.8087, + "step": 27063 + }, + { + "epoch": 2.3066564391033837, + "grad_norm": 42.75413394507289, + "learning_rate": 1.5410942619533809e-06, + "loss": 0.7725, + "step": 27064 + }, + { + "epoch": 2.306741668797409, + "grad_norm": 45.71380400390605, + "learning_rate": 1.5407362236652672e-06, + "loss": 1.4147, + "step": 27065 + }, + { + "epoch": 2.306826898491434, + "grad_norm": 60.475808180915834, + "learning_rate": 1.5403782193976364e-06, + "loss": 1.9602, + "step": 27066 + }, + { + "epoch": 2.3069121281854597, + "grad_norm": 41.828277233480854, + "learning_rate": 1.5400202491540128e-06, + "loss": 1.2051, + "step": 27067 + }, + { + "epoch": 2.306997357879485, + "grad_norm": 66.52967415193052, + "learning_rate": 1.5396623129379134e-06, + "loss": 1.5974, + "step": 27068 + }, + { + "epoch": 2.3070825875735106, + "grad_norm": 84.07914279529524, + "learning_rate": 1.539304410752862e-06, + "loss": 1.7424, + "step": 27069 + }, + { + "epoch": 2.307167817267536, + "grad_norm": 35.59885342708756, + "learning_rate": 1.5389465426023754e-06, + "loss": 1.3554, + "step": 27070 + }, + { + "epoch": 2.3072530469615615, + "grad_norm": 32.851426234820664, + "learning_rate": 1.5385887084899748e-06, + "loss": 0.987, + "step": 27071 + }, + { + "epoch": 2.307338276655587, + "grad_norm": 69.47502835465, + "learning_rate": 1.53823090841918e-06, + "loss": 1.8983, + "step": 27072 + }, + { + "epoch": 2.307423506349612, + "grad_norm": 30.985977298849715, + "learning_rate": 1.537873142393508e-06, + "loss": 1.0144, + "step": 27073 + }, + { + "epoch": 2.3075087360436375, + "grad_norm": 76.96616701390744, + "learning_rate": 1.5375154104164775e-06, + "loss": 2.1658, + "step": 27074 + }, + { + "epoch": 2.307593965737663, + "grad_norm": 40.6148991103857, + "learning_rate": 1.5371577124916058e-06, + "loss": 1.7522, + "step": 27075 + }, + { + "epoch": 2.3076791954316884, + "grad_norm": 35.977611164264175, + "learning_rate": 1.5368000486224127e-06, + "loss": 1.4054, + "step": 27076 + }, + { + "epoch": 2.307764425125714, + "grad_norm": 36.16000953802004, + "learning_rate": 1.5364424188124161e-06, + "loss": 1.1727, + "step": 27077 + }, + { + "epoch": 2.3078496548197394, + "grad_norm": 31.419839515134303, + "learning_rate": 1.5360848230651314e-06, + "loss": 1.2166, + "step": 27078 + }, + { + "epoch": 2.307934884513765, + "grad_norm": 57.58449943291949, + "learning_rate": 1.535727261384075e-06, + "loss": 1.3196, + "step": 27079 + }, + { + "epoch": 2.30802011420779, + "grad_norm": 51.689051333941634, + "learning_rate": 1.5353697337727663e-06, + "loss": 1.4788, + "step": 27080 + }, + { + "epoch": 2.3081053439018153, + "grad_norm": 60.52920692968972, + "learning_rate": 1.5350122402347195e-06, + "loss": 0.9642, + "step": 27081 + }, + { + "epoch": 2.308190573595841, + "grad_norm": 34.86311597115086, + "learning_rate": 1.53465478077345e-06, + "loss": 1.0837, + "step": 27082 + }, + { + "epoch": 2.3082758032898663, + "grad_norm": 64.63369732073991, + "learning_rate": 1.5342973553924722e-06, + "loss": 2.3942, + "step": 27083 + }, + { + "epoch": 2.3083610329838917, + "grad_norm": 32.42806415101794, + "learning_rate": 1.5339399640953034e-06, + "loss": 0.8206, + "step": 27084 + }, + { + "epoch": 2.3084462626779167, + "grad_norm": 25.080600078415184, + "learning_rate": 1.5335826068854586e-06, + "loss": 0.8412, + "step": 27085 + }, + { + "epoch": 2.308531492371942, + "grad_norm": 55.7968680199355, + "learning_rate": 1.5332252837664519e-06, + "loss": 1.1823, + "step": 27086 + }, + { + "epoch": 2.3086167220659677, + "grad_norm": 40.302033248693206, + "learning_rate": 1.5328679947417968e-06, + "loss": 1.4425, + "step": 27087 + }, + { + "epoch": 2.308701951759993, + "grad_norm": 70.86548047097602, + "learning_rate": 1.5325107398150058e-06, + "loss": 1.5765, + "step": 27088 + }, + { + "epoch": 2.3087871814540186, + "grad_norm": 27.98777740411686, + "learning_rate": 1.5321535189895952e-06, + "loss": 1.0424, + "step": 27089 + }, + { + "epoch": 2.308872411148044, + "grad_norm": 29.163109004075164, + "learning_rate": 1.531796332269077e-06, + "loss": 0.9305, + "step": 27090 + }, + { + "epoch": 2.3089576408420696, + "grad_norm": 43.26960118400974, + "learning_rate": 1.5314391796569621e-06, + "loss": 0.8489, + "step": 27091 + }, + { + "epoch": 2.3090428705360946, + "grad_norm": 49.081371955223524, + "learning_rate": 1.5310820611567645e-06, + "loss": 0.8516, + "step": 27092 + }, + { + "epoch": 2.30912810023012, + "grad_norm": 24.536566613925654, + "learning_rate": 1.5307249767719983e-06, + "loss": 1.0247, + "step": 27093 + }, + { + "epoch": 2.3092133299241455, + "grad_norm": 27.269197282836476, + "learning_rate": 1.5303679265061732e-06, + "loss": 1.0589, + "step": 27094 + }, + { + "epoch": 2.309298559618171, + "grad_norm": 53.42929993245898, + "learning_rate": 1.5300109103628009e-06, + "loss": 1.2517, + "step": 27095 + }, + { + "epoch": 2.3093837893121965, + "grad_norm": 37.17544215317695, + "learning_rate": 1.5296539283453905e-06, + "loss": 0.5898, + "step": 27096 + }, + { + "epoch": 2.309469019006222, + "grad_norm": 22.21545797133492, + "learning_rate": 1.5292969804574564e-06, + "loss": 0.6483, + "step": 27097 + }, + { + "epoch": 2.3095542487002474, + "grad_norm": 19.282681679352223, + "learning_rate": 1.528940066702508e-06, + "loss": 0.8219, + "step": 27098 + }, + { + "epoch": 2.3096394783942724, + "grad_norm": 29.875451240411483, + "learning_rate": 1.528583187084053e-06, + "loss": 0.9398, + "step": 27099 + }, + { + "epoch": 2.309724708088298, + "grad_norm": 67.29505862753653, + "learning_rate": 1.5282263416056025e-06, + "loss": 1.767, + "step": 27100 + }, + { + "epoch": 2.3098099377823234, + "grad_norm": 34.76397385350504, + "learning_rate": 1.5278695302706681e-06, + "loss": 1.2164, + "step": 27101 + }, + { + "epoch": 2.309895167476349, + "grad_norm": 37.26620140072252, + "learning_rate": 1.5275127530827572e-06, + "loss": 0.9995, + "step": 27102 + }, + { + "epoch": 2.3099803971703743, + "grad_norm": 63.0161878453167, + "learning_rate": 1.5271560100453786e-06, + "loss": 1.4565, + "step": 27103 + }, + { + "epoch": 2.3100656268643993, + "grad_norm": 61.09707479039764, + "learning_rate": 1.5267993011620408e-06, + "loss": 1.2608, + "step": 27104 + }, + { + "epoch": 2.310150856558425, + "grad_norm": 94.78770843917836, + "learning_rate": 1.5264426264362497e-06, + "loss": 2.1685, + "step": 27105 + }, + { + "epoch": 2.3102360862524502, + "grad_norm": 39.68016730156006, + "learning_rate": 1.526085985871517e-06, + "loss": 1.4217, + "step": 27106 + }, + { + "epoch": 2.3103213159464757, + "grad_norm": 36.397146578163785, + "learning_rate": 1.5257293794713467e-06, + "loss": 0.8864, + "step": 27107 + }, + { + "epoch": 2.310406545640501, + "grad_norm": 46.371642409009254, + "learning_rate": 1.5253728072392493e-06, + "loss": 0.9736, + "step": 27108 + }, + { + "epoch": 2.3104917753345267, + "grad_norm": 45.381947283227476, + "learning_rate": 1.5250162691787278e-06, + "loss": 1.3207, + "step": 27109 + }, + { + "epoch": 2.310577005028552, + "grad_norm": 57.04502034121886, + "learning_rate": 1.524659765293292e-06, + "loss": 1.3853, + "step": 27110 + }, + { + "epoch": 2.310662234722577, + "grad_norm": 54.14292872631364, + "learning_rate": 1.5243032955864462e-06, + "loss": 1.6745, + "step": 27111 + }, + { + "epoch": 2.3107474644166026, + "grad_norm": 61.16799021816136, + "learning_rate": 1.523946860061697e-06, + "loss": 1.33, + "step": 27112 + }, + { + "epoch": 2.310832694110628, + "grad_norm": 67.48035702003479, + "learning_rate": 1.5235904587225469e-06, + "loss": 1.2894, + "step": 27113 + }, + { + "epoch": 2.3109179238046536, + "grad_norm": 72.02121073381606, + "learning_rate": 1.5232340915725053e-06, + "loss": 2.0747, + "step": 27114 + }, + { + "epoch": 2.311003153498679, + "grad_norm": 29.17131653291319, + "learning_rate": 1.5228777586150735e-06, + "loss": 1.4037, + "step": 27115 + }, + { + "epoch": 2.3110883831927045, + "grad_norm": 22.775203758089575, + "learning_rate": 1.522521459853758e-06, + "loss": 0.888, + "step": 27116 + }, + { + "epoch": 2.31117361288673, + "grad_norm": 29.998224948141115, + "learning_rate": 1.522165195292063e-06, + "loss": 0.5917, + "step": 27117 + }, + { + "epoch": 2.311258842580755, + "grad_norm": 88.82062335467373, + "learning_rate": 1.5218089649334895e-06, + "loss": 2.218, + "step": 27118 + }, + { + "epoch": 2.3113440722747804, + "grad_norm": 23.25574560889927, + "learning_rate": 1.521452768781544e-06, + "loss": 0.8311, + "step": 27119 + }, + { + "epoch": 2.311429301968806, + "grad_norm": 57.05483401554889, + "learning_rate": 1.5210966068397287e-06, + "loss": 1.9184, + "step": 27120 + }, + { + "epoch": 2.3115145316628314, + "grad_norm": 29.384882078317265, + "learning_rate": 1.5207404791115454e-06, + "loss": 1.1346, + "step": 27121 + }, + { + "epoch": 2.311599761356857, + "grad_norm": 65.1659245584447, + "learning_rate": 1.5203843856004956e-06, + "loss": 1.3958, + "step": 27122 + }, + { + "epoch": 2.3116849910508823, + "grad_norm": 40.06061998333605, + "learning_rate": 1.520028326310083e-06, + "loss": 1.4074, + "step": 27123 + }, + { + "epoch": 2.3117702207449073, + "grad_norm": 54.30825152808746, + "learning_rate": 1.5196723012438102e-06, + "loss": 1.7639, + "step": 27124 + }, + { + "epoch": 2.311855450438933, + "grad_norm": 84.90968671176105, + "learning_rate": 1.5193163104051779e-06, + "loss": 2.2793, + "step": 27125 + }, + { + "epoch": 2.3119406801329583, + "grad_norm": 79.90431584487057, + "learning_rate": 1.5189603537976844e-06, + "loss": 1.8697, + "step": 27126 + }, + { + "epoch": 2.3120259098269837, + "grad_norm": 49.447471240577165, + "learning_rate": 1.5186044314248343e-06, + "loss": 1.8404, + "step": 27127 + }, + { + "epoch": 2.312111139521009, + "grad_norm": 107.65373104736248, + "learning_rate": 1.5182485432901267e-06, + "loss": 2.3177, + "step": 27128 + }, + { + "epoch": 2.3121963692150347, + "grad_norm": 56.81677238683492, + "learning_rate": 1.5178926893970608e-06, + "loss": 1.6133, + "step": 27129 + }, + { + "epoch": 2.3122815989090597, + "grad_norm": 41.656360426261855, + "learning_rate": 1.5175368697491349e-06, + "loss": 1.4473, + "step": 27130 + }, + { + "epoch": 2.312366828603085, + "grad_norm": 52.86730864831295, + "learning_rate": 1.5171810843498507e-06, + "loss": 1.2643, + "step": 27131 + }, + { + "epoch": 2.3124520582971106, + "grad_norm": 117.9230567086114, + "learning_rate": 1.5168253332027078e-06, + "loss": 2.6583, + "step": 27132 + }, + { + "epoch": 2.312537287991136, + "grad_norm": 57.772697825721835, + "learning_rate": 1.5164696163112036e-06, + "loss": 1.6003, + "step": 27133 + }, + { + "epoch": 2.3126225176851616, + "grad_norm": 56.545920659896105, + "learning_rate": 1.5161139336788366e-06, + "loss": 1.7923, + "step": 27134 + }, + { + "epoch": 2.312707747379187, + "grad_norm": 42.68296854524673, + "learning_rate": 1.515758285309103e-06, + "loss": 1.3823, + "step": 27135 + }, + { + "epoch": 2.3127929770732125, + "grad_norm": 36.4166946239709, + "learning_rate": 1.5154026712055037e-06, + "loss": 0.9747, + "step": 27136 + }, + { + "epoch": 2.3128782067672375, + "grad_norm": 38.536126792304, + "learning_rate": 1.5150470913715348e-06, + "loss": 1.0935, + "step": 27137 + }, + { + "epoch": 2.312963436461263, + "grad_norm": 35.99440494793149, + "learning_rate": 1.5146915458106908e-06, + "loss": 1.2128, + "step": 27138 + }, + { + "epoch": 2.3130486661552885, + "grad_norm": 33.395232217080064, + "learning_rate": 1.5143360345264713e-06, + "loss": 0.9237, + "step": 27139 + }, + { + "epoch": 2.313133895849314, + "grad_norm": 48.83774551731427, + "learning_rate": 1.5139805575223732e-06, + "loss": 0.8536, + "step": 27140 + }, + { + "epoch": 2.3132191255433394, + "grad_norm": 28.31090913125524, + "learning_rate": 1.513625114801891e-06, + "loss": 0.8678, + "step": 27141 + }, + { + "epoch": 2.313304355237365, + "grad_norm": 23.9686353502617, + "learning_rate": 1.5132697063685203e-06, + "loss": 0.8738, + "step": 27142 + }, + { + "epoch": 2.31338958493139, + "grad_norm": 39.24203857748962, + "learning_rate": 1.512914332225755e-06, + "loss": 1.2546, + "step": 27143 + }, + { + "epoch": 2.3134748146254154, + "grad_norm": 30.2514350695098, + "learning_rate": 1.5125589923770934e-06, + "loss": 1.0898, + "step": 27144 + }, + { + "epoch": 2.313560044319441, + "grad_norm": 41.226981132961335, + "learning_rate": 1.5122036868260282e-06, + "loss": 0.7813, + "step": 27145 + }, + { + "epoch": 2.3136452740134663, + "grad_norm": 38.19314626439442, + "learning_rate": 1.5118484155760521e-06, + "loss": 1.2017, + "step": 27146 + }, + { + "epoch": 2.313730503707492, + "grad_norm": 53.51092035993528, + "learning_rate": 1.5114931786306625e-06, + "loss": 1.5526, + "step": 27147 + }, + { + "epoch": 2.3138157334015172, + "grad_norm": 44.26109638980016, + "learning_rate": 1.5111379759933499e-06, + "loss": 1.6053, + "step": 27148 + }, + { + "epoch": 2.3139009630955423, + "grad_norm": 44.89739217509823, + "learning_rate": 1.5107828076676102e-06, + "loss": 1.6003, + "step": 27149 + }, + { + "epoch": 2.3139861927895677, + "grad_norm": 52.54704108517582, + "learning_rate": 1.5104276736569351e-06, + "loss": 1.4437, + "step": 27150 + }, + { + "epoch": 2.314071422483593, + "grad_norm": 48.9716216688865, + "learning_rate": 1.5100725739648176e-06, + "loss": 1.1048, + "step": 27151 + }, + { + "epoch": 2.3141566521776187, + "grad_norm": 58.095203460442086, + "learning_rate": 1.5097175085947475e-06, + "loss": 1.6901, + "step": 27152 + }, + { + "epoch": 2.314241881871644, + "grad_norm": 37.62615604337053, + "learning_rate": 1.5093624775502208e-06, + "loss": 1.0614, + "step": 27153 + }, + { + "epoch": 2.3143271115656696, + "grad_norm": 17.560597662426698, + "learning_rate": 1.5090074808347255e-06, + "loss": 0.6468, + "step": 27154 + }, + { + "epoch": 2.314412341259695, + "grad_norm": 87.57640960924033, + "learning_rate": 1.5086525184517559e-06, + "loss": 1.8024, + "step": 27155 + }, + { + "epoch": 2.31449757095372, + "grad_norm": 27.20580715280346, + "learning_rate": 1.5082975904047998e-06, + "loss": 0.7034, + "step": 27156 + }, + { + "epoch": 2.3145828006477456, + "grad_norm": 22.376735191672523, + "learning_rate": 1.5079426966973509e-06, + "loss": 0.7914, + "step": 27157 + }, + { + "epoch": 2.314668030341771, + "grad_norm": 62.74233373955065, + "learning_rate": 1.5075878373328977e-06, + "loss": 1.6332, + "step": 27158 + }, + { + "epoch": 2.3147532600357965, + "grad_norm": 54.04159911638525, + "learning_rate": 1.507233012314931e-06, + "loss": 1.4885, + "step": 27159 + }, + { + "epoch": 2.314838489729822, + "grad_norm": 33.43134880579041, + "learning_rate": 1.5068782216469396e-06, + "loss": 1.3263, + "step": 27160 + }, + { + "epoch": 2.3149237194238474, + "grad_norm": 84.43301187332362, + "learning_rate": 1.5065234653324113e-06, + "loss": 1.6048, + "step": 27161 + }, + { + "epoch": 2.315008949117873, + "grad_norm": 52.682268647444836, + "learning_rate": 1.5061687433748367e-06, + "loss": 1.6893, + "step": 27162 + }, + { + "epoch": 2.315094178811898, + "grad_norm": 23.559318234399452, + "learning_rate": 1.505814055777705e-06, + "loss": 0.8834, + "step": 27163 + }, + { + "epoch": 2.3151794085059234, + "grad_norm": 42.47168576480838, + "learning_rate": 1.5054594025445047e-06, + "loss": 1.4733, + "step": 27164 + }, + { + "epoch": 2.315264638199949, + "grad_norm": 50.98594823480687, + "learning_rate": 1.5051047836787202e-06, + "loss": 2.1045, + "step": 27165 + }, + { + "epoch": 2.3153498678939743, + "grad_norm": 33.69541366405205, + "learning_rate": 1.504750199183843e-06, + "loss": 0.923, + "step": 27166 + }, + { + "epoch": 2.315435097588, + "grad_norm": 46.89862186864832, + "learning_rate": 1.5043956490633593e-06, + "loss": 1.4123, + "step": 27167 + }, + { + "epoch": 2.315520327282025, + "grad_norm": 21.828159681745344, + "learning_rate": 1.504041133320755e-06, + "loss": 0.8833, + "step": 27168 + }, + { + "epoch": 2.3156055569760503, + "grad_norm": 28.647735104911504, + "learning_rate": 1.5036866519595156e-06, + "loss": 1.1099, + "step": 27169 + }, + { + "epoch": 2.3156907866700758, + "grad_norm": 51.1994011205472, + "learning_rate": 1.5033322049831283e-06, + "loss": 1.6022, + "step": 27170 + }, + { + "epoch": 2.3157760163641012, + "grad_norm": 71.31160024472733, + "learning_rate": 1.5029777923950811e-06, + "loss": 1.937, + "step": 27171 + }, + { + "epoch": 2.3158612460581267, + "grad_norm": 26.591417902377334, + "learning_rate": 1.5026234141988578e-06, + "loss": 0.7575, + "step": 27172 + }, + { + "epoch": 2.315946475752152, + "grad_norm": 60.943651706179544, + "learning_rate": 1.502269070397942e-06, + "loss": 1.5731, + "step": 27173 + }, + { + "epoch": 2.3160317054461776, + "grad_norm": 42.64223948184931, + "learning_rate": 1.5019147609958218e-06, + "loss": 1.4863, + "step": 27174 + }, + { + "epoch": 2.3161169351402027, + "grad_norm": 62.420514414251656, + "learning_rate": 1.5015604859959793e-06, + "loss": 1.5346, + "step": 27175 + }, + { + "epoch": 2.316202164834228, + "grad_norm": 54.80024411894382, + "learning_rate": 1.5012062454018994e-06, + "loss": 1.1861, + "step": 27176 + }, + { + "epoch": 2.3162873945282536, + "grad_norm": 49.73100008394938, + "learning_rate": 1.5008520392170646e-06, + "loss": 1.4511, + "step": 27177 + }, + { + "epoch": 2.316372624222279, + "grad_norm": 46.887757775329824, + "learning_rate": 1.5004978674449594e-06, + "loss": 1.5202, + "step": 27178 + }, + { + "epoch": 2.3164578539163045, + "grad_norm": 66.57986353371562, + "learning_rate": 1.5001437300890688e-06, + "loss": 1.911, + "step": 27179 + }, + { + "epoch": 2.31654308361033, + "grad_norm": 62.87959002249523, + "learning_rate": 1.499789627152874e-06, + "loss": 1.0371, + "step": 27180 + }, + { + "epoch": 2.3166283133043555, + "grad_norm": 75.10156080849255, + "learning_rate": 1.4994355586398568e-06, + "loss": 1.7258, + "step": 27181 + }, + { + "epoch": 2.3167135429983805, + "grad_norm": 44.69489694738974, + "learning_rate": 1.499081524553499e-06, + "loss": 1.0335, + "step": 27182 + }, + { + "epoch": 2.316798772692406, + "grad_norm": 31.850188909829118, + "learning_rate": 1.4987275248972843e-06, + "loss": 1.7189, + "step": 27183 + }, + { + "epoch": 2.3168840023864314, + "grad_norm": 28.346472397849553, + "learning_rate": 1.4983735596746935e-06, + "loss": 0.7878, + "step": 27184 + }, + { + "epoch": 2.316969232080457, + "grad_norm": 42.25080198331425, + "learning_rate": 1.4980196288892052e-06, + "loss": 1.363, + "step": 27185 + }, + { + "epoch": 2.3170544617744824, + "grad_norm": 25.920886398089323, + "learning_rate": 1.4976657325443028e-06, + "loss": 0.9915, + "step": 27186 + }, + { + "epoch": 2.3171396914685074, + "grad_norm": 43.2507362071562, + "learning_rate": 1.4973118706434675e-06, + "loss": 1.2816, + "step": 27187 + }, + { + "epoch": 2.317224921162533, + "grad_norm": 62.19699087582816, + "learning_rate": 1.4969580431901788e-06, + "loss": 1.4705, + "step": 27188 + }, + { + "epoch": 2.3173101508565583, + "grad_norm": 63.22149700627674, + "learning_rate": 1.4966042501879153e-06, + "loss": 2.0422, + "step": 27189 + }, + { + "epoch": 2.317395380550584, + "grad_norm": 34.19808030827754, + "learning_rate": 1.4962504916401566e-06, + "loss": 0.9297, + "step": 27190 + }, + { + "epoch": 2.3174806102446093, + "grad_norm": 21.20625817682332, + "learning_rate": 1.495896767550381e-06, + "loss": 1.1339, + "step": 27191 + }, + { + "epoch": 2.3175658399386347, + "grad_norm": 33.234942150027585, + "learning_rate": 1.4955430779220697e-06, + "loss": 0.8698, + "step": 27192 + }, + { + "epoch": 2.31765106963266, + "grad_norm": 34.7396433123582, + "learning_rate": 1.4951894227586976e-06, + "loss": 1.1548, + "step": 27193 + }, + { + "epoch": 2.3177362993266852, + "grad_norm": 73.88187928272413, + "learning_rate": 1.4948358020637472e-06, + "loss": 2.1005, + "step": 27194 + }, + { + "epoch": 2.3178215290207107, + "grad_norm": 31.657404190867982, + "learning_rate": 1.4944822158406919e-06, + "loss": 0.9138, + "step": 27195 + }, + { + "epoch": 2.317906758714736, + "grad_norm": 38.22376570516571, + "learning_rate": 1.4941286640930125e-06, + "loss": 1.0716, + "step": 27196 + }, + { + "epoch": 2.3179919884087616, + "grad_norm": 63.402247454479344, + "learning_rate": 1.4937751468241851e-06, + "loss": 1.2461, + "step": 27197 + }, + { + "epoch": 2.318077218102787, + "grad_norm": 26.159864947507177, + "learning_rate": 1.4934216640376853e-06, + "loss": 0.6296, + "step": 27198 + }, + { + "epoch": 2.3181624477968126, + "grad_norm": 72.32473298636094, + "learning_rate": 1.4930682157369891e-06, + "loss": 1.9561, + "step": 27199 + }, + { + "epoch": 2.318247677490838, + "grad_norm": 56.98605025947681, + "learning_rate": 1.4927148019255744e-06, + "loss": 2.0757, + "step": 27200 + }, + { + "epoch": 2.318332907184863, + "grad_norm": 65.99697256463419, + "learning_rate": 1.492361422606915e-06, + "loss": 2.1351, + "step": 27201 + }, + { + "epoch": 2.3184181368788885, + "grad_norm": 58.657563165833146, + "learning_rate": 1.4920080777844887e-06, + "loss": 2.1523, + "step": 27202 + }, + { + "epoch": 2.318503366572914, + "grad_norm": 43.87585808522034, + "learning_rate": 1.491654767461767e-06, + "loss": 1.563, + "step": 27203 + }, + { + "epoch": 2.3185885962669395, + "grad_norm": 30.07122870838786, + "learning_rate": 1.4913014916422287e-06, + "loss": 0.8544, + "step": 27204 + }, + { + "epoch": 2.318673825960965, + "grad_norm": 43.82459923002794, + "learning_rate": 1.4909482503293454e-06, + "loss": 1.454, + "step": 27205 + }, + { + "epoch": 2.31875905565499, + "grad_norm": 59.550082868702326, + "learning_rate": 1.490595043526592e-06, + "loss": 1.5991, + "step": 27206 + }, + { + "epoch": 2.3188442853490154, + "grad_norm": 40.23162968877397, + "learning_rate": 1.4902418712374416e-06, + "loss": 1.3848, + "step": 27207 + }, + { + "epoch": 2.318929515043041, + "grad_norm": 74.60370566497097, + "learning_rate": 1.4898887334653667e-06, + "loss": 1.7833, + "step": 27208 + }, + { + "epoch": 2.3190147447370664, + "grad_norm": 56.6892911754741, + "learning_rate": 1.4895356302138408e-06, + "loss": 1.58, + "step": 27209 + }, + { + "epoch": 2.319099974431092, + "grad_norm": 82.59925329189409, + "learning_rate": 1.4891825614863392e-06, + "loss": 2.0026, + "step": 27210 + }, + { + "epoch": 2.3191852041251173, + "grad_norm": 21.296095079443415, + "learning_rate": 1.4888295272863318e-06, + "loss": 0.7486, + "step": 27211 + }, + { + "epoch": 2.3192704338191428, + "grad_norm": 83.4579446497841, + "learning_rate": 1.4884765276172891e-06, + "loss": 2.006, + "step": 27212 + }, + { + "epoch": 2.319355663513168, + "grad_norm": 19.33516616056603, + "learning_rate": 1.4881235624826863e-06, + "loss": 0.6876, + "step": 27213 + }, + { + "epoch": 2.3194408932071933, + "grad_norm": 53.985134432913114, + "learning_rate": 1.4877706318859925e-06, + "loss": 1.1975, + "step": 27214 + }, + { + "epoch": 2.3195261229012187, + "grad_norm": 39.35686685864451, + "learning_rate": 1.4874177358306784e-06, + "loss": 1.3672, + "step": 27215 + }, + { + "epoch": 2.319611352595244, + "grad_norm": 112.86036090248469, + "learning_rate": 1.4870648743202142e-06, + "loss": 1.998, + "step": 27216 + }, + { + "epoch": 2.3196965822892697, + "grad_norm": 61.31521522169442, + "learning_rate": 1.486712047358071e-06, + "loss": 1.47, + "step": 27217 + }, + { + "epoch": 2.319781811983295, + "grad_norm": 40.49303443463122, + "learning_rate": 1.4863592549477203e-06, + "loss": 0.8792, + "step": 27218 + }, + { + "epoch": 2.3198670416773206, + "grad_norm": 35.61518811323199, + "learning_rate": 1.4860064970926303e-06, + "loss": 0.7926, + "step": 27219 + }, + { + "epoch": 2.3199522713713456, + "grad_norm": 73.73145803878657, + "learning_rate": 1.4856537737962696e-06, + "loss": 1.9411, + "step": 27220 + }, + { + "epoch": 2.320037501065371, + "grad_norm": 45.4808005038028, + "learning_rate": 1.4853010850621064e-06, + "loss": 1.288, + "step": 27221 + }, + { + "epoch": 2.3201227307593966, + "grad_norm": 36.692983270738644, + "learning_rate": 1.4849484308936118e-06, + "loss": 1.4076, + "step": 27222 + }, + { + "epoch": 2.320207960453422, + "grad_norm": 55.65130930906686, + "learning_rate": 1.4845958112942528e-06, + "loss": 1.6834, + "step": 27223 + }, + { + "epoch": 2.3202931901474475, + "grad_norm": 34.16165446902381, + "learning_rate": 1.4842432262674956e-06, + "loss": 1.4894, + "step": 27224 + }, + { + "epoch": 2.3203784198414725, + "grad_norm": 68.95460343353912, + "learning_rate": 1.4838906758168092e-06, + "loss": 1.5033, + "step": 27225 + }, + { + "epoch": 2.320463649535498, + "grad_norm": 66.39235102368693, + "learning_rate": 1.4835381599456622e-06, + "loss": 1.9979, + "step": 27226 + }, + { + "epoch": 2.3205488792295235, + "grad_norm": 112.58210984082878, + "learning_rate": 1.4831856786575199e-06, + "loss": 2.4198, + "step": 27227 + }, + { + "epoch": 2.320634108923549, + "grad_norm": 67.78996901816917, + "learning_rate": 1.4828332319558498e-06, + "loss": 1.404, + "step": 27228 + }, + { + "epoch": 2.3207193386175744, + "grad_norm": 34.96741423711698, + "learning_rate": 1.482480819844115e-06, + "loss": 0.881, + "step": 27229 + }, + { + "epoch": 2.3208045683116, + "grad_norm": 44.437867883351764, + "learning_rate": 1.4821284423257854e-06, + "loss": 0.9497, + "step": 27230 + }, + { + "epoch": 2.3208897980056253, + "grad_norm": 32.49391081770223, + "learning_rate": 1.4817760994043246e-06, + "loss": 0.7007, + "step": 27231 + }, + { + "epoch": 2.3209750276996504, + "grad_norm": 85.82704580030524, + "learning_rate": 1.4814237910831963e-06, + "loss": 1.6776, + "step": 27232 + }, + { + "epoch": 2.321060257393676, + "grad_norm": 54.391205133177294, + "learning_rate": 1.4810715173658685e-06, + "loss": 1.6111, + "step": 27233 + }, + { + "epoch": 2.3211454870877013, + "grad_norm": 27.521057932508242, + "learning_rate": 1.4807192782558028e-06, + "loss": 0.6727, + "step": 27234 + }, + { + "epoch": 2.3212307167817268, + "grad_norm": 27.67861807651231, + "learning_rate": 1.4803670737564658e-06, + "loss": 1.2798, + "step": 27235 + }, + { + "epoch": 2.3213159464757522, + "grad_norm": 51.54626250801532, + "learning_rate": 1.4800149038713201e-06, + "loss": 1.668, + "step": 27236 + }, + { + "epoch": 2.3214011761697777, + "grad_norm": 54.83723075053211, + "learning_rate": 1.4796627686038289e-06, + "loss": 2.4427, + "step": 27237 + }, + { + "epoch": 2.321486405863803, + "grad_norm": 58.33911352627417, + "learning_rate": 1.4793106679574543e-06, + "loss": 1.5874, + "step": 27238 + }, + { + "epoch": 2.321571635557828, + "grad_norm": 64.67120042173885, + "learning_rate": 1.4789586019356616e-06, + "loss": 2.1765, + "step": 27239 + }, + { + "epoch": 2.3216568652518537, + "grad_norm": 22.027519298536106, + "learning_rate": 1.4786065705419105e-06, + "loss": 0.6879, + "step": 27240 + }, + { + "epoch": 2.321742094945879, + "grad_norm": 17.585332152686586, + "learning_rate": 1.4782545737796667e-06, + "loss": 0.7477, + "step": 27241 + }, + { + "epoch": 2.3218273246399046, + "grad_norm": 30.504199568874757, + "learning_rate": 1.4779026116523876e-06, + "loss": 1.1643, + "step": 27242 + }, + { + "epoch": 2.32191255433393, + "grad_norm": 47.77877001297525, + "learning_rate": 1.4775506841635378e-06, + "loss": 1.9777, + "step": 27243 + }, + { + "epoch": 2.321997784027955, + "grad_norm": 21.863258265766653, + "learning_rate": 1.4771987913165781e-06, + "loss": 0.813, + "step": 27244 + }, + { + "epoch": 2.3220830137219806, + "grad_norm": 39.11334225964024, + "learning_rate": 1.476846933114968e-06, + "loss": 1.0042, + "step": 27245 + }, + { + "epoch": 2.322168243416006, + "grad_norm": 51.462888037114, + "learning_rate": 1.4764951095621672e-06, + "loss": 1.229, + "step": 27246 + }, + { + "epoch": 2.3222534731100315, + "grad_norm": 36.857117191979654, + "learning_rate": 1.4761433206616381e-06, + "loss": 0.9202, + "step": 27247 + }, + { + "epoch": 2.322338702804057, + "grad_norm": 23.522290286827637, + "learning_rate": 1.4757915664168383e-06, + "loss": 1.4277, + "step": 27248 + }, + { + "epoch": 2.3224239324980824, + "grad_norm": 64.71721487627576, + "learning_rate": 1.4754398468312286e-06, + "loss": 1.7803, + "step": 27249 + }, + { + "epoch": 2.322509162192108, + "grad_norm": 18.077783156975528, + "learning_rate": 1.4750881619082686e-06, + "loss": 0.8952, + "step": 27250 + }, + { + "epoch": 2.322594391886133, + "grad_norm": 96.21110008552043, + "learning_rate": 1.4747365116514135e-06, + "loss": 1.8112, + "step": 27251 + }, + { + "epoch": 2.3226796215801584, + "grad_norm": 43.94369191776717, + "learning_rate": 1.4743848960641256e-06, + "loss": 1.2029, + "step": 27252 + }, + { + "epoch": 2.322764851274184, + "grad_norm": 68.93602417766775, + "learning_rate": 1.4740333151498615e-06, + "loss": 2.0233, + "step": 27253 + }, + { + "epoch": 2.3228500809682093, + "grad_norm": 72.44265683376759, + "learning_rate": 1.4736817689120786e-06, + "loss": 1.7149, + "step": 27254 + }, + { + "epoch": 2.322935310662235, + "grad_norm": 95.29407056187213, + "learning_rate": 1.4733302573542329e-06, + "loss": 2.9412, + "step": 27255 + }, + { + "epoch": 2.3230205403562603, + "grad_norm": 53.53736254212212, + "learning_rate": 1.4729787804797824e-06, + "loss": 1.5021, + "step": 27256 + }, + { + "epoch": 2.3231057700502857, + "grad_norm": 39.78732786922435, + "learning_rate": 1.4726273382921863e-06, + "loss": 1.2367, + "step": 27257 + }, + { + "epoch": 2.3231909997443108, + "grad_norm": 51.788922062668505, + "learning_rate": 1.472275930794898e-06, + "loss": 2.2737, + "step": 27258 + }, + { + "epoch": 2.3232762294383362, + "grad_norm": 32.15620462692654, + "learning_rate": 1.4719245579913727e-06, + "loss": 0.8593, + "step": 27259 + }, + { + "epoch": 2.3233614591323617, + "grad_norm": 51.9936303035568, + "learning_rate": 1.4715732198850696e-06, + "loss": 1.6219, + "step": 27260 + }, + { + "epoch": 2.323446688826387, + "grad_norm": 58.83911561147765, + "learning_rate": 1.471221916479441e-06, + "loss": 1.4212, + "step": 27261 + }, + { + "epoch": 2.3235319185204126, + "grad_norm": 28.828891753111698, + "learning_rate": 1.4708706477779433e-06, + "loss": 0.8599, + "step": 27262 + }, + { + "epoch": 2.323617148214438, + "grad_norm": 54.38613945598585, + "learning_rate": 1.4705194137840285e-06, + "loss": 1.4036, + "step": 27263 + }, + { + "epoch": 2.323702377908463, + "grad_norm": 39.73547366346002, + "learning_rate": 1.4701682145011532e-06, + "loss": 1.3776, + "step": 27264 + }, + { + "epoch": 2.3237876076024886, + "grad_norm": 33.772178248204, + "learning_rate": 1.469817049932772e-06, + "loss": 1.033, + "step": 27265 + }, + { + "epoch": 2.323872837296514, + "grad_norm": 70.05030334880298, + "learning_rate": 1.4694659200823374e-06, + "loss": 1.5148, + "step": 27266 + }, + { + "epoch": 2.3239580669905395, + "grad_norm": 29.922131970565303, + "learning_rate": 1.4691148249533028e-06, + "loss": 1.1105, + "step": 27267 + }, + { + "epoch": 2.324043296684565, + "grad_norm": 76.86403996280927, + "learning_rate": 1.4687637645491193e-06, + "loss": 2.1781, + "step": 27268 + }, + { + "epoch": 2.3241285263785905, + "grad_norm": 48.9207563010791, + "learning_rate": 1.4684127388732423e-06, + "loss": 1.5622, + "step": 27269 + }, + { + "epoch": 2.3242137560726155, + "grad_norm": 61.996258107577326, + "learning_rate": 1.4680617479291225e-06, + "loss": 1.424, + "step": 27270 + }, + { + "epoch": 2.324298985766641, + "grad_norm": 51.75118530959595, + "learning_rate": 1.467710791720211e-06, + "loss": 1.384, + "step": 27271 + }, + { + "epoch": 2.3243842154606664, + "grad_norm": 57.27918159180715, + "learning_rate": 1.4673598702499603e-06, + "loss": 1.1479, + "step": 27272 + }, + { + "epoch": 2.324469445154692, + "grad_norm": 58.44397193865177, + "learning_rate": 1.4670089835218227e-06, + "loss": 1.9356, + "step": 27273 + }, + { + "epoch": 2.3245546748487174, + "grad_norm": 22.02793041319727, + "learning_rate": 1.466658131539248e-06, + "loss": 0.6399, + "step": 27274 + }, + { + "epoch": 2.324639904542743, + "grad_norm": 53.76465159566535, + "learning_rate": 1.4663073143056867e-06, + "loss": 1.162, + "step": 27275 + }, + { + "epoch": 2.3247251342367683, + "grad_norm": 33.582471441316535, + "learning_rate": 1.4659565318245866e-06, + "loss": 0.8555, + "step": 27276 + }, + { + "epoch": 2.3248103639307933, + "grad_norm": 34.972264108528464, + "learning_rate": 1.465605784099402e-06, + "loss": 1.6842, + "step": 27277 + }, + { + "epoch": 2.324895593624819, + "grad_norm": 48.85840360821381, + "learning_rate": 1.46525507113358e-06, + "loss": 1.4709, + "step": 27278 + }, + { + "epoch": 2.3249808233188443, + "grad_norm": 18.139512280791838, + "learning_rate": 1.4649043929305678e-06, + "loss": 0.6933, + "step": 27279 + }, + { + "epoch": 2.3250660530128697, + "grad_norm": 11.154953767661665, + "learning_rate": 1.4645537494938183e-06, + "loss": 0.1984, + "step": 27280 + }, + { + "epoch": 2.325151282706895, + "grad_norm": 30.01739846936111, + "learning_rate": 1.464203140826776e-06, + "loss": 1.139, + "step": 27281 + }, + { + "epoch": 2.3252365124009207, + "grad_norm": 71.41428380881993, + "learning_rate": 1.4638525669328924e-06, + "loss": 1.0451, + "step": 27282 + }, + { + "epoch": 2.3253217420949457, + "grad_norm": 27.846844279354315, + "learning_rate": 1.4635020278156136e-06, + "loss": 0.717, + "step": 27283 + }, + { + "epoch": 2.325406971788971, + "grad_norm": 45.33942588960239, + "learning_rate": 1.463151523478387e-06, + "loss": 1.1751, + "step": 27284 + }, + { + "epoch": 2.3254922014829966, + "grad_norm": 57.432433090620236, + "learning_rate": 1.4628010539246583e-06, + "loss": 1.323, + "step": 27285 + }, + { + "epoch": 2.325577431177022, + "grad_norm": 50.704262902664084, + "learning_rate": 1.462450619157878e-06, + "loss": 1.64, + "step": 27286 + }, + { + "epoch": 2.3256626608710476, + "grad_norm": 36.78689248109289, + "learning_rate": 1.4621002191814876e-06, + "loss": 1.609, + "step": 27287 + }, + { + "epoch": 2.325747890565073, + "grad_norm": 56.59686173161369, + "learning_rate": 1.4617498539989377e-06, + "loss": 1.759, + "step": 27288 + }, + { + "epoch": 2.325833120259098, + "grad_norm": 83.42575757954653, + "learning_rate": 1.4613995236136702e-06, + "loss": 1.9341, + "step": 27289 + }, + { + "epoch": 2.3259183499531235, + "grad_norm": 26.93851888890058, + "learning_rate": 1.4610492280291345e-06, + "loss": 0.8855, + "step": 27290 + }, + { + "epoch": 2.326003579647149, + "grad_norm": 88.09609395844235, + "learning_rate": 1.4606989672487725e-06, + "loss": 2.6763, + "step": 27291 + }, + { + "epoch": 2.3260888093411745, + "grad_norm": 60.342150842910335, + "learning_rate": 1.4603487412760304e-06, + "loss": 0.9735, + "step": 27292 + }, + { + "epoch": 2.3261740390352, + "grad_norm": 54.561378826487584, + "learning_rate": 1.4599985501143516e-06, + "loss": 1.2267, + "step": 27293 + }, + { + "epoch": 2.3262592687292254, + "grad_norm": 80.79786558987169, + "learning_rate": 1.459648393767179e-06, + "loss": 1.5794, + "step": 27294 + }, + { + "epoch": 2.326344498423251, + "grad_norm": 46.19295581780609, + "learning_rate": 1.4592982722379578e-06, + "loss": 1.6607, + "step": 27295 + }, + { + "epoch": 2.326429728117276, + "grad_norm": 40.72790591739043, + "learning_rate": 1.4589481855301324e-06, + "loss": 1.1501, + "step": 27296 + }, + { + "epoch": 2.3265149578113014, + "grad_norm": 33.60778301417965, + "learning_rate": 1.4585981336471444e-06, + "loss": 1.0125, + "step": 27297 + }, + { + "epoch": 2.326600187505327, + "grad_norm": 54.13514178396575, + "learning_rate": 1.4582481165924355e-06, + "loss": 1.3388, + "step": 27298 + }, + { + "epoch": 2.3266854171993523, + "grad_norm": 71.68476088645858, + "learning_rate": 1.4578981343694504e-06, + "loss": 2.0112, + "step": 27299 + }, + { + "epoch": 2.3267706468933778, + "grad_norm": 53.32984738073466, + "learning_rate": 1.4575481869816294e-06, + "loss": 1.536, + "step": 27300 + }, + { + "epoch": 2.3268558765874032, + "grad_norm": 63.713233029222614, + "learning_rate": 1.4571982744324149e-06, + "loss": 1.4602, + "step": 27301 + }, + { + "epoch": 2.3269411062814287, + "grad_norm": 49.30805313076074, + "learning_rate": 1.4568483967252457e-06, + "loss": 1.488, + "step": 27302 + }, + { + "epoch": 2.3270263359754537, + "grad_norm": 57.8974937976966, + "learning_rate": 1.456498553863564e-06, + "loss": 1.4063, + "step": 27303 + }, + { + "epoch": 2.327111565669479, + "grad_norm": 41.31181855990152, + "learning_rate": 1.4561487458508133e-06, + "loss": 0.8448, + "step": 27304 + }, + { + "epoch": 2.3271967953635047, + "grad_norm": 54.29034989277412, + "learning_rate": 1.4557989726904316e-06, + "loss": 1.5663, + "step": 27305 + }, + { + "epoch": 2.32728202505753, + "grad_norm": 20.749697714573557, + "learning_rate": 1.4554492343858578e-06, + "loss": 1.0651, + "step": 27306 + }, + { + "epoch": 2.3273672547515556, + "grad_norm": 64.22309392975757, + "learning_rate": 1.4550995309405315e-06, + "loss": 1.4739, + "step": 27307 + }, + { + "epoch": 2.3274524844455806, + "grad_norm": 63.78523772349001, + "learning_rate": 1.4547498623578943e-06, + "loss": 1.3019, + "step": 27308 + }, + { + "epoch": 2.327537714139606, + "grad_norm": 56.15344726363218, + "learning_rate": 1.4544002286413833e-06, + "loss": 1.2744, + "step": 27309 + }, + { + "epoch": 2.3276229438336316, + "grad_norm": 83.23388461358259, + "learning_rate": 1.454050629794435e-06, + "loss": 2.0602, + "step": 27310 + }, + { + "epoch": 2.327708173527657, + "grad_norm": 79.92542666928699, + "learning_rate": 1.4537010658204909e-06, + "loss": 1.5041, + "step": 27311 + }, + { + "epoch": 2.3277934032216825, + "grad_norm": 79.9338241827064, + "learning_rate": 1.4533515367229882e-06, + "loss": 1.7816, + "step": 27312 + }, + { + "epoch": 2.327878632915708, + "grad_norm": 54.751087005645665, + "learning_rate": 1.4530020425053643e-06, + "loss": 1.2827, + "step": 27313 + }, + { + "epoch": 2.3279638626097334, + "grad_norm": 34.535418968112985, + "learning_rate": 1.452652583171056e-06, + "loss": 1.3742, + "step": 27314 + }, + { + "epoch": 2.3280490923037584, + "grad_norm": 56.25441105185017, + "learning_rate": 1.4523031587234981e-06, + "loss": 2.3008, + "step": 27315 + }, + { + "epoch": 2.328134321997784, + "grad_norm": 46.56054485450483, + "learning_rate": 1.4519537691661306e-06, + "loss": 1.2826, + "step": 27316 + }, + { + "epoch": 2.3282195516918094, + "grad_norm": 60.395691180914916, + "learning_rate": 1.4516044145023877e-06, + "loss": 1.6631, + "step": 27317 + }, + { + "epoch": 2.328304781385835, + "grad_norm": 41.47092921769708, + "learning_rate": 1.451255094735704e-06, + "loss": 1.3027, + "step": 27318 + }, + { + "epoch": 2.3283900110798603, + "grad_norm": 36.31529537396456, + "learning_rate": 1.4509058098695168e-06, + "loss": 0.8968, + "step": 27319 + }, + { + "epoch": 2.328475240773886, + "grad_norm": 24.937330995733625, + "learning_rate": 1.4505565599072619e-06, + "loss": 1.2582, + "step": 27320 + }, + { + "epoch": 2.3285604704679113, + "grad_norm": 57.20496007636806, + "learning_rate": 1.4502073448523723e-06, + "loss": 1.508, + "step": 27321 + }, + { + "epoch": 2.3286457001619363, + "grad_norm": 71.24961400124685, + "learning_rate": 1.4498581647082838e-06, + "loss": 1.9545, + "step": 27322 + }, + { + "epoch": 2.3287309298559618, + "grad_norm": 39.843178287480434, + "learning_rate": 1.4495090194784288e-06, + "loss": 1.2893, + "step": 27323 + }, + { + "epoch": 2.328816159549987, + "grad_norm": 41.370077773809925, + "learning_rate": 1.4491599091662406e-06, + "loss": 1.2019, + "step": 27324 + }, + { + "epoch": 2.3289013892440127, + "grad_norm": 33.37136973032529, + "learning_rate": 1.448810833775155e-06, + "loss": 1.0033, + "step": 27325 + }, + { + "epoch": 2.328986618938038, + "grad_norm": 66.1194717134735, + "learning_rate": 1.4484617933086027e-06, + "loss": 1.6901, + "step": 27326 + }, + { + "epoch": 2.329071848632063, + "grad_norm": 50.99338323561232, + "learning_rate": 1.4481127877700184e-06, + "loss": 1.3281, + "step": 27327 + }, + { + "epoch": 2.3291570783260886, + "grad_norm": 57.297610459853615, + "learning_rate": 1.4477638171628322e-06, + "loss": 2.0113, + "step": 27328 + }, + { + "epoch": 2.329242308020114, + "grad_norm": 60.38443109621492, + "learning_rate": 1.4474148814904793e-06, + "loss": 1.2926, + "step": 27329 + }, + { + "epoch": 2.3293275377141396, + "grad_norm": 47.94983350868135, + "learning_rate": 1.4470659807563887e-06, + "loss": 1.4385, + "step": 27330 + }, + { + "epoch": 2.329412767408165, + "grad_norm": 44.280403222081475, + "learning_rate": 1.4467171149639924e-06, + "loss": 1.92, + "step": 27331 + }, + { + "epoch": 2.3294979971021905, + "grad_norm": 54.22174129135057, + "learning_rate": 1.4463682841167203e-06, + "loss": 1.484, + "step": 27332 + }, + { + "epoch": 2.329583226796216, + "grad_norm": 59.779322398313916, + "learning_rate": 1.446019488218005e-06, + "loss": 1.3781, + "step": 27333 + }, + { + "epoch": 2.329668456490241, + "grad_norm": 20.781650592493627, + "learning_rate": 1.445670727271275e-06, + "loss": 0.6053, + "step": 27334 + }, + { + "epoch": 2.3297536861842665, + "grad_norm": 24.974014598220215, + "learning_rate": 1.4453220012799618e-06, + "loss": 0.9635, + "step": 27335 + }, + { + "epoch": 2.329838915878292, + "grad_norm": 62.963277380752935, + "learning_rate": 1.444973310247495e-06, + "loss": 2.3916, + "step": 27336 + }, + { + "epoch": 2.3299241455723174, + "grad_norm": 30.269986424754002, + "learning_rate": 1.4446246541773007e-06, + "loss": 0.9976, + "step": 27337 + }, + { + "epoch": 2.330009375266343, + "grad_norm": 28.06844890864658, + "learning_rate": 1.4442760330728123e-06, + "loss": 1.4563, + "step": 27338 + }, + { + "epoch": 2.3300946049603684, + "grad_norm": 77.17461211280636, + "learning_rate": 1.443927446937456e-06, + "loss": 1.0352, + "step": 27339 + }, + { + "epoch": 2.330179834654394, + "grad_norm": 62.5510304656617, + "learning_rate": 1.4435788957746606e-06, + "loss": 1.7021, + "step": 27340 + }, + { + "epoch": 2.330265064348419, + "grad_norm": 119.97807641403803, + "learning_rate": 1.4432303795878517e-06, + "loss": 2.5326, + "step": 27341 + }, + { + "epoch": 2.3303502940424443, + "grad_norm": 42.047500028195714, + "learning_rate": 1.4428818983804582e-06, + "loss": 2.0208, + "step": 27342 + }, + { + "epoch": 2.33043552373647, + "grad_norm": 70.3197079146889, + "learning_rate": 1.4425334521559097e-06, + "loss": 1.4588, + "step": 27343 + }, + { + "epoch": 2.3305207534304953, + "grad_norm": 44.14464683795428, + "learning_rate": 1.4421850409176313e-06, + "loss": 0.7258, + "step": 27344 + }, + { + "epoch": 2.3306059831245207, + "grad_norm": 76.14284072129786, + "learning_rate": 1.441836664669048e-06, + "loss": 1.9837, + "step": 27345 + }, + { + "epoch": 2.3306912128185457, + "grad_norm": 77.59753833651037, + "learning_rate": 1.441488323413588e-06, + "loss": 1.9179, + "step": 27346 + }, + { + "epoch": 2.330776442512571, + "grad_norm": 66.07558800322491, + "learning_rate": 1.441140017154677e-06, + "loss": 2.0656, + "step": 27347 + }, + { + "epoch": 2.3308616722065967, + "grad_norm": 42.61042772422515, + "learning_rate": 1.4407917458957393e-06, + "loss": 1.4166, + "step": 27348 + }, + { + "epoch": 2.330946901900622, + "grad_norm": 77.32814990128945, + "learning_rate": 1.4404435096401991e-06, + "loss": 1.481, + "step": 27349 + }, + { + "epoch": 2.3310321315946476, + "grad_norm": 56.711168972294026, + "learning_rate": 1.4400953083914827e-06, + "loss": 1.5333, + "step": 27350 + }, + { + "epoch": 2.331117361288673, + "grad_norm": 87.77590648821375, + "learning_rate": 1.4397471421530157e-06, + "loss": 2.2596, + "step": 27351 + }, + { + "epoch": 2.3312025909826986, + "grad_norm": 32.93622079558633, + "learning_rate": 1.4393990109282214e-06, + "loss": 1.2771, + "step": 27352 + }, + { + "epoch": 2.3312878206767236, + "grad_norm": 28.652360359190638, + "learning_rate": 1.4390509147205224e-06, + "loss": 0.9072, + "step": 27353 + }, + { + "epoch": 2.331373050370749, + "grad_norm": 31.69359894065449, + "learning_rate": 1.4387028535333415e-06, + "loss": 1.1399, + "step": 27354 + }, + { + "epoch": 2.3314582800647745, + "grad_norm": 78.13110106067455, + "learning_rate": 1.4383548273701036e-06, + "loss": 1.3961, + "step": 27355 + }, + { + "epoch": 2.3315435097588, + "grad_norm": 79.07996988959621, + "learning_rate": 1.4380068362342314e-06, + "loss": 2.5921, + "step": 27356 + }, + { + "epoch": 2.3316287394528254, + "grad_norm": 37.637557534646746, + "learning_rate": 1.4376588801291451e-06, + "loss": 1.5141, + "step": 27357 + }, + { + "epoch": 2.331713969146851, + "grad_norm": 43.35831069898331, + "learning_rate": 1.4373109590582685e-06, + "loss": 1.0214, + "step": 27358 + }, + { + "epoch": 2.3317991988408764, + "grad_norm": 49.97404675039607, + "learning_rate": 1.4369630730250238e-06, + "loss": 1.4717, + "step": 27359 + }, + { + "epoch": 2.3318844285349014, + "grad_norm": 24.871245061796678, + "learning_rate": 1.4366152220328317e-06, + "loss": 1.3048, + "step": 27360 + }, + { + "epoch": 2.331969658228927, + "grad_norm": 38.00340260775132, + "learning_rate": 1.4362674060851129e-06, + "loss": 1.2163, + "step": 27361 + }, + { + "epoch": 2.3320548879229523, + "grad_norm": 65.92129347035791, + "learning_rate": 1.4359196251852864e-06, + "loss": 2.1221, + "step": 27362 + }, + { + "epoch": 2.332140117616978, + "grad_norm": 39.64059112833311, + "learning_rate": 1.4355718793367763e-06, + "loss": 1.2081, + "step": 27363 + }, + { + "epoch": 2.3322253473110033, + "grad_norm": 55.18715086185222, + "learning_rate": 1.4352241685429996e-06, + "loss": 1.5658, + "step": 27364 + }, + { + "epoch": 2.3323105770050283, + "grad_norm": 67.36811351750681, + "learning_rate": 1.4348764928073756e-06, + "loss": 1.3576, + "step": 27365 + }, + { + "epoch": 2.3323958066990538, + "grad_norm": 45.339702038092014, + "learning_rate": 1.4345288521333262e-06, + "loss": 1.1107, + "step": 27366 + }, + { + "epoch": 2.3324810363930792, + "grad_norm": 22.21530993756247, + "learning_rate": 1.4341812465242671e-06, + "loss": 0.5553, + "step": 27367 + }, + { + "epoch": 2.3325662660871047, + "grad_norm": 25.848849393734916, + "learning_rate": 1.4338336759836202e-06, + "loss": 0.8676, + "step": 27368 + }, + { + "epoch": 2.33265149578113, + "grad_norm": 41.109598252462845, + "learning_rate": 1.433486140514802e-06, + "loss": 1.4305, + "step": 27369 + }, + { + "epoch": 2.3327367254751556, + "grad_norm": 22.29019601650405, + "learning_rate": 1.4331386401212304e-06, + "loss": 0.7455, + "step": 27370 + }, + { + "epoch": 2.332821955169181, + "grad_norm": 29.500078643314886, + "learning_rate": 1.4327911748063216e-06, + "loss": 1.0841, + "step": 27371 + }, + { + "epoch": 2.332907184863206, + "grad_norm": 86.89810027121865, + "learning_rate": 1.4324437445734952e-06, + "loss": 1.559, + "step": 27372 + }, + { + "epoch": 2.3329924145572316, + "grad_norm": 45.178480713174466, + "learning_rate": 1.4320963494261658e-06, + "loss": 1.3386, + "step": 27373 + }, + { + "epoch": 2.333077644251257, + "grad_norm": 45.375045048526886, + "learning_rate": 1.4317489893677522e-06, + "loss": 0.8959, + "step": 27374 + }, + { + "epoch": 2.3331628739452825, + "grad_norm": 84.91730769250474, + "learning_rate": 1.4314016644016682e-06, + "loss": 2.174, + "step": 27375 + }, + { + "epoch": 2.333248103639308, + "grad_norm": 40.84467953698352, + "learning_rate": 1.4310543745313326e-06, + "loss": 1.1755, + "step": 27376 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 46.879466620417524, + "learning_rate": 1.4307071197601585e-06, + "loss": 1.2243, + "step": 27377 + }, + { + "epoch": 2.333418563027359, + "grad_norm": 42.355831314407, + "learning_rate": 1.4303599000915618e-06, + "loss": 1.6171, + "step": 27378 + }, + { + "epoch": 2.333503792721384, + "grad_norm": 67.24838485932095, + "learning_rate": 1.4300127155289572e-06, + "loss": 1.524, + "step": 27379 + }, + { + "epoch": 2.3335890224154094, + "grad_norm": 30.598369244703402, + "learning_rate": 1.4296655660757574e-06, + "loss": 1.1391, + "step": 27380 + }, + { + "epoch": 2.333674252109435, + "grad_norm": 33.00593290311537, + "learning_rate": 1.429318451735378e-06, + "loss": 0.9969, + "step": 27381 + }, + { + "epoch": 2.3337594818034604, + "grad_norm": 41.98007511051679, + "learning_rate": 1.4289713725112343e-06, + "loss": 1.6745, + "step": 27382 + }, + { + "epoch": 2.333844711497486, + "grad_norm": 88.35244706338928, + "learning_rate": 1.428624328406738e-06, + "loss": 2.296, + "step": 27383 + }, + { + "epoch": 2.3339299411915113, + "grad_norm": 46.880375620613606, + "learning_rate": 1.4282773194253007e-06, + "loss": 1.7925, + "step": 27384 + }, + { + "epoch": 2.3340151708855363, + "grad_norm": 71.6519901108641, + "learning_rate": 1.4279303455703386e-06, + "loss": 2.2558, + "step": 27385 + }, + { + "epoch": 2.334100400579562, + "grad_norm": 67.38535362415224, + "learning_rate": 1.427583406845262e-06, + "loss": 1.5428, + "step": 27386 + }, + { + "epoch": 2.3341856302735873, + "grad_norm": 30.33082866508844, + "learning_rate": 1.427236503253483e-06, + "loss": 1.1761, + "step": 27387 + }, + { + "epoch": 2.3342708599676127, + "grad_norm": 25.974699197804338, + "learning_rate": 1.4268896347984117e-06, + "loss": 1.3529, + "step": 27388 + }, + { + "epoch": 2.334356089661638, + "grad_norm": 32.71551549833169, + "learning_rate": 1.4265428014834614e-06, + "loss": 0.7791, + "step": 27389 + }, + { + "epoch": 2.3344413193556637, + "grad_norm": 77.11335289582715, + "learning_rate": 1.4261960033120438e-06, + "loss": 2.1098, + "step": 27390 + }, + { + "epoch": 2.3345265490496887, + "grad_norm": 41.22337788794912, + "learning_rate": 1.4258492402875684e-06, + "loss": 1.5475, + "step": 27391 + }, + { + "epoch": 2.334611778743714, + "grad_norm": 47.8704479025835, + "learning_rate": 1.4255025124134441e-06, + "loss": 1.4096, + "step": 27392 + }, + { + "epoch": 2.3346970084377396, + "grad_norm": 35.87499995212634, + "learning_rate": 1.4251558196930844e-06, + "loss": 1.1804, + "step": 27393 + }, + { + "epoch": 2.334782238131765, + "grad_norm": 30.919602835149796, + "learning_rate": 1.424809162129896e-06, + "loss": 0.4151, + "step": 27394 + }, + { + "epoch": 2.3348674678257906, + "grad_norm": 62.99198356583303, + "learning_rate": 1.4244625397272888e-06, + "loss": 1.6794, + "step": 27395 + }, + { + "epoch": 2.334952697519816, + "grad_norm": 52.90811130334121, + "learning_rate": 1.4241159524886706e-06, + "loss": 1.2667, + "step": 27396 + }, + { + "epoch": 2.3350379272138415, + "grad_norm": 99.25225143485036, + "learning_rate": 1.4237694004174506e-06, + "loss": 1.7117, + "step": 27397 + }, + { + "epoch": 2.3351231569078665, + "grad_norm": 49.55380339853658, + "learning_rate": 1.4234228835170394e-06, + "loss": 1.0313, + "step": 27398 + }, + { + "epoch": 2.335208386601892, + "grad_norm": 35.47266208521833, + "learning_rate": 1.4230764017908427e-06, + "loss": 0.908, + "step": 27399 + }, + { + "epoch": 2.3352936162959175, + "grad_norm": 79.3403718081207, + "learning_rate": 1.4227299552422684e-06, + "loss": 1.5888, + "step": 27400 + }, + { + "epoch": 2.335378845989943, + "grad_norm": 48.89189505616802, + "learning_rate": 1.4223835438747219e-06, + "loss": 1.6505, + "step": 27401 + }, + { + "epoch": 2.3354640756839684, + "grad_norm": 70.87555109977532, + "learning_rate": 1.4220371676916133e-06, + "loss": 1.8459, + "step": 27402 + }, + { + "epoch": 2.335549305377994, + "grad_norm": 29.165929059651944, + "learning_rate": 1.4216908266963475e-06, + "loss": 1.124, + "step": 27403 + }, + { + "epoch": 2.335634535072019, + "grad_norm": 28.06078735487207, + "learning_rate": 1.4213445208923289e-06, + "loss": 0.6209, + "step": 27404 + }, + { + "epoch": 2.3357197647660444, + "grad_norm": 44.57201054204813, + "learning_rate": 1.420998250282965e-06, + "loss": 1.4085, + "step": 27405 + }, + { + "epoch": 2.33580499446007, + "grad_norm": 42.742880435779725, + "learning_rate": 1.4206520148716623e-06, + "loss": 1.3826, + "step": 27406 + }, + { + "epoch": 2.3358902241540953, + "grad_norm": 19.69271036811047, + "learning_rate": 1.4203058146618253e-06, + "loss": 0.7369, + "step": 27407 + }, + { + "epoch": 2.3359754538481208, + "grad_norm": 39.258367189312814, + "learning_rate": 1.419959649656858e-06, + "loss": 1.3176, + "step": 27408 + }, + { + "epoch": 2.3360606835421462, + "grad_norm": 34.975290503853415, + "learning_rate": 1.4196135198601647e-06, + "loss": 1.2935, + "step": 27409 + }, + { + "epoch": 2.3361459132361713, + "grad_norm": 68.7519731188288, + "learning_rate": 1.419267425275148e-06, + "loss": 2.322, + "step": 27410 + }, + { + "epoch": 2.3362311429301967, + "grad_norm": 34.91442570932882, + "learning_rate": 1.4189213659052158e-06, + "loss": 1.0699, + "step": 27411 + }, + { + "epoch": 2.336316372624222, + "grad_norm": 55.219376274728205, + "learning_rate": 1.4185753417537669e-06, + "loss": 1.6061, + "step": 27412 + }, + { + "epoch": 2.3364016023182477, + "grad_norm": 31.23960541970274, + "learning_rate": 1.418229352824208e-06, + "loss": 1.1012, + "step": 27413 + }, + { + "epoch": 2.336486832012273, + "grad_norm": 42.78485752738988, + "learning_rate": 1.4178833991199382e-06, + "loss": 1.0126, + "step": 27414 + }, + { + "epoch": 2.3365720617062986, + "grad_norm": 53.55906426443876, + "learning_rate": 1.4175374806443638e-06, + "loss": 1.6526, + "step": 27415 + }, + { + "epoch": 2.336657291400324, + "grad_norm": 40.303943022719984, + "learning_rate": 1.4171915974008849e-06, + "loss": 1.3496, + "step": 27416 + }, + { + "epoch": 2.336742521094349, + "grad_norm": 194.95290600405485, + "learning_rate": 1.4168457493929021e-06, + "loss": 1.7367, + "step": 27417 + }, + { + "epoch": 2.3368277507883746, + "grad_norm": 40.35920638740449, + "learning_rate": 1.416499936623817e-06, + "loss": 1.4329, + "step": 27418 + }, + { + "epoch": 2.3369129804824, + "grad_norm": 54.828243917060014, + "learning_rate": 1.4161541590970318e-06, + "loss": 1.9566, + "step": 27419 + }, + { + "epoch": 2.3369982101764255, + "grad_norm": 46.71722121811661, + "learning_rate": 1.4158084168159452e-06, + "loss": 1.7597, + "step": 27420 + }, + { + "epoch": 2.337083439870451, + "grad_norm": 32.62430300436707, + "learning_rate": 1.4154627097839607e-06, + "loss": 1.551, + "step": 27421 + }, + { + "epoch": 2.3371686695644764, + "grad_norm": 42.94099615074005, + "learning_rate": 1.4151170380044738e-06, + "loss": 1.229, + "step": 27422 + }, + { + "epoch": 2.337253899258502, + "grad_norm": 47.97947508979184, + "learning_rate": 1.4147714014808882e-06, + "loss": 1.2854, + "step": 27423 + }, + { + "epoch": 2.337339128952527, + "grad_norm": 19.974066957210283, + "learning_rate": 1.414425800216601e-06, + "loss": 0.7661, + "step": 27424 + }, + { + "epoch": 2.3374243586465524, + "grad_norm": 59.30033120797899, + "learning_rate": 1.4140802342150117e-06, + "loss": 2.1484, + "step": 27425 + }, + { + "epoch": 2.337509588340578, + "grad_norm": 28.193458369614337, + "learning_rate": 1.413734703479518e-06, + "loss": 1.0178, + "step": 27426 + }, + { + "epoch": 2.3375948180346033, + "grad_norm": 30.982820693477823, + "learning_rate": 1.413389208013517e-06, + "loss": 0.9863, + "step": 27427 + }, + { + "epoch": 2.337680047728629, + "grad_norm": 36.50051859452894, + "learning_rate": 1.4130437478204078e-06, + "loss": 0.9094, + "step": 27428 + }, + { + "epoch": 2.337765277422654, + "grad_norm": 71.98167558769846, + "learning_rate": 1.41269832290359e-06, + "loss": 2.0753, + "step": 27429 + }, + { + "epoch": 2.3378505071166793, + "grad_norm": 55.8836697049432, + "learning_rate": 1.4123529332664582e-06, + "loss": 1.2803, + "step": 27430 + }, + { + "epoch": 2.3379357368107048, + "grad_norm": 82.79107771674039, + "learning_rate": 1.4120075789124088e-06, + "loss": 2.4081, + "step": 27431 + }, + { + "epoch": 2.3380209665047302, + "grad_norm": 61.4341780919456, + "learning_rate": 1.4116622598448404e-06, + "loss": 1.7456, + "step": 27432 + }, + { + "epoch": 2.3381061961987557, + "grad_norm": 92.2025926349849, + "learning_rate": 1.4113169760671474e-06, + "loss": 2.1303, + "step": 27433 + }, + { + "epoch": 2.338191425892781, + "grad_norm": 32.10330671698621, + "learning_rate": 1.4109717275827262e-06, + "loss": 1.2237, + "step": 27434 + }, + { + "epoch": 2.3382766555868066, + "grad_norm": 17.299855897897956, + "learning_rate": 1.4106265143949705e-06, + "loss": 0.5391, + "step": 27435 + }, + { + "epoch": 2.3383618852808317, + "grad_norm": 27.670896330104565, + "learning_rate": 1.4102813365072772e-06, + "loss": 0.9608, + "step": 27436 + }, + { + "epoch": 2.338447114974857, + "grad_norm": 69.7364998996721, + "learning_rate": 1.409936193923041e-06, + "loss": 2.14, + "step": 27437 + }, + { + "epoch": 2.3385323446688826, + "grad_norm": 59.692941559998175, + "learning_rate": 1.4095910866456569e-06, + "loss": 2.3934, + "step": 27438 + }, + { + "epoch": 2.338617574362908, + "grad_norm": 69.26862803180552, + "learning_rate": 1.409246014678517e-06, + "loss": 2.3013, + "step": 27439 + }, + { + "epoch": 2.3387028040569335, + "grad_norm": 68.61586946845213, + "learning_rate": 1.4089009780250145e-06, + "loss": 1.768, + "step": 27440 + }, + { + "epoch": 2.338788033750959, + "grad_norm": 25.33290575357443, + "learning_rate": 1.4085559766885454e-06, + "loss": 0.7806, + "step": 27441 + }, + { + "epoch": 2.3388732634449845, + "grad_norm": 57.438847093094346, + "learning_rate": 1.408211010672501e-06, + "loss": 0.9993, + "step": 27442 + }, + { + "epoch": 2.3389584931390095, + "grad_norm": 42.43854772584496, + "learning_rate": 1.407866079980273e-06, + "loss": 1.5594, + "step": 27443 + }, + { + "epoch": 2.339043722833035, + "grad_norm": 46.79871931697509, + "learning_rate": 1.4075211846152543e-06, + "loss": 1.2596, + "step": 27444 + }, + { + "epoch": 2.3391289525270604, + "grad_norm": 59.01691969288222, + "learning_rate": 1.407176324580839e-06, + "loss": 2.1535, + "step": 27445 + }, + { + "epoch": 2.339214182221086, + "grad_norm": 76.61404907382109, + "learning_rate": 1.4068314998804167e-06, + "loss": 1.2121, + "step": 27446 + }, + { + "epoch": 2.3392994119151114, + "grad_norm": 51.20264096205687, + "learning_rate": 1.406486710517379e-06, + "loss": 1.4406, + "step": 27447 + }, + { + "epoch": 2.3393846416091364, + "grad_norm": 60.406807477674924, + "learning_rate": 1.406141956495115e-06, + "loss": 1.6177, + "step": 27448 + }, + { + "epoch": 2.339469871303162, + "grad_norm": 76.91803100086555, + "learning_rate": 1.4057972378170181e-06, + "loss": 2.5539, + "step": 27449 + }, + { + "epoch": 2.3395551009971873, + "grad_norm": 39.92254072121798, + "learning_rate": 1.4054525544864773e-06, + "loss": 0.856, + "step": 27450 + }, + { + "epoch": 2.339640330691213, + "grad_norm": 77.56434021322993, + "learning_rate": 1.4051079065068812e-06, + "loss": 1.8175, + "step": 27451 + }, + { + "epoch": 2.3397255603852383, + "grad_norm": 50.09704225907528, + "learning_rate": 1.4047632938816197e-06, + "loss": 0.9034, + "step": 27452 + }, + { + "epoch": 2.3398107900792637, + "grad_norm": 91.56629540406986, + "learning_rate": 1.4044187166140842e-06, + "loss": 2.4779, + "step": 27453 + }, + { + "epoch": 2.339896019773289, + "grad_norm": 64.0409022314896, + "learning_rate": 1.4040741747076625e-06, + "loss": 1.9877, + "step": 27454 + }, + { + "epoch": 2.3399812494673142, + "grad_norm": 28.400779107847896, + "learning_rate": 1.403729668165742e-06, + "loss": 1.0331, + "step": 27455 + }, + { + "epoch": 2.3400664791613397, + "grad_norm": 105.78327566453942, + "learning_rate": 1.4033851969917111e-06, + "loss": 2.1005, + "step": 27456 + }, + { + "epoch": 2.340151708855365, + "grad_norm": 32.15418527443075, + "learning_rate": 1.4030407611889562e-06, + "loss": 0.8916, + "step": 27457 + }, + { + "epoch": 2.3402369385493906, + "grad_norm": 37.0127487708072, + "learning_rate": 1.402696360760868e-06, + "loss": 1.5066, + "step": 27458 + }, + { + "epoch": 2.340322168243416, + "grad_norm": 67.88330191086958, + "learning_rate": 1.4023519957108296e-06, + "loss": 1.9864, + "step": 27459 + }, + { + "epoch": 2.3404073979374416, + "grad_norm": 33.74291763820681, + "learning_rate": 1.4020076660422315e-06, + "loss": 1.5712, + "step": 27460 + }, + { + "epoch": 2.340492627631467, + "grad_norm": 53.92457541901147, + "learning_rate": 1.4016633717584567e-06, + "loss": 1.553, + "step": 27461 + }, + { + "epoch": 2.340577857325492, + "grad_norm": 30.38493211021199, + "learning_rate": 1.4013191128628944e-06, + "loss": 1.1486, + "step": 27462 + }, + { + "epoch": 2.3406630870195175, + "grad_norm": 38.29871461102679, + "learning_rate": 1.4009748893589286e-06, + "loss": 0.8266, + "step": 27463 + }, + { + "epoch": 2.340748316713543, + "grad_norm": 49.99969808515878, + "learning_rate": 1.4006307012499449e-06, + "loss": 1.5704, + "step": 27464 + }, + { + "epoch": 2.3408335464075685, + "grad_norm": 68.83366652011699, + "learning_rate": 1.4002865485393259e-06, + "loss": 1.8893, + "step": 27465 + }, + { + "epoch": 2.340918776101594, + "grad_norm": 43.82035390150996, + "learning_rate": 1.3999424312304605e-06, + "loss": 1.6151, + "step": 27466 + }, + { + "epoch": 2.341004005795619, + "grad_norm": 26.81457663977135, + "learning_rate": 1.399598349326729e-06, + "loss": 0.8029, + "step": 27467 + }, + { + "epoch": 2.3410892354896444, + "grad_norm": 28.092460955177827, + "learning_rate": 1.3992543028315182e-06, + "loss": 0.8737, + "step": 27468 + }, + { + "epoch": 2.34117446518367, + "grad_norm": 36.97666509663934, + "learning_rate": 1.3989102917482111e-06, + "loss": 1.0291, + "step": 27469 + }, + { + "epoch": 2.3412596948776954, + "grad_norm": 85.85722379469912, + "learning_rate": 1.3985663160801882e-06, + "loss": 2.3008, + "step": 27470 + }, + { + "epoch": 2.341344924571721, + "grad_norm": 76.88628688641741, + "learning_rate": 1.3982223758308366e-06, + "loss": 0.8259, + "step": 27471 + }, + { + "epoch": 2.3414301542657463, + "grad_norm": 35.577870592606196, + "learning_rate": 1.3978784710035365e-06, + "loss": 0.7769, + "step": 27472 + }, + { + "epoch": 2.3415153839597718, + "grad_norm": 20.163279612708422, + "learning_rate": 1.3975346016016706e-06, + "loss": 0.3672, + "step": 27473 + }, + { + "epoch": 2.341600613653797, + "grad_norm": 33.50338974837498, + "learning_rate": 1.3971907676286183e-06, + "loss": 1.6938, + "step": 27474 + }, + { + "epoch": 2.3416858433478223, + "grad_norm": 75.58719631428046, + "learning_rate": 1.396846969087764e-06, + "loss": 1.3451, + "step": 27475 + }, + { + "epoch": 2.3417710730418477, + "grad_norm": 39.85134202955361, + "learning_rate": 1.396503205982489e-06, + "loss": 0.942, + "step": 27476 + }, + { + "epoch": 2.341856302735873, + "grad_norm": 19.75053482854195, + "learning_rate": 1.396159478316173e-06, + "loss": 0.7988, + "step": 27477 + }, + { + "epoch": 2.3419415324298987, + "grad_norm": 32.17584732429772, + "learning_rate": 1.395815786092195e-06, + "loss": 1.0212, + "step": 27478 + }, + { + "epoch": 2.342026762123924, + "grad_norm": 32.25429988940271, + "learning_rate": 1.3954721293139384e-06, + "loss": 1.1196, + "step": 27479 + }, + { + "epoch": 2.3421119918179496, + "grad_norm": 70.53804632376999, + "learning_rate": 1.395128507984781e-06, + "loss": 1.7843, + "step": 27480 + }, + { + "epoch": 2.3421972215119746, + "grad_norm": 31.081884212189618, + "learning_rate": 1.3947849221081023e-06, + "loss": 1.7475, + "step": 27481 + }, + { + "epoch": 2.342282451206, + "grad_norm": 22.452024923456964, + "learning_rate": 1.3944413716872796e-06, + "loss": 0.5271, + "step": 27482 + }, + { + "epoch": 2.3423676809000256, + "grad_norm": 103.54529751888383, + "learning_rate": 1.3940978567256935e-06, + "loss": 1.6091, + "step": 27483 + }, + { + "epoch": 2.342452910594051, + "grad_norm": 58.290611417183335, + "learning_rate": 1.3937543772267237e-06, + "loss": 1.5353, + "step": 27484 + }, + { + "epoch": 2.3425381402880765, + "grad_norm": 64.2099572469441, + "learning_rate": 1.3934109331937461e-06, + "loss": 1.8279, + "step": 27485 + }, + { + "epoch": 2.3426233699821015, + "grad_norm": 53.98506593201878, + "learning_rate": 1.3930675246301396e-06, + "loss": 1.9232, + "step": 27486 + }, + { + "epoch": 2.342708599676127, + "grad_norm": 46.27058317980084, + "learning_rate": 1.3927241515392786e-06, + "loss": 1.346, + "step": 27487 + }, + { + "epoch": 2.3427938293701525, + "grad_norm": 54.258660708942465, + "learning_rate": 1.3923808139245443e-06, + "loss": 1.3057, + "step": 27488 + }, + { + "epoch": 2.342879059064178, + "grad_norm": 20.927232961811207, + "learning_rate": 1.3920375117893108e-06, + "loss": 0.9178, + "step": 27489 + }, + { + "epoch": 2.3429642887582034, + "grad_norm": 84.97215858521965, + "learning_rate": 1.391694245136953e-06, + "loss": 2.2793, + "step": 27490 + }, + { + "epoch": 2.343049518452229, + "grad_norm": 65.36270834861091, + "learning_rate": 1.3913510139708486e-06, + "loss": 1.6615, + "step": 27491 + }, + { + "epoch": 2.3431347481462543, + "grad_norm": 67.43408254026305, + "learning_rate": 1.3910078182943747e-06, + "loss": 1.4492, + "step": 27492 + }, + { + "epoch": 2.3432199778402794, + "grad_norm": 39.219014691918574, + "learning_rate": 1.3906646581109052e-06, + "loss": 1.2915, + "step": 27493 + }, + { + "epoch": 2.343305207534305, + "grad_norm": 54.54506183245923, + "learning_rate": 1.3903215334238141e-06, + "loss": 1.8259, + "step": 27494 + }, + { + "epoch": 2.3433904372283303, + "grad_norm": 58.42505825448209, + "learning_rate": 1.389978444236475e-06, + "loss": 1.4185, + "step": 27495 + }, + { + "epoch": 2.3434756669223558, + "grad_norm": 48.08168322778562, + "learning_rate": 1.3896353905522647e-06, + "loss": 1.679, + "step": 27496 + }, + { + "epoch": 2.3435608966163812, + "grad_norm": 71.77592314443355, + "learning_rate": 1.3892923723745561e-06, + "loss": 1.9617, + "step": 27497 + }, + { + "epoch": 2.3436461263104067, + "grad_norm": 36.34707587154283, + "learning_rate": 1.3889493897067203e-06, + "loss": 1.3476, + "step": 27498 + }, + { + "epoch": 2.343731356004432, + "grad_norm": 55.32371737277351, + "learning_rate": 1.3886064425521344e-06, + "loss": 1.4002, + "step": 27499 + }, + { + "epoch": 2.343816585698457, + "grad_norm": 79.71366550921663, + "learning_rate": 1.3882635309141673e-06, + "loss": 2.0696, + "step": 27500 + }, + { + "epoch": 2.3439018153924827, + "grad_norm": 43.25630979852157, + "learning_rate": 1.387920654796195e-06, + "loss": 1.5963, + "step": 27501 + }, + { + "epoch": 2.343987045086508, + "grad_norm": 48.894191621379, + "learning_rate": 1.3875778142015871e-06, + "loss": 1.957, + "step": 27502 + }, + { + "epoch": 2.3440722747805336, + "grad_norm": 41.83228834458912, + "learning_rate": 1.3872350091337167e-06, + "loss": 1.2845, + "step": 27503 + }, + { + "epoch": 2.344157504474559, + "grad_norm": 54.2466976797454, + "learning_rate": 1.3868922395959522e-06, + "loss": 2.1355, + "step": 27504 + }, + { + "epoch": 2.344242734168584, + "grad_norm": 45.26145173699761, + "learning_rate": 1.386549505591669e-06, + "loss": 1.3626, + "step": 27505 + }, + { + "epoch": 2.3443279638626096, + "grad_norm": 53.26853308645736, + "learning_rate": 1.3862068071242335e-06, + "loss": 1.903, + "step": 27506 + }, + { + "epoch": 2.344413193556635, + "grad_norm": 59.162937156877966, + "learning_rate": 1.38586414419702e-06, + "loss": 2.109, + "step": 27507 + }, + { + "epoch": 2.3444984232506605, + "grad_norm": 69.34133021273367, + "learning_rate": 1.3855215168133952e-06, + "loss": 1.7989, + "step": 27508 + }, + { + "epoch": 2.344583652944686, + "grad_norm": 43.92509958144751, + "learning_rate": 1.3851789249767311e-06, + "loss": 1.124, + "step": 27509 + }, + { + "epoch": 2.3446688826387114, + "grad_norm": 91.76799800567022, + "learning_rate": 1.3848363686903959e-06, + "loss": 2.3605, + "step": 27510 + }, + { + "epoch": 2.344754112332737, + "grad_norm": 62.37847775323923, + "learning_rate": 1.3844938479577586e-06, + "loss": 1.6547, + "step": 27511 + }, + { + "epoch": 2.344839342026762, + "grad_norm": 38.623611872922886, + "learning_rate": 1.3841513627821862e-06, + "loss": 1.0544, + "step": 27512 + }, + { + "epoch": 2.3449245717207874, + "grad_norm": 65.07611857704501, + "learning_rate": 1.3838089131670495e-06, + "loss": 1.8098, + "step": 27513 + }, + { + "epoch": 2.345009801414813, + "grad_norm": 82.62623050309861, + "learning_rate": 1.3834664991157137e-06, + "loss": 2.2495, + "step": 27514 + }, + { + "epoch": 2.3450950311088383, + "grad_norm": 30.484332809035696, + "learning_rate": 1.3831241206315499e-06, + "loss": 1.2529, + "step": 27515 + }, + { + "epoch": 2.345180260802864, + "grad_norm": 31.324252210794107, + "learning_rate": 1.3827817777179225e-06, + "loss": 1.0744, + "step": 27516 + }, + { + "epoch": 2.3452654904968893, + "grad_norm": 65.93626107777531, + "learning_rate": 1.382439470378198e-06, + "loss": 1.3256, + "step": 27517 + }, + { + "epoch": 2.3453507201909147, + "grad_norm": 58.48795647081699, + "learning_rate": 1.3820971986157456e-06, + "loss": 1.6623, + "step": 27518 + }, + { + "epoch": 2.3454359498849398, + "grad_norm": 42.82923211867959, + "learning_rate": 1.3817549624339288e-06, + "loss": 1.6693, + "step": 27519 + }, + { + "epoch": 2.345521179578965, + "grad_norm": 66.37525055460107, + "learning_rate": 1.3814127618361145e-06, + "loss": 2.1108, + "step": 27520 + }, + { + "epoch": 2.3456064092729907, + "grad_norm": 35.72996139502261, + "learning_rate": 1.3810705968256666e-06, + "loss": 0.9226, + "step": 27521 + }, + { + "epoch": 2.345691638967016, + "grad_norm": 45.77841092450789, + "learning_rate": 1.3807284674059511e-06, + "loss": 1.1558, + "step": 27522 + }, + { + "epoch": 2.3457768686610416, + "grad_norm": 40.20829588477493, + "learning_rate": 1.3803863735803346e-06, + "loss": 1.3947, + "step": 27523 + }, + { + "epoch": 2.345862098355067, + "grad_norm": 53.05134193131102, + "learning_rate": 1.3800443153521798e-06, + "loss": 1.9267, + "step": 27524 + }, + { + "epoch": 2.345947328049092, + "grad_norm": 52.11554025189761, + "learning_rate": 1.3797022927248488e-06, + "loss": 1.8357, + "step": 27525 + }, + { + "epoch": 2.3460325577431176, + "grad_norm": 44.58396695982403, + "learning_rate": 1.3793603057017092e-06, + "loss": 1.1275, + "step": 27526 + }, + { + "epoch": 2.346117787437143, + "grad_norm": 63.21967441039828, + "learning_rate": 1.3790183542861224e-06, + "loss": 1.3197, + "step": 27527 + }, + { + "epoch": 2.3462030171311685, + "grad_norm": 65.9499688845419, + "learning_rate": 1.378676438481451e-06, + "loss": 1.9274, + "step": 27528 + }, + { + "epoch": 2.346288246825194, + "grad_norm": 58.33089886792467, + "learning_rate": 1.3783345582910562e-06, + "loss": 1.7981, + "step": 27529 + }, + { + "epoch": 2.3463734765192195, + "grad_norm": 30.380021838361387, + "learning_rate": 1.3779927137183018e-06, + "loss": 1.1956, + "step": 27530 + }, + { + "epoch": 2.3464587062132445, + "grad_norm": 59.477427357092374, + "learning_rate": 1.3776509047665514e-06, + "loss": 1.2644, + "step": 27531 + }, + { + "epoch": 2.34654393590727, + "grad_norm": 54.83420357148032, + "learning_rate": 1.3773091314391657e-06, + "loss": 1.8697, + "step": 27532 + }, + { + "epoch": 2.3466291656012954, + "grad_norm": 52.65905040259211, + "learning_rate": 1.3769673937395045e-06, + "loss": 1.4407, + "step": 27533 + }, + { + "epoch": 2.346714395295321, + "grad_norm": 118.9795593142558, + "learning_rate": 1.376625691670928e-06, + "loss": 3.8811, + "step": 27534 + }, + { + "epoch": 2.3467996249893464, + "grad_norm": 28.714266013589103, + "learning_rate": 1.3762840252367993e-06, + "loss": 1.8173, + "step": 27535 + }, + { + "epoch": 2.346884854683372, + "grad_norm": 50.6523158719199, + "learning_rate": 1.3759423944404775e-06, + "loss": 1.4693, + "step": 27536 + }, + { + "epoch": 2.3469700843773973, + "grad_norm": 66.37188773648033, + "learning_rate": 1.375600799285321e-06, + "loss": 2.2499, + "step": 27537 + }, + { + "epoch": 2.3470553140714223, + "grad_norm": 70.56472125932248, + "learning_rate": 1.37525923977469e-06, + "loss": 1.6453, + "step": 27538 + }, + { + "epoch": 2.347140543765448, + "grad_norm": 77.13869273888272, + "learning_rate": 1.3749177159119453e-06, + "loss": 1.4338, + "step": 27539 + }, + { + "epoch": 2.3472257734594733, + "grad_norm": 53.45594230231169, + "learning_rate": 1.3745762277004448e-06, + "loss": 1.5231, + "step": 27540 + }, + { + "epoch": 2.3473110031534987, + "grad_norm": 31.02334793264551, + "learning_rate": 1.3742347751435464e-06, + "loss": 1.1073, + "step": 27541 + }, + { + "epoch": 2.347396232847524, + "grad_norm": 36.20222005966701, + "learning_rate": 1.3738933582446084e-06, + "loss": 0.8707, + "step": 27542 + }, + { + "epoch": 2.3474814625415497, + "grad_norm": 51.082661483072386, + "learning_rate": 1.3735519770069866e-06, + "loss": 1.3706, + "step": 27543 + }, + { + "epoch": 2.3475666922355747, + "grad_norm": 52.26108353824163, + "learning_rate": 1.373210631434042e-06, + "loss": 1.444, + "step": 27544 + }, + { + "epoch": 2.3476519219296, + "grad_norm": 64.33370228152818, + "learning_rate": 1.3728693215291277e-06, + "loss": 1.5361, + "step": 27545 + }, + { + "epoch": 2.3477371516236256, + "grad_norm": 37.70366658511397, + "learning_rate": 1.3725280472956043e-06, + "loss": 1.5913, + "step": 27546 + }, + { + "epoch": 2.347822381317651, + "grad_norm": 27.833797915582775, + "learning_rate": 1.3721868087368246e-06, + "loss": 0.8389, + "step": 27547 + }, + { + "epoch": 2.3479076110116766, + "grad_norm": 77.26372751541103, + "learning_rate": 1.3718456058561475e-06, + "loss": 2.0641, + "step": 27548 + }, + { + "epoch": 2.347992840705702, + "grad_norm": 61.802348097762334, + "learning_rate": 1.3715044386569271e-06, + "loss": 1.3251, + "step": 27549 + }, + { + "epoch": 2.348078070399727, + "grad_norm": 71.31837883815966, + "learning_rate": 1.3711633071425185e-06, + "loss": 1.9758, + "step": 27550 + }, + { + "epoch": 2.3481633000937525, + "grad_norm": 39.99190278120159, + "learning_rate": 1.3708222113162756e-06, + "loss": 0.8903, + "step": 27551 + }, + { + "epoch": 2.348248529787778, + "grad_norm": 34.590590792549754, + "learning_rate": 1.3704811511815551e-06, + "loss": 1.1499, + "step": 27552 + }, + { + "epoch": 2.3483337594818035, + "grad_norm": 39.29779823481186, + "learning_rate": 1.3701401267417098e-06, + "loss": 0.6322, + "step": 27553 + }, + { + "epoch": 2.348418989175829, + "grad_norm": 19.324451094439755, + "learning_rate": 1.3697991380000947e-06, + "loss": 0.6021, + "step": 27554 + }, + { + "epoch": 2.3485042188698544, + "grad_norm": 32.558975697357134, + "learning_rate": 1.369458184960063e-06, + "loss": 1.1107, + "step": 27555 + }, + { + "epoch": 2.34858944856388, + "grad_norm": 50.26484589676753, + "learning_rate": 1.3691172676249653e-06, + "loss": 0.9827, + "step": 27556 + }, + { + "epoch": 2.348674678257905, + "grad_norm": 47.23829546670744, + "learning_rate": 1.3687763859981584e-06, + "loss": 1.6722, + "step": 27557 + }, + { + "epoch": 2.3487599079519303, + "grad_norm": 33.616965688269495, + "learning_rate": 1.368435540082993e-06, + "loss": 0.86, + "step": 27558 + }, + { + "epoch": 2.348845137645956, + "grad_norm": 43.55149251587177, + "learning_rate": 1.3680947298828196e-06, + "loss": 1.1272, + "step": 27559 + }, + { + "epoch": 2.3489303673399813, + "grad_norm": 46.412023585847564, + "learning_rate": 1.3677539554009928e-06, + "loss": 2.1356, + "step": 27560 + }, + { + "epoch": 2.3490155970340068, + "grad_norm": 67.81096792784278, + "learning_rate": 1.3674132166408604e-06, + "loss": 1.7596, + "step": 27561 + }, + { + "epoch": 2.349100826728032, + "grad_norm": 113.9214632386359, + "learning_rate": 1.3670725136057778e-06, + "loss": 2.2404, + "step": 27562 + }, + { + "epoch": 2.3491860564220577, + "grad_norm": 40.25101613347283, + "learning_rate": 1.3667318462990931e-06, + "loss": 1.3598, + "step": 27563 + }, + { + "epoch": 2.3492712861160827, + "grad_norm": 44.37473006884272, + "learning_rate": 1.3663912147241554e-06, + "loss": 1.0726, + "step": 27564 + }, + { + "epoch": 2.349356515810108, + "grad_norm": 46.8811566090169, + "learning_rate": 1.366050618884318e-06, + "loss": 0.9709, + "step": 27565 + }, + { + "epoch": 2.3494417455041336, + "grad_norm": 34.953001822286005, + "learning_rate": 1.3657100587829286e-06, + "loss": 1.1969, + "step": 27566 + }, + { + "epoch": 2.349526975198159, + "grad_norm": 51.67814597493437, + "learning_rate": 1.365369534423337e-06, + "loss": 1.2475, + "step": 27567 + }, + { + "epoch": 2.3496122048921846, + "grad_norm": 20.05644074126782, + "learning_rate": 1.3650290458088899e-06, + "loss": 0.9452, + "step": 27568 + }, + { + "epoch": 2.3496974345862096, + "grad_norm": 34.15004622330805, + "learning_rate": 1.3646885929429376e-06, + "loss": 1.5534, + "step": 27569 + }, + { + "epoch": 2.349782664280235, + "grad_norm": 28.89721105724803, + "learning_rate": 1.3643481758288302e-06, + "loss": 0.7923, + "step": 27570 + }, + { + "epoch": 2.3498678939742605, + "grad_norm": 72.54492868775041, + "learning_rate": 1.3640077944699136e-06, + "loss": 1.4823, + "step": 27571 + }, + { + "epoch": 2.349953123668286, + "grad_norm": 36.45982503967544, + "learning_rate": 1.3636674488695361e-06, + "loss": 1.4839, + "step": 27572 + }, + { + "epoch": 2.3500383533623115, + "grad_norm": 27.98307471515364, + "learning_rate": 1.3633271390310427e-06, + "loss": 0.824, + "step": 27573 + }, + { + "epoch": 2.350123583056337, + "grad_norm": 61.52109183040812, + "learning_rate": 1.3629868649577826e-06, + "loss": 1.7471, + "step": 27574 + }, + { + "epoch": 2.3502088127503624, + "grad_norm": 72.28389164505069, + "learning_rate": 1.3626466266531018e-06, + "loss": 1.8433, + "step": 27575 + }, + { + "epoch": 2.3502940424443874, + "grad_norm": 40.356919593181274, + "learning_rate": 1.3623064241203448e-06, + "loss": 1.143, + "step": 27576 + }, + { + "epoch": 2.350379272138413, + "grad_norm": 49.18254188078161, + "learning_rate": 1.361966257362859e-06, + "loss": 1.1421, + "step": 27577 + }, + { + "epoch": 2.3504645018324384, + "grad_norm": 59.46001519723488, + "learning_rate": 1.3616261263839902e-06, + "loss": 1.3048, + "step": 27578 + }, + { + "epoch": 2.350549731526464, + "grad_norm": 131.731745979713, + "learning_rate": 1.361286031187083e-06, + "loss": 1.9884, + "step": 27579 + }, + { + "epoch": 2.3506349612204893, + "grad_norm": 39.77792857345804, + "learning_rate": 1.360945971775482e-06, + "loss": 0.748, + "step": 27580 + }, + { + "epoch": 2.350720190914515, + "grad_norm": 73.54690243649732, + "learning_rate": 1.3606059481525296e-06, + "loss": 2.0069, + "step": 27581 + }, + { + "epoch": 2.3508054206085403, + "grad_norm": 38.09483017827502, + "learning_rate": 1.3602659603215728e-06, + "loss": 1.3273, + "step": 27582 + }, + { + "epoch": 2.3508906503025653, + "grad_norm": 47.24510968944412, + "learning_rate": 1.3599260082859545e-06, + "loss": 1.0464, + "step": 27583 + }, + { + "epoch": 2.3509758799965907, + "grad_norm": 58.52682490934983, + "learning_rate": 1.3595860920490161e-06, + "loss": 1.5719, + "step": 27584 + }, + { + "epoch": 2.351061109690616, + "grad_norm": 61.126623083079664, + "learning_rate": 1.359246211614103e-06, + "loss": 1.1111, + "step": 27585 + }, + { + "epoch": 2.3511463393846417, + "grad_norm": 68.12958050767715, + "learning_rate": 1.3589063669845553e-06, + "loss": 2.0675, + "step": 27586 + }, + { + "epoch": 2.351231569078667, + "grad_norm": 51.20312756749577, + "learning_rate": 1.3585665581637182e-06, + "loss": 1.3404, + "step": 27587 + }, + { + "epoch": 2.351316798772692, + "grad_norm": 48.929151121734655, + "learning_rate": 1.358226785154932e-06, + "loss": 1.2118, + "step": 27588 + }, + { + "epoch": 2.3514020284667176, + "grad_norm": 75.16861996870344, + "learning_rate": 1.3578870479615381e-06, + "loss": 1.593, + "step": 27589 + }, + { + "epoch": 2.351487258160743, + "grad_norm": 28.82639248165942, + "learning_rate": 1.3575473465868767e-06, + "loss": 0.9667, + "step": 27590 + }, + { + "epoch": 2.3515724878547686, + "grad_norm": 72.0046414015199, + "learning_rate": 1.357207681034291e-06, + "loss": 1.6995, + "step": 27591 + }, + { + "epoch": 2.351657717548794, + "grad_norm": 52.54133107826491, + "learning_rate": 1.3568680513071192e-06, + "loss": 1.2363, + "step": 27592 + }, + { + "epoch": 2.3517429472428195, + "grad_norm": 35.44630337190262, + "learning_rate": 1.3565284574087034e-06, + "loss": 0.8663, + "step": 27593 + }, + { + "epoch": 2.351828176936845, + "grad_norm": 20.644475899904574, + "learning_rate": 1.356188899342381e-06, + "loss": 0.6486, + "step": 27594 + }, + { + "epoch": 2.35191340663087, + "grad_norm": 73.3467676575511, + "learning_rate": 1.3558493771114944e-06, + "loss": 1.8554, + "step": 27595 + }, + { + "epoch": 2.3519986363248955, + "grad_norm": 37.120362318739154, + "learning_rate": 1.3555098907193813e-06, + "loss": 1.0459, + "step": 27596 + }, + { + "epoch": 2.352083866018921, + "grad_norm": 34.34662064673864, + "learning_rate": 1.3551704401693804e-06, + "loss": 0.8553, + "step": 27597 + }, + { + "epoch": 2.3521690957129464, + "grad_norm": 78.96903603248948, + "learning_rate": 1.3548310254648283e-06, + "loss": 1.9554, + "step": 27598 + }, + { + "epoch": 2.352254325406972, + "grad_norm": 53.95187887909747, + "learning_rate": 1.3544916466090663e-06, + "loss": 1.0938, + "step": 27599 + }, + { + "epoch": 2.3523395551009973, + "grad_norm": 20.76446715149203, + "learning_rate": 1.3541523036054282e-06, + "loss": 0.821, + "step": 27600 + }, + { + "epoch": 2.352424784795023, + "grad_norm": 68.30069686015223, + "learning_rate": 1.3538129964572555e-06, + "loss": 1.4067, + "step": 27601 + }, + { + "epoch": 2.352510014489048, + "grad_norm": 52.06781220005951, + "learning_rate": 1.3534737251678831e-06, + "loss": 1.4311, + "step": 27602 + }, + { + "epoch": 2.3525952441830733, + "grad_norm": 28.878745068101978, + "learning_rate": 1.3531344897406461e-06, + "loss": 1.151, + "step": 27603 + }, + { + "epoch": 2.3526804738770988, + "grad_norm": 34.620173745570604, + "learning_rate": 1.3527952901788837e-06, + "loss": 0.8944, + "step": 27604 + }, + { + "epoch": 2.3527657035711242, + "grad_norm": 46.07768084986298, + "learning_rate": 1.3524561264859304e-06, + "loss": 1.6004, + "step": 27605 + }, + { + "epoch": 2.3528509332651497, + "grad_norm": 65.61703150210225, + "learning_rate": 1.3521169986651205e-06, + "loss": 1.7257, + "step": 27606 + }, + { + "epoch": 2.3529361629591747, + "grad_norm": 56.14956877755452, + "learning_rate": 1.351777906719791e-06, + "loss": 1.8486, + "step": 27607 + }, + { + "epoch": 2.3530213926532, + "grad_norm": 69.66875240704108, + "learning_rate": 1.3514388506532754e-06, + "loss": 1.8686, + "step": 27608 + }, + { + "epoch": 2.3531066223472257, + "grad_norm": 43.18447630330366, + "learning_rate": 1.3510998304689105e-06, + "loss": 0.97, + "step": 27609 + }, + { + "epoch": 2.353191852041251, + "grad_norm": 37.55321817494731, + "learning_rate": 1.3507608461700284e-06, + "loss": 0.9307, + "step": 27610 + }, + { + "epoch": 2.3532770817352766, + "grad_norm": 49.15271037314206, + "learning_rate": 1.350421897759962e-06, + "loss": 1.3208, + "step": 27611 + }, + { + "epoch": 2.353362311429302, + "grad_norm": 51.89957738094038, + "learning_rate": 1.3500829852420477e-06, + "loss": 1.489, + "step": 27612 + }, + { + "epoch": 2.3534475411233275, + "grad_norm": 55.76274606670708, + "learning_rate": 1.3497441086196168e-06, + "loss": 1.65, + "step": 27613 + }, + { + "epoch": 2.3535327708173526, + "grad_norm": 28.31889287302518, + "learning_rate": 1.3494052678960017e-06, + "loss": 1.0353, + "step": 27614 + }, + { + "epoch": 2.353618000511378, + "grad_norm": 40.525154264756885, + "learning_rate": 1.3490664630745342e-06, + "loss": 1.0464, + "step": 27615 + }, + { + "epoch": 2.3537032302054035, + "grad_norm": 32.390388131731484, + "learning_rate": 1.348727694158547e-06, + "loss": 1.3945, + "step": 27616 + }, + { + "epoch": 2.353788459899429, + "grad_norm": 78.81900628313919, + "learning_rate": 1.348388961151374e-06, + "loss": 1.6602, + "step": 27617 + }, + { + "epoch": 2.3538736895934544, + "grad_norm": 34.06194724465661, + "learning_rate": 1.3480502640563443e-06, + "loss": 1.0305, + "step": 27618 + }, + { + "epoch": 2.35395891928748, + "grad_norm": 60.84871555552658, + "learning_rate": 1.347711602876789e-06, + "loss": 1.193, + "step": 27619 + }, + { + "epoch": 2.3540441489815054, + "grad_norm": 37.03199446675585, + "learning_rate": 1.3473729776160372e-06, + "loss": 1.3193, + "step": 27620 + }, + { + "epoch": 2.3541293786755304, + "grad_norm": 21.489829890888725, + "learning_rate": 1.3470343882774223e-06, + "loss": 0.6888, + "step": 27621 + }, + { + "epoch": 2.354214608369556, + "grad_norm": 74.68163828187751, + "learning_rate": 1.3466958348642727e-06, + "loss": 2.2169, + "step": 27622 + }, + { + "epoch": 2.3542998380635813, + "grad_norm": 58.45294007534151, + "learning_rate": 1.3463573173799165e-06, + "loss": 1.4042, + "step": 27623 + }, + { + "epoch": 2.354385067757607, + "grad_norm": 77.7499372812404, + "learning_rate": 1.3460188358276843e-06, + "loss": 1.4671, + "step": 27624 + }, + { + "epoch": 2.3544702974516323, + "grad_norm": 82.34536210834408, + "learning_rate": 1.3456803902109062e-06, + "loss": 2.421, + "step": 27625 + }, + { + "epoch": 2.3545555271456573, + "grad_norm": 59.21059761820008, + "learning_rate": 1.3453419805329094e-06, + "loss": 1.626, + "step": 27626 + }, + { + "epoch": 2.3546407568396828, + "grad_norm": 46.62739720692477, + "learning_rate": 1.3450036067970224e-06, + "loss": 1.5003, + "step": 27627 + }, + { + "epoch": 2.3547259865337082, + "grad_norm": 57.203704004947866, + "learning_rate": 1.344665269006572e-06, + "loss": 1.9111, + "step": 27628 + }, + { + "epoch": 2.3548112162277337, + "grad_norm": 33.205893416671636, + "learning_rate": 1.344326967164885e-06, + "loss": 1.0283, + "step": 27629 + }, + { + "epoch": 2.354896445921759, + "grad_norm": 73.72200970941635, + "learning_rate": 1.3439887012752912e-06, + "loss": 1.2626, + "step": 27630 + }, + { + "epoch": 2.3549816756157846, + "grad_norm": 72.43618128965392, + "learning_rate": 1.343650471341114e-06, + "loss": 1.2468, + "step": 27631 + }, + { + "epoch": 2.35506690530981, + "grad_norm": 69.52858179902137, + "learning_rate": 1.343312277365683e-06, + "loss": 2.1232, + "step": 27632 + }, + { + "epoch": 2.355152135003835, + "grad_norm": 34.39583132999057, + "learning_rate": 1.3429741193523216e-06, + "loss": 1.1887, + "step": 27633 + }, + { + "epoch": 2.3552373646978606, + "grad_norm": 24.142663618888893, + "learning_rate": 1.3426359973043579e-06, + "loss": 0.6731, + "step": 27634 + }, + { + "epoch": 2.355322594391886, + "grad_norm": 38.56338315927223, + "learning_rate": 1.3422979112251161e-06, + "loss": 0.9221, + "step": 27635 + }, + { + "epoch": 2.3554078240859115, + "grad_norm": 51.27659176526343, + "learning_rate": 1.3419598611179208e-06, + "loss": 1.9521, + "step": 27636 + }, + { + "epoch": 2.355493053779937, + "grad_norm": 79.61917768098155, + "learning_rate": 1.3416218469860948e-06, + "loss": 1.7177, + "step": 27637 + }, + { + "epoch": 2.3555782834739625, + "grad_norm": 73.30305310207861, + "learning_rate": 1.341283868832966e-06, + "loss": 1.7075, + "step": 27638 + }, + { + "epoch": 2.355663513167988, + "grad_norm": 37.88223483950921, + "learning_rate": 1.3409459266618551e-06, + "loss": 0.8301, + "step": 27639 + }, + { + "epoch": 2.355748742862013, + "grad_norm": 53.68067621437229, + "learning_rate": 1.3406080204760884e-06, + "loss": 1.2502, + "step": 27640 + }, + { + "epoch": 2.3558339725560384, + "grad_norm": 26.32291180490578, + "learning_rate": 1.3402701502789866e-06, + "loss": 0.8886, + "step": 27641 + }, + { + "epoch": 2.355919202250064, + "grad_norm": 60.661752319761945, + "learning_rate": 1.3399323160738748e-06, + "loss": 1.0641, + "step": 27642 + }, + { + "epoch": 2.3560044319440894, + "grad_norm": 44.64073277421007, + "learning_rate": 1.3395945178640746e-06, + "loss": 1.441, + "step": 27643 + }, + { + "epoch": 2.356089661638115, + "grad_norm": 62.53342288832279, + "learning_rate": 1.3392567556529074e-06, + "loss": 1.3037, + "step": 27644 + }, + { + "epoch": 2.3561748913321403, + "grad_norm": 61.32980308321803, + "learning_rate": 1.3389190294436938e-06, + "loss": 1.1674, + "step": 27645 + }, + { + "epoch": 2.3562601210261653, + "grad_norm": 50.40850700246204, + "learning_rate": 1.3385813392397583e-06, + "loss": 1.7567, + "step": 27646 + }, + { + "epoch": 2.356345350720191, + "grad_norm": 73.31910325850929, + "learning_rate": 1.3382436850444198e-06, + "loss": 2.242, + "step": 27647 + }, + { + "epoch": 2.3564305804142163, + "grad_norm": 73.87369332856315, + "learning_rate": 1.337906066861e-06, + "loss": 1.7261, + "step": 27648 + }, + { + "epoch": 2.3565158101082417, + "grad_norm": 47.48875146994554, + "learning_rate": 1.3375684846928193e-06, + "loss": 1.063, + "step": 27649 + }, + { + "epoch": 2.356601039802267, + "grad_norm": 64.19220699899466, + "learning_rate": 1.337230938543196e-06, + "loss": 2.1087, + "step": 27650 + }, + { + "epoch": 2.3566862694962927, + "grad_norm": 43.415101368433376, + "learning_rate": 1.3368934284154522e-06, + "loss": 1.183, + "step": 27651 + }, + { + "epoch": 2.3567714991903177, + "grad_norm": 39.69322234392099, + "learning_rate": 1.3365559543129058e-06, + "loss": 1.3787, + "step": 27652 + }, + { + "epoch": 2.356856728884343, + "grad_norm": 76.15527711466017, + "learning_rate": 1.3362185162388746e-06, + "loss": 1.7618, + "step": 27653 + }, + { + "epoch": 2.3569419585783686, + "grad_norm": 36.243772071267735, + "learning_rate": 1.3358811141966799e-06, + "loss": 0.8654, + "step": 27654 + }, + { + "epoch": 2.357027188272394, + "grad_norm": 72.4768673883168, + "learning_rate": 1.3355437481896372e-06, + "loss": 2.2002, + "step": 27655 + }, + { + "epoch": 2.3571124179664196, + "grad_norm": 36.6458926166385, + "learning_rate": 1.3352064182210666e-06, + "loss": 1.275, + "step": 27656 + }, + { + "epoch": 2.357197647660445, + "grad_norm": 38.580009534269685, + "learning_rate": 1.3348691242942852e-06, + "loss": 1.2028, + "step": 27657 + }, + { + "epoch": 2.3572828773544705, + "grad_norm": 62.21428268324598, + "learning_rate": 1.3345318664126095e-06, + "loss": 1.4385, + "step": 27658 + }, + { + "epoch": 2.3573681070484955, + "grad_norm": 60.38408790822969, + "learning_rate": 1.3341946445793546e-06, + "loss": 1.4465, + "step": 27659 + }, + { + "epoch": 2.357453336742521, + "grad_norm": 55.60762614905296, + "learning_rate": 1.3338574587978403e-06, + "loss": 1.7528, + "step": 27660 + }, + { + "epoch": 2.3575385664365465, + "grad_norm": 63.829046565642905, + "learning_rate": 1.3335203090713794e-06, + "loss": 1.4783, + "step": 27661 + }, + { + "epoch": 2.357623796130572, + "grad_norm": 88.64447560819033, + "learning_rate": 1.3331831954032914e-06, + "loss": 1.5911, + "step": 27662 + }, + { + "epoch": 2.3577090258245974, + "grad_norm": 29.731309647482853, + "learning_rate": 1.3328461177968876e-06, + "loss": 1.1434, + "step": 27663 + }, + { + "epoch": 2.357794255518623, + "grad_norm": 55.061913110536395, + "learning_rate": 1.332509076255487e-06, + "loss": 1.4501, + "step": 27664 + }, + { + "epoch": 2.357879485212648, + "grad_norm": 32.09904475115755, + "learning_rate": 1.332172070782402e-06, + "loss": 0.9636, + "step": 27665 + }, + { + "epoch": 2.3579647149066734, + "grad_norm": 33.562679966622575, + "learning_rate": 1.3318351013809472e-06, + "loss": 1.044, + "step": 27666 + }, + { + "epoch": 2.358049944600699, + "grad_norm": 40.228043534198214, + "learning_rate": 1.3314981680544354e-06, + "loss": 1.3765, + "step": 27667 + }, + { + "epoch": 2.3581351742947243, + "grad_norm": 47.658060350637584, + "learning_rate": 1.3311612708061828e-06, + "loss": 1.3462, + "step": 27668 + }, + { + "epoch": 2.3582204039887498, + "grad_norm": 45.533731228646985, + "learning_rate": 1.3308244096395007e-06, + "loss": 1.4759, + "step": 27669 + }, + { + "epoch": 2.3583056336827752, + "grad_norm": 47.964900892633, + "learning_rate": 1.3304875845577014e-06, + "loss": 1.3455, + "step": 27670 + }, + { + "epoch": 2.3583908633768003, + "grad_norm": 53.12342101239985, + "learning_rate": 1.330150795564099e-06, + "loss": 0.9647, + "step": 27671 + }, + { + "epoch": 2.3584760930708257, + "grad_norm": 28.832433939303858, + "learning_rate": 1.3298140426620065e-06, + "loss": 1.0114, + "step": 27672 + }, + { + "epoch": 2.358561322764851, + "grad_norm": 57.40634837931579, + "learning_rate": 1.3294773258547343e-06, + "loss": 1.356, + "step": 27673 + }, + { + "epoch": 2.3586465524588767, + "grad_norm": 42.017802403244566, + "learning_rate": 1.329140645145594e-06, + "loss": 0.8008, + "step": 27674 + }, + { + "epoch": 2.358731782152902, + "grad_norm": 32.91283292627834, + "learning_rate": 1.3288040005378973e-06, + "loss": 0.8175, + "step": 27675 + }, + { + "epoch": 2.3588170118469276, + "grad_norm": 52.112418413597396, + "learning_rate": 1.3284673920349528e-06, + "loss": 1.9563, + "step": 27676 + }, + { + "epoch": 2.358902241540953, + "grad_norm": 31.290414583774727, + "learning_rate": 1.328130819640074e-06, + "loss": 0.9707, + "step": 27677 + }, + { + "epoch": 2.358987471234978, + "grad_norm": 39.690399579058464, + "learning_rate": 1.3277942833565678e-06, + "loss": 1.2067, + "step": 27678 + }, + { + "epoch": 2.3590727009290036, + "grad_norm": 43.81408760908108, + "learning_rate": 1.3274577831877472e-06, + "loss": 1.1981, + "step": 27679 + }, + { + "epoch": 2.359157930623029, + "grad_norm": 50.80710276310355, + "learning_rate": 1.3271213191369188e-06, + "loss": 1.8694, + "step": 27680 + }, + { + "epoch": 2.3592431603170545, + "grad_norm": 63.31135310246394, + "learning_rate": 1.3267848912073938e-06, + "loss": 1.2369, + "step": 27681 + }, + { + "epoch": 2.35932839001108, + "grad_norm": 47.30565872127592, + "learning_rate": 1.3264484994024802e-06, + "loss": 1.4769, + "step": 27682 + }, + { + "epoch": 2.3594136197051054, + "grad_norm": 67.83439984354722, + "learning_rate": 1.3261121437254858e-06, + "loss": 1.7287, + "step": 27683 + }, + { + "epoch": 2.359498849399131, + "grad_norm": 65.03169662890123, + "learning_rate": 1.325775824179717e-06, + "loss": 2.0787, + "step": 27684 + }, + { + "epoch": 2.359584079093156, + "grad_norm": 38.82470598439652, + "learning_rate": 1.3254395407684845e-06, + "loss": 1.2632, + "step": 27685 + }, + { + "epoch": 2.3596693087871814, + "grad_norm": 64.47434417284573, + "learning_rate": 1.3251032934950925e-06, + "loss": 1.2223, + "step": 27686 + }, + { + "epoch": 2.359754538481207, + "grad_norm": 51.63401009353105, + "learning_rate": 1.3247670823628512e-06, + "loss": 1.3672, + "step": 27687 + }, + { + "epoch": 2.3598397681752323, + "grad_norm": 31.117672883983882, + "learning_rate": 1.3244309073750649e-06, + "loss": 0.8909, + "step": 27688 + }, + { + "epoch": 2.359924997869258, + "grad_norm": 56.8103895660729, + "learning_rate": 1.3240947685350386e-06, + "loss": 1.7403, + "step": 27689 + }, + { + "epoch": 2.360010227563283, + "grad_norm": 34.901423251667275, + "learning_rate": 1.3237586658460811e-06, + "loss": 1.1766, + "step": 27690 + }, + { + "epoch": 2.3600954572573083, + "grad_norm": 33.15221317000865, + "learning_rate": 1.3234225993114964e-06, + "loss": 0.8055, + "step": 27691 + }, + { + "epoch": 2.3601806869513338, + "grad_norm": 52.28510520280453, + "learning_rate": 1.3230865689345878e-06, + "loss": 1.2421, + "step": 27692 + }, + { + "epoch": 2.3602659166453592, + "grad_norm": 60.813648602898446, + "learning_rate": 1.3227505747186635e-06, + "loss": 1.5063, + "step": 27693 + }, + { + "epoch": 2.3603511463393847, + "grad_norm": 33.53952260896476, + "learning_rate": 1.3224146166670244e-06, + "loss": 0.9054, + "step": 27694 + }, + { + "epoch": 2.36043637603341, + "grad_norm": 23.941193399538573, + "learning_rate": 1.3220786947829778e-06, + "loss": 0.7523, + "step": 27695 + }, + { + "epoch": 2.3605216057274356, + "grad_norm": 62.94159208104377, + "learning_rate": 1.321742809069826e-06, + "loss": 1.4549, + "step": 27696 + }, + { + "epoch": 2.3606068354214607, + "grad_norm": 68.1409629151038, + "learning_rate": 1.3214069595308698e-06, + "loss": 1.7673, + "step": 27697 + }, + { + "epoch": 2.360692065115486, + "grad_norm": 36.04925986016159, + "learning_rate": 1.321071146169417e-06, + "loss": 1.2592, + "step": 27698 + }, + { + "epoch": 2.3607772948095116, + "grad_norm": 36.70596592801904, + "learning_rate": 1.320735368988767e-06, + "loss": 1.4487, + "step": 27699 + }, + { + "epoch": 2.360862524503537, + "grad_norm": 55.707483691923045, + "learning_rate": 1.320399627992221e-06, + "loss": 1.7539, + "step": 27700 + }, + { + "epoch": 2.3609477541975625, + "grad_norm": 66.66748828633787, + "learning_rate": 1.3200639231830842e-06, + "loss": 1.4339, + "step": 27701 + }, + { + "epoch": 2.361032983891588, + "grad_norm": 47.02136971300694, + "learning_rate": 1.319728254564655e-06, + "loss": 0.9396, + "step": 27702 + }, + { + "epoch": 2.3611182135856135, + "grad_norm": 42.25219232301889, + "learning_rate": 1.3193926221402375e-06, + "loss": 1.1549, + "step": 27703 + }, + { + "epoch": 2.3612034432796385, + "grad_norm": 40.87190623101983, + "learning_rate": 1.3190570259131314e-06, + "loss": 1.0423, + "step": 27704 + }, + { + "epoch": 2.361288672973664, + "grad_norm": 48.34348836094032, + "learning_rate": 1.3187214658866366e-06, + "loss": 1.53, + "step": 27705 + }, + { + "epoch": 2.3613739026676894, + "grad_norm": 39.12892713600552, + "learning_rate": 1.3183859420640516e-06, + "loss": 1.3623, + "step": 27706 + }, + { + "epoch": 2.361459132361715, + "grad_norm": 46.923299375490735, + "learning_rate": 1.3180504544486795e-06, + "loss": 0.7712, + "step": 27707 + }, + { + "epoch": 2.3615443620557404, + "grad_norm": 103.44800455170572, + "learning_rate": 1.3177150030438168e-06, + "loss": 2.2488, + "step": 27708 + }, + { + "epoch": 2.3616295917497654, + "grad_norm": 41.078320819249036, + "learning_rate": 1.317379587852765e-06, + "loss": 1.02, + "step": 27709 + }, + { + "epoch": 2.361714821443791, + "grad_norm": 42.87313018927345, + "learning_rate": 1.3170442088788205e-06, + "loss": 1.0333, + "step": 27710 + }, + { + "epoch": 2.3618000511378163, + "grad_norm": 67.05119254454594, + "learning_rate": 1.3167088661252842e-06, + "loss": 1.5414, + "step": 27711 + }, + { + "epoch": 2.361885280831842, + "grad_norm": 56.07338252246181, + "learning_rate": 1.3163735595954524e-06, + "loss": 1.831, + "step": 27712 + }, + { + "epoch": 2.3619705105258673, + "grad_norm": 79.2845439080843, + "learning_rate": 1.3160382892926226e-06, + "loss": 2.6297, + "step": 27713 + }, + { + "epoch": 2.3620557402198927, + "grad_norm": 66.05837107463326, + "learning_rate": 1.3157030552200917e-06, + "loss": 1.9979, + "step": 27714 + }, + { + "epoch": 2.362140969913918, + "grad_norm": 37.133847107002, + "learning_rate": 1.3153678573811584e-06, + "loss": 1.1603, + "step": 27715 + }, + { + "epoch": 2.3622261996079432, + "grad_norm": 25.142568358977808, + "learning_rate": 1.3150326957791177e-06, + "loss": 0.8197, + "step": 27716 + }, + { + "epoch": 2.3623114293019687, + "grad_norm": 37.31646143713904, + "learning_rate": 1.3146975704172653e-06, + "loss": 0.7423, + "step": 27717 + }, + { + "epoch": 2.362396658995994, + "grad_norm": 103.76864329434122, + "learning_rate": 1.314362481298899e-06, + "loss": 2.049, + "step": 27718 + }, + { + "epoch": 2.3624818886900196, + "grad_norm": 23.456989425601382, + "learning_rate": 1.3140274284273118e-06, + "loss": 1.162, + "step": 27719 + }, + { + "epoch": 2.362567118384045, + "grad_norm": 50.48250981418459, + "learning_rate": 1.3136924118058014e-06, + "loss": 1.2993, + "step": 27720 + }, + { + "epoch": 2.3626523480780706, + "grad_norm": 64.06303461363825, + "learning_rate": 1.3133574314376618e-06, + "loss": 1.3952, + "step": 27721 + }, + { + "epoch": 2.362737577772096, + "grad_norm": 33.240835454232695, + "learning_rate": 1.3130224873261866e-06, + "loss": 1.9569, + "step": 27722 + }, + { + "epoch": 2.362822807466121, + "grad_norm": 26.765661798083066, + "learning_rate": 1.312687579474669e-06, + "loss": 0.8969, + "step": 27723 + }, + { + "epoch": 2.3629080371601465, + "grad_norm": 57.0046385736043, + "learning_rate": 1.3123527078864052e-06, + "loss": 1.5694, + "step": 27724 + }, + { + "epoch": 2.362993266854172, + "grad_norm": 27.248700239988988, + "learning_rate": 1.3120178725646854e-06, + "loss": 1.0061, + "step": 27725 + }, + { + "epoch": 2.3630784965481975, + "grad_norm": 23.80894733023861, + "learning_rate": 1.311683073512806e-06, + "loss": 0.8198, + "step": 27726 + }, + { + "epoch": 2.363163726242223, + "grad_norm": 37.6904119050371, + "learning_rate": 1.311348310734057e-06, + "loss": 1.2214, + "step": 27727 + }, + { + "epoch": 2.363248955936248, + "grad_norm": 54.02808774455738, + "learning_rate": 1.3110135842317323e-06, + "loss": 1.4566, + "step": 27728 + }, + { + "epoch": 2.3633341856302734, + "grad_norm": 64.61775724487157, + "learning_rate": 1.3106788940091237e-06, + "loss": 1.7085, + "step": 27729 + }, + { + "epoch": 2.363419415324299, + "grad_norm": 32.29890172030517, + "learning_rate": 1.310344240069522e-06, + "loss": 1.0242, + "step": 27730 + }, + { + "epoch": 2.3635046450183244, + "grad_norm": 63.8961047021967, + "learning_rate": 1.3100096224162173e-06, + "loss": 1.7026, + "step": 27731 + }, + { + "epoch": 2.36358987471235, + "grad_norm": 22.865561616831226, + "learning_rate": 1.3096750410525033e-06, + "loss": 0.5833, + "step": 27732 + }, + { + "epoch": 2.3636751044063753, + "grad_norm": 62.85498030719047, + "learning_rate": 1.3093404959816669e-06, + "loss": 2.2935, + "step": 27733 + }, + { + "epoch": 2.3637603341004008, + "grad_norm": 38.176224370094424, + "learning_rate": 1.309005987207002e-06, + "loss": 1.4416, + "step": 27734 + }, + { + "epoch": 2.363845563794426, + "grad_norm": 46.89699435837102, + "learning_rate": 1.3086715147317963e-06, + "loss": 1.541, + "step": 27735 + }, + { + "epoch": 2.3639307934884513, + "grad_norm": 26.2780687407033, + "learning_rate": 1.308337078559338e-06, + "loss": 0.8971, + "step": 27736 + }, + { + "epoch": 2.3640160231824767, + "grad_norm": 44.75256479809021, + "learning_rate": 1.3080026786929195e-06, + "loss": 1.3927, + "step": 27737 + }, + { + "epoch": 2.364101252876502, + "grad_norm": 61.49748324528383, + "learning_rate": 1.3076683151358272e-06, + "loss": 1.4748, + "step": 27738 + }, + { + "epoch": 2.3641864825705277, + "grad_norm": 27.913217778855874, + "learning_rate": 1.3073339878913488e-06, + "loss": 1.19, + "step": 27739 + }, + { + "epoch": 2.364271712264553, + "grad_norm": 44.86230484654774, + "learning_rate": 1.306999696962774e-06, + "loss": 0.8814, + "step": 27740 + }, + { + "epoch": 2.3643569419585786, + "grad_norm": 59.483980554269465, + "learning_rate": 1.3066654423533886e-06, + "loss": 1.4663, + "step": 27741 + }, + { + "epoch": 2.3644421716526036, + "grad_norm": 70.25049358951324, + "learning_rate": 1.3063312240664826e-06, + "loss": 1.4451, + "step": 27742 + }, + { + "epoch": 2.364527401346629, + "grad_norm": 55.95507298377065, + "learning_rate": 1.3059970421053414e-06, + "loss": 1.6407, + "step": 27743 + }, + { + "epoch": 2.3646126310406546, + "grad_norm": 35.48664338445739, + "learning_rate": 1.30566289647325e-06, + "loss": 0.7923, + "step": 27744 + }, + { + "epoch": 2.36469786073468, + "grad_norm": 48.040720014703105, + "learning_rate": 1.305328787173497e-06, + "loss": 1.6685, + "step": 27745 + }, + { + "epoch": 2.3647830904287055, + "grad_norm": 53.599119829729005, + "learning_rate": 1.3049947142093678e-06, + "loss": 0.7876, + "step": 27746 + }, + { + "epoch": 2.3648683201227305, + "grad_norm": 48.778018069444634, + "learning_rate": 1.3046606775841458e-06, + "loss": 1.5194, + "step": 27747 + }, + { + "epoch": 2.364953549816756, + "grad_norm": 39.50687676590211, + "learning_rate": 1.3043266773011192e-06, + "loss": 1.4668, + "step": 27748 + }, + { + "epoch": 2.3650387795107815, + "grad_norm": 82.56181247848733, + "learning_rate": 1.3039927133635693e-06, + "loss": 2.3319, + "step": 27749 + }, + { + "epoch": 2.365124009204807, + "grad_norm": 26.1481450673341, + "learning_rate": 1.303658785774784e-06, + "loss": 0.7472, + "step": 27750 + }, + { + "epoch": 2.3652092388988324, + "grad_norm": 26.986364376410716, + "learning_rate": 1.303324894538046e-06, + "loss": 1.1071, + "step": 27751 + }, + { + "epoch": 2.365294468592858, + "grad_norm": 83.80924827078479, + "learning_rate": 1.3029910396566386e-06, + "loss": 2.1822, + "step": 27752 + }, + { + "epoch": 2.3653796982868833, + "grad_norm": 67.4489755967464, + "learning_rate": 1.3026572211338439e-06, + "loss": 1.786, + "step": 27753 + }, + { + "epoch": 2.3654649279809083, + "grad_norm": 65.9008315238765, + "learning_rate": 1.3023234389729477e-06, + "loss": 1.9007, + "step": 27754 + }, + { + "epoch": 2.365550157674934, + "grad_norm": 70.82702137735782, + "learning_rate": 1.3019896931772298e-06, + "loss": 1.9937, + "step": 27755 + }, + { + "epoch": 2.3656353873689593, + "grad_norm": 20.556711153370458, + "learning_rate": 1.3016559837499749e-06, + "loss": 0.8954, + "step": 27756 + }, + { + "epoch": 2.3657206170629848, + "grad_norm": 29.146197790839963, + "learning_rate": 1.3013223106944628e-06, + "loss": 0.9135, + "step": 27757 + }, + { + "epoch": 2.3658058467570102, + "grad_norm": 43.2670756288385, + "learning_rate": 1.300988674013977e-06, + "loss": 1.3796, + "step": 27758 + }, + { + "epoch": 2.3658910764510357, + "grad_norm": 36.44921126189224, + "learning_rate": 1.3006550737117985e-06, + "loss": 1.0558, + "step": 27759 + }, + { + "epoch": 2.365976306145061, + "grad_norm": 62.413255267417654, + "learning_rate": 1.3003215097912063e-06, + "loss": 1.2559, + "step": 27760 + }, + { + "epoch": 2.366061535839086, + "grad_norm": 26.776471893109054, + "learning_rate": 1.2999879822554828e-06, + "loss": 0.925, + "step": 27761 + }, + { + "epoch": 2.3661467655331117, + "grad_norm": 53.996785217667956, + "learning_rate": 1.2996544911079057e-06, + "loss": 0.9389, + "step": 27762 + }, + { + "epoch": 2.366231995227137, + "grad_norm": 22.022688250678524, + "learning_rate": 1.2993210363517578e-06, + "loss": 0.6722, + "step": 27763 + }, + { + "epoch": 2.3663172249211626, + "grad_norm": 26.366378877788538, + "learning_rate": 1.2989876179903154e-06, + "loss": 0.8662, + "step": 27764 + }, + { + "epoch": 2.366402454615188, + "grad_norm": 50.655854628471324, + "learning_rate": 1.2986542360268606e-06, + "loss": 1.8077, + "step": 27765 + }, + { + "epoch": 2.3664876843092135, + "grad_norm": 43.78122047934459, + "learning_rate": 1.2983208904646688e-06, + "loss": 1.0888, + "step": 27766 + }, + { + "epoch": 2.3665729140032385, + "grad_norm": 75.86265197764708, + "learning_rate": 1.297987581307022e-06, + "loss": 2.1842, + "step": 27767 + }, + { + "epoch": 2.366658143697264, + "grad_norm": 61.948726860962466, + "learning_rate": 1.2976543085571964e-06, + "loss": 1.2645, + "step": 27768 + }, + { + "epoch": 2.3667433733912895, + "grad_norm": 53.35690302883341, + "learning_rate": 1.2973210722184693e-06, + "loss": 1.7126, + "step": 27769 + }, + { + "epoch": 2.366828603085315, + "grad_norm": 67.70941242073529, + "learning_rate": 1.296987872294117e-06, + "loss": 1.4967, + "step": 27770 + }, + { + "epoch": 2.3669138327793404, + "grad_norm": 32.22093707743979, + "learning_rate": 1.2966547087874188e-06, + "loss": 1.4857, + "step": 27771 + }, + { + "epoch": 2.366999062473366, + "grad_norm": 27.225737332966364, + "learning_rate": 1.2963215817016484e-06, + "loss": 0.8658, + "step": 27772 + }, + { + "epoch": 2.367084292167391, + "grad_norm": 63.27785868894476, + "learning_rate": 1.295988491040085e-06, + "loss": 1.4409, + "step": 27773 + }, + { + "epoch": 2.3671695218614164, + "grad_norm": 45.29354843426555, + "learning_rate": 1.2956554368060032e-06, + "loss": 1.342, + "step": 27774 + }, + { + "epoch": 2.367254751555442, + "grad_norm": 53.95224519126952, + "learning_rate": 1.2953224190026764e-06, + "loss": 2.1777, + "step": 27775 + }, + { + "epoch": 2.3673399812494673, + "grad_norm": 66.58761204370303, + "learning_rate": 1.2949894376333833e-06, + "loss": 1.9043, + "step": 27776 + }, + { + "epoch": 2.367425210943493, + "grad_norm": 27.50925618507361, + "learning_rate": 1.2946564927013967e-06, + "loss": 1.278, + "step": 27777 + }, + { + "epoch": 2.3675104406375183, + "grad_norm": 24.47716090438716, + "learning_rate": 1.2943235842099895e-06, + "loss": 0.7021, + "step": 27778 + }, + { + "epoch": 2.3675956703315437, + "grad_norm": 28.58100815135258, + "learning_rate": 1.2939907121624389e-06, + "loss": 1.1167, + "step": 27779 + }, + { + "epoch": 2.3676809000255687, + "grad_norm": 69.17185607309112, + "learning_rate": 1.2936578765620155e-06, + "loss": 1.9391, + "step": 27780 + }, + { + "epoch": 2.367766129719594, + "grad_norm": 56.686838009737535, + "learning_rate": 1.293325077411996e-06, + "loss": 1.0942, + "step": 27781 + }, + { + "epoch": 2.3678513594136197, + "grad_norm": 26.245652834556118, + "learning_rate": 1.2929923147156514e-06, + "loss": 0.8734, + "step": 27782 + }, + { + "epoch": 2.367936589107645, + "grad_norm": 65.27996771054507, + "learning_rate": 1.2926595884762528e-06, + "loss": 2.0972, + "step": 27783 + }, + { + "epoch": 2.3680218188016706, + "grad_norm": 81.16092272939825, + "learning_rate": 1.292326898697075e-06, + "loss": 1.797, + "step": 27784 + }, + { + "epoch": 2.368107048495696, + "grad_norm": 51.70717621434356, + "learning_rate": 1.2919942453813894e-06, + "loss": 1.4868, + "step": 27785 + }, + { + "epoch": 2.368192278189721, + "grad_norm": 37.79139096749006, + "learning_rate": 1.2916616285324657e-06, + "loss": 0.7203, + "step": 27786 + }, + { + "epoch": 2.3682775078837466, + "grad_norm": 53.19041533313823, + "learning_rate": 1.2913290481535774e-06, + "loss": 1.2287, + "step": 27787 + }, + { + "epoch": 2.368362737577772, + "grad_norm": 70.81006860010974, + "learning_rate": 1.2909965042479926e-06, + "loss": 1.3745, + "step": 27788 + }, + { + "epoch": 2.3684479672717975, + "grad_norm": 23.22318444271044, + "learning_rate": 1.290663996818985e-06, + "loss": 0.6994, + "step": 27789 + }, + { + "epoch": 2.368533196965823, + "grad_norm": 32.76592467136389, + "learning_rate": 1.2903315258698235e-06, + "loss": 0.9308, + "step": 27790 + }, + { + "epoch": 2.3686184266598485, + "grad_norm": 76.88813352972707, + "learning_rate": 1.289999091403777e-06, + "loss": 1.6439, + "step": 27791 + }, + { + "epoch": 2.3687036563538735, + "grad_norm": 60.70661497706649, + "learning_rate": 1.2896666934241137e-06, + "loss": 2.3149, + "step": 27792 + }, + { + "epoch": 2.368788886047899, + "grad_norm": 32.354450408989415, + "learning_rate": 1.2893343319341056e-06, + "loss": 0.9792, + "step": 27793 + }, + { + "epoch": 2.3688741157419244, + "grad_norm": 50.40228529841139, + "learning_rate": 1.2890020069370185e-06, + "loss": 1.7782, + "step": 27794 + }, + { + "epoch": 2.36895934543595, + "grad_norm": 28.801291321476462, + "learning_rate": 1.2886697184361236e-06, + "loss": 1.0178, + "step": 27795 + }, + { + "epoch": 2.3690445751299753, + "grad_norm": 53.20426708181724, + "learning_rate": 1.2883374664346854e-06, + "loss": 1.7389, + "step": 27796 + }, + { + "epoch": 2.369129804824001, + "grad_norm": 112.98516679054681, + "learning_rate": 1.2880052509359752e-06, + "loss": 2.6437, + "step": 27797 + }, + { + "epoch": 2.3692150345180263, + "grad_norm": 105.20967658366096, + "learning_rate": 1.2876730719432584e-06, + "loss": 2.7757, + "step": 27798 + }, + { + "epoch": 2.3693002642120513, + "grad_norm": 67.1772565882055, + "learning_rate": 1.2873409294598015e-06, + "loss": 1.4504, + "step": 27799 + }, + { + "epoch": 2.369385493906077, + "grad_norm": 68.63007846623582, + "learning_rate": 1.28700882348887e-06, + "loss": 2.0764, + "step": 27800 + }, + { + "epoch": 2.3694707236001022, + "grad_norm": 39.05174222031557, + "learning_rate": 1.2866767540337327e-06, + "loss": 1.3642, + "step": 27801 + }, + { + "epoch": 2.3695559532941277, + "grad_norm": 54.253917542423245, + "learning_rate": 1.2863447210976527e-06, + "loss": 1.3526, + "step": 27802 + }, + { + "epoch": 2.369641182988153, + "grad_norm": 45.981681068647255, + "learning_rate": 1.2860127246838976e-06, + "loss": 1.577, + "step": 27803 + }, + { + "epoch": 2.3697264126821787, + "grad_norm": 45.38790190445135, + "learning_rate": 1.2856807647957325e-06, + "loss": 1.036, + "step": 27804 + }, + { + "epoch": 2.3698116423762037, + "grad_norm": 55.50960249445368, + "learning_rate": 1.285348841436419e-06, + "loss": 1.5959, + "step": 27805 + }, + { + "epoch": 2.369896872070229, + "grad_norm": 40.23700497158574, + "learning_rate": 1.2850169546092256e-06, + "loss": 1.4365, + "step": 27806 + }, + { + "epoch": 2.3699821017642546, + "grad_norm": 64.25110068295847, + "learning_rate": 1.2846851043174137e-06, + "loss": 1.8892, + "step": 27807 + }, + { + "epoch": 2.37006733145828, + "grad_norm": 52.65903911440421, + "learning_rate": 1.2843532905642475e-06, + "loss": 1.4632, + "step": 27808 + }, + { + "epoch": 2.3701525611523055, + "grad_norm": 61.6324016474064, + "learning_rate": 1.2840215133529892e-06, + "loss": 1.7573, + "step": 27809 + }, + { + "epoch": 2.370237790846331, + "grad_norm": 33.50254457657022, + "learning_rate": 1.2836897726869042e-06, + "loss": 0.9703, + "step": 27810 + }, + { + "epoch": 2.370323020540356, + "grad_norm": 54.69683219260696, + "learning_rate": 1.2833580685692515e-06, + "loss": 1.6431, + "step": 27811 + }, + { + "epoch": 2.3704082502343815, + "grad_norm": 71.45700003652709, + "learning_rate": 1.2830264010032973e-06, + "loss": 2.2518, + "step": 27812 + }, + { + "epoch": 2.370493479928407, + "grad_norm": 77.31285964352585, + "learning_rate": 1.2826947699922997e-06, + "loss": 2.7931, + "step": 27813 + }, + { + "epoch": 2.3705787096224324, + "grad_norm": 30.2474505533203, + "learning_rate": 1.282363175539524e-06, + "loss": 1.3181, + "step": 27814 + }, + { + "epoch": 2.370663939316458, + "grad_norm": 48.716736138002, + "learning_rate": 1.282031617648229e-06, + "loss": 1.2229, + "step": 27815 + }, + { + "epoch": 2.3707491690104834, + "grad_norm": 71.02052268036752, + "learning_rate": 1.2817000963216752e-06, + "loss": 1.9391, + "step": 27816 + }, + { + "epoch": 2.370834398704509, + "grad_norm": 48.03805855415911, + "learning_rate": 1.2813686115631224e-06, + "loss": 1.3762, + "step": 27817 + }, + { + "epoch": 2.370919628398534, + "grad_norm": 55.813518236615636, + "learning_rate": 1.2810371633758329e-06, + "loss": 0.9307, + "step": 27818 + }, + { + "epoch": 2.3710048580925593, + "grad_norm": 46.941458576033405, + "learning_rate": 1.280705751763064e-06, + "loss": 1.5392, + "step": 27819 + }, + { + "epoch": 2.371090087786585, + "grad_norm": 50.485736915557204, + "learning_rate": 1.2803743767280768e-06, + "loss": 1.4402, + "step": 27820 + }, + { + "epoch": 2.3711753174806103, + "grad_norm": 102.85526122825634, + "learning_rate": 1.2800430382741302e-06, + "loss": 2.3842, + "step": 27821 + }, + { + "epoch": 2.3712605471746357, + "grad_norm": 40.72838797523272, + "learning_rate": 1.279711736404481e-06, + "loss": 1.4824, + "step": 27822 + }, + { + "epoch": 2.371345776868661, + "grad_norm": 42.26673214612279, + "learning_rate": 1.2793804711223895e-06, + "loss": 1.1319, + "step": 27823 + }, + { + "epoch": 2.3714310065626867, + "grad_norm": 42.68487692330572, + "learning_rate": 1.2790492424311123e-06, + "loss": 1.1478, + "step": 27824 + }, + { + "epoch": 2.3715162362567117, + "grad_norm": 33.845965965554726, + "learning_rate": 1.2787180503339063e-06, + "loss": 1.3342, + "step": 27825 + }, + { + "epoch": 2.371601465950737, + "grad_norm": 55.60072184983926, + "learning_rate": 1.2783868948340312e-06, + "loss": 1.7038, + "step": 27826 + }, + { + "epoch": 2.3716866956447626, + "grad_norm": 57.67456201338247, + "learning_rate": 1.2780557759347405e-06, + "loss": 1.4745, + "step": 27827 + }, + { + "epoch": 2.371771925338788, + "grad_norm": 43.34145899563466, + "learning_rate": 1.277724693639294e-06, + "loss": 1.5998, + "step": 27828 + }, + { + "epoch": 2.3718571550328136, + "grad_norm": 43.57505220656295, + "learning_rate": 1.2773936479509459e-06, + "loss": 1.6117, + "step": 27829 + }, + { + "epoch": 2.3719423847268386, + "grad_norm": 53.71505055415501, + "learning_rate": 1.27706263887295e-06, + "loss": 1.2253, + "step": 27830 + }, + { + "epoch": 2.372027614420864, + "grad_norm": 61.8698894407209, + "learning_rate": 1.276731666408566e-06, + "loss": 1.8846, + "step": 27831 + }, + { + "epoch": 2.3721128441148895, + "grad_norm": 64.8376571171947, + "learning_rate": 1.276400730561046e-06, + "loss": 1.2528, + "step": 27832 + }, + { + "epoch": 2.372198073808915, + "grad_norm": 39.51728394000449, + "learning_rate": 1.2760698313336438e-06, + "loss": 0.8938, + "step": 27833 + }, + { + "epoch": 2.3722833035029405, + "grad_norm": 41.5605190447996, + "learning_rate": 1.2757389687296168e-06, + "loss": 1.0694, + "step": 27834 + }, + { + "epoch": 2.372368533196966, + "grad_norm": 37.42886620021286, + "learning_rate": 1.2754081427522159e-06, + "loss": 1.1173, + "step": 27835 + }, + { + "epoch": 2.3724537628909914, + "grad_norm": 59.43119968459055, + "learning_rate": 1.2750773534046967e-06, + "loss": 1.5128, + "step": 27836 + }, + { + "epoch": 2.3725389925850164, + "grad_norm": 45.27367986640861, + "learning_rate": 1.2747466006903125e-06, + "loss": 1.1479, + "step": 27837 + }, + { + "epoch": 2.372624222279042, + "grad_norm": 24.49105594787826, + "learning_rate": 1.2744158846123145e-06, + "loss": 0.7945, + "step": 27838 + }, + { + "epoch": 2.3727094519730674, + "grad_norm": 22.22113374264244, + "learning_rate": 1.2740852051739543e-06, + "loss": 0.9191, + "step": 27839 + }, + { + "epoch": 2.372794681667093, + "grad_norm": 42.981600981414374, + "learning_rate": 1.2737545623784875e-06, + "loss": 1.3984, + "step": 27840 + }, + { + "epoch": 2.3728799113611183, + "grad_norm": 70.99891471596891, + "learning_rate": 1.2734239562291623e-06, + "loss": 2.109, + "step": 27841 + }, + { + "epoch": 2.372965141055144, + "grad_norm": 78.93210113794808, + "learning_rate": 1.2730933867292334e-06, + "loss": 1.7577, + "step": 27842 + }, + { + "epoch": 2.3730503707491692, + "grad_norm": 72.37925469045213, + "learning_rate": 1.272762853881948e-06, + "loss": 2.2059, + "step": 27843 + }, + { + "epoch": 2.3731356004431943, + "grad_norm": 31.33626563233893, + "learning_rate": 1.2724323576905606e-06, + "loss": 0.6049, + "step": 27844 + }, + { + "epoch": 2.3732208301372197, + "grad_norm": 66.97906100149747, + "learning_rate": 1.27210189815832e-06, + "loss": 1.8581, + "step": 27845 + }, + { + "epoch": 2.373306059831245, + "grad_norm": 51.41493676601018, + "learning_rate": 1.2717714752884757e-06, + "loss": 1.2346, + "step": 27846 + }, + { + "epoch": 2.3733912895252707, + "grad_norm": 42.9550053787452, + "learning_rate": 1.2714410890842778e-06, + "loss": 1.8474, + "step": 27847 + }, + { + "epoch": 2.373476519219296, + "grad_norm": 56.2286845953971, + "learning_rate": 1.2711107395489736e-06, + "loss": 1.8935, + "step": 27848 + }, + { + "epoch": 2.373561748913321, + "grad_norm": 43.34364242107097, + "learning_rate": 1.2707804266858137e-06, + "loss": 1.8094, + "step": 27849 + }, + { + "epoch": 2.3736469786073466, + "grad_norm": 61.01639968659626, + "learning_rate": 1.2704501504980477e-06, + "loss": 1.5416, + "step": 27850 + }, + { + "epoch": 2.373732208301372, + "grad_norm": 52.56177387320438, + "learning_rate": 1.2701199109889223e-06, + "loss": 1.6896, + "step": 27851 + }, + { + "epoch": 2.3738174379953976, + "grad_norm": 24.144905189753597, + "learning_rate": 1.2697897081616844e-06, + "loss": 0.7699, + "step": 27852 + }, + { + "epoch": 2.373902667689423, + "grad_norm": 52.77927319512209, + "learning_rate": 1.2694595420195837e-06, + "loss": 1.4794, + "step": 27853 + }, + { + "epoch": 2.3739878973834485, + "grad_norm": 29.948273515191417, + "learning_rate": 1.2691294125658659e-06, + "loss": 1.3706, + "step": 27854 + }, + { + "epoch": 2.374073127077474, + "grad_norm": 53.79047082386268, + "learning_rate": 1.268799319803778e-06, + "loss": 1.2792, + "step": 27855 + }, + { + "epoch": 2.374158356771499, + "grad_norm": 36.3788943799178, + "learning_rate": 1.2684692637365648e-06, + "loss": 1.0205, + "step": 27856 + }, + { + "epoch": 2.3742435864655245, + "grad_norm": 41.67780121170369, + "learning_rate": 1.2681392443674734e-06, + "loss": 1.078, + "step": 27857 + }, + { + "epoch": 2.37432881615955, + "grad_norm": 37.048631819603024, + "learning_rate": 1.267809261699751e-06, + "loss": 1.2407, + "step": 27858 + }, + { + "epoch": 2.3744140458535754, + "grad_norm": 54.306459130836224, + "learning_rate": 1.267479315736642e-06, + "loss": 1.9612, + "step": 27859 + }, + { + "epoch": 2.374499275547601, + "grad_norm": 59.64755841363552, + "learning_rate": 1.267149406481389e-06, + "loss": 1.565, + "step": 27860 + }, + { + "epoch": 2.3745845052416263, + "grad_norm": 41.3317987139661, + "learning_rate": 1.2668195339372397e-06, + "loss": 0.8954, + "step": 27861 + }, + { + "epoch": 2.374669734935652, + "grad_norm": 81.02460398825245, + "learning_rate": 1.266489698107437e-06, + "loss": 1.2746, + "step": 27862 + }, + { + "epoch": 2.374754964629677, + "grad_norm": 38.55853072792311, + "learning_rate": 1.2661598989952246e-06, + "loss": 1.3527, + "step": 27863 + }, + { + "epoch": 2.3748401943237023, + "grad_norm": 50.547919997199564, + "learning_rate": 1.2658301366038446e-06, + "loss": 1.0412, + "step": 27864 + }, + { + "epoch": 2.3749254240177278, + "grad_norm": 44.337581648067655, + "learning_rate": 1.2655004109365427e-06, + "loss": 0.8982, + "step": 27865 + }, + { + "epoch": 2.3750106537117532, + "grad_norm": 49.51850655735031, + "learning_rate": 1.2651707219965586e-06, + "loss": 1.0891, + "step": 27866 + }, + { + "epoch": 2.3750958834057787, + "grad_norm": 35.906251791829526, + "learning_rate": 1.2648410697871377e-06, + "loss": 1.0331, + "step": 27867 + }, + { + "epoch": 2.3751811130998037, + "grad_norm": 56.102127947106695, + "learning_rate": 1.2645114543115211e-06, + "loss": 1.4031, + "step": 27868 + }, + { + "epoch": 2.375266342793829, + "grad_norm": 62.35809715953991, + "learning_rate": 1.2641818755729484e-06, + "loss": 1.6603, + "step": 27869 + }, + { + "epoch": 2.3753515724878547, + "grad_norm": 24.217234908293403, + "learning_rate": 1.2638523335746639e-06, + "loss": 1.0327, + "step": 27870 + }, + { + "epoch": 2.37543680218188, + "grad_norm": 80.56068128496665, + "learning_rate": 1.2635228283199075e-06, + "loss": 1.9886, + "step": 27871 + }, + { + "epoch": 2.3755220318759056, + "grad_norm": 45.3393329823533, + "learning_rate": 1.2631933598119173e-06, + "loss": 1.6152, + "step": 27872 + }, + { + "epoch": 2.375607261569931, + "grad_norm": 55.705558241827084, + "learning_rate": 1.2628639280539374e-06, + "loss": 0.9274, + "step": 27873 + }, + { + "epoch": 2.3756924912639565, + "grad_norm": 77.09614747341008, + "learning_rate": 1.262534533049204e-06, + "loss": 1.9925, + "step": 27874 + }, + { + "epoch": 2.3757777209579816, + "grad_norm": 55.38125489884713, + "learning_rate": 1.26220517480096e-06, + "loss": 1.8069, + "step": 27875 + }, + { + "epoch": 2.375862950652007, + "grad_norm": 32.39915291088692, + "learning_rate": 1.261875853312443e-06, + "loss": 1.1428, + "step": 27876 + }, + { + "epoch": 2.3759481803460325, + "grad_norm": 49.603145354908165, + "learning_rate": 1.2615465685868916e-06, + "loss": 1.2772, + "step": 27877 + }, + { + "epoch": 2.376033410040058, + "grad_norm": 43.29589307380042, + "learning_rate": 1.2612173206275424e-06, + "loss": 0.9665, + "step": 27878 + }, + { + "epoch": 2.3761186397340834, + "grad_norm": 67.12010069334654, + "learning_rate": 1.2608881094376368e-06, + "loss": 1.4209, + "step": 27879 + }, + { + "epoch": 2.376203869428109, + "grad_norm": 48.128780054521435, + "learning_rate": 1.2605589350204095e-06, + "loss": 1.1159, + "step": 27880 + }, + { + "epoch": 2.3762890991221344, + "grad_norm": 32.53616515668937, + "learning_rate": 1.2602297973791006e-06, + "loss": 0.806, + "step": 27881 + }, + { + "epoch": 2.3763743288161594, + "grad_norm": 40.60035395803781, + "learning_rate": 1.2599006965169442e-06, + "loss": 1.2682, + "step": 27882 + }, + { + "epoch": 2.376459558510185, + "grad_norm": 26.410513561650426, + "learning_rate": 1.2595716324371793e-06, + "loss": 0.7668, + "step": 27883 + }, + { + "epoch": 2.3765447882042103, + "grad_norm": 38.884907480235015, + "learning_rate": 1.2592426051430416e-06, + "loss": 1.5873, + "step": 27884 + }, + { + "epoch": 2.376630017898236, + "grad_norm": 41.2360838844311, + "learning_rate": 1.2589136146377662e-06, + "loss": 1.5461, + "step": 27885 + }, + { + "epoch": 2.3767152475922613, + "grad_norm": 41.29038087095314, + "learning_rate": 1.2585846609245877e-06, + "loss": 1.2182, + "step": 27886 + }, + { + "epoch": 2.3768004772862863, + "grad_norm": 37.48744799870515, + "learning_rate": 1.2582557440067433e-06, + "loss": 1.0311, + "step": 27887 + }, + { + "epoch": 2.3768857069803118, + "grad_norm": 43.25589733070415, + "learning_rate": 1.2579268638874658e-06, + "loss": 1.4003, + "step": 27888 + }, + { + "epoch": 2.3769709366743372, + "grad_norm": 61.410437356986684, + "learning_rate": 1.2575980205699916e-06, + "loss": 2.5929, + "step": 27889 + }, + { + "epoch": 2.3770561663683627, + "grad_norm": 33.34828040757193, + "learning_rate": 1.2572692140575533e-06, + "loss": 1.2192, + "step": 27890 + }, + { + "epoch": 2.377141396062388, + "grad_norm": 46.37593476207888, + "learning_rate": 1.2569404443533855e-06, + "loss": 1.1283, + "step": 27891 + }, + { + "epoch": 2.3772266257564136, + "grad_norm": 66.50044037348121, + "learning_rate": 1.2566117114607217e-06, + "loss": 2.0026, + "step": 27892 + }, + { + "epoch": 2.377311855450439, + "grad_norm": 46.246757943354126, + "learning_rate": 1.2562830153827937e-06, + "loss": 1.0076, + "step": 27893 + }, + { + "epoch": 2.377397085144464, + "grad_norm": 41.83517240791339, + "learning_rate": 1.2559543561228354e-06, + "loss": 1.4223, + "step": 27894 + }, + { + "epoch": 2.3774823148384896, + "grad_norm": 30.403644983589068, + "learning_rate": 1.255625733684076e-06, + "loss": 1.1707, + "step": 27895 + }, + { + "epoch": 2.377567544532515, + "grad_norm": 59.462026149137046, + "learning_rate": 1.25529714806975e-06, + "loss": 1.4825, + "step": 27896 + }, + { + "epoch": 2.3776527742265405, + "grad_norm": 53.152540363544766, + "learning_rate": 1.25496859928309e-06, + "loss": 1.7639, + "step": 27897 + }, + { + "epoch": 2.377738003920566, + "grad_norm": 42.357694514831756, + "learning_rate": 1.2546400873273262e-06, + "loss": 1.1504, + "step": 27898 + }, + { + "epoch": 2.3778232336145915, + "grad_norm": 52.97456093814087, + "learning_rate": 1.254311612205687e-06, + "loss": 1.2568, + "step": 27899 + }, + { + "epoch": 2.377908463308617, + "grad_norm": 30.916429811187015, + "learning_rate": 1.2539831739214059e-06, + "loss": 0.8746, + "step": 27900 + }, + { + "epoch": 2.377993693002642, + "grad_norm": 72.6392512435505, + "learning_rate": 1.253654772477712e-06, + "loss": 1.598, + "step": 27901 + }, + { + "epoch": 2.3780789226966674, + "grad_norm": 53.54918342715881, + "learning_rate": 1.2533264078778346e-06, + "loss": 1.4274, + "step": 27902 + }, + { + "epoch": 2.378164152390693, + "grad_norm": 22.047612560464014, + "learning_rate": 1.2529980801250019e-06, + "loss": 1.1174, + "step": 27903 + }, + { + "epoch": 2.3782493820847184, + "grad_norm": 119.57470517101618, + "learning_rate": 1.2526697892224443e-06, + "loss": 2.1736, + "step": 27904 + }, + { + "epoch": 2.378334611778744, + "grad_norm": 42.223442530407574, + "learning_rate": 1.2523415351733913e-06, + "loss": 1.2952, + "step": 27905 + }, + { + "epoch": 2.3784198414727693, + "grad_norm": 36.27605347086185, + "learning_rate": 1.2520133179810706e-06, + "loss": 0.8573, + "step": 27906 + }, + { + "epoch": 2.3785050711667943, + "grad_norm": 82.0626428627618, + "learning_rate": 1.2516851376487093e-06, + "loss": 1.9842, + "step": 27907 + }, + { + "epoch": 2.37859030086082, + "grad_norm": 65.3144384615602, + "learning_rate": 1.251356994179534e-06, + "loss": 1.5496, + "step": 27908 + }, + { + "epoch": 2.3786755305548453, + "grad_norm": 81.94888816947362, + "learning_rate": 1.2510288875767746e-06, + "loss": 2.3256, + "step": 27909 + }, + { + "epoch": 2.3787607602488707, + "grad_norm": 27.350091770862882, + "learning_rate": 1.2507008178436563e-06, + "loss": 1.0757, + "step": 27910 + }, + { + "epoch": 2.378845989942896, + "grad_norm": 29.576719538516567, + "learning_rate": 1.250372784983404e-06, + "loss": 1.03, + "step": 27911 + }, + { + "epoch": 2.3789312196369217, + "grad_norm": 75.94784078148355, + "learning_rate": 1.2500447889992468e-06, + "loss": 1.7661, + "step": 27912 + }, + { + "epoch": 2.3790164493309467, + "grad_norm": 42.109596873243014, + "learning_rate": 1.249716829894408e-06, + "loss": 1.3088, + "step": 27913 + }, + { + "epoch": 2.379101679024972, + "grad_norm": 15.205574315918676, + "learning_rate": 1.2493889076721149e-06, + "loss": 0.5515, + "step": 27914 + }, + { + "epoch": 2.3791869087189976, + "grad_norm": 54.440202153373406, + "learning_rate": 1.249061022335592e-06, + "loss": 1.2565, + "step": 27915 + }, + { + "epoch": 2.379272138413023, + "grad_norm": 44.40424215705388, + "learning_rate": 1.2487331738880621e-06, + "loss": 1.5349, + "step": 27916 + }, + { + "epoch": 2.3793573681070486, + "grad_norm": 20.697064265770962, + "learning_rate": 1.248405362332752e-06, + "loss": 0.7064, + "step": 27917 + }, + { + "epoch": 2.379442597801074, + "grad_norm": 54.68799162195682, + "learning_rate": 1.2480775876728841e-06, + "loss": 1.4747, + "step": 27918 + }, + { + "epoch": 2.3795278274950995, + "grad_norm": 45.630075966224936, + "learning_rate": 1.2477498499116813e-06, + "loss": 1.741, + "step": 27919 + }, + { + "epoch": 2.3796130571891245, + "grad_norm": 68.0029748103319, + "learning_rate": 1.2474221490523692e-06, + "loss": 1.5361, + "step": 27920 + }, + { + "epoch": 2.37969828688315, + "grad_norm": 25.25840456143382, + "learning_rate": 1.247094485098167e-06, + "loss": 0.9808, + "step": 27921 + }, + { + "epoch": 2.3797835165771755, + "grad_norm": 25.97367910602343, + "learning_rate": 1.2467668580523012e-06, + "loss": 0.6781, + "step": 27922 + }, + { + "epoch": 2.379868746271201, + "grad_norm": 86.58904132567488, + "learning_rate": 1.2464392679179922e-06, + "loss": 1.6756, + "step": 27923 + }, + { + "epoch": 2.3799539759652264, + "grad_norm": 64.03829978874224, + "learning_rate": 1.2461117146984614e-06, + "loss": 1.4901, + "step": 27924 + }, + { + "epoch": 2.380039205659252, + "grad_norm": 35.68066448505997, + "learning_rate": 1.2457841983969288e-06, + "loss": 1.0936, + "step": 27925 + }, + { + "epoch": 2.380124435353277, + "grad_norm": 71.22964989936109, + "learning_rate": 1.2454567190166177e-06, + "loss": 1.2769, + "step": 27926 + }, + { + "epoch": 2.3802096650473024, + "grad_norm": 58.95244560491612, + "learning_rate": 1.245129276560747e-06, + "loss": 1.8463, + "step": 27927 + }, + { + "epoch": 2.380294894741328, + "grad_norm": 63.71529855860233, + "learning_rate": 1.2448018710325394e-06, + "loss": 1.8483, + "step": 27928 + }, + { + "epoch": 2.3803801244353533, + "grad_norm": 52.69777554788157, + "learning_rate": 1.2444745024352112e-06, + "loss": 1.1379, + "step": 27929 + }, + { + "epoch": 2.3804653541293788, + "grad_norm": 91.31352875804903, + "learning_rate": 1.244147170771986e-06, + "loss": 2.4393, + "step": 27930 + }, + { + "epoch": 2.3805505838234042, + "grad_norm": 34.225701639696574, + "learning_rate": 1.2438198760460807e-06, + "loss": 0.7308, + "step": 27931 + }, + { + "epoch": 2.3806358135174293, + "grad_norm": 61.79906641685035, + "learning_rate": 1.2434926182607144e-06, + "loss": 1.6979, + "step": 27932 + }, + { + "epoch": 2.3807210432114547, + "grad_norm": 36.974617703042135, + "learning_rate": 1.2431653974191038e-06, + "loss": 1.2126, + "step": 27933 + }, + { + "epoch": 2.38080627290548, + "grad_norm": 75.34663946278098, + "learning_rate": 1.2428382135244705e-06, + "loss": 2.3312, + "step": 27934 + }, + { + "epoch": 2.3808915025995057, + "grad_norm": 62.87741811615621, + "learning_rate": 1.2425110665800288e-06, + "loss": 1.3687, + "step": 27935 + }, + { + "epoch": 2.380976732293531, + "grad_norm": 62.18595105599735, + "learning_rate": 1.2421839565889987e-06, + "loss": 1.5737, + "step": 27936 + }, + { + "epoch": 2.3810619619875566, + "grad_norm": 27.07610054118786, + "learning_rate": 1.2418568835545968e-06, + "loss": 0.9125, + "step": 27937 + }, + { + "epoch": 2.381147191681582, + "grad_norm": 56.48505669077246, + "learning_rate": 1.2415298474800376e-06, + "loss": 1.6226, + "step": 27938 + }, + { + "epoch": 2.381232421375607, + "grad_norm": 58.26566465247384, + "learning_rate": 1.24120284836854e-06, + "loss": 1.575, + "step": 27939 + }, + { + "epoch": 2.3813176510696326, + "grad_norm": 35.70066417220742, + "learning_rate": 1.2408758862233188e-06, + "loss": 1.1195, + "step": 27940 + }, + { + "epoch": 2.381402880763658, + "grad_norm": 74.38979894027926, + "learning_rate": 1.2405489610475902e-06, + "loss": 1.8961, + "step": 27941 + }, + { + "epoch": 2.3814881104576835, + "grad_norm": 39.695104148636425, + "learning_rate": 1.2402220728445668e-06, + "loss": 1.2355, + "step": 27942 + }, + { + "epoch": 2.381573340151709, + "grad_norm": 41.875934292750046, + "learning_rate": 1.2398952216174653e-06, + "loss": 0.9471, + "step": 27943 + }, + { + "epoch": 2.3816585698457344, + "grad_norm": 35.21687567674669, + "learning_rate": 1.2395684073695014e-06, + "loss": 1.4319, + "step": 27944 + }, + { + "epoch": 2.38174379953976, + "grad_norm": 28.66810135832015, + "learning_rate": 1.2392416301038883e-06, + "loss": 1.1443, + "step": 27945 + }, + { + "epoch": 2.381829029233785, + "grad_norm": 62.64396778419175, + "learning_rate": 1.2389148898238378e-06, + "loss": 1.7288, + "step": 27946 + }, + { + "epoch": 2.3819142589278104, + "grad_norm": 60.31110309224065, + "learning_rate": 1.2385881865325667e-06, + "loss": 1.7152, + "step": 27947 + }, + { + "epoch": 2.381999488621836, + "grad_norm": 34.27511126470099, + "learning_rate": 1.2382615202332854e-06, + "loss": 0.8375, + "step": 27948 + }, + { + "epoch": 2.3820847183158613, + "grad_norm": 81.75342668212465, + "learning_rate": 1.2379348909292076e-06, + "loss": 1.9306, + "step": 27949 + }, + { + "epoch": 2.382169948009887, + "grad_norm": 51.22079486634626, + "learning_rate": 1.2376082986235438e-06, + "loss": 1.6948, + "step": 27950 + }, + { + "epoch": 2.382255177703912, + "grad_norm": 32.058798284139336, + "learning_rate": 1.2372817433195078e-06, + "loss": 1.5063, + "step": 27951 + }, + { + "epoch": 2.3823404073979373, + "grad_norm": 99.43098607380024, + "learning_rate": 1.2369552250203115e-06, + "loss": 2.1625, + "step": 27952 + }, + { + "epoch": 2.3824256370919628, + "grad_norm": 33.73246247280477, + "learning_rate": 1.2366287437291657e-06, + "loss": 1.2141, + "step": 27953 + }, + { + "epoch": 2.3825108667859882, + "grad_norm": 47.81418381433235, + "learning_rate": 1.2363022994492808e-06, + "loss": 1.5786, + "step": 27954 + }, + { + "epoch": 2.3825960964800137, + "grad_norm": 64.62514153245377, + "learning_rate": 1.2359758921838656e-06, + "loss": 1.3463, + "step": 27955 + }, + { + "epoch": 2.382681326174039, + "grad_norm": 49.47304345845403, + "learning_rate": 1.235649521936133e-06, + "loss": 1.21, + "step": 27956 + }, + { + "epoch": 2.3827665558680646, + "grad_norm": 51.81215473709053, + "learning_rate": 1.235323188709292e-06, + "loss": 1.1316, + "step": 27957 + }, + { + "epoch": 2.3828517855620897, + "grad_norm": 78.32515533273667, + "learning_rate": 1.2349968925065498e-06, + "loss": 1.6988, + "step": 27958 + }, + { + "epoch": 2.382937015256115, + "grad_norm": 70.63251008473291, + "learning_rate": 1.2346706333311186e-06, + "loss": 2.1003, + "step": 27959 + }, + { + "epoch": 2.3830222449501406, + "grad_norm": 39.42111444084755, + "learning_rate": 1.2343444111862041e-06, + "loss": 1.3748, + "step": 27960 + }, + { + "epoch": 2.383107474644166, + "grad_norm": 85.98745598279079, + "learning_rate": 1.2340182260750177e-06, + "loss": 2.1774, + "step": 27961 + }, + { + "epoch": 2.3831927043381915, + "grad_norm": 53.42799513431421, + "learning_rate": 1.2336920780007649e-06, + "loss": 1.4744, + "step": 27962 + }, + { + "epoch": 2.383277934032217, + "grad_norm": 46.52695191516089, + "learning_rate": 1.2333659669666532e-06, + "loss": 1.2692, + "step": 27963 + }, + { + "epoch": 2.3833631637262425, + "grad_norm": 52.72060599401198, + "learning_rate": 1.2330398929758918e-06, + "loss": 1.4134, + "step": 27964 + }, + { + "epoch": 2.3834483934202675, + "grad_norm": 34.13580182519123, + "learning_rate": 1.232713856031686e-06, + "loss": 1.2888, + "step": 27965 + }, + { + "epoch": 2.383533623114293, + "grad_norm": 70.14722865556816, + "learning_rate": 1.232387856137241e-06, + "loss": 1.4787, + "step": 27966 + }, + { + "epoch": 2.3836188528083184, + "grad_norm": 61.03229334764534, + "learning_rate": 1.2320618932957663e-06, + "loss": 1.9673, + "step": 27967 + }, + { + "epoch": 2.383704082502344, + "grad_norm": 28.69974761320979, + "learning_rate": 1.2317359675104635e-06, + "loss": 0.7868, + "step": 27968 + }, + { + "epoch": 2.3837893121963694, + "grad_norm": 25.186753455109553, + "learning_rate": 1.2314100787845417e-06, + "loss": 0.8226, + "step": 27969 + }, + { + "epoch": 2.3838745418903944, + "grad_norm": 42.78145905112073, + "learning_rate": 1.2310842271212043e-06, + "loss": 1.3918, + "step": 27970 + }, + { + "epoch": 2.38395977158442, + "grad_norm": 62.689951202254974, + "learning_rate": 1.230758412523656e-06, + "loss": 2.2711, + "step": 27971 + }, + { + "epoch": 2.3840450012784453, + "grad_norm": 57.24556072902084, + "learning_rate": 1.2304326349950995e-06, + "loss": 1.6536, + "step": 27972 + }, + { + "epoch": 2.384130230972471, + "grad_norm": 43.48406917294317, + "learning_rate": 1.2301068945387419e-06, + "loss": 1.1125, + "step": 27973 + }, + { + "epoch": 2.3842154606664963, + "grad_norm": 36.783876557287506, + "learning_rate": 1.2297811911577833e-06, + "loss": 1.5277, + "step": 27974 + }, + { + "epoch": 2.3843006903605217, + "grad_norm": 60.57533734882984, + "learning_rate": 1.2294555248554297e-06, + "loss": 1.5907, + "step": 27975 + }, + { + "epoch": 2.384385920054547, + "grad_norm": 80.95447558885179, + "learning_rate": 1.2291298956348818e-06, + "loss": 1.9031, + "step": 27976 + }, + { + "epoch": 2.384471149748572, + "grad_norm": 21.543981130209943, + "learning_rate": 1.2288043034993442e-06, + "loss": 0.9306, + "step": 27977 + }, + { + "epoch": 2.3845563794425977, + "grad_norm": 53.76726442177173, + "learning_rate": 1.2284787484520178e-06, + "loss": 1.5415, + "step": 27978 + }, + { + "epoch": 2.384641609136623, + "grad_norm": 55.10528381864711, + "learning_rate": 1.2281532304961041e-06, + "loss": 1.5428, + "step": 27979 + }, + { + "epoch": 2.3847268388306486, + "grad_norm": 61.46699836280222, + "learning_rate": 1.2278277496348045e-06, + "loss": 1.6922, + "step": 27980 + }, + { + "epoch": 2.384812068524674, + "grad_norm": 78.90090950210354, + "learning_rate": 1.2275023058713188e-06, + "loss": 2.0522, + "step": 27981 + }, + { + "epoch": 2.3848972982186996, + "grad_norm": 23.77677108743447, + "learning_rate": 1.227176899208849e-06, + "loss": 0.9141, + "step": 27982 + }, + { + "epoch": 2.384982527912725, + "grad_norm": 84.91429914225235, + "learning_rate": 1.2268515296505968e-06, + "loss": 2.5437, + "step": 27983 + }, + { + "epoch": 2.38506775760675, + "grad_norm": 39.964775725118486, + "learning_rate": 1.2265261971997599e-06, + "loss": 1.5968, + "step": 27984 + }, + { + "epoch": 2.3851529873007755, + "grad_norm": 39.50830348384589, + "learning_rate": 1.2262009018595373e-06, + "loss": 1.3893, + "step": 27985 + }, + { + "epoch": 2.385238216994801, + "grad_norm": 55.87687947701446, + "learning_rate": 1.225875643633131e-06, + "loss": 1.7204, + "step": 27986 + }, + { + "epoch": 2.3853234466888265, + "grad_norm": 83.40308286236473, + "learning_rate": 1.2255504225237375e-06, + "loss": 2.5034, + "step": 27987 + }, + { + "epoch": 2.385408676382852, + "grad_norm": 36.56041236450946, + "learning_rate": 1.2252252385345564e-06, + "loss": 1.1122, + "step": 27988 + }, + { + "epoch": 2.385493906076877, + "grad_norm": 39.717105786594914, + "learning_rate": 1.224900091668783e-06, + "loss": 1.0021, + "step": 27989 + }, + { + "epoch": 2.3855791357709024, + "grad_norm": 39.41582602373156, + "learning_rate": 1.2245749819296177e-06, + "loss": 1.2712, + "step": 27990 + }, + { + "epoch": 2.385664365464928, + "grad_norm": 63.62534373579301, + "learning_rate": 1.2242499093202587e-06, + "loss": 2.0225, + "step": 27991 + }, + { + "epoch": 2.3857495951589534, + "grad_norm": 39.65057475076693, + "learning_rate": 1.2239248738439008e-06, + "loss": 1.1363, + "step": 27992 + }, + { + "epoch": 2.385834824852979, + "grad_norm": 75.32119104413279, + "learning_rate": 1.223599875503742e-06, + "loss": 1.845, + "step": 27993 + }, + { + "epoch": 2.3859200545470043, + "grad_norm": 32.5418723786457, + "learning_rate": 1.223274914302976e-06, + "loss": 0.8089, + "step": 27994 + }, + { + "epoch": 2.3860052842410298, + "grad_norm": 73.75831202660655, + "learning_rate": 1.2229499902448023e-06, + "loss": 2.171, + "step": 27995 + }, + { + "epoch": 2.386090513935055, + "grad_norm": 37.17329961826361, + "learning_rate": 1.2226251033324143e-06, + "loss": 1.3168, + "step": 27996 + }, + { + "epoch": 2.3861757436290802, + "grad_norm": 24.44148771063574, + "learning_rate": 1.222300253569006e-06, + "loss": 0.6485, + "step": 27997 + }, + { + "epoch": 2.3862609733231057, + "grad_norm": 68.30726672800934, + "learning_rate": 1.2219754409577738e-06, + "loss": 1.5122, + "step": 27998 + }, + { + "epoch": 2.386346203017131, + "grad_norm": 39.252989527874604, + "learning_rate": 1.2216506655019134e-06, + "loss": 1.6353, + "step": 27999 + }, + { + "epoch": 2.3864314327111567, + "grad_norm": 48.965435985487744, + "learning_rate": 1.2213259272046169e-06, + "loss": 1.2495, + "step": 28000 + }, + { + "epoch": 2.386516662405182, + "grad_norm": 56.66710220995053, + "learning_rate": 1.221001226069079e-06, + "loss": 1.1657, + "step": 28001 + }, + { + "epoch": 2.3866018920992076, + "grad_norm": 68.40914119392066, + "learning_rate": 1.2206765620984906e-06, + "loss": 1.7322, + "step": 28002 + }, + { + "epoch": 2.3866871217932326, + "grad_norm": 42.676221676932734, + "learning_rate": 1.2203519352960474e-06, + "loss": 1.3188, + "step": 28003 + }, + { + "epoch": 2.386772351487258, + "grad_norm": 16.564302004585223, + "learning_rate": 1.220027345664942e-06, + "loss": 0.4886, + "step": 28004 + }, + { + "epoch": 2.3868575811812835, + "grad_norm": 77.65233398837968, + "learning_rate": 1.2197027932083638e-06, + "loss": 2.3196, + "step": 28005 + }, + { + "epoch": 2.386942810875309, + "grad_norm": 29.12441848038484, + "learning_rate": 1.219378277929506e-06, + "loss": 1.2296, + "step": 28006 + }, + { + "epoch": 2.3870280405693345, + "grad_norm": 64.24851060003898, + "learning_rate": 1.2190537998315626e-06, + "loss": 1.8877, + "step": 28007 + }, + { + "epoch": 2.3871132702633595, + "grad_norm": 67.18775325305104, + "learning_rate": 1.218729358917723e-06, + "loss": 1.4517, + "step": 28008 + }, + { + "epoch": 2.387198499957385, + "grad_norm": 24.629416962814172, + "learning_rate": 1.218404955191177e-06, + "loss": 0.7146, + "step": 28009 + }, + { + "epoch": 2.3872837296514104, + "grad_norm": 54.37318805612499, + "learning_rate": 1.2180805886551155e-06, + "loss": 1.666, + "step": 28010 + }, + { + "epoch": 2.387368959345436, + "grad_norm": 35.99495229891847, + "learning_rate": 1.2177562593127274e-06, + "loss": 0.7034, + "step": 28011 + }, + { + "epoch": 2.3874541890394614, + "grad_norm": 33.926591882807834, + "learning_rate": 1.2174319671672052e-06, + "loss": 1.4025, + "step": 28012 + }, + { + "epoch": 2.387539418733487, + "grad_norm": 61.3768844128866, + "learning_rate": 1.217107712221735e-06, + "loss": 1.7229, + "step": 28013 + }, + { + "epoch": 2.3876246484275123, + "grad_norm": 33.970188979889464, + "learning_rate": 1.2167834944795087e-06, + "loss": 1.2505, + "step": 28014 + }, + { + "epoch": 2.3877098781215373, + "grad_norm": 30.15409514492206, + "learning_rate": 1.216459313943712e-06, + "loss": 1.3392, + "step": 28015 + }, + { + "epoch": 2.387795107815563, + "grad_norm": 55.97270542470514, + "learning_rate": 1.2161351706175361e-06, + "loss": 1.1828, + "step": 28016 + }, + { + "epoch": 2.3878803375095883, + "grad_norm": 66.08596093086365, + "learning_rate": 1.2158110645041671e-06, + "loss": 2.1415, + "step": 28017 + }, + { + "epoch": 2.3879655672036137, + "grad_norm": 22.50737819725397, + "learning_rate": 1.2154869956067922e-06, + "loss": 1.3291, + "step": 28018 + }, + { + "epoch": 2.388050796897639, + "grad_norm": 58.628269153784906, + "learning_rate": 1.2151629639285978e-06, + "loss": 2.2767, + "step": 28019 + }, + { + "epoch": 2.3881360265916647, + "grad_norm": 30.507279725909036, + "learning_rate": 1.2148389694727736e-06, + "loss": 1.1454, + "step": 28020 + }, + { + "epoch": 2.38822125628569, + "grad_norm": 34.55942450323882, + "learning_rate": 1.214515012242502e-06, + "loss": 0.7886, + "step": 28021 + }, + { + "epoch": 2.388306485979715, + "grad_norm": 66.77246983170991, + "learning_rate": 1.2141910922409727e-06, + "loss": 1.6394, + "step": 28022 + }, + { + "epoch": 2.3883917156737406, + "grad_norm": 80.77156755530349, + "learning_rate": 1.2138672094713695e-06, + "loss": 1.2342, + "step": 28023 + }, + { + "epoch": 2.388476945367766, + "grad_norm": 56.800682295250056, + "learning_rate": 1.2135433639368766e-06, + "loss": 1.6518, + "step": 28024 + }, + { + "epoch": 2.3885621750617916, + "grad_norm": 47.65874182465597, + "learning_rate": 1.2132195556406817e-06, + "loss": 1.6761, + "step": 28025 + }, + { + "epoch": 2.388647404755817, + "grad_norm": 56.95005440688703, + "learning_rate": 1.2128957845859672e-06, + "loss": 1.2657, + "step": 28026 + }, + { + "epoch": 2.3887326344498425, + "grad_norm": 46.75107139104282, + "learning_rate": 1.212572050775918e-06, + "loss": 1.5964, + "step": 28027 + }, + { + "epoch": 2.3888178641438675, + "grad_norm": 41.526640588132274, + "learning_rate": 1.2122483542137164e-06, + "loss": 0.9668, + "step": 28028 + }, + { + "epoch": 2.388903093837893, + "grad_norm": 82.19407896827742, + "learning_rate": 1.2119246949025477e-06, + "loss": 1.3327, + "step": 28029 + }, + { + "epoch": 2.3889883235319185, + "grad_norm": 62.67743931696944, + "learning_rate": 1.2116010728455952e-06, + "loss": 1.5516, + "step": 28030 + }, + { + "epoch": 2.389073553225944, + "grad_norm": 33.17769355303805, + "learning_rate": 1.211277488046041e-06, + "loss": 1.0243, + "step": 28031 + }, + { + "epoch": 2.3891587829199694, + "grad_norm": 51.81748281088562, + "learning_rate": 1.2109539405070652e-06, + "loss": 1.461, + "step": 28032 + }, + { + "epoch": 2.389244012613995, + "grad_norm": 31.266447275142603, + "learning_rate": 1.2106304302318538e-06, + "loss": 1.3009, + "step": 28033 + }, + { + "epoch": 2.38932924230802, + "grad_norm": 58.97557504831068, + "learning_rate": 1.2103069572235864e-06, + "loss": 1.6728, + "step": 28034 + }, + { + "epoch": 2.3894144720020454, + "grad_norm": 37.620877713313455, + "learning_rate": 1.2099835214854438e-06, + "loss": 1.3671, + "step": 28035 + }, + { + "epoch": 2.389499701696071, + "grad_norm": 70.43793904814072, + "learning_rate": 1.209660123020606e-06, + "loss": 2.0109, + "step": 28036 + }, + { + "epoch": 2.3895849313900963, + "grad_norm": 35.48447468638498, + "learning_rate": 1.2093367618322548e-06, + "loss": 1.2489, + "step": 28037 + }, + { + "epoch": 2.389670161084122, + "grad_norm": 95.74962361192989, + "learning_rate": 1.2090134379235718e-06, + "loss": 1.9857, + "step": 28038 + }, + { + "epoch": 2.3897553907781472, + "grad_norm": 31.073181454588134, + "learning_rate": 1.208690151297735e-06, + "loss": 0.996, + "step": 28039 + }, + { + "epoch": 2.3898406204721727, + "grad_norm": 38.02839072179823, + "learning_rate": 1.2083669019579246e-06, + "loss": 1.2282, + "step": 28040 + }, + { + "epoch": 2.3899258501661977, + "grad_norm": 53.846156803517, + "learning_rate": 1.2080436899073172e-06, + "loss": 1.6693, + "step": 28041 + }, + { + "epoch": 2.390011079860223, + "grad_norm": 30.203418304061177, + "learning_rate": 1.207720515149095e-06, + "loss": 0.8232, + "step": 28042 + }, + { + "epoch": 2.3900963095542487, + "grad_norm": 24.786411543203386, + "learning_rate": 1.2073973776864344e-06, + "loss": 1.2453, + "step": 28043 + }, + { + "epoch": 2.390181539248274, + "grad_norm": 34.26321275728962, + "learning_rate": 1.207074277522512e-06, + "loss": 0.7168, + "step": 28044 + }, + { + "epoch": 2.3902667689422996, + "grad_norm": 89.7049268671985, + "learning_rate": 1.206751214660507e-06, + "loss": 2.3141, + "step": 28045 + }, + { + "epoch": 2.390351998636325, + "grad_norm": 54.954455409626824, + "learning_rate": 1.2064281891035984e-06, + "loss": 1.8314, + "step": 28046 + }, + { + "epoch": 2.39043722833035, + "grad_norm": 34.28196348049364, + "learning_rate": 1.2061052008549605e-06, + "loss": 1.5238, + "step": 28047 + }, + { + "epoch": 2.3905224580243756, + "grad_norm": 73.04128181538444, + "learning_rate": 1.2057822499177706e-06, + "loss": 2.4782, + "step": 28048 + }, + { + "epoch": 2.390607687718401, + "grad_norm": 49.64506193769138, + "learning_rate": 1.2054593362952033e-06, + "loss": 1.4731, + "step": 28049 + }, + { + "epoch": 2.3906929174124265, + "grad_norm": 66.38934391243977, + "learning_rate": 1.2051364599904374e-06, + "loss": 1.611, + "step": 28050 + }, + { + "epoch": 2.390778147106452, + "grad_norm": 89.32306553771075, + "learning_rate": 1.2048136210066459e-06, + "loss": 1.996, + "step": 28051 + }, + { + "epoch": 2.3908633768004774, + "grad_norm": 37.635072553885344, + "learning_rate": 1.2044908193470035e-06, + "loss": 1.2116, + "step": 28052 + }, + { + "epoch": 2.3909486064945025, + "grad_norm": 52.02260143076948, + "learning_rate": 1.2041680550146866e-06, + "loss": 1.463, + "step": 28053 + }, + { + "epoch": 2.391033836188528, + "grad_norm": 26.605784855320877, + "learning_rate": 1.203845328012867e-06, + "loss": 0.9657, + "step": 28054 + }, + { + "epoch": 2.3911190658825534, + "grad_norm": 71.11890516751929, + "learning_rate": 1.2035226383447225e-06, + "loss": 1.9776, + "step": 28055 + }, + { + "epoch": 2.391204295576579, + "grad_norm": 33.66743680452099, + "learning_rate": 1.2031999860134235e-06, + "loss": 1.8992, + "step": 28056 + }, + { + "epoch": 2.3912895252706043, + "grad_norm": 69.01692928876146, + "learning_rate": 1.2028773710221441e-06, + "loss": 2.3765, + "step": 28057 + }, + { + "epoch": 2.39137475496463, + "grad_norm": 69.23982447534564, + "learning_rate": 1.2025547933740556e-06, + "loss": 1.0991, + "step": 28058 + }, + { + "epoch": 2.3914599846586553, + "grad_norm": 57.3753742417828, + "learning_rate": 1.202232253072333e-06, + "loss": 1.7868, + "step": 28059 + }, + { + "epoch": 2.3915452143526803, + "grad_norm": 50.929169641623375, + "learning_rate": 1.2019097501201454e-06, + "loss": 1.4039, + "step": 28060 + }, + { + "epoch": 2.3916304440467058, + "grad_norm": 70.82318990472793, + "learning_rate": 1.2015872845206683e-06, + "loss": 1.5857, + "step": 28061 + }, + { + "epoch": 2.3917156737407312, + "grad_norm": 56.93842852538367, + "learning_rate": 1.201264856277069e-06, + "loss": 1.841, + "step": 28062 + }, + { + "epoch": 2.3918009034347567, + "grad_norm": 52.449663459822766, + "learning_rate": 1.2009424653925223e-06, + "loss": 1.4965, + "step": 28063 + }, + { + "epoch": 2.391886133128782, + "grad_norm": 64.32842143519348, + "learning_rate": 1.2006201118701965e-06, + "loss": 2.0139, + "step": 28064 + }, + { + "epoch": 2.3919713628228076, + "grad_norm": 35.9610664184793, + "learning_rate": 1.2002977957132622e-06, + "loss": 1.3454, + "step": 28065 + }, + { + "epoch": 2.392056592516833, + "grad_norm": 59.139143775172506, + "learning_rate": 1.1999755169248878e-06, + "loss": 1.7519, + "step": 28066 + }, + { + "epoch": 2.392141822210858, + "grad_norm": 51.73097152042294, + "learning_rate": 1.1996532755082458e-06, + "loss": 0.9987, + "step": 28067 + }, + { + "epoch": 2.3922270519048836, + "grad_norm": 58.12583141216487, + "learning_rate": 1.1993310714665018e-06, + "loss": 1.5242, + "step": 28068 + }, + { + "epoch": 2.392312281598909, + "grad_norm": 45.32641935470778, + "learning_rate": 1.1990089048028286e-06, + "loss": 1.7479, + "step": 28069 + }, + { + "epoch": 2.3923975112929345, + "grad_norm": 44.99404637931988, + "learning_rate": 1.1986867755203919e-06, + "loss": 1.0177, + "step": 28070 + }, + { + "epoch": 2.39248274098696, + "grad_norm": 49.48582806690054, + "learning_rate": 1.1983646836223589e-06, + "loss": 1.1461, + "step": 28071 + }, + { + "epoch": 2.392567970680985, + "grad_norm": 33.06554156193529, + "learning_rate": 1.1980426291119002e-06, + "loss": 1.0689, + "step": 28072 + }, + { + "epoch": 2.3926532003750105, + "grad_norm": 57.969924668737534, + "learning_rate": 1.1977206119921815e-06, + "loss": 1.601, + "step": 28073 + }, + { + "epoch": 2.392738430069036, + "grad_norm": 49.33902201117234, + "learning_rate": 1.19739863226637e-06, + "loss": 1.2431, + "step": 28074 + }, + { + "epoch": 2.3928236597630614, + "grad_norm": 39.68452378321401, + "learning_rate": 1.19707668993763e-06, + "loss": 1.4853, + "step": 28075 + }, + { + "epoch": 2.392908889457087, + "grad_norm": 66.76781087169586, + "learning_rate": 1.1967547850091299e-06, + "loss": 1.6287, + "step": 28076 + }, + { + "epoch": 2.3929941191511124, + "grad_norm": 71.7389915376476, + "learning_rate": 1.1964329174840367e-06, + "loss": 1.5201, + "step": 28077 + }, + { + "epoch": 2.393079348845138, + "grad_norm": 45.73489968729169, + "learning_rate": 1.1961110873655142e-06, + "loss": 1.0416, + "step": 28078 + }, + { + "epoch": 2.393164578539163, + "grad_norm": 16.03251897406056, + "learning_rate": 1.1957892946567262e-06, + "loss": 0.7506, + "step": 28079 + }, + { + "epoch": 2.3932498082331883, + "grad_norm": 75.23714249851126, + "learning_rate": 1.1954675393608406e-06, + "loss": 2.1501, + "step": 28080 + }, + { + "epoch": 2.393335037927214, + "grad_norm": 76.15831430023809, + "learning_rate": 1.1951458214810197e-06, + "loss": 1.7583, + "step": 28081 + }, + { + "epoch": 2.3934202676212393, + "grad_norm": 62.87706952282265, + "learning_rate": 1.1948241410204275e-06, + "loss": 1.7728, + "step": 28082 + }, + { + "epoch": 2.3935054973152647, + "grad_norm": 42.84911305324132, + "learning_rate": 1.1945024979822268e-06, + "loss": 1.5117, + "step": 28083 + }, + { + "epoch": 2.39359072700929, + "grad_norm": 46.88473834560768, + "learning_rate": 1.1941808923695813e-06, + "loss": 1.0501, + "step": 28084 + }, + { + "epoch": 2.3936759567033157, + "grad_norm": 31.78503431855352, + "learning_rate": 1.1938593241856567e-06, + "loss": 0.8907, + "step": 28085 + }, + { + "epoch": 2.3937611863973407, + "grad_norm": 62.66852897540916, + "learning_rate": 1.193537793433613e-06, + "loss": 1.5622, + "step": 28086 + }, + { + "epoch": 2.393846416091366, + "grad_norm": 101.01298230847183, + "learning_rate": 1.193216300116612e-06, + "loss": 1.3898, + "step": 28087 + }, + { + "epoch": 2.3939316457853916, + "grad_norm": 67.93750919218755, + "learning_rate": 1.1928948442378153e-06, + "loss": 2.0923, + "step": 28088 + }, + { + "epoch": 2.394016875479417, + "grad_norm": 82.88792188784885, + "learning_rate": 1.1925734258003863e-06, + "loss": 1.6151, + "step": 28089 + }, + { + "epoch": 2.3941021051734426, + "grad_norm": 53.14156668415703, + "learning_rate": 1.1922520448074842e-06, + "loss": 1.8165, + "step": 28090 + }, + { + "epoch": 2.3941873348674676, + "grad_norm": 35.51506627289025, + "learning_rate": 1.1919307012622694e-06, + "loss": 1.3735, + "step": 28091 + }, + { + "epoch": 2.394272564561493, + "grad_norm": 48.97170232094132, + "learning_rate": 1.1916093951679025e-06, + "loss": 1.257, + "step": 28092 + }, + { + "epoch": 2.3943577942555185, + "grad_norm": 72.9057247313339, + "learning_rate": 1.1912881265275455e-06, + "loss": 1.4639, + "step": 28093 + }, + { + "epoch": 2.394443023949544, + "grad_norm": 43.71419847771429, + "learning_rate": 1.1909668953443566e-06, + "loss": 1.1664, + "step": 28094 + }, + { + "epoch": 2.3945282536435695, + "grad_norm": 82.14644759738921, + "learning_rate": 1.1906457016214945e-06, + "loss": 2.1323, + "step": 28095 + }, + { + "epoch": 2.394613483337595, + "grad_norm": 51.571881698367875, + "learning_rate": 1.190324545362118e-06, + "loss": 1.2445, + "step": 28096 + }, + { + "epoch": 2.3946987130316204, + "grad_norm": 64.32187274144175, + "learning_rate": 1.1900034265693848e-06, + "loss": 1.3041, + "step": 28097 + }, + { + "epoch": 2.3947839427256454, + "grad_norm": 38.98080389910974, + "learning_rate": 1.189682345246455e-06, + "loss": 1.1914, + "step": 28098 + }, + { + "epoch": 2.394869172419671, + "grad_norm": 68.6399424649394, + "learning_rate": 1.189361301396484e-06, + "loss": 2.2084, + "step": 28099 + }, + { + "epoch": 2.3949544021136964, + "grad_norm": 43.528524951866395, + "learning_rate": 1.189040295022632e-06, + "loss": 1.3796, + "step": 28100 + }, + { + "epoch": 2.395039631807722, + "grad_norm": 30.48694618715142, + "learning_rate": 1.1887193261280527e-06, + "loss": 1.179, + "step": 28101 + }, + { + "epoch": 2.3951248615017473, + "grad_norm": 24.206524235213443, + "learning_rate": 1.1883983947159061e-06, + "loss": 1.0964, + "step": 28102 + }, + { + "epoch": 2.3952100911957728, + "grad_norm": 75.23907095527109, + "learning_rate": 1.1880775007893464e-06, + "loss": 1.8424, + "step": 28103 + }, + { + "epoch": 2.3952953208897982, + "grad_norm": 59.19001222335483, + "learning_rate": 1.1877566443515298e-06, + "loss": 1.3934, + "step": 28104 + }, + { + "epoch": 2.3953805505838233, + "grad_norm": 42.01296444603298, + "learning_rate": 1.1874358254056107e-06, + "loss": 1.2883, + "step": 28105 + }, + { + "epoch": 2.3954657802778487, + "grad_norm": 49.43837069109976, + "learning_rate": 1.1871150439547464e-06, + "loss": 1.3231, + "step": 28106 + }, + { + "epoch": 2.395551009971874, + "grad_norm": 46.09204521935939, + "learning_rate": 1.1867943000020887e-06, + "loss": 1.4627, + "step": 28107 + }, + { + "epoch": 2.3956362396658997, + "grad_norm": 45.91357869962292, + "learning_rate": 1.186473593550796e-06, + "loss": 1.652, + "step": 28108 + }, + { + "epoch": 2.395721469359925, + "grad_norm": 71.43925592817716, + "learning_rate": 1.1861529246040182e-06, + "loss": 1.4866, + "step": 28109 + }, + { + "epoch": 2.39580669905395, + "grad_norm": 49.76836705254022, + "learning_rate": 1.1858322931649125e-06, + "loss": 1.316, + "step": 28110 + }, + { + "epoch": 2.3958919287479756, + "grad_norm": 68.82073034801299, + "learning_rate": 1.185511699236631e-06, + "loss": 1.9966, + "step": 28111 + }, + { + "epoch": 2.395977158442001, + "grad_norm": 32.0853696947043, + "learning_rate": 1.1851911428223256e-06, + "loss": 0.7863, + "step": 28112 + }, + { + "epoch": 2.3960623881360266, + "grad_norm": 46.441906453765995, + "learning_rate": 1.18487062392515e-06, + "loss": 1.1229, + "step": 28113 + }, + { + "epoch": 2.396147617830052, + "grad_norm": 83.20675018967931, + "learning_rate": 1.1845501425482542e-06, + "loss": 1.9512, + "step": 28114 + }, + { + "epoch": 2.3962328475240775, + "grad_norm": 61.73577664045552, + "learning_rate": 1.1842296986947915e-06, + "loss": 1.8763, + "step": 28115 + }, + { + "epoch": 2.396318077218103, + "grad_norm": 52.51166582825786, + "learning_rate": 1.1839092923679147e-06, + "loss": 1.5286, + "step": 28116 + }, + { + "epoch": 2.396403306912128, + "grad_norm": 28.166442777878036, + "learning_rate": 1.1835889235707744e-06, + "loss": 1.0729, + "step": 28117 + }, + { + "epoch": 2.3964885366061535, + "grad_norm": 43.80107033066055, + "learning_rate": 1.1832685923065185e-06, + "loss": 1.2511, + "step": 28118 + }, + { + "epoch": 2.396573766300179, + "grad_norm": 24.330386717535117, + "learning_rate": 1.1829482985783007e-06, + "loss": 0.9387, + "step": 28119 + }, + { + "epoch": 2.3966589959942044, + "grad_norm": 83.88467530138742, + "learning_rate": 1.18262804238927e-06, + "loss": 1.4543, + "step": 28120 + }, + { + "epoch": 2.39674422568823, + "grad_norm": 61.80276516055071, + "learning_rate": 1.1823078237425756e-06, + "loss": 1.5644, + "step": 28121 + }, + { + "epoch": 2.3968294553822553, + "grad_norm": 26.74228597784104, + "learning_rate": 1.181987642641365e-06, + "loss": 0.75, + "step": 28122 + }, + { + "epoch": 2.396914685076281, + "grad_norm": 61.85503022633529, + "learning_rate": 1.1816674990887894e-06, + "loss": 1.6632, + "step": 28123 + }, + { + "epoch": 2.396999914770306, + "grad_norm": 94.62309800982985, + "learning_rate": 1.1813473930879977e-06, + "loss": 1.8823, + "step": 28124 + }, + { + "epoch": 2.3970851444643313, + "grad_norm": 42.33363999838819, + "learning_rate": 1.1810273246421368e-06, + "loss": 1.1684, + "step": 28125 + }, + { + "epoch": 2.3971703741583568, + "grad_norm": 51.433365486124075, + "learning_rate": 1.1807072937543551e-06, + "loss": 1.6691, + "step": 28126 + }, + { + "epoch": 2.3972556038523822, + "grad_norm": 46.41603524081948, + "learning_rate": 1.1803873004277976e-06, + "loss": 1.7883, + "step": 28127 + }, + { + "epoch": 2.3973408335464077, + "grad_norm": 40.57850720493102, + "learning_rate": 1.1800673446656153e-06, + "loss": 1.5031, + "step": 28128 + }, + { + "epoch": 2.3974260632404327, + "grad_norm": 46.042236891492266, + "learning_rate": 1.1797474264709524e-06, + "loss": 1.1236, + "step": 28129 + }, + { + "epoch": 2.397511292934458, + "grad_norm": 61.56792901514182, + "learning_rate": 1.179427545846954e-06, + "loss": 1.2549, + "step": 28130 + }, + { + "epoch": 2.3975965226284837, + "grad_norm": 61.55241616455743, + "learning_rate": 1.179107702796768e-06, + "loss": 1.0967, + "step": 28131 + }, + { + "epoch": 2.397681752322509, + "grad_norm": 47.82324467035774, + "learning_rate": 1.1787878973235406e-06, + "loss": 1.3333, + "step": 28132 + }, + { + "epoch": 2.3977669820165346, + "grad_norm": 89.0898683249676, + "learning_rate": 1.1784681294304157e-06, + "loss": 1.9452, + "step": 28133 + }, + { + "epoch": 2.39785221171056, + "grad_norm": 45.906552300770365, + "learning_rate": 1.1781483991205384e-06, + "loss": 1.2272, + "step": 28134 + }, + { + "epoch": 2.3979374414045855, + "grad_norm": 36.0536305924342, + "learning_rate": 1.177828706397051e-06, + "loss": 1.1954, + "step": 28135 + }, + { + "epoch": 2.3980226710986106, + "grad_norm": 33.73646831672099, + "learning_rate": 1.1775090512631016e-06, + "loss": 0.9031, + "step": 28136 + }, + { + "epoch": 2.398107900792636, + "grad_norm": 56.60355047984466, + "learning_rate": 1.1771894337218316e-06, + "loss": 1.232, + "step": 28137 + }, + { + "epoch": 2.3981931304866615, + "grad_norm": 48.875451895308174, + "learning_rate": 1.176869853776383e-06, + "loss": 1.5415, + "step": 28138 + }, + { + "epoch": 2.398278360180687, + "grad_norm": 77.36634964352082, + "learning_rate": 1.1765503114299003e-06, + "loss": 1.9607, + "step": 28139 + }, + { + "epoch": 2.3983635898747124, + "grad_norm": 31.282680519824083, + "learning_rate": 1.176230806685527e-06, + "loss": 1.1118, + "step": 28140 + }, + { + "epoch": 2.398448819568738, + "grad_norm": 55.9162090861458, + "learning_rate": 1.175911339546405e-06, + "loss": 1.2839, + "step": 28141 + }, + { + "epoch": 2.3985340492627634, + "grad_norm": 28.579586123477846, + "learning_rate": 1.1755919100156749e-06, + "loss": 0.7317, + "step": 28142 + }, + { + "epoch": 2.3986192789567884, + "grad_norm": 65.62007438982549, + "learning_rate": 1.1752725180964792e-06, + "loss": 1.8047, + "step": 28143 + }, + { + "epoch": 2.398704508650814, + "grad_norm": 48.60099362778999, + "learning_rate": 1.1749531637919565e-06, + "loss": 1.3594, + "step": 28144 + }, + { + "epoch": 2.3987897383448393, + "grad_norm": 61.02360465004085, + "learning_rate": 1.1746338471052515e-06, + "loss": 2.1148, + "step": 28145 + }, + { + "epoch": 2.398874968038865, + "grad_norm": 56.31860200664404, + "learning_rate": 1.1743145680395008e-06, + "loss": 1.3752, + "step": 28146 + }, + { + "epoch": 2.3989601977328903, + "grad_norm": 35.65914022713178, + "learning_rate": 1.1739953265978476e-06, + "loss": 1.1351, + "step": 28147 + }, + { + "epoch": 2.3990454274269153, + "grad_norm": 81.24720482215847, + "learning_rate": 1.173676122783428e-06, + "loss": 1.9883, + "step": 28148 + }, + { + "epoch": 2.3991306571209408, + "grad_norm": 52.942883214154094, + "learning_rate": 1.1733569565993857e-06, + "loss": 1.5688, + "step": 28149 + }, + { + "epoch": 2.3992158868149662, + "grad_norm": 51.69289681816143, + "learning_rate": 1.173037828048857e-06, + "loss": 1.2029, + "step": 28150 + }, + { + "epoch": 2.3993011165089917, + "grad_norm": 33.3223185596151, + "learning_rate": 1.1727187371349801e-06, + "loss": 1.2729, + "step": 28151 + }, + { + "epoch": 2.399386346203017, + "grad_norm": 70.52136425087501, + "learning_rate": 1.1723996838608925e-06, + "loss": 2.6688, + "step": 28152 + }, + { + "epoch": 2.3994715758970426, + "grad_norm": 49.74428197403105, + "learning_rate": 1.1720806682297347e-06, + "loss": 1.1041, + "step": 28153 + }, + { + "epoch": 2.399556805591068, + "grad_norm": 32.739876947722706, + "learning_rate": 1.1717616902446406e-06, + "loss": 1.0214, + "step": 28154 + }, + { + "epoch": 2.399642035285093, + "grad_norm": 34.55115022939001, + "learning_rate": 1.171442749908751e-06, + "loss": 1.1845, + "step": 28155 + }, + { + "epoch": 2.3997272649791186, + "grad_norm": 26.80391623955066, + "learning_rate": 1.1711238472252001e-06, + "loss": 0.5585, + "step": 28156 + }, + { + "epoch": 2.399812494673144, + "grad_norm": 26.71252449494925, + "learning_rate": 1.1708049821971236e-06, + "loss": 1.4373, + "step": 28157 + }, + { + "epoch": 2.3998977243671695, + "grad_norm": 33.66219537750768, + "learning_rate": 1.1704861548276596e-06, + "loss": 1.289, + "step": 28158 + }, + { + "epoch": 2.399982954061195, + "grad_norm": 55.02821571577418, + "learning_rate": 1.170167365119943e-06, + "loss": 1.7535, + "step": 28159 + }, + { + "epoch": 2.4000681837552205, + "grad_norm": 54.621238426688464, + "learning_rate": 1.1698486130771086e-06, + "loss": 1.6307, + "step": 28160 + }, + { + "epoch": 2.400153413449246, + "grad_norm": 66.83897274289274, + "learning_rate": 1.1695298987022896e-06, + "loss": 1.4113, + "step": 28161 + }, + { + "epoch": 2.400238643143271, + "grad_norm": 62.99281044905739, + "learning_rate": 1.1692112219986219e-06, + "loss": 1.2379, + "step": 28162 + }, + { + "epoch": 2.4003238728372964, + "grad_norm": 62.491105528078215, + "learning_rate": 1.168892582969241e-06, + "loss": 1.686, + "step": 28163 + }, + { + "epoch": 2.400409102531322, + "grad_norm": 74.719611897359, + "learning_rate": 1.1685739816172792e-06, + "loss": 1.7438, + "step": 28164 + }, + { + "epoch": 2.4004943322253474, + "grad_norm": 38.41254447648376, + "learning_rate": 1.1682554179458682e-06, + "loss": 0.8997, + "step": 28165 + }, + { + "epoch": 2.400579561919373, + "grad_norm": 48.318387556833905, + "learning_rate": 1.1679368919581446e-06, + "loss": 1.2809, + "step": 28166 + }, + { + "epoch": 2.4006647916133983, + "grad_norm": 74.10681151812798, + "learning_rate": 1.1676184036572385e-06, + "loss": 2.1773, + "step": 28167 + }, + { + "epoch": 2.4007500213074233, + "grad_norm": 38.55858668674123, + "learning_rate": 1.1672999530462825e-06, + "loss": 1.2078, + "step": 28168 + }, + { + "epoch": 2.400835251001449, + "grad_norm": 98.53978739116329, + "learning_rate": 1.1669815401284074e-06, + "loss": 2.916, + "step": 28169 + }, + { + "epoch": 2.4009204806954743, + "grad_norm": 128.06825762598626, + "learning_rate": 1.1666631649067456e-06, + "loss": 3.4425, + "step": 28170 + }, + { + "epoch": 2.4010057103894997, + "grad_norm": 53.902692316111974, + "learning_rate": 1.1663448273844298e-06, + "loss": 1.3255, + "step": 28171 + }, + { + "epoch": 2.401090940083525, + "grad_norm": 74.58709160119979, + "learning_rate": 1.1660265275645893e-06, + "loss": 1.7351, + "step": 28172 + }, + { + "epoch": 2.4011761697775507, + "grad_norm": 74.69020590477469, + "learning_rate": 1.1657082654503542e-06, + "loss": 1.7312, + "step": 28173 + }, + { + "epoch": 2.4012613994715757, + "grad_norm": 60.20082510414107, + "learning_rate": 1.1653900410448539e-06, + "loss": 1.237, + "step": 28174 + }, + { + "epoch": 2.401346629165601, + "grad_norm": 32.66333437129698, + "learning_rate": 1.1650718543512197e-06, + "loss": 1.1281, + "step": 28175 + }, + { + "epoch": 2.4014318588596266, + "grad_norm": 19.274805587982772, + "learning_rate": 1.1647537053725799e-06, + "loss": 1.0708, + "step": 28176 + }, + { + "epoch": 2.401517088553652, + "grad_norm": 24.64868385239772, + "learning_rate": 1.1644355941120622e-06, + "loss": 0.6688, + "step": 28177 + }, + { + "epoch": 2.4016023182476776, + "grad_norm": 44.21749118747089, + "learning_rate": 1.164117520572796e-06, + "loss": 1.4136, + "step": 28178 + }, + { + "epoch": 2.401687547941703, + "grad_norm": 50.83011057438823, + "learning_rate": 1.163799484757912e-06, + "loss": 0.967, + "step": 28179 + }, + { + "epoch": 2.4017727776357285, + "grad_norm": 73.00416058561348, + "learning_rate": 1.1634814866705351e-06, + "loss": 2.0476, + "step": 28180 + }, + { + "epoch": 2.4018580073297535, + "grad_norm": 39.625016204068, + "learning_rate": 1.163163526313793e-06, + "loss": 1.0299, + "step": 28181 + }, + { + "epoch": 2.401943237023779, + "grad_norm": 41.187489645251325, + "learning_rate": 1.162845603690812e-06, + "loss": 1.3332, + "step": 28182 + }, + { + "epoch": 2.4020284667178045, + "grad_norm": 67.37948773222094, + "learning_rate": 1.1625277188047212e-06, + "loss": 1.8639, + "step": 28183 + }, + { + "epoch": 2.40211369641183, + "grad_norm": 65.76882105558, + "learning_rate": 1.1622098716586456e-06, + "loss": 1.962, + "step": 28184 + }, + { + "epoch": 2.4021989261058554, + "grad_norm": 36.81360841527454, + "learning_rate": 1.1618920622557096e-06, + "loss": 1.4516, + "step": 28185 + }, + { + "epoch": 2.402284155799881, + "grad_norm": 37.19531788998989, + "learning_rate": 1.1615742905990407e-06, + "loss": 1.168, + "step": 28186 + }, + { + "epoch": 2.402369385493906, + "grad_norm": 68.58007272675204, + "learning_rate": 1.1612565566917627e-06, + "loss": 1.4814, + "step": 28187 + }, + { + "epoch": 2.4024546151879314, + "grad_norm": 53.3614065278292, + "learning_rate": 1.160938860537002e-06, + "loss": 1.0051, + "step": 28188 + }, + { + "epoch": 2.402539844881957, + "grad_norm": 50.13744974752379, + "learning_rate": 1.1606212021378826e-06, + "loss": 2.0361, + "step": 28189 + }, + { + "epoch": 2.4026250745759823, + "grad_norm": 31.302905627802662, + "learning_rate": 1.1603035814975278e-06, + "loss": 1.1064, + "step": 28190 + }, + { + "epoch": 2.4027103042700078, + "grad_norm": 58.16368914029009, + "learning_rate": 1.1599859986190603e-06, + "loss": 1.6943, + "step": 28191 + }, + { + "epoch": 2.4027955339640332, + "grad_norm": 31.89707800875232, + "learning_rate": 1.1596684535056057e-06, + "loss": 1.1371, + "step": 28192 + }, + { + "epoch": 2.4028807636580582, + "grad_norm": 82.53657287696987, + "learning_rate": 1.1593509461602847e-06, + "loss": 2.1143, + "step": 28193 + }, + { + "epoch": 2.4029659933520837, + "grad_norm": 52.36551628800902, + "learning_rate": 1.1590334765862227e-06, + "loss": 1.4888, + "step": 28194 + }, + { + "epoch": 2.403051223046109, + "grad_norm": 39.20160658916513, + "learning_rate": 1.158716044786538e-06, + "loss": 1.2373, + "step": 28195 + }, + { + "epoch": 2.4031364527401347, + "grad_norm": 60.46508193902015, + "learning_rate": 1.1583986507643568e-06, + "loss": 1.7139, + "step": 28196 + }, + { + "epoch": 2.40322168243416, + "grad_norm": 60.258424159030746, + "learning_rate": 1.1580812945227981e-06, + "loss": 2.082, + "step": 28197 + }, + { + "epoch": 2.4033069121281856, + "grad_norm": 30.33264370285641, + "learning_rate": 1.1577639760649835e-06, + "loss": 1.31, + "step": 28198 + }, + { + "epoch": 2.403392141822211, + "grad_norm": 48.74826564357296, + "learning_rate": 1.1574466953940327e-06, + "loss": 1.5197, + "step": 28199 + }, + { + "epoch": 2.403477371516236, + "grad_norm": 46.70877622057722, + "learning_rate": 1.157129452513066e-06, + "loss": 1.4032, + "step": 28200 + }, + { + "epoch": 2.4035626012102616, + "grad_norm": 42.08094498014972, + "learning_rate": 1.1568122474252041e-06, + "loss": 1.0803, + "step": 28201 + }, + { + "epoch": 2.403647830904287, + "grad_norm": 25.918993192857283, + "learning_rate": 1.1564950801335678e-06, + "loss": 0.834, + "step": 28202 + }, + { + "epoch": 2.4037330605983125, + "grad_norm": 66.24923482918396, + "learning_rate": 1.1561779506412752e-06, + "loss": 1.3453, + "step": 28203 + }, + { + "epoch": 2.403818290292338, + "grad_norm": 41.9745844673196, + "learning_rate": 1.1558608589514436e-06, + "loss": 1.7637, + "step": 28204 + }, + { + "epoch": 2.4039035199863634, + "grad_norm": 49.603897983658015, + "learning_rate": 1.155543805067194e-06, + "loss": 1.0527, + "step": 28205 + }, + { + "epoch": 2.403988749680389, + "grad_norm": 66.248466132243, + "learning_rate": 1.155226788991644e-06, + "loss": 1.7752, + "step": 28206 + }, + { + "epoch": 2.404073979374414, + "grad_norm": 45.67292796589091, + "learning_rate": 1.154909810727911e-06, + "loss": 1.4355, + "step": 28207 + }, + { + "epoch": 2.4041592090684394, + "grad_norm": 22.61591843824816, + "learning_rate": 1.1545928702791104e-06, + "loss": 0.772, + "step": 28208 + }, + { + "epoch": 2.404244438762465, + "grad_norm": 59.14489494136845, + "learning_rate": 1.1542759676483606e-06, + "loss": 1.2328, + "step": 28209 + }, + { + "epoch": 2.4043296684564903, + "grad_norm": 18.147331926675978, + "learning_rate": 1.1539591028387803e-06, + "loss": 0.8885, + "step": 28210 + }, + { + "epoch": 2.404414898150516, + "grad_norm": 33.8557455464882, + "learning_rate": 1.1536422758534837e-06, + "loss": 0.8284, + "step": 28211 + }, + { + "epoch": 2.404500127844541, + "grad_norm": 23.230233110968932, + "learning_rate": 1.1533254866955857e-06, + "loss": 0.8386, + "step": 28212 + }, + { + "epoch": 2.4045853575385663, + "grad_norm": 52.11134663927425, + "learning_rate": 1.1530087353682046e-06, + "loss": 1.3425, + "step": 28213 + }, + { + "epoch": 2.4046705872325917, + "grad_norm": 40.34852794625781, + "learning_rate": 1.1526920218744537e-06, + "loss": 0.9907, + "step": 28214 + }, + { + "epoch": 2.404755816926617, + "grad_norm": 64.60670755616529, + "learning_rate": 1.1523753462174474e-06, + "loss": 2.1962, + "step": 28215 + }, + { + "epoch": 2.4048410466206427, + "grad_norm": 45.06280916056989, + "learning_rate": 1.1520587084003004e-06, + "loss": 1.8195, + "step": 28216 + }, + { + "epoch": 2.404926276314668, + "grad_norm": 27.129877953242193, + "learning_rate": 1.151742108426126e-06, + "loss": 0.9348, + "step": 28217 + }, + { + "epoch": 2.4050115060086936, + "grad_norm": 41.7681528336695, + "learning_rate": 1.151425546298041e-06, + "loss": 1.2471, + "step": 28218 + }, + { + "epoch": 2.4050967357027186, + "grad_norm": 38.34925274404185, + "learning_rate": 1.1511090220191563e-06, + "loss": 0.8994, + "step": 28219 + }, + { + "epoch": 2.405181965396744, + "grad_norm": 46.505888483892384, + "learning_rate": 1.1507925355925847e-06, + "loss": 1.1552, + "step": 28220 + }, + { + "epoch": 2.4052671950907696, + "grad_norm": 65.48772975860504, + "learning_rate": 1.1504760870214376e-06, + "loss": 1.8173, + "step": 28221 + }, + { + "epoch": 2.405352424784795, + "grad_norm": 52.73480302462928, + "learning_rate": 1.1501596763088302e-06, + "loss": 1.1737, + "step": 28222 + }, + { + "epoch": 2.4054376544788205, + "grad_norm": 31.475313844405377, + "learning_rate": 1.1498433034578722e-06, + "loss": 0.9122, + "step": 28223 + }, + { + "epoch": 2.405522884172846, + "grad_norm": 34.86804928747308, + "learning_rate": 1.1495269684716743e-06, + "loss": 1.3894, + "step": 28224 + }, + { + "epoch": 2.4056081138668715, + "grad_norm": 27.221463226128392, + "learning_rate": 1.1492106713533485e-06, + "loss": 1.0783, + "step": 28225 + }, + { + "epoch": 2.4056933435608965, + "grad_norm": 50.20802777472375, + "learning_rate": 1.1488944121060074e-06, + "loss": 1.1899, + "step": 28226 + }, + { + "epoch": 2.405778573254922, + "grad_norm": 35.30986520017705, + "learning_rate": 1.1485781907327593e-06, + "loss": 1.3864, + "step": 28227 + }, + { + "epoch": 2.4058638029489474, + "grad_norm": 35.761774881198754, + "learning_rate": 1.148262007236714e-06, + "loss": 1.4752, + "step": 28228 + }, + { + "epoch": 2.405949032642973, + "grad_norm": 45.92666817677427, + "learning_rate": 1.1479458616209815e-06, + "loss": 1.6926, + "step": 28229 + }, + { + "epoch": 2.4060342623369984, + "grad_norm": 33.845376510496635, + "learning_rate": 1.1476297538886693e-06, + "loss": 1.0021, + "step": 28230 + }, + { + "epoch": 2.4061194920310234, + "grad_norm": 59.85440608002164, + "learning_rate": 1.147313684042889e-06, + "loss": 1.5275, + "step": 28231 + }, + { + "epoch": 2.406204721725049, + "grad_norm": 63.3152667725514, + "learning_rate": 1.1469976520867465e-06, + "loss": 1.4931, + "step": 28232 + }, + { + "epoch": 2.4062899514190743, + "grad_norm": 49.21817071743164, + "learning_rate": 1.1466816580233526e-06, + "loss": 1.4076, + "step": 28233 + }, + { + "epoch": 2.4063751811131, + "grad_norm": 146.67249416618418, + "learning_rate": 1.1463657018558117e-06, + "loss": 1.3042, + "step": 28234 + }, + { + "epoch": 2.4064604108071252, + "grad_norm": 67.96036225633776, + "learning_rate": 1.1460497835872348e-06, + "loss": 1.6103, + "step": 28235 + }, + { + "epoch": 2.4065456405011507, + "grad_norm": 42.46349918627193, + "learning_rate": 1.145733903220726e-06, + "loss": 1.5637, + "step": 28236 + }, + { + "epoch": 2.406630870195176, + "grad_norm": 44.88105231597467, + "learning_rate": 1.1454180607593934e-06, + "loss": 1.3779, + "step": 28237 + }, + { + "epoch": 2.406716099889201, + "grad_norm": 50.93674835328211, + "learning_rate": 1.1451022562063408e-06, + "loss": 1.4246, + "step": 28238 + }, + { + "epoch": 2.4068013295832267, + "grad_norm": 48.50060525065089, + "learning_rate": 1.144786489564677e-06, + "loss": 1.3065, + "step": 28239 + }, + { + "epoch": 2.406886559277252, + "grad_norm": 57.51902681433066, + "learning_rate": 1.1444707608375056e-06, + "loss": 1.6133, + "step": 28240 + }, + { + "epoch": 2.4069717889712776, + "grad_norm": 40.60776662061832, + "learning_rate": 1.144155070027933e-06, + "loss": 1.0511, + "step": 28241 + }, + { + "epoch": 2.407057018665303, + "grad_norm": 57.34013013292139, + "learning_rate": 1.1438394171390632e-06, + "loss": 1.6897, + "step": 28242 + }, + { + "epoch": 2.4071422483593286, + "grad_norm": 31.145896522702824, + "learning_rate": 1.1435238021739987e-06, + "loss": 1.4067, + "step": 28243 + }, + { + "epoch": 2.407227478053354, + "grad_norm": 23.19910453734555, + "learning_rate": 1.1432082251358467e-06, + "loss": 0.7761, + "step": 28244 + }, + { + "epoch": 2.407312707747379, + "grad_norm": 15.44770817939397, + "learning_rate": 1.1428926860277095e-06, + "loss": 0.4987, + "step": 28245 + }, + { + "epoch": 2.4073979374414045, + "grad_norm": 55.996549208069716, + "learning_rate": 1.1425771848526902e-06, + "loss": 1.5191, + "step": 28246 + }, + { + "epoch": 2.40748316713543, + "grad_norm": 78.04239832443058, + "learning_rate": 1.1422617216138899e-06, + "loss": 2.1369, + "step": 28247 + }, + { + "epoch": 2.4075683968294554, + "grad_norm": 53.97853628130938, + "learning_rate": 1.141946296314413e-06, + "loss": 1.2577, + "step": 28248 + }, + { + "epoch": 2.407653626523481, + "grad_norm": 32.544780418425816, + "learning_rate": 1.1416309089573618e-06, + "loss": 1.3373, + "step": 28249 + }, + { + "epoch": 2.407738856217506, + "grad_norm": 64.38230365648731, + "learning_rate": 1.141315559545838e-06, + "loss": 1.9022, + "step": 28250 + }, + { + "epoch": 2.4078240859115314, + "grad_norm": 104.20492553677742, + "learning_rate": 1.1410002480829413e-06, + "loss": 2.2152, + "step": 28251 + }, + { + "epoch": 2.407909315605557, + "grad_norm": 61.701247988468275, + "learning_rate": 1.1406849745717746e-06, + "loss": 1.3413, + "step": 28252 + }, + { + "epoch": 2.4079945452995823, + "grad_norm": 56.47045539386895, + "learning_rate": 1.1403697390154378e-06, + "loss": 1.4242, + "step": 28253 + }, + { + "epoch": 2.408079774993608, + "grad_norm": 30.373983797427723, + "learning_rate": 1.140054541417031e-06, + "loss": 0.7271, + "step": 28254 + }, + { + "epoch": 2.4081650046876333, + "grad_norm": 115.3462881874549, + "learning_rate": 1.1397393817796526e-06, + "loss": 2.4498, + "step": 28255 + }, + { + "epoch": 2.4082502343816587, + "grad_norm": 44.85553681260697, + "learning_rate": 1.1394242601064032e-06, + "loss": 0.9839, + "step": 28256 + }, + { + "epoch": 2.4083354640756838, + "grad_norm": 98.51181076686098, + "learning_rate": 1.1391091764003843e-06, + "loss": 2.3791, + "step": 28257 + }, + { + "epoch": 2.4084206937697092, + "grad_norm": 64.9812856373997, + "learning_rate": 1.1387941306646916e-06, + "loss": 1.116, + "step": 28258 + }, + { + "epoch": 2.4085059234637347, + "grad_norm": 62.77830494351262, + "learning_rate": 1.138479122902425e-06, + "loss": 1.7395, + "step": 28259 + }, + { + "epoch": 2.40859115315776, + "grad_norm": 70.51272131078795, + "learning_rate": 1.13816415311668e-06, + "loss": 1.7917, + "step": 28260 + }, + { + "epoch": 2.4086763828517856, + "grad_norm": 20.67477121306989, + "learning_rate": 1.1378492213105575e-06, + "loss": 0.7996, + "step": 28261 + }, + { + "epoch": 2.408761612545811, + "grad_norm": 21.393425129876782, + "learning_rate": 1.137534327487153e-06, + "loss": 0.8867, + "step": 28262 + }, + { + "epoch": 2.4088468422398366, + "grad_norm": 30.274702466623797, + "learning_rate": 1.1372194716495628e-06, + "loss": 1.055, + "step": 28263 + }, + { + "epoch": 2.4089320719338616, + "grad_norm": 25.237220141363228, + "learning_rate": 1.1369046538008837e-06, + "loss": 0.8766, + "step": 28264 + }, + { + "epoch": 2.409017301627887, + "grad_norm": 29.314166344244377, + "learning_rate": 1.1365898739442137e-06, + "loss": 0.893, + "step": 28265 + }, + { + "epoch": 2.4091025313219125, + "grad_norm": 57.02001619238584, + "learning_rate": 1.1362751320826476e-06, + "loss": 1.8645, + "step": 28266 + }, + { + "epoch": 2.409187761015938, + "grad_norm": 67.73120660023326, + "learning_rate": 1.13596042821928e-06, + "loss": 1.6409, + "step": 28267 + }, + { + "epoch": 2.4092729907099635, + "grad_norm": 38.008047137959146, + "learning_rate": 1.1356457623572043e-06, + "loss": 1.1084, + "step": 28268 + }, + { + "epoch": 2.4093582204039885, + "grad_norm": 77.67073885120261, + "learning_rate": 1.1353311344995189e-06, + "loss": 1.9235, + "step": 28269 + }, + { + "epoch": 2.409443450098014, + "grad_norm": 46.53347786875693, + "learning_rate": 1.135016544649316e-06, + "loss": 1.5889, + "step": 28270 + }, + { + "epoch": 2.4095286797920394, + "grad_norm": 32.77471037064677, + "learning_rate": 1.134701992809688e-06, + "loss": 0.941, + "step": 28271 + }, + { + "epoch": 2.409613909486065, + "grad_norm": 59.184695394586875, + "learning_rate": 1.1343874789837318e-06, + "loss": 1.6622, + "step": 28272 + }, + { + "epoch": 2.4096991391800904, + "grad_norm": 69.37030644875364, + "learning_rate": 1.1340730031745374e-06, + "loss": 1.8759, + "step": 28273 + }, + { + "epoch": 2.409784368874116, + "grad_norm": 46.7643653132075, + "learning_rate": 1.1337585653851997e-06, + "loss": 1.1981, + "step": 28274 + }, + { + "epoch": 2.4098695985681413, + "grad_norm": 63.2938397992932, + "learning_rate": 1.1334441656188105e-06, + "loss": 1.6312, + "step": 28275 + }, + { + "epoch": 2.4099548282621663, + "grad_norm": 34.059234388944596, + "learning_rate": 1.133129803878461e-06, + "loss": 1.1044, + "step": 28276 + }, + { + "epoch": 2.410040057956192, + "grad_norm": 34.84266510426975, + "learning_rate": 1.1328154801672424e-06, + "loss": 1.069, + "step": 28277 + }, + { + "epoch": 2.4101252876502173, + "grad_norm": 48.442579933953134, + "learning_rate": 1.1325011944882485e-06, + "loss": 0.9874, + "step": 28278 + }, + { + "epoch": 2.4102105173442427, + "grad_norm": 46.591214608591905, + "learning_rate": 1.1321869468445668e-06, + "loss": 1.7957, + "step": 28279 + }, + { + "epoch": 2.410295747038268, + "grad_norm": 54.73475690968388, + "learning_rate": 1.1318727372392913e-06, + "loss": 1.444, + "step": 28280 + }, + { + "epoch": 2.4103809767322937, + "grad_norm": 31.53693236918566, + "learning_rate": 1.1315585656755084e-06, + "loss": 1.1976, + "step": 28281 + }, + { + "epoch": 2.410466206426319, + "grad_norm": 67.38075248240973, + "learning_rate": 1.131244432156312e-06, + "loss": 1.7323, + "step": 28282 + }, + { + "epoch": 2.410551436120344, + "grad_norm": 177.6855392839805, + "learning_rate": 1.130930336684789e-06, + "loss": 1.8204, + "step": 28283 + }, + { + "epoch": 2.4106366658143696, + "grad_norm": 59.30424398264702, + "learning_rate": 1.1306162792640285e-06, + "loss": 1.5071, + "step": 28284 + }, + { + "epoch": 2.410721895508395, + "grad_norm": 40.778316097391, + "learning_rate": 1.130302259897118e-06, + "loss": 1.5456, + "step": 28285 + }, + { + "epoch": 2.4108071252024206, + "grad_norm": 68.84467394132754, + "learning_rate": 1.1299882785871487e-06, + "loss": 1.6665, + "step": 28286 + }, + { + "epoch": 2.410892354896446, + "grad_norm": 61.509827608554296, + "learning_rate": 1.1296743353372052e-06, + "loss": 1.8204, + "step": 28287 + }, + { + "epoch": 2.4109775845904715, + "grad_norm": 32.922395090444034, + "learning_rate": 1.1293604301503786e-06, + "loss": 0.8826, + "step": 28288 + }, + { + "epoch": 2.4110628142844965, + "grad_norm": 90.05634604733477, + "learning_rate": 1.1290465630297537e-06, + "loss": 1.8342, + "step": 28289 + }, + { + "epoch": 2.411148043978522, + "grad_norm": 87.18316650498227, + "learning_rate": 1.1287327339784166e-06, + "loss": 2.1477, + "step": 28290 + }, + { + "epoch": 2.4112332736725475, + "grad_norm": 36.27680562431796, + "learning_rate": 1.1284189429994559e-06, + "loss": 0.5757, + "step": 28291 + }, + { + "epoch": 2.411318503366573, + "grad_norm": 28.361214366190172, + "learning_rate": 1.1281051900959566e-06, + "loss": 1.1779, + "step": 28292 + }, + { + "epoch": 2.4114037330605984, + "grad_norm": 26.756598729881233, + "learning_rate": 1.1277914752710039e-06, + "loss": 1.0734, + "step": 28293 + }, + { + "epoch": 2.411488962754624, + "grad_norm": 31.828612385460954, + "learning_rate": 1.1274777985276824e-06, + "loss": 1.097, + "step": 28294 + }, + { + "epoch": 2.411574192448649, + "grad_norm": 75.10815274332667, + "learning_rate": 1.127164159869078e-06, + "loss": 1.756, + "step": 28295 + }, + { + "epoch": 2.4116594221426744, + "grad_norm": 59.30681378824891, + "learning_rate": 1.126850559298276e-06, + "loss": 1.7756, + "step": 28296 + }, + { + "epoch": 2.4117446518367, + "grad_norm": 81.05031175504004, + "learning_rate": 1.1265369968183603e-06, + "loss": 2.5758, + "step": 28297 + }, + { + "epoch": 2.4118298815307253, + "grad_norm": 35.308004815878625, + "learning_rate": 1.1262234724324124e-06, + "loss": 1.1124, + "step": 28298 + }, + { + "epoch": 2.4119151112247508, + "grad_norm": 87.59889119255718, + "learning_rate": 1.125909986143519e-06, + "loss": 2.1796, + "step": 28299 + }, + { + "epoch": 2.4120003409187762, + "grad_norm": 66.07309446233562, + "learning_rate": 1.1255965379547612e-06, + "loss": 2.0639, + "step": 28300 + }, + { + "epoch": 2.4120855706128017, + "grad_norm": 23.649035707825135, + "learning_rate": 1.1252831278692222e-06, + "loss": 0.9518, + "step": 28301 + }, + { + "epoch": 2.4121708003068267, + "grad_norm": 30.271150810140874, + "learning_rate": 1.1249697558899819e-06, + "loss": 1.1713, + "step": 28302 + }, + { + "epoch": 2.412256030000852, + "grad_norm": 34.58419537401536, + "learning_rate": 1.124656422020125e-06, + "loss": 1.1929, + "step": 28303 + }, + { + "epoch": 2.4123412596948777, + "grad_norm": 31.3275346240154, + "learning_rate": 1.1243431262627337e-06, + "loss": 1.2833, + "step": 28304 + }, + { + "epoch": 2.412426489388903, + "grad_norm": 73.63842332299744, + "learning_rate": 1.1240298686208873e-06, + "loss": 1.5915, + "step": 28305 + }, + { + "epoch": 2.4125117190829286, + "grad_norm": 49.74353172472069, + "learning_rate": 1.123716649097667e-06, + "loss": 1.3126, + "step": 28306 + }, + { + "epoch": 2.412596948776954, + "grad_norm": 48.97062142546283, + "learning_rate": 1.123403467696152e-06, + "loss": 0.8021, + "step": 28307 + }, + { + "epoch": 2.412682178470979, + "grad_norm": 65.91495758063036, + "learning_rate": 1.1230903244194246e-06, + "loss": 1.8037, + "step": 28308 + }, + { + "epoch": 2.4127674081650046, + "grad_norm": 52.225629900216944, + "learning_rate": 1.1227772192705638e-06, + "loss": 1.1571, + "step": 28309 + }, + { + "epoch": 2.41285263785903, + "grad_norm": 56.482525082286855, + "learning_rate": 1.1224641522526465e-06, + "loss": 1.3465, + "step": 28310 + }, + { + "epoch": 2.4129378675530555, + "grad_norm": 67.27723540153033, + "learning_rate": 1.1221511233687538e-06, + "loss": 2.2169, + "step": 28311 + }, + { + "epoch": 2.413023097247081, + "grad_norm": 58.10967803080351, + "learning_rate": 1.1218381326219651e-06, + "loss": 1.4411, + "step": 28312 + }, + { + "epoch": 2.4131083269411064, + "grad_norm": 65.57266052989247, + "learning_rate": 1.1215251800153576e-06, + "loss": 1.737, + "step": 28313 + }, + { + "epoch": 2.4131935566351315, + "grad_norm": 34.813222773357566, + "learning_rate": 1.1212122655520085e-06, + "loss": 0.6177, + "step": 28314 + }, + { + "epoch": 2.413278786329157, + "grad_norm": 32.496848109461425, + "learning_rate": 1.1208993892349957e-06, + "loss": 1.3869, + "step": 28315 + }, + { + "epoch": 2.4133640160231824, + "grad_norm": 54.69484499235123, + "learning_rate": 1.1205865510673947e-06, + "loss": 1.4727, + "step": 28316 + }, + { + "epoch": 2.413449245717208, + "grad_norm": 42.336283760250524, + "learning_rate": 1.1202737510522842e-06, + "loss": 1.2357, + "step": 28317 + }, + { + "epoch": 2.4135344754112333, + "grad_norm": 24.74661452273299, + "learning_rate": 1.1199609891927389e-06, + "loss": 0.9379, + "step": 28318 + }, + { + "epoch": 2.413619705105259, + "grad_norm": 38.091602769109876, + "learning_rate": 1.1196482654918368e-06, + "loss": 0.924, + "step": 28319 + }, + { + "epoch": 2.4137049347992843, + "grad_norm": 117.94392118382501, + "learning_rate": 1.1193355799526506e-06, + "loss": 1.4024, + "step": 28320 + }, + { + "epoch": 2.4137901644933093, + "grad_norm": 21.308566057715215, + "learning_rate": 1.1190229325782588e-06, + "loss": 0.8506, + "step": 28321 + }, + { + "epoch": 2.4138753941873348, + "grad_norm": 53.32019906018086, + "learning_rate": 1.118710323371734e-06, + "loss": 1.4659, + "step": 28322 + }, + { + "epoch": 2.4139606238813602, + "grad_norm": 64.5665212307541, + "learning_rate": 1.1183977523361506e-06, + "loss": 1.698, + "step": 28323 + }, + { + "epoch": 2.4140458535753857, + "grad_norm": 51.26200405314982, + "learning_rate": 1.1180852194745818e-06, + "loss": 1.2422, + "step": 28324 + }, + { + "epoch": 2.414131083269411, + "grad_norm": 32.46072395532208, + "learning_rate": 1.1177727247901037e-06, + "loss": 0.9947, + "step": 28325 + }, + { + "epoch": 2.4142163129634366, + "grad_norm": 60.11411838260166, + "learning_rate": 1.1174602682857866e-06, + "loss": 1.4087, + "step": 28326 + }, + { + "epoch": 2.414301542657462, + "grad_norm": 24.746586607339708, + "learning_rate": 1.1171478499647065e-06, + "loss": 1.3729, + "step": 28327 + }, + { + "epoch": 2.414386772351487, + "grad_norm": 25.294638451242587, + "learning_rate": 1.1168354698299333e-06, + "loss": 0.9887, + "step": 28328 + }, + { + "epoch": 2.4144720020455126, + "grad_norm": 25.324994396616134, + "learning_rate": 1.1165231278845412e-06, + "loss": 1.5659, + "step": 28329 + }, + { + "epoch": 2.414557231739538, + "grad_norm": 53.27411802134441, + "learning_rate": 1.116210824131601e-06, + "loss": 1.632, + "step": 28330 + }, + { + "epoch": 2.4146424614335635, + "grad_norm": 42.4446987924778, + "learning_rate": 1.1158985585741844e-06, + "loss": 1.324, + "step": 28331 + }, + { + "epoch": 2.414727691127589, + "grad_norm": 93.84000176201322, + "learning_rate": 1.115586331215362e-06, + "loss": 2.0185, + "step": 28332 + }, + { + "epoch": 2.414812920821614, + "grad_norm": 82.5745305917282, + "learning_rate": 1.115274142058203e-06, + "loss": 2.1597, + "step": 28333 + }, + { + "epoch": 2.4148981505156395, + "grad_norm": 59.802796927531276, + "learning_rate": 1.1149619911057796e-06, + "loss": 1.4169, + "step": 28334 + }, + { + "epoch": 2.414983380209665, + "grad_norm": 46.36462583303561, + "learning_rate": 1.114649878361162e-06, + "loss": 1.1924, + "step": 28335 + }, + { + "epoch": 2.4150686099036904, + "grad_norm": 45.72419187082061, + "learning_rate": 1.1143378038274194e-06, + "loss": 1.1335, + "step": 28336 + }, + { + "epoch": 2.415153839597716, + "grad_norm": 65.71406409339552, + "learning_rate": 1.1140257675076188e-06, + "loss": 2.2824, + "step": 28337 + }, + { + "epoch": 2.4152390692917414, + "grad_norm": 42.855059002328105, + "learning_rate": 1.1137137694048323e-06, + "loss": 1.1815, + "step": 28338 + }, + { + "epoch": 2.415324298985767, + "grad_norm": 32.413264988367494, + "learning_rate": 1.1134018095221267e-06, + "loss": 0.9679, + "step": 28339 + }, + { + "epoch": 2.415409528679792, + "grad_norm": 50.962576231908876, + "learning_rate": 1.1130898878625695e-06, + "loss": 1.4601, + "step": 28340 + }, + { + "epoch": 2.4154947583738173, + "grad_norm": 43.62343299814634, + "learning_rate": 1.112778004429228e-06, + "loss": 1.3371, + "step": 28341 + }, + { + "epoch": 2.415579988067843, + "grad_norm": 70.57292136219529, + "learning_rate": 1.1124661592251696e-06, + "loss": 1.4375, + "step": 28342 + }, + { + "epoch": 2.4156652177618683, + "grad_norm": 71.9842465037965, + "learning_rate": 1.1121543522534633e-06, + "loss": 1.9692, + "step": 28343 + }, + { + "epoch": 2.4157504474558937, + "grad_norm": 49.58591657885399, + "learning_rate": 1.1118425835171743e-06, + "loss": 0.8813, + "step": 28344 + }, + { + "epoch": 2.415835677149919, + "grad_norm": 79.27863700828081, + "learning_rate": 1.1115308530193681e-06, + "loss": 1.8428, + "step": 28345 + }, + { + "epoch": 2.4159209068439447, + "grad_norm": 28.908808426026877, + "learning_rate": 1.11121916076311e-06, + "loss": 0.6078, + "step": 28346 + }, + { + "epoch": 2.4160061365379697, + "grad_norm": 32.15915313885789, + "learning_rate": 1.1109075067514668e-06, + "loss": 1.3693, + "step": 28347 + }, + { + "epoch": 2.416091366231995, + "grad_norm": 34.52722933222968, + "learning_rate": 1.1105958909875035e-06, + "loss": 1.5834, + "step": 28348 + }, + { + "epoch": 2.4161765959260206, + "grad_norm": 55.043923116554176, + "learning_rate": 1.1102843134742825e-06, + "loss": 1.4829, + "step": 28349 + }, + { + "epoch": 2.416261825620046, + "grad_norm": 55.63735156984457, + "learning_rate": 1.1099727742148697e-06, + "loss": 1.9031, + "step": 28350 + }, + { + "epoch": 2.4163470553140716, + "grad_norm": 64.47748875286719, + "learning_rate": 1.1096612732123308e-06, + "loss": 1.8332, + "step": 28351 + }, + { + "epoch": 2.4164322850080966, + "grad_norm": 64.27151200473065, + "learning_rate": 1.1093498104697265e-06, + "loss": 1.7807, + "step": 28352 + }, + { + "epoch": 2.416517514702122, + "grad_norm": 49.26894045270833, + "learning_rate": 1.1090383859901217e-06, + "loss": 1.3425, + "step": 28353 + }, + { + "epoch": 2.4166027443961475, + "grad_norm": 76.06493243823581, + "learning_rate": 1.1087269997765764e-06, + "loss": 1.701, + "step": 28354 + }, + { + "epoch": 2.416687974090173, + "grad_norm": 20.276972941636433, + "learning_rate": 1.1084156518321558e-06, + "loss": 0.9225, + "step": 28355 + }, + { + "epoch": 2.4167732037841985, + "grad_norm": 41.106586809207776, + "learning_rate": 1.1081043421599214e-06, + "loss": 1.1818, + "step": 28356 + }, + { + "epoch": 2.416858433478224, + "grad_norm": 68.31348557263539, + "learning_rate": 1.107793070762933e-06, + "loss": 1.7924, + "step": 28357 + }, + { + "epoch": 2.4169436631722494, + "grad_norm": 44.40481464229407, + "learning_rate": 1.1074818376442531e-06, + "loss": 0.98, + "step": 28358 + }, + { + "epoch": 2.4170288928662744, + "grad_norm": 39.94525484264536, + "learning_rate": 1.107170642806944e-06, + "loss": 1.4207, + "step": 28359 + }, + { + "epoch": 2.4171141225603, + "grad_norm": 47.26988533539109, + "learning_rate": 1.1068594862540643e-06, + "loss": 1.2694, + "step": 28360 + }, + { + "epoch": 2.4171993522543254, + "grad_norm": 48.04572099181797, + "learning_rate": 1.1065483679886752e-06, + "loss": 1.5897, + "step": 28361 + }, + { + "epoch": 2.417284581948351, + "grad_norm": 49.5334478153709, + "learning_rate": 1.1062372880138355e-06, + "loss": 1.3428, + "step": 28362 + }, + { + "epoch": 2.4173698116423763, + "grad_norm": 68.19651598762644, + "learning_rate": 1.1059262463326038e-06, + "loss": 2.2083, + "step": 28363 + }, + { + "epoch": 2.4174550413364018, + "grad_norm": 49.06710306243579, + "learning_rate": 1.105615242948041e-06, + "loss": 1.648, + "step": 28364 + }, + { + "epoch": 2.4175402710304272, + "grad_norm": 44.14208946923574, + "learning_rate": 1.1053042778632034e-06, + "loss": 0.8801, + "step": 28365 + }, + { + "epoch": 2.4176255007244523, + "grad_norm": 46.19670766564288, + "learning_rate": 1.1049933510811522e-06, + "loss": 1.2211, + "step": 28366 + }, + { + "epoch": 2.4177107304184777, + "grad_norm": 49.43154608040621, + "learning_rate": 1.1046824626049424e-06, + "loss": 1.7212, + "step": 28367 + }, + { + "epoch": 2.417795960112503, + "grad_norm": 82.81009416066887, + "learning_rate": 1.104371612437634e-06, + "loss": 2.2262, + "step": 28368 + }, + { + "epoch": 2.4178811898065287, + "grad_norm": 23.345958718216313, + "learning_rate": 1.1040608005822829e-06, + "loss": 0.9582, + "step": 28369 + }, + { + "epoch": 2.417966419500554, + "grad_norm": 77.33144538821487, + "learning_rate": 1.1037500270419455e-06, + "loss": 1.953, + "step": 28370 + }, + { + "epoch": 2.418051649194579, + "grad_norm": 59.15034351670385, + "learning_rate": 1.1034392918196774e-06, + "loss": 1.1221, + "step": 28371 + }, + { + "epoch": 2.4181368788886046, + "grad_norm": 29.99145981232978, + "learning_rate": 1.1031285949185365e-06, + "loss": 0.7866, + "step": 28372 + }, + { + "epoch": 2.41822210858263, + "grad_norm": 26.839371076193338, + "learning_rate": 1.1028179363415759e-06, + "loss": 1.1524, + "step": 28373 + }, + { + "epoch": 2.4183073382766556, + "grad_norm": 65.01829128669651, + "learning_rate": 1.1025073160918543e-06, + "loss": 1.9088, + "step": 28374 + }, + { + "epoch": 2.418392567970681, + "grad_norm": 54.52268657313435, + "learning_rate": 1.1021967341724237e-06, + "loss": 1.6865, + "step": 28375 + }, + { + "epoch": 2.4184777976647065, + "grad_norm": 24.98163894134828, + "learning_rate": 1.1018861905863387e-06, + "loss": 0.7737, + "step": 28376 + }, + { + "epoch": 2.418563027358732, + "grad_norm": 67.59185585930712, + "learning_rate": 1.1015756853366555e-06, + "loss": 2.0947, + "step": 28377 + }, + { + "epoch": 2.418648257052757, + "grad_norm": 56.84907353323577, + "learning_rate": 1.101265218426426e-06, + "loss": 1.6931, + "step": 28378 + }, + { + "epoch": 2.4187334867467825, + "grad_norm": 50.97572288854613, + "learning_rate": 1.1009547898587041e-06, + "loss": 1.1136, + "step": 28379 + }, + { + "epoch": 2.418818716440808, + "grad_norm": 38.1674957216139, + "learning_rate": 1.1006443996365407e-06, + "loss": 1.3116, + "step": 28380 + }, + { + "epoch": 2.4189039461348334, + "grad_norm": 24.543669665881037, + "learning_rate": 1.1003340477629904e-06, + "loss": 0.6574, + "step": 28381 + }, + { + "epoch": 2.418989175828859, + "grad_norm": 69.1191266431089, + "learning_rate": 1.1000237342411068e-06, + "loss": 1.5486, + "step": 28382 + }, + { + "epoch": 2.4190744055228843, + "grad_norm": 50.21651647973883, + "learning_rate": 1.09971345907394e-06, + "loss": 1.5692, + "step": 28383 + }, + { + "epoch": 2.41915963521691, + "grad_norm": 31.33478945599127, + "learning_rate": 1.09940322226454e-06, + "loss": 1.402, + "step": 28384 + }, + { + "epoch": 2.419244864910935, + "grad_norm": 35.664400413661845, + "learning_rate": 1.0990930238159608e-06, + "loss": 1.4144, + "step": 28385 + }, + { + "epoch": 2.4193300946049603, + "grad_norm": 67.38534725839236, + "learning_rate": 1.0987828637312513e-06, + "loss": 1.9504, + "step": 28386 + }, + { + "epoch": 2.4194153242989858, + "grad_norm": 38.385825556426774, + "learning_rate": 1.0984727420134627e-06, + "loss": 1.4913, + "step": 28387 + }, + { + "epoch": 2.4195005539930112, + "grad_norm": 28.38533212116475, + "learning_rate": 1.098162658665643e-06, + "loss": 0.6785, + "step": 28388 + }, + { + "epoch": 2.4195857836870367, + "grad_norm": 31.272291315682722, + "learning_rate": 1.0978526136908425e-06, + "loss": 1.1293, + "step": 28389 + }, + { + "epoch": 2.4196710133810617, + "grad_norm": 57.328437769930254, + "learning_rate": 1.097542607092113e-06, + "loss": 1.2317, + "step": 28390 + }, + { + "epoch": 2.419756243075087, + "grad_norm": 36.92030713173029, + "learning_rate": 1.0972326388725008e-06, + "loss": 1.1092, + "step": 28391 + }, + { + "epoch": 2.4198414727691127, + "grad_norm": 68.61300892186671, + "learning_rate": 1.0969227090350548e-06, + "loss": 1.6154, + "step": 28392 + }, + { + "epoch": 2.419926702463138, + "grad_norm": 55.1164493434432, + "learning_rate": 1.0966128175828222e-06, + "loss": 1.2348, + "step": 28393 + }, + { + "epoch": 2.4200119321571636, + "grad_norm": 47.70443017642825, + "learning_rate": 1.0963029645188523e-06, + "loss": 1.7731, + "step": 28394 + }, + { + "epoch": 2.420097161851189, + "grad_norm": 48.79356366179028, + "learning_rate": 1.0959931498461918e-06, + "loss": 1.205, + "step": 28395 + }, + { + "epoch": 2.4201823915452145, + "grad_norm": 26.5404190673117, + "learning_rate": 1.0956833735678857e-06, + "loss": 0.96, + "step": 28396 + }, + { + "epoch": 2.4202676212392396, + "grad_norm": 59.85336773350955, + "learning_rate": 1.0953736356869826e-06, + "loss": 1.3204, + "step": 28397 + }, + { + "epoch": 2.420352850933265, + "grad_norm": 32.04159414808769, + "learning_rate": 1.0950639362065297e-06, + "loss": 0.8792, + "step": 28398 + }, + { + "epoch": 2.4204380806272905, + "grad_norm": 56.09025477264823, + "learning_rate": 1.094754275129571e-06, + "loss": 1.2736, + "step": 28399 + }, + { + "epoch": 2.420523310321316, + "grad_norm": 112.91840025569796, + "learning_rate": 1.0944446524591522e-06, + "loss": 3.3257, + "step": 28400 + }, + { + "epoch": 2.4206085400153414, + "grad_norm": 59.71614452191206, + "learning_rate": 1.094135068198317e-06, + "loss": 2.4599, + "step": 28401 + }, + { + "epoch": 2.420693769709367, + "grad_norm": 17.243965150183524, + "learning_rate": 1.0938255223501127e-06, + "loss": 0.8111, + "step": 28402 + }, + { + "epoch": 2.4207789994033924, + "grad_norm": 45.72198738477131, + "learning_rate": 1.0935160149175828e-06, + "loss": 0.9671, + "step": 28403 + }, + { + "epoch": 2.4208642290974174, + "grad_norm": 64.80883476635671, + "learning_rate": 1.0932065459037684e-06, + "loss": 1.7356, + "step": 28404 + }, + { + "epoch": 2.420949458791443, + "grad_norm": 53.94793899868757, + "learning_rate": 1.0928971153117168e-06, + "loss": 1.513, + "step": 28405 + }, + { + "epoch": 2.4210346884854683, + "grad_norm": 38.19308562979956, + "learning_rate": 1.0925877231444686e-06, + "loss": 1.1871, + "step": 28406 + }, + { + "epoch": 2.421119918179494, + "grad_norm": 29.683157961376725, + "learning_rate": 1.0922783694050686e-06, + "loss": 0.89, + "step": 28407 + }, + { + "epoch": 2.4212051478735193, + "grad_norm": 49.178910148226656, + "learning_rate": 1.0919690540965578e-06, + "loss": 1.3291, + "step": 28408 + }, + { + "epoch": 2.4212903775675443, + "grad_norm": 35.15119163621007, + "learning_rate": 1.091659777221979e-06, + "loss": 0.8019, + "step": 28409 + }, + { + "epoch": 2.4213756072615698, + "grad_norm": 30.33399212183803, + "learning_rate": 1.0913505387843715e-06, + "loss": 1.0384, + "step": 28410 + }, + { + "epoch": 2.421460836955595, + "grad_norm": 62.62726956345209, + "learning_rate": 1.0910413387867797e-06, + "loss": 1.4842, + "step": 28411 + }, + { + "epoch": 2.4215460666496207, + "grad_norm": 34.083092640798085, + "learning_rate": 1.0907321772322415e-06, + "loss": 1.5765, + "step": 28412 + }, + { + "epoch": 2.421631296343646, + "grad_norm": 42.16044130045642, + "learning_rate": 1.0904230541238008e-06, + "loss": 1.4697, + "step": 28413 + }, + { + "epoch": 2.4217165260376716, + "grad_norm": 32.186634067109054, + "learning_rate": 1.0901139694644941e-06, + "loss": 1.0702, + "step": 28414 + }, + { + "epoch": 2.421801755731697, + "grad_norm": 62.76081527838225, + "learning_rate": 1.0898049232573643e-06, + "loss": 1.6981, + "step": 28415 + }, + { + "epoch": 2.421886985425722, + "grad_norm": 42.21628922770007, + "learning_rate": 1.0894959155054496e-06, + "loss": 0.6796, + "step": 28416 + }, + { + "epoch": 2.4219722151197476, + "grad_norm": 62.79831742681365, + "learning_rate": 1.0891869462117887e-06, + "loss": 1.6659, + "step": 28417 + }, + { + "epoch": 2.422057444813773, + "grad_norm": 41.97979730681279, + "learning_rate": 1.0888780153794198e-06, + "loss": 1.4447, + "step": 28418 + }, + { + "epoch": 2.4221426745077985, + "grad_norm": 67.35218619725502, + "learning_rate": 1.0885691230113805e-06, + "loss": 1.4795, + "step": 28419 + }, + { + "epoch": 2.422227904201824, + "grad_norm": 16.15428398325474, + "learning_rate": 1.0882602691107091e-06, + "loss": 0.6341, + "step": 28420 + }, + { + "epoch": 2.4223131338958495, + "grad_norm": 59.909223023719775, + "learning_rate": 1.0879514536804453e-06, + "loss": 1.4881, + "step": 28421 + }, + { + "epoch": 2.422398363589875, + "grad_norm": 30.584336389337306, + "learning_rate": 1.0876426767236247e-06, + "loss": 0.7625, + "step": 28422 + }, + { + "epoch": 2.4224835932839, + "grad_norm": 46.3164956934256, + "learning_rate": 1.0873339382432819e-06, + "loss": 1.5719, + "step": 28423 + }, + { + "epoch": 2.4225688229779254, + "grad_norm": 53.88995326854984, + "learning_rate": 1.0870252382424568e-06, + "loss": 1.2771, + "step": 28424 + }, + { + "epoch": 2.422654052671951, + "grad_norm": 43.48104592115531, + "learning_rate": 1.0867165767241832e-06, + "loss": 1.6426, + "step": 28425 + }, + { + "epoch": 2.4227392823659764, + "grad_norm": 46.85602518886225, + "learning_rate": 1.0864079536914978e-06, + "loss": 1.6277, + "step": 28426 + }, + { + "epoch": 2.422824512060002, + "grad_norm": 81.12286663135252, + "learning_rate": 1.086099369147433e-06, + "loss": 1.7057, + "step": 28427 + }, + { + "epoch": 2.4229097417540273, + "grad_norm": 67.31256382511795, + "learning_rate": 1.085790823095026e-06, + "loss": 1.7684, + "step": 28428 + }, + { + "epoch": 2.4229949714480523, + "grad_norm": 32.9756605811422, + "learning_rate": 1.0854823155373123e-06, + "loss": 0.9997, + "step": 28429 + }, + { + "epoch": 2.423080201142078, + "grad_norm": 97.98266590294861, + "learning_rate": 1.0851738464773242e-06, + "loss": 2.7382, + "step": 28430 + }, + { + "epoch": 2.4231654308361033, + "grad_norm": 56.95993599812868, + "learning_rate": 1.084865415918095e-06, + "loss": 1.7995, + "step": 28431 + }, + { + "epoch": 2.4232506605301287, + "grad_norm": 29.915920721486994, + "learning_rate": 1.0845570238626597e-06, + "loss": 0.5979, + "step": 28432 + }, + { + "epoch": 2.423335890224154, + "grad_norm": 28.003783967375654, + "learning_rate": 1.0842486703140503e-06, + "loss": 0.7594, + "step": 28433 + }, + { + "epoch": 2.4234211199181797, + "grad_norm": 33.64643919502565, + "learning_rate": 1.0839403552752991e-06, + "loss": 1.0188, + "step": 28434 + }, + { + "epoch": 2.4235063496122047, + "grad_norm": 64.97469977562618, + "learning_rate": 1.083632078749437e-06, + "loss": 1.6754, + "step": 28435 + }, + { + "epoch": 2.42359157930623, + "grad_norm": 44.909988151974574, + "learning_rate": 1.0833238407394974e-06, + "loss": 0.9802, + "step": 28436 + }, + { + "epoch": 2.4236768090002556, + "grad_norm": 46.85624061653537, + "learning_rate": 1.0830156412485126e-06, + "loss": 1.5228, + "step": 28437 + }, + { + "epoch": 2.423762038694281, + "grad_norm": 40.866271208855, + "learning_rate": 1.0827074802795129e-06, + "loss": 0.6709, + "step": 28438 + }, + { + "epoch": 2.4238472683883066, + "grad_norm": 58.906273847101865, + "learning_rate": 1.0823993578355284e-06, + "loss": 1.5768, + "step": 28439 + }, + { + "epoch": 2.423932498082332, + "grad_norm": 35.61456857967752, + "learning_rate": 1.0820912739195883e-06, + "loss": 1.5945, + "step": 28440 + }, + { + "epoch": 2.4240177277763575, + "grad_norm": 21.395263623911376, + "learning_rate": 1.0817832285347246e-06, + "loss": 0.7052, + "step": 28441 + }, + { + "epoch": 2.4241029574703825, + "grad_norm": 33.22421980599249, + "learning_rate": 1.0814752216839663e-06, + "loss": 1.0329, + "step": 28442 + }, + { + "epoch": 2.424188187164408, + "grad_norm": 78.17405896424565, + "learning_rate": 1.0811672533703404e-06, + "loss": 2.5369, + "step": 28443 + }, + { + "epoch": 2.4242734168584334, + "grad_norm": 32.17146131174326, + "learning_rate": 1.080859323596878e-06, + "loss": 0.88, + "step": 28444 + }, + { + "epoch": 2.424358646552459, + "grad_norm": 48.89786068041177, + "learning_rate": 1.0805514323666072e-06, + "loss": 2.0482, + "step": 28445 + }, + { + "epoch": 2.4244438762464844, + "grad_norm": 56.103639926212836, + "learning_rate": 1.0802435796825566e-06, + "loss": 1.3445, + "step": 28446 + }, + { + "epoch": 2.42452910594051, + "grad_norm": 33.64761879154956, + "learning_rate": 1.0799357655477521e-06, + "loss": 0.8873, + "step": 28447 + }, + { + "epoch": 2.424614335634535, + "grad_norm": 36.66534653696259, + "learning_rate": 1.0796279899652224e-06, + "loss": 1.1636, + "step": 28448 + }, + { + "epoch": 2.4246995653285603, + "grad_norm": 37.77974758121962, + "learning_rate": 1.0793202529379915e-06, + "loss": 1.2513, + "step": 28449 + }, + { + "epoch": 2.424784795022586, + "grad_norm": 47.1649162802115, + "learning_rate": 1.07901255446909e-06, + "loss": 1.1953, + "step": 28450 + }, + { + "epoch": 2.4248700247166113, + "grad_norm": 30.04544963446774, + "learning_rate": 1.0787048945615402e-06, + "loss": 0.9499, + "step": 28451 + }, + { + "epoch": 2.4249552544106368, + "grad_norm": 18.92153156844244, + "learning_rate": 1.078397273218371e-06, + "loss": 0.5426, + "step": 28452 + }, + { + "epoch": 2.425040484104662, + "grad_norm": 48.418804065469246, + "learning_rate": 1.0780896904426052e-06, + "loss": 1.6005, + "step": 28453 + }, + { + "epoch": 2.4251257137986872, + "grad_norm": 27.578100822382712, + "learning_rate": 1.0777821462372701e-06, + "loss": 1.2475, + "step": 28454 + }, + { + "epoch": 2.4252109434927127, + "grad_norm": 25.940355241086003, + "learning_rate": 1.0774746406053892e-06, + "loss": 0.7951, + "step": 28455 + }, + { + "epoch": 2.425296173186738, + "grad_norm": 69.33266737520883, + "learning_rate": 1.0771671735499862e-06, + "loss": 1.9227, + "step": 28456 + }, + { + "epoch": 2.4253814028807636, + "grad_norm": 22.960519176907628, + "learning_rate": 1.0768597450740843e-06, + "loss": 0.5987, + "step": 28457 + }, + { + "epoch": 2.425466632574789, + "grad_norm": 61.07266623793891, + "learning_rate": 1.0765523551807088e-06, + "loss": 1.8434, + "step": 28458 + }, + { + "epoch": 2.4255518622688146, + "grad_norm": 64.33529527823029, + "learning_rate": 1.076245003872881e-06, + "loss": 1.2458, + "step": 28459 + }, + { + "epoch": 2.42563709196284, + "grad_norm": 44.69072492582051, + "learning_rate": 1.0759376911536257e-06, + "loss": 1.3713, + "step": 28460 + }, + { + "epoch": 2.425722321656865, + "grad_norm": 38.22123451064586, + "learning_rate": 1.0756304170259635e-06, + "loss": 0.8453, + "step": 28461 + }, + { + "epoch": 2.4258075513508905, + "grad_norm": 55.07580470228022, + "learning_rate": 1.075323181492916e-06, + "loss": 1.5559, + "step": 28462 + }, + { + "epoch": 2.425892781044916, + "grad_norm": 94.81834560032867, + "learning_rate": 1.0750159845575065e-06, + "loss": 2.8803, + "step": 28463 + }, + { + "epoch": 2.4259780107389415, + "grad_norm": 47.499141071719215, + "learning_rate": 1.0747088262227552e-06, + "loss": 1.7087, + "step": 28464 + }, + { + "epoch": 2.426063240432967, + "grad_norm": 62.31430687210492, + "learning_rate": 1.0744017064916828e-06, + "loss": 1.4376, + "step": 28465 + }, + { + "epoch": 2.4261484701269924, + "grad_norm": 32.36424893545349, + "learning_rate": 1.0740946253673085e-06, + "loss": 1.0186, + "step": 28466 + }, + { + "epoch": 2.426233699821018, + "grad_norm": 73.42032249543826, + "learning_rate": 1.0737875828526535e-06, + "loss": 1.4427, + "step": 28467 + }, + { + "epoch": 2.426318929515043, + "grad_norm": 28.138524608368147, + "learning_rate": 1.073480578950739e-06, + "loss": 0.8137, + "step": 28468 + }, + { + "epoch": 2.4264041592090684, + "grad_norm": 96.06814634952043, + "learning_rate": 1.0731736136645825e-06, + "loss": 2.1026, + "step": 28469 + }, + { + "epoch": 2.426489388903094, + "grad_norm": 121.22164565988577, + "learning_rate": 1.0728666869972021e-06, + "loss": 2.1458, + "step": 28470 + }, + { + "epoch": 2.4265746185971193, + "grad_norm": 64.30796012355765, + "learning_rate": 1.0725597989516184e-06, + "loss": 1.8108, + "step": 28471 + }, + { + "epoch": 2.426659848291145, + "grad_norm": 47.98703321024109, + "learning_rate": 1.0722529495308487e-06, + "loss": 1.4773, + "step": 28472 + }, + { + "epoch": 2.42674507798517, + "grad_norm": 51.98193699512732, + "learning_rate": 1.0719461387379098e-06, + "loss": 1.4399, + "step": 28473 + }, + { + "epoch": 2.4268303076791953, + "grad_norm": 74.5607231272751, + "learning_rate": 1.0716393665758197e-06, + "loss": 2.5858, + "step": 28474 + }, + { + "epoch": 2.4269155373732207, + "grad_norm": 52.84611407321965, + "learning_rate": 1.0713326330475943e-06, + "loss": 1.2052, + "step": 28475 + }, + { + "epoch": 2.427000767067246, + "grad_norm": 62.303593751952086, + "learning_rate": 1.0710259381562533e-06, + "loss": 2.0386, + "step": 28476 + }, + { + "epoch": 2.4270859967612717, + "grad_norm": 50.00833800864194, + "learning_rate": 1.0707192819048107e-06, + "loss": 0.8212, + "step": 28477 + }, + { + "epoch": 2.427171226455297, + "grad_norm": 66.21182867565327, + "learning_rate": 1.0704126642962825e-06, + "loss": 1.5848, + "step": 28478 + }, + { + "epoch": 2.4272564561493226, + "grad_norm": 54.281726098901736, + "learning_rate": 1.0701060853336836e-06, + "loss": 1.7625, + "step": 28479 + }, + { + "epoch": 2.4273416858433476, + "grad_norm": 82.8398451639725, + "learning_rate": 1.0697995450200305e-06, + "loss": 1.5865, + "step": 28480 + }, + { + "epoch": 2.427426915537373, + "grad_norm": 38.86562989320037, + "learning_rate": 1.0694930433583372e-06, + "loss": 1.0048, + "step": 28481 + }, + { + "epoch": 2.4275121452313986, + "grad_norm": 22.88706988873548, + "learning_rate": 1.0691865803516172e-06, + "loss": 0.6101, + "step": 28482 + }, + { + "epoch": 2.427597374925424, + "grad_norm": 72.10613650179229, + "learning_rate": 1.0688801560028844e-06, + "loss": 2.2224, + "step": 28483 + }, + { + "epoch": 2.4276826046194495, + "grad_norm": 103.89642271802133, + "learning_rate": 1.0685737703151549e-06, + "loss": 3.0366, + "step": 28484 + }, + { + "epoch": 2.427767834313475, + "grad_norm": 42.84709715262292, + "learning_rate": 1.0682674232914408e-06, + "loss": 1.1398, + "step": 28485 + }, + { + "epoch": 2.4278530640075004, + "grad_norm": 32.123739437326094, + "learning_rate": 1.0679611149347535e-06, + "loss": 1.157, + "step": 28486 + }, + { + "epoch": 2.4279382937015255, + "grad_norm": 29.478764720695658, + "learning_rate": 1.067654845248105e-06, + "loss": 0.886, + "step": 28487 + }, + { + "epoch": 2.428023523395551, + "grad_norm": 71.99062253308556, + "learning_rate": 1.0673486142345103e-06, + "loss": 1.4741, + "step": 28488 + }, + { + "epoch": 2.4281087530895764, + "grad_norm": 79.01064646419063, + "learning_rate": 1.0670424218969788e-06, + "loss": 2.3935, + "step": 28489 + }, + { + "epoch": 2.428193982783602, + "grad_norm": 42.713213256368235, + "learning_rate": 1.0667362682385208e-06, + "loss": 0.8742, + "step": 28490 + }, + { + "epoch": 2.4282792124776273, + "grad_norm": 37.520444875360155, + "learning_rate": 1.0664301532621506e-06, + "loss": 0.9352, + "step": 28491 + }, + { + "epoch": 2.4283644421716524, + "grad_norm": 67.16582356084542, + "learning_rate": 1.0661240769708748e-06, + "loss": 1.8169, + "step": 28492 + }, + { + "epoch": 2.428449671865678, + "grad_norm": 29.5532917923953, + "learning_rate": 1.0658180393677075e-06, + "loss": 1.3808, + "step": 28493 + }, + { + "epoch": 2.4285349015597033, + "grad_norm": 23.129039942154, + "learning_rate": 1.0655120404556563e-06, + "loss": 0.9684, + "step": 28494 + }, + { + "epoch": 2.4286201312537288, + "grad_norm": 58.97990353886489, + "learning_rate": 1.0652060802377306e-06, + "loss": 0.9476, + "step": 28495 + }, + { + "epoch": 2.4287053609477542, + "grad_norm": 56.973690111821796, + "learning_rate": 1.0649001587169383e-06, + "loss": 1.3621, + "step": 28496 + }, + { + "epoch": 2.4287905906417797, + "grad_norm": 36.429087358053934, + "learning_rate": 1.0645942758962907e-06, + "loss": 1.1364, + "step": 28497 + }, + { + "epoch": 2.428875820335805, + "grad_norm": 22.818904241223855, + "learning_rate": 1.0642884317787932e-06, + "loss": 0.6275, + "step": 28498 + }, + { + "epoch": 2.42896105002983, + "grad_norm": 67.05237312240371, + "learning_rate": 1.0639826263674562e-06, + "loss": 1.7396, + "step": 28499 + }, + { + "epoch": 2.4290462797238557, + "grad_norm": 40.26203357446731, + "learning_rate": 1.0636768596652852e-06, + "loss": 1.0596, + "step": 28500 + }, + { + "epoch": 2.429131509417881, + "grad_norm": 78.0744281934966, + "learning_rate": 1.0633711316752892e-06, + "loss": 1.7451, + "step": 28501 + }, + { + "epoch": 2.4292167391119066, + "grad_norm": 58.481953438077156, + "learning_rate": 1.0630654424004738e-06, + "loss": 1.6566, + "step": 28502 + }, + { + "epoch": 2.429301968805932, + "grad_norm": 97.0599842219733, + "learning_rate": 1.0627597918438453e-06, + "loss": 2.5035, + "step": 28503 + }, + { + "epoch": 2.4293871984999575, + "grad_norm": 64.7399356390057, + "learning_rate": 1.0624541800084087e-06, + "loss": 1.7866, + "step": 28504 + }, + { + "epoch": 2.429472428193983, + "grad_norm": 27.64028905144129, + "learning_rate": 1.062148606897172e-06, + "loss": 0.7005, + "step": 28505 + }, + { + "epoch": 2.429557657888008, + "grad_norm": 35.96558372657473, + "learning_rate": 1.0618430725131373e-06, + "loss": 0.9179, + "step": 28506 + }, + { + "epoch": 2.4296428875820335, + "grad_norm": 73.82673423663924, + "learning_rate": 1.0615375768593129e-06, + "loss": 1.6551, + "step": 28507 + }, + { + "epoch": 2.429728117276059, + "grad_norm": 50.5665773588431, + "learning_rate": 1.061232119938701e-06, + "loss": 0.7429, + "step": 28508 + }, + { + "epoch": 2.4298133469700844, + "grad_norm": 59.334185257413786, + "learning_rate": 1.0609267017543051e-06, + "loss": 1.9341, + "step": 28509 + }, + { + "epoch": 2.42989857666411, + "grad_norm": 39.7571976488527, + "learning_rate": 1.0606213223091311e-06, + "loss": 1.0596, + "step": 28510 + }, + { + "epoch": 2.429983806358135, + "grad_norm": 98.99628074039072, + "learning_rate": 1.060315981606181e-06, + "loss": 1.7041, + "step": 28511 + }, + { + "epoch": 2.4300690360521604, + "grad_norm": 44.45597166895674, + "learning_rate": 1.0600106796484576e-06, + "loss": 1.3709, + "step": 28512 + }, + { + "epoch": 2.430154265746186, + "grad_norm": 62.401825975798054, + "learning_rate": 1.059705416438962e-06, + "loss": 1.0968, + "step": 28513 + }, + { + "epoch": 2.4302394954402113, + "grad_norm": 54.01929830791763, + "learning_rate": 1.0594001919806985e-06, + "loss": 1.4441, + "step": 28514 + }, + { + "epoch": 2.430324725134237, + "grad_norm": 101.3373010595342, + "learning_rate": 1.0590950062766692e-06, + "loss": 2.915, + "step": 28515 + }, + { + "epoch": 2.4304099548282623, + "grad_norm": 47.81809462748778, + "learning_rate": 1.0587898593298745e-06, + "loss": 1.5992, + "step": 28516 + }, + { + "epoch": 2.4304951845222877, + "grad_norm": 30.160324897814064, + "learning_rate": 1.0584847511433143e-06, + "loss": 1.0321, + "step": 28517 + }, + { + "epoch": 2.4305804142163128, + "grad_norm": 47.53371191388794, + "learning_rate": 1.058179681719992e-06, + "loss": 1.578, + "step": 28518 + }, + { + "epoch": 2.4306656439103382, + "grad_norm": 82.58831089791595, + "learning_rate": 1.0578746510629063e-06, + "loss": 1.044, + "step": 28519 + }, + { + "epoch": 2.4307508736043637, + "grad_norm": 46.1016752117921, + "learning_rate": 1.057569659175056e-06, + "loss": 1.1299, + "step": 28520 + }, + { + "epoch": 2.430836103298389, + "grad_norm": 28.11145356132322, + "learning_rate": 1.057264706059441e-06, + "loss": 1.1461, + "step": 28521 + }, + { + "epoch": 2.4309213329924146, + "grad_norm": 37.27533647331018, + "learning_rate": 1.0569597917190604e-06, + "loss": 1.0634, + "step": 28522 + }, + { + "epoch": 2.43100656268644, + "grad_norm": 46.62646183020207, + "learning_rate": 1.0566549161569155e-06, + "loss": 1.6097, + "step": 28523 + }, + { + "epoch": 2.4310917923804656, + "grad_norm": 29.626793306789047, + "learning_rate": 1.0563500793760024e-06, + "loss": 1.2654, + "step": 28524 + }, + { + "epoch": 2.4311770220744906, + "grad_norm": 37.54902289599008, + "learning_rate": 1.0560452813793187e-06, + "loss": 1.0487, + "step": 28525 + }, + { + "epoch": 2.431262251768516, + "grad_norm": 52.83981433610966, + "learning_rate": 1.0557405221698614e-06, + "loss": 0.9726, + "step": 28526 + }, + { + "epoch": 2.4313474814625415, + "grad_norm": 34.53150600740864, + "learning_rate": 1.05543580175063e-06, + "loss": 1.3527, + "step": 28527 + }, + { + "epoch": 2.431432711156567, + "grad_norm": 41.061348093074415, + "learning_rate": 1.0551311201246207e-06, + "loss": 1.6767, + "step": 28528 + }, + { + "epoch": 2.4315179408505925, + "grad_norm": 53.876714125235765, + "learning_rate": 1.0548264772948268e-06, + "loss": 1.1822, + "step": 28529 + }, + { + "epoch": 2.4316031705446175, + "grad_norm": 41.89971783565068, + "learning_rate": 1.0545218732642477e-06, + "loss": 1.2101, + "step": 28530 + }, + { + "epoch": 2.431688400238643, + "grad_norm": 54.65241689022586, + "learning_rate": 1.0542173080358792e-06, + "loss": 1.2759, + "step": 28531 + }, + { + "epoch": 2.4317736299326684, + "grad_norm": 47.579748086605015, + "learning_rate": 1.053912781612716e-06, + "loss": 1.3443, + "step": 28532 + }, + { + "epoch": 2.431858859626694, + "grad_norm": 55.69492393701032, + "learning_rate": 1.0536082939977522e-06, + "loss": 1.4487, + "step": 28533 + }, + { + "epoch": 2.4319440893207194, + "grad_norm": 58.82325332410147, + "learning_rate": 1.0533038451939826e-06, + "loss": 1.2685, + "step": 28534 + }, + { + "epoch": 2.432029319014745, + "grad_norm": 19.77426954380967, + "learning_rate": 1.0529994352044004e-06, + "loss": 0.7691, + "step": 28535 + }, + { + "epoch": 2.4321145487087703, + "grad_norm": 47.55214861069377, + "learning_rate": 1.0526950640320015e-06, + "loss": 1.5536, + "step": 28536 + }, + { + "epoch": 2.4321997784027953, + "grad_norm": 24.14788139848203, + "learning_rate": 1.0523907316797772e-06, + "loss": 0.9994, + "step": 28537 + }, + { + "epoch": 2.432285008096821, + "grad_norm": 44.47674512153058, + "learning_rate": 1.0520864381507223e-06, + "loss": 1.3962, + "step": 28538 + }, + { + "epoch": 2.4323702377908463, + "grad_norm": 49.75177711378926, + "learning_rate": 1.051782183447827e-06, + "loss": 1.5069, + "step": 28539 + }, + { + "epoch": 2.4324554674848717, + "grad_norm": 78.35475900652304, + "learning_rate": 1.0514779675740872e-06, + "loss": 1.3518, + "step": 28540 + }, + { + "epoch": 2.432540697178897, + "grad_norm": 63.61021688563961, + "learning_rate": 1.0511737905324921e-06, + "loss": 1.4062, + "step": 28541 + }, + { + "epoch": 2.4326259268729227, + "grad_norm": 39.392471670295606, + "learning_rate": 1.0508696523260336e-06, + "loss": 1.5672, + "step": 28542 + }, + { + "epoch": 2.432711156566948, + "grad_norm": 41.13599723075996, + "learning_rate": 1.0505655529577013e-06, + "loss": 1.4235, + "step": 28543 + }, + { + "epoch": 2.432796386260973, + "grad_norm": 73.23835904031623, + "learning_rate": 1.0502614924304895e-06, + "loss": 2.2578, + "step": 28544 + }, + { + "epoch": 2.4328816159549986, + "grad_norm": 26.46609662669066, + "learning_rate": 1.0499574707473842e-06, + "loss": 1.2342, + "step": 28545 + }, + { + "epoch": 2.432966845649024, + "grad_norm": 28.777466821214208, + "learning_rate": 1.0496534879113795e-06, + "loss": 0.9751, + "step": 28546 + }, + { + "epoch": 2.4330520753430496, + "grad_norm": 28.13665882621066, + "learning_rate": 1.0493495439254613e-06, + "loss": 1.5054, + "step": 28547 + }, + { + "epoch": 2.433137305037075, + "grad_norm": 50.28015559110269, + "learning_rate": 1.0490456387926223e-06, + "loss": 1.5756, + "step": 28548 + }, + { + "epoch": 2.4332225347311005, + "grad_norm": 26.618217840833207, + "learning_rate": 1.0487417725158493e-06, + "loss": 1.0821, + "step": 28549 + }, + { + "epoch": 2.4333077644251255, + "grad_norm": 235.7676102805942, + "learning_rate": 1.0484379450981308e-06, + "loss": 1.5636, + "step": 28550 + }, + { + "epoch": 2.433392994119151, + "grad_norm": 20.637025565266313, + "learning_rate": 1.0481341565424551e-06, + "loss": 0.7454, + "step": 28551 + }, + { + "epoch": 2.4334782238131765, + "grad_norm": 70.57486363683115, + "learning_rate": 1.0478304068518086e-06, + "loss": 2.1981, + "step": 28552 + }, + { + "epoch": 2.433563453507202, + "grad_norm": 41.043058619988635, + "learning_rate": 1.0475266960291792e-06, + "loss": 1.547, + "step": 28553 + }, + { + "epoch": 2.4336486832012274, + "grad_norm": 68.48126805638769, + "learning_rate": 1.0472230240775556e-06, + "loss": 1.6063, + "step": 28554 + }, + { + "epoch": 2.433733912895253, + "grad_norm": 79.27644296261641, + "learning_rate": 1.0469193909999225e-06, + "loss": 2.1559, + "step": 28555 + }, + { + "epoch": 2.433819142589278, + "grad_norm": 77.12463100682095, + "learning_rate": 1.0466157967992651e-06, + "loss": 1.8403, + "step": 28556 + }, + { + "epoch": 2.4339043722833034, + "grad_norm": 26.964113593839897, + "learning_rate": 1.0463122414785726e-06, + "loss": 0.6927, + "step": 28557 + }, + { + "epoch": 2.433989601977329, + "grad_norm": 57.04090593770572, + "learning_rate": 1.0460087250408274e-06, + "loss": 0.9315, + "step": 28558 + }, + { + "epoch": 2.4340748316713543, + "grad_norm": 29.95053004976677, + "learning_rate": 1.0457052474890155e-06, + "loss": 0.7351, + "step": 28559 + }, + { + "epoch": 2.4341600613653798, + "grad_norm": 68.68962511487148, + "learning_rate": 1.0454018088261197e-06, + "loss": 1.5374, + "step": 28560 + }, + { + "epoch": 2.4342452910594052, + "grad_norm": 51.01225436600844, + "learning_rate": 1.0450984090551258e-06, + "loss": 2.1835, + "step": 28561 + }, + { + "epoch": 2.4343305207534307, + "grad_norm": 52.06660001484179, + "learning_rate": 1.0447950481790187e-06, + "loss": 1.4264, + "step": 28562 + }, + { + "epoch": 2.4344157504474557, + "grad_norm": 30.784455703816327, + "learning_rate": 1.0444917262007809e-06, + "loss": 0.8733, + "step": 28563 + }, + { + "epoch": 2.434500980141481, + "grad_norm": 29.684836924925403, + "learning_rate": 1.0441884431233951e-06, + "loss": 0.9699, + "step": 28564 + }, + { + "epoch": 2.4345862098355067, + "grad_norm": 69.51424103937528, + "learning_rate": 1.0438851989498428e-06, + "loss": 1.5643, + "step": 28565 + }, + { + "epoch": 2.434671439529532, + "grad_norm": 62.00174878448959, + "learning_rate": 1.0435819936831093e-06, + "loss": 1.8133, + "step": 28566 + }, + { + "epoch": 2.4347566692235576, + "grad_norm": 83.26545305549172, + "learning_rate": 1.043278827326174e-06, + "loss": 2.1474, + "step": 28567 + }, + { + "epoch": 2.434841898917583, + "grad_norm": 58.26613247563337, + "learning_rate": 1.0429756998820184e-06, + "loss": 1.4688, + "step": 28568 + }, + { + "epoch": 2.434927128611608, + "grad_norm": 23.069281313911386, + "learning_rate": 1.042672611353624e-06, + "loss": 0.6136, + "step": 28569 + }, + { + "epoch": 2.4350123583056336, + "grad_norm": 67.45495921183536, + "learning_rate": 1.0423695617439738e-06, + "loss": 1.0823, + "step": 28570 + }, + { + "epoch": 2.435097587999659, + "grad_norm": 29.05993356028055, + "learning_rate": 1.0420665510560457e-06, + "loss": 1.2856, + "step": 28571 + }, + { + "epoch": 2.4351828176936845, + "grad_norm": 32.848192054073216, + "learning_rate": 1.041763579292821e-06, + "loss": 0.9501, + "step": 28572 + }, + { + "epoch": 2.43526804738771, + "grad_norm": 38.1359916847756, + "learning_rate": 1.0414606464572767e-06, + "loss": 0.9413, + "step": 28573 + }, + { + "epoch": 2.4353532770817354, + "grad_norm": 86.53335004254556, + "learning_rate": 1.0411577525523959e-06, + "loss": 2.0212, + "step": 28574 + }, + { + "epoch": 2.4354385067757605, + "grad_norm": 63.828773965219796, + "learning_rate": 1.0408548975811555e-06, + "loss": 1.6344, + "step": 28575 + }, + { + "epoch": 2.435523736469786, + "grad_norm": 53.45150922889515, + "learning_rate": 1.0405520815465319e-06, + "loss": 1.3621, + "step": 28576 + }, + { + "epoch": 2.4356089661638114, + "grad_norm": 35.847459409816146, + "learning_rate": 1.040249304451506e-06, + "loss": 1.1041, + "step": 28577 + }, + { + "epoch": 2.435694195857837, + "grad_norm": 49.20222272226065, + "learning_rate": 1.039946566299056e-06, + "loss": 1.0806, + "step": 28578 + }, + { + "epoch": 2.4357794255518623, + "grad_norm": 58.518732419384904, + "learning_rate": 1.0396438670921578e-06, + "loss": 1.1015, + "step": 28579 + }, + { + "epoch": 2.435864655245888, + "grad_norm": 48.72985539963994, + "learning_rate": 1.0393412068337882e-06, + "loss": 1.4526, + "step": 28580 + }, + { + "epoch": 2.4359498849399133, + "grad_norm": 37.12182813685354, + "learning_rate": 1.0390385855269242e-06, + "loss": 1.1646, + "step": 28581 + }, + { + "epoch": 2.4360351146339383, + "grad_norm": 30.529252034156357, + "learning_rate": 1.0387360031745408e-06, + "loss": 1.1192, + "step": 28582 + }, + { + "epoch": 2.4361203443279638, + "grad_norm": 83.01879506710486, + "learning_rate": 1.038433459779616e-06, + "loss": 2.1614, + "step": 28583 + }, + { + "epoch": 2.4362055740219892, + "grad_norm": 43.367991928776654, + "learning_rate": 1.038130955345122e-06, + "loss": 2.0717, + "step": 28584 + }, + { + "epoch": 2.4362908037160147, + "grad_norm": 44.42102112428038, + "learning_rate": 1.0378284898740376e-06, + "loss": 1.2945, + "step": 28585 + }, + { + "epoch": 2.43637603341004, + "grad_norm": 33.68593436259072, + "learning_rate": 1.0375260633693341e-06, + "loss": 0.633, + "step": 28586 + }, + { + "epoch": 2.4364612631040656, + "grad_norm": 58.73994029030514, + "learning_rate": 1.0372236758339886e-06, + "loss": 1.7662, + "step": 28587 + }, + { + "epoch": 2.436546492798091, + "grad_norm": 22.222238773206207, + "learning_rate": 1.0369213272709738e-06, + "loss": 0.7273, + "step": 28588 + }, + { + "epoch": 2.436631722492116, + "grad_norm": 52.63755597968851, + "learning_rate": 1.036619017683263e-06, + "loss": 1.585, + "step": 28589 + }, + { + "epoch": 2.4367169521861416, + "grad_norm": 28.72106160660122, + "learning_rate": 1.0363167470738278e-06, + "loss": 0.7859, + "step": 28590 + }, + { + "epoch": 2.436802181880167, + "grad_norm": 60.196850573754155, + "learning_rate": 1.0360145154456437e-06, + "loss": 1.6768, + "step": 28591 + }, + { + "epoch": 2.4368874115741925, + "grad_norm": 86.03228862601239, + "learning_rate": 1.0357123228016802e-06, + "loss": 2.2305, + "step": 28592 + }, + { + "epoch": 2.436972641268218, + "grad_norm": 44.97244442574829, + "learning_rate": 1.0354101691449119e-06, + "loss": 1.3784, + "step": 28593 + }, + { + "epoch": 2.437057870962243, + "grad_norm": 30.03465006541227, + "learning_rate": 1.0351080544783098e-06, + "loss": 1.0731, + "step": 28594 + }, + { + "epoch": 2.4371431006562685, + "grad_norm": 36.36481756427821, + "learning_rate": 1.0348059788048426e-06, + "loss": 0.6498, + "step": 28595 + }, + { + "epoch": 2.437228330350294, + "grad_norm": 33.713126207393, + "learning_rate": 1.034503942127485e-06, + "loss": 1.2977, + "step": 28596 + }, + { + "epoch": 2.4373135600443194, + "grad_norm": 71.74632681115793, + "learning_rate": 1.0342019444492047e-06, + "loss": 1.6498, + "step": 28597 + }, + { + "epoch": 2.437398789738345, + "grad_norm": 41.563708940219094, + "learning_rate": 1.0338999857729726e-06, + "loss": 1.3036, + "step": 28598 + }, + { + "epoch": 2.4374840194323704, + "grad_norm": 41.51817608001548, + "learning_rate": 1.033598066101757e-06, + "loss": 1.4036, + "step": 28599 + }, + { + "epoch": 2.437569249126396, + "grad_norm": 33.13409643570602, + "learning_rate": 1.0332961854385282e-06, + "loss": 1.0195, + "step": 28600 + }, + { + "epoch": 2.437654478820421, + "grad_norm": 40.05361885832879, + "learning_rate": 1.0329943437862567e-06, + "loss": 1.4314, + "step": 28601 + }, + { + "epoch": 2.4377397085144463, + "grad_norm": 59.0334263106806, + "learning_rate": 1.0326925411479093e-06, + "loss": 1.5645, + "step": 28602 + }, + { + "epoch": 2.437824938208472, + "grad_norm": 31.266608174148093, + "learning_rate": 1.0323907775264536e-06, + "loss": 1.156, + "step": 28603 + }, + { + "epoch": 2.4379101679024973, + "grad_norm": 55.8158552568668, + "learning_rate": 1.0320890529248585e-06, + "loss": 1.626, + "step": 28604 + }, + { + "epoch": 2.4379953975965227, + "grad_norm": 61.02017400834373, + "learning_rate": 1.0317873673460916e-06, + "loss": 1.6484, + "step": 28605 + }, + { + "epoch": 2.438080627290548, + "grad_norm": 39.85816616141875, + "learning_rate": 1.0314857207931183e-06, + "loss": 0.6934, + "step": 28606 + }, + { + "epoch": 2.4381658569845737, + "grad_norm": 61.92211164749953, + "learning_rate": 1.0311841132689055e-06, + "loss": 1.4751, + "step": 28607 + }, + { + "epoch": 2.4382510866785987, + "grad_norm": 47.226228145582965, + "learning_rate": 1.0308825447764194e-06, + "loss": 1.223, + "step": 28608 + }, + { + "epoch": 2.438336316372624, + "grad_norm": 180.57738525687162, + "learning_rate": 1.0305810153186275e-06, + "loss": 3.1156, + "step": 28609 + }, + { + "epoch": 2.4384215460666496, + "grad_norm": 86.80583537488303, + "learning_rate": 1.0302795248984942e-06, + "loss": 2.2384, + "step": 28610 + }, + { + "epoch": 2.438506775760675, + "grad_norm": 28.32949227718914, + "learning_rate": 1.0299780735189846e-06, + "loss": 0.8038, + "step": 28611 + }, + { + "epoch": 2.4385920054547006, + "grad_norm": 31.816372893668316, + "learning_rate": 1.0296766611830616e-06, + "loss": 0.8308, + "step": 28612 + }, + { + "epoch": 2.4386772351487256, + "grad_norm": 40.8586775326874, + "learning_rate": 1.0293752878936925e-06, + "loss": 1.3817, + "step": 28613 + }, + { + "epoch": 2.438762464842751, + "grad_norm": 41.708369865893054, + "learning_rate": 1.0290739536538396e-06, + "loss": 1.3648, + "step": 28614 + }, + { + "epoch": 2.4388476945367765, + "grad_norm": 74.67177611329232, + "learning_rate": 1.0287726584664648e-06, + "loss": 1.8044, + "step": 28615 + }, + { + "epoch": 2.438932924230802, + "grad_norm": 26.482928387424284, + "learning_rate": 1.0284714023345333e-06, + "loss": 1.2805, + "step": 28616 + }, + { + "epoch": 2.4390181539248275, + "grad_norm": 34.968048263031214, + "learning_rate": 1.0281701852610082e-06, + "loss": 1.2673, + "step": 28617 + }, + { + "epoch": 2.439103383618853, + "grad_norm": 64.12808183324061, + "learning_rate": 1.0278690072488518e-06, + "loss": 2.1357, + "step": 28618 + }, + { + "epoch": 2.4391886133128784, + "grad_norm": 30.74690108924426, + "learning_rate": 1.0275678683010247e-06, + "loss": 0.9465, + "step": 28619 + }, + { + "epoch": 2.4392738430069034, + "grad_norm": 67.22849098611047, + "learning_rate": 1.0272667684204884e-06, + "loss": 1.1478, + "step": 28620 + }, + { + "epoch": 2.439359072700929, + "grad_norm": 58.35389623551309, + "learning_rate": 1.0269657076102058e-06, + "loss": 1.4758, + "step": 28621 + }, + { + "epoch": 2.4394443023949544, + "grad_norm": 42.27855136188545, + "learning_rate": 1.0266646858731366e-06, + "loss": 1.3765, + "step": 28622 + }, + { + "epoch": 2.43952953208898, + "grad_norm": 34.004796964206086, + "learning_rate": 1.0263637032122404e-06, + "loss": 1.0035, + "step": 28623 + }, + { + "epoch": 2.4396147617830053, + "grad_norm": 72.4292007663197, + "learning_rate": 1.0260627596304796e-06, + "loss": 2.1922, + "step": 28624 + }, + { + "epoch": 2.4396999914770308, + "grad_norm": 33.884088166280335, + "learning_rate": 1.0257618551308107e-06, + "loss": 1.228, + "step": 28625 + }, + { + "epoch": 2.4397852211710562, + "grad_norm": 58.50393692368581, + "learning_rate": 1.0254609897161967e-06, + "loss": 1.8998, + "step": 28626 + }, + { + "epoch": 2.4398704508650813, + "grad_norm": 99.04514324080466, + "learning_rate": 1.0251601633895936e-06, + "loss": 2.8098, + "step": 28627 + }, + { + "epoch": 2.4399556805591067, + "grad_norm": 55.86174140954865, + "learning_rate": 1.0248593761539615e-06, + "loss": 1.5583, + "step": 28628 + }, + { + "epoch": 2.440040910253132, + "grad_norm": 51.5123624868551, + "learning_rate": 1.0245586280122565e-06, + "loss": 1.1002, + "step": 28629 + }, + { + "epoch": 2.4401261399471577, + "grad_norm": 49.26173820477705, + "learning_rate": 1.024257918967439e-06, + "loss": 1.6256, + "step": 28630 + }, + { + "epoch": 2.440211369641183, + "grad_norm": 44.96278975045259, + "learning_rate": 1.0239572490224636e-06, + "loss": 1.4691, + "step": 28631 + }, + { + "epoch": 2.440296599335208, + "grad_norm": 53.30699433749037, + "learning_rate": 1.0236566181802903e-06, + "loss": 1.2446, + "step": 28632 + }, + { + "epoch": 2.4403818290292336, + "grad_norm": 87.19822345366445, + "learning_rate": 1.0233560264438725e-06, + "loss": 2.0752, + "step": 28633 + }, + { + "epoch": 2.440467058723259, + "grad_norm": 59.302447646257114, + "learning_rate": 1.0230554738161695e-06, + "loss": 1.4691, + "step": 28634 + }, + { + "epoch": 2.4405522884172846, + "grad_norm": 56.51813983924174, + "learning_rate": 1.0227549603001358e-06, + "loss": 1.7172, + "step": 28635 + }, + { + "epoch": 2.44063751811131, + "grad_norm": 27.319431554950615, + "learning_rate": 1.0224544858987262e-06, + "loss": 0.9062, + "step": 28636 + }, + { + "epoch": 2.4407227478053355, + "grad_norm": 66.61487139778757, + "learning_rate": 1.022154050614897e-06, + "loss": 1.6126, + "step": 28637 + }, + { + "epoch": 2.440807977499361, + "grad_norm": 48.04256972445261, + "learning_rate": 1.0218536544516e-06, + "loss": 1.249, + "step": 28638 + }, + { + "epoch": 2.440893207193386, + "grad_norm": 59.33468304370349, + "learning_rate": 1.0215532974117914e-06, + "loss": 1.6544, + "step": 28639 + }, + { + "epoch": 2.4409784368874115, + "grad_norm": 54.082870174800675, + "learning_rate": 1.021252979498427e-06, + "loss": 1.2795, + "step": 28640 + }, + { + "epoch": 2.441063666581437, + "grad_norm": 69.2937013205344, + "learning_rate": 1.0209527007144587e-06, + "loss": 2.355, + "step": 28641 + }, + { + "epoch": 2.4411488962754624, + "grad_norm": 49.45312735534554, + "learning_rate": 1.0206524610628383e-06, + "loss": 0.9286, + "step": 28642 + }, + { + "epoch": 2.441234125969488, + "grad_norm": 71.56648802568593, + "learning_rate": 1.0203522605465205e-06, + "loss": 2.2209, + "step": 28643 + }, + { + "epoch": 2.4413193556635133, + "grad_norm": 42.98355303783235, + "learning_rate": 1.020052099168457e-06, + "loss": 1.2726, + "step": 28644 + }, + { + "epoch": 2.441404585357539, + "grad_norm": 50.74527328058913, + "learning_rate": 1.0197519769315995e-06, + "loss": 2.0249, + "step": 28645 + }, + { + "epoch": 2.441489815051564, + "grad_norm": 69.72694405633447, + "learning_rate": 1.0194518938388987e-06, + "loss": 1.9247, + "step": 28646 + }, + { + "epoch": 2.4415750447455893, + "grad_norm": 36.28330518300645, + "learning_rate": 1.0191518498933068e-06, + "loss": 1.4059, + "step": 28647 + }, + { + "epoch": 2.4416602744396148, + "grad_norm": 85.43855344648975, + "learning_rate": 1.0188518450977758e-06, + "loss": 1.9159, + "step": 28648 + }, + { + "epoch": 2.44174550413364, + "grad_norm": 25.397704645463552, + "learning_rate": 1.0185518794552545e-06, + "loss": 0.8789, + "step": 28649 + }, + { + "epoch": 2.4418307338276657, + "grad_norm": 42.4180830766549, + "learning_rate": 1.018251952968693e-06, + "loss": 1.7119, + "step": 28650 + }, + { + "epoch": 2.4419159635216907, + "grad_norm": 52.68731642714901, + "learning_rate": 1.0179520656410418e-06, + "loss": 1.6816, + "step": 28651 + }, + { + "epoch": 2.442001193215716, + "grad_norm": 57.36899331776381, + "learning_rate": 1.01765221747525e-06, + "loss": 1.9947, + "step": 28652 + }, + { + "epoch": 2.4420864229097416, + "grad_norm": 90.00834435476762, + "learning_rate": 1.0173524084742664e-06, + "loss": 2.3315, + "step": 28653 + }, + { + "epoch": 2.442171652603767, + "grad_norm": 54.942821155265875, + "learning_rate": 1.0170526386410385e-06, + "loss": 1.5496, + "step": 28654 + }, + { + "epoch": 2.4422568822977926, + "grad_norm": 88.00677995617802, + "learning_rate": 1.0167529079785144e-06, + "loss": 1.687, + "step": 28655 + }, + { + "epoch": 2.442342111991818, + "grad_norm": 35.097202968550576, + "learning_rate": 1.016453216489644e-06, + "loss": 1.3351, + "step": 28656 + }, + { + "epoch": 2.4424273416858435, + "grad_norm": 36.26567296545101, + "learning_rate": 1.016153564177374e-06, + "loss": 1.0476, + "step": 28657 + }, + { + "epoch": 2.4425125713798685, + "grad_norm": 65.79430133981886, + "learning_rate": 1.0158539510446503e-06, + "loss": 1.789, + "step": 28658 + }, + { + "epoch": 2.442597801073894, + "grad_norm": 59.754040977428176, + "learning_rate": 1.0155543770944182e-06, + "loss": 1.3167, + "step": 28659 + }, + { + "epoch": 2.4426830307679195, + "grad_norm": 116.69052350961749, + "learning_rate": 1.015254842329627e-06, + "loss": 3.5204, + "step": 28660 + }, + { + "epoch": 2.442768260461945, + "grad_norm": 73.9918435058633, + "learning_rate": 1.0149553467532213e-06, + "loss": 2.5068, + "step": 28661 + }, + { + "epoch": 2.4428534901559704, + "grad_norm": 86.6838462956414, + "learning_rate": 1.0146558903681447e-06, + "loss": 1.6941, + "step": 28662 + }, + { + "epoch": 2.442938719849996, + "grad_norm": 33.66476687401222, + "learning_rate": 1.0143564731773437e-06, + "loss": 1.1692, + "step": 28663 + }, + { + "epoch": 2.4430239495440214, + "grad_norm": 45.61223260339344, + "learning_rate": 1.0140570951837647e-06, + "loss": 1.6677, + "step": 28664 + }, + { + "epoch": 2.4431091792380464, + "grad_norm": 43.93562170976813, + "learning_rate": 1.01375775639035e-06, + "loss": 1.4407, + "step": 28665 + }, + { + "epoch": 2.443194408932072, + "grad_norm": 23.61618924776944, + "learning_rate": 1.0134584568000444e-06, + "loss": 0.8726, + "step": 28666 + }, + { + "epoch": 2.4432796386260973, + "grad_norm": 28.475897174701085, + "learning_rate": 1.01315919641579e-06, + "loss": 0.7924, + "step": 28667 + }, + { + "epoch": 2.443364868320123, + "grad_norm": 79.03238881802676, + "learning_rate": 1.0128599752405299e-06, + "loss": 1.5009, + "step": 28668 + }, + { + "epoch": 2.4434500980141483, + "grad_norm": 43.629054909685856, + "learning_rate": 1.0125607932772085e-06, + "loss": 1.5135, + "step": 28669 + }, + { + "epoch": 2.4435353277081737, + "grad_norm": 59.35817917473319, + "learning_rate": 1.0122616505287663e-06, + "loss": 1.766, + "step": 28670 + }, + { + "epoch": 2.4436205574021987, + "grad_norm": 56.24363895408978, + "learning_rate": 1.0119625469981475e-06, + "loss": 1.7417, + "step": 28671 + }, + { + "epoch": 2.443705787096224, + "grad_norm": 34.64521338082261, + "learning_rate": 1.011663482688291e-06, + "loss": 1.1503, + "step": 28672 + }, + { + "epoch": 2.4437910167902497, + "grad_norm": 49.96720543238079, + "learning_rate": 1.0113644576021404e-06, + "loss": 2.1886, + "step": 28673 + }, + { + "epoch": 2.443876246484275, + "grad_norm": 44.841030892384495, + "learning_rate": 1.0110654717426354e-06, + "loss": 1.5636, + "step": 28674 + }, + { + "epoch": 2.4439614761783006, + "grad_norm": 34.898346243316325, + "learning_rate": 1.010766525112717e-06, + "loss": 1.1469, + "step": 28675 + }, + { + "epoch": 2.444046705872326, + "grad_norm": 55.441327564165995, + "learning_rate": 1.0104676177153228e-06, + "loss": 1.4056, + "step": 28676 + }, + { + "epoch": 2.444131935566351, + "grad_norm": 60.25154817927687, + "learning_rate": 1.0101687495533958e-06, + "loss": 1.6739, + "step": 28677 + }, + { + "epoch": 2.4442171652603766, + "grad_norm": 26.872485673273097, + "learning_rate": 1.0098699206298723e-06, + "loss": 0.9451, + "step": 28678 + }, + { + "epoch": 2.444302394954402, + "grad_norm": 26.74874691244667, + "learning_rate": 1.009571130947694e-06, + "loss": 1.1362, + "step": 28679 + }, + { + "epoch": 2.4443876246484275, + "grad_norm": 38.49552890467126, + "learning_rate": 1.009272380509797e-06, + "loss": 0.9206, + "step": 28680 + }, + { + "epoch": 2.444472854342453, + "grad_norm": 71.44786266028142, + "learning_rate": 1.0089736693191214e-06, + "loss": 1.3626, + "step": 28681 + }, + { + "epoch": 2.4445580840364785, + "grad_norm": 44.60434018327751, + "learning_rate": 1.0086749973786043e-06, + "loss": 1.2702, + "step": 28682 + }, + { + "epoch": 2.444643313730504, + "grad_norm": 53.86924654611028, + "learning_rate": 1.008376364691182e-06, + "loss": 1.3946, + "step": 28683 + }, + { + "epoch": 2.444728543424529, + "grad_norm": 69.6352989633979, + "learning_rate": 1.0080777712597923e-06, + "loss": 1.1434, + "step": 28684 + }, + { + "epoch": 2.4448137731185544, + "grad_norm": 37.91270146198015, + "learning_rate": 1.0077792170873702e-06, + "loss": 1.3513, + "step": 28685 + }, + { + "epoch": 2.44489900281258, + "grad_norm": 67.53466723313312, + "learning_rate": 1.0074807021768528e-06, + "loss": 1.8048, + "step": 28686 + }, + { + "epoch": 2.4449842325066053, + "grad_norm": 31.334398369598198, + "learning_rate": 1.0071822265311775e-06, + "loss": 0.8641, + "step": 28687 + }, + { + "epoch": 2.445069462200631, + "grad_norm": 67.08314137737324, + "learning_rate": 1.006883790153279e-06, + "loss": 1.7028, + "step": 28688 + }, + { + "epoch": 2.4451546918946563, + "grad_norm": 75.16507714306476, + "learning_rate": 1.0065853930460901e-06, + "loss": 2.6588, + "step": 28689 + }, + { + "epoch": 2.4452399215886813, + "grad_norm": 51.78605881429039, + "learning_rate": 1.0062870352125482e-06, + "loss": 1.6631, + "step": 28690 + }, + { + "epoch": 2.4453251512827068, + "grad_norm": 47.474958623350496, + "learning_rate": 1.0059887166555865e-06, + "loss": 1.3507, + "step": 28691 + }, + { + "epoch": 2.4454103809767322, + "grad_norm": 58.35104179182081, + "learning_rate": 1.005690437378139e-06, + "loss": 1.3632, + "step": 28692 + }, + { + "epoch": 2.4454956106707577, + "grad_norm": 65.0492485146157, + "learning_rate": 1.005392197383137e-06, + "loss": 1.9903, + "step": 28693 + }, + { + "epoch": 2.445580840364783, + "grad_norm": 65.64726411990755, + "learning_rate": 1.0050939966735157e-06, + "loss": 1.5515, + "step": 28694 + }, + { + "epoch": 2.4456660700588086, + "grad_norm": 23.16191267142204, + "learning_rate": 1.004795835252209e-06, + "loss": 0.6911, + "step": 28695 + }, + { + "epoch": 2.4457512997528337, + "grad_norm": 27.242712927151477, + "learning_rate": 1.0044977131221478e-06, + "loss": 1.4468, + "step": 28696 + }, + { + "epoch": 2.445836529446859, + "grad_norm": 45.275735442071046, + "learning_rate": 1.0041996302862632e-06, + "loss": 1.295, + "step": 28697 + }, + { + "epoch": 2.4459217591408846, + "grad_norm": 35.568029725913895, + "learning_rate": 1.0039015867474872e-06, + "loss": 1.31, + "step": 28698 + }, + { + "epoch": 2.44600698883491, + "grad_norm": 61.97545105019719, + "learning_rate": 1.003603582508752e-06, + "loss": 1.1437, + "step": 28699 + }, + { + "epoch": 2.4460922185289355, + "grad_norm": 58.47569859184338, + "learning_rate": 1.0033056175729872e-06, + "loss": 1.6826, + "step": 28700 + }, + { + "epoch": 2.446177448222961, + "grad_norm": 34.095705747377636, + "learning_rate": 1.0030076919431225e-06, + "loss": 1.1923, + "step": 28701 + }, + { + "epoch": 2.4462626779169865, + "grad_norm": 59.63723368984981, + "learning_rate": 1.002709805622089e-06, + "loss": 1.5881, + "step": 28702 + }, + { + "epoch": 2.4463479076110115, + "grad_norm": 54.15272946447018, + "learning_rate": 1.0024119586128172e-06, + "loss": 1.5855, + "step": 28703 + }, + { + "epoch": 2.446433137305037, + "grad_norm": 72.17243406301488, + "learning_rate": 1.0021141509182358e-06, + "loss": 1.5629, + "step": 28704 + }, + { + "epoch": 2.4465183669990624, + "grad_norm": 66.57981527550547, + "learning_rate": 1.001816382541273e-06, + "loss": 2.1149, + "step": 28705 + }, + { + "epoch": 2.446603596693088, + "grad_norm": 56.01270776238259, + "learning_rate": 1.001518653484856e-06, + "loss": 1.3622, + "step": 28706 + }, + { + "epoch": 2.4466888263871134, + "grad_norm": 71.18535215611904, + "learning_rate": 1.0012209637519156e-06, + "loss": 2.0778, + "step": 28707 + }, + { + "epoch": 2.446774056081139, + "grad_norm": 48.694944519283084, + "learning_rate": 1.0009233133453777e-06, + "loss": 1.767, + "step": 28708 + }, + { + "epoch": 2.4468592857751643, + "grad_norm": 53.94969703084257, + "learning_rate": 1.0006257022681688e-06, + "loss": 1.4291, + "step": 28709 + }, + { + "epoch": 2.4469445154691893, + "grad_norm": 36.321588919291074, + "learning_rate": 1.0003281305232181e-06, + "loss": 1.3218, + "step": 28710 + }, + { + "epoch": 2.447029745163215, + "grad_norm": 71.72757937245636, + "learning_rate": 1.00003059811345e-06, + "loss": 2.2624, + "step": 28711 + }, + { + "epoch": 2.4471149748572403, + "grad_norm": 59.42759845354898, + "learning_rate": 9.997331050417925e-07, + "loss": 1.9977, + "step": 28712 + }, + { + "epoch": 2.4472002045512657, + "grad_norm": 36.253700666947644, + "learning_rate": 9.994356513111702e-07, + "loss": 1.1876, + "step": 28713 + }, + { + "epoch": 2.447285434245291, + "grad_norm": 54.6466295176417, + "learning_rate": 9.99138236924509e-07, + "loss": 1.4929, + "step": 28714 + }, + { + "epoch": 2.4473706639393162, + "grad_norm": 28.575323609732262, + "learning_rate": 9.988408618847322e-07, + "loss": 1.1448, + "step": 28715 + }, + { + "epoch": 2.4474558936333417, + "grad_norm": 95.31184198360731, + "learning_rate": 9.985435261947667e-07, + "loss": 2.4986, + "step": 28716 + }, + { + "epoch": 2.447541123327367, + "grad_norm": 42.53903465089188, + "learning_rate": 9.982462298575342e-07, + "loss": 1.4935, + "step": 28717 + }, + { + "epoch": 2.4476263530213926, + "grad_norm": 56.23001216752434, + "learning_rate": 9.979489728759612e-07, + "loss": 1.887, + "step": 28718 + }, + { + "epoch": 2.447711582715418, + "grad_norm": 164.0471756214829, + "learning_rate": 9.976517552529685e-07, + "loss": 2.0768, + "step": 28719 + }, + { + "epoch": 2.4477968124094436, + "grad_norm": 62.314006522224325, + "learning_rate": 9.973545769914823e-07, + "loss": 1.3104, + "step": 28720 + }, + { + "epoch": 2.447882042103469, + "grad_norm": 85.55256774711513, + "learning_rate": 9.970574380944225e-07, + "loss": 2.0349, + "step": 28721 + }, + { + "epoch": 2.447967271797494, + "grad_norm": 52.57572065609315, + "learning_rate": 9.967603385647129e-07, + "loss": 1.2654, + "step": 28722 + }, + { + "epoch": 2.4480525014915195, + "grad_norm": 47.003111505041794, + "learning_rate": 9.96463278405273e-07, + "loss": 1.1919, + "step": 28723 + }, + { + "epoch": 2.448137731185545, + "grad_norm": 58.4585957834804, + "learning_rate": 9.961662576190267e-07, + "loss": 1.5657, + "step": 28724 + }, + { + "epoch": 2.4482229608795705, + "grad_norm": 67.42970858789225, + "learning_rate": 9.95869276208894e-07, + "loss": 1.5045, + "step": 28725 + }, + { + "epoch": 2.448308190573596, + "grad_norm": 37.51791124527585, + "learning_rate": 9.95572334177796e-07, + "loss": 1.2278, + "step": 28726 + }, + { + "epoch": 2.4483934202676214, + "grad_norm": 27.0512983753557, + "learning_rate": 9.95275431528654e-07, + "loss": 1.1126, + "step": 28727 + }, + { + "epoch": 2.448478649961647, + "grad_norm": 30.90537932638269, + "learning_rate": 9.949785682643848e-07, + "loss": 1.9168, + "step": 28728 + }, + { + "epoch": 2.448563879655672, + "grad_norm": 23.477436475621513, + "learning_rate": 9.946817443879115e-07, + "loss": 1.1304, + "step": 28729 + }, + { + "epoch": 2.4486491093496974, + "grad_norm": 72.22685561822094, + "learning_rate": 9.94384959902152e-07, + "loss": 2.141, + "step": 28730 + }, + { + "epoch": 2.448734339043723, + "grad_norm": 37.756369050465274, + "learning_rate": 9.940882148100246e-07, + "loss": 0.9225, + "step": 28731 + }, + { + "epoch": 2.4488195687377483, + "grad_norm": 100.80682259892677, + "learning_rate": 9.937915091144461e-07, + "loss": 0.961, + "step": 28732 + }, + { + "epoch": 2.4489047984317738, + "grad_norm": 35.75022469633037, + "learning_rate": 9.934948428183366e-07, + "loss": 1.0933, + "step": 28733 + }, + { + "epoch": 2.448990028125799, + "grad_norm": 67.29474002104203, + "learning_rate": 9.931982159246145e-07, + "loss": 1.6649, + "step": 28734 + }, + { + "epoch": 2.4490752578198243, + "grad_norm": 19.87251870855112, + "learning_rate": 9.929016284361953e-07, + "loss": 0.8697, + "step": 28735 + }, + { + "epoch": 2.4491604875138497, + "grad_norm": 76.86330577547207, + "learning_rate": 9.926050803559955e-07, + "loss": 2.1164, + "step": 28736 + }, + { + "epoch": 2.449245717207875, + "grad_norm": 53.50224584057851, + "learning_rate": 9.923085716869334e-07, + "loss": 1.1982, + "step": 28737 + }, + { + "epoch": 2.4493309469019007, + "grad_norm": 29.992184300365146, + "learning_rate": 9.920121024319246e-07, + "loss": 0.9612, + "step": 28738 + }, + { + "epoch": 2.449416176595926, + "grad_norm": 38.130519092499725, + "learning_rate": 9.917156725938837e-07, + "loss": 0.8811, + "step": 28739 + }, + { + "epoch": 2.4495014062899516, + "grad_norm": 58.985235156310196, + "learning_rate": 9.91419282175725e-07, + "loss": 1.3629, + "step": 28740 + }, + { + "epoch": 2.4495866359839766, + "grad_norm": 21.340967229203653, + "learning_rate": 9.911229311803649e-07, + "loss": 0.7852, + "step": 28741 + }, + { + "epoch": 2.449671865678002, + "grad_norm": 47.659788917766626, + "learning_rate": 9.908266196107197e-07, + "loss": 1.5368, + "step": 28742 + }, + { + "epoch": 2.4497570953720276, + "grad_norm": 52.2615922863472, + "learning_rate": 9.90530347469701e-07, + "loss": 1.5966, + "step": 28743 + }, + { + "epoch": 2.449842325066053, + "grad_norm": 67.94940036493739, + "learning_rate": 9.902341147602228e-07, + "loss": 1.8252, + "step": 28744 + }, + { + "epoch": 2.4499275547600785, + "grad_norm": 23.315282048780595, + "learning_rate": 9.899379214851983e-07, + "loss": 0.9466, + "step": 28745 + }, + { + "epoch": 2.450012784454104, + "grad_norm": 53.533627000770196, + "learning_rate": 9.896417676475417e-07, + "loss": 1.1066, + "step": 28746 + }, + { + "epoch": 2.4500980141481294, + "grad_norm": 25.768244077297933, + "learning_rate": 9.89345653250165e-07, + "loss": 0.8992, + "step": 28747 + }, + { + "epoch": 2.4501832438421545, + "grad_norm": 52.454858439920876, + "learning_rate": 9.890495782959785e-07, + "loss": 1.3627, + "step": 28748 + }, + { + "epoch": 2.45026847353618, + "grad_norm": 62.7942219078877, + "learning_rate": 9.887535427878959e-07, + "loss": 1.612, + "step": 28749 + }, + { + "epoch": 2.4503537032302054, + "grad_norm": 25.020191005805525, + "learning_rate": 9.884575467288294e-07, + "loss": 1.5198, + "step": 28750 + }, + { + "epoch": 2.450438932924231, + "grad_norm": 35.97045337384811, + "learning_rate": 9.88161590121689e-07, + "loss": 1.0813, + "step": 28751 + }, + { + "epoch": 2.4505241626182563, + "grad_norm": 66.68015766057778, + "learning_rate": 9.878656729693853e-07, + "loss": 1.7052, + "step": 28752 + }, + { + "epoch": 2.4506093923122814, + "grad_norm": 42.12218700745571, + "learning_rate": 9.87569795274827e-07, + "loss": 1.2034, + "step": 28753 + }, + { + "epoch": 2.450694622006307, + "grad_norm": 29.37957551430092, + "learning_rate": 9.872739570409262e-07, + "loss": 0.8423, + "step": 28754 + }, + { + "epoch": 2.4507798517003323, + "grad_norm": 53.86811286384522, + "learning_rate": 9.869781582705923e-07, + "loss": 1.1861, + "step": 28755 + }, + { + "epoch": 2.4508650813943578, + "grad_norm": 43.2106205230506, + "learning_rate": 9.866823989667317e-07, + "loss": 1.2757, + "step": 28756 + }, + { + "epoch": 2.4509503110883832, + "grad_norm": 81.759759897453, + "learning_rate": 9.863866791322568e-07, + "loss": 2.0033, + "step": 28757 + }, + { + "epoch": 2.4510355407824087, + "grad_norm": 28.883170157874524, + "learning_rate": 9.860909987700723e-07, + "loss": 0.5576, + "step": 28758 + }, + { + "epoch": 2.451120770476434, + "grad_norm": 49.90789630253302, + "learning_rate": 9.857953578830898e-07, + "loss": 1.36, + "step": 28759 + }, + { + "epoch": 2.451206000170459, + "grad_norm": 49.319634518389094, + "learning_rate": 9.854997564742142e-07, + "loss": 1.2973, + "step": 28760 + }, + { + "epoch": 2.4512912298644847, + "grad_norm": 38.99439562216185, + "learning_rate": 9.852041945463537e-07, + "loss": 0.9567, + "step": 28761 + }, + { + "epoch": 2.45137645955851, + "grad_norm": 36.11578180661586, + "learning_rate": 9.84908672102413e-07, + "loss": 1.1661, + "step": 28762 + }, + { + "epoch": 2.4514616892525356, + "grad_norm": 35.0942011653006, + "learning_rate": 9.846131891453015e-07, + "loss": 1.1251, + "step": 28763 + }, + { + "epoch": 2.451546918946561, + "grad_norm": 19.68766297125226, + "learning_rate": 9.84317745677923e-07, + "loss": 0.6532, + "step": 28764 + }, + { + "epoch": 2.4516321486405865, + "grad_norm": 85.72186630456169, + "learning_rate": 9.840223417031842e-07, + "loss": 1.6771, + "step": 28765 + }, + { + "epoch": 2.451717378334612, + "grad_norm": 52.007257304006046, + "learning_rate": 9.837269772239889e-07, + "loss": 1.7102, + "step": 28766 + }, + { + "epoch": 2.451802608028637, + "grad_norm": 36.88288757175083, + "learning_rate": 9.834316522432442e-07, + "loss": 1.1816, + "step": 28767 + }, + { + "epoch": 2.4518878377226625, + "grad_norm": 38.00177581079544, + "learning_rate": 9.831363667638532e-07, + "loss": 1.2761, + "step": 28768 + }, + { + "epoch": 2.451973067416688, + "grad_norm": 56.109119887629966, + "learning_rate": 9.8284112078872e-07, + "loss": 1.282, + "step": 28769 + }, + { + "epoch": 2.4520582971107134, + "grad_norm": 52.52928547353299, + "learning_rate": 9.825459143207477e-07, + "loss": 1.6431, + "step": 28770 + }, + { + "epoch": 2.452143526804739, + "grad_norm": 25.38773820464218, + "learning_rate": 9.82250747362839e-07, + "loss": 0.951, + "step": 28771 + }, + { + "epoch": 2.452228756498764, + "grad_norm": 68.06026865348974, + "learning_rate": 9.819556199178982e-07, + "loss": 2.0763, + "step": 28772 + }, + { + "epoch": 2.4523139861927894, + "grad_norm": 23.840485101243406, + "learning_rate": 9.816605319888279e-07, + "loss": 0.7459, + "step": 28773 + }, + { + "epoch": 2.452399215886815, + "grad_norm": 31.283306802361146, + "learning_rate": 9.813654835785297e-07, + "loss": 0.995, + "step": 28774 + }, + { + "epoch": 2.4524844455808403, + "grad_norm": 36.25186139555689, + "learning_rate": 9.810704746899035e-07, + "loss": 0.8024, + "step": 28775 + }, + { + "epoch": 2.452569675274866, + "grad_norm": 42.31533368340214, + "learning_rate": 9.807755053258538e-07, + "loss": 1.32, + "step": 28776 + }, + { + "epoch": 2.4526549049688913, + "grad_norm": 43.24907720852334, + "learning_rate": 9.804805754892798e-07, + "loss": 1.0881, + "step": 28777 + }, + { + "epoch": 2.4527401346629167, + "grad_norm": 43.829929526805905, + "learning_rate": 9.801856851830822e-07, + "loss": 1.744, + "step": 28778 + }, + { + "epoch": 2.4528253643569418, + "grad_norm": 26.623076456572395, + "learning_rate": 9.7989083441016e-07, + "loss": 1.0578, + "step": 28779 + }, + { + "epoch": 2.4529105940509672, + "grad_norm": 48.043607182283445, + "learning_rate": 9.795960231734137e-07, + "loss": 1.0423, + "step": 28780 + }, + { + "epoch": 2.4529958237449927, + "grad_norm": 62.10490889225202, + "learning_rate": 9.793012514757444e-07, + "loss": 1.8567, + "step": 28781 + }, + { + "epoch": 2.453081053439018, + "grad_norm": 58.161896368691494, + "learning_rate": 9.790065193200498e-07, + "loss": 1.4553, + "step": 28782 + }, + { + "epoch": 2.4531662831330436, + "grad_norm": 54.2163519607686, + "learning_rate": 9.787118267092276e-07, + "loss": 1.433, + "step": 28783 + }, + { + "epoch": 2.453251512827069, + "grad_norm": 24.596845448494477, + "learning_rate": 9.784171736461762e-07, + "loss": 0.7587, + "step": 28784 + }, + { + "epoch": 2.4533367425210946, + "grad_norm": 39.09202723622803, + "learning_rate": 9.781225601337947e-07, + "loss": 0.969, + "step": 28785 + }, + { + "epoch": 2.4534219722151196, + "grad_norm": 59.264381313357255, + "learning_rate": 9.778279861749801e-07, + "loss": 1.7555, + "step": 28786 + }, + { + "epoch": 2.453507201909145, + "grad_norm": 12.885556542123119, + "learning_rate": 9.775334517726275e-07, + "loss": 0.6638, + "step": 28787 + }, + { + "epoch": 2.4535924316031705, + "grad_norm": 86.67365965822931, + "learning_rate": 9.772389569296348e-07, + "loss": 2.1681, + "step": 28788 + }, + { + "epoch": 2.453677661297196, + "grad_norm": 46.35728334868405, + "learning_rate": 9.769445016489e-07, + "loss": 1.5612, + "step": 28789 + }, + { + "epoch": 2.4537628909912215, + "grad_norm": 16.812524628664285, + "learning_rate": 9.766500859333172e-07, + "loss": 0.6924, + "step": 28790 + }, + { + "epoch": 2.4538481206852465, + "grad_norm": 48.74549353435494, + "learning_rate": 9.763557097857823e-07, + "loss": 1.67, + "step": 28791 + }, + { + "epoch": 2.453933350379272, + "grad_norm": 73.86934194898934, + "learning_rate": 9.76061373209189e-07, + "loss": 1.3104, + "step": 28792 + }, + { + "epoch": 2.4540185800732974, + "grad_norm": 46.98889833823334, + "learning_rate": 9.757670762064343e-07, + "loss": 1.7002, + "step": 28793 + }, + { + "epoch": 2.454103809767323, + "grad_norm": 32.04635476881602, + "learning_rate": 9.754728187804114e-07, + "loss": 0.9122, + "step": 28794 + }, + { + "epoch": 2.4541890394613484, + "grad_norm": 63.154285636912896, + "learning_rate": 9.751786009340125e-07, + "loss": 1.6044, + "step": 28795 + }, + { + "epoch": 2.454274269155374, + "grad_norm": 56.58807774973684, + "learning_rate": 9.748844226701327e-07, + "loss": 1.099, + "step": 28796 + }, + { + "epoch": 2.4543594988493993, + "grad_norm": 31.85239255881495, + "learning_rate": 9.745902839916672e-07, + "loss": 1.1185, + "step": 28797 + }, + { + "epoch": 2.4544447285434243, + "grad_norm": 76.05601067191641, + "learning_rate": 9.74296184901506e-07, + "loss": 1.4359, + "step": 28798 + }, + { + "epoch": 2.45452995823745, + "grad_norm": 68.68538343165437, + "learning_rate": 9.740021254025423e-07, + "loss": 1.5263, + "step": 28799 + }, + { + "epoch": 2.4546151879314753, + "grad_norm": 32.553277747428176, + "learning_rate": 9.737081054976682e-07, + "loss": 0.8829, + "step": 28800 + }, + { + "epoch": 2.4547004176255007, + "grad_norm": 37.4198466227516, + "learning_rate": 9.734141251897732e-07, + "loss": 1.4319, + "step": 28801 + }, + { + "epoch": 2.454785647319526, + "grad_norm": 53.47331317221602, + "learning_rate": 9.731201844817523e-07, + "loss": 1.183, + "step": 28802 + }, + { + "epoch": 2.4548708770135517, + "grad_norm": 66.46958487133163, + "learning_rate": 9.728262833764924e-07, + "loss": 1.7951, + "step": 28803 + }, + { + "epoch": 2.454956106707577, + "grad_norm": 51.26897325713072, + "learning_rate": 9.725324218768872e-07, + "loss": 1.5345, + "step": 28804 + }, + { + "epoch": 2.455041336401602, + "grad_norm": 43.037841363424434, + "learning_rate": 9.72238599985824e-07, + "loss": 1.3558, + "step": 28805 + }, + { + "epoch": 2.4551265660956276, + "grad_norm": 40.9294892261465, + "learning_rate": 9.71944817706195e-07, + "loss": 1.2755, + "step": 28806 + }, + { + "epoch": 2.455211795789653, + "grad_norm": 64.54486058955125, + "learning_rate": 9.71651075040888e-07, + "loss": 1.7482, + "step": 28807 + }, + { + "epoch": 2.4552970254836786, + "grad_norm": 32.216277459507324, + "learning_rate": 9.71357371992792e-07, + "loss": 1.4766, + "step": 28808 + }, + { + "epoch": 2.455382255177704, + "grad_norm": 70.51859069707523, + "learning_rate": 9.710637085647945e-07, + "loss": 1.0924, + "step": 28809 + }, + { + "epoch": 2.4554674848717295, + "grad_norm": 11.43873548821421, + "learning_rate": 9.707700847597856e-07, + "loss": 0.6075, + "step": 28810 + }, + { + "epoch": 2.4555527145657545, + "grad_norm": 41.94059127196936, + "learning_rate": 9.704765005806506e-07, + "loss": 1.2664, + "step": 28811 + }, + { + "epoch": 2.45563794425978, + "grad_norm": 44.38111785155497, + "learning_rate": 9.70182956030279e-07, + "loss": 1.2723, + "step": 28812 + }, + { + "epoch": 2.4557231739538055, + "grad_norm": 23.42198415747038, + "learning_rate": 9.69889451111557e-07, + "loss": 0.8911, + "step": 28813 + }, + { + "epoch": 2.455808403647831, + "grad_norm": 58.50474699103713, + "learning_rate": 9.69595985827369e-07, + "loss": 1.6289, + "step": 28814 + }, + { + "epoch": 2.4558936333418564, + "grad_norm": 54.098751342635595, + "learning_rate": 9.69302560180605e-07, + "loss": 1.2375, + "step": 28815 + }, + { + "epoch": 2.455978863035882, + "grad_norm": 36.973740370363956, + "learning_rate": 9.690091741741487e-07, + "loss": 1.3137, + "step": 28816 + }, + { + "epoch": 2.456064092729907, + "grad_norm": 48.934626255459726, + "learning_rate": 9.687158278108848e-07, + "loss": 1.8589, + "step": 28817 + }, + { + "epoch": 2.4561493224239324, + "grad_norm": 20.926772544242723, + "learning_rate": 9.684225210936977e-07, + "loss": 0.5384, + "step": 28818 + }, + { + "epoch": 2.456234552117958, + "grad_norm": 57.07488938507194, + "learning_rate": 9.681292540254734e-07, + "loss": 1.4599, + "step": 28819 + }, + { + "epoch": 2.4563197818119833, + "grad_norm": 40.076087102763445, + "learning_rate": 9.678360266090968e-07, + "loss": 1.379, + "step": 28820 + }, + { + "epoch": 2.4564050115060088, + "grad_norm": 67.18552837734886, + "learning_rate": 9.675428388474512e-07, + "loss": 1.8066, + "step": 28821 + }, + { + "epoch": 2.4564902412000342, + "grad_norm": 335.6571193484536, + "learning_rate": 9.672496907434176e-07, + "loss": 1.3061, + "step": 28822 + }, + { + "epoch": 2.4565754708940597, + "grad_norm": 24.353467785186314, + "learning_rate": 9.669565822998821e-07, + "loss": 0.9909, + "step": 28823 + }, + { + "epoch": 2.4566607005880847, + "grad_norm": 87.87415109878093, + "learning_rate": 9.666635135197266e-07, + "loss": 2.266, + "step": 28824 + }, + { + "epoch": 2.45674593028211, + "grad_norm": 31.904603574207293, + "learning_rate": 9.66370484405832e-07, + "loss": 0.6859, + "step": 28825 + }, + { + "epoch": 2.4568311599761357, + "grad_norm": 42.63667353931643, + "learning_rate": 9.660774949610803e-07, + "loss": 0.9006, + "step": 28826 + }, + { + "epoch": 2.456916389670161, + "grad_norm": 71.32719644124894, + "learning_rate": 9.65784545188353e-07, + "loss": 1.2301, + "step": 28827 + }, + { + "epoch": 2.4570016193641866, + "grad_norm": 32.958853284494715, + "learning_rate": 9.65491635090533e-07, + "loss": 1.2018, + "step": 28828 + }, + { + "epoch": 2.457086849058212, + "grad_norm": 46.42231212319049, + "learning_rate": 9.651987646704997e-07, + "loss": 1.3726, + "step": 28829 + }, + { + "epoch": 2.457172078752237, + "grad_norm": 31.525973267634058, + "learning_rate": 9.649059339311335e-07, + "loss": 1.1162, + "step": 28830 + }, + { + "epoch": 2.4572573084462626, + "grad_norm": 49.90981852508732, + "learning_rate": 9.646131428753125e-07, + "loss": 1.2027, + "step": 28831 + }, + { + "epoch": 2.457342538140288, + "grad_norm": 64.556078604945, + "learning_rate": 9.64320391505919e-07, + "loss": 1.3365, + "step": 28832 + }, + { + "epoch": 2.4574277678343135, + "grad_norm": 26.067591822670558, + "learning_rate": 9.640276798258314e-07, + "loss": 0.8742, + "step": 28833 + }, + { + "epoch": 2.457512997528339, + "grad_norm": 51.22105765706913, + "learning_rate": 9.637350078379259e-07, + "loss": 1.0026, + "step": 28834 + }, + { + "epoch": 2.4575982272223644, + "grad_norm": 46.52850059206715, + "learning_rate": 9.63442375545083e-07, + "loss": 1.3812, + "step": 28835 + }, + { + "epoch": 2.4576834569163895, + "grad_norm": 60.53611911727486, + "learning_rate": 9.631497829501813e-07, + "loss": 1.3456, + "step": 28836 + }, + { + "epoch": 2.457768686610415, + "grad_norm": 30.237906894936085, + "learning_rate": 9.62857230056098e-07, + "loss": 1.2462, + "step": 28837 + }, + { + "epoch": 2.4578539163044404, + "grad_norm": 24.986850769084263, + "learning_rate": 9.625647168657092e-07, + "loss": 1.3568, + "step": 28838 + }, + { + "epoch": 2.457939145998466, + "grad_norm": 33.73610437670093, + "learning_rate": 9.62272243381891e-07, + "loss": 1.2838, + "step": 28839 + }, + { + "epoch": 2.4580243756924913, + "grad_norm": 52.12047247814605, + "learning_rate": 9.619798096075217e-07, + "loss": 0.9964, + "step": 28840 + }, + { + "epoch": 2.458109605386517, + "grad_norm": 67.21247037160653, + "learning_rate": 9.61687415545477e-07, + "loss": 1.8899, + "step": 28841 + }, + { + "epoch": 2.4581948350805423, + "grad_norm": 22.117728403001394, + "learning_rate": 9.6139506119863e-07, + "loss": 0.7181, + "step": 28842 + }, + { + "epoch": 2.4582800647745673, + "grad_norm": 52.06684103549424, + "learning_rate": 9.611027465698596e-07, + "loss": 1.6782, + "step": 28843 + }, + { + "epoch": 2.4583652944685928, + "grad_norm": 40.10613677299559, + "learning_rate": 9.608104716620375e-07, + "loss": 1.0592, + "step": 28844 + }, + { + "epoch": 2.4584505241626182, + "grad_norm": 50.29043188792265, + "learning_rate": 9.605182364780402e-07, + "loss": 1.058, + "step": 28845 + }, + { + "epoch": 2.4585357538566437, + "grad_norm": 57.6300513977289, + "learning_rate": 9.602260410207416e-07, + "loss": 1.8826, + "step": 28846 + }, + { + "epoch": 2.458620983550669, + "grad_norm": 33.17962796666033, + "learning_rate": 9.59933885293014e-07, + "loss": 1.4839, + "step": 28847 + }, + { + "epoch": 2.4587062132446946, + "grad_norm": 46.67236839920949, + "learning_rate": 9.596417692977305e-07, + "loss": 1.2205, + "step": 28848 + }, + { + "epoch": 2.45879144293872, + "grad_norm": 41.87502185295586, + "learning_rate": 9.593496930377654e-07, + "loss": 1.0957, + "step": 28849 + }, + { + "epoch": 2.458876672632745, + "grad_norm": 55.91400189157694, + "learning_rate": 9.590576565159898e-07, + "loss": 1.5402, + "step": 28850 + }, + { + "epoch": 2.4589619023267706, + "grad_norm": 43.83875935937201, + "learning_rate": 9.587656597352774e-07, + "loss": 1.3191, + "step": 28851 + }, + { + "epoch": 2.459047132020796, + "grad_norm": 45.61641383166733, + "learning_rate": 9.58473702698498e-07, + "loss": 1.0568, + "step": 28852 + }, + { + "epoch": 2.4591323617148215, + "grad_norm": 90.64890824557553, + "learning_rate": 9.581817854085245e-07, + "loss": 2.4376, + "step": 28853 + }, + { + "epoch": 2.459217591408847, + "grad_norm": 46.61045622226876, + "learning_rate": 9.578899078682275e-07, + "loss": 0.9171, + "step": 28854 + }, + { + "epoch": 2.459302821102872, + "grad_norm": 69.89278717194838, + "learning_rate": 9.575980700804772e-07, + "loss": 2.1804, + "step": 28855 + }, + { + "epoch": 2.4593880507968975, + "grad_norm": 48.282630907022124, + "learning_rate": 9.57306272048143e-07, + "loss": 1.4939, + "step": 28856 + }, + { + "epoch": 2.459473280490923, + "grad_norm": 121.99421586690191, + "learning_rate": 9.570145137740944e-07, + "loss": 2.7112, + "step": 28857 + }, + { + "epoch": 2.4595585101849484, + "grad_norm": 47.4576985630521, + "learning_rate": 9.567227952612013e-07, + "loss": 1.1632, + "step": 28858 + }, + { + "epoch": 2.459643739878974, + "grad_norm": 43.84326716313504, + "learning_rate": 9.56431116512334e-07, + "loss": 1.4913, + "step": 28859 + }, + { + "epoch": 2.4597289695729994, + "grad_norm": 16.169575112212247, + "learning_rate": 9.561394775303606e-07, + "loss": 0.4942, + "step": 28860 + }, + { + "epoch": 2.459814199267025, + "grad_norm": 47.057460113849665, + "learning_rate": 9.55847878318147e-07, + "loss": 1.4406, + "step": 28861 + }, + { + "epoch": 2.45989942896105, + "grad_norm": 73.34239997648528, + "learning_rate": 9.555563188785633e-07, + "loss": 1.745, + "step": 28862 + }, + { + "epoch": 2.4599846586550753, + "grad_norm": 30.282956608258775, + "learning_rate": 9.552647992144765e-07, + "loss": 0.7837, + "step": 28863 + }, + { + "epoch": 2.460069888349101, + "grad_norm": 69.27822025554966, + "learning_rate": 9.54973319328753e-07, + "loss": 1.6204, + "step": 28864 + }, + { + "epoch": 2.4601551180431263, + "grad_norm": 67.58986699708069, + "learning_rate": 9.546818792242585e-07, + "loss": 1.5834, + "step": 28865 + }, + { + "epoch": 2.4602403477371517, + "grad_norm": 38.53049549737511, + "learning_rate": 9.543904789038599e-07, + "loss": 1.3194, + "step": 28866 + }, + { + "epoch": 2.460325577431177, + "grad_norm": 52.95444471476133, + "learning_rate": 9.540991183704252e-07, + "loss": 1.5077, + "step": 28867 + }, + { + "epoch": 2.4604108071252027, + "grad_norm": 46.514641873783745, + "learning_rate": 9.538077976268178e-07, + "loss": 1.5997, + "step": 28868 + }, + { + "epoch": 2.4604960368192277, + "grad_norm": 57.876826304545474, + "learning_rate": 9.535165166759014e-07, + "loss": 1.305, + "step": 28869 + }, + { + "epoch": 2.460581266513253, + "grad_norm": 81.1531828791738, + "learning_rate": 9.532252755205435e-07, + "loss": 1.5226, + "step": 28870 + }, + { + "epoch": 2.4606664962072786, + "grad_norm": 43.50575787218802, + "learning_rate": 9.529340741636073e-07, + "loss": 1.7062, + "step": 28871 + }, + { + "epoch": 2.460751725901304, + "grad_norm": 42.48357515553102, + "learning_rate": 9.526429126079561e-07, + "loss": 1.2085, + "step": 28872 + }, + { + "epoch": 2.4608369555953296, + "grad_norm": 74.58328206147726, + "learning_rate": 9.523517908564523e-07, + "loss": 1.9047, + "step": 28873 + }, + { + "epoch": 2.4609221852893546, + "grad_norm": 63.35841856286861, + "learning_rate": 9.520607089119599e-07, + "loss": 1.9251, + "step": 28874 + }, + { + "epoch": 2.46100741498338, + "grad_norm": 25.498641286834953, + "learning_rate": 9.517696667773435e-07, + "loss": 0.9495, + "step": 28875 + }, + { + "epoch": 2.4610926446774055, + "grad_norm": 66.57239718245795, + "learning_rate": 9.514786644554641e-07, + "loss": 2.1524, + "step": 28876 + }, + { + "epoch": 2.461177874371431, + "grad_norm": 16.015543327300577, + "learning_rate": 9.511877019491833e-07, + "loss": 0.5682, + "step": 28877 + }, + { + "epoch": 2.4612631040654565, + "grad_norm": 33.96439518772216, + "learning_rate": 9.508967792613611e-07, + "loss": 1.3604, + "step": 28878 + }, + { + "epoch": 2.461348333759482, + "grad_norm": 76.63700125007306, + "learning_rate": 9.506058963948611e-07, + "loss": 1.4439, + "step": 28879 + }, + { + "epoch": 2.4614335634535074, + "grad_norm": 54.55925845266076, + "learning_rate": 9.503150533525435e-07, + "loss": 1.3829, + "step": 28880 + }, + { + "epoch": 2.4615187931475324, + "grad_norm": 14.419199707567817, + "learning_rate": 9.50024250137267e-07, + "loss": 0.6428, + "step": 28881 + }, + { + "epoch": 2.461604022841558, + "grad_norm": 35.23243281704218, + "learning_rate": 9.49733486751892e-07, + "loss": 1.0799, + "step": 28882 + }, + { + "epoch": 2.4616892525355833, + "grad_norm": 66.69685105870447, + "learning_rate": 9.494427631992809e-07, + "loss": 1.0608, + "step": 28883 + }, + { + "epoch": 2.461774482229609, + "grad_norm": 30.927216019026297, + "learning_rate": 9.4915207948229e-07, + "loss": 0.982, + "step": 28884 + }, + { + "epoch": 2.4618597119236343, + "grad_norm": 50.72389570800003, + "learning_rate": 9.48861435603779e-07, + "loss": 1.7401, + "step": 28885 + }, + { + "epoch": 2.4619449416176598, + "grad_norm": 32.55171622790586, + "learning_rate": 9.485708315666065e-07, + "loss": 1.2399, + "step": 28886 + }, + { + "epoch": 2.4620301713116852, + "grad_norm": 39.21570832434589, + "learning_rate": 9.48280267373628e-07, + "loss": 1.0405, + "step": 28887 + }, + { + "epoch": 2.4621154010057102, + "grad_norm": 59.65048668123778, + "learning_rate": 9.479897430277047e-07, + "loss": 1.586, + "step": 28888 + }, + { + "epoch": 2.4622006306997357, + "grad_norm": 73.72070397008085, + "learning_rate": 9.476992585316907e-07, + "loss": 2.6528, + "step": 28889 + }, + { + "epoch": 2.462285860393761, + "grad_norm": 47.329918024930905, + "learning_rate": 9.474088138884458e-07, + "loss": 1.3465, + "step": 28890 + }, + { + "epoch": 2.4623710900877867, + "grad_norm": 25.712868494938323, + "learning_rate": 9.471184091008235e-07, + "loss": 0.6725, + "step": 28891 + }, + { + "epoch": 2.462456319781812, + "grad_norm": 42.72336360179191, + "learning_rate": 9.468280441716821e-07, + "loss": 1.5806, + "step": 28892 + }, + { + "epoch": 2.462541549475837, + "grad_norm": 39.00049241409886, + "learning_rate": 9.465377191038766e-07, + "loss": 1.0556, + "step": 28893 + }, + { + "epoch": 2.4626267791698626, + "grad_norm": 39.12428726180727, + "learning_rate": 9.462474339002619e-07, + "loss": 1.1165, + "step": 28894 + }, + { + "epoch": 2.462712008863888, + "grad_norm": 77.48623496272927, + "learning_rate": 9.459571885636915e-07, + "loss": 2.4896, + "step": 28895 + }, + { + "epoch": 2.4627972385579135, + "grad_norm": 37.23539262740777, + "learning_rate": 9.456669830970227e-07, + "loss": 1.2015, + "step": 28896 + }, + { + "epoch": 2.462882468251939, + "grad_norm": 31.7270516492244, + "learning_rate": 9.453768175031069e-07, + "loss": 1.282, + "step": 28897 + }, + { + "epoch": 2.4629676979459645, + "grad_norm": 33.77980617779378, + "learning_rate": 9.450866917848e-07, + "loss": 0.7021, + "step": 28898 + }, + { + "epoch": 2.46305292763999, + "grad_norm": 69.02132288322491, + "learning_rate": 9.447966059449531e-07, + "loss": 1.8361, + "step": 28899 + }, + { + "epoch": 2.463138157334015, + "grad_norm": 31.328309652036936, + "learning_rate": 9.445065599864217e-07, + "loss": 1.1718, + "step": 28900 + }, + { + "epoch": 2.4632233870280404, + "grad_norm": 53.051718708573276, + "learning_rate": 9.442165539120563e-07, + "loss": 1.2872, + "step": 28901 + }, + { + "epoch": 2.463308616722066, + "grad_norm": 41.36907662715337, + "learning_rate": 9.439265877247095e-07, + "loss": 1.0182, + "step": 28902 + }, + { + "epoch": 2.4633938464160914, + "grad_norm": 31.8754974407405, + "learning_rate": 9.436366614272335e-07, + "loss": 1.4197, + "step": 28903 + }, + { + "epoch": 2.463479076110117, + "grad_norm": 32.42267317449867, + "learning_rate": 9.433467750224773e-07, + "loss": 0.8838, + "step": 28904 + }, + { + "epoch": 2.4635643058041423, + "grad_norm": 66.93575973135485, + "learning_rate": 9.43056928513294e-07, + "loss": 2.418, + "step": 28905 + }, + { + "epoch": 2.463649535498168, + "grad_norm": 59.820006237656855, + "learning_rate": 9.427671219025347e-07, + "loss": 1.2698, + "step": 28906 + }, + { + "epoch": 2.463734765192193, + "grad_norm": 77.9251932005388, + "learning_rate": 9.424773551930489e-07, + "loss": 2.1507, + "step": 28907 + }, + { + "epoch": 2.4638199948862183, + "grad_norm": 58.73135044563093, + "learning_rate": 9.421876283876841e-07, + "loss": 1.858, + "step": 28908 + }, + { + "epoch": 2.4639052245802437, + "grad_norm": 40.57631898849016, + "learning_rate": 9.418979414892937e-07, + "loss": 1.8364, + "step": 28909 + }, + { + "epoch": 2.463990454274269, + "grad_norm": 37.48114089680739, + "learning_rate": 9.416082945007238e-07, + "loss": 1.4433, + "step": 28910 + }, + { + "epoch": 2.4640756839682947, + "grad_norm": 29.47439001247065, + "learning_rate": 9.413186874248242e-07, + "loss": 0.7862, + "step": 28911 + }, + { + "epoch": 2.4641609136623197, + "grad_norm": 74.8664343521228, + "learning_rate": 9.410291202644405e-07, + "loss": 1.8079, + "step": 28912 + }, + { + "epoch": 2.464246143356345, + "grad_norm": 57.42647127608769, + "learning_rate": 9.407395930224234e-07, + "loss": 1.3399, + "step": 28913 + }, + { + "epoch": 2.4643313730503706, + "grad_norm": 57.32044812790952, + "learning_rate": 9.404501057016203e-07, + "loss": 1.3935, + "step": 28914 + }, + { + "epoch": 2.464416602744396, + "grad_norm": 70.80592472023424, + "learning_rate": 9.401606583048773e-07, + "loss": 2.1598, + "step": 28915 + }, + { + "epoch": 2.4645018324384216, + "grad_norm": 61.07592969918464, + "learning_rate": 9.398712508350411e-07, + "loss": 1.6732, + "step": 28916 + }, + { + "epoch": 2.464587062132447, + "grad_norm": 27.03828992128203, + "learning_rate": 9.395818832949566e-07, + "loss": 0.9651, + "step": 28917 + }, + { + "epoch": 2.4646722918264725, + "grad_norm": 44.37591720905975, + "learning_rate": 9.392925556874716e-07, + "loss": 1.3896, + "step": 28918 + }, + { + "epoch": 2.4647575215204975, + "grad_norm": 85.57115165502286, + "learning_rate": 9.390032680154315e-07, + "loss": 2.1482, + "step": 28919 + }, + { + "epoch": 2.464842751214523, + "grad_norm": 39.7677358517215, + "learning_rate": 9.387140202816785e-07, + "loss": 1.491, + "step": 28920 + }, + { + "epoch": 2.4649279809085485, + "grad_norm": 44.97126292263794, + "learning_rate": 9.384248124890599e-07, + "loss": 1.4084, + "step": 28921 + }, + { + "epoch": 2.465013210602574, + "grad_norm": 85.24105793449505, + "learning_rate": 9.381356446404205e-07, + "loss": 2.0534, + "step": 28922 + }, + { + "epoch": 2.4650984402965994, + "grad_norm": 29.179086053755643, + "learning_rate": 9.37846516738603e-07, + "loss": 0.9196, + "step": 28923 + }, + { + "epoch": 2.465183669990625, + "grad_norm": 60.34980662693396, + "learning_rate": 9.375574287864508e-07, + "loss": 1.5028, + "step": 28924 + }, + { + "epoch": 2.4652688996846503, + "grad_norm": 50.38318115820093, + "learning_rate": 9.37268380786806e-07, + "loss": 1.398, + "step": 28925 + }, + { + "epoch": 2.4653541293786754, + "grad_norm": 65.72059362586654, + "learning_rate": 9.36979372742513e-07, + "loss": 1.5992, + "step": 28926 + }, + { + "epoch": 2.465439359072701, + "grad_norm": 39.81290724738619, + "learning_rate": 9.36690404656414e-07, + "loss": 1.3801, + "step": 28927 + }, + { + "epoch": 2.4655245887667263, + "grad_norm": 24.86880286232067, + "learning_rate": 9.36401476531349e-07, + "loss": 0.9895, + "step": 28928 + }, + { + "epoch": 2.465609818460752, + "grad_norm": 68.94879611827584, + "learning_rate": 9.361125883701616e-07, + "loss": 1.6749, + "step": 28929 + }, + { + "epoch": 2.4656950481547772, + "grad_norm": 40.34002454106039, + "learning_rate": 9.358237401756909e-07, + "loss": 1.3558, + "step": 28930 + }, + { + "epoch": 2.4657802778488027, + "grad_norm": 83.31086075388808, + "learning_rate": 9.3553493195078e-07, + "loss": 2.2693, + "step": 28931 + }, + { + "epoch": 2.4658655075428277, + "grad_norm": 55.974344285482694, + "learning_rate": 9.352461636982685e-07, + "loss": 1.8963, + "step": 28932 + }, + { + "epoch": 2.465950737236853, + "grad_norm": 30.040627235467735, + "learning_rate": 9.349574354209956e-07, + "loss": 1.0969, + "step": 28933 + }, + { + "epoch": 2.4660359669308787, + "grad_norm": 32.92670228915105, + "learning_rate": 9.346687471217992e-07, + "loss": 1.2601, + "step": 28934 + }, + { + "epoch": 2.466121196624904, + "grad_norm": 68.57506894445052, + "learning_rate": 9.343800988035217e-07, + "loss": 1.5612, + "step": 28935 + }, + { + "epoch": 2.4662064263189296, + "grad_norm": 25.518881218794732, + "learning_rate": 9.340914904689996e-07, + "loss": 0.6509, + "step": 28936 + }, + { + "epoch": 2.466291656012955, + "grad_norm": 63.88024238720741, + "learning_rate": 9.338029221210732e-07, + "loss": 1.5238, + "step": 28937 + }, + { + "epoch": 2.46637688570698, + "grad_norm": 122.21575527127506, + "learning_rate": 9.335143937625779e-07, + "loss": 2.2851, + "step": 28938 + }, + { + "epoch": 2.4664621154010056, + "grad_norm": 23.003277768100972, + "learning_rate": 9.332259053963538e-07, + "loss": 0.6411, + "step": 28939 + }, + { + "epoch": 2.466547345095031, + "grad_norm": 70.77535395712309, + "learning_rate": 9.329374570252375e-07, + "loss": 2.2097, + "step": 28940 + }, + { + "epoch": 2.4666325747890565, + "grad_norm": 37.891726498111424, + "learning_rate": 9.326490486520645e-07, + "loss": 0.7914, + "step": 28941 + }, + { + "epoch": 2.466717804483082, + "grad_norm": 84.04981340305463, + "learning_rate": 9.323606802796708e-07, + "loss": 1.9124, + "step": 28942 + }, + { + "epoch": 2.4668030341771074, + "grad_norm": 43.48559800617687, + "learning_rate": 9.320723519108954e-07, + "loss": 1.1198, + "step": 28943 + }, + { + "epoch": 2.466888263871133, + "grad_norm": 64.36232865707943, + "learning_rate": 9.317840635485703e-07, + "loss": 1.8351, + "step": 28944 + }, + { + "epoch": 2.466973493565158, + "grad_norm": 50.052237799651685, + "learning_rate": 9.314958151955334e-07, + "loss": 1.6965, + "step": 28945 + }, + { + "epoch": 2.4670587232591834, + "grad_norm": 36.87407558203926, + "learning_rate": 9.312076068546189e-07, + "loss": 0.9903, + "step": 28946 + }, + { + "epoch": 2.467143952953209, + "grad_norm": 58.28445600291377, + "learning_rate": 9.30919438528659e-07, + "loss": 1.2912, + "step": 28947 + }, + { + "epoch": 2.4672291826472343, + "grad_norm": 42.68815975107112, + "learning_rate": 9.306313102204911e-07, + "loss": 1.1712, + "step": 28948 + }, + { + "epoch": 2.46731441234126, + "grad_norm": 48.40127875437208, + "learning_rate": 9.303432219329473e-07, + "loss": 1.496, + "step": 28949 + }, + { + "epoch": 2.4673996420352853, + "grad_norm": 64.25777077008756, + "learning_rate": 9.300551736688606e-07, + "loss": 1.5496, + "step": 28950 + }, + { + "epoch": 2.4674848717293103, + "grad_norm": 44.187236358890296, + "learning_rate": 9.297671654310631e-07, + "loss": 1.1584, + "step": 28951 + }, + { + "epoch": 2.4675701014233358, + "grad_norm": 60.845672527312864, + "learning_rate": 9.294791972223883e-07, + "loss": 2.1013, + "step": 28952 + }, + { + "epoch": 2.4676553311173612, + "grad_norm": 21.634958812991247, + "learning_rate": 9.29191269045669e-07, + "loss": 0.9105, + "step": 28953 + }, + { + "epoch": 2.4677405608113867, + "grad_norm": 56.09184218994872, + "learning_rate": 9.289033809037362e-07, + "loss": 1.5444, + "step": 28954 + }, + { + "epoch": 2.467825790505412, + "grad_norm": 60.98924956293462, + "learning_rate": 9.286155327994195e-07, + "loss": 1.5753, + "step": 28955 + }, + { + "epoch": 2.4679110201994376, + "grad_norm": 74.9724876468142, + "learning_rate": 9.283277247355527e-07, + "loss": 2.0652, + "step": 28956 + }, + { + "epoch": 2.4679962498934627, + "grad_norm": 59.24906526256352, + "learning_rate": 9.280399567149645e-07, + "loss": 1.5367, + "step": 28957 + }, + { + "epoch": 2.468081479587488, + "grad_norm": 25.802322379838447, + "learning_rate": 9.277522287404856e-07, + "loss": 0.7951, + "step": 28958 + }, + { + "epoch": 2.4681667092815136, + "grad_norm": 38.4714573183485, + "learning_rate": 9.274645408149436e-07, + "loss": 1.0084, + "step": 28959 + }, + { + "epoch": 2.468251938975539, + "grad_norm": 64.30205471162373, + "learning_rate": 9.271768929411701e-07, + "loss": 1.8869, + "step": 28960 + }, + { + "epoch": 2.4683371686695645, + "grad_norm": 59.10727179979787, + "learning_rate": 9.268892851219946e-07, + "loss": 1.5867, + "step": 28961 + }, + { + "epoch": 2.46842239836359, + "grad_norm": 73.63406163611523, + "learning_rate": 9.266017173602443e-07, + "loss": 1.2911, + "step": 28962 + }, + { + "epoch": 2.4685076280576155, + "grad_norm": 51.69095365169599, + "learning_rate": 9.263141896587479e-07, + "loss": 1.2248, + "step": 28963 + }, + { + "epoch": 2.4685928577516405, + "grad_norm": 79.85084133109223, + "learning_rate": 9.260267020203312e-07, + "loss": 1.8439, + "step": 28964 + }, + { + "epoch": 2.468678087445666, + "grad_norm": 34.1006958393668, + "learning_rate": 9.257392544478244e-07, + "loss": 0.912, + "step": 28965 + }, + { + "epoch": 2.4687633171396914, + "grad_norm": 91.70290380395929, + "learning_rate": 9.254518469440526e-07, + "loss": 1.675, + "step": 28966 + }, + { + "epoch": 2.468848546833717, + "grad_norm": 58.07985437623578, + "learning_rate": 9.251644795118425e-07, + "loss": 1.4374, + "step": 28967 + }, + { + "epoch": 2.4689337765277424, + "grad_norm": 79.68236636167735, + "learning_rate": 9.248771521540195e-07, + "loss": 2.1592, + "step": 28968 + }, + { + "epoch": 2.469019006221768, + "grad_norm": 68.35168808922603, + "learning_rate": 9.245898648734125e-07, + "loss": 1.7363, + "step": 28969 + }, + { + "epoch": 2.4691042359157933, + "grad_norm": 66.38798776075662, + "learning_rate": 9.243026176728443e-07, + "loss": 1.1062, + "step": 28970 + }, + { + "epoch": 2.4691894656098183, + "grad_norm": 53.30684024505048, + "learning_rate": 9.240154105551408e-07, + "loss": 1.7908, + "step": 28971 + }, + { + "epoch": 2.469274695303844, + "grad_norm": 64.93625899655777, + "learning_rate": 9.237282435231243e-07, + "loss": 1.488, + "step": 28972 + }, + { + "epoch": 2.4693599249978693, + "grad_norm": 24.199867266066274, + "learning_rate": 9.234411165796225e-07, + "loss": 0.9732, + "step": 28973 + }, + { + "epoch": 2.4694451546918947, + "grad_norm": 35.166766547391255, + "learning_rate": 9.231540297274572e-07, + "loss": 0.917, + "step": 28974 + }, + { + "epoch": 2.46953038438592, + "grad_norm": 25.96062840489954, + "learning_rate": 9.228669829694509e-07, + "loss": 0.8982, + "step": 28975 + }, + { + "epoch": 2.4696156140799452, + "grad_norm": 43.65045331684989, + "learning_rate": 9.225799763084287e-07, + "loss": 0.9917, + "step": 28976 + }, + { + "epoch": 2.4697008437739707, + "grad_norm": 62.60340920387857, + "learning_rate": 9.222930097472111e-07, + "loss": 1.4511, + "step": 28977 + }, + { + "epoch": 2.469786073467996, + "grad_norm": 57.69367237154513, + "learning_rate": 9.220060832886229e-07, + "loss": 1.4498, + "step": 28978 + }, + { + "epoch": 2.4698713031620216, + "grad_norm": 58.789925816205894, + "learning_rate": 9.217191969354844e-07, + "loss": 1.6437, + "step": 28979 + }, + { + "epoch": 2.469956532856047, + "grad_norm": 35.75342136575843, + "learning_rate": 9.21432350690617e-07, + "loss": 0.9327, + "step": 28980 + }, + { + "epoch": 2.4700417625500726, + "grad_norm": 60.91077320963822, + "learning_rate": 9.211455445568401e-07, + "loss": 1.6143, + "step": 28981 + }, + { + "epoch": 2.470126992244098, + "grad_norm": 59.0583831651153, + "learning_rate": 9.208587785369777e-07, + "loss": 1.5851, + "step": 28982 + }, + { + "epoch": 2.470212221938123, + "grad_norm": 41.58810523809885, + "learning_rate": 9.205720526338474e-07, + "loss": 1.2314, + "step": 28983 + }, + { + "epoch": 2.4702974516321485, + "grad_norm": 63.36183772273595, + "learning_rate": 9.202853668502704e-07, + "loss": 1.9506, + "step": 28984 + }, + { + "epoch": 2.470382681326174, + "grad_norm": 68.23332275027194, + "learning_rate": 9.199987211890648e-07, + "loss": 1.746, + "step": 28985 + }, + { + "epoch": 2.4704679110201995, + "grad_norm": 48.072829265832766, + "learning_rate": 9.19712115653052e-07, + "loss": 1.6001, + "step": 28986 + }, + { + "epoch": 2.470553140714225, + "grad_norm": 91.54948223401178, + "learning_rate": 9.194255502450488e-07, + "loss": 2.4357, + "step": 28987 + }, + { + "epoch": 2.4706383704082504, + "grad_norm": 44.23504293604472, + "learning_rate": 9.191390249678744e-07, + "loss": 0.9063, + "step": 28988 + }, + { + "epoch": 2.470723600102276, + "grad_norm": 78.34390404126145, + "learning_rate": 9.188525398243459e-07, + "loss": 2.1772, + "step": 28989 + }, + { + "epoch": 2.470808829796301, + "grad_norm": 58.77052437597074, + "learning_rate": 9.185660948172792e-07, + "loss": 0.9497, + "step": 28990 + }, + { + "epoch": 2.4708940594903264, + "grad_norm": 72.64013695995277, + "learning_rate": 9.182796899494939e-07, + "loss": 1.7477, + "step": 28991 + }, + { + "epoch": 2.470979289184352, + "grad_norm": 48.031640307551385, + "learning_rate": 9.179933252238066e-07, + "loss": 1.1847, + "step": 28992 + }, + { + "epoch": 2.4710645188783773, + "grad_norm": 35.95122562368399, + "learning_rate": 9.177070006430333e-07, + "loss": 1.4211, + "step": 28993 + }, + { + "epoch": 2.4711497485724028, + "grad_norm": 54.44429868520024, + "learning_rate": 9.17420716209988e-07, + "loss": 1.7646, + "step": 28994 + }, + { + "epoch": 2.471234978266428, + "grad_norm": 47.425117092781264, + "learning_rate": 9.171344719274894e-07, + "loss": 1.4895, + "step": 28995 + }, + { + "epoch": 2.4713202079604533, + "grad_norm": 74.75390923197035, + "learning_rate": 9.168482677983504e-07, + "loss": 2.1522, + "step": 28996 + }, + { + "epoch": 2.4714054376544787, + "grad_norm": 80.65360585365144, + "learning_rate": 9.165621038253864e-07, + "loss": 2.0757, + "step": 28997 + }, + { + "epoch": 2.471490667348504, + "grad_norm": 52.97729383619682, + "learning_rate": 9.162759800114102e-07, + "loss": 1.4394, + "step": 28998 + }, + { + "epoch": 2.4715758970425297, + "grad_norm": 54.23876685855995, + "learning_rate": 9.159898963592367e-07, + "loss": 1.4314, + "step": 28999 + }, + { + "epoch": 2.471661126736555, + "grad_norm": 55.33493059208818, + "learning_rate": 9.157038528716816e-07, + "loss": 1.3739, + "step": 29000 + }, + { + "epoch": 2.4717463564305806, + "grad_norm": 48.47274624060541, + "learning_rate": 9.154178495515559e-07, + "loss": 1.0299, + "step": 29001 + }, + { + "epoch": 2.4718315861246056, + "grad_norm": 51.24645073009527, + "learning_rate": 9.151318864016728e-07, + "loss": 0.9805, + "step": 29002 + }, + { + "epoch": 2.471916815818631, + "grad_norm": 46.03041650609756, + "learning_rate": 9.14845963424843e-07, + "loss": 1.155, + "step": 29003 + }, + { + "epoch": 2.4720020455126566, + "grad_norm": 31.496522048404294, + "learning_rate": 9.145600806238808e-07, + "loss": 1.0672, + "step": 29004 + }, + { + "epoch": 2.472087275206682, + "grad_norm": 23.34116255829512, + "learning_rate": 9.142742380015973e-07, + "loss": 0.7511, + "step": 29005 + }, + { + "epoch": 2.4721725049007075, + "grad_norm": 86.99699091961446, + "learning_rate": 9.139884355608014e-07, + "loss": 2.0524, + "step": 29006 + }, + { + "epoch": 2.472257734594733, + "grad_norm": 48.35024729795058, + "learning_rate": 9.137026733043064e-07, + "loss": 1.2482, + "step": 29007 + }, + { + "epoch": 2.4723429642887584, + "grad_norm": 62.02395696362016, + "learning_rate": 9.134169512349222e-07, + "loss": 1.579, + "step": 29008 + }, + { + "epoch": 2.4724281939827835, + "grad_norm": 52.816187918940635, + "learning_rate": 9.131312693554589e-07, + "loss": 1.1213, + "step": 29009 + }, + { + "epoch": 2.472513423676809, + "grad_norm": 88.77684795101108, + "learning_rate": 9.128456276687258e-07, + "loss": 1.9293, + "step": 29010 + }, + { + "epoch": 2.4725986533708344, + "grad_norm": 32.684878746190776, + "learning_rate": 9.125600261775303e-07, + "loss": 0.7597, + "step": 29011 + }, + { + "epoch": 2.47268388306486, + "grad_norm": 80.81634753924403, + "learning_rate": 9.122744648846842e-07, + "loss": 1.3649, + "step": 29012 + }, + { + "epoch": 2.4727691127588853, + "grad_norm": 21.528776128509676, + "learning_rate": 9.119889437929946e-07, + "loss": 1.0627, + "step": 29013 + }, + { + "epoch": 2.4728543424529104, + "grad_norm": 77.91544945742389, + "learning_rate": 9.117034629052679e-07, + "loss": 1.7923, + "step": 29014 + }, + { + "epoch": 2.472939572146936, + "grad_norm": 53.25723042364712, + "learning_rate": 9.11418022224313e-07, + "loss": 1.5259, + "step": 29015 + }, + { + "epoch": 2.4730248018409613, + "grad_norm": 53.6234422999696, + "learning_rate": 9.111326217529387e-07, + "loss": 1.5444, + "step": 29016 + }, + { + "epoch": 2.4731100315349868, + "grad_norm": 62.969996435675775, + "learning_rate": 9.108472614939501e-07, + "loss": 1.7796, + "step": 29017 + }, + { + "epoch": 2.4731952612290122, + "grad_norm": 81.64412953712225, + "learning_rate": 9.105619414501538e-07, + "loss": 1.6353, + "step": 29018 + }, + { + "epoch": 2.4732804909230377, + "grad_norm": 63.64557328054331, + "learning_rate": 9.102766616243557e-07, + "loss": 1.5076, + "step": 29019 + }, + { + "epoch": 2.473365720617063, + "grad_norm": 15.552639320871577, + "learning_rate": 9.099914220193606e-07, + "loss": 0.731, + "step": 29020 + }, + { + "epoch": 2.473450950311088, + "grad_norm": 46.355805383798234, + "learning_rate": 9.097062226379754e-07, + "loss": 1.5581, + "step": 29021 + }, + { + "epoch": 2.4735361800051137, + "grad_norm": 66.00209581412341, + "learning_rate": 9.094210634830036e-07, + "loss": 1.3837, + "step": 29022 + }, + { + "epoch": 2.473621409699139, + "grad_norm": 35.17227722441538, + "learning_rate": 9.091359445572512e-07, + "loss": 0.8743, + "step": 29023 + }, + { + "epoch": 2.4737066393931646, + "grad_norm": 60.389588383339586, + "learning_rate": 9.088508658635198e-07, + "loss": 1.5587, + "step": 29024 + }, + { + "epoch": 2.47379186908719, + "grad_norm": 37.48515621303395, + "learning_rate": 9.085658274046156e-07, + "loss": 1.091, + "step": 29025 + }, + { + "epoch": 2.4738770987812155, + "grad_norm": 67.69814314001472, + "learning_rate": 9.082808291833412e-07, + "loss": 1.4261, + "step": 29026 + }, + { + "epoch": 2.473962328475241, + "grad_norm": 44.975790169549406, + "learning_rate": 9.079958712024983e-07, + "loss": 1.4566, + "step": 29027 + }, + { + "epoch": 2.474047558169266, + "grad_norm": 72.5409104427672, + "learning_rate": 9.077109534648892e-07, + "loss": 2.0528, + "step": 29028 + }, + { + "epoch": 2.4741327878632915, + "grad_norm": 35.32892195862801, + "learning_rate": 9.074260759733184e-07, + "loss": 1.1663, + "step": 29029 + }, + { + "epoch": 2.474218017557317, + "grad_norm": 37.39374388197342, + "learning_rate": 9.071412387305839e-07, + "loss": 1.4002, + "step": 29030 + }, + { + "epoch": 2.4743032472513424, + "grad_norm": 62.46204994525475, + "learning_rate": 9.068564417394909e-07, + "loss": 1.7291, + "step": 29031 + }, + { + "epoch": 2.474388476945368, + "grad_norm": 37.93406939659558, + "learning_rate": 9.065716850028378e-07, + "loss": 0.9899, + "step": 29032 + }, + { + "epoch": 2.474473706639393, + "grad_norm": 69.57367041386206, + "learning_rate": 9.062869685234244e-07, + "loss": 1.8389, + "step": 29033 + }, + { + "epoch": 2.4745589363334184, + "grad_norm": 37.47842247434475, + "learning_rate": 9.060022923040534e-07, + "loss": 0.8879, + "step": 29034 + }, + { + "epoch": 2.474644166027444, + "grad_norm": 32.84142107894703, + "learning_rate": 9.057176563475234e-07, + "loss": 1.0901, + "step": 29035 + }, + { + "epoch": 2.4747293957214693, + "grad_norm": 62.91011065546256, + "learning_rate": 9.054330606566326e-07, + "loss": 1.334, + "step": 29036 + }, + { + "epoch": 2.474814625415495, + "grad_norm": 42.94658997713238, + "learning_rate": 9.051485052341802e-07, + "loss": 1.5755, + "step": 29037 + }, + { + "epoch": 2.4748998551095203, + "grad_norm": 55.370334400887195, + "learning_rate": 9.048639900829643e-07, + "loss": 1.3926, + "step": 29038 + }, + { + "epoch": 2.4749850848035457, + "grad_norm": 59.73099020859895, + "learning_rate": 9.045795152057857e-07, + "loss": 1.9934, + "step": 29039 + }, + { + "epoch": 2.4750703144975708, + "grad_norm": 72.58228775249714, + "learning_rate": 9.042950806054396e-07, + "loss": 1.7388, + "step": 29040 + }, + { + "epoch": 2.4751555441915962, + "grad_norm": 28.70414495125759, + "learning_rate": 9.040106862847231e-07, + "loss": 0.8968, + "step": 29041 + }, + { + "epoch": 2.4752407738856217, + "grad_norm": 66.11138664126744, + "learning_rate": 9.037263322464352e-07, + "loss": 2.0386, + "step": 29042 + }, + { + "epoch": 2.475326003579647, + "grad_norm": 69.43470743940817, + "learning_rate": 9.034420184933712e-07, + "loss": 1.5624, + "step": 29043 + }, + { + "epoch": 2.4754112332736726, + "grad_norm": 67.10739676800246, + "learning_rate": 9.031577450283269e-07, + "loss": 1.4289, + "step": 29044 + }, + { + "epoch": 2.475496462967698, + "grad_norm": 24.04551867724671, + "learning_rate": 9.028735118540966e-07, + "loss": 0.9349, + "step": 29045 + }, + { + "epoch": 2.4755816926617236, + "grad_norm": 107.88392370015029, + "learning_rate": 9.02589318973478e-07, + "loss": 3.0462, + "step": 29046 + }, + { + "epoch": 2.4756669223557486, + "grad_norm": 39.665226421944666, + "learning_rate": 9.023051663892662e-07, + "loss": 1.2066, + "step": 29047 + }, + { + "epoch": 2.475752152049774, + "grad_norm": 28.771554247108995, + "learning_rate": 9.020210541042551e-07, + "loss": 1.0119, + "step": 29048 + }, + { + "epoch": 2.4758373817437995, + "grad_norm": 36.52622921320958, + "learning_rate": 9.017369821212379e-07, + "loss": 1.2736, + "step": 29049 + }, + { + "epoch": 2.475922611437825, + "grad_norm": 20.55725556289727, + "learning_rate": 9.01452950443008e-07, + "loss": 0.7818, + "step": 29050 + }, + { + "epoch": 2.4760078411318505, + "grad_norm": 22.34587591928073, + "learning_rate": 9.011689590723616e-07, + "loss": 0.7996, + "step": 29051 + }, + { + "epoch": 2.4760930708258755, + "grad_norm": 70.21543267328448, + "learning_rate": 9.00885008012089e-07, + "loss": 1.4797, + "step": 29052 + }, + { + "epoch": 2.476178300519901, + "grad_norm": 27.080013364558194, + "learning_rate": 9.006010972649826e-07, + "loss": 0.7723, + "step": 29053 + }, + { + "epoch": 2.4762635302139264, + "grad_norm": 41.80403492245987, + "learning_rate": 9.003172268338356e-07, + "loss": 0.8713, + "step": 29054 + }, + { + "epoch": 2.476348759907952, + "grad_norm": 73.73421161755819, + "learning_rate": 9.000333967214408e-07, + "loss": 1.6017, + "step": 29055 + }, + { + "epoch": 2.4764339896019774, + "grad_norm": 69.17412449351997, + "learning_rate": 8.997496069305883e-07, + "loss": 2.3221, + "step": 29056 + }, + { + "epoch": 2.476519219296003, + "grad_norm": 72.56098102659763, + "learning_rate": 8.994658574640691e-07, + "loss": 1.3873, + "step": 29057 + }, + { + "epoch": 2.4766044489900283, + "grad_norm": 35.02687625558055, + "learning_rate": 8.991821483246726e-07, + "loss": 1.126, + "step": 29058 + }, + { + "epoch": 2.4766896786840533, + "grad_norm": 57.70694822878302, + "learning_rate": 8.988984795151911e-07, + "loss": 0.9853, + "step": 29059 + }, + { + "epoch": 2.476774908378079, + "grad_norm": 36.543152040788954, + "learning_rate": 8.98614851038414e-07, + "loss": 0.9649, + "step": 29060 + }, + { + "epoch": 2.4768601380721043, + "grad_norm": 38.31017528369404, + "learning_rate": 8.983312628971285e-07, + "loss": 1.2055, + "step": 29061 + }, + { + "epoch": 2.4769453677661297, + "grad_norm": 64.74574707448602, + "learning_rate": 8.98047715094127e-07, + "loss": 2.0607, + "step": 29062 + }, + { + "epoch": 2.477030597460155, + "grad_norm": 50.034867552466444, + "learning_rate": 8.97764207632194e-07, + "loss": 1.4567, + "step": 29063 + }, + { + "epoch": 2.4771158271541807, + "grad_norm": 74.18839936050774, + "learning_rate": 8.974807405141222e-07, + "loss": 2.1672, + "step": 29064 + }, + { + "epoch": 2.477201056848206, + "grad_norm": 33.38761809228754, + "learning_rate": 8.971973137426964e-07, + "loss": 1.1943, + "step": 29065 + }, + { + "epoch": 2.477286286542231, + "grad_norm": 48.84581659275245, + "learning_rate": 8.96913927320705e-07, + "loss": 1.8741, + "step": 29066 + }, + { + "epoch": 2.4773715162362566, + "grad_norm": 17.990960519707087, + "learning_rate": 8.966305812509335e-07, + "loss": 0.6239, + "step": 29067 + }, + { + "epoch": 2.477456745930282, + "grad_norm": 67.35337103756534, + "learning_rate": 8.963472755361713e-07, + "loss": 1.8368, + "step": 29068 + }, + { + "epoch": 2.4775419756243076, + "grad_norm": 22.648042456878446, + "learning_rate": 8.960640101792012e-07, + "loss": 0.9291, + "step": 29069 + }, + { + "epoch": 2.477627205318333, + "grad_norm": 48.914559821033066, + "learning_rate": 8.957807851828121e-07, + "loss": 1.6721, + "step": 29070 + }, + { + "epoch": 2.4777124350123585, + "grad_norm": 85.39132843978963, + "learning_rate": 8.954976005497873e-07, + "loss": 2.1743, + "step": 29071 + }, + { + "epoch": 2.4777976647063835, + "grad_norm": 66.06128144324866, + "learning_rate": 8.952144562829135e-07, + "loss": 1.9755, + "step": 29072 + }, + { + "epoch": 2.477882894400409, + "grad_norm": 31.036476322871682, + "learning_rate": 8.949313523849745e-07, + "loss": 0.8204, + "step": 29073 + }, + { + "epoch": 2.4779681240944345, + "grad_norm": 45.42576284091624, + "learning_rate": 8.946482888587548e-07, + "loss": 1.4708, + "step": 29074 + }, + { + "epoch": 2.47805335378846, + "grad_norm": 44.43462977327199, + "learning_rate": 8.943652657070373e-07, + "loss": 0.9779, + "step": 29075 + }, + { + "epoch": 2.4781385834824854, + "grad_norm": 68.93853268881325, + "learning_rate": 8.940822829326051e-07, + "loss": 1.4368, + "step": 29076 + }, + { + "epoch": 2.478223813176511, + "grad_norm": 18.85664109284987, + "learning_rate": 8.937993405382423e-07, + "loss": 1.3067, + "step": 29077 + }, + { + "epoch": 2.478309042870536, + "grad_norm": 41.636720521202626, + "learning_rate": 8.93516438526732e-07, + "loss": 1.3481, + "step": 29078 + }, + { + "epoch": 2.4783942725645614, + "grad_norm": 157.80011021559525, + "learning_rate": 8.93233576900856e-07, + "loss": 2.8845, + "step": 29079 + }, + { + "epoch": 2.478479502258587, + "grad_norm": 113.2901474863658, + "learning_rate": 8.929507556633949e-07, + "loss": 1.8524, + "step": 29080 + }, + { + "epoch": 2.4785647319526123, + "grad_norm": 39.12582090718462, + "learning_rate": 8.926679748171324e-07, + "loss": 0.8549, + "step": 29081 + }, + { + "epoch": 2.4786499616466378, + "grad_norm": 40.16265952972701, + "learning_rate": 8.923852343648481e-07, + "loss": 1.1774, + "step": 29082 + }, + { + "epoch": 2.4787351913406632, + "grad_norm": 64.60220043086017, + "learning_rate": 8.921025343093225e-07, + "loss": 1.9218, + "step": 29083 + }, + { + "epoch": 2.4788204210346887, + "grad_norm": 53.83787149150415, + "learning_rate": 8.918198746533357e-07, + "loss": 1.4961, + "step": 29084 + }, + { + "epoch": 2.4789056507287137, + "grad_norm": 25.638224530093446, + "learning_rate": 8.915372553996676e-07, + "loss": 0.775, + "step": 29085 + }, + { + "epoch": 2.478990880422739, + "grad_norm": 53.73705802870414, + "learning_rate": 8.912546765510993e-07, + "loss": 1.6588, + "step": 29086 + }, + { + "epoch": 2.4790761101167647, + "grad_norm": 47.42299698095315, + "learning_rate": 8.909721381104086e-07, + "loss": 1.2634, + "step": 29087 + }, + { + "epoch": 2.47916133981079, + "grad_norm": 16.328156404648904, + "learning_rate": 8.906896400803727e-07, + "loss": 0.4399, + "step": 29088 + }, + { + "epoch": 2.4792465695048156, + "grad_norm": 72.58718661609065, + "learning_rate": 8.904071824637728e-07, + "loss": 1.8518, + "step": 29089 + }, + { + "epoch": 2.479331799198841, + "grad_norm": 34.18624236256736, + "learning_rate": 8.901247652633849e-07, + "loss": 1.2207, + "step": 29090 + }, + { + "epoch": 2.479417028892866, + "grad_norm": 53.52259220942754, + "learning_rate": 8.898423884819873e-07, + "loss": 1.4139, + "step": 29091 + }, + { + "epoch": 2.4795022585868915, + "grad_norm": 27.15728538940578, + "learning_rate": 8.895600521223551e-07, + "loss": 0.9122, + "step": 29092 + }, + { + "epoch": 2.479587488280917, + "grad_norm": 194.60598262653053, + "learning_rate": 8.892777561872668e-07, + "loss": 1.4751, + "step": 29093 + }, + { + "epoch": 2.4796727179749425, + "grad_norm": 65.94402259056628, + "learning_rate": 8.889955006794992e-07, + "loss": 1.5379, + "step": 29094 + }, + { + "epoch": 2.479757947668968, + "grad_norm": 19.10757660558589, + "learning_rate": 8.887132856018272e-07, + "loss": 0.804, + "step": 29095 + }, + { + "epoch": 2.4798431773629934, + "grad_norm": 41.23289274406547, + "learning_rate": 8.884311109570264e-07, + "loss": 0.9631, + "step": 29096 + }, + { + "epoch": 2.4799284070570184, + "grad_norm": 57.415495428695834, + "learning_rate": 8.881489767478707e-07, + "loss": 1.6949, + "step": 29097 + }, + { + "epoch": 2.480013636751044, + "grad_norm": 72.50464992255738, + "learning_rate": 8.878668829771376e-07, + "loss": 2.1524, + "step": 29098 + }, + { + "epoch": 2.4800988664450694, + "grad_norm": 44.87477399688007, + "learning_rate": 8.875848296475992e-07, + "loss": 1.5506, + "step": 29099 + }, + { + "epoch": 2.480184096139095, + "grad_norm": 50.62134924280765, + "learning_rate": 8.87302816762029e-07, + "loss": 1.4801, + "step": 29100 + }, + { + "epoch": 2.4802693258331203, + "grad_norm": 43.55533747533462, + "learning_rate": 8.870208443232009e-07, + "loss": 1.0982, + "step": 29101 + }, + { + "epoch": 2.480354555527146, + "grad_norm": 67.98216854547078, + "learning_rate": 8.867389123338898e-07, + "loss": 1.3899, + "step": 29102 + }, + { + "epoch": 2.4804397852211713, + "grad_norm": 60.3267028003062, + "learning_rate": 8.864570207968676e-07, + "loss": 1.8864, + "step": 29103 + }, + { + "epoch": 2.4805250149151963, + "grad_norm": 35.35152672502617, + "learning_rate": 8.861751697149057e-07, + "loss": 1.2617, + "step": 29104 + }, + { + "epoch": 2.4806102446092217, + "grad_norm": 46.91474575693356, + "learning_rate": 8.858933590907759e-07, + "loss": 1.5759, + "step": 29105 + }, + { + "epoch": 2.480695474303247, + "grad_norm": 73.91938741049594, + "learning_rate": 8.856115889272493e-07, + "loss": 2.2013, + "step": 29106 + }, + { + "epoch": 2.4807807039972727, + "grad_norm": 84.04843315647666, + "learning_rate": 8.853298592270993e-07, + "loss": 2.105, + "step": 29107 + }, + { + "epoch": 2.480865933691298, + "grad_norm": 65.29385513083244, + "learning_rate": 8.850481699930935e-07, + "loss": 2.1389, + "step": 29108 + }, + { + "epoch": 2.4809511633853236, + "grad_norm": 17.173734470508276, + "learning_rate": 8.847665212280048e-07, + "loss": 0.7114, + "step": 29109 + }, + { + "epoch": 2.481036393079349, + "grad_norm": 20.369810437919686, + "learning_rate": 8.844849129346017e-07, + "loss": 0.7518, + "step": 29110 + }, + { + "epoch": 2.481121622773374, + "grad_norm": 69.90486029272552, + "learning_rate": 8.842033451156545e-07, + "loss": 2.0821, + "step": 29111 + }, + { + "epoch": 2.4812068524673996, + "grad_norm": 73.29166041065245, + "learning_rate": 8.839218177739328e-07, + "loss": 2.2313, + "step": 29112 + }, + { + "epoch": 2.481292082161425, + "grad_norm": 42.32763120272692, + "learning_rate": 8.836403309122038e-07, + "loss": 1.2415, + "step": 29113 + }, + { + "epoch": 2.4813773118554505, + "grad_norm": 60.538270398191756, + "learning_rate": 8.833588845332358e-07, + "loss": 1.7542, + "step": 29114 + }, + { + "epoch": 2.481462541549476, + "grad_norm": 66.50987755917254, + "learning_rate": 8.830774786397983e-07, + "loss": 1.9213, + "step": 29115 + }, + { + "epoch": 2.481547771243501, + "grad_norm": 60.75141671734163, + "learning_rate": 8.827961132346563e-07, + "loss": 2.1194, + "step": 29116 + }, + { + "epoch": 2.4816330009375265, + "grad_norm": 73.55713309193065, + "learning_rate": 8.825147883205804e-07, + "loss": 2.2316, + "step": 29117 + }, + { + "epoch": 2.481718230631552, + "grad_norm": 56.26355230828073, + "learning_rate": 8.822335039003337e-07, + "loss": 1.5211, + "step": 29118 + }, + { + "epoch": 2.4818034603255774, + "grad_norm": 48.52022775126109, + "learning_rate": 8.819522599766855e-07, + "loss": 1.2893, + "step": 29119 + }, + { + "epoch": 2.481888690019603, + "grad_norm": 93.56785125351789, + "learning_rate": 8.816710565524006e-07, + "loss": 2.172, + "step": 29120 + }, + { + "epoch": 2.4819739197136284, + "grad_norm": 44.74683048602119, + "learning_rate": 8.813898936302445e-07, + "loss": 1.373, + "step": 29121 + }, + { + "epoch": 2.482059149407654, + "grad_norm": 34.395225667252454, + "learning_rate": 8.811087712129824e-07, + "loss": 1.1137, + "step": 29122 + }, + { + "epoch": 2.482144379101679, + "grad_norm": 29.748788086265552, + "learning_rate": 8.808276893033774e-07, + "loss": 0.9828, + "step": 29123 + }, + { + "epoch": 2.4822296087957043, + "grad_norm": 42.28729941523295, + "learning_rate": 8.805466479041952e-07, + "loss": 1.3429, + "step": 29124 + }, + { + "epoch": 2.48231483848973, + "grad_norm": 81.70461506982643, + "learning_rate": 8.802656470182008e-07, + "loss": 2.1876, + "step": 29125 + }, + { + "epoch": 2.4824000681837552, + "grad_norm": 54.532358823229046, + "learning_rate": 8.799846866481571e-07, + "loss": 1.7144, + "step": 29126 + }, + { + "epoch": 2.4824852978777807, + "grad_norm": 16.8598663134326, + "learning_rate": 8.797037667968255e-07, + "loss": 0.448, + "step": 29127 + }, + { + "epoch": 2.482570527571806, + "grad_norm": 47.21762436635909, + "learning_rate": 8.794228874669714e-07, + "loss": 0.8858, + "step": 29128 + }, + { + "epoch": 2.4826557572658317, + "grad_norm": 31.454173206762125, + "learning_rate": 8.791420486613561e-07, + "loss": 1.1656, + "step": 29129 + }, + { + "epoch": 2.4827409869598567, + "grad_norm": 63.51024928328357, + "learning_rate": 8.78861250382741e-07, + "loss": 1.3565, + "step": 29130 + }, + { + "epoch": 2.482826216653882, + "grad_norm": 28.249750196406936, + "learning_rate": 8.785804926338864e-07, + "loss": 0.9456, + "step": 29131 + }, + { + "epoch": 2.4829114463479076, + "grad_norm": 36.99161468341719, + "learning_rate": 8.782997754175554e-07, + "loss": 0.9757, + "step": 29132 + }, + { + "epoch": 2.482996676041933, + "grad_norm": 52.18046076532929, + "learning_rate": 8.780190987365095e-07, + "loss": 1.2009, + "step": 29133 + }, + { + "epoch": 2.4830819057359585, + "grad_norm": 25.579443396021777, + "learning_rate": 8.777384625935076e-07, + "loss": 0.7718, + "step": 29134 + }, + { + "epoch": 2.4831671354299836, + "grad_norm": 64.05766571983223, + "learning_rate": 8.774578669913103e-07, + "loss": 1.7069, + "step": 29135 + }, + { + "epoch": 2.483252365124009, + "grad_norm": 46.97565771662736, + "learning_rate": 8.771773119326754e-07, + "loss": 1.4599, + "step": 29136 + }, + { + "epoch": 2.4833375948180345, + "grad_norm": 52.44235109749482, + "learning_rate": 8.768967974203641e-07, + "loss": 1.6352, + "step": 29137 + }, + { + "epoch": 2.48342282451206, + "grad_norm": 54.35110591551319, + "learning_rate": 8.766163234571351e-07, + "loss": 1.5826, + "step": 29138 + }, + { + "epoch": 2.4835080542060854, + "grad_norm": 65.74640773384708, + "learning_rate": 8.763358900457447e-07, + "loss": 1.3285, + "step": 29139 + }, + { + "epoch": 2.483593283900111, + "grad_norm": 31.617412262985326, + "learning_rate": 8.760554971889518e-07, + "loss": 0.8954, + "step": 29140 + }, + { + "epoch": 2.4836785135941364, + "grad_norm": 59.334238110750604, + "learning_rate": 8.757751448895163e-07, + "loss": 1.6567, + "step": 29141 + }, + { + "epoch": 2.4837637432881614, + "grad_norm": 42.66568666671532, + "learning_rate": 8.754948331501928e-07, + "loss": 0.9396, + "step": 29142 + }, + { + "epoch": 2.483848972982187, + "grad_norm": 62.067157877530285, + "learning_rate": 8.75214561973739e-07, + "loss": 1.993, + "step": 29143 + }, + { + "epoch": 2.4839342026762123, + "grad_norm": 32.23330266799234, + "learning_rate": 8.7493433136291e-07, + "loss": 1.2066, + "step": 29144 + }, + { + "epoch": 2.484019432370238, + "grad_norm": 76.37034382296966, + "learning_rate": 8.746541413204634e-07, + "loss": 1.84, + "step": 29145 + }, + { + "epoch": 2.4841046620642633, + "grad_norm": 23.80970831776343, + "learning_rate": 8.743739918491545e-07, + "loss": 1.0644, + "step": 29146 + }, + { + "epoch": 2.4841898917582887, + "grad_norm": 32.66608931119201, + "learning_rate": 8.740938829517365e-07, + "loss": 1.2629, + "step": 29147 + }, + { + "epoch": 2.484275121452314, + "grad_norm": 118.31746800593767, + "learning_rate": 8.738138146309672e-07, + "loss": 3.1667, + "step": 29148 + }, + { + "epoch": 2.4843603511463392, + "grad_norm": 57.15861891017905, + "learning_rate": 8.735337868895977e-07, + "loss": 1.4941, + "step": 29149 + }, + { + "epoch": 2.4844455808403647, + "grad_norm": 47.39730559101659, + "learning_rate": 8.732537997303853e-07, + "loss": 1.4284, + "step": 29150 + }, + { + "epoch": 2.48453081053439, + "grad_norm": 40.324212074904274, + "learning_rate": 8.729738531560822e-07, + "loss": 0.9477, + "step": 29151 + }, + { + "epoch": 2.4846160402284156, + "grad_norm": 68.50306922910697, + "learning_rate": 8.726939471694412e-07, + "loss": 1.8712, + "step": 29152 + }, + { + "epoch": 2.484701269922441, + "grad_norm": 62.393334015491064, + "learning_rate": 8.724140817732135e-07, + "loss": 1.9291, + "step": 29153 + }, + { + "epoch": 2.484786499616466, + "grad_norm": 46.80476103355105, + "learning_rate": 8.721342569701552e-07, + "loss": 1.2536, + "step": 29154 + }, + { + "epoch": 2.4848717293104916, + "grad_norm": 65.81692502231012, + "learning_rate": 8.718544727630146e-07, + "loss": 1.746, + "step": 29155 + }, + { + "epoch": 2.484956959004517, + "grad_norm": 76.743805759238, + "learning_rate": 8.715747291545457e-07, + "loss": 2.4643, + "step": 29156 + }, + { + "epoch": 2.4850421886985425, + "grad_norm": 62.37728373493035, + "learning_rate": 8.712950261474984e-07, + "loss": 2.0639, + "step": 29157 + }, + { + "epoch": 2.485127418392568, + "grad_norm": 65.32976391622307, + "learning_rate": 8.710153637446245e-07, + "loss": 2.1634, + "step": 29158 + }, + { + "epoch": 2.4852126480865935, + "grad_norm": 58.42219536757015, + "learning_rate": 8.70735741948674e-07, + "loss": 1.4235, + "step": 29159 + }, + { + "epoch": 2.485297877780619, + "grad_norm": 28.009912735349893, + "learning_rate": 8.704561607623969e-07, + "loss": 0.9918, + "step": 29160 + }, + { + "epoch": 2.485383107474644, + "grad_norm": 25.993164433707864, + "learning_rate": 8.701766201885409e-07, + "loss": 0.753, + "step": 29161 + }, + { + "epoch": 2.4854683371686694, + "grad_norm": 55.083572819140564, + "learning_rate": 8.698971202298584e-07, + "loss": 1.7649, + "step": 29162 + }, + { + "epoch": 2.485553566862695, + "grad_norm": 46.858789396908286, + "learning_rate": 8.696176608890954e-07, + "loss": 1.3464, + "step": 29163 + }, + { + "epoch": 2.4856387965567204, + "grad_norm": 68.25014908064846, + "learning_rate": 8.693382421690028e-07, + "loss": 1.5072, + "step": 29164 + }, + { + "epoch": 2.485724026250746, + "grad_norm": 70.49332191431878, + "learning_rate": 8.690588640723269e-07, + "loss": 1.9436, + "step": 29165 + }, + { + "epoch": 2.4858092559447713, + "grad_norm": 44.0287052055985, + "learning_rate": 8.687795266018139e-07, + "loss": 1.3712, + "step": 29166 + }, + { + "epoch": 2.485894485638797, + "grad_norm": 39.4305329841075, + "learning_rate": 8.685002297602146e-07, + "loss": 1.2149, + "step": 29167 + }, + { + "epoch": 2.485979715332822, + "grad_norm": 42.357739371594484, + "learning_rate": 8.682209735502739e-07, + "loss": 1.4535, + "step": 29168 + }, + { + "epoch": 2.4860649450268473, + "grad_norm": 14.239353747095006, + "learning_rate": 8.679417579747379e-07, + "loss": 0.4995, + "step": 29169 + }, + { + "epoch": 2.4861501747208727, + "grad_norm": 46.127933564298615, + "learning_rate": 8.676625830363517e-07, + "loss": 1.5095, + "step": 29170 + }, + { + "epoch": 2.486235404414898, + "grad_norm": 70.24136898243184, + "learning_rate": 8.673834487378619e-07, + "loss": 1.3997, + "step": 29171 + }, + { + "epoch": 2.4863206341089237, + "grad_norm": 54.86357792603513, + "learning_rate": 8.671043550820147e-07, + "loss": 1.2459, + "step": 29172 + }, + { + "epoch": 2.4864058638029487, + "grad_norm": 45.37198771158865, + "learning_rate": 8.668253020715539e-07, + "loss": 1.2895, + "step": 29173 + }, + { + "epoch": 2.486491093496974, + "grad_norm": 36.54705977535167, + "learning_rate": 8.665462897092231e-07, + "loss": 1.3047, + "step": 29174 + }, + { + "epoch": 2.4865763231909996, + "grad_norm": 21.91862046111891, + "learning_rate": 8.662673179977676e-07, + "loss": 0.6927, + "step": 29175 + }, + { + "epoch": 2.486661552885025, + "grad_norm": 55.866425219288146, + "learning_rate": 8.65988386939931e-07, + "loss": 1.5599, + "step": 29176 + }, + { + "epoch": 2.4867467825790506, + "grad_norm": 43.91220399219393, + "learning_rate": 8.65709496538456e-07, + "loss": 1.4582, + "step": 29177 + }, + { + "epoch": 2.486832012273076, + "grad_norm": 65.43480940880272, + "learning_rate": 8.654306467960838e-07, + "loss": 1.8038, + "step": 29178 + }, + { + "epoch": 2.4869172419671015, + "grad_norm": 61.43714026762759, + "learning_rate": 8.651518377155582e-07, + "loss": 0.8043, + "step": 29179 + }, + { + "epoch": 2.4870024716611265, + "grad_norm": 41.92207373759138, + "learning_rate": 8.648730692996227e-07, + "loss": 2.0286, + "step": 29180 + }, + { + "epoch": 2.487087701355152, + "grad_norm": 64.5133183790168, + "learning_rate": 8.645943415510178e-07, + "loss": 1.8683, + "step": 29181 + }, + { + "epoch": 2.4871729310491775, + "grad_norm": 59.352762151800825, + "learning_rate": 8.643156544724834e-07, + "loss": 1.293, + "step": 29182 + }, + { + "epoch": 2.487258160743203, + "grad_norm": 80.60033696445852, + "learning_rate": 8.640370080667609e-07, + "loss": 1.8249, + "step": 29183 + }, + { + "epoch": 2.4873433904372284, + "grad_norm": 26.974024881856966, + "learning_rate": 8.637584023365914e-07, + "loss": 1.1447, + "step": 29184 + }, + { + "epoch": 2.487428620131254, + "grad_norm": 30.991723807744336, + "learning_rate": 8.634798372847148e-07, + "loss": 0.9741, + "step": 29185 + }, + { + "epoch": 2.4875138498252793, + "grad_norm": 80.74670047118119, + "learning_rate": 8.632013129138689e-07, + "loss": 1.4506, + "step": 29186 + }, + { + "epoch": 2.4875990795193044, + "grad_norm": 62.27927106599601, + "learning_rate": 8.629228292267944e-07, + "loss": 1.5523, + "step": 29187 + }, + { + "epoch": 2.48768430921333, + "grad_norm": 61.84325206119928, + "learning_rate": 8.626443862262313e-07, + "loss": 1.8601, + "step": 29188 + }, + { + "epoch": 2.4877695389073553, + "grad_norm": 53.072761053062464, + "learning_rate": 8.623659839149168e-07, + "loss": 1.5243, + "step": 29189 + }, + { + "epoch": 2.4878547686013808, + "grad_norm": 49.786954024116326, + "learning_rate": 8.620876222955882e-07, + "loss": 1.7798, + "step": 29190 + }, + { + "epoch": 2.4879399982954062, + "grad_norm": 65.76207551378876, + "learning_rate": 8.618093013709828e-07, + "loss": 1.4236, + "step": 29191 + }, + { + "epoch": 2.4880252279894317, + "grad_norm": 99.41442069673475, + "learning_rate": 8.615310211438393e-07, + "loss": 1.6281, + "step": 29192 + }, + { + "epoch": 2.4881104576834567, + "grad_norm": 41.40480061571881, + "learning_rate": 8.612527816168941e-07, + "loss": 1.4283, + "step": 29193 + }, + { + "epoch": 2.488195687377482, + "grad_norm": 45.89049234372317, + "learning_rate": 8.609745827928817e-07, + "loss": 1.2912, + "step": 29194 + }, + { + "epoch": 2.4882809170715077, + "grad_norm": 39.300122850422426, + "learning_rate": 8.606964246745414e-07, + "loss": 1.7168, + "step": 29195 + }, + { + "epoch": 2.488366146765533, + "grad_norm": 33.84467807811273, + "learning_rate": 8.604183072646055e-07, + "loss": 0.9435, + "step": 29196 + }, + { + "epoch": 2.4884513764595586, + "grad_norm": 66.1644905583178, + "learning_rate": 8.601402305658113e-07, + "loss": 1.4288, + "step": 29197 + }, + { + "epoch": 2.488536606153584, + "grad_norm": 54.32236063937791, + "learning_rate": 8.598621945808938e-07, + "loss": 1.6948, + "step": 29198 + }, + { + "epoch": 2.488621835847609, + "grad_norm": 51.362793938640564, + "learning_rate": 8.595841993125858e-07, + "loss": 1.3971, + "step": 29199 + }, + { + "epoch": 2.4887070655416346, + "grad_norm": 75.83509902230047, + "learning_rate": 8.593062447636208e-07, + "loss": 2.3387, + "step": 29200 + }, + { + "epoch": 2.48879229523566, + "grad_norm": 52.395669318101696, + "learning_rate": 8.590283309367353e-07, + "loss": 1.8755, + "step": 29201 + }, + { + "epoch": 2.4888775249296855, + "grad_norm": 79.63299872099012, + "learning_rate": 8.58750457834659e-07, + "loss": 1.5641, + "step": 29202 + }, + { + "epoch": 2.488962754623711, + "grad_norm": 66.85938503198233, + "learning_rate": 8.584726254601278e-07, + "loss": 1.7534, + "step": 29203 + }, + { + "epoch": 2.4890479843177364, + "grad_norm": 56.028086773552566, + "learning_rate": 8.581948338158713e-07, + "loss": 2.1378, + "step": 29204 + }, + { + "epoch": 2.489133214011762, + "grad_norm": 30.437970173817785, + "learning_rate": 8.579170829046241e-07, + "loss": 0.8095, + "step": 29205 + }, + { + "epoch": 2.489218443705787, + "grad_norm": 34.09113876058812, + "learning_rate": 8.576393727291166e-07, + "loss": 1.0264, + "step": 29206 + }, + { + "epoch": 2.4893036733998124, + "grad_norm": 53.60091245067894, + "learning_rate": 8.573617032920794e-07, + "loss": 1.0695, + "step": 29207 + }, + { + "epoch": 2.489388903093838, + "grad_norm": 27.544969038396374, + "learning_rate": 8.570840745962444e-07, + "loss": 0.7752, + "step": 29208 + }, + { + "epoch": 2.4894741327878633, + "grad_norm": 23.14321109003408, + "learning_rate": 8.568064866443393e-07, + "loss": 0.68, + "step": 29209 + }, + { + "epoch": 2.489559362481889, + "grad_norm": 45.38260630338649, + "learning_rate": 8.565289394390968e-07, + "loss": 1.0905, + "step": 29210 + }, + { + "epoch": 2.4896445921759143, + "grad_norm": 50.0729801484191, + "learning_rate": 8.562514329832466e-07, + "loss": 1.4559, + "step": 29211 + }, + { + "epoch": 2.4897298218699393, + "grad_norm": 83.10272819190958, + "learning_rate": 8.55973967279517e-07, + "loss": 1.4202, + "step": 29212 + }, + { + "epoch": 2.4898150515639648, + "grad_norm": 57.08242942094964, + "learning_rate": 8.556965423306357e-07, + "loss": 1.2924, + "step": 29213 + }, + { + "epoch": 2.4899002812579902, + "grad_norm": 33.87039132876797, + "learning_rate": 8.554191581393328e-07, + "loss": 1.1035, + "step": 29214 + }, + { + "epoch": 2.4899855109520157, + "grad_norm": 71.57317556601622, + "learning_rate": 8.551418147083362e-07, + "loss": 1.3254, + "step": 29215 + }, + { + "epoch": 2.490070740646041, + "grad_norm": 37.67048146976025, + "learning_rate": 8.548645120403725e-07, + "loss": 1.0378, + "step": 29216 + }, + { + "epoch": 2.4901559703400666, + "grad_norm": 49.7283802553699, + "learning_rate": 8.54587250138168e-07, + "loss": 1.5564, + "step": 29217 + }, + { + "epoch": 2.4902412000340917, + "grad_norm": 78.78976310968157, + "learning_rate": 8.543100290044509e-07, + "loss": 1.9955, + "step": 29218 + }, + { + "epoch": 2.490326429728117, + "grad_norm": 35.53863311141738, + "learning_rate": 8.540328486419486e-07, + "loss": 0.8337, + "step": 29219 + }, + { + "epoch": 2.4904116594221426, + "grad_norm": 58.103502799644446, + "learning_rate": 8.537557090533855e-07, + "loss": 1.8576, + "step": 29220 + }, + { + "epoch": 2.490496889116168, + "grad_norm": 42.451269890413975, + "learning_rate": 8.534786102414877e-07, + "loss": 1.1837, + "step": 29221 + }, + { + "epoch": 2.4905821188101935, + "grad_norm": 50.8064621727198, + "learning_rate": 8.532015522089788e-07, + "loss": 1.3063, + "step": 29222 + }, + { + "epoch": 2.490667348504219, + "grad_norm": 75.3938715124671, + "learning_rate": 8.529245349585858e-07, + "loss": 1.7646, + "step": 29223 + }, + { + "epoch": 2.4907525781982445, + "grad_norm": 32.78812599097682, + "learning_rate": 8.526475584930327e-07, + "loss": 0.7691, + "step": 29224 + }, + { + "epoch": 2.4908378078922695, + "grad_norm": 63.0614018530025, + "learning_rate": 8.523706228150414e-07, + "loss": 1.4816, + "step": 29225 + }, + { + "epoch": 2.490923037586295, + "grad_norm": 27.611870803972344, + "learning_rate": 8.520937279273367e-07, + "loss": 0.8984, + "step": 29226 + }, + { + "epoch": 2.4910082672803204, + "grad_norm": 22.276830230751788, + "learning_rate": 8.518168738326432e-07, + "loss": 0.5003, + "step": 29227 + }, + { + "epoch": 2.491093496974346, + "grad_norm": 17.44128659153136, + "learning_rate": 8.515400605336826e-07, + "loss": 0.533, + "step": 29228 + }, + { + "epoch": 2.4911787266683714, + "grad_norm": 73.15530574867634, + "learning_rate": 8.512632880331767e-07, + "loss": 1.3161, + "step": 29229 + }, + { + "epoch": 2.491263956362397, + "grad_norm": 35.53188185187099, + "learning_rate": 8.50986556333847e-07, + "loss": 1.4221, + "step": 29230 + }, + { + "epoch": 2.4913491860564223, + "grad_norm": 67.61277874441481, + "learning_rate": 8.507098654384171e-07, + "loss": 1.9983, + "step": 29231 + }, + { + "epoch": 2.4914344157504473, + "grad_norm": 58.29711664820273, + "learning_rate": 8.504332153496064e-07, + "loss": 1.2608, + "step": 29232 + }, + { + "epoch": 2.491519645444473, + "grad_norm": 43.42134894560201, + "learning_rate": 8.501566060701356e-07, + "loss": 1.0786, + "step": 29233 + }, + { + "epoch": 2.4916048751384983, + "grad_norm": 22.60145175248433, + "learning_rate": 8.498800376027256e-07, + "loss": 0.7236, + "step": 29234 + }, + { + "epoch": 2.4916901048325237, + "grad_norm": 74.04777341087937, + "learning_rate": 8.496035099500971e-07, + "loss": 2.0115, + "step": 29235 + }, + { + "epoch": 2.491775334526549, + "grad_norm": 36.16895698008928, + "learning_rate": 8.493270231149686e-07, + "loss": 1.1868, + "step": 29236 + }, + { + "epoch": 2.4918605642205742, + "grad_norm": 56.494717186537876, + "learning_rate": 8.490505771000601e-07, + "loss": 1.6921, + "step": 29237 + }, + { + "epoch": 2.4919457939145997, + "grad_norm": 32.79458265528678, + "learning_rate": 8.4877417190809e-07, + "loss": 1.0508, + "step": 29238 + }, + { + "epoch": 2.492031023608625, + "grad_norm": 32.4106016846364, + "learning_rate": 8.484978075417743e-07, + "loss": 0.5578, + "step": 29239 + }, + { + "epoch": 2.4921162533026506, + "grad_norm": 124.26625539645482, + "learning_rate": 8.482214840038349e-07, + "loss": 1.554, + "step": 29240 + }, + { + "epoch": 2.492201482996676, + "grad_norm": 48.85519535651101, + "learning_rate": 8.479452012969858e-07, + "loss": 1.7143, + "step": 29241 + }, + { + "epoch": 2.4922867126907016, + "grad_norm": 24.270538180263724, + "learning_rate": 8.476689594239473e-07, + "loss": 0.8365, + "step": 29242 + }, + { + "epoch": 2.492371942384727, + "grad_norm": 38.774765864778786, + "learning_rate": 8.473927583874336e-07, + "loss": 0.8341, + "step": 29243 + }, + { + "epoch": 2.492457172078752, + "grad_norm": 59.2794112734547, + "learning_rate": 8.47116598190163e-07, + "loss": 1.8286, + "step": 29244 + }, + { + "epoch": 2.4925424017727775, + "grad_norm": 59.03137081373893, + "learning_rate": 8.4684047883485e-07, + "loss": 1.8044, + "step": 29245 + }, + { + "epoch": 2.492627631466803, + "grad_norm": 56.58026070084403, + "learning_rate": 8.465644003242113e-07, + "loss": 1.6053, + "step": 29246 + }, + { + "epoch": 2.4927128611608285, + "grad_norm": 37.37512593272775, + "learning_rate": 8.462883626609597e-07, + "loss": 0.9103, + "step": 29247 + }, + { + "epoch": 2.492798090854854, + "grad_norm": 44.89506622634714, + "learning_rate": 8.460123658478131e-07, + "loss": 1.3395, + "step": 29248 + }, + { + "epoch": 2.4928833205488794, + "grad_norm": 62.728684907670456, + "learning_rate": 8.457364098874826e-07, + "loss": 1.8431, + "step": 29249 + }, + { + "epoch": 2.492968550242905, + "grad_norm": 42.587671527140955, + "learning_rate": 8.45460494782685e-07, + "loss": 0.9951, + "step": 29250 + }, + { + "epoch": 2.49305377993693, + "grad_norm": 51.755941224892794, + "learning_rate": 8.451846205361331e-07, + "loss": 1.1596, + "step": 29251 + }, + { + "epoch": 2.4931390096309554, + "grad_norm": 64.88038315644849, + "learning_rate": 8.449087871505379e-07, + "loss": 1.8181, + "step": 29252 + }, + { + "epoch": 2.493224239324981, + "grad_norm": 27.313097811001786, + "learning_rate": 8.446329946286147e-07, + "loss": 0.8146, + "step": 29253 + }, + { + "epoch": 2.4933094690190063, + "grad_norm": 46.29667423822066, + "learning_rate": 8.443572429730751e-07, + "loss": 1.1265, + "step": 29254 + }, + { + "epoch": 2.4933946987130318, + "grad_norm": 41.12960322016259, + "learning_rate": 8.440815321866308e-07, + "loss": 1.3039, + "step": 29255 + }, + { + "epoch": 2.493479928407057, + "grad_norm": 54.217812766142146, + "learning_rate": 8.438058622719919e-07, + "loss": 1.9309, + "step": 29256 + }, + { + "epoch": 2.4935651581010823, + "grad_norm": 28.012772497573344, + "learning_rate": 8.435302332318706e-07, + "loss": 0.8163, + "step": 29257 + }, + { + "epoch": 2.4936503877951077, + "grad_norm": 41.942796502415476, + "learning_rate": 8.432546450689793e-07, + "loss": 1.062, + "step": 29258 + }, + { + "epoch": 2.493735617489133, + "grad_norm": 35.16882794028916, + "learning_rate": 8.429790977860269e-07, + "loss": 1.2366, + "step": 29259 + }, + { + "epoch": 2.4938208471831587, + "grad_norm": 63.87554008902155, + "learning_rate": 8.427035913857223e-07, + "loss": 2.0319, + "step": 29260 + }, + { + "epoch": 2.493906076877184, + "grad_norm": 42.78615559846762, + "learning_rate": 8.424281258707767e-07, + "loss": 1.5591, + "step": 29261 + }, + { + "epoch": 2.4939913065712096, + "grad_norm": 43.47116720604076, + "learning_rate": 8.421527012438985e-07, + "loss": 1.2682, + "step": 29262 + }, + { + "epoch": 2.4940765362652346, + "grad_norm": 62.78102008043983, + "learning_rate": 8.418773175077966e-07, + "loss": 1.829, + "step": 29263 + }, + { + "epoch": 2.49416176595926, + "grad_norm": 58.12689155571491, + "learning_rate": 8.416019746651772e-07, + "loss": 1.2331, + "step": 29264 + }, + { + "epoch": 2.4942469956532856, + "grad_norm": 35.60906835269268, + "learning_rate": 8.413266727187502e-07, + "loss": 0.9105, + "step": 29265 + }, + { + "epoch": 2.494332225347311, + "grad_norm": 60.81581026890217, + "learning_rate": 8.410514116712243e-07, + "loss": 1.7714, + "step": 29266 + }, + { + "epoch": 2.4944174550413365, + "grad_norm": 48.512081801046705, + "learning_rate": 8.407761915253055e-07, + "loss": 1.664, + "step": 29267 + }, + { + "epoch": 2.494502684735362, + "grad_norm": 38.58003016875668, + "learning_rate": 8.405010122836993e-07, + "loss": 1.2999, + "step": 29268 + }, + { + "epoch": 2.4945879144293874, + "grad_norm": 63.33666353940761, + "learning_rate": 8.40225873949112e-07, + "loss": 1.5503, + "step": 29269 + }, + { + "epoch": 2.4946731441234125, + "grad_norm": 35.428865687069695, + "learning_rate": 8.399507765242515e-07, + "loss": 0.6121, + "step": 29270 + }, + { + "epoch": 2.494758373817438, + "grad_norm": 82.93334015373559, + "learning_rate": 8.396757200118221e-07, + "loss": 1.1897, + "step": 29271 + }, + { + "epoch": 2.4948436035114634, + "grad_norm": 73.4868120613471, + "learning_rate": 8.39400704414528e-07, + "loss": 2.01, + "step": 29272 + }, + { + "epoch": 2.494928833205489, + "grad_norm": 36.78466909986495, + "learning_rate": 8.391257297350741e-07, + "loss": 1.022, + "step": 29273 + }, + { + "epoch": 2.4950140628995143, + "grad_norm": 46.24856659558507, + "learning_rate": 8.388507959761672e-07, + "loss": 1.556, + "step": 29274 + }, + { + "epoch": 2.4950992925935394, + "grad_norm": 56.74566325827, + "learning_rate": 8.385759031405083e-07, + "loss": 1.3446, + "step": 29275 + }, + { + "epoch": 2.495184522287565, + "grad_norm": 48.865415370470735, + "learning_rate": 8.383010512308026e-07, + "loss": 1.2279, + "step": 29276 + }, + { + "epoch": 2.4952697519815903, + "grad_norm": 25.646006377313295, + "learning_rate": 8.380262402497508e-07, + "loss": 0.8159, + "step": 29277 + }, + { + "epoch": 2.4953549816756158, + "grad_norm": 83.84205990507918, + "learning_rate": 8.377514702000584e-07, + "loss": 2.0009, + "step": 29278 + }, + { + "epoch": 2.4954402113696412, + "grad_norm": 53.496085060545035, + "learning_rate": 8.374767410844265e-07, + "loss": 1.4541, + "step": 29279 + }, + { + "epoch": 2.4955254410636667, + "grad_norm": 105.11249017933555, + "learning_rate": 8.372020529055552e-07, + "loss": 2.0504, + "step": 29280 + }, + { + "epoch": 2.495610670757692, + "grad_norm": 26.844629719624912, + "learning_rate": 8.369274056661491e-07, + "loss": 0.7741, + "step": 29281 + }, + { + "epoch": 2.495695900451717, + "grad_norm": 44.316067126272415, + "learning_rate": 8.366527993689066e-07, + "loss": 0.7436, + "step": 29282 + }, + { + "epoch": 2.4957811301457427, + "grad_norm": 86.86055679319074, + "learning_rate": 8.363782340165305e-07, + "loss": 1.5097, + "step": 29283 + }, + { + "epoch": 2.495866359839768, + "grad_norm": 50.23126998606507, + "learning_rate": 8.361037096117202e-07, + "loss": 1.1666, + "step": 29284 + }, + { + "epoch": 2.4959515895337936, + "grad_norm": 80.6407132317792, + "learning_rate": 8.358292261571748e-07, + "loss": 1.9999, + "step": 29285 + }, + { + "epoch": 2.496036819227819, + "grad_norm": 35.81195981915921, + "learning_rate": 8.355547836555938e-07, + "loss": 1.2321, + "step": 29286 + }, + { + "epoch": 2.4961220489218445, + "grad_norm": 31.480528255064954, + "learning_rate": 8.352803821096778e-07, + "loss": 0.7726, + "step": 29287 + }, + { + "epoch": 2.49620727861587, + "grad_norm": 26.50715200127567, + "learning_rate": 8.350060215221228e-07, + "loss": 0.9719, + "step": 29288 + }, + { + "epoch": 2.496292508309895, + "grad_norm": 47.32184901593419, + "learning_rate": 8.347317018956297e-07, + "loss": 0.8886, + "step": 29289 + }, + { + "epoch": 2.4963777380039205, + "grad_norm": 33.811729559812626, + "learning_rate": 8.344574232328945e-07, + "loss": 0.7825, + "step": 29290 + }, + { + "epoch": 2.496462967697946, + "grad_norm": 62.60028067531364, + "learning_rate": 8.341831855366161e-07, + "loss": 1.5328, + "step": 29291 + }, + { + "epoch": 2.4965481973919714, + "grad_norm": 50.728061165606405, + "learning_rate": 8.339089888094909e-07, + "loss": 1.2488, + "step": 29292 + }, + { + "epoch": 2.496633427085997, + "grad_norm": 42.35268098380207, + "learning_rate": 8.336348330542154e-07, + "loss": 1.5637, + "step": 29293 + }, + { + "epoch": 2.496718656780022, + "grad_norm": 78.50109781642459, + "learning_rate": 8.333607182734854e-07, + "loss": 1.8137, + "step": 29294 + }, + { + "epoch": 2.4968038864740474, + "grad_norm": 90.56367809567855, + "learning_rate": 8.330866444699964e-07, + "loss": 1.8713, + "step": 29295 + }, + { + "epoch": 2.496889116168073, + "grad_norm": 19.548753289388237, + "learning_rate": 8.328126116464441e-07, + "loss": 0.7759, + "step": 29296 + }, + { + "epoch": 2.4969743458620983, + "grad_norm": 51.3727264778547, + "learning_rate": 8.325386198055252e-07, + "loss": 1.376, + "step": 29297 + }, + { + "epoch": 2.497059575556124, + "grad_norm": 37.611959062016204, + "learning_rate": 8.322646689499331e-07, + "loss": 1.6525, + "step": 29298 + }, + { + "epoch": 2.4971448052501493, + "grad_norm": 24.362020716962476, + "learning_rate": 8.319907590823606e-07, + "loss": 0.9945, + "step": 29299 + }, + { + "epoch": 2.4972300349441747, + "grad_norm": 83.54190774849131, + "learning_rate": 8.317168902055039e-07, + "loss": 2.6967, + "step": 29300 + }, + { + "epoch": 2.4973152646381997, + "grad_norm": 23.110968772079367, + "learning_rate": 8.314430623220554e-07, + "loss": 0.649, + "step": 29301 + }, + { + "epoch": 2.497400494332225, + "grad_norm": 70.13815598624284, + "learning_rate": 8.311692754347078e-07, + "loss": 1.525, + "step": 29302 + }, + { + "epoch": 2.4974857240262507, + "grad_norm": 84.76770656071125, + "learning_rate": 8.308955295461529e-07, + "loss": 2.5674, + "step": 29303 + }, + { + "epoch": 2.497570953720276, + "grad_norm": 42.60930224519332, + "learning_rate": 8.306218246590835e-07, + "loss": 1.3005, + "step": 29304 + }, + { + "epoch": 2.4976561834143016, + "grad_norm": 36.23571944446523, + "learning_rate": 8.303481607761932e-07, + "loss": 1.1466, + "step": 29305 + }, + { + "epoch": 2.497741413108327, + "grad_norm": 68.11770782529179, + "learning_rate": 8.300745379001717e-07, + "loss": 2.1149, + "step": 29306 + }, + { + "epoch": 2.4978266428023526, + "grad_norm": 27.7139133012027, + "learning_rate": 8.298009560337094e-07, + "loss": 0.9808, + "step": 29307 + }, + { + "epoch": 2.4979118724963776, + "grad_norm": 51.23869679900493, + "learning_rate": 8.295274151794985e-07, + "loss": 1.093, + "step": 29308 + }, + { + "epoch": 2.497997102190403, + "grad_norm": 82.78931932815327, + "learning_rate": 8.292539153402279e-07, + "loss": 1.2703, + "step": 29309 + }, + { + "epoch": 2.4980823318844285, + "grad_norm": 58.752484912845624, + "learning_rate": 8.289804565185883e-07, + "loss": 1.3869, + "step": 29310 + }, + { + "epoch": 2.498167561578454, + "grad_norm": 55.04450474882675, + "learning_rate": 8.287070387172669e-07, + "loss": 1.7985, + "step": 29311 + }, + { + "epoch": 2.4982527912724795, + "grad_norm": 34.27517174871705, + "learning_rate": 8.284336619389549e-07, + "loss": 0.9409, + "step": 29312 + }, + { + "epoch": 2.498338020966505, + "grad_norm": 54.65286829068402, + "learning_rate": 8.281603261863408e-07, + "loss": 1.5913, + "step": 29313 + }, + { + "epoch": 2.49842325066053, + "grad_norm": 66.54598279218034, + "learning_rate": 8.278870314621124e-07, + "loss": 2.2796, + "step": 29314 + }, + { + "epoch": 2.4985084803545554, + "grad_norm": 21.863213054246458, + "learning_rate": 8.276137777689569e-07, + "loss": 0.497, + "step": 29315 + }, + { + "epoch": 2.498593710048581, + "grad_norm": 48.961306819912664, + "learning_rate": 8.273405651095612e-07, + "loss": 1.4432, + "step": 29316 + }, + { + "epoch": 2.4986789397426064, + "grad_norm": 64.33905252287731, + "learning_rate": 8.270673934866142e-07, + "loss": 1.3689, + "step": 29317 + }, + { + "epoch": 2.498764169436632, + "grad_norm": 49.57333761178603, + "learning_rate": 8.267942629028009e-07, + "loss": 1.4276, + "step": 29318 + }, + { + "epoch": 2.4988493991306573, + "grad_norm": 71.52402144913329, + "learning_rate": 8.265211733608064e-07, + "loss": 1.9486, + "step": 29319 + }, + { + "epoch": 2.4989346288246823, + "grad_norm": 67.9233735962067, + "learning_rate": 8.262481248633181e-07, + "loss": 1.2866, + "step": 29320 + }, + { + "epoch": 2.499019858518708, + "grad_norm": 44.830163796047806, + "learning_rate": 8.259751174130226e-07, + "loss": 0.9764, + "step": 29321 + }, + { + "epoch": 2.4991050882127332, + "grad_norm": 31.14434729722182, + "learning_rate": 8.257021510126023e-07, + "loss": 1.2717, + "step": 29322 + }, + { + "epoch": 2.4991903179067587, + "grad_norm": 40.22928124767231, + "learning_rate": 8.254292256647434e-07, + "loss": 0.9867, + "step": 29323 + }, + { + "epoch": 2.499275547600784, + "grad_norm": 57.6500244407992, + "learning_rate": 8.251563413721292e-07, + "loss": 1.4469, + "step": 29324 + }, + { + "epoch": 2.4993607772948097, + "grad_norm": 47.84022976859997, + "learning_rate": 8.248834981374421e-07, + "loss": 1.0511, + "step": 29325 + }, + { + "epoch": 2.499446006988835, + "grad_norm": 61.323728560715395, + "learning_rate": 8.246106959633676e-07, + "loss": 1.2131, + "step": 29326 + }, + { + "epoch": 2.49953123668286, + "grad_norm": 57.13245098377897, + "learning_rate": 8.243379348525871e-07, + "loss": 1.2939, + "step": 29327 + }, + { + "epoch": 2.4996164663768856, + "grad_norm": 51.69456888392516, + "learning_rate": 8.240652148077849e-07, + "loss": 0.9758, + "step": 29328 + }, + { + "epoch": 2.499701696070911, + "grad_norm": 46.4848902451647, + "learning_rate": 8.237925358316401e-07, + "loss": 1.4355, + "step": 29329 + }, + { + "epoch": 2.4997869257649366, + "grad_norm": 82.49681554551435, + "learning_rate": 8.235198979268383e-07, + "loss": 1.6537, + "step": 29330 + }, + { + "epoch": 2.499872155458962, + "grad_norm": 44.62146737381864, + "learning_rate": 8.232473010960585e-07, + "loss": 0.8498, + "step": 29331 + }, + { + "epoch": 2.4999573851529875, + "grad_norm": 32.9955305227482, + "learning_rate": 8.229747453419817e-07, + "loss": 1.0199, + "step": 29332 + }, + { + "epoch": 2.500042614847013, + "grad_norm": 43.68107521384908, + "learning_rate": 8.227022306672872e-07, + "loss": 1.2018, + "step": 29333 + }, + { + "epoch": 2.500127844541038, + "grad_norm": 92.88297646038984, + "learning_rate": 8.224297570746576e-07, + "loss": 2.0599, + "step": 29334 + }, + { + "epoch": 2.5002130742350634, + "grad_norm": 17.641384548781975, + "learning_rate": 8.221573245667702e-07, + "loss": 0.5846, + "step": 29335 + }, + { + "epoch": 2.500298303929089, + "grad_norm": 50.23123458509414, + "learning_rate": 8.218849331463063e-07, + "loss": 1.4779, + "step": 29336 + }, + { + "epoch": 2.5003835336231144, + "grad_norm": 24.954676886593376, + "learning_rate": 8.216125828159432e-07, + "loss": 0.5752, + "step": 29337 + }, + { + "epoch": 2.50046876331714, + "grad_norm": 52.72221796110403, + "learning_rate": 8.21340273578361e-07, + "loss": 1.2486, + "step": 29338 + }, + { + "epoch": 2.500553993011165, + "grad_norm": 24.402551058375217, + "learning_rate": 8.210680054362368e-07, + "loss": 0.8416, + "step": 29339 + }, + { + "epoch": 2.5006392227051903, + "grad_norm": 79.15203301366353, + "learning_rate": 8.207957783922482e-07, + "loss": 2.1358, + "step": 29340 + }, + { + "epoch": 2.500724452399216, + "grad_norm": 50.34591804009169, + "learning_rate": 8.205235924490724e-07, + "loss": 1.5967, + "step": 29341 + }, + { + "epoch": 2.5008096820932413, + "grad_norm": 44.61638935670671, + "learning_rate": 8.202514476093853e-07, + "loss": 1.0851, + "step": 29342 + }, + { + "epoch": 2.5008949117872668, + "grad_norm": 43.380413312383055, + "learning_rate": 8.199793438758641e-07, + "loss": 0.9269, + "step": 29343 + }, + { + "epoch": 2.500980141481292, + "grad_norm": 185.15713717752297, + "learning_rate": 8.197072812511864e-07, + "loss": 2.8728, + "step": 29344 + }, + { + "epoch": 2.5010653711753177, + "grad_norm": 26.154709462454242, + "learning_rate": 8.194352597380267e-07, + "loss": 0.5123, + "step": 29345 + }, + { + "epoch": 2.5011506008693427, + "grad_norm": 26.852656001931447, + "learning_rate": 8.191632793390586e-07, + "loss": 1.3969, + "step": 29346 + }, + { + "epoch": 2.501235830563368, + "grad_norm": 47.44805059437996, + "learning_rate": 8.188913400569593e-07, + "loss": 1.6349, + "step": 29347 + }, + { + "epoch": 2.5013210602573936, + "grad_norm": 45.54041339392201, + "learning_rate": 8.186194418944027e-07, + "loss": 1.6553, + "step": 29348 + }, + { + "epoch": 2.501406289951419, + "grad_norm": 65.343724944173, + "learning_rate": 8.183475848540623e-07, + "loss": 1.5191, + "step": 29349 + }, + { + "epoch": 2.5014915196454446, + "grad_norm": 14.545720727943264, + "learning_rate": 8.180757689386104e-07, + "loss": 0.6147, + "step": 29350 + }, + { + "epoch": 2.5015767493394696, + "grad_norm": 213.79159060060948, + "learning_rate": 8.178039941507216e-07, + "loss": 2.256, + "step": 29351 + }, + { + "epoch": 2.5016619790334955, + "grad_norm": 22.186181674096364, + "learning_rate": 8.175322604930697e-07, + "loss": 0.9668, + "step": 29352 + }, + { + "epoch": 2.5017472087275205, + "grad_norm": 48.216054605125926, + "learning_rate": 8.172605679683266e-07, + "loss": 0.804, + "step": 29353 + }, + { + "epoch": 2.501832438421546, + "grad_norm": 40.99947733933156, + "learning_rate": 8.169889165791633e-07, + "loss": 1.1646, + "step": 29354 + }, + { + "epoch": 2.5019176681155715, + "grad_norm": 30.296576962936356, + "learning_rate": 8.167173063282507e-07, + "loss": 0.9123, + "step": 29355 + }, + { + "epoch": 2.502002897809597, + "grad_norm": 36.25152309023219, + "learning_rate": 8.164457372182627e-07, + "loss": 1.2977, + "step": 29356 + }, + { + "epoch": 2.5020881275036224, + "grad_norm": 41.90634346034099, + "learning_rate": 8.161742092518682e-07, + "loss": 1.4245, + "step": 29357 + }, + { + "epoch": 2.5021733571976474, + "grad_norm": 28.370472840117028, + "learning_rate": 8.159027224317362e-07, + "loss": 1.2144, + "step": 29358 + }, + { + "epoch": 2.502258586891673, + "grad_norm": 49.920078199033185, + "learning_rate": 8.15631276760539e-07, + "loss": 1.1458, + "step": 29359 + }, + { + "epoch": 2.5023438165856984, + "grad_norm": 63.51788767534117, + "learning_rate": 8.153598722409468e-07, + "loss": 1.1677, + "step": 29360 + }, + { + "epoch": 2.502429046279724, + "grad_norm": 25.767278242872486, + "learning_rate": 8.150885088756266e-07, + "loss": 0.7231, + "step": 29361 + }, + { + "epoch": 2.5025142759737493, + "grad_norm": 61.681015911999545, + "learning_rate": 8.148171866672488e-07, + "loss": 2.2583, + "step": 29362 + }, + { + "epoch": 2.502599505667775, + "grad_norm": 46.32923271063356, + "learning_rate": 8.145459056184796e-07, + "loss": 1.4028, + "step": 29363 + }, + { + "epoch": 2.5026847353618003, + "grad_norm": 25.658926558175118, + "learning_rate": 8.142746657319889e-07, + "loss": 0.6217, + "step": 29364 + }, + { + "epoch": 2.5027699650558253, + "grad_norm": 70.02543718249024, + "learning_rate": 8.140034670104441e-07, + "loss": 2.0763, + "step": 29365 + }, + { + "epoch": 2.5028551947498507, + "grad_norm": 35.15259756571976, + "learning_rate": 8.137323094565109e-07, + "loss": 1.3777, + "step": 29366 + }, + { + "epoch": 2.502940424443876, + "grad_norm": 45.430441492643936, + "learning_rate": 8.134611930728564e-07, + "loss": 1.2768, + "step": 29367 + }, + { + "epoch": 2.5030256541379017, + "grad_norm": 64.9062872366114, + "learning_rate": 8.131901178621492e-07, + "loss": 1.7355, + "step": 29368 + }, + { + "epoch": 2.503110883831927, + "grad_norm": 59.76378056817544, + "learning_rate": 8.129190838270534e-07, + "loss": 1.7979, + "step": 29369 + }, + { + "epoch": 2.503196113525952, + "grad_norm": 45.62378137414543, + "learning_rate": 8.126480909702344e-07, + "loss": 1.4623, + "step": 29370 + }, + { + "epoch": 2.503281343219978, + "grad_norm": 62.86326595033923, + "learning_rate": 8.123771392943574e-07, + "loss": 1.6575, + "step": 29371 + }, + { + "epoch": 2.503366572914003, + "grad_norm": 70.65622125786275, + "learning_rate": 8.121062288020865e-07, + "loss": 2.3793, + "step": 29372 + }, + { + "epoch": 2.5034518026080286, + "grad_norm": 71.57887949959647, + "learning_rate": 8.118353594960877e-07, + "loss": 1.3427, + "step": 29373 + }, + { + "epoch": 2.503537032302054, + "grad_norm": 41.45473325052057, + "learning_rate": 8.115645313790226e-07, + "loss": 1.8609, + "step": 29374 + }, + { + "epoch": 2.5036222619960795, + "grad_norm": 48.22833804533949, + "learning_rate": 8.112937444535574e-07, + "loss": 1.6077, + "step": 29375 + }, + { + "epoch": 2.503707491690105, + "grad_norm": 68.39159043712735, + "learning_rate": 8.110229987223528e-07, + "loss": 1.3389, + "step": 29376 + }, + { + "epoch": 2.50379272138413, + "grad_norm": 75.9011764259017, + "learning_rate": 8.107522941880736e-07, + "loss": 2.0105, + "step": 29377 + }, + { + "epoch": 2.5038779510781555, + "grad_norm": 61.136617341114714, + "learning_rate": 8.104816308533803e-07, + "loss": 1.4012, + "step": 29378 + }, + { + "epoch": 2.503963180772181, + "grad_norm": 65.09490090874988, + "learning_rate": 8.102110087209364e-07, + "loss": 1.7553, + "step": 29379 + }, + { + "epoch": 2.5040484104662064, + "grad_norm": 65.46339669703463, + "learning_rate": 8.099404277934003e-07, + "loss": 1.5076, + "step": 29380 + }, + { + "epoch": 2.504133640160232, + "grad_norm": 29.509923343705626, + "learning_rate": 8.096698880734366e-07, + "loss": 0.9156, + "step": 29381 + }, + { + "epoch": 2.5042188698542573, + "grad_norm": 51.38406556913226, + "learning_rate": 8.093993895637036e-07, + "loss": 1.5197, + "step": 29382 + }, + { + "epoch": 2.504304099548283, + "grad_norm": 64.07652574468246, + "learning_rate": 8.091289322668633e-07, + "loss": 1.5496, + "step": 29383 + }, + { + "epoch": 2.504389329242308, + "grad_norm": 57.83769853230189, + "learning_rate": 8.088585161855749e-07, + "loss": 1.5412, + "step": 29384 + }, + { + "epoch": 2.5044745589363333, + "grad_norm": 52.11612750248799, + "learning_rate": 8.085881413224961e-07, + "loss": 1.8652, + "step": 29385 + }, + { + "epoch": 2.5045597886303588, + "grad_norm": 26.87708670691274, + "learning_rate": 8.083178076802889e-07, + "loss": 0.6945, + "step": 29386 + }, + { + "epoch": 2.5046450183243842, + "grad_norm": 51.069876515679404, + "learning_rate": 8.080475152616102e-07, + "loss": 0.9017, + "step": 29387 + }, + { + "epoch": 2.5047302480184097, + "grad_norm": 66.69444340574285, + "learning_rate": 8.077772640691183e-07, + "loss": 1.3787, + "step": 29388 + }, + { + "epoch": 2.504815477712435, + "grad_norm": 74.23069287593096, + "learning_rate": 8.075070541054708e-07, + "loss": 1.2984, + "step": 29389 + }, + { + "epoch": 2.5049007074064606, + "grad_norm": 43.43550818545646, + "learning_rate": 8.072368853733247e-07, + "loss": 1.015, + "step": 29390 + }, + { + "epoch": 2.5049859371004857, + "grad_norm": 31.9081004494425, + "learning_rate": 8.069667578753393e-07, + "loss": 1.0651, + "step": 29391 + }, + { + "epoch": 2.505071166794511, + "grad_norm": 72.31750931344521, + "learning_rate": 8.066966716141694e-07, + "loss": 1.9241, + "step": 29392 + }, + { + "epoch": 2.5051563964885366, + "grad_norm": 55.39864801049523, + "learning_rate": 8.064266265924708e-07, + "loss": 1.5483, + "step": 29393 + }, + { + "epoch": 2.505241626182562, + "grad_norm": 27.13153474924174, + "learning_rate": 8.061566228129009e-07, + "loss": 1.0136, + "step": 29394 + }, + { + "epoch": 2.5053268558765875, + "grad_norm": 22.357997909777836, + "learning_rate": 8.05886660278114e-07, + "loss": 0.918, + "step": 29395 + }, + { + "epoch": 2.5054120855706126, + "grad_norm": 15.718069149845853, + "learning_rate": 8.056167389907654e-07, + "loss": 0.7122, + "step": 29396 + }, + { + "epoch": 2.505497315264638, + "grad_norm": 44.74982439375923, + "learning_rate": 8.053468589535079e-07, + "loss": 1.4486, + "step": 29397 + }, + { + "epoch": 2.5055825449586635, + "grad_norm": 58.987131543941686, + "learning_rate": 8.050770201689978e-07, + "loss": 1.7417, + "step": 29398 + }, + { + "epoch": 2.505667774652689, + "grad_norm": 28.171499030920586, + "learning_rate": 8.048072226398895e-07, + "loss": 1.1185, + "step": 29399 + }, + { + "epoch": 2.5057530043467144, + "grad_norm": 26.595267126744787, + "learning_rate": 8.045374663688349e-07, + "loss": 0.842, + "step": 29400 + }, + { + "epoch": 2.50583823404074, + "grad_norm": 24.949778906698857, + "learning_rate": 8.042677513584873e-07, + "loss": 0.668, + "step": 29401 + }, + { + "epoch": 2.5059234637347654, + "grad_norm": 38.10023276758758, + "learning_rate": 8.03998077611498e-07, + "loss": 1.0297, + "step": 29402 + }, + { + "epoch": 2.5060086934287904, + "grad_norm": 65.44716579992213, + "learning_rate": 8.037284451305211e-07, + "loss": 1.3553, + "step": 29403 + }, + { + "epoch": 2.506093923122816, + "grad_norm": 67.23951950632242, + "learning_rate": 8.034588539182081e-07, + "loss": 2.0658, + "step": 29404 + }, + { + "epoch": 2.5061791528168413, + "grad_norm": 52.81397002727868, + "learning_rate": 8.031893039772082e-07, + "loss": 1.5086, + "step": 29405 + }, + { + "epoch": 2.506264382510867, + "grad_norm": 71.02216732840917, + "learning_rate": 8.029197953101736e-07, + "loss": 1.8696, + "step": 29406 + }, + { + "epoch": 2.5063496122048923, + "grad_norm": 46.79176727543581, + "learning_rate": 8.026503279197567e-07, + "loss": 1.0043, + "step": 29407 + }, + { + "epoch": 2.5064348418989177, + "grad_norm": 21.995054604719346, + "learning_rate": 8.023809018086059e-07, + "loss": 0.9596, + "step": 29408 + }, + { + "epoch": 2.506520071592943, + "grad_norm": 37.86067182326578, + "learning_rate": 8.021115169793708e-07, + "loss": 0.8282, + "step": 29409 + }, + { + "epoch": 2.5066053012869682, + "grad_norm": 53.576097150874425, + "learning_rate": 8.018421734346993e-07, + "loss": 1.3297, + "step": 29410 + }, + { + "epoch": 2.5066905309809937, + "grad_norm": 49.22237270759476, + "learning_rate": 8.015728711772436e-07, + "loss": 1.335, + "step": 29411 + }, + { + "epoch": 2.506775760675019, + "grad_norm": 55.9076013079359, + "learning_rate": 8.013036102096505e-07, + "loss": 1.9549, + "step": 29412 + }, + { + "epoch": 2.5068609903690446, + "grad_norm": 44.33510625973092, + "learning_rate": 8.010343905345664e-07, + "loss": 1.6205, + "step": 29413 + }, + { + "epoch": 2.50694622006307, + "grad_norm": 40.55967485890285, + "learning_rate": 8.007652121546422e-07, + "loss": 1.4969, + "step": 29414 + }, + { + "epoch": 2.507031449757095, + "grad_norm": 45.166066165360675, + "learning_rate": 8.004960750725215e-07, + "loss": 1.4068, + "step": 29415 + }, + { + "epoch": 2.507116679451121, + "grad_norm": 64.73120285516724, + "learning_rate": 8.002269792908551e-07, + "loss": 1.7059, + "step": 29416 + }, + { + "epoch": 2.507201909145146, + "grad_norm": 62.20466598878023, + "learning_rate": 7.999579248122873e-07, + "loss": 1.6161, + "step": 29417 + }, + { + "epoch": 2.5072871388391715, + "grad_norm": 30.775785725605747, + "learning_rate": 7.996889116394646e-07, + "loss": 0.9557, + "step": 29418 + }, + { + "epoch": 2.507372368533197, + "grad_norm": 46.27812129141176, + "learning_rate": 7.994199397750313e-07, + "loss": 1.475, + "step": 29419 + }, + { + "epoch": 2.5074575982272225, + "grad_norm": 59.42322496960352, + "learning_rate": 7.991510092216343e-07, + "loss": 1.4398, + "step": 29420 + }, + { + "epoch": 2.507542827921248, + "grad_norm": 36.3043883337872, + "learning_rate": 7.988821199819175e-07, + "loss": 1.3343, + "step": 29421 + }, + { + "epoch": 2.507628057615273, + "grad_norm": 82.07690476825138, + "learning_rate": 7.98613272058526e-07, + "loss": 1.7913, + "step": 29422 + }, + { + "epoch": 2.5077132873092984, + "grad_norm": 64.586128715926, + "learning_rate": 7.983444654541028e-07, + "loss": 1.6981, + "step": 29423 + }, + { + "epoch": 2.507798517003324, + "grad_norm": 33.29737502975254, + "learning_rate": 7.980757001712935e-07, + "loss": 1.0881, + "step": 29424 + }, + { + "epoch": 2.5078837466973494, + "grad_norm": 34.56644352176474, + "learning_rate": 7.9780697621274e-07, + "loss": 1.2235, + "step": 29425 + }, + { + "epoch": 2.507968976391375, + "grad_norm": 66.59860757046351, + "learning_rate": 7.975382935810849e-07, + "loss": 1.8598, + "step": 29426 + }, + { + "epoch": 2.5080542060854003, + "grad_norm": 24.501299257874802, + "learning_rate": 7.972696522789697e-07, + "loss": 0.7366, + "step": 29427 + }, + { + "epoch": 2.5081394357794258, + "grad_norm": 65.26506004504404, + "learning_rate": 7.970010523090388e-07, + "loss": 1.8971, + "step": 29428 + }, + { + "epoch": 2.508224665473451, + "grad_norm": 66.13578275160758, + "learning_rate": 7.967324936739307e-07, + "loss": 1.38, + "step": 29429 + }, + { + "epoch": 2.5083098951674763, + "grad_norm": 71.48608233276775, + "learning_rate": 7.964639763762899e-07, + "loss": 1.4066, + "step": 29430 + }, + { + "epoch": 2.5083951248615017, + "grad_norm": 62.561211798841974, + "learning_rate": 7.961955004187555e-07, + "loss": 1.4583, + "step": 29431 + }, + { + "epoch": 2.508480354555527, + "grad_norm": 72.54903683639893, + "learning_rate": 7.959270658039669e-07, + "loss": 1.4285, + "step": 29432 + }, + { + "epoch": 2.5085655842495527, + "grad_norm": 41.1316760399849, + "learning_rate": 7.956586725345661e-07, + "loss": 1.1036, + "step": 29433 + }, + { + "epoch": 2.5086508139435777, + "grad_norm": 48.9188275617766, + "learning_rate": 7.953903206131919e-07, + "loss": 1.5144, + "step": 29434 + }, + { + "epoch": 2.5087360436376036, + "grad_norm": 46.70704787597404, + "learning_rate": 7.951220100424817e-07, + "loss": 1.3287, + "step": 29435 + }, + { + "epoch": 2.5088212733316286, + "grad_norm": 62.963529961224246, + "learning_rate": 7.948537408250767e-07, + "loss": 1.0956, + "step": 29436 + }, + { + "epoch": 2.508906503025654, + "grad_norm": 64.56881068852618, + "learning_rate": 7.945855129636132e-07, + "loss": 1.3199, + "step": 29437 + }, + { + "epoch": 2.5089917327196796, + "grad_norm": 29.39570689232843, + "learning_rate": 7.943173264607312e-07, + "loss": 0.7436, + "step": 29438 + }, + { + "epoch": 2.509076962413705, + "grad_norm": 44.58340173151457, + "learning_rate": 7.940491813190676e-07, + "loss": 1.133, + "step": 29439 + }, + { + "epoch": 2.5091621921077305, + "grad_norm": 69.70469194101021, + "learning_rate": 7.937810775412574e-07, + "loss": 1.7346, + "step": 29440 + }, + { + "epoch": 2.5092474218017555, + "grad_norm": 39.92331225504504, + "learning_rate": 7.935130151299402e-07, + "loss": 1.4437, + "step": 29441 + }, + { + "epoch": 2.509332651495781, + "grad_norm": 77.38197229848562, + "learning_rate": 7.932449940877506e-07, + "loss": 2.0576, + "step": 29442 + }, + { + "epoch": 2.5094178811898065, + "grad_norm": 46.767403618419564, + "learning_rate": 7.929770144173254e-07, + "loss": 1.7029, + "step": 29443 + }, + { + "epoch": 2.509503110883832, + "grad_norm": 75.36237619571783, + "learning_rate": 7.927090761212986e-07, + "loss": 2.9283, + "step": 29444 + }, + { + "epoch": 2.5095883405778574, + "grad_norm": 33.09497163329203, + "learning_rate": 7.924411792023057e-07, + "loss": 0.9751, + "step": 29445 + }, + { + "epoch": 2.509673570271883, + "grad_norm": 38.49526411975082, + "learning_rate": 7.921733236629831e-07, + "loss": 0.8597, + "step": 29446 + }, + { + "epoch": 2.5097587999659083, + "grad_norm": 66.42535487279797, + "learning_rate": 7.919055095059641e-07, + "loss": 1.1994, + "step": 29447 + }, + { + "epoch": 2.5098440296599334, + "grad_norm": 60.59174881648292, + "learning_rate": 7.916377367338818e-07, + "loss": 1.6296, + "step": 29448 + }, + { + "epoch": 2.509929259353959, + "grad_norm": 75.02373857610058, + "learning_rate": 7.913700053493694e-07, + "loss": 1.3565, + "step": 29449 + }, + { + "epoch": 2.5100144890479843, + "grad_norm": 69.05648107448563, + "learning_rate": 7.911023153550618e-07, + "loss": 1.8476, + "step": 29450 + }, + { + "epoch": 2.5100997187420098, + "grad_norm": 65.13426127147206, + "learning_rate": 7.908346667535905e-07, + "loss": 1.6556, + "step": 29451 + }, + { + "epoch": 2.5101849484360352, + "grad_norm": 50.23185361624307, + "learning_rate": 7.905670595475862e-07, + "loss": 1.1477, + "step": 29452 + }, + { + "epoch": 2.5102701781300603, + "grad_norm": 47.64667243356859, + "learning_rate": 7.902994937396824e-07, + "loss": 1.3333, + "step": 29453 + }, + { + "epoch": 2.510355407824086, + "grad_norm": 28.50854544227958, + "learning_rate": 7.900319693325114e-07, + "loss": 1.1932, + "step": 29454 + }, + { + "epoch": 2.510440637518111, + "grad_norm": 31.631732156217513, + "learning_rate": 7.89764486328703e-07, + "loss": 1.0525, + "step": 29455 + }, + { + "epoch": 2.5105258672121367, + "grad_norm": 37.11604940558518, + "learning_rate": 7.894970447308881e-07, + "loss": 1.5774, + "step": 29456 + }, + { + "epoch": 2.510611096906162, + "grad_norm": 45.18530685945454, + "learning_rate": 7.892296445416969e-07, + "loss": 1.5698, + "step": 29457 + }, + { + "epoch": 2.5106963266001876, + "grad_norm": 34.358104775786316, + "learning_rate": 7.889622857637574e-07, + "loss": 1.2216, + "step": 29458 + }, + { + "epoch": 2.510781556294213, + "grad_norm": 33.3845527470886, + "learning_rate": 7.88694968399702e-07, + "loss": 0.9909, + "step": 29459 + }, + { + "epoch": 2.510866785988238, + "grad_norm": 59.18722802056508, + "learning_rate": 7.884276924521567e-07, + "loss": 2.0298, + "step": 29460 + }, + { + "epoch": 2.5109520156822636, + "grad_norm": 63.07176440360575, + "learning_rate": 7.881604579237528e-07, + "loss": 1.1927, + "step": 29461 + }, + { + "epoch": 2.511037245376289, + "grad_norm": 54.134722607178034, + "learning_rate": 7.878932648171161e-07, + "loss": 1.4585, + "step": 29462 + }, + { + "epoch": 2.5111224750703145, + "grad_norm": 43.8863825992538, + "learning_rate": 7.876261131348762e-07, + "loss": 1.2823, + "step": 29463 + }, + { + "epoch": 2.51120770476434, + "grad_norm": 80.8103784958909, + "learning_rate": 7.873590028796601e-07, + "loss": 2.1064, + "step": 29464 + }, + { + "epoch": 2.5112929344583654, + "grad_norm": 43.1005935737451, + "learning_rate": 7.870919340540939e-07, + "loss": 1.5851, + "step": 29465 + }, + { + "epoch": 2.511378164152391, + "grad_norm": 29.25072721136672, + "learning_rate": 7.868249066608036e-07, + "loss": 0.8891, + "step": 29466 + }, + { + "epoch": 2.511463393846416, + "grad_norm": 58.32929930507621, + "learning_rate": 7.865579207024177e-07, + "loss": 1.6009, + "step": 29467 + }, + { + "epoch": 2.5115486235404414, + "grad_norm": 60.070526954901105, + "learning_rate": 7.862909761815585e-07, + "loss": 1.376, + "step": 29468 + }, + { + "epoch": 2.511633853234467, + "grad_norm": 28.062655741878825, + "learning_rate": 7.860240731008545e-07, + "loss": 0.8849, + "step": 29469 + }, + { + "epoch": 2.5117190829284923, + "grad_norm": 85.97429198125185, + "learning_rate": 7.857572114629297e-07, + "loss": 1.8208, + "step": 29470 + }, + { + "epoch": 2.511804312622518, + "grad_norm": 20.842424471044254, + "learning_rate": 7.854903912704065e-07, + "loss": 0.5431, + "step": 29471 + }, + { + "epoch": 2.511889542316543, + "grad_norm": 63.437990799309425, + "learning_rate": 7.852236125259122e-07, + "loss": 2.0314, + "step": 29472 + }, + { + "epoch": 2.5119747720105687, + "grad_norm": 58.42149106526676, + "learning_rate": 7.849568752320686e-07, + "loss": 1.1825, + "step": 29473 + }, + { + "epoch": 2.5120600017045938, + "grad_norm": 52.330681995198056, + "learning_rate": 7.846901793914985e-07, + "loss": 1.3141, + "step": 29474 + }, + { + "epoch": 2.5121452313986192, + "grad_norm": 39.9284202365226, + "learning_rate": 7.844235250068261e-07, + "loss": 1.2296, + "step": 29475 + }, + { + "epoch": 2.5122304610926447, + "grad_norm": 72.0362280095073, + "learning_rate": 7.841569120806725e-07, + "loss": 1.6265, + "step": 29476 + }, + { + "epoch": 2.51231569078667, + "grad_norm": 28.866144367175092, + "learning_rate": 7.838903406156612e-07, + "loss": 0.7096, + "step": 29477 + }, + { + "epoch": 2.5124009204806956, + "grad_norm": 40.62785646423981, + "learning_rate": 7.836238106144134e-07, + "loss": 1.0427, + "step": 29478 + }, + { + "epoch": 2.5124861501747207, + "grad_norm": 56.226274734645024, + "learning_rate": 7.833573220795487e-07, + "loss": 1.5265, + "step": 29479 + }, + { + "epoch": 2.512571379868746, + "grad_norm": 46.34049997131524, + "learning_rate": 7.830908750136901e-07, + "loss": 1.0818, + "step": 29480 + }, + { + "epoch": 2.5126566095627716, + "grad_norm": 74.59945837616901, + "learning_rate": 7.828244694194576e-07, + "loss": 2.0522, + "step": 29481 + }, + { + "epoch": 2.512741839256797, + "grad_norm": 66.55963582397129, + "learning_rate": 7.82558105299469e-07, + "loss": 1.5622, + "step": 29482 + }, + { + "epoch": 2.5128270689508225, + "grad_norm": 15.91907917787066, + "learning_rate": 7.822917826563476e-07, + "loss": 0.791, + "step": 29483 + }, + { + "epoch": 2.512912298644848, + "grad_norm": 38.292753023354436, + "learning_rate": 7.820255014927086e-07, + "loss": 1.3315, + "step": 29484 + }, + { + "epoch": 2.5129975283388735, + "grad_norm": 41.25163240684081, + "learning_rate": 7.817592618111741e-07, + "loss": 1.2718, + "step": 29485 + }, + { + "epoch": 2.5130827580328985, + "grad_norm": 35.09887738973553, + "learning_rate": 7.814930636143613e-07, + "loss": 0.9417, + "step": 29486 + }, + { + "epoch": 2.513167987726924, + "grad_norm": 36.68064349403243, + "learning_rate": 7.812269069048877e-07, + "loss": 1.2113, + "step": 29487 + }, + { + "epoch": 2.5132532174209494, + "grad_norm": 37.88848000868579, + "learning_rate": 7.809607916853701e-07, + "loss": 1.0086, + "step": 29488 + }, + { + "epoch": 2.513338447114975, + "grad_norm": 60.44157648754047, + "learning_rate": 7.806947179584278e-07, + "loss": 1.2919, + "step": 29489 + }, + { + "epoch": 2.5134236768090004, + "grad_norm": 49.391632581604725, + "learning_rate": 7.804286857266763e-07, + "loss": 1.273, + "step": 29490 + }, + { + "epoch": 2.5135089065030254, + "grad_norm": 25.822254984287355, + "learning_rate": 7.80162694992731e-07, + "loss": 0.6956, + "step": 29491 + }, + { + "epoch": 2.5135941361970513, + "grad_norm": 59.92954122444424, + "learning_rate": 7.798967457592089e-07, + "loss": 1.2037, + "step": 29492 + }, + { + "epoch": 2.5136793658910763, + "grad_norm": 44.45855474545541, + "learning_rate": 7.796308380287259e-07, + "loss": 0.9398, + "step": 29493 + }, + { + "epoch": 2.513764595585102, + "grad_norm": 80.0226590427937, + "learning_rate": 7.79364971803897e-07, + "loss": 1.7327, + "step": 29494 + }, + { + "epoch": 2.5138498252791273, + "grad_norm": 75.65486100063302, + "learning_rate": 7.790991470873366e-07, + "loss": 1.8041, + "step": 29495 + }, + { + "epoch": 2.5139350549731527, + "grad_norm": 43.64943986502438, + "learning_rate": 7.788333638816575e-07, + "loss": 1.3344, + "step": 29496 + }, + { + "epoch": 2.514020284667178, + "grad_norm": 23.48390006142306, + "learning_rate": 7.785676221894761e-07, + "loss": 0.9992, + "step": 29497 + }, + { + "epoch": 2.514105514361203, + "grad_norm": 56.3445263207796, + "learning_rate": 7.783019220134053e-07, + "loss": 1.5876, + "step": 29498 + }, + { + "epoch": 2.5141907440552287, + "grad_norm": 48.68238584793543, + "learning_rate": 7.780362633560556e-07, + "loss": 1.2915, + "step": 29499 + }, + { + "epoch": 2.514275973749254, + "grad_norm": 60.440281920508156, + "learning_rate": 7.77770646220043e-07, + "loss": 1.9396, + "step": 29500 + }, + { + "epoch": 2.5143612034432796, + "grad_norm": 16.240823153678807, + "learning_rate": 7.775050706079773e-07, + "loss": 0.6837, + "step": 29501 + }, + { + "epoch": 2.514446433137305, + "grad_norm": 23.60746298781221, + "learning_rate": 7.772395365224728e-07, + "loss": 0.8678, + "step": 29502 + }, + { + "epoch": 2.5145316628313306, + "grad_norm": 62.42221319550487, + "learning_rate": 7.769740439661388e-07, + "loss": 1.4559, + "step": 29503 + }, + { + "epoch": 2.514616892525356, + "grad_norm": 56.14983804452908, + "learning_rate": 7.767085929415874e-07, + "loss": 1.7129, + "step": 29504 + }, + { + "epoch": 2.514702122219381, + "grad_norm": 50.83792868400029, + "learning_rate": 7.764431834514274e-07, + "loss": 1.5477, + "step": 29505 + }, + { + "epoch": 2.5147873519134065, + "grad_norm": 86.74598995434326, + "learning_rate": 7.761778154982719e-07, + "loss": 2.192, + "step": 29506 + }, + { + "epoch": 2.514872581607432, + "grad_norm": 38.89674494924602, + "learning_rate": 7.75912489084728e-07, + "loss": 1.2592, + "step": 29507 + }, + { + "epoch": 2.5149578113014575, + "grad_norm": 64.92603116918285, + "learning_rate": 7.756472042134078e-07, + "loss": 1.757, + "step": 29508 + }, + { + "epoch": 2.515043040995483, + "grad_norm": 53.0425573096401, + "learning_rate": 7.753819608869168e-07, + "loss": 1.4339, + "step": 29509 + }, + { + "epoch": 2.515128270689508, + "grad_norm": 80.50617262133146, + "learning_rate": 7.751167591078673e-07, + "loss": 1.9611, + "step": 29510 + }, + { + "epoch": 2.515213500383534, + "grad_norm": 46.111241093882875, + "learning_rate": 7.748515988788651e-07, + "loss": 1.7853, + "step": 29511 + }, + { + "epoch": 2.515298730077559, + "grad_norm": 77.93762854919794, + "learning_rate": 7.745864802025194e-07, + "loss": 1.4707, + "step": 29512 + }, + { + "epoch": 2.5153839597715844, + "grad_norm": 37.581262929567536, + "learning_rate": 7.74321403081435e-07, + "loss": 0.7593, + "step": 29513 + }, + { + "epoch": 2.51546918946561, + "grad_norm": 58.52068313217542, + "learning_rate": 7.740563675182212e-07, + "loss": 1.5045, + "step": 29514 + }, + { + "epoch": 2.5155544191596353, + "grad_norm": 79.0648221639244, + "learning_rate": 7.737913735154833e-07, + "loss": 2.3218, + "step": 29515 + }, + { + "epoch": 2.5156396488536608, + "grad_norm": 15.442881322635019, + "learning_rate": 7.735264210758286e-07, + "loss": 0.468, + "step": 29516 + }, + { + "epoch": 2.515724878547686, + "grad_norm": 43.25782622363528, + "learning_rate": 7.732615102018626e-07, + "loss": 1.2767, + "step": 29517 + }, + { + "epoch": 2.5158101082417113, + "grad_norm": 65.81333184982084, + "learning_rate": 7.729966408961886e-07, + "loss": 1.7188, + "step": 29518 + }, + { + "epoch": 2.5158953379357367, + "grad_norm": 37.458661873395094, + "learning_rate": 7.727318131614142e-07, + "loss": 0.9245, + "step": 29519 + }, + { + "epoch": 2.515980567629762, + "grad_norm": 53.45294650255004, + "learning_rate": 7.724670270001428e-07, + "loss": 1.5631, + "step": 29520 + }, + { + "epoch": 2.5160657973237877, + "grad_norm": 57.08081890631364, + "learning_rate": 7.722022824149766e-07, + "loss": 1.4489, + "step": 29521 + }, + { + "epoch": 2.516151027017813, + "grad_norm": 61.21481904251322, + "learning_rate": 7.719375794085226e-07, + "loss": 1.5079, + "step": 29522 + }, + { + "epoch": 2.5162362567118386, + "grad_norm": 35.55668601825511, + "learning_rate": 7.716729179833815e-07, + "loss": 0.8916, + "step": 29523 + }, + { + "epoch": 2.5163214864058636, + "grad_norm": 41.14762333968979, + "learning_rate": 7.714082981421578e-07, + "loss": 1.381, + "step": 29524 + }, + { + "epoch": 2.516406716099889, + "grad_norm": 30.46241333754027, + "learning_rate": 7.711437198874539e-07, + "loss": 1.3134, + "step": 29525 + }, + { + "epoch": 2.5164919457939146, + "grad_norm": 71.25225624712769, + "learning_rate": 7.708791832218692e-07, + "loss": 1.6481, + "step": 29526 + }, + { + "epoch": 2.51657717548794, + "grad_norm": 49.08250516821113, + "learning_rate": 7.706146881480086e-07, + "loss": 1.5665, + "step": 29527 + }, + { + "epoch": 2.5166624051819655, + "grad_norm": 54.77881915890952, + "learning_rate": 7.703502346684721e-07, + "loss": 1.4179, + "step": 29528 + }, + { + "epoch": 2.516747634875991, + "grad_norm": 54.774941388816494, + "learning_rate": 7.700858227858593e-07, + "loss": 0.9002, + "step": 29529 + }, + { + "epoch": 2.5168328645700164, + "grad_norm": 32.1923891337153, + "learning_rate": 7.698214525027725e-07, + "loss": 0.8077, + "step": 29530 + }, + { + "epoch": 2.5169180942640415, + "grad_norm": 67.18609371459168, + "learning_rate": 7.695571238218097e-07, + "loss": 1.6988, + "step": 29531 + }, + { + "epoch": 2.517003323958067, + "grad_norm": 35.815551005706126, + "learning_rate": 7.692928367455732e-07, + "loss": 0.9977, + "step": 29532 + }, + { + "epoch": 2.5170885536520924, + "grad_norm": 30.887199452303317, + "learning_rate": 7.690285912766598e-07, + "loss": 1.1507, + "step": 29533 + }, + { + "epoch": 2.517173783346118, + "grad_norm": 32.739629844774036, + "learning_rate": 7.687643874176692e-07, + "loss": 0.8944, + "step": 29534 + }, + { + "epoch": 2.5172590130401433, + "grad_norm": 54.31899565710797, + "learning_rate": 7.685002251711987e-07, + "loss": 1.0184, + "step": 29535 + }, + { + "epoch": 2.5173442427341683, + "grad_norm": 38.096179749565806, + "learning_rate": 7.682361045398479e-07, + "loss": 1.0956, + "step": 29536 + }, + { + "epoch": 2.5174294724281943, + "grad_norm": 92.27277912836549, + "learning_rate": 7.679720255262136e-07, + "loss": 1.2066, + "step": 29537 + }, + { + "epoch": 2.5175147021222193, + "grad_norm": 29.340291703718588, + "learning_rate": 7.677079881328919e-07, + "loss": 1.0361, + "step": 29538 + }, + { + "epoch": 2.5175999318162448, + "grad_norm": 46.51227596719353, + "learning_rate": 7.674439923624799e-07, + "loss": 1.3608, + "step": 29539 + }, + { + "epoch": 2.51768516151027, + "grad_norm": 24.87358800411689, + "learning_rate": 7.671800382175759e-07, + "loss": 0.5959, + "step": 29540 + }, + { + "epoch": 2.5177703912042957, + "grad_norm": 64.25837643436469, + "learning_rate": 7.66916125700774e-07, + "loss": 1.8893, + "step": 29541 + }, + { + "epoch": 2.517855620898321, + "grad_norm": 91.07545518891892, + "learning_rate": 7.666522548146699e-07, + "loss": 1.5712, + "step": 29542 + }, + { + "epoch": 2.517940850592346, + "grad_norm": 40.05139215489982, + "learning_rate": 7.663884255618592e-07, + "loss": 1.5021, + "step": 29543 + }, + { + "epoch": 2.5180260802863716, + "grad_norm": 41.62318647428104, + "learning_rate": 7.661246379449345e-07, + "loss": 1.1499, + "step": 29544 + }, + { + "epoch": 2.518111309980397, + "grad_norm": 67.03192368640921, + "learning_rate": 7.658608919664928e-07, + "loss": 1.7498, + "step": 29545 + }, + { + "epoch": 2.5181965396744226, + "grad_norm": 44.65609067875513, + "learning_rate": 7.655971876291258e-07, + "loss": 1.2417, + "step": 29546 + }, + { + "epoch": 2.518281769368448, + "grad_norm": 47.860357902218375, + "learning_rate": 7.653335249354282e-07, + "loss": 1.312, + "step": 29547 + }, + { + "epoch": 2.5183669990624735, + "grad_norm": 20.391520987452672, + "learning_rate": 7.650699038879922e-07, + "loss": 0.5935, + "step": 29548 + }, + { + "epoch": 2.518452228756499, + "grad_norm": 59.396062333062176, + "learning_rate": 7.648063244894122e-07, + "loss": 1.6218, + "step": 29549 + }, + { + "epoch": 2.518537458450524, + "grad_norm": 40.86503300776793, + "learning_rate": 7.645427867422789e-07, + "loss": 1.0688, + "step": 29550 + }, + { + "epoch": 2.5186226881445495, + "grad_norm": 30.991271913282787, + "learning_rate": 7.642792906491847e-07, + "loss": 0.8596, + "step": 29551 + }, + { + "epoch": 2.518707917838575, + "grad_norm": 28.898014207627206, + "learning_rate": 7.640158362127187e-07, + "loss": 1.2901, + "step": 29552 + }, + { + "epoch": 2.5187931475326004, + "grad_norm": 42.71180724218771, + "learning_rate": 7.637524234354754e-07, + "loss": 1.5031, + "step": 29553 + }, + { + "epoch": 2.518878377226626, + "grad_norm": 37.96524545056351, + "learning_rate": 7.634890523200423e-07, + "loss": 1.2333, + "step": 29554 + }, + { + "epoch": 2.518963606920651, + "grad_norm": 71.82294598115915, + "learning_rate": 7.632257228690121e-07, + "loss": 2.7754, + "step": 29555 + }, + { + "epoch": 2.519048836614677, + "grad_norm": 17.031980749688426, + "learning_rate": 7.629624350849724e-07, + "loss": 0.76, + "step": 29556 + }, + { + "epoch": 2.519134066308702, + "grad_norm": 34.41505986421747, + "learning_rate": 7.626991889705143e-07, + "loss": 1.2282, + "step": 29557 + }, + { + "epoch": 2.5192192960027273, + "grad_norm": 26.912042226872458, + "learning_rate": 7.624359845282265e-07, + "loss": 0.884, + "step": 29558 + }, + { + "epoch": 2.519304525696753, + "grad_norm": 38.56153763225551, + "learning_rate": 7.621728217606966e-07, + "loss": 1.1407, + "step": 29559 + }, + { + "epoch": 2.5193897553907783, + "grad_norm": 68.33395204013694, + "learning_rate": 7.619097006705117e-07, + "loss": 1.4633, + "step": 29560 + }, + { + "epoch": 2.5194749850848037, + "grad_norm": 42.05981766386685, + "learning_rate": 7.616466212602625e-07, + "loss": 1.5852, + "step": 29561 + }, + { + "epoch": 2.5195602147788287, + "grad_norm": 77.01912355983738, + "learning_rate": 7.613835835325329e-07, + "loss": 1.9764, + "step": 29562 + }, + { + "epoch": 2.519645444472854, + "grad_norm": 36.89875766500649, + "learning_rate": 7.61120587489913e-07, + "loss": 1.0334, + "step": 29563 + }, + { + "epoch": 2.5197306741668797, + "grad_norm": 81.71027814381615, + "learning_rate": 7.608576331349877e-07, + "loss": 1.7453, + "step": 29564 + }, + { + "epoch": 2.519815903860905, + "grad_norm": 34.61069482689439, + "learning_rate": 7.605947204703423e-07, + "loss": 1.2933, + "step": 29565 + }, + { + "epoch": 2.5199011335549306, + "grad_norm": 32.08553042024912, + "learning_rate": 7.60331849498564e-07, + "loss": 0.9466, + "step": 29566 + }, + { + "epoch": 2.519986363248956, + "grad_norm": 30.287231620653838, + "learning_rate": 7.600690202222372e-07, + "loss": 0.7309, + "step": 29567 + }, + { + "epoch": 2.5200715929429816, + "grad_norm": 49.41779142337952, + "learning_rate": 7.59806232643946e-07, + "loss": 0.9639, + "step": 29568 + }, + { + "epoch": 2.5201568226370066, + "grad_norm": 47.69792537228051, + "learning_rate": 7.595434867662766e-07, + "loss": 1.507, + "step": 29569 + }, + { + "epoch": 2.520242052331032, + "grad_norm": 41.45660046090008, + "learning_rate": 7.592807825918108e-07, + "loss": 0.8493, + "step": 29570 + }, + { + "epoch": 2.5203272820250575, + "grad_norm": 39.51292793208899, + "learning_rate": 7.590181201231345e-07, + "loss": 1.0352, + "step": 29571 + }, + { + "epoch": 2.520412511719083, + "grad_norm": 28.02537518712521, + "learning_rate": 7.587554993628299e-07, + "loss": 0.8224, + "step": 29572 + }, + { + "epoch": 2.5204977414131085, + "grad_norm": 42.579821120372245, + "learning_rate": 7.584929203134794e-07, + "loss": 1.5924, + "step": 29573 + }, + { + "epoch": 2.5205829711071335, + "grad_norm": 37.74303533980462, + "learning_rate": 7.582303829776649e-07, + "loss": 1.2638, + "step": 29574 + }, + { + "epoch": 2.5206682008011594, + "grad_norm": 44.074548953475094, + "learning_rate": 7.579678873579698e-07, + "loss": 1.0289, + "step": 29575 + }, + { + "epoch": 2.5207534304951844, + "grad_norm": 67.42584787489308, + "learning_rate": 7.577054334569745e-07, + "loss": 1.3543, + "step": 29576 + }, + { + "epoch": 2.52083866018921, + "grad_norm": 33.944697860873454, + "learning_rate": 7.57443021277261e-07, + "loss": 0.8685, + "step": 29577 + }, + { + "epoch": 2.5209238898832353, + "grad_norm": 49.41596158827773, + "learning_rate": 7.571806508214085e-07, + "loss": 0.9764, + "step": 29578 + }, + { + "epoch": 2.521009119577261, + "grad_norm": 42.841411358905646, + "learning_rate": 7.569183220919995e-07, + "loss": 1.3048, + "step": 29579 + }, + { + "epoch": 2.5210943492712863, + "grad_norm": 25.851697801597417, + "learning_rate": 7.56656035091613e-07, + "loss": 0.9775, + "step": 29580 + }, + { + "epoch": 2.5211795789653113, + "grad_norm": 71.41789939483415, + "learning_rate": 7.563937898228279e-07, + "loss": 1.6567, + "step": 29581 + }, + { + "epoch": 2.5212648086593368, + "grad_norm": 21.87908750805681, + "learning_rate": 7.561315862882229e-07, + "loss": 0.9035, + "step": 29582 + }, + { + "epoch": 2.5213500383533622, + "grad_norm": 42.65858848181034, + "learning_rate": 7.558694244903786e-07, + "loss": 1.2398, + "step": 29583 + }, + { + "epoch": 2.5214352680473877, + "grad_norm": 61.3123369763504, + "learning_rate": 7.556073044318718e-07, + "loss": 1.5774, + "step": 29584 + }, + { + "epoch": 2.521520497741413, + "grad_norm": 44.64336814570408, + "learning_rate": 7.553452261152794e-07, + "loss": 0.9512, + "step": 29585 + }, + { + "epoch": 2.5216057274354386, + "grad_norm": 21.12942734140567, + "learning_rate": 7.550831895431799e-07, + "loss": 0.7139, + "step": 29586 + }, + { + "epoch": 2.521690957129464, + "grad_norm": 41.207527071170944, + "learning_rate": 7.548211947181517e-07, + "loss": 1.2424, + "step": 29587 + }, + { + "epoch": 2.521776186823489, + "grad_norm": 28.63075034555398, + "learning_rate": 7.545592416427705e-07, + "loss": 1.5632, + "step": 29588 + }, + { + "epoch": 2.5218614165175146, + "grad_norm": 38.6002418799879, + "learning_rate": 7.542973303196122e-07, + "loss": 0.7377, + "step": 29589 + }, + { + "epoch": 2.52194664621154, + "grad_norm": 92.10738674400821, + "learning_rate": 7.540354607512524e-07, + "loss": 2.1915, + "step": 29590 + }, + { + "epoch": 2.5220318759055655, + "grad_norm": 72.08487517751935, + "learning_rate": 7.537736329402651e-07, + "loss": 1.9289, + "step": 29591 + }, + { + "epoch": 2.522117105599591, + "grad_norm": 64.8678043170857, + "learning_rate": 7.535118468892283e-07, + "loss": 2.1996, + "step": 29592 + }, + { + "epoch": 2.522202335293616, + "grad_norm": 56.98913203016199, + "learning_rate": 7.532501026007144e-07, + "loss": 1.8086, + "step": 29593 + }, + { + "epoch": 2.522287564987642, + "grad_norm": 33.32550731888035, + "learning_rate": 7.529884000772992e-07, + "loss": 1.2335, + "step": 29594 + }, + { + "epoch": 2.522372794681667, + "grad_norm": 43.817470069055126, + "learning_rate": 7.527267393215537e-07, + "loss": 1.3635, + "step": 29595 + }, + { + "epoch": 2.5224580243756924, + "grad_norm": 44.07346706742296, + "learning_rate": 7.52465120336055e-07, + "loss": 1.6949, + "step": 29596 + }, + { + "epoch": 2.522543254069718, + "grad_norm": 61.601930749258884, + "learning_rate": 7.522035431233732e-07, + "loss": 0.9369, + "step": 29597 + }, + { + "epoch": 2.5226284837637434, + "grad_norm": 29.61615122183369, + "learning_rate": 7.519420076860823e-07, + "loss": 1.2818, + "step": 29598 + }, + { + "epoch": 2.522713713457769, + "grad_norm": 76.15598563920575, + "learning_rate": 7.516805140267525e-07, + "loss": 2.437, + "step": 29599 + }, + { + "epoch": 2.522798943151794, + "grad_norm": 37.6340285857387, + "learning_rate": 7.514190621479578e-07, + "loss": 0.9765, + "step": 29600 + }, + { + "epoch": 2.5228841728458193, + "grad_norm": 46.338188073046496, + "learning_rate": 7.511576520522673e-07, + "loss": 1.2924, + "step": 29601 + }, + { + "epoch": 2.522969402539845, + "grad_norm": 61.23404781240674, + "learning_rate": 7.50896283742254e-07, + "loss": 1.322, + "step": 29602 + }, + { + "epoch": 2.5230546322338703, + "grad_norm": 27.551063861513253, + "learning_rate": 7.506349572204874e-07, + "loss": 1.0401, + "step": 29603 + }, + { + "epoch": 2.5231398619278957, + "grad_norm": 53.00551212283785, + "learning_rate": 7.503736724895361e-07, + "loss": 0.819, + "step": 29604 + }, + { + "epoch": 2.523225091621921, + "grad_norm": 67.41139655146752, + "learning_rate": 7.501124295519724e-07, + "loss": 1.6523, + "step": 29605 + }, + { + "epoch": 2.5233103213159467, + "grad_norm": 25.75721282890138, + "learning_rate": 7.498512284103643e-07, + "loss": 0.9417, + "step": 29606 + }, + { + "epoch": 2.5233955510099717, + "grad_norm": 32.65736039053768, + "learning_rate": 7.495900690672792e-07, + "loss": 1.1569, + "step": 29607 + }, + { + "epoch": 2.523480780703997, + "grad_norm": 64.62398572562694, + "learning_rate": 7.493289515252883e-07, + "loss": 1.6115, + "step": 29608 + }, + { + "epoch": 2.5235660103980226, + "grad_norm": 51.145563739185526, + "learning_rate": 7.490678757869568e-07, + "loss": 1.6384, + "step": 29609 + }, + { + "epoch": 2.523651240092048, + "grad_norm": 50.36309487734709, + "learning_rate": 7.488068418548544e-07, + "loss": 1.0747, + "step": 29610 + }, + { + "epoch": 2.5237364697860736, + "grad_norm": 69.651419943992, + "learning_rate": 7.485458497315478e-07, + "loss": 1.8337, + "step": 29611 + }, + { + "epoch": 2.5238216994800986, + "grad_norm": 56.34687354330464, + "learning_rate": 7.482848994196024e-07, + "loss": 1.6182, + "step": 29612 + }, + { + "epoch": 2.5239069291741245, + "grad_norm": 44.27697143321188, + "learning_rate": 7.480239909215864e-07, + "loss": 1.2318, + "step": 29613 + }, + { + "epoch": 2.5239921588681495, + "grad_norm": 61.793315214288675, + "learning_rate": 7.477631242400651e-07, + "loss": 1.2287, + "step": 29614 + }, + { + "epoch": 2.524077388562175, + "grad_norm": 64.47506631494262, + "learning_rate": 7.475022993776026e-07, + "loss": 1.4041, + "step": 29615 + }, + { + "epoch": 2.5241626182562005, + "grad_norm": 55.388213630102385, + "learning_rate": 7.472415163367664e-07, + "loss": 1.8087, + "step": 29616 + }, + { + "epoch": 2.524247847950226, + "grad_norm": 31.122865582118582, + "learning_rate": 7.469807751201185e-07, + "loss": 0.9002, + "step": 29617 + }, + { + "epoch": 2.5243330776442514, + "grad_norm": 37.23290354792614, + "learning_rate": 7.467200757302262e-07, + "loss": 1.4952, + "step": 29618 + }, + { + "epoch": 2.5244183073382764, + "grad_norm": 31.163427942254803, + "learning_rate": 7.464594181696516e-07, + "loss": 1.3427, + "step": 29619 + }, + { + "epoch": 2.524503537032302, + "grad_norm": 32.226056645272834, + "learning_rate": 7.461988024409588e-07, + "loss": 0.8646, + "step": 29620 + }, + { + "epoch": 2.5245887667263274, + "grad_norm": 44.859148896285944, + "learning_rate": 7.459382285467087e-07, + "loss": 1.3769, + "step": 29621 + }, + { + "epoch": 2.524673996420353, + "grad_norm": 61.95714993204636, + "learning_rate": 7.456776964894674e-07, + "loss": 1.8647, + "step": 29622 + }, + { + "epoch": 2.5247592261143783, + "grad_norm": 65.84875051395203, + "learning_rate": 7.454172062717941e-07, + "loss": 1.32, + "step": 29623 + }, + { + "epoch": 2.5248444558084038, + "grad_norm": 58.2777111858932, + "learning_rate": 7.451567578962532e-07, + "loss": 1.5264, + "step": 29624 + }, + { + "epoch": 2.5249296855024292, + "grad_norm": 32.32565730434712, + "learning_rate": 7.448963513654039e-07, + "loss": 1.0825, + "step": 29625 + }, + { + "epoch": 2.5250149151964543, + "grad_norm": 43.7931044770989, + "learning_rate": 7.446359866818092e-07, + "loss": 1.0155, + "step": 29626 + }, + { + "epoch": 2.5251001448904797, + "grad_norm": 73.92361365987398, + "learning_rate": 7.443756638480287e-07, + "loss": 1.385, + "step": 29627 + }, + { + "epoch": 2.525185374584505, + "grad_norm": 62.973938049528016, + "learning_rate": 7.441153828666226e-07, + "loss": 1.9506, + "step": 29628 + }, + { + "epoch": 2.5252706042785307, + "grad_norm": 25.738924515017086, + "learning_rate": 7.438551437401492e-07, + "loss": 0.7923, + "step": 29629 + }, + { + "epoch": 2.525355833972556, + "grad_norm": 21.38740597741396, + "learning_rate": 7.435949464711706e-07, + "loss": 0.9578, + "step": 29630 + }, + { + "epoch": 2.525441063666581, + "grad_norm": 56.84896535922057, + "learning_rate": 7.433347910622429e-07, + "loss": 1.5677, + "step": 29631 + }, + { + "epoch": 2.525526293360607, + "grad_norm": 71.26834984219661, + "learning_rate": 7.430746775159275e-07, + "loss": 1.9336, + "step": 29632 + }, + { + "epoch": 2.525611523054632, + "grad_norm": 60.73491845796999, + "learning_rate": 7.428146058347807e-07, + "loss": 0.822, + "step": 29633 + }, + { + "epoch": 2.5256967527486576, + "grad_norm": 83.12773977819062, + "learning_rate": 7.425545760213598e-07, + "loss": 2.0283, + "step": 29634 + }, + { + "epoch": 2.525781982442683, + "grad_norm": 47.19695177261683, + "learning_rate": 7.422945880782245e-07, + "loss": 1.2253, + "step": 29635 + }, + { + "epoch": 2.5258672121367085, + "grad_norm": 53.87083210305206, + "learning_rate": 7.420346420079294e-07, + "loss": 1.2446, + "step": 29636 + }, + { + "epoch": 2.525952441830734, + "grad_norm": 100.45158509057141, + "learning_rate": 7.417747378130319e-07, + "loss": 2.8196, + "step": 29637 + }, + { + "epoch": 2.526037671524759, + "grad_norm": 100.46300408004063, + "learning_rate": 7.415148754960866e-07, + "loss": 2.4789, + "step": 29638 + }, + { + "epoch": 2.5261229012187845, + "grad_norm": 61.48231501450948, + "learning_rate": 7.412550550596515e-07, + "loss": 1.7606, + "step": 29639 + }, + { + "epoch": 2.52620813091281, + "grad_norm": 49.09799724568176, + "learning_rate": 7.409952765062794e-07, + "loss": 1.4599, + "step": 29640 + }, + { + "epoch": 2.5262933606068354, + "grad_norm": 43.14017095804019, + "learning_rate": 7.40735539838528e-07, + "loss": 1.2095, + "step": 29641 + }, + { + "epoch": 2.526378590300861, + "grad_norm": 52.03701661827392, + "learning_rate": 7.404758450589488e-07, + "loss": 1.0526, + "step": 29642 + }, + { + "epoch": 2.5264638199948863, + "grad_norm": 74.57912714922082, + "learning_rate": 7.402161921700979e-07, + "loss": 2.1583, + "step": 29643 + }, + { + "epoch": 2.526549049688912, + "grad_norm": 82.04400816646843, + "learning_rate": 7.399565811745285e-07, + "loss": 2.0189, + "step": 29644 + }, + { + "epoch": 2.526634279382937, + "grad_norm": 27.38613911910208, + "learning_rate": 7.396970120747932e-07, + "loss": 0.9037, + "step": 29645 + }, + { + "epoch": 2.5267195090769623, + "grad_norm": 61.86939004491578, + "learning_rate": 7.394374848734437e-07, + "loss": 1.1211, + "step": 29646 + }, + { + "epoch": 2.5268047387709878, + "grad_norm": 86.9930786035617, + "learning_rate": 7.391779995730342e-07, + "loss": 1.9976, + "step": 29647 + }, + { + "epoch": 2.5268899684650132, + "grad_norm": 84.54973409955493, + "learning_rate": 7.389185561761153e-07, + "loss": 1.9537, + "step": 29648 + }, + { + "epoch": 2.5269751981590387, + "grad_norm": 34.35063859261907, + "learning_rate": 7.386591546852406e-07, + "loss": 1.2323, + "step": 29649 + }, + { + "epoch": 2.527060427853064, + "grad_norm": 42.73671994616451, + "learning_rate": 7.383997951029598e-07, + "loss": 1.1825, + "step": 29650 + }, + { + "epoch": 2.5271456575470896, + "grad_norm": 65.75538162256115, + "learning_rate": 7.381404774318224e-07, + "loss": 1.8729, + "step": 29651 + }, + { + "epoch": 2.5272308872411147, + "grad_norm": 68.25707524997496, + "learning_rate": 7.378812016743808e-07, + "loss": 1.4225, + "step": 29652 + }, + { + "epoch": 2.52731611693514, + "grad_norm": 74.43218735541376, + "learning_rate": 7.376219678331842e-07, + "loss": 2.0683, + "step": 29653 + }, + { + "epoch": 2.5274013466291656, + "grad_norm": 50.380612052567734, + "learning_rate": 7.373627759107804e-07, + "loss": 1.663, + "step": 29654 + }, + { + "epoch": 2.527486576323191, + "grad_norm": 64.48978846255982, + "learning_rate": 7.371036259097213e-07, + "loss": 1.5727, + "step": 29655 + }, + { + "epoch": 2.5275718060172165, + "grad_norm": 30.79039664106084, + "learning_rate": 7.368445178325528e-07, + "loss": 1.0796, + "step": 29656 + }, + { + "epoch": 2.5276570357112416, + "grad_norm": 28.661350841818848, + "learning_rate": 7.365854516818261e-07, + "loss": 0.8927, + "step": 29657 + }, + { + "epoch": 2.527742265405267, + "grad_norm": 62.18599645134396, + "learning_rate": 7.363264274600868e-07, + "loss": 1.8469, + "step": 29658 + }, + { + "epoch": 2.5278274950992925, + "grad_norm": 25.384082681159096, + "learning_rate": 7.360674451698818e-07, + "loss": 0.7601, + "step": 29659 + }, + { + "epoch": 2.527912724793318, + "grad_norm": 44.139766614625316, + "learning_rate": 7.358085048137603e-07, + "loss": 1.6772, + "step": 29660 + }, + { + "epoch": 2.5279979544873434, + "grad_norm": 21.653872908353605, + "learning_rate": 7.355496063942679e-07, + "loss": 1.1318, + "step": 29661 + }, + { + "epoch": 2.528083184181369, + "grad_norm": 85.36052714906236, + "learning_rate": 7.352907499139495e-07, + "loss": 1.7189, + "step": 29662 + }, + { + "epoch": 2.5281684138753944, + "grad_norm": 52.20346942639321, + "learning_rate": 7.350319353753532e-07, + "loss": 2.1183, + "step": 29663 + }, + { + "epoch": 2.5282536435694194, + "grad_norm": 26.914878327435915, + "learning_rate": 7.347731627810218e-07, + "loss": 0.7557, + "step": 29664 + }, + { + "epoch": 2.528338873263445, + "grad_norm": 28.488119090145037, + "learning_rate": 7.345144321335024e-07, + "loss": 0.8256, + "step": 29665 + }, + { + "epoch": 2.5284241029574703, + "grad_norm": 55.4591838638244, + "learning_rate": 7.342557434353387e-07, + "loss": 1.5523, + "step": 29666 + }, + { + "epoch": 2.528509332651496, + "grad_norm": 14.311241885577502, + "learning_rate": 7.339970966890753e-07, + "loss": 0.6292, + "step": 29667 + }, + { + "epoch": 2.5285945623455213, + "grad_norm": 59.70846929956092, + "learning_rate": 7.337384918972535e-07, + "loss": 1.2289, + "step": 29668 + }, + { + "epoch": 2.5286797920395467, + "grad_norm": 37.753601884975666, + "learning_rate": 7.334799290624195e-07, + "loss": 0.8849, + "step": 29669 + }, + { + "epoch": 2.528765021733572, + "grad_norm": 37.42683447031771, + "learning_rate": 7.332214081871142e-07, + "loss": 1.2699, + "step": 29670 + }, + { + "epoch": 2.5288502514275972, + "grad_norm": 34.890411965683306, + "learning_rate": 7.329629292738822e-07, + "loss": 1.216, + "step": 29671 + }, + { + "epoch": 2.5289354811216227, + "grad_norm": 74.77323927300262, + "learning_rate": 7.32704492325263e-07, + "loss": 1.3193, + "step": 29672 + }, + { + "epoch": 2.529020710815648, + "grad_norm": 53.324979735481556, + "learning_rate": 7.324460973438002e-07, + "loss": 1.3159, + "step": 29673 + }, + { + "epoch": 2.5291059405096736, + "grad_norm": 39.00991807690013, + "learning_rate": 7.321877443320347e-07, + "loss": 1.303, + "step": 29674 + }, + { + "epoch": 2.529191170203699, + "grad_norm": 46.94021591705477, + "learning_rate": 7.319294332925064e-07, + "loss": 1.5526, + "step": 29675 + }, + { + "epoch": 2.529276399897724, + "grad_norm": 70.17138737092145, + "learning_rate": 7.316711642277568e-07, + "loss": 1.6312, + "step": 29676 + }, + { + "epoch": 2.52936162959175, + "grad_norm": 55.15173665301356, + "learning_rate": 7.31412937140324e-07, + "loss": 1.6544, + "step": 29677 + }, + { + "epoch": 2.529446859285775, + "grad_norm": 51.00500423883499, + "learning_rate": 7.311547520327484e-07, + "loss": 1.3716, + "step": 29678 + }, + { + "epoch": 2.5295320889798005, + "grad_norm": 71.84840965898282, + "learning_rate": 7.308966089075709e-07, + "loss": 1.8512, + "step": 29679 + }, + { + "epoch": 2.529617318673826, + "grad_norm": 62.48357031246757, + "learning_rate": 7.306385077673289e-07, + "loss": 1.355, + "step": 29680 + }, + { + "epoch": 2.5297025483678515, + "grad_norm": 35.87457544959738, + "learning_rate": 7.303804486145594e-07, + "loss": 1.1685, + "step": 29681 + }, + { + "epoch": 2.529787778061877, + "grad_norm": 50.849629431303114, + "learning_rate": 7.30122431451803e-07, + "loss": 1.6278, + "step": 29682 + }, + { + "epoch": 2.529873007755902, + "grad_norm": 54.07490482002699, + "learning_rate": 7.298644562815954e-07, + "loss": 1.1442, + "step": 29683 + }, + { + "epoch": 2.5299582374499274, + "grad_norm": 64.02886159541515, + "learning_rate": 7.296065231064742e-07, + "loss": 1.5614, + "step": 29684 + }, + { + "epoch": 2.530043467143953, + "grad_norm": 54.942032642176905, + "learning_rate": 7.29348631928975e-07, + "loss": 1.8598, + "step": 29685 + }, + { + "epoch": 2.5301286968379784, + "grad_norm": 19.459199679635134, + "learning_rate": 7.290907827516363e-07, + "loss": 1.1989, + "step": 29686 + }, + { + "epoch": 2.530213926532004, + "grad_norm": 40.80557414752137, + "learning_rate": 7.288329755769913e-07, + "loss": 0.954, + "step": 29687 + }, + { + "epoch": 2.5302991562260293, + "grad_norm": 33.909222158816476, + "learning_rate": 7.285752104075777e-07, + "loss": 1.1861, + "step": 29688 + }, + { + "epoch": 2.5303843859200548, + "grad_norm": 77.40634030821086, + "learning_rate": 7.283174872459298e-07, + "loss": 2.0462, + "step": 29689 + }, + { + "epoch": 2.53046961561408, + "grad_norm": 30.269489875428157, + "learning_rate": 7.280598060945809e-07, + "loss": 0.7466, + "step": 29690 + }, + { + "epoch": 2.5305548453081053, + "grad_norm": 46.12496068029567, + "learning_rate": 7.278021669560675e-07, + "loss": 1.3677, + "step": 29691 + }, + { + "epoch": 2.5306400750021307, + "grad_norm": 44.35734794347094, + "learning_rate": 7.275445698329226e-07, + "loss": 1.4521, + "step": 29692 + }, + { + "epoch": 2.530725304696156, + "grad_norm": 28.943857447883552, + "learning_rate": 7.272870147276772e-07, + "loss": 1.6387, + "step": 29693 + }, + { + "epoch": 2.5308105343901817, + "grad_norm": 52.29825969152902, + "learning_rate": 7.27029501642868e-07, + "loss": 1.0575, + "step": 29694 + }, + { + "epoch": 2.5308957640842067, + "grad_norm": 33.85243340469802, + "learning_rate": 7.267720305810244e-07, + "loss": 0.9253, + "step": 29695 + }, + { + "epoch": 2.5309809937782326, + "grad_norm": 37.34023565609723, + "learning_rate": 7.265146015446806e-07, + "loss": 1.1135, + "step": 29696 + }, + { + "epoch": 2.5310662234722576, + "grad_norm": 56.936388556692954, + "learning_rate": 7.262572145363678e-07, + "loss": 1.3642, + "step": 29697 + }, + { + "epoch": 2.531151453166283, + "grad_norm": 111.44646639032719, + "learning_rate": 7.259998695586162e-07, + "loss": 1.6708, + "step": 29698 + }, + { + "epoch": 2.5312366828603086, + "grad_norm": 48.70890700002235, + "learning_rate": 7.257425666139584e-07, + "loss": 1.3068, + "step": 29699 + }, + { + "epoch": 2.531321912554334, + "grad_norm": 66.22051089321596, + "learning_rate": 7.254853057049244e-07, + "loss": 1.8317, + "step": 29700 + }, + { + "epoch": 2.5314071422483595, + "grad_norm": 47.23716636043955, + "learning_rate": 7.252280868340422e-07, + "loss": 1.0543, + "step": 29701 + }, + { + "epoch": 2.5314923719423845, + "grad_norm": 37.33122117841398, + "learning_rate": 7.249709100038449e-07, + "loss": 0.9989, + "step": 29702 + }, + { + "epoch": 2.53157760163641, + "grad_norm": 40.70248717616431, + "learning_rate": 7.247137752168587e-07, + "loss": 1.6013, + "step": 29703 + }, + { + "epoch": 2.5316628313304355, + "grad_norm": 82.01152349158379, + "learning_rate": 7.244566824756144e-07, + "loss": 2.4967, + "step": 29704 + }, + { + "epoch": 2.531748061024461, + "grad_norm": 64.2996020779817, + "learning_rate": 7.241996317826399e-07, + "loss": 1.9351, + "step": 29705 + }, + { + "epoch": 2.5318332907184864, + "grad_norm": 57.1255665658466, + "learning_rate": 7.239426231404628e-07, + "loss": 1.0784, + "step": 29706 + }, + { + "epoch": 2.531918520412512, + "grad_norm": 40.89285674239197, + "learning_rate": 7.236856565516104e-07, + "loss": 0.9, + "step": 29707 + }, + { + "epoch": 2.5320037501065373, + "grad_norm": 56.29877803038703, + "learning_rate": 7.234287320186107e-07, + "loss": 1.4132, + "step": 29708 + }, + { + "epoch": 2.5320889798005624, + "grad_norm": 80.38214256843808, + "learning_rate": 7.231718495439893e-07, + "loss": 2.0474, + "step": 29709 + }, + { + "epoch": 2.532174209494588, + "grad_norm": 42.409265494005076, + "learning_rate": 7.229150091302744e-07, + "loss": 1.2277, + "step": 29710 + }, + { + "epoch": 2.5322594391886133, + "grad_norm": 37.37830530281813, + "learning_rate": 7.226582107799901e-07, + "loss": 0.8094, + "step": 29711 + }, + { + "epoch": 2.5323446688826388, + "grad_norm": 44.71725963537663, + "learning_rate": 7.224014544956631e-07, + "loss": 1.5818, + "step": 29712 + }, + { + "epoch": 2.5324298985766642, + "grad_norm": 97.55255392628897, + "learning_rate": 7.221447402798182e-07, + "loss": 2.2307, + "step": 29713 + }, + { + "epoch": 2.5325151282706893, + "grad_norm": 61.13945559917874, + "learning_rate": 7.218880681349805e-07, + "loss": 1.9039, + "step": 29714 + }, + { + "epoch": 2.532600357964715, + "grad_norm": 62.05789905347575, + "learning_rate": 7.216314380636719e-07, + "loss": 1.306, + "step": 29715 + }, + { + "epoch": 2.53268558765874, + "grad_norm": 69.58114474781077, + "learning_rate": 7.213748500684192e-07, + "loss": 2.0069, + "step": 29716 + }, + { + "epoch": 2.5327708173527657, + "grad_norm": 22.73419730650472, + "learning_rate": 7.211183041517439e-07, + "loss": 0.5935, + "step": 29717 + }, + { + "epoch": 2.532856047046791, + "grad_norm": 75.19019411039501, + "learning_rate": 7.20861800316171e-07, + "loss": 1.2873, + "step": 29718 + }, + { + "epoch": 2.5329412767408166, + "grad_norm": 22.28249864134812, + "learning_rate": 7.206053385642215e-07, + "loss": 0.8924, + "step": 29719 + }, + { + "epoch": 2.533026506434842, + "grad_norm": 74.62080000157282, + "learning_rate": 7.203489188984175e-07, + "loss": 1.0984, + "step": 29720 + }, + { + "epoch": 2.533111736128867, + "grad_norm": 48.77888952572289, + "learning_rate": 7.200925413212817e-07, + "loss": 1.0624, + "step": 29721 + }, + { + "epoch": 2.5331969658228926, + "grad_norm": 55.16252867330963, + "learning_rate": 7.198362058353359e-07, + "loss": 1.3791, + "step": 29722 + }, + { + "epoch": 2.533282195516918, + "grad_norm": 45.8587682614335, + "learning_rate": 7.195799124430997e-07, + "loss": 1.1899, + "step": 29723 + }, + { + "epoch": 2.5333674252109435, + "grad_norm": 49.93014128938796, + "learning_rate": 7.193236611470933e-07, + "loss": 1.1051, + "step": 29724 + }, + { + "epoch": 2.533452654904969, + "grad_norm": 39.82456674185944, + "learning_rate": 7.190674519498376e-07, + "loss": 1.2111, + "step": 29725 + }, + { + "epoch": 2.5335378845989944, + "grad_norm": 67.80552729335618, + "learning_rate": 7.188112848538536e-07, + "loss": 1.5985, + "step": 29726 + }, + { + "epoch": 2.53362311429302, + "grad_norm": 27.56063571302765, + "learning_rate": 7.185551598616592e-07, + "loss": 1.4795, + "step": 29727 + }, + { + "epoch": 2.533708343987045, + "grad_norm": 25.2864912472287, + "learning_rate": 7.182990769757725e-07, + "loss": 0.8867, + "step": 29728 + }, + { + "epoch": 2.5337935736810704, + "grad_norm": 69.10246443425474, + "learning_rate": 7.180430361987146e-07, + "loss": 1.6166, + "step": 29729 + }, + { + "epoch": 2.533878803375096, + "grad_norm": 51.35303175470141, + "learning_rate": 7.177870375330015e-07, + "loss": 1.043, + "step": 29730 + }, + { + "epoch": 2.5339640330691213, + "grad_norm": 85.19358738071277, + "learning_rate": 7.175310809811514e-07, + "loss": 1.8833, + "step": 29731 + }, + { + "epoch": 2.534049262763147, + "grad_norm": 80.90156387107844, + "learning_rate": 7.1727516654568e-07, + "loss": 1.9195, + "step": 29732 + }, + { + "epoch": 2.534134492457172, + "grad_norm": 53.66786875328043, + "learning_rate": 7.170192942291071e-07, + "loss": 1.9869, + "step": 29733 + }, + { + "epoch": 2.5342197221511977, + "grad_norm": 87.7736599552676, + "learning_rate": 7.16763464033946e-07, + "loss": 2.2013, + "step": 29734 + }, + { + "epoch": 2.5343049518452228, + "grad_norm": 82.91353532051853, + "learning_rate": 7.16507675962716e-07, + "loss": 1.8621, + "step": 29735 + }, + { + "epoch": 2.534390181539248, + "grad_norm": 79.184244502476, + "learning_rate": 7.162519300179305e-07, + "loss": 1.9545, + "step": 29736 + }, + { + "epoch": 2.5344754112332737, + "grad_norm": 49.79922663639478, + "learning_rate": 7.159962262021042e-07, + "loss": 1.1195, + "step": 29737 + }, + { + "epoch": 2.534560640927299, + "grad_norm": 27.610509817969714, + "learning_rate": 7.157405645177534e-07, + "loss": 0.5775, + "step": 29738 + }, + { + "epoch": 2.5346458706213246, + "grad_norm": 73.4312552490314, + "learning_rate": 7.154849449673923e-07, + "loss": 1.3721, + "step": 29739 + }, + { + "epoch": 2.5347311003153497, + "grad_norm": 51.82668413379958, + "learning_rate": 7.152293675535327e-07, + "loss": 1.37, + "step": 29740 + }, + { + "epoch": 2.534816330009375, + "grad_norm": 63.164766238278226, + "learning_rate": 7.149738322786909e-07, + "loss": 1.8548, + "step": 29741 + }, + { + "epoch": 2.5349015597034006, + "grad_norm": 19.672766861913914, + "learning_rate": 7.147183391453777e-07, + "loss": 0.6933, + "step": 29742 + }, + { + "epoch": 2.534986789397426, + "grad_norm": 51.35888323301888, + "learning_rate": 7.144628881561078e-07, + "loss": 1.6732, + "step": 29743 + }, + { + "epoch": 2.5350720190914515, + "grad_norm": 63.153521219947656, + "learning_rate": 7.14207479313393e-07, + "loss": 1.3294, + "step": 29744 + }, + { + "epoch": 2.535157248785477, + "grad_norm": 51.333418970858524, + "learning_rate": 7.13952112619743e-07, + "loss": 1.3512, + "step": 29745 + }, + { + "epoch": 2.5352424784795025, + "grad_norm": 37.110132097406755, + "learning_rate": 7.136967880776718e-07, + "loss": 1.8203, + "step": 29746 + }, + { + "epoch": 2.5353277081735275, + "grad_norm": 31.301335658005137, + "learning_rate": 7.134415056896898e-07, + "loss": 1.0022, + "step": 29747 + }, + { + "epoch": 2.535412937867553, + "grad_norm": 79.76954610104126, + "learning_rate": 7.131862654583066e-07, + "loss": 2.1349, + "step": 29748 + }, + { + "epoch": 2.5354981675615784, + "grad_norm": 46.882266369117616, + "learning_rate": 7.129310673860335e-07, + "loss": 1.3256, + "step": 29749 + }, + { + "epoch": 2.535583397255604, + "grad_norm": 66.1663917508781, + "learning_rate": 7.12675911475379e-07, + "loss": 1.6577, + "step": 29750 + }, + { + "epoch": 2.5356686269496294, + "grad_norm": 57.69301984688034, + "learning_rate": 7.124207977288544e-07, + "loss": 1.2825, + "step": 29751 + }, + { + "epoch": 2.5357538566436544, + "grad_norm": 58.40269936071304, + "learning_rate": 7.121657261489678e-07, + "loss": 1.7456, + "step": 29752 + }, + { + "epoch": 2.5358390863376803, + "grad_norm": 44.81772768322126, + "learning_rate": 7.119106967382272e-07, + "loss": 1.3726, + "step": 29753 + }, + { + "epoch": 2.5359243160317053, + "grad_norm": 45.70182753246854, + "learning_rate": 7.116557094991394e-07, + "loss": 1.4541, + "step": 29754 + }, + { + "epoch": 2.536009545725731, + "grad_norm": 49.89026404105191, + "learning_rate": 7.114007644342152e-07, + "loss": 1.0256, + "step": 29755 + }, + { + "epoch": 2.5360947754197563, + "grad_norm": 37.856812751812356, + "learning_rate": 7.111458615459593e-07, + "loss": 0.8751, + "step": 29756 + }, + { + "epoch": 2.5361800051137817, + "grad_norm": 38.54818904210737, + "learning_rate": 7.108910008368807e-07, + "loss": 0.9284, + "step": 29757 + }, + { + "epoch": 2.536265234807807, + "grad_norm": 60.00453868067423, + "learning_rate": 7.106361823094837e-07, + "loss": 2.0293, + "step": 29758 + }, + { + "epoch": 2.536350464501832, + "grad_norm": 39.28084251537038, + "learning_rate": 7.103814059662767e-07, + "loss": 1.3389, + "step": 29759 + }, + { + "epoch": 2.5364356941958577, + "grad_norm": 67.87103777774774, + "learning_rate": 7.101266718097638e-07, + "loss": 1.9669, + "step": 29760 + }, + { + "epoch": 2.536520923889883, + "grad_norm": 79.45720896839607, + "learning_rate": 7.098719798424508e-07, + "loss": 2.2237, + "step": 29761 + }, + { + "epoch": 2.5366061535839086, + "grad_norm": 53.8203667939791, + "learning_rate": 7.096173300668418e-07, + "loss": 1.0336, + "step": 29762 + }, + { + "epoch": 2.536691383277934, + "grad_norm": 62.754996544615494, + "learning_rate": 7.093627224854404e-07, + "loss": 1.7139, + "step": 29763 + }, + { + "epoch": 2.5367766129719596, + "grad_norm": 82.08746704932871, + "learning_rate": 7.091081571007519e-07, + "loss": 1.7495, + "step": 29764 + }, + { + "epoch": 2.536861842665985, + "grad_norm": 47.229250974622495, + "learning_rate": 7.088536339152808e-07, + "loss": 1.9622, + "step": 29765 + }, + { + "epoch": 2.53694707236001, + "grad_norm": 79.77166010322, + "learning_rate": 7.085991529315289e-07, + "loss": 1.7176, + "step": 29766 + }, + { + "epoch": 2.5370323020540355, + "grad_norm": 42.68458858915785, + "learning_rate": 7.083447141519978e-07, + "loss": 0.9994, + "step": 29767 + }, + { + "epoch": 2.537117531748061, + "grad_norm": 61.09017312045509, + "learning_rate": 7.080903175791925e-07, + "loss": 1.5789, + "step": 29768 + }, + { + "epoch": 2.5372027614420865, + "grad_norm": 43.13867350734973, + "learning_rate": 7.078359632156134e-07, + "loss": 0.9279, + "step": 29769 + }, + { + "epoch": 2.537287991136112, + "grad_norm": 31.342688789686317, + "learning_rate": 7.075816510637624e-07, + "loss": 0.7208, + "step": 29770 + }, + { + "epoch": 2.537373220830137, + "grad_norm": 76.39485734317243, + "learning_rate": 7.073273811261383e-07, + "loss": 1.9597, + "step": 29771 + }, + { + "epoch": 2.537458450524163, + "grad_norm": 61.87281724468719, + "learning_rate": 7.070731534052444e-07, + "loss": 1.6565, + "step": 29772 + }, + { + "epoch": 2.537543680218188, + "grad_norm": 33.6266186280725, + "learning_rate": 7.068189679035808e-07, + "loss": 0.7467, + "step": 29773 + }, + { + "epoch": 2.5376289099122133, + "grad_norm": 34.72775662653183, + "learning_rate": 7.065648246236473e-07, + "loss": 1.2855, + "step": 29774 + }, + { + "epoch": 2.537714139606239, + "grad_norm": 53.127978300222686, + "learning_rate": 7.063107235679412e-07, + "loss": 1.4019, + "step": 29775 + }, + { + "epoch": 2.5377993693002643, + "grad_norm": 69.34079761949417, + "learning_rate": 7.060566647389639e-07, + "loss": 1.5439, + "step": 29776 + }, + { + "epoch": 2.5378845989942898, + "grad_norm": 81.26138090224487, + "learning_rate": 7.058026481392133e-07, + "loss": 2.0827, + "step": 29777 + }, + { + "epoch": 2.5379698286883148, + "grad_norm": 45.37652744970138, + "learning_rate": 7.055486737711875e-07, + "loss": 1.349, + "step": 29778 + }, + { + "epoch": 2.5380550583823402, + "grad_norm": 35.79374713534325, + "learning_rate": 7.052947416373829e-07, + "loss": 1.0396, + "step": 29779 + }, + { + "epoch": 2.5381402880763657, + "grad_norm": 31.43058807873048, + "learning_rate": 7.050408517402974e-07, + "loss": 1.021, + "step": 29780 + }, + { + "epoch": 2.538225517770391, + "grad_norm": 45.842517709238365, + "learning_rate": 7.047870040824301e-07, + "loss": 1.104, + "step": 29781 + }, + { + "epoch": 2.5383107474644167, + "grad_norm": 92.05307517985521, + "learning_rate": 7.045331986662757e-07, + "loss": 2.3778, + "step": 29782 + }, + { + "epoch": 2.538395977158442, + "grad_norm": 36.34568947805243, + "learning_rate": 7.042794354943306e-07, + "loss": 1.1193, + "step": 29783 + }, + { + "epoch": 2.5384812068524676, + "grad_norm": 34.179841982369155, + "learning_rate": 7.040257145690893e-07, + "loss": 1.0392, + "step": 29784 + }, + { + "epoch": 2.5385664365464926, + "grad_norm": 37.90305443404069, + "learning_rate": 7.037720358930489e-07, + "loss": 0.906, + "step": 29785 + }, + { + "epoch": 2.538651666240518, + "grad_norm": 30.784689351077215, + "learning_rate": 7.035183994687034e-07, + "loss": 1.3474, + "step": 29786 + }, + { + "epoch": 2.5387368959345435, + "grad_norm": 72.26929467135533, + "learning_rate": 7.032648052985463e-07, + "loss": 2.2876, + "step": 29787 + }, + { + "epoch": 2.538822125628569, + "grad_norm": 53.57630242517745, + "learning_rate": 7.030112533850731e-07, + "loss": 1.5481, + "step": 29788 + }, + { + "epoch": 2.5389073553225945, + "grad_norm": 35.878400328175054, + "learning_rate": 7.027577437307759e-07, + "loss": 1.5852, + "step": 29789 + }, + { + "epoch": 2.53899258501662, + "grad_norm": 33.00832113724427, + "learning_rate": 7.025042763381501e-07, + "loss": 1.3376, + "step": 29790 + }, + { + "epoch": 2.5390778147106454, + "grad_norm": 48.340469580830316, + "learning_rate": 7.022508512096871e-07, + "loss": 1.6604, + "step": 29791 + }, + { + "epoch": 2.5391630444046704, + "grad_norm": 18.492919891895674, + "learning_rate": 7.019974683478786e-07, + "loss": 0.3636, + "step": 29792 + }, + { + "epoch": 2.539248274098696, + "grad_norm": 42.17003917263618, + "learning_rate": 7.017441277552168e-07, + "loss": 1.2068, + "step": 29793 + }, + { + "epoch": 2.5393335037927214, + "grad_norm": 34.66085602501223, + "learning_rate": 7.014908294341943e-07, + "loss": 1.3664, + "step": 29794 + }, + { + "epoch": 2.539418733486747, + "grad_norm": 41.28592838234536, + "learning_rate": 7.012375733873001e-07, + "loss": 0.8677, + "step": 29795 + }, + { + "epoch": 2.5395039631807723, + "grad_norm": 50.289747986798965, + "learning_rate": 7.009843596170273e-07, + "loss": 1.6431, + "step": 29796 + }, + { + "epoch": 2.5395891928747973, + "grad_norm": 40.79271743239734, + "learning_rate": 7.007311881258644e-07, + "loss": 1.0318, + "step": 29797 + }, + { + "epoch": 2.5396744225688233, + "grad_norm": 41.20590814939009, + "learning_rate": 7.004780589163029e-07, + "loss": 1.2103, + "step": 29798 + }, + { + "epoch": 2.5397596522628483, + "grad_norm": 29.200833810383138, + "learning_rate": 7.00224971990831e-07, + "loss": 0.7715, + "step": 29799 + }, + { + "epoch": 2.5398448819568737, + "grad_norm": 80.1204452739108, + "learning_rate": 6.999719273519378e-07, + "loss": 1.8448, + "step": 29800 + }, + { + "epoch": 2.539930111650899, + "grad_norm": 48.26501901396883, + "learning_rate": 6.997189250021113e-07, + "loss": 1.7747, + "step": 29801 + }, + { + "epoch": 2.5400153413449247, + "grad_norm": 20.748236459523955, + "learning_rate": 6.994659649438417e-07, + "loss": 0.9759, + "step": 29802 + }, + { + "epoch": 2.54010057103895, + "grad_norm": 53.96855733074839, + "learning_rate": 6.992130471796138e-07, + "loss": 1.3639, + "step": 29803 + }, + { + "epoch": 2.540185800732975, + "grad_norm": 81.80501918991457, + "learning_rate": 6.989601717119182e-07, + "loss": 1.8282, + "step": 29804 + }, + { + "epoch": 2.5402710304270006, + "grad_norm": 40.174626742079155, + "learning_rate": 6.987073385432386e-07, + "loss": 0.9666, + "step": 29805 + }, + { + "epoch": 2.540356260121026, + "grad_norm": 36.31593719898761, + "learning_rate": 6.984545476760646e-07, + "loss": 0.8344, + "step": 29806 + }, + { + "epoch": 2.5404414898150516, + "grad_norm": 57.49232403300783, + "learning_rate": 6.982017991128809e-07, + "loss": 1.8085, + "step": 29807 + }, + { + "epoch": 2.540526719509077, + "grad_norm": 54.334931062667465, + "learning_rate": 6.979490928561733e-07, + "loss": 1.2959, + "step": 29808 + }, + { + "epoch": 2.5406119492031025, + "grad_norm": 17.6296717670402, + "learning_rate": 6.976964289084265e-07, + "loss": 0.9101, + "step": 29809 + }, + { + "epoch": 2.540697178897128, + "grad_norm": 28.869145306418382, + "learning_rate": 6.974438072721251e-07, + "loss": 0.8208, + "step": 29810 + }, + { + "epoch": 2.540782408591153, + "grad_norm": 17.94381834036151, + "learning_rate": 6.971912279497539e-07, + "loss": 0.6741, + "step": 29811 + }, + { + "epoch": 2.5408676382851785, + "grad_norm": 30.518922738537047, + "learning_rate": 6.969386909437981e-07, + "loss": 1.0683, + "step": 29812 + }, + { + "epoch": 2.540952867979204, + "grad_norm": 34.15768470105871, + "learning_rate": 6.966861962567406e-07, + "loss": 0.9306, + "step": 29813 + }, + { + "epoch": 2.5410380976732294, + "grad_norm": 45.03317553499192, + "learning_rate": 6.964337438910634e-07, + "loss": 1.0793, + "step": 29814 + }, + { + "epoch": 2.541123327367255, + "grad_norm": 49.14020570243911, + "learning_rate": 6.961813338492513e-07, + "loss": 1.2331, + "step": 29815 + }, + { + "epoch": 2.54120855706128, + "grad_norm": 65.90731124352274, + "learning_rate": 6.959289661337859e-07, + "loss": 1.5693, + "step": 29816 + }, + { + "epoch": 2.541293786755306, + "grad_norm": 65.9525357594119, + "learning_rate": 6.956766407471482e-07, + "loss": 1.9647, + "step": 29817 + }, + { + "epoch": 2.541379016449331, + "grad_norm": 27.957339023463618, + "learning_rate": 6.954243576918196e-07, + "loss": 1.0066, + "step": 29818 + }, + { + "epoch": 2.5414642461433563, + "grad_norm": 54.127291933510456, + "learning_rate": 6.951721169702818e-07, + "loss": 1.153, + "step": 29819 + }, + { + "epoch": 2.5415494758373818, + "grad_norm": 41.98198396954162, + "learning_rate": 6.949199185850175e-07, + "loss": 1.328, + "step": 29820 + }, + { + "epoch": 2.5416347055314072, + "grad_norm": 34.32564216677524, + "learning_rate": 6.946677625385045e-07, + "loss": 1.1527, + "step": 29821 + }, + { + "epoch": 2.5417199352254327, + "grad_norm": 61.986987043053084, + "learning_rate": 6.944156488332232e-07, + "loss": 1.6703, + "step": 29822 + }, + { + "epoch": 2.5418051649194577, + "grad_norm": 64.3568525030658, + "learning_rate": 6.941635774716521e-07, + "loss": 0.9687, + "step": 29823 + }, + { + "epoch": 2.541890394613483, + "grad_norm": 64.65818022535225, + "learning_rate": 6.939115484562725e-07, + "loss": 1.6646, + "step": 29824 + }, + { + "epoch": 2.5419756243075087, + "grad_norm": 46.14751538154877, + "learning_rate": 6.93659561789562e-07, + "loss": 1.2293, + "step": 29825 + }, + { + "epoch": 2.542060854001534, + "grad_norm": 31.84435253840344, + "learning_rate": 6.934076174739968e-07, + "loss": 1.021, + "step": 29826 + }, + { + "epoch": 2.5421460836955596, + "grad_norm": 30.06642973246974, + "learning_rate": 6.931557155120566e-07, + "loss": 1.8295, + "step": 29827 + }, + { + "epoch": 2.542231313389585, + "grad_norm": 36.19794199652312, + "learning_rate": 6.929038559062201e-07, + "loss": 0.6533, + "step": 29828 + }, + { + "epoch": 2.5423165430836105, + "grad_norm": 41.16730508300399, + "learning_rate": 6.926520386589624e-07, + "loss": 1.6754, + "step": 29829 + }, + { + "epoch": 2.5424017727776356, + "grad_norm": 40.04858187503566, + "learning_rate": 6.9240026377276e-07, + "loss": 0.9084, + "step": 29830 + }, + { + "epoch": 2.542487002471661, + "grad_norm": 60.37509305683833, + "learning_rate": 6.921485312500887e-07, + "loss": 1.9471, + "step": 29831 + }, + { + "epoch": 2.5425722321656865, + "grad_norm": 54.72936253987577, + "learning_rate": 6.918968410934257e-07, + "loss": 1.6361, + "step": 29832 + }, + { + "epoch": 2.542657461859712, + "grad_norm": 62.99346051584632, + "learning_rate": 6.916451933052454e-07, + "loss": 1.5214, + "step": 29833 + }, + { + "epoch": 2.5427426915537374, + "grad_norm": 81.98511104749043, + "learning_rate": 6.913935878880218e-07, + "loss": 1.9503, + "step": 29834 + }, + { + "epoch": 2.5428279212477625, + "grad_norm": 19.30902314344247, + "learning_rate": 6.911420248442313e-07, + "loss": 0.6604, + "step": 29835 + }, + { + "epoch": 2.5429131509417884, + "grad_norm": 28.65004062451309, + "learning_rate": 6.908905041763453e-07, + "loss": 1.0492, + "step": 29836 + }, + { + "epoch": 2.5429983806358134, + "grad_norm": 41.812072651894795, + "learning_rate": 6.906390258868401e-07, + "loss": 1.1805, + "step": 29837 + }, + { + "epoch": 2.543083610329839, + "grad_norm": 48.91100274897301, + "learning_rate": 6.903875899781881e-07, + "loss": 1.7966, + "step": 29838 + }, + { + "epoch": 2.5431688400238643, + "grad_norm": 22.64161981165598, + "learning_rate": 6.901361964528614e-07, + "loss": 0.6364, + "step": 29839 + }, + { + "epoch": 2.54325406971789, + "grad_norm": 76.47879798421407, + "learning_rate": 6.898848453133317e-07, + "loss": 1.7065, + "step": 29840 + }, + { + "epoch": 2.5433392994119153, + "grad_norm": 52.28424354765661, + "learning_rate": 6.896335365620727e-07, + "loss": 0.984, + "step": 29841 + }, + { + "epoch": 2.5434245291059403, + "grad_norm": 51.923601293776265, + "learning_rate": 6.89382270201554e-07, + "loss": 1.36, + "step": 29842 + }, + { + "epoch": 2.5435097587999658, + "grad_norm": 71.43327899971864, + "learning_rate": 6.891310462342494e-07, + "loss": 2.1457, + "step": 29843 + }, + { + "epoch": 2.5435949884939912, + "grad_norm": 32.3226471167223, + "learning_rate": 6.888798646626266e-07, + "loss": 0.8552, + "step": 29844 + }, + { + "epoch": 2.5436802181880167, + "grad_norm": 14.766529543735178, + "learning_rate": 6.886287254891582e-07, + "loss": 0.9769, + "step": 29845 + }, + { + "epoch": 2.543765447882042, + "grad_norm": 76.00848219256872, + "learning_rate": 6.88377628716313e-07, + "loss": 2.1566, + "step": 29846 + }, + { + "epoch": 2.5438506775760676, + "grad_norm": 34.598621494941376, + "learning_rate": 6.88126574346561e-07, + "loss": 1.727, + "step": 29847 + }, + { + "epoch": 2.543935907270093, + "grad_norm": 37.19520697102085, + "learning_rate": 6.878755623823691e-07, + "loss": 0.9217, + "step": 29848 + }, + { + "epoch": 2.544021136964118, + "grad_norm": 24.804126460825653, + "learning_rate": 6.876245928262093e-07, + "loss": 0.9371, + "step": 29849 + }, + { + "epoch": 2.5441063666581436, + "grad_norm": 60.70207044330208, + "learning_rate": 6.873736656805463e-07, + "loss": 1.7845, + "step": 29850 + }, + { + "epoch": 2.544191596352169, + "grad_norm": 21.420999499681372, + "learning_rate": 6.871227809478515e-07, + "loss": 0.2789, + "step": 29851 + }, + { + "epoch": 2.5442768260461945, + "grad_norm": 73.40164216229087, + "learning_rate": 6.8687193863059e-07, + "loss": 0.6875, + "step": 29852 + }, + { + "epoch": 2.54436205574022, + "grad_norm": 35.66167520298878, + "learning_rate": 6.866211387312277e-07, + "loss": 0.7541, + "step": 29853 + }, + { + "epoch": 2.544447285434245, + "grad_norm": 42.520682724103054, + "learning_rate": 6.863703812522337e-07, + "loss": 0.9758, + "step": 29854 + }, + { + "epoch": 2.544532515128271, + "grad_norm": 34.846133003073646, + "learning_rate": 6.861196661960734e-07, + "loss": 1.0466, + "step": 29855 + }, + { + "epoch": 2.544617744822296, + "grad_norm": 77.01957392809413, + "learning_rate": 6.858689935652119e-07, + "loss": 1.9583, + "step": 29856 + }, + { + "epoch": 2.5447029745163214, + "grad_norm": 23.6885865275971, + "learning_rate": 6.856183633621133e-07, + "loss": 0.6943, + "step": 29857 + }, + { + "epoch": 2.544788204210347, + "grad_norm": 48.45731782044016, + "learning_rate": 6.853677755892435e-07, + "loss": 0.9311, + "step": 29858 + }, + { + "epoch": 2.5448734339043724, + "grad_norm": 41.69968463559032, + "learning_rate": 6.851172302490688e-07, + "loss": 1.4516, + "step": 29859 + }, + { + "epoch": 2.544958663598398, + "grad_norm": 67.74704919411204, + "learning_rate": 6.848667273440512e-07, + "loss": 1.5726, + "step": 29860 + }, + { + "epoch": 2.545043893292423, + "grad_norm": 48.908541539068345, + "learning_rate": 6.846162668766537e-07, + "loss": 1.6186, + "step": 29861 + }, + { + "epoch": 2.5451291229864483, + "grad_norm": 38.906949521469166, + "learning_rate": 6.843658488493415e-07, + "loss": 1.0048, + "step": 29862 + }, + { + "epoch": 2.545214352680474, + "grad_norm": 44.839796484350245, + "learning_rate": 6.841154732645766e-07, + "loss": 0.8965, + "step": 29863 + }, + { + "epoch": 2.5452995823744993, + "grad_norm": 16.02721291271945, + "learning_rate": 6.838651401248209e-07, + "loss": 0.4899, + "step": 29864 + }, + { + "epoch": 2.5453848120685247, + "grad_norm": 61.125664737154494, + "learning_rate": 6.836148494325351e-07, + "loss": 1.4628, + "step": 29865 + }, + { + "epoch": 2.54547004176255, + "grad_norm": 47.48828766510925, + "learning_rate": 6.833646011901823e-07, + "loss": 1.3874, + "step": 29866 + }, + { + "epoch": 2.5455552714565757, + "grad_norm": 47.98434625387518, + "learning_rate": 6.831143954002245e-07, + "loss": 1.3532, + "step": 29867 + }, + { + "epoch": 2.5456405011506007, + "grad_norm": 33.605017623461315, + "learning_rate": 6.828642320651213e-07, + "loss": 1.3201, + "step": 29868 + }, + { + "epoch": 2.545725730844626, + "grad_norm": 32.15926308078134, + "learning_rate": 6.826141111873324e-07, + "loss": 0.9264, + "step": 29869 + }, + { + "epoch": 2.5458109605386516, + "grad_norm": 25.162405783386564, + "learning_rate": 6.823640327693177e-07, + "loss": 0.5769, + "step": 29870 + }, + { + "epoch": 2.545896190232677, + "grad_norm": 60.05187671601422, + "learning_rate": 6.821139968135381e-07, + "loss": 1.3871, + "step": 29871 + }, + { + "epoch": 2.5459814199267026, + "grad_norm": 77.85183122656348, + "learning_rate": 6.818640033224516e-07, + "loss": 1.8961, + "step": 29872 + }, + { + "epoch": 2.5460666496207276, + "grad_norm": 28.19836464169457, + "learning_rate": 6.816140522985154e-07, + "loss": 0.8475, + "step": 29873 + }, + { + "epoch": 2.5461518793147535, + "grad_norm": 59.008771846507805, + "learning_rate": 6.813641437441893e-07, + "loss": 2.2226, + "step": 29874 + }, + { + "epoch": 2.5462371090087785, + "grad_norm": 42.473929109901434, + "learning_rate": 6.811142776619317e-07, + "loss": 0.9763, + "step": 29875 + }, + { + "epoch": 2.546322338702804, + "grad_norm": 34.25537247523068, + "learning_rate": 6.808644540541986e-07, + "loss": 1.0028, + "step": 29876 + }, + { + "epoch": 2.5464075683968295, + "grad_norm": 45.21584653238448, + "learning_rate": 6.806146729234481e-07, + "loss": 1.6138, + "step": 29877 + }, + { + "epoch": 2.546492798090855, + "grad_norm": 29.459463065641252, + "learning_rate": 6.803649342721347e-07, + "loss": 0.6946, + "step": 29878 + }, + { + "epoch": 2.5465780277848804, + "grad_norm": 23.431965981744153, + "learning_rate": 6.801152381027165e-07, + "loss": 0.7587, + "step": 29879 + }, + { + "epoch": 2.5466632574789054, + "grad_norm": 59.69005647919813, + "learning_rate": 6.798655844176488e-07, + "loss": 1.1251, + "step": 29880 + }, + { + "epoch": 2.546748487172931, + "grad_norm": 59.02190512503505, + "learning_rate": 6.796159732193847e-07, + "loss": 1.6593, + "step": 29881 + }, + { + "epoch": 2.5468337168669564, + "grad_norm": 32.564114752347855, + "learning_rate": 6.793664045103826e-07, + "loss": 1.1241, + "step": 29882 + }, + { + "epoch": 2.546918946560982, + "grad_norm": 51.92527234019422, + "learning_rate": 6.791168782930934e-07, + "loss": 1.5854, + "step": 29883 + }, + { + "epoch": 2.5470041762550073, + "grad_norm": 32.35968413985729, + "learning_rate": 6.788673945699741e-07, + "loss": 1.0895, + "step": 29884 + }, + { + "epoch": 2.5470894059490328, + "grad_norm": 24.04042012538102, + "learning_rate": 6.786179533434767e-07, + "loss": 0.8297, + "step": 29885 + }, + { + "epoch": 2.5471746356430582, + "grad_norm": 66.22407117960539, + "learning_rate": 6.783685546160546e-07, + "loss": 1.5396, + "step": 29886 + }, + { + "epoch": 2.5472598653370833, + "grad_norm": 131.2407947603448, + "learning_rate": 6.781191983901597e-07, + "loss": 2.5052, + "step": 29887 + }, + { + "epoch": 2.5473450950311087, + "grad_norm": 39.95808347039918, + "learning_rate": 6.778698846682463e-07, + "loss": 1.1039, + "step": 29888 + }, + { + "epoch": 2.547430324725134, + "grad_norm": 62.544402701711405, + "learning_rate": 6.776206134527636e-07, + "loss": 1.8477, + "step": 29889 + }, + { + "epoch": 2.5475155544191597, + "grad_norm": 60.56642372342819, + "learning_rate": 6.773713847461661e-07, + "loss": 1.3795, + "step": 29890 + }, + { + "epoch": 2.547600784113185, + "grad_norm": 56.00699737610172, + "learning_rate": 6.771221985509019e-07, + "loss": 1.6617, + "step": 29891 + }, + { + "epoch": 2.54768601380721, + "grad_norm": 39.44629709449368, + "learning_rate": 6.768730548694247e-07, + "loss": 1.2138, + "step": 29892 + }, + { + "epoch": 2.547771243501236, + "grad_norm": 66.78000754609785, + "learning_rate": 6.76623953704183e-07, + "loss": 1.407, + "step": 29893 + }, + { + "epoch": 2.547856473195261, + "grad_norm": 53.40320687795644, + "learning_rate": 6.763748950576265e-07, + "loss": 1.159, + "step": 29894 + }, + { + "epoch": 2.5479417028892866, + "grad_norm": 55.20609624029278, + "learning_rate": 6.76125878932205e-07, + "loss": 1.576, + "step": 29895 + }, + { + "epoch": 2.548026932583312, + "grad_norm": 42.93051078485816, + "learning_rate": 6.758769053303665e-07, + "loss": 1.945, + "step": 29896 + }, + { + "epoch": 2.5481121622773375, + "grad_norm": 24.338966117354868, + "learning_rate": 6.756279742545602e-07, + "loss": 0.8035, + "step": 29897 + }, + { + "epoch": 2.548197391971363, + "grad_norm": 89.58437615300315, + "learning_rate": 6.753790857072356e-07, + "loss": 2.5583, + "step": 29898 + }, + { + "epoch": 2.548282621665388, + "grad_norm": 21.71376667273601, + "learning_rate": 6.751302396908393e-07, + "loss": 0.6244, + "step": 29899 + }, + { + "epoch": 2.5483678513594135, + "grad_norm": 54.54034789214383, + "learning_rate": 6.748814362078171e-07, + "loss": 1.4428, + "step": 29900 + }, + { + "epoch": 2.548453081053439, + "grad_norm": 62.81306138808272, + "learning_rate": 6.746326752606186e-07, + "loss": 1.3114, + "step": 29901 + }, + { + "epoch": 2.5485383107474644, + "grad_norm": 54.0488938319173, + "learning_rate": 6.743839568516891e-07, + "loss": 1.7235, + "step": 29902 + }, + { + "epoch": 2.54862354044149, + "grad_norm": 51.984865871690914, + "learning_rate": 6.74135280983475e-07, + "loss": 1.5335, + "step": 29903 + }, + { + "epoch": 2.5487087701355153, + "grad_norm": 60.29265702739318, + "learning_rate": 6.7388664765842e-07, + "loss": 1.7979, + "step": 29904 + }, + { + "epoch": 2.548793999829541, + "grad_norm": 11.801386156870013, + "learning_rate": 6.736380568789702e-07, + "loss": 0.5536, + "step": 29905 + }, + { + "epoch": 2.548879229523566, + "grad_norm": 30.189920478113464, + "learning_rate": 6.733895086475728e-07, + "loss": 1.066, + "step": 29906 + }, + { + "epoch": 2.5489644592175913, + "grad_norm": 67.97750874030773, + "learning_rate": 6.731410029666701e-07, + "loss": 1.3184, + "step": 29907 + }, + { + "epoch": 2.5490496889116168, + "grad_norm": 66.14132251913539, + "learning_rate": 6.728925398387065e-07, + "loss": 2.1795, + "step": 29908 + }, + { + "epoch": 2.5491349186056422, + "grad_norm": 34.30874564089261, + "learning_rate": 6.726441192661243e-07, + "loss": 0.9619, + "step": 29909 + }, + { + "epoch": 2.5492201482996677, + "grad_norm": 50.22248062705943, + "learning_rate": 6.723957412513688e-07, + "loss": 1.3331, + "step": 29910 + }, + { + "epoch": 2.549305377993693, + "grad_norm": 74.09075901306389, + "learning_rate": 6.721474057968813e-07, + "loss": 1.6922, + "step": 29911 + }, + { + "epoch": 2.5493906076877186, + "grad_norm": 81.89769675118531, + "learning_rate": 6.718991129051028e-07, + "loss": 1.5367, + "step": 29912 + }, + { + "epoch": 2.5494758373817437, + "grad_norm": 33.02932564787826, + "learning_rate": 6.716508625784774e-07, + "loss": 0.8116, + "step": 29913 + }, + { + "epoch": 2.549561067075769, + "grad_norm": 74.44803611054958, + "learning_rate": 6.714026548194469e-07, + "loss": 2.0353, + "step": 29914 + }, + { + "epoch": 2.5496462967697946, + "grad_norm": 79.20751348192312, + "learning_rate": 6.711544896304512e-07, + "loss": 1.5719, + "step": 29915 + }, + { + "epoch": 2.54973152646382, + "grad_norm": 46.5835271968684, + "learning_rate": 6.70906367013931e-07, + "loss": 1.2333, + "step": 29916 + }, + { + "epoch": 2.5498167561578455, + "grad_norm": 48.01614707136369, + "learning_rate": 6.70658286972325e-07, + "loss": 1.7376, + "step": 29917 + }, + { + "epoch": 2.5499019858518706, + "grad_norm": 56.500006240186835, + "learning_rate": 6.704102495080761e-07, + "loss": 1.8695, + "step": 29918 + }, + { + "epoch": 2.549987215545896, + "grad_norm": 79.059646438388, + "learning_rate": 6.701622546236214e-07, + "loss": 2.1591, + "step": 29919 + }, + { + "epoch": 2.5500724452399215, + "grad_norm": 61.743745947100216, + "learning_rate": 6.699143023214e-07, + "loss": 1.2777, + "step": 29920 + }, + { + "epoch": 2.550157674933947, + "grad_norm": 25.24228767859842, + "learning_rate": 6.696663926038499e-07, + "loss": 0.602, + "step": 29921 + }, + { + "epoch": 2.5502429046279724, + "grad_norm": 44.166954245145035, + "learning_rate": 6.69418525473412e-07, + "loss": 1.1399, + "step": 29922 + }, + { + "epoch": 2.550328134321998, + "grad_norm": 53.87153226056755, + "learning_rate": 6.691707009325221e-07, + "loss": 1.5106, + "step": 29923 + }, + { + "epoch": 2.5504133640160234, + "grad_norm": 45.531771179603446, + "learning_rate": 6.689229189836172e-07, + "loss": 1.1505, + "step": 29924 + }, + { + "epoch": 2.5504985937100484, + "grad_norm": 45.07731182578906, + "learning_rate": 6.68675179629134e-07, + "loss": 1.5917, + "step": 29925 + }, + { + "epoch": 2.550583823404074, + "grad_norm": 31.138458224007646, + "learning_rate": 6.684274828715087e-07, + "loss": 1.1919, + "step": 29926 + }, + { + "epoch": 2.5506690530980993, + "grad_norm": 28.628691787759777, + "learning_rate": 6.681798287131791e-07, + "loss": 1.2581, + "step": 29927 + }, + { + "epoch": 2.550754282792125, + "grad_norm": 74.01834223759609, + "learning_rate": 6.679322171565783e-07, + "loss": 2.061, + "step": 29928 + }, + { + "epoch": 2.5508395124861503, + "grad_norm": 50.81709480215474, + "learning_rate": 6.676846482041438e-07, + "loss": 1.0725, + "step": 29929 + }, + { + "epoch": 2.5509247421801757, + "grad_norm": 39.38603822967781, + "learning_rate": 6.674371218583087e-07, + "loss": 1.1826, + "step": 29930 + }, + { + "epoch": 2.551009971874201, + "grad_norm": 39.513874667981376, + "learning_rate": 6.671896381215082e-07, + "loss": 0.7161, + "step": 29931 + }, + { + "epoch": 2.5510952015682262, + "grad_norm": 58.685157761680045, + "learning_rate": 6.669421969961765e-07, + "loss": 1.8661, + "step": 29932 + }, + { + "epoch": 2.5511804312622517, + "grad_norm": 50.016003697759444, + "learning_rate": 6.66694798484746e-07, + "loss": 1.0946, + "step": 29933 + }, + { + "epoch": 2.551265660956277, + "grad_norm": 30.810031750769877, + "learning_rate": 6.664474425896494e-07, + "loss": 1.125, + "step": 29934 + }, + { + "epoch": 2.5513508906503026, + "grad_norm": 53.20088250436436, + "learning_rate": 6.66200129313322e-07, + "loss": 1.2013, + "step": 29935 + }, + { + "epoch": 2.551436120344328, + "grad_norm": 84.14968901244396, + "learning_rate": 6.659528586581921e-07, + "loss": 1.8611, + "step": 29936 + }, + { + "epoch": 2.551521350038353, + "grad_norm": 42.07127353286881, + "learning_rate": 6.657056306266951e-07, + "loss": 1.3586, + "step": 29937 + }, + { + "epoch": 2.551606579732379, + "grad_norm": 50.974243148543295, + "learning_rate": 6.654584452212615e-07, + "loss": 1.6003, + "step": 29938 + }, + { + "epoch": 2.551691809426404, + "grad_norm": 42.778424272388484, + "learning_rate": 6.652113024443196e-07, + "loss": 0.8088, + "step": 29939 + }, + { + "epoch": 2.5517770391204295, + "grad_norm": 50.08766869970093, + "learning_rate": 6.649642022983039e-07, + "loss": 1.2982, + "step": 29940 + }, + { + "epoch": 2.551862268814455, + "grad_norm": 50.122235777482445, + "learning_rate": 6.647171447856426e-07, + "loss": 1.208, + "step": 29941 + }, + { + "epoch": 2.5519474985084805, + "grad_norm": 33.92850506548636, + "learning_rate": 6.644701299087652e-07, + "loss": 1.2263, + "step": 29942 + }, + { + "epoch": 2.552032728202506, + "grad_norm": 26.207653288986815, + "learning_rate": 6.642231576701009e-07, + "loss": 0.8317, + "step": 29943 + }, + { + "epoch": 2.552117957896531, + "grad_norm": 43.25318005427614, + "learning_rate": 6.639762280720779e-07, + "loss": 1.6457, + "step": 29944 + }, + { + "epoch": 2.5522031875905564, + "grad_norm": 31.73748754554364, + "learning_rate": 6.637293411171275e-07, + "loss": 0.9045, + "step": 29945 + }, + { + "epoch": 2.552288417284582, + "grad_norm": 37.55376894387639, + "learning_rate": 6.634824968076759e-07, + "loss": 1.1715, + "step": 29946 + }, + { + "epoch": 2.5523736469786074, + "grad_norm": 49.32240147796917, + "learning_rate": 6.6323569514615e-07, + "loss": 1.2117, + "step": 29947 + }, + { + "epoch": 2.552458876672633, + "grad_norm": 73.37747542655936, + "learning_rate": 6.629889361349783e-07, + "loss": 2.1891, + "step": 29948 + }, + { + "epoch": 2.5525441063666583, + "grad_norm": 54.93713458427048, + "learning_rate": 6.627422197765876e-07, + "loss": 1.6075, + "step": 29949 + }, + { + "epoch": 2.5526293360606838, + "grad_norm": 72.34958842303558, + "learning_rate": 6.624955460734034e-07, + "loss": 2.1408, + "step": 29950 + }, + { + "epoch": 2.552714565754709, + "grad_norm": 39.654069712851545, + "learning_rate": 6.622489150278505e-07, + "loss": 1.1209, + "step": 29951 + }, + { + "epoch": 2.5527997954487343, + "grad_norm": 49.02721362788351, + "learning_rate": 6.620023266423564e-07, + "loss": 1.5546, + "step": 29952 + }, + { + "epoch": 2.5528850251427597, + "grad_norm": 29.672047358866692, + "learning_rate": 6.617557809193464e-07, + "loss": 0.7066, + "step": 29953 + }, + { + "epoch": 2.552970254836785, + "grad_norm": 84.55608027379365, + "learning_rate": 6.615092778612447e-07, + "loss": 2.0058, + "step": 29954 + }, + { + "epoch": 2.5530554845308107, + "grad_norm": 89.87022572565998, + "learning_rate": 6.612628174704755e-07, + "loss": 1.9825, + "step": 29955 + }, + { + "epoch": 2.5531407142248357, + "grad_norm": 62.691935642813306, + "learning_rate": 6.610163997494612e-07, + "loss": 2.0463, + "step": 29956 + }, + { + "epoch": 2.5532259439188616, + "grad_norm": 28.466981277533314, + "learning_rate": 6.607700247006271e-07, + "loss": 0.6374, + "step": 29957 + }, + { + "epoch": 2.5533111736128866, + "grad_norm": 64.22214063660076, + "learning_rate": 6.605236923263964e-07, + "loss": 1.6411, + "step": 29958 + }, + { + "epoch": 2.553396403306912, + "grad_norm": 50.31507382461743, + "learning_rate": 6.602774026291891e-07, + "loss": 1.691, + "step": 29959 + }, + { + "epoch": 2.5534816330009376, + "grad_norm": 47.035667402770336, + "learning_rate": 6.600311556114291e-07, + "loss": 1.1299, + "step": 29960 + }, + { + "epoch": 2.553566862694963, + "grad_norm": 25.878344504736067, + "learning_rate": 6.597849512755395e-07, + "loss": 0.9312, + "step": 29961 + }, + { + "epoch": 2.5536520923889885, + "grad_norm": 54.44912351500297, + "learning_rate": 6.595387896239402e-07, + "loss": 1.6639, + "step": 29962 + }, + { + "epoch": 2.5537373220830135, + "grad_norm": 18.245146131777968, + "learning_rate": 6.592926706590518e-07, + "loss": 0.726, + "step": 29963 + }, + { + "epoch": 2.553822551777039, + "grad_norm": 70.63006569637363, + "learning_rate": 6.590465943832947e-07, + "loss": 2.076, + "step": 29964 + }, + { + "epoch": 2.5539077814710645, + "grad_norm": 25.88809010251946, + "learning_rate": 6.588005607990905e-07, + "loss": 0.9226, + "step": 29965 + }, + { + "epoch": 2.55399301116509, + "grad_norm": 59.06986028467505, + "learning_rate": 6.585545699088574e-07, + "loss": 1.4519, + "step": 29966 + }, + { + "epoch": 2.5540782408591154, + "grad_norm": 54.872405172823946, + "learning_rate": 6.583086217150142e-07, + "loss": 1.6731, + "step": 29967 + }, + { + "epoch": 2.554163470553141, + "grad_norm": 28.037118839077678, + "learning_rate": 6.58062716219981e-07, + "loss": 0.9662, + "step": 29968 + }, + { + "epoch": 2.5542487002471663, + "grad_norm": 43.1987146177407, + "learning_rate": 6.578168534261753e-07, + "loss": 1.0899, + "step": 29969 + }, + { + "epoch": 2.5543339299411914, + "grad_norm": 61.171630064553405, + "learning_rate": 6.575710333360158e-07, + "loss": 1.8596, + "step": 29970 + }, + { + "epoch": 2.554419159635217, + "grad_norm": 58.536950678244146, + "learning_rate": 6.573252559519199e-07, + "loss": 1.5174, + "step": 29971 + }, + { + "epoch": 2.5545043893292423, + "grad_norm": 60.87563229371598, + "learning_rate": 6.570795212763049e-07, + "loss": 1.4208, + "step": 29972 + }, + { + "epoch": 2.5545896190232678, + "grad_norm": 55.02704937528124, + "learning_rate": 6.568338293115855e-07, + "loss": 1.1107, + "step": 29973 + }, + { + "epoch": 2.5546748487172932, + "grad_norm": 65.91624472516192, + "learning_rate": 6.565881800601815e-07, + "loss": 2.2241, + "step": 29974 + }, + { + "epoch": 2.5547600784113182, + "grad_norm": 28.08843323755336, + "learning_rate": 6.56342573524505e-07, + "loss": 0.9579, + "step": 29975 + }, + { + "epoch": 2.554845308105344, + "grad_norm": 43.13723329276867, + "learning_rate": 6.560970097069747e-07, + "loss": 0.9404, + "step": 29976 + }, + { + "epoch": 2.554930537799369, + "grad_norm": 31.569759189438077, + "learning_rate": 6.558514886100026e-07, + "loss": 1.2575, + "step": 29977 + }, + { + "epoch": 2.5550157674933947, + "grad_norm": 20.77722359527875, + "learning_rate": 6.556060102360068e-07, + "loss": 1.1637, + "step": 29978 + }, + { + "epoch": 2.55510099718742, + "grad_norm": 64.12644705755962, + "learning_rate": 6.553605745873992e-07, + "loss": 1.4134, + "step": 29979 + }, + { + "epoch": 2.5551862268814456, + "grad_norm": 50.94626765976398, + "learning_rate": 6.551151816665941e-07, + "loss": 1.3387, + "step": 29980 + }, + { + "epoch": 2.555271456575471, + "grad_norm": 65.52227903594064, + "learning_rate": 6.548698314760032e-07, + "loss": 1.7578, + "step": 29981 + }, + { + "epoch": 2.555356686269496, + "grad_norm": 64.8552938414516, + "learning_rate": 6.546245240180421e-07, + "loss": 1.024, + "step": 29982 + }, + { + "epoch": 2.5554419159635215, + "grad_norm": 31.06023838759901, + "learning_rate": 6.543792592951215e-07, + "loss": 1.106, + "step": 29983 + }, + { + "epoch": 2.555527145657547, + "grad_norm": 37.0246102859005, + "learning_rate": 6.541340373096549e-07, + "loss": 0.9324, + "step": 29984 + }, + { + "epoch": 2.5556123753515725, + "grad_norm": 48.758863748289706, + "learning_rate": 6.538888580640529e-07, + "loss": 1.719, + "step": 29985 + }, + { + "epoch": 2.555697605045598, + "grad_norm": 93.5965749871595, + "learning_rate": 6.53643721560726e-07, + "loss": 2.1511, + "step": 29986 + }, + { + "epoch": 2.5557828347396234, + "grad_norm": 67.06267912463001, + "learning_rate": 6.533986278020876e-07, + "loss": 1.5335, + "step": 29987 + }, + { + "epoch": 2.555868064433649, + "grad_norm": 82.34744223115835, + "learning_rate": 6.531535767905461e-07, + "loss": 2.2316, + "step": 29988 + }, + { + "epoch": 2.555953294127674, + "grad_norm": 53.006757696910825, + "learning_rate": 6.529085685285119e-07, + "loss": 1.3851, + "step": 29989 + }, + { + "epoch": 2.5560385238216994, + "grad_norm": 35.814233483992204, + "learning_rate": 6.526636030183935e-07, + "loss": 1.473, + "step": 29990 + }, + { + "epoch": 2.556123753515725, + "grad_norm": 91.64861052224938, + "learning_rate": 6.524186802626015e-07, + "loss": 1.9311, + "step": 29991 + }, + { + "epoch": 2.5562089832097503, + "grad_norm": 34.238216467648336, + "learning_rate": 6.521738002635452e-07, + "loss": 1.2039, + "step": 29992 + }, + { + "epoch": 2.556294212903776, + "grad_norm": 55.388158100048464, + "learning_rate": 6.519289630236314e-07, + "loss": 1.8141, + "step": 29993 + }, + { + "epoch": 2.556379442597801, + "grad_norm": 72.39118414063775, + "learning_rate": 6.516841685452679e-07, + "loss": 1.5681, + "step": 29994 + }, + { + "epoch": 2.5564646722918267, + "grad_norm": 57.346089792094425, + "learning_rate": 6.514394168308636e-07, + "loss": 1.3619, + "step": 29995 + }, + { + "epoch": 2.5565499019858517, + "grad_norm": 74.26896932778745, + "learning_rate": 6.511947078828251e-07, + "loss": 1.8167, + "step": 29996 + }, + { + "epoch": 2.556635131679877, + "grad_norm": 56.764132131547335, + "learning_rate": 6.50950041703558e-07, + "loss": 1.643, + "step": 29997 + }, + { + "epoch": 2.5567203613739027, + "grad_norm": 64.07542447128328, + "learning_rate": 6.507054182954686e-07, + "loss": 1.9042, + "step": 29998 + }, + { + "epoch": 2.556805591067928, + "grad_norm": 41.926146946344254, + "learning_rate": 6.504608376609628e-07, + "loss": 1.252, + "step": 29999 + }, + { + "epoch": 2.5568908207619536, + "grad_norm": 36.36797680444191, + "learning_rate": 6.502162998024475e-07, + "loss": 1.4124, + "step": 30000 + }, + { + "epoch": 2.5569760504559786, + "grad_norm": 45.68440920529821, + "learning_rate": 6.499718047223263e-07, + "loss": 1.4305, + "step": 30001 + }, + { + "epoch": 2.557061280150004, + "grad_norm": 44.702500059744914, + "learning_rate": 6.497273524230041e-07, + "loss": 0.9785, + "step": 30002 + }, + { + "epoch": 2.5571465098440296, + "grad_norm": 34.2475082629352, + "learning_rate": 6.494829429068833e-07, + "loss": 1.217, + "step": 30003 + }, + { + "epoch": 2.557231739538055, + "grad_norm": 49.624088552735245, + "learning_rate": 6.492385761763703e-07, + "loss": 1.3335, + "step": 30004 + }, + { + "epoch": 2.5573169692320805, + "grad_norm": 34.14197123936827, + "learning_rate": 6.489942522338671e-07, + "loss": 1.2037, + "step": 30005 + }, + { + "epoch": 2.557402198926106, + "grad_norm": 84.93173912520007, + "learning_rate": 6.487499710817758e-07, + "loss": 1.7151, + "step": 30006 + }, + { + "epoch": 2.5574874286201315, + "grad_norm": 50.498540368298364, + "learning_rate": 6.485057327224992e-07, + "loss": 1.6286, + "step": 30007 + }, + { + "epoch": 2.5575726583141565, + "grad_norm": 64.6314766893719, + "learning_rate": 6.482615371584405e-07, + "loss": 2.0746, + "step": 30008 + }, + { + "epoch": 2.557657888008182, + "grad_norm": 57.79241041904845, + "learning_rate": 6.480173843920006e-07, + "loss": 1.266, + "step": 30009 + }, + { + "epoch": 2.5577431177022074, + "grad_norm": 73.66523040895729, + "learning_rate": 6.477732744255805e-07, + "loss": 1.5269, + "step": 30010 + }, + { + "epoch": 2.557828347396233, + "grad_norm": 36.03464531721554, + "learning_rate": 6.475292072615807e-07, + "loss": 1.0144, + "step": 30011 + }, + { + "epoch": 2.5579135770902584, + "grad_norm": 86.25843395535291, + "learning_rate": 6.472851829024007e-07, + "loss": 2.785, + "step": 30012 + }, + { + "epoch": 2.5579988067842834, + "grad_norm": 56.70671592049852, + "learning_rate": 6.470412013504424e-07, + "loss": 1.8257, + "step": 30013 + }, + { + "epoch": 2.5580840364783093, + "grad_norm": 67.7734713681686, + "learning_rate": 6.467972626081027e-07, + "loss": 1.7106, + "step": 30014 + }, + { + "epoch": 2.5581692661723343, + "grad_norm": 54.019318502779235, + "learning_rate": 6.465533666777835e-07, + "loss": 1.5504, + "step": 30015 + }, + { + "epoch": 2.55825449586636, + "grad_norm": 49.105053669377796, + "learning_rate": 6.463095135618807e-07, + "loss": 1.4407, + "step": 30016 + }, + { + "epoch": 2.5583397255603852, + "grad_norm": 63.636360837718904, + "learning_rate": 6.460657032627954e-07, + "loss": 1.1919, + "step": 30017 + }, + { + "epoch": 2.5584249552544107, + "grad_norm": 83.81447460183, + "learning_rate": 6.458219357829237e-07, + "loss": 1.8771, + "step": 30018 + }, + { + "epoch": 2.558510184948436, + "grad_norm": 47.351956780126194, + "learning_rate": 6.455782111246628e-07, + "loss": 1.5485, + "step": 30019 + }, + { + "epoch": 2.558595414642461, + "grad_norm": 30.456364911041497, + "learning_rate": 6.453345292904095e-07, + "loss": 0.7498, + "step": 30020 + }, + { + "epoch": 2.5586806443364867, + "grad_norm": 23.957249534380228, + "learning_rate": 6.450908902825609e-07, + "loss": 0.8328, + "step": 30021 + }, + { + "epoch": 2.558765874030512, + "grad_norm": 33.03267467854529, + "learning_rate": 6.448472941035128e-07, + "loss": 1.009, + "step": 30022 + }, + { + "epoch": 2.5588511037245376, + "grad_norm": 54.15369861393761, + "learning_rate": 6.446037407556615e-07, + "loss": 1.5714, + "step": 30023 + }, + { + "epoch": 2.558936333418563, + "grad_norm": 24.596781888599633, + "learning_rate": 6.443602302414003e-07, + "loss": 0.8574, + "step": 30024 + }, + { + "epoch": 2.5590215631125885, + "grad_norm": 54.45293216094235, + "learning_rate": 6.441167625631272e-07, + "loss": 1.9697, + "step": 30025 + }, + { + "epoch": 2.559106792806614, + "grad_norm": 41.849909278732845, + "learning_rate": 6.438733377232348e-07, + "loss": 1.775, + "step": 30026 + }, + { + "epoch": 2.559192022500639, + "grad_norm": 77.08824033593329, + "learning_rate": 6.436299557241171e-07, + "loss": 2.0621, + "step": 30027 + }, + { + "epoch": 2.5592772521946645, + "grad_norm": 47.31899222115416, + "learning_rate": 6.433866165681674e-07, + "loss": 1.3178, + "step": 30028 + }, + { + "epoch": 2.55936248188869, + "grad_norm": 28.147151544248246, + "learning_rate": 6.431433202577785e-07, + "loss": 0.8798, + "step": 30029 + }, + { + "epoch": 2.5594477115827154, + "grad_norm": 57.306888159777394, + "learning_rate": 6.429000667953439e-07, + "loss": 1.3608, + "step": 30030 + }, + { + "epoch": 2.559532941276741, + "grad_norm": 32.51534661344955, + "learning_rate": 6.426568561832563e-07, + "loss": 0.9159, + "step": 30031 + }, + { + "epoch": 2.559618170970766, + "grad_norm": 51.10624877470203, + "learning_rate": 6.424136884239074e-07, + "loss": 1.6038, + "step": 30032 + }, + { + "epoch": 2.559703400664792, + "grad_norm": 76.15621901219356, + "learning_rate": 6.421705635196873e-07, + "loss": 2.0651, + "step": 30033 + }, + { + "epoch": 2.559788630358817, + "grad_norm": 113.40537064905367, + "learning_rate": 6.419274814729892e-07, + "loss": 2.5375, + "step": 30034 + }, + { + "epoch": 2.5598738600528423, + "grad_norm": 50.69952095354804, + "learning_rate": 6.416844422862028e-07, + "loss": 1.9253, + "step": 30035 + }, + { + "epoch": 2.559959089746868, + "grad_norm": 67.914154060028, + "learning_rate": 6.414414459617174e-07, + "loss": 1.5764, + "step": 30036 + }, + { + "epoch": 2.5600443194408933, + "grad_norm": 48.36934625514777, + "learning_rate": 6.411984925019233e-07, + "loss": 1.5332, + "step": 30037 + }, + { + "epoch": 2.5601295491349187, + "grad_norm": 55.53465811697366, + "learning_rate": 6.409555819092095e-07, + "loss": 1.4214, + "step": 30038 + }, + { + "epoch": 2.5602147788289438, + "grad_norm": 33.332148478958736, + "learning_rate": 6.407127141859665e-07, + "loss": 1.3013, + "step": 30039 + }, + { + "epoch": 2.5603000085229692, + "grad_norm": 29.966296270620806, + "learning_rate": 6.404698893345818e-07, + "loss": 1.0046, + "step": 30040 + }, + { + "epoch": 2.5603852382169947, + "grad_norm": 40.465431709165664, + "learning_rate": 6.402271073574435e-07, + "loss": 1.9396, + "step": 30041 + }, + { + "epoch": 2.56047046791102, + "grad_norm": 30.34140135996598, + "learning_rate": 6.399843682569379e-07, + "loss": 1.1205, + "step": 30042 + }, + { + "epoch": 2.5605556976050456, + "grad_norm": 35.42602520381708, + "learning_rate": 6.397416720354543e-07, + "loss": 1.1529, + "step": 30043 + }, + { + "epoch": 2.560640927299071, + "grad_norm": 68.7283398352072, + "learning_rate": 6.394990186953792e-07, + "loss": 2.025, + "step": 30044 + }, + { + "epoch": 2.5607261569930966, + "grad_norm": 59.726351246936325, + "learning_rate": 6.392564082390978e-07, + "loss": 1.2806, + "step": 30045 + }, + { + "epoch": 2.5608113866871216, + "grad_norm": 77.13168448461934, + "learning_rate": 6.390138406689966e-07, + "loss": 1.9751, + "step": 30046 + }, + { + "epoch": 2.560896616381147, + "grad_norm": 43.291409333897676, + "learning_rate": 6.38771315987462e-07, + "loss": 0.9625, + "step": 30047 + }, + { + "epoch": 2.5609818460751725, + "grad_norm": 76.37575478836149, + "learning_rate": 6.385288341968792e-07, + "loss": 2.3537, + "step": 30048 + }, + { + "epoch": 2.561067075769198, + "grad_norm": 46.315523163606954, + "learning_rate": 6.382863952996321e-07, + "loss": 1.3696, + "step": 30049 + }, + { + "epoch": 2.5611523054632235, + "grad_norm": 44.93946630435602, + "learning_rate": 6.380439992981035e-07, + "loss": 0.9815, + "step": 30050 + }, + { + "epoch": 2.561237535157249, + "grad_norm": 56.960981225808034, + "learning_rate": 6.378016461946807e-07, + "loss": 1.6156, + "step": 30051 + }, + { + "epoch": 2.5613227648512744, + "grad_norm": 66.53969418979385, + "learning_rate": 6.375593359917448e-07, + "loss": 1.1553, + "step": 30052 + }, + { + "epoch": 2.5614079945452994, + "grad_norm": 43.11574205094807, + "learning_rate": 6.373170686916785e-07, + "loss": 1.6067, + "step": 30053 + }, + { + "epoch": 2.561493224239325, + "grad_norm": 54.66734593718969, + "learning_rate": 6.370748442968655e-07, + "loss": 1.7384, + "step": 30054 + }, + { + "epoch": 2.5615784539333504, + "grad_norm": 68.48209542411155, + "learning_rate": 6.368326628096882e-07, + "loss": 2.0536, + "step": 30055 + }, + { + "epoch": 2.561663683627376, + "grad_norm": 60.94200321555999, + "learning_rate": 6.365905242325288e-07, + "loss": 1.3067, + "step": 30056 + }, + { + "epoch": 2.5617489133214013, + "grad_norm": 54.59799418961427, + "learning_rate": 6.363484285677668e-07, + "loss": 1.1465, + "step": 30057 + }, + { + "epoch": 2.5618341430154263, + "grad_norm": 73.94686190098355, + "learning_rate": 6.361063758177849e-07, + "loss": 1.137, + "step": 30058 + }, + { + "epoch": 2.5619193727094522, + "grad_norm": 55.67674425766606, + "learning_rate": 6.358643659849612e-07, + "loss": 1.6835, + "step": 30059 + }, + { + "epoch": 2.5620046024034773, + "grad_norm": 41.162396474563074, + "learning_rate": 6.356223990716786e-07, + "loss": 1.2765, + "step": 30060 + }, + { + "epoch": 2.5620898320975027, + "grad_norm": 49.353812268682425, + "learning_rate": 6.353804750803139e-07, + "loss": 1.2055, + "step": 30061 + }, + { + "epoch": 2.562175061791528, + "grad_norm": 57.31102431966766, + "learning_rate": 6.351385940132498e-07, + "loss": 1.7721, + "step": 30062 + }, + { + "epoch": 2.5622602914855537, + "grad_norm": 63.41846368041826, + "learning_rate": 6.348967558728613e-07, + "loss": 1.6961, + "step": 30063 + }, + { + "epoch": 2.562345521179579, + "grad_norm": 33.11526674561025, + "learning_rate": 6.346549606615304e-07, + "loss": 1.2118, + "step": 30064 + }, + { + "epoch": 2.562430750873604, + "grad_norm": 100.1364576288714, + "learning_rate": 6.344132083816329e-07, + "loss": 2.5272, + "step": 30065 + }, + { + "epoch": 2.5625159805676296, + "grad_norm": 44.800670367282514, + "learning_rate": 6.341714990355469e-07, + "loss": 1.1637, + "step": 30066 + }, + { + "epoch": 2.562601210261655, + "grad_norm": 59.295609388353874, + "learning_rate": 6.339298326256488e-07, + "loss": 1.5149, + "step": 30067 + }, + { + "epoch": 2.5626864399556806, + "grad_norm": 39.238633103104725, + "learning_rate": 6.336882091543161e-07, + "loss": 1.0775, + "step": 30068 + }, + { + "epoch": 2.562771669649706, + "grad_norm": 61.88898220171857, + "learning_rate": 6.334466286239244e-07, + "loss": 1.7354, + "step": 30069 + }, + { + "epoch": 2.5628568993437315, + "grad_norm": 26.449460826056665, + "learning_rate": 6.332050910368509e-07, + "loss": 0.7215, + "step": 30070 + }, + { + "epoch": 2.562942129037757, + "grad_norm": 53.567610433668875, + "learning_rate": 6.3296359639547e-07, + "loss": 1.1317, + "step": 30071 + }, + { + "epoch": 2.563027358731782, + "grad_norm": 55.89985106292825, + "learning_rate": 6.327221447021564e-07, + "loss": 1.4913, + "step": 30072 + }, + { + "epoch": 2.5631125884258075, + "grad_norm": 48.946867733014706, + "learning_rate": 6.324807359592855e-07, + "loss": 0.8833, + "step": 30073 + }, + { + "epoch": 2.563197818119833, + "grad_norm": 39.176196189523026, + "learning_rate": 6.322393701692314e-07, + "loss": 1.2394, + "step": 30074 + }, + { + "epoch": 2.5632830478138584, + "grad_norm": 65.47755385287887, + "learning_rate": 6.319980473343673e-07, + "loss": 1.2639, + "step": 30075 + }, + { + "epoch": 2.563368277507884, + "grad_norm": 42.16618403605781, + "learning_rate": 6.317567674570657e-07, + "loss": 1.2854, + "step": 30076 + }, + { + "epoch": 2.563453507201909, + "grad_norm": 24.1274336826353, + "learning_rate": 6.315155305397008e-07, + "loss": 0.7953, + "step": 30077 + }, + { + "epoch": 2.563538736895935, + "grad_norm": 49.0453581330531, + "learning_rate": 6.312743365846457e-07, + "loss": 1.4199, + "step": 30078 + }, + { + "epoch": 2.56362396658996, + "grad_norm": 77.978981500764, + "learning_rate": 6.310331855942714e-07, + "loss": 2.3245, + "step": 30079 + }, + { + "epoch": 2.5637091962839853, + "grad_norm": 63.43510097612737, + "learning_rate": 6.307920775709486e-07, + "loss": 2.0428, + "step": 30080 + }, + { + "epoch": 2.5637944259780108, + "grad_norm": 17.160410775382385, + "learning_rate": 6.305510125170505e-07, + "loss": 0.6878, + "step": 30081 + }, + { + "epoch": 2.5638796556720362, + "grad_norm": 69.68471679362153, + "learning_rate": 6.303099904349474e-07, + "loss": 1.4702, + "step": 30082 + }, + { + "epoch": 2.5639648853660617, + "grad_norm": 29.772467730804753, + "learning_rate": 6.300690113270092e-07, + "loss": 0.7403, + "step": 30083 + }, + { + "epoch": 2.5640501150600867, + "grad_norm": 78.06131627548004, + "learning_rate": 6.298280751956043e-07, + "loss": 1.3863, + "step": 30084 + }, + { + "epoch": 2.564135344754112, + "grad_norm": 55.36473764594901, + "learning_rate": 6.295871820431038e-07, + "loss": 1.5368, + "step": 30085 + }, + { + "epoch": 2.5642205744481377, + "grad_norm": 24.92985097038259, + "learning_rate": 6.293463318718784e-07, + "loss": 0.7635, + "step": 30086 + }, + { + "epoch": 2.564305804142163, + "grad_norm": 63.93715600207353, + "learning_rate": 6.291055246842948e-07, + "loss": 1.5686, + "step": 30087 + }, + { + "epoch": 2.5643910338361886, + "grad_norm": 74.71025410515723, + "learning_rate": 6.288647604827214e-07, + "loss": 1.8419, + "step": 30088 + }, + { + "epoch": 2.564476263530214, + "grad_norm": 59.03029565202335, + "learning_rate": 6.286240392695248e-07, + "loss": 1.057, + "step": 30089 + }, + { + "epoch": 2.5645614932242395, + "grad_norm": 59.06179737547135, + "learning_rate": 6.283833610470752e-07, + "loss": 1.0431, + "step": 30090 + }, + { + "epoch": 2.5646467229182646, + "grad_norm": 42.15855323156999, + "learning_rate": 6.28142725817738e-07, + "loss": 1.1619, + "step": 30091 + }, + { + "epoch": 2.56473195261229, + "grad_norm": 65.1568876068099, + "learning_rate": 6.279021335838791e-07, + "loss": 1.5893, + "step": 30092 + }, + { + "epoch": 2.5648171823063155, + "grad_norm": 49.00620748090092, + "learning_rate": 6.27661584347865e-07, + "loss": 1.1572, + "step": 30093 + }, + { + "epoch": 2.564902412000341, + "grad_norm": 37.77424236949471, + "learning_rate": 6.274210781120632e-07, + "loss": 0.9188, + "step": 30094 + }, + { + "epoch": 2.5649876416943664, + "grad_norm": 72.15559116109412, + "learning_rate": 6.271806148788373e-07, + "loss": 1.7351, + "step": 30095 + }, + { + "epoch": 2.5650728713883915, + "grad_norm": 27.22395341363662, + "learning_rate": 6.269401946505526e-07, + "loss": 0.5014, + "step": 30096 + }, + { + "epoch": 2.5651581010824174, + "grad_norm": 77.04683942561925, + "learning_rate": 6.266998174295724e-07, + "loss": 1.9285, + "step": 30097 + }, + { + "epoch": 2.5652433307764424, + "grad_norm": 37.24760034382816, + "learning_rate": 6.26459483218263e-07, + "loss": 1.1177, + "step": 30098 + }, + { + "epoch": 2.565328560470468, + "grad_norm": 41.91918247267426, + "learning_rate": 6.262191920189864e-07, + "loss": 1.6663, + "step": 30099 + }, + { + "epoch": 2.5654137901644933, + "grad_norm": 50.71482670324701, + "learning_rate": 6.259789438341052e-07, + "loss": 1.379, + "step": 30100 + }, + { + "epoch": 2.565499019858519, + "grad_norm": 56.207190218219004, + "learning_rate": 6.257387386659841e-07, + "loss": 1.2009, + "step": 30101 + }, + { + "epoch": 2.5655842495525443, + "grad_norm": 41.071283260454464, + "learning_rate": 6.25498576516983e-07, + "loss": 1.2897, + "step": 30102 + }, + { + "epoch": 2.5656694792465693, + "grad_norm": 40.35279046887474, + "learning_rate": 6.252584573894665e-07, + "loss": 1.2162, + "step": 30103 + }, + { + "epoch": 2.5657547089405948, + "grad_norm": 45.15232835620155, + "learning_rate": 6.250183812857947e-07, + "loss": 1.5625, + "step": 30104 + }, + { + "epoch": 2.5658399386346202, + "grad_norm": 57.428321317749635, + "learning_rate": 6.247783482083287e-07, + "loss": 1.6583, + "step": 30105 + }, + { + "epoch": 2.5659251683286457, + "grad_norm": 50.46587033030575, + "learning_rate": 6.245383581594277e-07, + "loss": 1.2994, + "step": 30106 + }, + { + "epoch": 2.566010398022671, + "grad_norm": 117.04691042614827, + "learning_rate": 6.242984111414547e-07, + "loss": 1.5839, + "step": 30107 + }, + { + "epoch": 2.5660956277166966, + "grad_norm": 47.769276413775096, + "learning_rate": 6.240585071567668e-07, + "loss": 0.8951, + "step": 30108 + }, + { + "epoch": 2.566180857410722, + "grad_norm": 31.433225666785848, + "learning_rate": 6.238186462077256e-07, + "loss": 1.3964, + "step": 30109 + }, + { + "epoch": 2.566266087104747, + "grad_norm": 55.12616262897749, + "learning_rate": 6.235788282966887e-07, + "loss": 1.612, + "step": 30110 + }, + { + "epoch": 2.5663513167987726, + "grad_norm": 63.013456264218505, + "learning_rate": 6.233390534260154e-07, + "loss": 1.5824, + "step": 30111 + }, + { + "epoch": 2.566436546492798, + "grad_norm": 26.425889243435464, + "learning_rate": 6.230993215980635e-07, + "loss": 0.6755, + "step": 30112 + }, + { + "epoch": 2.5665217761868235, + "grad_norm": 51.105211541465124, + "learning_rate": 6.228596328151909e-07, + "loss": 1.7909, + "step": 30113 + }, + { + "epoch": 2.566607005880849, + "grad_norm": 53.39623537277118, + "learning_rate": 6.226199870797539e-07, + "loss": 1.4373, + "step": 30114 + }, + { + "epoch": 2.566692235574874, + "grad_norm": 37.77343342018881, + "learning_rate": 6.223803843941096e-07, + "loss": 0.9773, + "step": 30115 + }, + { + "epoch": 2.5667774652689, + "grad_norm": 39.51652426297833, + "learning_rate": 6.221408247606142e-07, + "loss": 0.8809, + "step": 30116 + }, + { + "epoch": 2.566862694962925, + "grad_norm": 34.445572337260444, + "learning_rate": 6.219013081816256e-07, + "loss": 0.8715, + "step": 30117 + }, + { + "epoch": 2.5669479246569504, + "grad_norm": 40.83622315598211, + "learning_rate": 6.216618346594977e-07, + "loss": 1.58, + "step": 30118 + }, + { + "epoch": 2.567033154350976, + "grad_norm": 24.8716871700377, + "learning_rate": 6.21422404196585e-07, + "loss": 0.8034, + "step": 30119 + }, + { + "epoch": 2.5671183840450014, + "grad_norm": 48.82306090984157, + "learning_rate": 6.211830167952438e-07, + "loss": 1.209, + "step": 30120 + }, + { + "epoch": 2.567203613739027, + "grad_norm": 39.23965956489246, + "learning_rate": 6.209436724578283e-07, + "loss": 1.0647, + "step": 30121 + }, + { + "epoch": 2.567288843433052, + "grad_norm": 63.33256659146118, + "learning_rate": 6.207043711866912e-07, + "loss": 1.4047, + "step": 30122 + }, + { + "epoch": 2.5673740731270773, + "grad_norm": 26.930954720868765, + "learning_rate": 6.204651129841855e-07, + "loss": 1.1116, + "step": 30123 + }, + { + "epoch": 2.567459302821103, + "grad_norm": 45.98595833556096, + "learning_rate": 6.202258978526648e-07, + "loss": 1.4606, + "step": 30124 + }, + { + "epoch": 2.5675445325151283, + "grad_norm": 33.52634092770197, + "learning_rate": 6.199867257944836e-07, + "loss": 1.0263, + "step": 30125 + }, + { + "epoch": 2.5676297622091537, + "grad_norm": 51.44865385476225, + "learning_rate": 6.197475968119926e-07, + "loss": 1.2497, + "step": 30126 + }, + { + "epoch": 2.567714991903179, + "grad_norm": 20.41509763417713, + "learning_rate": 6.195085109075421e-07, + "loss": 0.6128, + "step": 30127 + }, + { + "epoch": 2.5678002215972047, + "grad_norm": 62.60354503093852, + "learning_rate": 6.192694680834859e-07, + "loss": 1.5225, + "step": 30128 + }, + { + "epoch": 2.5678854512912297, + "grad_norm": 67.13242245293796, + "learning_rate": 6.190304683421738e-07, + "loss": 1.9383, + "step": 30129 + }, + { + "epoch": 2.567970680985255, + "grad_norm": 32.805560255650065, + "learning_rate": 6.187915116859561e-07, + "loss": 1.1972, + "step": 30130 + }, + { + "epoch": 2.5680559106792806, + "grad_norm": 42.916554477880425, + "learning_rate": 6.185525981171819e-07, + "loss": 1.4649, + "step": 30131 + }, + { + "epoch": 2.568141140373306, + "grad_norm": 57.35518349640689, + "learning_rate": 6.183137276382017e-07, + "loss": 1.2246, + "step": 30132 + }, + { + "epoch": 2.5682263700673316, + "grad_norm": 57.39825802935163, + "learning_rate": 6.180749002513664e-07, + "loss": 1.349, + "step": 30133 + }, + { + "epoch": 2.5683115997613566, + "grad_norm": 38.08948692294948, + "learning_rate": 6.17836115959023e-07, + "loss": 0.7286, + "step": 30134 + }, + { + "epoch": 2.5683968294553825, + "grad_norm": 64.6300236781335, + "learning_rate": 6.1759737476352e-07, + "loss": 1.351, + "step": 30135 + }, + { + "epoch": 2.5684820591494075, + "grad_norm": 31.491080478844857, + "learning_rate": 6.173586766672046e-07, + "loss": 1.0862, + "step": 30136 + }, + { + "epoch": 2.568567288843433, + "grad_norm": 49.45838626649312, + "learning_rate": 6.17120021672426e-07, + "loss": 1.1881, + "step": 30137 + }, + { + "epoch": 2.5686525185374585, + "grad_norm": 53.24574390530384, + "learning_rate": 6.168814097815301e-07, + "loss": 1.5427, + "step": 30138 + }, + { + "epoch": 2.568737748231484, + "grad_norm": 55.707974650998686, + "learning_rate": 6.166428409968634e-07, + "loss": 1.3063, + "step": 30139 + }, + { + "epoch": 2.5688229779255094, + "grad_norm": 98.44632744795459, + "learning_rate": 6.164043153207721e-07, + "loss": 2.2997, + "step": 30140 + }, + { + "epoch": 2.5689082076195344, + "grad_norm": 34.862268498394144, + "learning_rate": 6.161658327556042e-07, + "loss": 1.084, + "step": 30141 + }, + { + "epoch": 2.56899343731356, + "grad_norm": 78.1335198485437, + "learning_rate": 6.15927393303703e-07, + "loss": 1.7146, + "step": 30142 + }, + { + "epoch": 2.5690786670075854, + "grad_norm": 45.830972131095514, + "learning_rate": 6.156889969674135e-07, + "loss": 1.453, + "step": 30143 + }, + { + "epoch": 2.569163896701611, + "grad_norm": 55.1799882545663, + "learning_rate": 6.15450643749081e-07, + "loss": 1.0212, + "step": 30144 + }, + { + "epoch": 2.5692491263956363, + "grad_norm": 27.8612671844522, + "learning_rate": 6.15212333651048e-07, + "loss": 0.9647, + "step": 30145 + }, + { + "epoch": 2.5693343560896618, + "grad_norm": 73.2391613971625, + "learning_rate": 6.149740666756604e-07, + "loss": 1.6648, + "step": 30146 + }, + { + "epoch": 2.5694195857836872, + "grad_norm": 95.30111227064688, + "learning_rate": 6.147358428252592e-07, + "loss": 1.8504, + "step": 30147 + }, + { + "epoch": 2.5695048154777123, + "grad_norm": 62.962998436049915, + "learning_rate": 6.144976621021898e-07, + "loss": 1.4626, + "step": 30148 + }, + { + "epoch": 2.5695900451717377, + "grad_norm": 67.88919265125082, + "learning_rate": 6.142595245087917e-07, + "loss": 1.7393, + "step": 30149 + }, + { + "epoch": 2.569675274865763, + "grad_norm": 63.44560813442454, + "learning_rate": 6.140214300474101e-07, + "loss": 1.4229, + "step": 30150 + }, + { + "epoch": 2.5697605045597887, + "grad_norm": 49.822632792953186, + "learning_rate": 6.137833787203845e-07, + "loss": 1.5555, + "step": 30151 + }, + { + "epoch": 2.569845734253814, + "grad_norm": 71.70726721991545, + "learning_rate": 6.135453705300565e-07, + "loss": 1.5423, + "step": 30152 + }, + { + "epoch": 2.569930963947839, + "grad_norm": 90.6203095209029, + "learning_rate": 6.133074054787658e-07, + "loss": 2.2546, + "step": 30153 + }, + { + "epoch": 2.570016193641865, + "grad_norm": 74.8714466248598, + "learning_rate": 6.13069483568855e-07, + "loss": 1.5383, + "step": 30154 + }, + { + "epoch": 2.57010142333589, + "grad_norm": 25.106422134861134, + "learning_rate": 6.128316048026611e-07, + "loss": 0.7113, + "step": 30155 + }, + { + "epoch": 2.5701866530299156, + "grad_norm": 23.852125543336694, + "learning_rate": 6.125937691825267e-07, + "loss": 0.727, + "step": 30156 + }, + { + "epoch": 2.570271882723941, + "grad_norm": 94.40555681614241, + "learning_rate": 6.12355976710789e-07, + "loss": 2.0358, + "step": 30157 + }, + { + "epoch": 2.5703571124179665, + "grad_norm": 35.150046775253, + "learning_rate": 6.121182273897852e-07, + "loss": 1.1728, + "step": 30158 + }, + { + "epoch": 2.570442342111992, + "grad_norm": 82.82300681708696, + "learning_rate": 6.118805212218564e-07, + "loss": 1.8214, + "step": 30159 + }, + { + "epoch": 2.570527571806017, + "grad_norm": 53.565960326535986, + "learning_rate": 6.116428582093393e-07, + "loss": 1.4133, + "step": 30160 + }, + { + "epoch": 2.5706128015000425, + "grad_norm": 51.005289702916876, + "learning_rate": 6.11405238354571e-07, + "loss": 1.2688, + "step": 30161 + }, + { + "epoch": 2.570698031194068, + "grad_norm": 26.32827853719835, + "learning_rate": 6.111676616598871e-07, + "loss": 0.9412, + "step": 30162 + }, + { + "epoch": 2.5707832608880934, + "grad_norm": 24.378223410881358, + "learning_rate": 6.109301281276253e-07, + "loss": 0.8746, + "step": 30163 + }, + { + "epoch": 2.570868490582119, + "grad_norm": 49.182492725751196, + "learning_rate": 6.106926377601224e-07, + "loss": 1.3518, + "step": 30164 + }, + { + "epoch": 2.5709537202761443, + "grad_norm": 94.78597613456242, + "learning_rate": 6.104551905597133e-07, + "loss": 2.1408, + "step": 30165 + }, + { + "epoch": 2.57103894997017, + "grad_norm": 22.304314608819425, + "learning_rate": 6.102177865287323e-07, + "loss": 0.4319, + "step": 30166 + }, + { + "epoch": 2.571124179664195, + "grad_norm": 26.340198868422398, + "learning_rate": 6.099804256695164e-07, + "loss": 0.8892, + "step": 30167 + }, + { + "epoch": 2.5712094093582203, + "grad_norm": 87.01062809905048, + "learning_rate": 6.097431079843979e-07, + "loss": 2.0589, + "step": 30168 + }, + { + "epoch": 2.5712946390522458, + "grad_norm": 56.74720370038772, + "learning_rate": 6.095058334757121e-07, + "loss": 1.2763, + "step": 30169 + }, + { + "epoch": 2.5713798687462712, + "grad_norm": 38.51153184905297, + "learning_rate": 6.092686021457905e-07, + "loss": 1.1224, + "step": 30170 + }, + { + "epoch": 2.5714650984402967, + "grad_norm": 58.040919083066356, + "learning_rate": 6.090314139969672e-07, + "loss": 1.744, + "step": 30171 + }, + { + "epoch": 2.571550328134322, + "grad_norm": 45.580643741205726, + "learning_rate": 6.087942690315768e-07, + "loss": 1.0943, + "step": 30172 + }, + { + "epoch": 2.5716355578283476, + "grad_norm": 58.434936844597495, + "learning_rate": 6.085571672519497e-07, + "loss": 1.2242, + "step": 30173 + }, + { + "epoch": 2.5717207875223727, + "grad_norm": 30.48963917816129, + "learning_rate": 6.083201086604174e-07, + "loss": 0.9111, + "step": 30174 + }, + { + "epoch": 2.571806017216398, + "grad_norm": 96.8215161167525, + "learning_rate": 6.080830932593112e-07, + "loss": 3.1243, + "step": 30175 + }, + { + "epoch": 2.5718912469104236, + "grad_norm": 24.35143926094781, + "learning_rate": 6.078461210509634e-07, + "loss": 0.7915, + "step": 30176 + }, + { + "epoch": 2.571976476604449, + "grad_norm": 81.31932780610173, + "learning_rate": 6.076091920377037e-07, + "loss": 1.5014, + "step": 30177 + }, + { + "epoch": 2.5720617062984745, + "grad_norm": 65.79288459935488, + "learning_rate": 6.073723062218611e-07, + "loss": 1.6886, + "step": 30178 + }, + { + "epoch": 2.5721469359924996, + "grad_norm": 47.50916692300987, + "learning_rate": 6.071354636057669e-07, + "loss": 1.4493, + "step": 30179 + }, + { + "epoch": 2.5722321656865255, + "grad_norm": 38.610907309247374, + "learning_rate": 6.068986641917501e-07, + "loss": 1.4368, + "step": 30180 + }, + { + "epoch": 2.5723173953805505, + "grad_norm": 66.53492948807336, + "learning_rate": 6.0666190798214e-07, + "loss": 1.8293, + "step": 30181 + }, + { + "epoch": 2.572402625074576, + "grad_norm": 30.567389768262736, + "learning_rate": 6.064251949792644e-07, + "loss": 0.8298, + "step": 30182 + }, + { + "epoch": 2.5724878547686014, + "grad_norm": 62.67926852767051, + "learning_rate": 6.061885251854494e-07, + "loss": 1.4602, + "step": 30183 + }, + { + "epoch": 2.572573084462627, + "grad_norm": 38.12087371855734, + "learning_rate": 6.05951898603026e-07, + "loss": 1.0644, + "step": 30184 + }, + { + "epoch": 2.5726583141566524, + "grad_norm": 33.43680140109296, + "learning_rate": 6.057153152343192e-07, + "loss": 1.0085, + "step": 30185 + }, + { + "epoch": 2.5727435438506774, + "grad_norm": 76.94410959600398, + "learning_rate": 6.054787750816554e-07, + "loss": 1.835, + "step": 30186 + }, + { + "epoch": 2.572828773544703, + "grad_norm": 30.332303107185897, + "learning_rate": 6.052422781473627e-07, + "loss": 0.9934, + "step": 30187 + }, + { + "epoch": 2.5729140032387283, + "grad_norm": 19.9557554370216, + "learning_rate": 6.050058244337648e-07, + "loss": 0.6539, + "step": 30188 + }, + { + "epoch": 2.572999232932754, + "grad_norm": 50.68761980496725, + "learning_rate": 6.047694139431892e-07, + "loss": 1.2852, + "step": 30189 + }, + { + "epoch": 2.5730844626267793, + "grad_norm": 24.343525579980934, + "learning_rate": 6.045330466779598e-07, + "loss": 0.906, + "step": 30190 + }, + { + "epoch": 2.5731696923208047, + "grad_norm": 28.93133705699916, + "learning_rate": 6.042967226404017e-07, + "loss": 0.688, + "step": 30191 + }, + { + "epoch": 2.57325492201483, + "grad_norm": 24.776073103952697, + "learning_rate": 6.040604418328371e-07, + "loss": 0.8351, + "step": 30192 + }, + { + "epoch": 2.573340151708855, + "grad_norm": 46.36451059514291, + "learning_rate": 6.038242042575925e-07, + "loss": 2.08, + "step": 30193 + }, + { + "epoch": 2.5734253814028807, + "grad_norm": 50.81858037227459, + "learning_rate": 6.035880099169894e-07, + "loss": 1.3989, + "step": 30194 + }, + { + "epoch": 2.573510611096906, + "grad_norm": 63.363472468966854, + "learning_rate": 6.033518588133519e-07, + "loss": 1.4974, + "step": 30195 + }, + { + "epoch": 2.5735958407909316, + "grad_norm": 56.84594763499952, + "learning_rate": 6.031157509490004e-07, + "loss": 1.8492, + "step": 30196 + }, + { + "epoch": 2.573681070484957, + "grad_norm": 57.87623700393665, + "learning_rate": 6.028796863262598e-07, + "loss": 1.617, + "step": 30197 + }, + { + "epoch": 2.573766300178982, + "grad_norm": 58.566750794813444, + "learning_rate": 6.026436649474505e-07, + "loss": 1.1976, + "step": 30198 + }, + { + "epoch": 2.573851529873008, + "grad_norm": 321.520921490647, + "learning_rate": 6.024076868148931e-07, + "loss": 3.693, + "step": 30199 + }, + { + "epoch": 2.573936759567033, + "grad_norm": 65.37023546706224, + "learning_rate": 6.021717519309073e-07, + "loss": 1.6558, + "step": 30200 + }, + { + "epoch": 2.5740219892610585, + "grad_norm": 43.542714287169055, + "learning_rate": 6.019358602978164e-07, + "loss": 1.4737, + "step": 30201 + }, + { + "epoch": 2.574107218955084, + "grad_norm": 32.99271278187992, + "learning_rate": 6.017000119179373e-07, + "loss": 0.8902, + "step": 30202 + }, + { + "epoch": 2.5741924486491095, + "grad_norm": 52.3666220283108, + "learning_rate": 6.014642067935922e-07, + "loss": 1.373, + "step": 30203 + }, + { + "epoch": 2.574277678343135, + "grad_norm": 37.57315750863995, + "learning_rate": 6.012284449270989e-07, + "loss": 1.4095, + "step": 30204 + }, + { + "epoch": 2.57436290803716, + "grad_norm": 44.069030754951946, + "learning_rate": 6.009927263207744e-07, + "loss": 1.0754, + "step": 30205 + }, + { + "epoch": 2.5744481377311854, + "grad_norm": 53.94423904722471, + "learning_rate": 6.007570509769395e-07, + "loss": 1.2373, + "step": 30206 + }, + { + "epoch": 2.574533367425211, + "grad_norm": 50.02431329007457, + "learning_rate": 6.005214188979113e-07, + "loss": 1.5769, + "step": 30207 + }, + { + "epoch": 2.5746185971192364, + "grad_norm": 43.25239875806587, + "learning_rate": 6.002858300860065e-07, + "loss": 1.3778, + "step": 30208 + }, + { + "epoch": 2.574703826813262, + "grad_norm": 33.83521600856153, + "learning_rate": 6.000502845435418e-07, + "loss": 0.866, + "step": 30209 + }, + { + "epoch": 2.5747890565072873, + "grad_norm": 40.6231243574544, + "learning_rate": 5.998147822728334e-07, + "loss": 0.9382, + "step": 30210 + }, + { + "epoch": 2.5748742862013128, + "grad_norm": 64.50944872818432, + "learning_rate": 5.995793232761999e-07, + "loss": 2.1337, + "step": 30211 + }, + { + "epoch": 2.574959515895338, + "grad_norm": 68.90869417837233, + "learning_rate": 5.993439075559548e-07, + "loss": 1.4293, + "step": 30212 + }, + { + "epoch": 2.5750447455893632, + "grad_norm": 64.5116437967916, + "learning_rate": 5.991085351144127e-07, + "loss": 1.7094, + "step": 30213 + }, + { + "epoch": 2.5751299752833887, + "grad_norm": 57.89753125418914, + "learning_rate": 5.988732059538904e-07, + "loss": 1.1781, + "step": 30214 + }, + { + "epoch": 2.575215204977414, + "grad_norm": 51.52518599260708, + "learning_rate": 5.986379200767012e-07, + "loss": 0.7506, + "step": 30215 + }, + { + "epoch": 2.5753004346714397, + "grad_norm": 19.306705280767837, + "learning_rate": 5.984026774851592e-07, + "loss": 0.6215, + "step": 30216 + }, + { + "epoch": 2.5753856643654647, + "grad_norm": 70.45098729900539, + "learning_rate": 5.981674781815772e-07, + "loss": 1.7834, + "step": 30217 + }, + { + "epoch": 2.5754708940594906, + "grad_norm": 72.3198121093329, + "learning_rate": 5.979323221682687e-07, + "loss": 1.6494, + "step": 30218 + }, + { + "epoch": 2.5755561237535156, + "grad_norm": 51.914864160353375, + "learning_rate": 5.976972094475475e-07, + "loss": 1.148, + "step": 30219 + }, + { + "epoch": 2.575641353447541, + "grad_norm": 90.13778167169639, + "learning_rate": 5.974621400217251e-07, + "loss": 2.0352, + "step": 30220 + }, + { + "epoch": 2.5757265831415666, + "grad_norm": 34.34982657719249, + "learning_rate": 5.97227113893113e-07, + "loss": 1.1767, + "step": 30221 + }, + { + "epoch": 2.575811812835592, + "grad_norm": 54.26038593680713, + "learning_rate": 5.96992131064022e-07, + "loss": 1.4098, + "step": 30222 + }, + { + "epoch": 2.5758970425296175, + "grad_norm": 22.614409952376704, + "learning_rate": 5.967571915367642e-07, + "loss": 0.8958, + "step": 30223 + }, + { + "epoch": 2.5759822722236425, + "grad_norm": 59.47203834243865, + "learning_rate": 5.965222953136502e-07, + "loss": 2.1016, + "step": 30224 + }, + { + "epoch": 2.576067501917668, + "grad_norm": 29.08161993348298, + "learning_rate": 5.962874423969889e-07, + "loss": 0.7803, + "step": 30225 + }, + { + "epoch": 2.5761527316116934, + "grad_norm": 56.03893980610307, + "learning_rate": 5.960526327890903e-07, + "loss": 1.6444, + "step": 30226 + }, + { + "epoch": 2.576237961305719, + "grad_norm": 45.40223327267962, + "learning_rate": 5.958178664922654e-07, + "loss": 1.6773, + "step": 30227 + }, + { + "epoch": 2.5763231909997444, + "grad_norm": 55.52147409073705, + "learning_rate": 5.955831435088216e-07, + "loss": 1.245, + "step": 30228 + }, + { + "epoch": 2.57640842069377, + "grad_norm": 98.49748602766829, + "learning_rate": 5.953484638410673e-07, + "loss": 2.6162, + "step": 30229 + }, + { + "epoch": 2.5764936503877953, + "grad_norm": 76.57098600426633, + "learning_rate": 5.95113827491311e-07, + "loss": 1.7123, + "step": 30230 + }, + { + "epoch": 2.5765788800818203, + "grad_norm": 31.441316848127375, + "learning_rate": 5.948792344618581e-07, + "loss": 0.9037, + "step": 30231 + }, + { + "epoch": 2.576664109775846, + "grad_norm": 40.44466458127853, + "learning_rate": 5.946446847550191e-07, + "loss": 0.9283, + "step": 30232 + }, + { + "epoch": 2.5767493394698713, + "grad_norm": 74.484893151813, + "learning_rate": 5.944101783730983e-07, + "loss": 1.6007, + "step": 30233 + }, + { + "epoch": 2.5768345691638967, + "grad_norm": 84.82529277838927, + "learning_rate": 5.941757153184036e-07, + "loss": 2.1101, + "step": 30234 + }, + { + "epoch": 2.576919798857922, + "grad_norm": 40.23484267567042, + "learning_rate": 5.939412955932389e-07, + "loss": 1.1908, + "step": 30235 + }, + { + "epoch": 2.5770050285519472, + "grad_norm": 59.28013046549906, + "learning_rate": 5.937069191999118e-07, + "loss": 1.555, + "step": 30236 + }, + { + "epoch": 2.577090258245973, + "grad_norm": 27.462217226750887, + "learning_rate": 5.934725861407265e-07, + "loss": 0.9972, + "step": 30237 + }, + { + "epoch": 2.577175487939998, + "grad_norm": 10.138216679613723, + "learning_rate": 5.932382964179867e-07, + "loss": 0.3835, + "step": 30238 + }, + { + "epoch": 2.5772607176340236, + "grad_norm": 32.98209965174023, + "learning_rate": 5.930040500339968e-07, + "loss": 0.8483, + "step": 30239 + }, + { + "epoch": 2.577345947328049, + "grad_norm": 65.09225660254886, + "learning_rate": 5.927698469910614e-07, + "loss": 1.5902, + "step": 30240 + }, + { + "epoch": 2.5774311770220746, + "grad_norm": 53.427870731277395, + "learning_rate": 5.925356872914828e-07, + "loss": 1.4533, + "step": 30241 + }, + { + "epoch": 2.5775164067161, + "grad_norm": 58.263717155945855, + "learning_rate": 5.923015709375651e-07, + "loss": 1.5424, + "step": 30242 + }, + { + "epoch": 2.577601636410125, + "grad_norm": 42.27778299464357, + "learning_rate": 5.920674979316094e-07, + "loss": 1.3004, + "step": 30243 + }, + { + "epoch": 2.5776868661041505, + "grad_norm": 35.375229678893355, + "learning_rate": 5.918334682759186e-07, + "loss": 1.3762, + "step": 30244 + }, + { + "epoch": 2.577772095798176, + "grad_norm": 50.599989176479056, + "learning_rate": 5.915994819727944e-07, + "loss": 1.3002, + "step": 30245 + }, + { + "epoch": 2.5778573254922015, + "grad_norm": 74.79047646710943, + "learning_rate": 5.913655390245376e-07, + "loss": 1.5395, + "step": 30246 + }, + { + "epoch": 2.577942555186227, + "grad_norm": 43.67628766551892, + "learning_rate": 5.911316394334487e-07, + "loss": 1.23, + "step": 30247 + }, + { + "epoch": 2.5780277848802524, + "grad_norm": 40.02169755876011, + "learning_rate": 5.908977832018276e-07, + "loss": 0.9076, + "step": 30248 + }, + { + "epoch": 2.578113014574278, + "grad_norm": 52.26374287895455, + "learning_rate": 5.906639703319744e-07, + "loss": 1.3311, + "step": 30249 + }, + { + "epoch": 2.578198244268303, + "grad_norm": 47.83132949250846, + "learning_rate": 5.904302008261903e-07, + "loss": 1.475, + "step": 30250 + }, + { + "epoch": 2.5782834739623284, + "grad_norm": 76.0379013673553, + "learning_rate": 5.901964746867727e-07, + "loss": 1.7168, + "step": 30251 + }, + { + "epoch": 2.578368703656354, + "grad_norm": 27.033818223200907, + "learning_rate": 5.899627919160194e-07, + "loss": 0.8186, + "step": 30252 + }, + { + "epoch": 2.5784539333503793, + "grad_norm": 27.350653355110726, + "learning_rate": 5.897291525162313e-07, + "loss": 0.6176, + "step": 30253 + }, + { + "epoch": 2.578539163044405, + "grad_norm": 30.946595930759948, + "learning_rate": 5.894955564897037e-07, + "loss": 1.2553, + "step": 30254 + }, + { + "epoch": 2.57862439273843, + "grad_norm": 55.8250537726235, + "learning_rate": 5.892620038387353e-07, + "loss": 1.3102, + "step": 30255 + }, + { + "epoch": 2.5787096224324557, + "grad_norm": 24.619535380618373, + "learning_rate": 5.890284945656211e-07, + "loss": 1.1144, + "step": 30256 + }, + { + "epoch": 2.5787948521264807, + "grad_norm": 97.32265855481818, + "learning_rate": 5.88795028672659e-07, + "loss": 1.8339, + "step": 30257 + }, + { + "epoch": 2.578880081820506, + "grad_norm": 55.841056386546576, + "learning_rate": 5.885616061621463e-07, + "loss": 1.8148, + "step": 30258 + }, + { + "epoch": 2.5789653115145317, + "grad_norm": 26.118306141302316, + "learning_rate": 5.883282270363766e-07, + "loss": 0.7362, + "step": 30259 + }, + { + "epoch": 2.579050541208557, + "grad_norm": 87.35623137533157, + "learning_rate": 5.880948912976465e-07, + "loss": 1.9792, + "step": 30260 + }, + { + "epoch": 2.5791357709025826, + "grad_norm": 27.56811245883361, + "learning_rate": 5.878615989482484e-07, + "loss": 0.9436, + "step": 30261 + }, + { + "epoch": 2.5792210005966076, + "grad_norm": 45.03673073116367, + "learning_rate": 5.876283499904795e-07, + "loss": 1.2229, + "step": 30262 + }, + { + "epoch": 2.579306230290633, + "grad_norm": 35.242733931118465, + "learning_rate": 5.87395144426633e-07, + "loss": 1.0817, + "step": 30263 + }, + { + "epoch": 2.5793914599846586, + "grad_norm": 24.177022570408905, + "learning_rate": 5.871619822590002e-07, + "loss": 0.79, + "step": 30264 + }, + { + "epoch": 2.579476689678684, + "grad_norm": 61.572247485828214, + "learning_rate": 5.869288634898756e-07, + "loss": 1.7202, + "step": 30265 + }, + { + "epoch": 2.5795619193727095, + "grad_norm": 80.38760291977141, + "learning_rate": 5.866957881215535e-07, + "loss": 2.4333, + "step": 30266 + }, + { + "epoch": 2.579647149066735, + "grad_norm": 40.87381921550671, + "learning_rate": 5.864627561563246e-07, + "loss": 0.6891, + "step": 30267 + }, + { + "epoch": 2.5797323787607604, + "grad_norm": 61.418908796681, + "learning_rate": 5.862297675964807e-07, + "loss": 1.846, + "step": 30268 + }, + { + "epoch": 2.5798176084547855, + "grad_norm": 46.81491541276741, + "learning_rate": 5.859968224443119e-07, + "loss": 1.4866, + "step": 30269 + }, + { + "epoch": 2.579902838148811, + "grad_norm": 58.46872490686325, + "learning_rate": 5.857639207021121e-07, + "loss": 1.4161, + "step": 30270 + }, + { + "epoch": 2.5799880678428364, + "grad_norm": 58.99118475885831, + "learning_rate": 5.855310623721694e-07, + "loss": 1.7123, + "step": 30271 + }, + { + "epoch": 2.580073297536862, + "grad_norm": 36.85894371119814, + "learning_rate": 5.852982474567737e-07, + "loss": 0.8488, + "step": 30272 + }, + { + "epoch": 2.5801585272308873, + "grad_norm": 42.93783373597633, + "learning_rate": 5.850654759582158e-07, + "loss": 0.8959, + "step": 30273 + }, + { + "epoch": 2.5802437569249124, + "grad_norm": 57.18394509626599, + "learning_rate": 5.848327478787852e-07, + "loss": 1.8243, + "step": 30274 + }, + { + "epoch": 2.5803289866189383, + "grad_norm": 73.95869848813891, + "learning_rate": 5.846000632207705e-07, + "loss": 1.3768, + "step": 30275 + }, + { + "epoch": 2.5804142163129633, + "grad_norm": 34.54696315751895, + "learning_rate": 5.843674219864598e-07, + "loss": 0.831, + "step": 30276 + }, + { + "epoch": 2.5804994460069888, + "grad_norm": 74.44669548318421, + "learning_rate": 5.841348241781403e-07, + "loss": 1.6109, + "step": 30277 + }, + { + "epoch": 2.5805846757010142, + "grad_norm": 54.29914537470336, + "learning_rate": 5.839022697980995e-07, + "loss": 1.0288, + "step": 30278 + }, + { + "epoch": 2.5806699053950397, + "grad_norm": 27.552926315942152, + "learning_rate": 5.836697588486262e-07, + "loss": 0.9985, + "step": 30279 + }, + { + "epoch": 2.580755135089065, + "grad_norm": 73.66709893639167, + "learning_rate": 5.834372913320047e-07, + "loss": 0.9011, + "step": 30280 + }, + { + "epoch": 2.58084036478309, + "grad_norm": 36.79284211704044, + "learning_rate": 5.832048672505236e-07, + "loss": 1.4366, + "step": 30281 + }, + { + "epoch": 2.5809255944771157, + "grad_norm": 42.76991522587117, + "learning_rate": 5.829724866064662e-07, + "loss": 1.4615, + "step": 30282 + }, + { + "epoch": 2.581010824171141, + "grad_norm": 47.09024532443863, + "learning_rate": 5.827401494021206e-07, + "loss": 0.9639, + "step": 30283 + }, + { + "epoch": 2.5810960538651666, + "grad_norm": 86.38974494189691, + "learning_rate": 5.825078556397706e-07, + "loss": 2.0977, + "step": 30284 + }, + { + "epoch": 2.581181283559192, + "grad_norm": 37.75300874840295, + "learning_rate": 5.822756053217005e-07, + "loss": 0.8537, + "step": 30285 + }, + { + "epoch": 2.5812665132532175, + "grad_norm": 74.4131467463949, + "learning_rate": 5.820433984501928e-07, + "loss": 2.3976, + "step": 30286 + }, + { + "epoch": 2.581351742947243, + "grad_norm": 42.35395003929691, + "learning_rate": 5.81811235027534e-07, + "loss": 1.207, + "step": 30287 + }, + { + "epoch": 2.581436972641268, + "grad_norm": 25.13514947608734, + "learning_rate": 5.815791150560052e-07, + "loss": 0.5501, + "step": 30288 + }, + { + "epoch": 2.5815222023352935, + "grad_norm": 42.127886842222146, + "learning_rate": 5.813470385378911e-07, + "loss": 1.2185, + "step": 30289 + }, + { + "epoch": 2.581607432029319, + "grad_norm": 41.12116628854289, + "learning_rate": 5.811150054754733e-07, + "loss": 0.9517, + "step": 30290 + }, + { + "epoch": 2.5816926617233444, + "grad_norm": 38.46666873294723, + "learning_rate": 5.808830158710322e-07, + "loss": 0.9541, + "step": 30291 + }, + { + "epoch": 2.58177789141737, + "grad_norm": 45.13662194901386, + "learning_rate": 5.806510697268513e-07, + "loss": 1.313, + "step": 30292 + }, + { + "epoch": 2.5818631211113954, + "grad_norm": 59.74751799308225, + "learning_rate": 5.804191670452119e-07, + "loss": 1.8433, + "step": 30293 + }, + { + "epoch": 2.581948350805421, + "grad_norm": 29.846722655931057, + "learning_rate": 5.801873078283931e-07, + "loss": 1.1506, + "step": 30294 + }, + { + "epoch": 2.582033580499446, + "grad_norm": 49.02821160129643, + "learning_rate": 5.799554920786755e-07, + "loss": 1.526, + "step": 30295 + }, + { + "epoch": 2.5821188101934713, + "grad_norm": 53.20005929417971, + "learning_rate": 5.797237197983385e-07, + "loss": 0.9838, + "step": 30296 + }, + { + "epoch": 2.582204039887497, + "grad_norm": 67.55995342570336, + "learning_rate": 5.794919909896641e-07, + "loss": 1.9856, + "step": 30297 + }, + { + "epoch": 2.5822892695815223, + "grad_norm": 41.312024612209036, + "learning_rate": 5.79260305654929e-07, + "loss": 1.2397, + "step": 30298 + }, + { + "epoch": 2.5823744992755477, + "grad_norm": 50.90528191370605, + "learning_rate": 5.79028663796411e-07, + "loss": 1.7908, + "step": 30299 + }, + { + "epoch": 2.5824597289695728, + "grad_norm": 41.58482528419236, + "learning_rate": 5.787970654163905e-07, + "loss": 1.0528, + "step": 30300 + }, + { + "epoch": 2.5825449586635982, + "grad_norm": 25.540851582833294, + "learning_rate": 5.785655105171445e-07, + "loss": 1.1053, + "step": 30301 + }, + { + "epoch": 2.5826301883576237, + "grad_norm": 54.73812326636185, + "learning_rate": 5.783339991009495e-07, + "loss": 1.4745, + "step": 30302 + }, + { + "epoch": 2.582715418051649, + "grad_norm": 82.47856574399154, + "learning_rate": 5.78102531170081e-07, + "loss": 1.8397, + "step": 30303 + }, + { + "epoch": 2.5828006477456746, + "grad_norm": 44.11528367722714, + "learning_rate": 5.778711067268178e-07, + "loss": 0.8736, + "step": 30304 + }, + { + "epoch": 2.5828858774397, + "grad_norm": 35.39771073144644, + "learning_rate": 5.776397257734361e-07, + "loss": 1.2914, + "step": 30305 + }, + { + "epoch": 2.5829711071337256, + "grad_norm": 57.16329457945986, + "learning_rate": 5.7740838831221e-07, + "loss": 1.4971, + "step": 30306 + }, + { + "epoch": 2.5830563368277506, + "grad_norm": 58.2188682205428, + "learning_rate": 5.771770943454152e-07, + "loss": 1.363, + "step": 30307 + }, + { + "epoch": 2.583141566521776, + "grad_norm": 28.926447381357306, + "learning_rate": 5.76945843875325e-07, + "loss": 0.7976, + "step": 30308 + }, + { + "epoch": 2.5832267962158015, + "grad_norm": 37.26663384469729, + "learning_rate": 5.767146369042164e-07, + "loss": 0.8867, + "step": 30309 + }, + { + "epoch": 2.583312025909827, + "grad_norm": 47.038776786017316, + "learning_rate": 5.764834734343617e-07, + "loss": 1.2158, + "step": 30310 + }, + { + "epoch": 2.5833972556038525, + "grad_norm": 63.40666234465246, + "learning_rate": 5.762523534680326e-07, + "loss": 1.8968, + "step": 30311 + }, + { + "epoch": 2.583482485297878, + "grad_norm": 58.46505668069474, + "learning_rate": 5.760212770075041e-07, + "loss": 2.2613, + "step": 30312 + }, + { + "epoch": 2.5835677149919034, + "grad_norm": 39.702370413257555, + "learning_rate": 5.757902440550494e-07, + "loss": 1.4953, + "step": 30313 + }, + { + "epoch": 2.5836529446859284, + "grad_norm": 51.02297821662113, + "learning_rate": 5.755592546129402e-07, + "loss": 1.2685, + "step": 30314 + }, + { + "epoch": 2.583738174379954, + "grad_norm": 25.08159144245693, + "learning_rate": 5.753283086834472e-07, + "loss": 0.6764, + "step": 30315 + }, + { + "epoch": 2.5838234040739794, + "grad_norm": 37.09075125558872, + "learning_rate": 5.750974062688408e-07, + "loss": 0.7002, + "step": 30316 + }, + { + "epoch": 2.583908633768005, + "grad_norm": 40.607256645622385, + "learning_rate": 5.748665473713944e-07, + "loss": 1.5414, + "step": 30317 + }, + { + "epoch": 2.5839938634620303, + "grad_norm": 24.497273229371157, + "learning_rate": 5.74635731993377e-07, + "loss": 0.8474, + "step": 30318 + }, + { + "epoch": 2.5840790931560553, + "grad_norm": 25.50366266790735, + "learning_rate": 5.744049601370582e-07, + "loss": 0.9877, + "step": 30319 + }, + { + "epoch": 2.5841643228500812, + "grad_norm": 70.79876818377285, + "learning_rate": 5.741742318047089e-07, + "loss": 1.8598, + "step": 30320 + }, + { + "epoch": 2.5842495525441063, + "grad_norm": 34.6348043619792, + "learning_rate": 5.73943546998596e-07, + "loss": 1.2333, + "step": 30321 + }, + { + "epoch": 2.5843347822381317, + "grad_norm": 85.59574152645438, + "learning_rate": 5.737129057209906e-07, + "loss": 1.5521, + "step": 30322 + }, + { + "epoch": 2.584420011932157, + "grad_norm": 41.00662705741109, + "learning_rate": 5.734823079741603e-07, + "loss": 0.6714, + "step": 30323 + }, + { + "epoch": 2.5845052416261827, + "grad_norm": 95.47548507492633, + "learning_rate": 5.732517537603721e-07, + "loss": 2.1167, + "step": 30324 + }, + { + "epoch": 2.584590471320208, + "grad_norm": 54.09011245092882, + "learning_rate": 5.730212430818932e-07, + "loss": 1.4998, + "step": 30325 + }, + { + "epoch": 2.584675701014233, + "grad_norm": 55.32254067555274, + "learning_rate": 5.727907759409923e-07, + "loss": 2.1059, + "step": 30326 + }, + { + "epoch": 2.5847609307082586, + "grad_norm": 82.95325547111658, + "learning_rate": 5.725603523399337e-07, + "loss": 1.8291, + "step": 30327 + }, + { + "epoch": 2.584846160402284, + "grad_norm": 31.268201957034478, + "learning_rate": 5.723299722809861e-07, + "loss": 0.8374, + "step": 30328 + }, + { + "epoch": 2.5849313900963096, + "grad_norm": 50.207345709781166, + "learning_rate": 5.720996357664127e-07, + "loss": 1.3725, + "step": 30329 + }, + { + "epoch": 2.585016619790335, + "grad_norm": 45.543149293943806, + "learning_rate": 5.718693427984806e-07, + "loss": 1.5195, + "step": 30330 + }, + { + "epoch": 2.5851018494843605, + "grad_norm": 30.903172147072016, + "learning_rate": 5.716390933794541e-07, + "loss": 0.8651, + "step": 30331 + }, + { + "epoch": 2.585187079178386, + "grad_norm": 35.947346217408274, + "learning_rate": 5.714088875115975e-07, + "loss": 0.8179, + "step": 30332 + }, + { + "epoch": 2.585272308872411, + "grad_norm": 25.248538900479485, + "learning_rate": 5.711787251971751e-07, + "loss": 1.0425, + "step": 30333 + }, + { + "epoch": 2.5853575385664365, + "grad_norm": 95.01812002646966, + "learning_rate": 5.709486064384484e-07, + "loss": 2.1878, + "step": 30334 + }, + { + "epoch": 2.585442768260462, + "grad_norm": 41.42056331646621, + "learning_rate": 5.707185312376828e-07, + "loss": 1.4544, + "step": 30335 + }, + { + "epoch": 2.5855279979544874, + "grad_norm": 45.86330085733316, + "learning_rate": 5.70488499597141e-07, + "loss": 1.3507, + "step": 30336 + }, + { + "epoch": 2.585613227648513, + "grad_norm": 25.330088591200624, + "learning_rate": 5.702585115190851e-07, + "loss": 1.1579, + "step": 30337 + }, + { + "epoch": 2.585698457342538, + "grad_norm": 60.86431528395176, + "learning_rate": 5.700285670057754e-07, + "loss": 1.3364, + "step": 30338 + }, + { + "epoch": 2.585783687036564, + "grad_norm": 64.5928509185603, + "learning_rate": 5.697986660594756e-07, + "loss": 1.7128, + "step": 30339 + }, + { + "epoch": 2.585868916730589, + "grad_norm": 53.581748188324134, + "learning_rate": 5.695688086824453e-07, + "loss": 1.6985, + "step": 30340 + }, + { + "epoch": 2.5859541464246143, + "grad_norm": 38.30057023401057, + "learning_rate": 5.693389948769451e-07, + "loss": 0.8984, + "step": 30341 + }, + { + "epoch": 2.5860393761186398, + "grad_norm": 51.70187450341511, + "learning_rate": 5.691092246452345e-07, + "loss": 1.3933, + "step": 30342 + }, + { + "epoch": 2.5861246058126652, + "grad_norm": 22.157788759043793, + "learning_rate": 5.68879497989574e-07, + "loss": 0.8521, + "step": 30343 + }, + { + "epoch": 2.5862098355066907, + "grad_norm": 75.92048536733509, + "learning_rate": 5.686498149122238e-07, + "loss": 2.0459, + "step": 30344 + }, + { + "epoch": 2.5862950652007157, + "grad_norm": 41.157278222177275, + "learning_rate": 5.684201754154423e-07, + "loss": 1.2755, + "step": 30345 + }, + { + "epoch": 2.586380294894741, + "grad_norm": 44.27441438524508, + "learning_rate": 5.681905795014858e-07, + "loss": 1.2341, + "step": 30346 + }, + { + "epoch": 2.5864655245887667, + "grad_norm": 37.67032844306135, + "learning_rate": 5.679610271726154e-07, + "loss": 0.6319, + "step": 30347 + }, + { + "epoch": 2.586550754282792, + "grad_norm": 71.62682681344542, + "learning_rate": 5.677315184310867e-07, + "loss": 1.7455, + "step": 30348 + }, + { + "epoch": 2.5866359839768176, + "grad_norm": 29.18320003937706, + "learning_rate": 5.675020532791575e-07, + "loss": 0.7628, + "step": 30349 + }, + { + "epoch": 2.586721213670843, + "grad_norm": 31.61975322061919, + "learning_rate": 5.672726317190835e-07, + "loss": 0.897, + "step": 30350 + }, + { + "epoch": 2.5868064433648685, + "grad_norm": 20.521341354411053, + "learning_rate": 5.670432537531218e-07, + "loss": 0.8058, + "step": 30351 + }, + { + "epoch": 2.5868916730588936, + "grad_norm": 28.420192563024322, + "learning_rate": 5.668139193835292e-07, + "loss": 0.9829, + "step": 30352 + }, + { + "epoch": 2.586976902752919, + "grad_norm": 81.64217857173418, + "learning_rate": 5.665846286125598e-07, + "loss": 1.9756, + "step": 30353 + }, + { + "epoch": 2.5870621324469445, + "grad_norm": 83.38212117282885, + "learning_rate": 5.663553814424694e-07, + "loss": 1.2887, + "step": 30354 + }, + { + "epoch": 2.58714736214097, + "grad_norm": 23.452018644151785, + "learning_rate": 5.661261778755106e-07, + "loss": 0.7219, + "step": 30355 + }, + { + "epoch": 2.5872325918349954, + "grad_norm": 43.58811836193584, + "learning_rate": 5.6589701791394e-07, + "loss": 1.6189, + "step": 30356 + }, + { + "epoch": 2.5873178215290205, + "grad_norm": 40.40456712726137, + "learning_rate": 5.656679015600108e-07, + "loss": 1.3387, + "step": 30357 + }, + { + "epoch": 2.5874030512230464, + "grad_norm": 27.08272517046414, + "learning_rate": 5.65438828815974e-07, + "loss": 0.6416, + "step": 30358 + }, + { + "epoch": 2.5874882809170714, + "grad_norm": 69.58124140301383, + "learning_rate": 5.65209799684085e-07, + "loss": 1.817, + "step": 30359 + }, + { + "epoch": 2.587573510611097, + "grad_norm": 60.692062751560854, + "learning_rate": 5.649808141665958e-07, + "loss": 1.3166, + "step": 30360 + }, + { + "epoch": 2.5876587403051223, + "grad_norm": 42.87148905082174, + "learning_rate": 5.647518722657586e-07, + "loss": 1.2223, + "step": 30361 + }, + { + "epoch": 2.587743969999148, + "grad_norm": 58.226792217767404, + "learning_rate": 5.645229739838238e-07, + "loss": 1.8318, + "step": 30362 + }, + { + "epoch": 2.5878291996931733, + "grad_norm": 25.853387121069158, + "learning_rate": 5.642941193230433e-07, + "loss": 0.9659, + "step": 30363 + }, + { + "epoch": 2.5879144293871983, + "grad_norm": 65.41666569600328, + "learning_rate": 5.640653082856662e-07, + "loss": 1.1494, + "step": 30364 + }, + { + "epoch": 2.5879996590812238, + "grad_norm": 94.71941534103688, + "learning_rate": 5.638365408739455e-07, + "loss": 1.7297, + "step": 30365 + }, + { + "epoch": 2.5880848887752492, + "grad_norm": 23.86106344350747, + "learning_rate": 5.636078170901282e-07, + "loss": 0.7746, + "step": 30366 + }, + { + "epoch": 2.5881701184692747, + "grad_norm": 81.17651024380515, + "learning_rate": 5.633791369364666e-07, + "loss": 1.8014, + "step": 30367 + }, + { + "epoch": 2.5882553481633, + "grad_norm": 33.56507038414775, + "learning_rate": 5.631505004152077e-07, + "loss": 0.8144, + "step": 30368 + }, + { + "epoch": 2.5883405778573256, + "grad_norm": 25.629455445331388, + "learning_rate": 5.629219075286008e-07, + "loss": 0.954, + "step": 30369 + }, + { + "epoch": 2.588425807551351, + "grad_norm": 30.59552164615041, + "learning_rate": 5.62693358278894e-07, + "loss": 0.7246, + "step": 30370 + }, + { + "epoch": 2.588511037245376, + "grad_norm": 71.26841919432094, + "learning_rate": 5.624648526683351e-07, + "loss": 2.1478, + "step": 30371 + }, + { + "epoch": 2.5885962669394016, + "grad_norm": 25.148108848939994, + "learning_rate": 5.622363906991695e-07, + "loss": 0.9958, + "step": 30372 + }, + { + "epoch": 2.588681496633427, + "grad_norm": 34.99021756225783, + "learning_rate": 5.620079723736471e-07, + "loss": 1.1904, + "step": 30373 + }, + { + "epoch": 2.5887667263274525, + "grad_norm": 40.98095778766063, + "learning_rate": 5.617795976940122e-07, + "loss": 1.4006, + "step": 30374 + }, + { + "epoch": 2.588851956021478, + "grad_norm": 96.76153538870264, + "learning_rate": 5.615512666625117e-07, + "loss": 2.3455, + "step": 30375 + }, + { + "epoch": 2.588937185715503, + "grad_norm": 22.285054930404204, + "learning_rate": 5.613229792813907e-07, + "loss": 0.797, + "step": 30376 + }, + { + "epoch": 2.589022415409529, + "grad_norm": 50.92417451854133, + "learning_rate": 5.61094735552894e-07, + "loss": 0.9042, + "step": 30377 + }, + { + "epoch": 2.589107645103554, + "grad_norm": 56.09231527178654, + "learning_rate": 5.608665354792675e-07, + "loss": 1.906, + "step": 30378 + }, + { + "epoch": 2.5891928747975794, + "grad_norm": 64.48004550417767, + "learning_rate": 5.606383790627545e-07, + "loss": 2.0055, + "step": 30379 + }, + { + "epoch": 2.589278104491605, + "grad_norm": 36.41058132599823, + "learning_rate": 5.604102663055994e-07, + "loss": 0.9861, + "step": 30380 + }, + { + "epoch": 2.5893633341856304, + "grad_norm": 25.773613408842365, + "learning_rate": 5.601821972100435e-07, + "loss": 0.6385, + "step": 30381 + }, + { + "epoch": 2.589448563879656, + "grad_norm": 81.00015465915955, + "learning_rate": 5.599541717783325e-07, + "loss": 2.2784, + "step": 30382 + }, + { + "epoch": 2.589533793573681, + "grad_norm": 70.84776466442085, + "learning_rate": 5.597261900127082e-07, + "loss": 1.7964, + "step": 30383 + }, + { + "epoch": 2.5896190232677063, + "grad_norm": 33.89511816794734, + "learning_rate": 5.594982519154124e-07, + "loss": 0.767, + "step": 30384 + }, + { + "epoch": 2.589704252961732, + "grad_norm": 37.41373230049424, + "learning_rate": 5.592703574886859e-07, + "loss": 1.4647, + "step": 30385 + }, + { + "epoch": 2.5897894826557573, + "grad_norm": 75.08088905274685, + "learning_rate": 5.590425067347721e-07, + "loss": 1.7888, + "step": 30386 + }, + { + "epoch": 2.5898747123497827, + "grad_norm": 56.468689367069075, + "learning_rate": 5.588146996559108e-07, + "loss": 1.6135, + "step": 30387 + }, + { + "epoch": 2.589959942043808, + "grad_norm": 51.56751080245551, + "learning_rate": 5.585869362543416e-07, + "loss": 1.0846, + "step": 30388 + }, + { + "epoch": 2.5900451717378337, + "grad_norm": 55.67597718506054, + "learning_rate": 5.583592165323043e-07, + "loss": 1.8448, + "step": 30389 + }, + { + "epoch": 2.5901304014318587, + "grad_norm": 45.68651856187235, + "learning_rate": 5.58131540492039e-07, + "loss": 1.1625, + "step": 30390 + }, + { + "epoch": 2.590215631125884, + "grad_norm": 63.54475449765185, + "learning_rate": 5.579039081357862e-07, + "loss": 1.6459, + "step": 30391 + }, + { + "epoch": 2.5903008608199096, + "grad_norm": 44.45637856655745, + "learning_rate": 5.57676319465783e-07, + "loss": 1.1781, + "step": 30392 + }, + { + "epoch": 2.590386090513935, + "grad_norm": 61.886586016852846, + "learning_rate": 5.574487744842683e-07, + "loss": 1.3087, + "step": 30393 + }, + { + "epoch": 2.5904713202079606, + "grad_norm": 75.56411842179094, + "learning_rate": 5.57221273193479e-07, + "loss": 1.6563, + "step": 30394 + }, + { + "epoch": 2.5905565499019856, + "grad_norm": 65.5080875785402, + "learning_rate": 5.569938155956534e-07, + "loss": 1.8686, + "step": 30395 + }, + { + "epoch": 2.5906417795960115, + "grad_norm": 61.324927114565455, + "learning_rate": 5.567664016930285e-07, + "loss": 0.8043, + "step": 30396 + }, + { + "epoch": 2.5907270092900365, + "grad_norm": 51.530754145219376, + "learning_rate": 5.565390314878394e-07, + "loss": 1.1117, + "step": 30397 + }, + { + "epoch": 2.590812238984062, + "grad_norm": 82.79214925333966, + "learning_rate": 5.56311704982323e-07, + "loss": 2.0641, + "step": 30398 + }, + { + "epoch": 2.5908974686780875, + "grad_norm": 69.38713499721652, + "learning_rate": 5.560844221787165e-07, + "loss": 1.3577, + "step": 30399 + }, + { + "epoch": 2.590982698372113, + "grad_norm": 38.94468115881642, + "learning_rate": 5.558571830792536e-07, + "loss": 1.1187, + "step": 30400 + }, + { + "epoch": 2.5910679280661384, + "grad_norm": 21.196195124799182, + "learning_rate": 5.556299876861698e-07, + "loss": 0.8129, + "step": 30401 + }, + { + "epoch": 2.5911531577601634, + "grad_norm": 35.724617737380875, + "learning_rate": 5.554028360016978e-07, + "loss": 0.6628, + "step": 30402 + }, + { + "epoch": 2.591238387454189, + "grad_norm": 20.30569486027959, + "learning_rate": 5.55175728028074e-07, + "loss": 0.4742, + "step": 30403 + }, + { + "epoch": 2.5913236171482144, + "grad_norm": 31.117939072831064, + "learning_rate": 5.549486637675305e-07, + "loss": 0.949, + "step": 30404 + }, + { + "epoch": 2.59140884684224, + "grad_norm": 63.810624023158724, + "learning_rate": 5.547216432222996e-07, + "loss": 1.5947, + "step": 30405 + }, + { + "epoch": 2.5914940765362653, + "grad_norm": 24.220743689563193, + "learning_rate": 5.54494666394616e-07, + "loss": 0.774, + "step": 30406 + }, + { + "epoch": 2.5915793062302908, + "grad_norm": 41.830742866758335, + "learning_rate": 5.542677332867103e-07, + "loss": 1.4579, + "step": 30407 + }, + { + "epoch": 2.5916645359243162, + "grad_norm": 40.324328448761605, + "learning_rate": 5.540408439008155e-07, + "loss": 0.7913, + "step": 30408 + }, + { + "epoch": 2.5917497656183413, + "grad_norm": 28.79391618782707, + "learning_rate": 5.538139982391621e-07, + "loss": 1.2834, + "step": 30409 + }, + { + "epoch": 2.5918349953123667, + "grad_norm": 49.58556400393553, + "learning_rate": 5.535871963039818e-07, + "loss": 1.1887, + "step": 30410 + }, + { + "epoch": 2.591920225006392, + "grad_norm": 45.669487949518526, + "learning_rate": 5.533604380975033e-07, + "loss": 1.4095, + "step": 30411 + }, + { + "epoch": 2.5920054547004177, + "grad_norm": 46.05100275576307, + "learning_rate": 5.531337236219591e-07, + "loss": 1.4668, + "step": 30412 + }, + { + "epoch": 2.592090684394443, + "grad_norm": 35.95009017979006, + "learning_rate": 5.529070528795766e-07, + "loss": 0.9858, + "step": 30413 + }, + { + "epoch": 2.592175914088468, + "grad_norm": 38.39954542803045, + "learning_rate": 5.52680425872587e-07, + "loss": 0.9608, + "step": 30414 + }, + { + "epoch": 2.592261143782494, + "grad_norm": 76.68174012001025, + "learning_rate": 5.524538426032172e-07, + "loss": 1.9521, + "step": 30415 + }, + { + "epoch": 2.592346373476519, + "grad_norm": 69.58295239920035, + "learning_rate": 5.52227303073698e-07, + "loss": 2.0351, + "step": 30416 + }, + { + "epoch": 2.5924316031705446, + "grad_norm": 48.361177737383784, + "learning_rate": 5.520008072862553e-07, + "loss": 0.9533, + "step": 30417 + }, + { + "epoch": 2.59251683286457, + "grad_norm": 46.775861437164245, + "learning_rate": 5.517743552431176e-07, + "loss": 1.0597, + "step": 30418 + }, + { + "epoch": 2.5926020625585955, + "grad_norm": 37.438093248562815, + "learning_rate": 5.515479469465107e-07, + "loss": 0.6915, + "step": 30419 + }, + { + "epoch": 2.592687292252621, + "grad_norm": 65.10565040615887, + "learning_rate": 5.513215823986629e-07, + "loss": 2.1697, + "step": 30420 + }, + { + "epoch": 2.592772521946646, + "grad_norm": 60.011652353612284, + "learning_rate": 5.510952616017983e-07, + "loss": 1.2187, + "step": 30421 + }, + { + "epoch": 2.5928577516406714, + "grad_norm": 51.99377117970592, + "learning_rate": 5.508689845581455e-07, + "loss": 1.0394, + "step": 30422 + }, + { + "epoch": 2.592942981334697, + "grad_norm": 61.40200478954115, + "learning_rate": 5.50642751269928e-07, + "loss": 1.3884, + "step": 30423 + }, + { + "epoch": 2.5930282110287224, + "grad_norm": 67.12984021575612, + "learning_rate": 5.504165617393703e-07, + "loss": 1.567, + "step": 30424 + }, + { + "epoch": 2.593113440722748, + "grad_norm": 23.321446568689044, + "learning_rate": 5.501904159686989e-07, + "loss": 0.9138, + "step": 30425 + }, + { + "epoch": 2.5931986704167733, + "grad_norm": 48.89816184172979, + "learning_rate": 5.499643139601357e-07, + "loss": 0.9149, + "step": 30426 + }, + { + "epoch": 2.593283900110799, + "grad_norm": 51.33022213583682, + "learning_rate": 5.497382557159059e-07, + "loss": 1.2793, + "step": 30427 + }, + { + "epoch": 2.593369129804824, + "grad_norm": 63.63132102960751, + "learning_rate": 5.495122412382309e-07, + "loss": 1.2611, + "step": 30428 + }, + { + "epoch": 2.5934543594988493, + "grad_norm": 48.26034894443627, + "learning_rate": 5.492862705293344e-07, + "loss": 1.2186, + "step": 30429 + }, + { + "epoch": 2.5935395891928748, + "grad_norm": 30.70680730567481, + "learning_rate": 5.490603435914399e-07, + "loss": 1.6067, + "step": 30430 + }, + { + "epoch": 2.5936248188869, + "grad_norm": 33.463742634766156, + "learning_rate": 5.488344604267681e-07, + "loss": 1.1615, + "step": 30431 + }, + { + "epoch": 2.5937100485809257, + "grad_norm": 36.55218477901046, + "learning_rate": 5.486086210375402e-07, + "loss": 1.2533, + "step": 30432 + }, + { + "epoch": 2.593795278274951, + "grad_norm": 51.671419417979045, + "learning_rate": 5.483828254259782e-07, + "loss": 1.565, + "step": 30433 + }, + { + "epoch": 2.5938805079689766, + "grad_norm": 32.18788758878744, + "learning_rate": 5.481570735943026e-07, + "loss": 1.0673, + "step": 30434 + }, + { + "epoch": 2.5939657376630016, + "grad_norm": 24.118376187978306, + "learning_rate": 5.479313655447326e-07, + "loss": 0.9117, + "step": 30435 + }, + { + "epoch": 2.594050967357027, + "grad_norm": 26.458575159797594, + "learning_rate": 5.477057012794878e-07, + "loss": 0.6678, + "step": 30436 + }, + { + "epoch": 2.5941361970510526, + "grad_norm": 38.76348677707103, + "learning_rate": 5.474800808007886e-07, + "loss": 1.32, + "step": 30437 + }, + { + "epoch": 2.594221426745078, + "grad_norm": 51.377083435992496, + "learning_rate": 5.472545041108546e-07, + "loss": 1.3101, + "step": 30438 + }, + { + "epoch": 2.5943066564391035, + "grad_norm": 38.188345725473205, + "learning_rate": 5.470289712119025e-07, + "loss": 0.8197, + "step": 30439 + }, + { + "epoch": 2.5943918861331285, + "grad_norm": 39.49560896214135, + "learning_rate": 5.468034821061519e-07, + "loss": 1.3289, + "step": 30440 + }, + { + "epoch": 2.5944771158271545, + "grad_norm": 25.868470226241318, + "learning_rate": 5.465780367958179e-07, + "loss": 0.5623, + "step": 30441 + }, + { + "epoch": 2.5945623455211795, + "grad_norm": 57.792485090782094, + "learning_rate": 5.463526352831205e-07, + "loss": 1.4595, + "step": 30442 + }, + { + "epoch": 2.594647575215205, + "grad_norm": 44.84854278421338, + "learning_rate": 5.461272775702753e-07, + "loss": 1.0473, + "step": 30443 + }, + { + "epoch": 2.5947328049092304, + "grad_norm": 78.72825135297336, + "learning_rate": 5.45901963659497e-07, + "loss": 1.8621, + "step": 30444 + }, + { + "epoch": 2.594818034603256, + "grad_norm": 62.022323290647904, + "learning_rate": 5.456766935530039e-07, + "loss": 2.0809, + "step": 30445 + }, + { + "epoch": 2.5949032642972814, + "grad_norm": 39.576106127096075, + "learning_rate": 5.454514672530109e-07, + "loss": 1.0685, + "step": 30446 + }, + { + "epoch": 2.5949884939913064, + "grad_norm": 22.87069769042908, + "learning_rate": 5.452262847617329e-07, + "loss": 0.9006, + "step": 30447 + }, + { + "epoch": 2.595073723685332, + "grad_norm": 42.068791224181226, + "learning_rate": 5.450011460813842e-07, + "loss": 0.9607, + "step": 30448 + }, + { + "epoch": 2.5951589533793573, + "grad_norm": 70.12720561237168, + "learning_rate": 5.447760512141786e-07, + "loss": 1.8197, + "step": 30449 + }, + { + "epoch": 2.595244183073383, + "grad_norm": 76.29524038544153, + "learning_rate": 5.4455100016233e-07, + "loss": 2.1385, + "step": 30450 + }, + { + "epoch": 2.5953294127674083, + "grad_norm": 65.6738525224151, + "learning_rate": 5.443259929280526e-07, + "loss": 1.2627, + "step": 30451 + }, + { + "epoch": 2.5954146424614337, + "grad_norm": 58.9763198613793, + "learning_rate": 5.441010295135569e-07, + "loss": 1.5177, + "step": 30452 + }, + { + "epoch": 2.595499872155459, + "grad_norm": 97.69822502950366, + "learning_rate": 5.438761099210588e-07, + "loss": 2.2312, + "step": 30453 + }, + { + "epoch": 2.595585101849484, + "grad_norm": 79.07660619274749, + "learning_rate": 5.436512341527673e-07, + "loss": 1.7091, + "step": 30454 + }, + { + "epoch": 2.5956703315435097, + "grad_norm": 62.84266620115063, + "learning_rate": 5.43426402210896e-07, + "loss": 1.9527, + "step": 30455 + }, + { + "epoch": 2.595755561237535, + "grad_norm": 40.06372564506923, + "learning_rate": 5.432016140976548e-07, + "loss": 0.8934, + "step": 30456 + }, + { + "epoch": 2.5958407909315606, + "grad_norm": 46.42918770576515, + "learning_rate": 5.429768698152554e-07, + "loss": 1.1034, + "step": 30457 + }, + { + "epoch": 2.595926020625586, + "grad_norm": 25.540037766646872, + "learning_rate": 5.42752169365906e-07, + "loss": 0.9437, + "step": 30458 + }, + { + "epoch": 2.596011250319611, + "grad_norm": 43.39967645341038, + "learning_rate": 5.425275127518187e-07, + "loss": 0.9337, + "step": 30459 + }, + { + "epoch": 2.596096480013637, + "grad_norm": 72.5597055949773, + "learning_rate": 5.423028999752016e-07, + "loss": 1.6457, + "step": 30460 + }, + { + "epoch": 2.596181709707662, + "grad_norm": 21.04880030965187, + "learning_rate": 5.42078331038265e-07, + "loss": 0.6342, + "step": 30461 + }, + { + "epoch": 2.5962669394016875, + "grad_norm": 69.63392572968587, + "learning_rate": 5.418538059432154e-07, + "loss": 1.8693, + "step": 30462 + }, + { + "epoch": 2.596352169095713, + "grad_norm": 93.6357833213693, + "learning_rate": 5.416293246922638e-07, + "loss": 3.0967, + "step": 30463 + }, + { + "epoch": 2.5964373987897384, + "grad_norm": 37.631027553197335, + "learning_rate": 5.414048872876154e-07, + "loss": 0.9097, + "step": 30464 + }, + { + "epoch": 2.596522628483764, + "grad_norm": 44.725878108540186, + "learning_rate": 5.411804937314786e-07, + "loss": 0.9101, + "step": 30465 + }, + { + "epoch": 2.596607858177789, + "grad_norm": 54.76955334400805, + "learning_rate": 5.409561440260596e-07, + "loss": 1.5971, + "step": 30466 + }, + { + "epoch": 2.5966930878718144, + "grad_norm": 66.43216644332873, + "learning_rate": 5.407318381735649e-07, + "loss": 1.5588, + "step": 30467 + }, + { + "epoch": 2.59677831756584, + "grad_norm": 25.680576864750186, + "learning_rate": 5.405075761761997e-07, + "loss": 0.8918, + "step": 30468 + }, + { + "epoch": 2.5968635472598653, + "grad_norm": 51.39094670730744, + "learning_rate": 5.402833580361721e-07, + "loss": 1.4405, + "step": 30469 + }, + { + "epoch": 2.596948776953891, + "grad_norm": 58.72252566563746, + "learning_rate": 5.400591837556851e-07, + "loss": 1.2898, + "step": 30470 + }, + { + "epoch": 2.5970340066479163, + "grad_norm": 24.31021158691443, + "learning_rate": 5.398350533369428e-07, + "loss": 0.9637, + "step": 30471 + }, + { + "epoch": 2.5971192363419418, + "grad_norm": 37.64712009387632, + "learning_rate": 5.39610966782152e-07, + "loss": 1.1585, + "step": 30472 + }, + { + "epoch": 2.5972044660359668, + "grad_norm": 55.149463931742005, + "learning_rate": 5.393869240935145e-07, + "loss": 1.4429, + "step": 30473 + }, + { + "epoch": 2.5972896957299922, + "grad_norm": 48.12731455532339, + "learning_rate": 5.391629252732344e-07, + "loss": 1.5411, + "step": 30474 + }, + { + "epoch": 2.5973749254240177, + "grad_norm": 27.36508892948259, + "learning_rate": 5.389389703235132e-07, + "loss": 1.2113, + "step": 30475 + }, + { + "epoch": 2.597460155118043, + "grad_norm": 31.65637600733024, + "learning_rate": 5.387150592465546e-07, + "loss": 1.1056, + "step": 30476 + }, + { + "epoch": 2.5975453848120686, + "grad_norm": 33.333381621042804, + "learning_rate": 5.384911920445618e-07, + "loss": 1.0666, + "step": 30477 + }, + { + "epoch": 2.5976306145060937, + "grad_norm": 27.791710023343462, + "learning_rate": 5.382673687197348e-07, + "loss": 0.884, + "step": 30478 + }, + { + "epoch": 2.5977158442001196, + "grad_norm": 53.29389734529023, + "learning_rate": 5.380435892742758e-07, + "loss": 1.8875, + "step": 30479 + }, + { + "epoch": 2.5978010738941446, + "grad_norm": 54.69922990392516, + "learning_rate": 5.378198537103841e-07, + "loss": 1.5121, + "step": 30480 + }, + { + "epoch": 2.59788630358817, + "grad_norm": 59.40647613700335, + "learning_rate": 5.375961620302616e-07, + "loss": 1.1904, + "step": 30481 + }, + { + "epoch": 2.5979715332821955, + "grad_norm": 72.72058301594029, + "learning_rate": 5.373725142361081e-07, + "loss": 1.4561, + "step": 30482 + }, + { + "epoch": 2.598056762976221, + "grad_norm": 59.127316623904406, + "learning_rate": 5.371489103301214e-07, + "loss": 1.5125, + "step": 30483 + }, + { + "epoch": 2.5981419926702465, + "grad_norm": 36.217252008836056, + "learning_rate": 5.369253503145017e-07, + "loss": 1.1476, + "step": 30484 + }, + { + "epoch": 2.5982272223642715, + "grad_norm": 62.45242085859996, + "learning_rate": 5.367018341914493e-07, + "loss": 1.2362, + "step": 30485 + }, + { + "epoch": 2.598312452058297, + "grad_norm": 37.18361435047709, + "learning_rate": 5.364783619631602e-07, + "loss": 1.5458, + "step": 30486 + }, + { + "epoch": 2.5983976817523224, + "grad_norm": 52.76956272091122, + "learning_rate": 5.362549336318328e-07, + "loss": 1.213, + "step": 30487 + }, + { + "epoch": 2.598482911446348, + "grad_norm": 62.203654101080254, + "learning_rate": 5.360315491996637e-07, + "loss": 1.4873, + "step": 30488 + }, + { + "epoch": 2.5985681411403734, + "grad_norm": 49.86182859448362, + "learning_rate": 5.358082086688516e-07, + "loss": 1.4189, + "step": 30489 + }, + { + "epoch": 2.598653370834399, + "grad_norm": 38.5794634114446, + "learning_rate": 5.355849120415918e-07, + "loss": 0.8548, + "step": 30490 + }, + { + "epoch": 2.5987386005284243, + "grad_norm": 62.49693177868624, + "learning_rate": 5.353616593200795e-07, + "loss": 1.7929, + "step": 30491 + }, + { + "epoch": 2.5988238302224493, + "grad_norm": 42.37126594494519, + "learning_rate": 5.35138450506511e-07, + "loss": 1.089, + "step": 30492 + }, + { + "epoch": 2.598909059916475, + "grad_norm": 27.13389588626318, + "learning_rate": 5.349152856030826e-07, + "loss": 1.1636, + "step": 30493 + }, + { + "epoch": 2.5989942896105003, + "grad_norm": 18.522147597848235, + "learning_rate": 5.346921646119885e-07, + "loss": 0.8648, + "step": 30494 + }, + { + "epoch": 2.5990795193045257, + "grad_norm": 49.87085374198228, + "learning_rate": 5.344690875354219e-07, + "loss": 1.7807, + "step": 30495 + }, + { + "epoch": 2.599164748998551, + "grad_norm": 28.757031337067918, + "learning_rate": 5.342460543755779e-07, + "loss": 0.7485, + "step": 30496 + }, + { + "epoch": 2.5992499786925762, + "grad_norm": 57.68760269632971, + "learning_rate": 5.340230651346484e-07, + "loss": 1.4792, + "step": 30497 + }, + { + "epoch": 2.599335208386602, + "grad_norm": 30.368092724536538, + "learning_rate": 5.338001198148285e-07, + "loss": 0.8342, + "step": 30498 + }, + { + "epoch": 2.599420438080627, + "grad_norm": 27.99787253705015, + "learning_rate": 5.335772184183086e-07, + "loss": 0.8062, + "step": 30499 + }, + { + "epoch": 2.5995056677746526, + "grad_norm": 58.903944578169046, + "learning_rate": 5.33354360947283e-07, + "loss": 1.3578, + "step": 30500 + }, + { + "epoch": 2.599590897468678, + "grad_norm": 55.76020266161882, + "learning_rate": 5.331315474039417e-07, + "loss": 2.0718, + "step": 30501 + }, + { + "epoch": 2.5996761271627036, + "grad_norm": 57.339531517582074, + "learning_rate": 5.329087777904774e-07, + "loss": 1.3964, + "step": 30502 + }, + { + "epoch": 2.599761356856729, + "grad_norm": 68.48406068158988, + "learning_rate": 5.326860521090799e-07, + "loss": 2.1015, + "step": 30503 + }, + { + "epoch": 2.599846586550754, + "grad_norm": 37.697704565670215, + "learning_rate": 5.324633703619403e-07, + "loss": 1.175, + "step": 30504 + }, + { + "epoch": 2.5999318162447795, + "grad_norm": 45.4630869121509, + "learning_rate": 5.322407325512475e-07, + "loss": 0.988, + "step": 30505 + }, + { + "epoch": 2.600017045938805, + "grad_norm": 40.65333066345826, + "learning_rate": 5.320181386791923e-07, + "loss": 1.0389, + "step": 30506 + }, + { + "epoch": 2.6001022756328305, + "grad_norm": 59.02072794983177, + "learning_rate": 5.317955887479625e-07, + "loss": 1.305, + "step": 30507 + }, + { + "epoch": 2.600187505326856, + "grad_norm": 26.936993465862678, + "learning_rate": 5.315730827597487e-07, + "loss": 1.1597, + "step": 30508 + }, + { + "epoch": 2.6002727350208814, + "grad_norm": 35.00652576260754, + "learning_rate": 5.313506207167379e-07, + "loss": 0.9815, + "step": 30509 + }, + { + "epoch": 2.600357964714907, + "grad_norm": 62.300881828208695, + "learning_rate": 5.311282026211168e-07, + "loss": 1.8746, + "step": 30510 + }, + { + "epoch": 2.600443194408932, + "grad_norm": 59.31645072900742, + "learning_rate": 5.309058284750756e-07, + "loss": 1.7723, + "step": 30511 + }, + { + "epoch": 2.6005284241029574, + "grad_norm": 46.267753675187485, + "learning_rate": 5.306834982807996e-07, + "loss": 1.21, + "step": 30512 + }, + { + "epoch": 2.600613653796983, + "grad_norm": 48.968496266122024, + "learning_rate": 5.304612120404751e-07, + "loss": 1.6463, + "step": 30513 + }, + { + "epoch": 2.6006988834910083, + "grad_norm": 26.42281749461835, + "learning_rate": 5.302389697562876e-07, + "loss": 0.7106, + "step": 30514 + }, + { + "epoch": 2.6007841131850338, + "grad_norm": 96.60474789268564, + "learning_rate": 5.300167714304244e-07, + "loss": 1.5435, + "step": 30515 + }, + { + "epoch": 2.600869342879059, + "grad_norm": 56.154686989352996, + "learning_rate": 5.297946170650703e-07, + "loss": 1.7579, + "step": 30516 + }, + { + "epoch": 2.6009545725730847, + "grad_norm": 38.837454672177905, + "learning_rate": 5.295725066624102e-07, + "loss": 1.3417, + "step": 30517 + }, + { + "epoch": 2.6010398022671097, + "grad_norm": 38.075891620281155, + "learning_rate": 5.293504402246269e-07, + "loss": 1.2272, + "step": 30518 + }, + { + "epoch": 2.601125031961135, + "grad_norm": 49.51163795224863, + "learning_rate": 5.291284177539069e-07, + "loss": 1.7031, + "step": 30519 + }, + { + "epoch": 2.6012102616551607, + "grad_norm": 29.534405370861418, + "learning_rate": 5.289064392524323e-07, + "loss": 1.6078, + "step": 30520 + }, + { + "epoch": 2.601295491349186, + "grad_norm": 72.30225556521056, + "learning_rate": 5.286845047223859e-07, + "loss": 2.0655, + "step": 30521 + }, + { + "epoch": 2.6013807210432116, + "grad_norm": 52.34970012719157, + "learning_rate": 5.284626141659499e-07, + "loss": 1.5077, + "step": 30522 + }, + { + "epoch": 2.6014659507372366, + "grad_norm": 46.15996400520422, + "learning_rate": 5.282407675853074e-07, + "loss": 1.0346, + "step": 30523 + }, + { + "epoch": 2.601551180431262, + "grad_norm": 20.153288382950528, + "learning_rate": 5.280189649826412e-07, + "loss": 0.9162, + "step": 30524 + }, + { + "epoch": 2.6016364101252876, + "grad_norm": 28.201627791335643, + "learning_rate": 5.277972063601311e-07, + "loss": 0.8863, + "step": 30525 + }, + { + "epoch": 2.601721639819313, + "grad_norm": 44.65225522302702, + "learning_rate": 5.275754917199583e-07, + "loss": 0.9141, + "step": 30526 + }, + { + "epoch": 2.6018068695133385, + "grad_norm": 29.420458818113218, + "learning_rate": 5.273538210643025e-07, + "loss": 1.0814, + "step": 30527 + }, + { + "epoch": 2.601892099207364, + "grad_norm": 42.29945776567184, + "learning_rate": 5.271321943953462e-07, + "loss": 1.2419, + "step": 30528 + }, + { + "epoch": 2.6019773289013894, + "grad_norm": 22.322034591427826, + "learning_rate": 5.269106117152667e-07, + "loss": 0.6624, + "step": 30529 + }, + { + "epoch": 2.6020625585954145, + "grad_norm": 41.36093088712815, + "learning_rate": 5.26689073026243e-07, + "loss": 1.2303, + "step": 30530 + }, + { + "epoch": 2.60214778828944, + "grad_norm": 67.90786504422773, + "learning_rate": 5.264675783304552e-07, + "loss": 1.6494, + "step": 30531 + }, + { + "epoch": 2.6022330179834654, + "grad_norm": 49.57626627705878, + "learning_rate": 5.262461276300817e-07, + "loss": 1.7747, + "step": 30532 + }, + { + "epoch": 2.602318247677491, + "grad_norm": 38.412975885326205, + "learning_rate": 5.260247209272995e-07, + "loss": 1.3812, + "step": 30533 + }, + { + "epoch": 2.6024034773715163, + "grad_norm": 26.02470116773061, + "learning_rate": 5.258033582242871e-07, + "loss": 0.7416, + "step": 30534 + }, + { + "epoch": 2.6024887070655414, + "grad_norm": 26.160111418123183, + "learning_rate": 5.255820395232197e-07, + "loss": 1.1592, + "step": 30535 + }, + { + "epoch": 2.6025739367595673, + "grad_norm": 35.55001805113691, + "learning_rate": 5.253607648262759e-07, + "loss": 1.3766, + "step": 30536 + }, + { + "epoch": 2.6026591664535923, + "grad_norm": 31.889033377626095, + "learning_rate": 5.251395341356308e-07, + "loss": 1.1845, + "step": 30537 + }, + { + "epoch": 2.6027443961476178, + "grad_norm": 37.18085065499977, + "learning_rate": 5.249183474534592e-07, + "loss": 0.8166, + "step": 30538 + }, + { + "epoch": 2.6028296258416432, + "grad_norm": 38.686445949042415, + "learning_rate": 5.246972047819387e-07, + "loss": 1.1719, + "step": 30539 + }, + { + "epoch": 2.6029148555356687, + "grad_norm": 25.450100847008965, + "learning_rate": 5.244761061232418e-07, + "loss": 1.0005, + "step": 30540 + }, + { + "epoch": 2.603000085229694, + "grad_norm": 42.45077975988602, + "learning_rate": 5.242550514795447e-07, + "loss": 1.0592, + "step": 30541 + }, + { + "epoch": 2.603085314923719, + "grad_norm": 46.6159410292539, + "learning_rate": 5.240340408530209e-07, + "loss": 0.9948, + "step": 30542 + }, + { + "epoch": 2.6031705446177447, + "grad_norm": 26.427303733097016, + "learning_rate": 5.238130742458436e-07, + "loss": 0.8079, + "step": 30543 + }, + { + "epoch": 2.60325577431177, + "grad_norm": 58.426951243567586, + "learning_rate": 5.235921516601855e-07, + "loss": 1.5349, + "step": 30544 + }, + { + "epoch": 2.6033410040057956, + "grad_norm": 41.38064998361569, + "learning_rate": 5.233712730982205e-07, + "loss": 1.483, + "step": 30545 + }, + { + "epoch": 2.603426233699821, + "grad_norm": 25.843524587821637, + "learning_rate": 5.231504385621189e-07, + "loss": 0.832, + "step": 30546 + }, + { + "epoch": 2.6035114633938465, + "grad_norm": 35.80428778320535, + "learning_rate": 5.229296480540552e-07, + "loss": 0.804, + "step": 30547 + }, + { + "epoch": 2.603596693087872, + "grad_norm": 53.00119570372949, + "learning_rate": 5.227089015761983e-07, + "loss": 1.1491, + "step": 30548 + }, + { + "epoch": 2.603681922781897, + "grad_norm": 39.1231790071437, + "learning_rate": 5.224881991307213e-07, + "loss": 1.1597, + "step": 30549 + }, + { + "epoch": 2.6037671524759225, + "grad_norm": 58.19565009133293, + "learning_rate": 5.222675407197936e-07, + "loss": 1.7414, + "step": 30550 + }, + { + "epoch": 2.603852382169948, + "grad_norm": 74.51452902410904, + "learning_rate": 5.220469263455857e-07, + "loss": 1.7844, + "step": 30551 + }, + { + "epoch": 2.6039376118639734, + "grad_norm": 40.80709069863834, + "learning_rate": 5.218263560102666e-07, + "loss": 1.6457, + "step": 30552 + }, + { + "epoch": 2.604022841557999, + "grad_norm": 44.18389800581212, + "learning_rate": 5.216058297160048e-07, + "loss": 1.4438, + "step": 30553 + }, + { + "epoch": 2.6041080712520244, + "grad_norm": 66.61469934003964, + "learning_rate": 5.213853474649699e-07, + "loss": 1.9016, + "step": 30554 + }, + { + "epoch": 2.60419330094605, + "grad_norm": 50.89931770524762, + "learning_rate": 5.211649092593318e-07, + "loss": 1.1426, + "step": 30555 + }, + { + "epoch": 2.604278530640075, + "grad_norm": 20.144741075287243, + "learning_rate": 5.209445151012571e-07, + "loss": 0.6305, + "step": 30556 + }, + { + "epoch": 2.6043637603341003, + "grad_norm": 38.805036282946645, + "learning_rate": 5.207241649929124e-07, + "loss": 1.0445, + "step": 30557 + }, + { + "epoch": 2.604448990028126, + "grad_norm": 77.8490914064821, + "learning_rate": 5.205038589364663e-07, + "loss": 2.0831, + "step": 30558 + }, + { + "epoch": 2.6045342197221513, + "grad_norm": 65.54435319090908, + "learning_rate": 5.202835969340853e-07, + "loss": 1.7602, + "step": 30559 + }, + { + "epoch": 2.6046194494161767, + "grad_norm": 56.550904375556755, + "learning_rate": 5.20063378987935e-07, + "loss": 1.834, + "step": 30560 + }, + { + "epoch": 2.6047046791102018, + "grad_norm": 50.40613268612798, + "learning_rate": 5.198432051001794e-07, + "loss": 1.2697, + "step": 30561 + }, + { + "epoch": 2.6047899088042272, + "grad_norm": 27.47987221981588, + "learning_rate": 5.196230752729864e-07, + "loss": 0.8724, + "step": 30562 + }, + { + "epoch": 2.6048751384982527, + "grad_norm": 37.072831861010705, + "learning_rate": 5.194029895085212e-07, + "loss": 0.8733, + "step": 30563 + }, + { + "epoch": 2.604960368192278, + "grad_norm": 34.886374233801355, + "learning_rate": 5.191829478089467e-07, + "loss": 0.8729, + "step": 30564 + }, + { + "epoch": 2.6050455978863036, + "grad_norm": 62.31954747213601, + "learning_rate": 5.189629501764265e-07, + "loss": 1.6408, + "step": 30565 + }, + { + "epoch": 2.605130827580329, + "grad_norm": 72.60448325505024, + "learning_rate": 5.187429966131263e-07, + "loss": 1.903, + "step": 30566 + }, + { + "epoch": 2.6052160572743546, + "grad_norm": 42.387685489116926, + "learning_rate": 5.185230871212083e-07, + "loss": 1.0796, + "step": 30567 + }, + { + "epoch": 2.6053012869683796, + "grad_norm": 58.136039430749854, + "learning_rate": 5.183032217028344e-07, + "loss": 1.7212, + "step": 30568 + }, + { + "epoch": 2.605386516662405, + "grad_norm": 43.19614710481898, + "learning_rate": 5.180834003601664e-07, + "loss": 1.2231, + "step": 30569 + }, + { + "epoch": 2.6054717463564305, + "grad_norm": 59.97023943941284, + "learning_rate": 5.178636230953677e-07, + "loss": 1.8514, + "step": 30570 + }, + { + "epoch": 2.605556976050456, + "grad_norm": 52.474686856602304, + "learning_rate": 5.176438899106001e-07, + "loss": 1.7213, + "step": 30571 + }, + { + "epoch": 2.6056422057444815, + "grad_norm": 48.91110018139163, + "learning_rate": 5.174242008080238e-07, + "loss": 0.8395, + "step": 30572 + }, + { + "epoch": 2.605727435438507, + "grad_norm": 47.34999862147725, + "learning_rate": 5.172045557897993e-07, + "loss": 1.434, + "step": 30573 + }, + { + "epoch": 2.6058126651325324, + "grad_norm": 46.27949543662018, + "learning_rate": 5.169849548580852e-07, + "loss": 1.5518, + "step": 30574 + }, + { + "epoch": 2.6058978948265574, + "grad_norm": 67.8232515740082, + "learning_rate": 5.167653980150444e-07, + "loss": 2.2568, + "step": 30575 + }, + { + "epoch": 2.605983124520583, + "grad_norm": 56.12720881911411, + "learning_rate": 5.165458852628341e-07, + "loss": 1.43, + "step": 30576 + }, + { + "epoch": 2.6060683542146084, + "grad_norm": 35.07032844188519, + "learning_rate": 5.163264166036124e-07, + "loss": 1.1473, + "step": 30577 + }, + { + "epoch": 2.606153583908634, + "grad_norm": 59.49014509676851, + "learning_rate": 5.161069920395389e-07, + "loss": 1.6313, + "step": 30578 + }, + { + "epoch": 2.6062388136026593, + "grad_norm": 26.584955711279914, + "learning_rate": 5.158876115727723e-07, + "loss": 1.0751, + "step": 30579 + }, + { + "epoch": 2.6063240432966843, + "grad_norm": 58.62383108612437, + "learning_rate": 5.156682752054692e-07, + "loss": 1.7996, + "step": 30580 + }, + { + "epoch": 2.6064092729907102, + "grad_norm": 43.494277327281004, + "learning_rate": 5.154489829397869e-07, + "loss": 1.2804, + "step": 30581 + }, + { + "epoch": 2.6064945026847353, + "grad_norm": 61.51213895038925, + "learning_rate": 5.152297347778817e-07, + "loss": 1.4215, + "step": 30582 + }, + { + "epoch": 2.6065797323787607, + "grad_norm": 41.786352290393125, + "learning_rate": 5.150105307219095e-07, + "loss": 1.3514, + "step": 30583 + }, + { + "epoch": 2.606664962072786, + "grad_norm": 34.19818605936104, + "learning_rate": 5.147913707740265e-07, + "loss": 1.1291, + "step": 30584 + }, + { + "epoch": 2.6067501917668117, + "grad_norm": 51.29445650346069, + "learning_rate": 5.14572254936388e-07, + "loss": 1.453, + "step": 30585 + }, + { + "epoch": 2.606835421460837, + "grad_norm": 76.56274160423206, + "learning_rate": 5.143531832111498e-07, + "loss": 1.8338, + "step": 30586 + }, + { + "epoch": 2.606920651154862, + "grad_norm": 27.254231452095148, + "learning_rate": 5.141341556004647e-07, + "loss": 0.9746, + "step": 30587 + }, + { + "epoch": 2.6070058808488876, + "grad_norm": 43.57555932168067, + "learning_rate": 5.139151721064883e-07, + "loss": 1.5497, + "step": 30588 + }, + { + "epoch": 2.607091110542913, + "grad_norm": 46.57062407905934, + "learning_rate": 5.136962327313738e-07, + "loss": 1.6929, + "step": 30589 + }, + { + "epoch": 2.6071763402369386, + "grad_norm": 36.46098845734088, + "learning_rate": 5.134773374772745e-07, + "loss": 1.1063, + "step": 30590 + }, + { + "epoch": 2.607261569930964, + "grad_norm": 71.36356152702461, + "learning_rate": 5.132584863463414e-07, + "loss": 1.9126, + "step": 30591 + }, + { + "epoch": 2.6073467996249895, + "grad_norm": 58.82453572837495, + "learning_rate": 5.130396793407294e-07, + "loss": 1.8234, + "step": 30592 + }, + { + "epoch": 2.607432029319015, + "grad_norm": 52.87820661453049, + "learning_rate": 5.128209164625886e-07, + "loss": 1.684, + "step": 30593 + }, + { + "epoch": 2.60751725901304, + "grad_norm": 54.46552567644164, + "learning_rate": 5.126021977140716e-07, + "loss": 1.2648, + "step": 30594 + }, + { + "epoch": 2.6076024887070655, + "grad_norm": 74.50514059648971, + "learning_rate": 5.123835230973279e-07, + "loss": 1.8214, + "step": 30595 + }, + { + "epoch": 2.607687718401091, + "grad_norm": 54.43251382968826, + "learning_rate": 5.121648926145101e-07, + "loss": 1.6601, + "step": 30596 + }, + { + "epoch": 2.6077729480951164, + "grad_norm": 15.881321091906807, + "learning_rate": 5.119463062677671e-07, + "loss": 0.7993, + "step": 30597 + }, + { + "epoch": 2.607858177789142, + "grad_norm": 35.896642920873916, + "learning_rate": 5.117277640592488e-07, + "loss": 1.233, + "step": 30598 + }, + { + "epoch": 2.607943407483167, + "grad_norm": 34.58754124958598, + "learning_rate": 5.115092659911047e-07, + "loss": 1.022, + "step": 30599 + }, + { + "epoch": 2.608028637177193, + "grad_norm": 59.61406261418149, + "learning_rate": 5.112908120654825e-07, + "loss": 1.5176, + "step": 30600 + }, + { + "epoch": 2.608113866871218, + "grad_norm": 42.54402932574611, + "learning_rate": 5.110724022845315e-07, + "loss": 1.9245, + "step": 30601 + }, + { + "epoch": 2.6081990965652433, + "grad_norm": 52.498425654861784, + "learning_rate": 5.108540366504e-07, + "loss": 1.4153, + "step": 30602 + }, + { + "epoch": 2.6082843262592688, + "grad_norm": 43.38391604840422, + "learning_rate": 5.106357151652358e-07, + "loss": 1.5901, + "step": 30603 + }, + { + "epoch": 2.6083695559532942, + "grad_norm": 41.9200129053767, + "learning_rate": 5.104174378311849e-07, + "loss": 0.8163, + "step": 30604 + }, + { + "epoch": 2.6084547856473197, + "grad_norm": 94.6077235678065, + "learning_rate": 5.101992046503951e-07, + "loss": 2.5245, + "step": 30605 + }, + { + "epoch": 2.6085400153413447, + "grad_norm": 43.71450152678511, + "learning_rate": 5.099810156250118e-07, + "loss": 1.1965, + "step": 30606 + }, + { + "epoch": 2.60862524503537, + "grad_norm": 64.22380334869557, + "learning_rate": 5.097628707571811e-07, + "loss": 1.4344, + "step": 30607 + }, + { + "epoch": 2.6087104747293957, + "grad_norm": 37.58467057195228, + "learning_rate": 5.095447700490475e-07, + "loss": 1.0263, + "step": 30608 + }, + { + "epoch": 2.608795704423421, + "grad_norm": 43.41657767246388, + "learning_rate": 5.093267135027569e-07, + "loss": 1.0097, + "step": 30609 + }, + { + "epoch": 2.6088809341174466, + "grad_norm": 44.22210891819372, + "learning_rate": 5.091087011204543e-07, + "loss": 1.4502, + "step": 30610 + }, + { + "epoch": 2.608966163811472, + "grad_norm": 51.2615037686543, + "learning_rate": 5.088907329042835e-07, + "loss": 1.0194, + "step": 30611 + }, + { + "epoch": 2.6090513935054975, + "grad_norm": 52.49365448155847, + "learning_rate": 5.086728088563875e-07, + "loss": 1.5423, + "step": 30612 + }, + { + "epoch": 2.6091366231995226, + "grad_norm": 77.74917454832523, + "learning_rate": 5.084549289789087e-07, + "loss": 1.6565, + "step": 30613 + }, + { + "epoch": 2.609221852893548, + "grad_norm": 67.28882613387671, + "learning_rate": 5.082370932739922e-07, + "loss": 1.8384, + "step": 30614 + }, + { + "epoch": 2.6093070825875735, + "grad_norm": 34.75595902403297, + "learning_rate": 5.08019301743779e-07, + "loss": 0.8134, + "step": 30615 + }, + { + "epoch": 2.609392312281599, + "grad_norm": 35.42942619531523, + "learning_rate": 5.078015543904097e-07, + "loss": 1.4873, + "step": 30616 + }, + { + "epoch": 2.6094775419756244, + "grad_norm": 51.75885441631453, + "learning_rate": 5.075838512160275e-07, + "loss": 1.3372, + "step": 30617 + }, + { + "epoch": 2.6095627716696495, + "grad_norm": 67.6169270366518, + "learning_rate": 5.073661922227735e-07, + "loss": 1.5424, + "step": 30618 + }, + { + "epoch": 2.6096480013636754, + "grad_norm": 59.527194560548494, + "learning_rate": 5.071485774127882e-07, + "loss": 1.7809, + "step": 30619 + }, + { + "epoch": 2.6097332310577004, + "grad_norm": 42.66656216860826, + "learning_rate": 5.069310067882111e-07, + "loss": 0.9261, + "step": 30620 + }, + { + "epoch": 2.609818460751726, + "grad_norm": 92.61093241463503, + "learning_rate": 5.067134803511814e-07, + "loss": 1.3288, + "step": 30621 + }, + { + "epoch": 2.6099036904457513, + "grad_norm": 37.16792443198948, + "learning_rate": 5.064959981038403e-07, + "loss": 1.1359, + "step": 30622 + }, + { + "epoch": 2.609988920139777, + "grad_norm": 45.37269900255953, + "learning_rate": 5.06278560048325e-07, + "loss": 1.0967, + "step": 30623 + }, + { + "epoch": 2.6100741498338023, + "grad_norm": 29.583776208586894, + "learning_rate": 5.060611661867737e-07, + "loss": 1.0566, + "step": 30624 + }, + { + "epoch": 2.6101593795278273, + "grad_norm": 49.53177651040271, + "learning_rate": 5.058438165213264e-07, + "loss": 1.7275, + "step": 30625 + }, + { + "epoch": 2.6102446092218528, + "grad_norm": 39.18462450645816, + "learning_rate": 5.056265110541181e-07, + "loss": 1.0161, + "step": 30626 + }, + { + "epoch": 2.610329838915878, + "grad_norm": 57.10858653946028, + "learning_rate": 5.054092497872882e-07, + "loss": 1.6086, + "step": 30627 + }, + { + "epoch": 2.6104150686099037, + "grad_norm": 39.72921164741306, + "learning_rate": 5.051920327229725e-07, + "loss": 1.2817, + "step": 30628 + }, + { + "epoch": 2.610500298303929, + "grad_norm": 57.964631853053916, + "learning_rate": 5.04974859863307e-07, + "loss": 1.3189, + "step": 30629 + }, + { + "epoch": 2.6105855279979546, + "grad_norm": 26.91325954340838, + "learning_rate": 5.047577312104263e-07, + "loss": 0.7513, + "step": 30630 + }, + { + "epoch": 2.61067075769198, + "grad_norm": 58.039434648435204, + "learning_rate": 5.045406467664682e-07, + "loss": 1.6991, + "step": 30631 + }, + { + "epoch": 2.610755987386005, + "grad_norm": 109.77981835303441, + "learning_rate": 5.04323606533566e-07, + "loss": 2.1673, + "step": 30632 + }, + { + "epoch": 2.6108412170800306, + "grad_norm": 76.7694700923486, + "learning_rate": 5.041066105138553e-07, + "loss": 2.055, + "step": 30633 + }, + { + "epoch": 2.610926446774056, + "grad_norm": 82.04138728883326, + "learning_rate": 5.038896587094688e-07, + "loss": 2.081, + "step": 30634 + }, + { + "epoch": 2.6110116764680815, + "grad_norm": 42.42931187047192, + "learning_rate": 5.036727511225414e-07, + "loss": 1.4866, + "step": 30635 + }, + { + "epoch": 2.611096906162107, + "grad_norm": 36.000784378005264, + "learning_rate": 5.034558877552065e-07, + "loss": 1.25, + "step": 30636 + }, + { + "epoch": 2.611182135856132, + "grad_norm": 113.85664095629716, + "learning_rate": 5.032390686095956e-07, + "loss": 2.4315, + "step": 30637 + }, + { + "epoch": 2.611267365550158, + "grad_norm": 38.757619456855906, + "learning_rate": 5.030222936878409e-07, + "loss": 1.3326, + "step": 30638 + }, + { + "epoch": 2.611352595244183, + "grad_norm": 27.040382091567153, + "learning_rate": 5.028055629920764e-07, + "loss": 0.887, + "step": 30639 + }, + { + "epoch": 2.6114378249382084, + "grad_norm": 54.90884064693274, + "learning_rate": 5.025888765244308e-07, + "loss": 1.693, + "step": 30640 + }, + { + "epoch": 2.611523054632234, + "grad_norm": 30.17569092695866, + "learning_rate": 5.023722342870374e-07, + "loss": 1.3256, + "step": 30641 + }, + { + "epoch": 2.6116082843262594, + "grad_norm": 31.23397009996225, + "learning_rate": 5.021556362820262e-07, + "loss": 0.6581, + "step": 30642 + }, + { + "epoch": 2.611693514020285, + "grad_norm": 67.54805385998542, + "learning_rate": 5.019390825115255e-07, + "loss": 2.0397, + "step": 30643 + }, + { + "epoch": 2.61177874371431, + "grad_norm": 40.88464600556363, + "learning_rate": 5.017225729776681e-07, + "loss": 1.0133, + "step": 30644 + }, + { + "epoch": 2.6118639734083353, + "grad_norm": 27.663271051111064, + "learning_rate": 5.015061076825817e-07, + "loss": 0.9013, + "step": 30645 + }, + { + "epoch": 2.611949203102361, + "grad_norm": 27.61665924973061, + "learning_rate": 5.01289686628395e-07, + "loss": 0.9024, + "step": 30646 + }, + { + "epoch": 2.6120344327963863, + "grad_norm": 34.57402718781125, + "learning_rate": 5.010733098172355e-07, + "loss": 0.9864, + "step": 30647 + }, + { + "epoch": 2.6121196624904117, + "grad_norm": 19.55930516663915, + "learning_rate": 5.008569772512329e-07, + "loss": 0.5414, + "step": 30648 + }, + { + "epoch": 2.612204892184437, + "grad_norm": 36.082444935858675, + "learning_rate": 5.006406889325144e-07, + "loss": 1.2817, + "step": 30649 + }, + { + "epoch": 2.6122901218784627, + "grad_norm": 49.78456801327976, + "learning_rate": 5.004244448632073e-07, + "loss": 1.4135, + "step": 30650 + }, + { + "epoch": 2.6123753515724877, + "grad_norm": 48.87108617399192, + "learning_rate": 5.002082450454371e-07, + "loss": 1.405, + "step": 30651 + }, + { + "epoch": 2.612460581266513, + "grad_norm": 57.816779953625336, + "learning_rate": 4.999920894813315e-07, + "loss": 1.4637, + "step": 30652 + }, + { + "epoch": 2.6125458109605386, + "grad_norm": 23.70467844238842, + "learning_rate": 4.997759781730149e-07, + "loss": 0.6132, + "step": 30653 + }, + { + "epoch": 2.612631040654564, + "grad_norm": 66.88965684861334, + "learning_rate": 4.995599111226141e-07, + "loss": 1.7667, + "step": 30654 + }, + { + "epoch": 2.6127162703485896, + "grad_norm": 40.958967244804235, + "learning_rate": 4.993438883322522e-07, + "loss": 1.3991, + "step": 30655 + }, + { + "epoch": 2.6128015000426146, + "grad_norm": 20.510923317517555, + "learning_rate": 4.991279098040546e-07, + "loss": 0.6125, + "step": 30656 + }, + { + "epoch": 2.6128867297366405, + "grad_norm": 89.35360681112485, + "learning_rate": 4.989119755401467e-07, + "loss": 1.6708, + "step": 30657 + }, + { + "epoch": 2.6129719594306655, + "grad_norm": 82.31858193283183, + "learning_rate": 4.986960855426509e-07, + "loss": 2.2915, + "step": 30658 + }, + { + "epoch": 2.613057189124691, + "grad_norm": 47.10976145814225, + "learning_rate": 4.984802398136906e-07, + "loss": 1.5101, + "step": 30659 + }, + { + "epoch": 2.6131424188187165, + "grad_norm": 38.06093491950583, + "learning_rate": 4.98264438355387e-07, + "loss": 0.9157, + "step": 30660 + }, + { + "epoch": 2.613227648512742, + "grad_norm": 32.946669848952816, + "learning_rate": 4.980486811698654e-07, + "loss": 1.011, + "step": 30661 + }, + { + "epoch": 2.6133128782067674, + "grad_norm": 38.268795706764664, + "learning_rate": 4.978329682592453e-07, + "loss": 1.204, + "step": 30662 + }, + { + "epoch": 2.6133981079007924, + "grad_norm": 36.961313995723216, + "learning_rate": 4.97617299625649e-07, + "loss": 1.3523, + "step": 30663 + }, + { + "epoch": 2.613483337594818, + "grad_norm": 58.398173545718116, + "learning_rate": 4.974016752711969e-07, + "loss": 1.0697, + "step": 30664 + }, + { + "epoch": 2.6135685672888433, + "grad_norm": 59.617158788540344, + "learning_rate": 4.971860951980111e-07, + "loss": 1.7299, + "step": 30665 + }, + { + "epoch": 2.613653796982869, + "grad_norm": 72.65726390792632, + "learning_rate": 4.969705594082108e-07, + "loss": 1.9333, + "step": 30666 + }, + { + "epoch": 2.6137390266768943, + "grad_norm": 28.607326854534957, + "learning_rate": 4.967550679039157e-07, + "loss": 0.9148, + "step": 30667 + }, + { + "epoch": 2.6138242563709198, + "grad_norm": 36.49056121972778, + "learning_rate": 4.965396206872447e-07, + "loss": 0.6004, + "step": 30668 + }, + { + "epoch": 2.613909486064945, + "grad_norm": 51.629432749309466, + "learning_rate": 4.963242177603173e-07, + "loss": 1.4014, + "step": 30669 + }, + { + "epoch": 2.6139947157589702, + "grad_norm": 54.424080918127686, + "learning_rate": 4.961088591252522e-07, + "loss": 1.7328, + "step": 30670 + }, + { + "epoch": 2.6140799454529957, + "grad_norm": 63.611000987843006, + "learning_rate": 4.958935447841651e-07, + "loss": 1.1617, + "step": 30671 + }, + { + "epoch": 2.614165175147021, + "grad_norm": 44.29687941656376, + "learning_rate": 4.95678274739177e-07, + "loss": 1.1656, + "step": 30672 + }, + { + "epoch": 2.6142504048410466, + "grad_norm": 35.52427411272068, + "learning_rate": 4.954630489924018e-07, + "loss": 0.9273, + "step": 30673 + }, + { + "epoch": 2.614335634535072, + "grad_norm": 49.10682238282441, + "learning_rate": 4.952478675459587e-07, + "loss": 1.7246, + "step": 30674 + }, + { + "epoch": 2.614420864229097, + "grad_norm": 32.58384986475467, + "learning_rate": 4.950327304019625e-07, + "loss": 1.0949, + "step": 30675 + }, + { + "epoch": 2.614506093923123, + "grad_norm": 29.180417185590898, + "learning_rate": 4.948176375625297e-07, + "loss": 1.1178, + "step": 30676 + }, + { + "epoch": 2.614591323617148, + "grad_norm": 62.5506785398784, + "learning_rate": 4.94602589029774e-07, + "loss": 1.5136, + "step": 30677 + }, + { + "epoch": 2.6146765533111735, + "grad_norm": 41.50161000325169, + "learning_rate": 4.943875848058128e-07, + "loss": 0.4965, + "step": 30678 + }, + { + "epoch": 2.614761783005199, + "grad_norm": 67.2340961242024, + "learning_rate": 4.941726248927581e-07, + "loss": 1.6567, + "step": 30679 + }, + { + "epoch": 2.6148470126992245, + "grad_norm": 28.720421972243347, + "learning_rate": 4.939577092927267e-07, + "loss": 0.8232, + "step": 30680 + }, + { + "epoch": 2.61493224239325, + "grad_norm": 46.74020848940641, + "learning_rate": 4.937428380078291e-07, + "loss": 1.3347, + "step": 30681 + }, + { + "epoch": 2.615017472087275, + "grad_norm": 33.98333423434431, + "learning_rate": 4.935280110401813e-07, + "loss": 1.0036, + "step": 30682 + }, + { + "epoch": 2.6151027017813004, + "grad_norm": 51.912039507086305, + "learning_rate": 4.933132283918951e-07, + "loss": 1.4388, + "step": 30683 + }, + { + "epoch": 2.615187931475326, + "grad_norm": 56.91033516012287, + "learning_rate": 4.93098490065082e-07, + "loss": 1.5084, + "step": 30684 + }, + { + "epoch": 2.6152731611693514, + "grad_norm": 29.60130026734809, + "learning_rate": 4.928837960618543e-07, + "loss": 1.1095, + "step": 30685 + }, + { + "epoch": 2.615358390863377, + "grad_norm": 28.248685227637655, + "learning_rate": 4.92669146384323e-07, + "loss": 0.823, + "step": 30686 + }, + { + "epoch": 2.6154436205574023, + "grad_norm": 84.83618679243497, + "learning_rate": 4.924545410345999e-07, + "loss": 1.6578, + "step": 30687 + }, + { + "epoch": 2.615528850251428, + "grad_norm": 53.37720839256672, + "learning_rate": 4.922399800147953e-07, + "loss": 1.5349, + "step": 30688 + }, + { + "epoch": 2.615614079945453, + "grad_norm": 65.40902592884522, + "learning_rate": 4.9202546332702e-07, + "loss": 1.5532, + "step": 30689 + }, + { + "epoch": 2.6156993096394783, + "grad_norm": 77.72056629658071, + "learning_rate": 4.91810990973382e-07, + "loss": 2.1082, + "step": 30690 + }, + { + "epoch": 2.6157845393335037, + "grad_norm": 35.07379879316987, + "learning_rate": 4.915965629559921e-07, + "loss": 1.2422, + "step": 30691 + }, + { + "epoch": 2.615869769027529, + "grad_norm": 77.06468378835785, + "learning_rate": 4.913821792769585e-07, + "loss": 1.8798, + "step": 30692 + }, + { + "epoch": 2.6159549987215547, + "grad_norm": 40.03824847707967, + "learning_rate": 4.911678399383901e-07, + "loss": 1.5174, + "step": 30693 + }, + { + "epoch": 2.61604022841558, + "grad_norm": 52.107454887073054, + "learning_rate": 4.90953544942393e-07, + "loss": 1.657, + "step": 30694 + }, + { + "epoch": 2.6161254581096056, + "grad_norm": 64.63329173288719, + "learning_rate": 4.907392942910766e-07, + "loss": 1.197, + "step": 30695 + }, + { + "epoch": 2.6162106878036306, + "grad_norm": 43.18722132482671, + "learning_rate": 4.905250879865475e-07, + "loss": 1.1234, + "step": 30696 + }, + { + "epoch": 2.616295917497656, + "grad_norm": 65.54351697465401, + "learning_rate": 4.90310926030913e-07, + "loss": 1.2884, + "step": 30697 + }, + { + "epoch": 2.6163811471916816, + "grad_norm": 37.34642760721085, + "learning_rate": 4.900968084262786e-07, + "loss": 0.871, + "step": 30698 + }, + { + "epoch": 2.616466376885707, + "grad_norm": 54.150264667705656, + "learning_rate": 4.898827351747487e-07, + "loss": 1.3553, + "step": 30699 + }, + { + "epoch": 2.6165516065797325, + "grad_norm": 32.501118806717926, + "learning_rate": 4.896687062784311e-07, + "loss": 0.8937, + "step": 30700 + }, + { + "epoch": 2.6166368362737575, + "grad_norm": 53.37592489087598, + "learning_rate": 4.894547217394291e-07, + "loss": 1.3056, + "step": 30701 + }, + { + "epoch": 2.6167220659677835, + "grad_norm": 33.370981312045444, + "learning_rate": 4.892407815598471e-07, + "loss": 0.94, + "step": 30702 + }, + { + "epoch": 2.6168072956618085, + "grad_norm": 39.76753880464941, + "learning_rate": 4.890268857417895e-07, + "loss": 0.7643, + "step": 30703 + }, + { + "epoch": 2.616892525355834, + "grad_norm": 51.778994583921346, + "learning_rate": 4.888130342873609e-07, + "loss": 1.2206, + "step": 30704 + }, + { + "epoch": 2.6169777550498594, + "grad_norm": 62.58528671443073, + "learning_rate": 4.88599227198664e-07, + "loss": 1.6744, + "step": 30705 + }, + { + "epoch": 2.617062984743885, + "grad_norm": 30.30750533645416, + "learning_rate": 4.883854644778002e-07, + "loss": 0.8598, + "step": 30706 + }, + { + "epoch": 2.6171482144379103, + "grad_norm": 54.96110883079654, + "learning_rate": 4.881717461268726e-07, + "loss": 1.4489, + "step": 30707 + }, + { + "epoch": 2.6172334441319354, + "grad_norm": 68.34417908222461, + "learning_rate": 4.879580721479832e-07, + "loss": 1.5311, + "step": 30708 + }, + { + "epoch": 2.617318673825961, + "grad_norm": 62.934726465168225, + "learning_rate": 4.877444425432337e-07, + "loss": 1.2999, + "step": 30709 + }, + { + "epoch": 2.6174039035199863, + "grad_norm": 55.927827833888145, + "learning_rate": 4.875308573147236e-07, + "loss": 1.5182, + "step": 30710 + }, + { + "epoch": 2.6174891332140118, + "grad_norm": 37.62849678886769, + "learning_rate": 4.873173164645545e-07, + "loss": 0.8165, + "step": 30711 + }, + { + "epoch": 2.6175743629080372, + "grad_norm": 52.23951290811738, + "learning_rate": 4.871038199948269e-07, + "loss": 1.1859, + "step": 30712 + }, + { + "epoch": 2.6176595926020627, + "grad_norm": 21.54117401337224, + "learning_rate": 4.868903679076403e-07, + "loss": 0.8004, + "step": 30713 + }, + { + "epoch": 2.617744822296088, + "grad_norm": 25.23731361232822, + "learning_rate": 4.866769602050936e-07, + "loss": 0.7034, + "step": 30714 + }, + { + "epoch": 2.617830051990113, + "grad_norm": 41.20504341558931, + "learning_rate": 4.864635968892856e-07, + "loss": 1.2846, + "step": 30715 + }, + { + "epoch": 2.6179152816841387, + "grad_norm": 53.718311718598436, + "learning_rate": 4.862502779623141e-07, + "loss": 0.8858, + "step": 30716 + }, + { + "epoch": 2.618000511378164, + "grad_norm": 45.12568761840822, + "learning_rate": 4.860370034262779e-07, + "loss": 1.2382, + "step": 30717 + }, + { + "epoch": 2.6180857410721896, + "grad_norm": 94.08277411141046, + "learning_rate": 4.858237732832732e-07, + "loss": 2.1589, + "step": 30718 + }, + { + "epoch": 2.618170970766215, + "grad_norm": 63.933222627925716, + "learning_rate": 4.856105875353989e-07, + "loss": 1.5488, + "step": 30719 + }, + { + "epoch": 2.61825620046024, + "grad_norm": 29.287447890845076, + "learning_rate": 4.8539744618475e-07, + "loss": 1.1026, + "step": 30720 + }, + { + "epoch": 2.618341430154266, + "grad_norm": 28.401793877594624, + "learning_rate": 4.851843492334241e-07, + "loss": 0.8517, + "step": 30721 + }, + { + "epoch": 2.618426659848291, + "grad_norm": 43.24851303468535, + "learning_rate": 4.849712966835157e-07, + "loss": 1.3047, + "step": 30722 + }, + { + "epoch": 2.6185118895423165, + "grad_norm": 61.85006324678725, + "learning_rate": 4.847582885371211e-07, + "loss": 1.6816, + "step": 30723 + }, + { + "epoch": 2.618597119236342, + "grad_norm": 32.9170090857884, + "learning_rate": 4.845453247963333e-07, + "loss": 1.1197, + "step": 30724 + }, + { + "epoch": 2.6186823489303674, + "grad_norm": 46.94756663895227, + "learning_rate": 4.843324054632487e-07, + "loss": 1.2211, + "step": 30725 + }, + { + "epoch": 2.618767578624393, + "grad_norm": 29.81584090534078, + "learning_rate": 4.841195305399599e-07, + "loss": 0.6872, + "step": 30726 + }, + { + "epoch": 2.618852808318418, + "grad_norm": 57.76383894834214, + "learning_rate": 4.839067000285618e-07, + "loss": 1.662, + "step": 30727 + }, + { + "epoch": 2.6189380380124434, + "grad_norm": 53.69302176696979, + "learning_rate": 4.836939139311475e-07, + "loss": 1.4154, + "step": 30728 + }, + { + "epoch": 2.619023267706469, + "grad_norm": 62.36217389526425, + "learning_rate": 4.834811722498073e-07, + "loss": 1.941, + "step": 30729 + }, + { + "epoch": 2.6191084974004943, + "grad_norm": 29.342225896846678, + "learning_rate": 4.832684749866362e-07, + "loss": 1.3306, + "step": 30730 + }, + { + "epoch": 2.61919372709452, + "grad_norm": 34.30771164462084, + "learning_rate": 4.830558221437248e-07, + "loss": 1.6279, + "step": 30731 + }, + { + "epoch": 2.6192789567885453, + "grad_norm": 73.28938562755904, + "learning_rate": 4.828432137231648e-07, + "loss": 1.9099, + "step": 30732 + }, + { + "epoch": 2.6193641864825707, + "grad_norm": 45.88748308849006, + "learning_rate": 4.826306497270455e-07, + "loss": 0.6199, + "step": 30733 + }, + { + "epoch": 2.6194494161765958, + "grad_norm": 72.34423856055115, + "learning_rate": 4.824181301574587e-07, + "loss": 1.7726, + "step": 30734 + }, + { + "epoch": 2.6195346458706212, + "grad_norm": 35.0933992229219, + "learning_rate": 4.822056550164961e-07, + "loss": 0.9004, + "step": 30735 + }, + { + "epoch": 2.6196198755646467, + "grad_norm": 77.52031773576502, + "learning_rate": 4.819932243062447e-07, + "loss": 2.0458, + "step": 30736 + }, + { + "epoch": 2.619705105258672, + "grad_norm": 24.274101631024706, + "learning_rate": 4.817808380287947e-07, + "loss": 0.6463, + "step": 30737 + }, + { + "epoch": 2.6197903349526976, + "grad_norm": 81.94266317268033, + "learning_rate": 4.81568496186235e-07, + "loss": 2.0218, + "step": 30738 + }, + { + "epoch": 2.6198755646467227, + "grad_norm": 54.78567451757923, + "learning_rate": 4.813561987806542e-07, + "loss": 1.822, + "step": 30739 + }, + { + "epoch": 2.6199607943407486, + "grad_norm": 39.677213804412474, + "learning_rate": 4.811439458141393e-07, + "loss": 1.1781, + "step": 30740 + }, + { + "epoch": 2.6200460240347736, + "grad_norm": 28.70550751817161, + "learning_rate": 4.809317372887773e-07, + "loss": 1.1609, + "step": 30741 + }, + { + "epoch": 2.620131253728799, + "grad_norm": 33.83720900065216, + "learning_rate": 4.807195732066561e-07, + "loss": 0.9989, + "step": 30742 + }, + { + "epoch": 2.6202164834228245, + "grad_norm": 64.07847114494307, + "learning_rate": 4.805074535698628e-07, + "loss": 1.8567, + "step": 30743 + }, + { + "epoch": 2.62030171311685, + "grad_norm": 62.33246750728196, + "learning_rate": 4.802953783804831e-07, + "loss": 1.3634, + "step": 30744 + }, + { + "epoch": 2.6203869428108755, + "grad_norm": 40.76613504377177, + "learning_rate": 4.800833476406019e-07, + "loss": 0.6366, + "step": 30745 + }, + { + "epoch": 2.6204721725049005, + "grad_norm": 21.46866218537639, + "learning_rate": 4.798713613523042e-07, + "loss": 0.5995, + "step": 30746 + }, + { + "epoch": 2.620557402198926, + "grad_norm": 38.370657739688085, + "learning_rate": 4.796594195176768e-07, + "loss": 1.3388, + "step": 30747 + }, + { + "epoch": 2.6206426318929514, + "grad_norm": 46.44044031811927, + "learning_rate": 4.794475221388029e-07, + "loss": 1.4221, + "step": 30748 + }, + { + "epoch": 2.620727861586977, + "grad_norm": 42.407744527868054, + "learning_rate": 4.792356692177647e-07, + "loss": 0.6945, + "step": 30749 + }, + { + "epoch": 2.6208130912810024, + "grad_norm": 30.922564451148997, + "learning_rate": 4.790238607566478e-07, + "loss": 0.9233, + "step": 30750 + }, + { + "epoch": 2.620898320975028, + "grad_norm": 35.73950726647308, + "learning_rate": 4.788120967575355e-07, + "loss": 1.2012, + "step": 30751 + }, + { + "epoch": 2.6209835506690533, + "grad_norm": 77.13450921525165, + "learning_rate": 4.786003772225101e-07, + "loss": 2.2634, + "step": 30752 + }, + { + "epoch": 2.6210687803630783, + "grad_norm": 77.81453756312895, + "learning_rate": 4.783887021536527e-07, + "loss": 1.8969, + "step": 30753 + }, + { + "epoch": 2.621154010057104, + "grad_norm": 58.16329397136791, + "learning_rate": 4.781770715530454e-07, + "loss": 1.7904, + "step": 30754 + }, + { + "epoch": 2.6212392397511293, + "grad_norm": 64.24615548327532, + "learning_rate": 4.779654854227705e-07, + "loss": 1.3229, + "step": 30755 + }, + { + "epoch": 2.6213244694451547, + "grad_norm": 46.19668460343236, + "learning_rate": 4.777539437649081e-07, + "loss": 1.4042, + "step": 30756 + }, + { + "epoch": 2.62140969913918, + "grad_norm": 48.85937558744027, + "learning_rate": 4.775424465815376e-07, + "loss": 1.569, + "step": 30757 + }, + { + "epoch": 2.6214949288332052, + "grad_norm": 44.838048742512804, + "learning_rate": 4.773309938747411e-07, + "loss": 1.3857, + "step": 30758 + }, + { + "epoch": 2.621580158527231, + "grad_norm": 67.6998505042958, + "learning_rate": 4.771195856465965e-07, + "loss": 1.2108, + "step": 30759 + }, + { + "epoch": 2.621665388221256, + "grad_norm": 29.860892199525967, + "learning_rate": 4.769082218991843e-07, + "loss": 0.6916, + "step": 30760 + }, + { + "epoch": 2.6217506179152816, + "grad_norm": 17.59113525110719, + "learning_rate": 4.766969026345825e-07, + "loss": 0.6957, + "step": 30761 + }, + { + "epoch": 2.621835847609307, + "grad_norm": 72.19338368704825, + "learning_rate": 4.764856278548691e-07, + "loss": 1.8063, + "step": 30762 + }, + { + "epoch": 2.6219210773033326, + "grad_norm": 83.61763767685976, + "learning_rate": 4.762743975621215e-07, + "loss": 1.9383, + "step": 30763 + }, + { + "epoch": 2.622006306997358, + "grad_norm": 76.36163913549075, + "learning_rate": 4.7606321175841794e-07, + "loss": 2.0236, + "step": 30764 + }, + { + "epoch": 2.622091536691383, + "grad_norm": 24.112251244541994, + "learning_rate": 4.758520704458347e-07, + "loss": 0.973, + "step": 30765 + }, + { + "epoch": 2.6221767663854085, + "grad_norm": 45.531542185712645, + "learning_rate": 4.7564097362644945e-07, + "loss": 1.527, + "step": 30766 + }, + { + "epoch": 2.622261996079434, + "grad_norm": 54.962147581838416, + "learning_rate": 4.7542992130233665e-07, + "loss": 1.2539, + "step": 30767 + }, + { + "epoch": 2.6223472257734595, + "grad_norm": 54.025669653473116, + "learning_rate": 4.75218913475573e-07, + "loss": 1.2181, + "step": 30768 + }, + { + "epoch": 2.622432455467485, + "grad_norm": 37.93308260225372, + "learning_rate": 4.750079501482341e-07, + "loss": 1.1618, + "step": 30769 + }, + { + "epoch": 2.6225176851615104, + "grad_norm": 37.09193584797654, + "learning_rate": 4.7479703132239373e-07, + "loss": 0.975, + "step": 30770 + }, + { + "epoch": 2.622602914855536, + "grad_norm": 42.52372836441613, + "learning_rate": 4.7458615700012647e-07, + "loss": 1.3824, + "step": 30771 + }, + { + "epoch": 2.622688144549561, + "grad_norm": 74.7022009554261, + "learning_rate": 4.74375327183505e-07, + "loss": 1.5109, + "step": 30772 + }, + { + "epoch": 2.6227733742435864, + "grad_norm": 39.4560156908291, + "learning_rate": 4.7416454187460383e-07, + "loss": 1.009, + "step": 30773 + }, + { + "epoch": 2.622858603937612, + "grad_norm": 59.25736854511649, + "learning_rate": 4.7395380107549684e-07, + "loss": 1.8162, + "step": 30774 + }, + { + "epoch": 2.6229438336316373, + "grad_norm": 73.74686119053987, + "learning_rate": 4.7374310478825626e-07, + "loss": 1.665, + "step": 30775 + }, + { + "epoch": 2.6230290633256628, + "grad_norm": 37.29415272175435, + "learning_rate": 4.735324530149521e-07, + "loss": 1.0639, + "step": 30776 + }, + { + "epoch": 2.623114293019688, + "grad_norm": 28.79121499839579, + "learning_rate": 4.733218457576588e-07, + "loss": 0.7672, + "step": 30777 + }, + { + "epoch": 2.6231995227137137, + "grad_norm": 66.55061585690402, + "learning_rate": 4.7311128301844645e-07, + "loss": 1.5967, + "step": 30778 + }, + { + "epoch": 2.6232847524077387, + "grad_norm": 74.92418015695381, + "learning_rate": 4.7290076479938605e-07, + "loss": 1.422, + "step": 30779 + }, + { + "epoch": 2.623369982101764, + "grad_norm": 83.16033195819628, + "learning_rate": 4.726902911025466e-07, + "loss": 1.7316, + "step": 30780 + }, + { + "epoch": 2.6234552117957897, + "grad_norm": 67.08891958420651, + "learning_rate": 4.7247986192999917e-07, + "loss": 2.0906, + "step": 30781 + }, + { + "epoch": 2.623540441489815, + "grad_norm": 25.84119380986858, + "learning_rate": 4.722694772838143e-07, + "loss": 0.8764, + "step": 30782 + }, + { + "epoch": 2.6236256711838406, + "grad_norm": 40.7618562638205, + "learning_rate": 4.720591371660599e-07, + "loss": 1.5458, + "step": 30783 + }, + { + "epoch": 2.6237109008778656, + "grad_norm": 33.50493269628416, + "learning_rate": 4.718488415788036e-07, + "loss": 0.9102, + "step": 30784 + }, + { + "epoch": 2.623796130571891, + "grad_norm": 53.123408551130154, + "learning_rate": 4.7163859052411553e-07, + "loss": 1.898, + "step": 30785 + }, + { + "epoch": 2.6238813602659166, + "grad_norm": 58.17993810081229, + "learning_rate": 4.714283840040629e-07, + "loss": 1.9942, + "step": 30786 + }, + { + "epoch": 2.623966589959942, + "grad_norm": 51.9667889827229, + "learning_rate": 4.712182220207123e-07, + "loss": 1.1057, + "step": 30787 + }, + { + "epoch": 2.6240518196539675, + "grad_norm": 58.726994422880445, + "learning_rate": 4.7100810457613e-07, + "loss": 1.6397, + "step": 30788 + }, + { + "epoch": 2.624137049347993, + "grad_norm": 57.32193188666103, + "learning_rate": 4.707980316723837e-07, + "loss": 1.824, + "step": 30789 + }, + { + "epoch": 2.6242222790420184, + "grad_norm": 30.93418916346972, + "learning_rate": 4.7058800331153955e-07, + "loss": 0.7693, + "step": 30790 + }, + { + "epoch": 2.6243075087360435, + "grad_norm": 64.97340180593473, + "learning_rate": 4.703780194956625e-07, + "loss": 2.0734, + "step": 30791 + }, + { + "epoch": 2.624392738430069, + "grad_norm": 51.57676579293923, + "learning_rate": 4.701680802268183e-07, + "loss": 1.2241, + "step": 30792 + }, + { + "epoch": 2.6244779681240944, + "grad_norm": 67.56818123560501, + "learning_rate": 4.699581855070695e-07, + "loss": 2.0108, + "step": 30793 + }, + { + "epoch": 2.62456319781812, + "grad_norm": 32.78789228274685, + "learning_rate": 4.697483353384835e-07, + "loss": 1.2792, + "step": 30794 + }, + { + "epoch": 2.6246484275121453, + "grad_norm": 25.41429568258171, + "learning_rate": 4.69538529723122e-07, + "loss": 0.9629, + "step": 30795 + }, + { + "epoch": 2.6247336572061704, + "grad_norm": 43.75779117651556, + "learning_rate": 4.6932876866304766e-07, + "loss": 1.113, + "step": 30796 + }, + { + "epoch": 2.6248188869001963, + "grad_norm": 26.61135943222553, + "learning_rate": 4.6911905216032503e-07, + "loss": 0.5629, + "step": 30797 + }, + { + "epoch": 2.6249041165942213, + "grad_norm": 69.0103517087198, + "learning_rate": 4.6890938021701694e-07, + "loss": 1.6141, + "step": 30798 + }, + { + "epoch": 2.6249893462882468, + "grad_norm": 35.93648414465527, + "learning_rate": 4.6869975283518445e-07, + "loss": 1.3401, + "step": 30799 + }, + { + "epoch": 2.6250745759822722, + "grad_norm": 62.70443472516774, + "learning_rate": 4.684901700168898e-07, + "loss": 1.9171, + "step": 30800 + }, + { + "epoch": 2.6251598056762977, + "grad_norm": 67.14760646347887, + "learning_rate": 4.6828063176419304e-07, + "loss": 2.2444, + "step": 30801 + }, + { + "epoch": 2.625245035370323, + "grad_norm": 39.68730464704454, + "learning_rate": 4.6807113807915473e-07, + "loss": 1.1383, + "step": 30802 + }, + { + "epoch": 2.625330265064348, + "grad_norm": 58.20450243930896, + "learning_rate": 4.678616889638371e-07, + "loss": 1.7392, + "step": 30803 + }, + { + "epoch": 2.6254154947583737, + "grad_norm": 63.93751241183962, + "learning_rate": 4.6765228442029797e-07, + "loss": 0.9284, + "step": 30804 + }, + { + "epoch": 2.625500724452399, + "grad_norm": 63.63909370428217, + "learning_rate": 4.674429244505985e-07, + "loss": 1.9562, + "step": 30805 + }, + { + "epoch": 2.6255859541464246, + "grad_norm": 59.49614784982059, + "learning_rate": 4.6723360905679583e-07, + "loss": 1.1419, + "step": 30806 + }, + { + "epoch": 2.62567118384045, + "grad_norm": 58.397118772499105, + "learning_rate": 4.670243382409506e-07, + "loss": 1.6925, + "step": 30807 + }, + { + "epoch": 2.6257564135344755, + "grad_norm": 35.83649336379296, + "learning_rate": 4.668151120051195e-07, + "loss": 1.2579, + "step": 30808 + }, + { + "epoch": 2.625841643228501, + "grad_norm": 26.685743354835378, + "learning_rate": 4.6660593035136094e-07, + "loss": 0.8813, + "step": 30809 + }, + { + "epoch": 2.625926872922526, + "grad_norm": 44.436016097590965, + "learning_rate": 4.6639679328173037e-07, + "loss": 1.1362, + "step": 30810 + }, + { + "epoch": 2.6260121026165515, + "grad_norm": 46.72912904752051, + "learning_rate": 4.6618770079828677e-07, + "loss": 1.175, + "step": 30811 + }, + { + "epoch": 2.626097332310577, + "grad_norm": 32.000554992555, + "learning_rate": 4.6597865290308464e-07, + "loss": 1.054, + "step": 30812 + }, + { + "epoch": 2.6261825620046024, + "grad_norm": 29.47182920519823, + "learning_rate": 4.6576964959818173e-07, + "loss": 0.7539, + "step": 30813 + }, + { + "epoch": 2.626267791698628, + "grad_norm": 83.0619969518643, + "learning_rate": 4.6556069088563193e-07, + "loss": 2.4185, + "step": 30814 + }, + { + "epoch": 2.6263530213926534, + "grad_norm": 52.25700286127194, + "learning_rate": 4.653517767674914e-07, + "loss": 1.2646, + "step": 30815 + }, + { + "epoch": 2.626438251086679, + "grad_norm": 21.046080466663458, + "learning_rate": 4.651429072458147e-07, + "loss": 0.6308, + "step": 30816 + }, + { + "epoch": 2.626523480780704, + "grad_norm": 32.25692038344762, + "learning_rate": 4.64934082322655e-07, + "loss": 1.3884, + "step": 30817 + }, + { + "epoch": 2.6266087104747293, + "grad_norm": 58.840597914377106, + "learning_rate": 4.6472530200006693e-07, + "loss": 1.8137, + "step": 30818 + }, + { + "epoch": 2.626693940168755, + "grad_norm": 73.65623627015461, + "learning_rate": 4.6451656628010264e-07, + "loss": 1.7775, + "step": 30819 + }, + { + "epoch": 2.6267791698627803, + "grad_norm": 31.378145174883638, + "learning_rate": 4.643078751648156e-07, + "loss": 1.3786, + "step": 30820 + }, + { + "epoch": 2.6268643995568057, + "grad_norm": 83.3771126451298, + "learning_rate": 4.64099228656259e-07, + "loss": 1.6358, + "step": 30821 + }, + { + "epoch": 2.6269496292508308, + "grad_norm": 20.151655364850278, + "learning_rate": 4.638906267564841e-07, + "loss": 0.8176, + "step": 30822 + }, + { + "epoch": 2.627034858944856, + "grad_norm": 113.50077329958803, + "learning_rate": 4.6368206946754146e-07, + "loss": 1.2733, + "step": 30823 + }, + { + "epoch": 2.6271200886388817, + "grad_norm": 49.759219132823624, + "learning_rate": 4.634735567914839e-07, + "loss": 1.3272, + "step": 30824 + }, + { + "epoch": 2.627205318332907, + "grad_norm": 46.80667445409985, + "learning_rate": 4.632650887303619e-07, + "loss": 1.1604, + "step": 30825 + }, + { + "epoch": 2.6272905480269326, + "grad_norm": 37.196487651386526, + "learning_rate": 4.630566652862245e-07, + "loss": 0.9767, + "step": 30826 + }, + { + "epoch": 2.627375777720958, + "grad_norm": 33.28346395423796, + "learning_rate": 4.628482864611211e-07, + "loss": 1.497, + "step": 30827 + }, + { + "epoch": 2.6274610074149836, + "grad_norm": 66.05949123918052, + "learning_rate": 4.6263995225710235e-07, + "loss": 1.4186, + "step": 30828 + }, + { + "epoch": 2.6275462371090086, + "grad_norm": 33.03880864565593, + "learning_rate": 4.624316626762171e-07, + "loss": 0.678, + "step": 30829 + }, + { + "epoch": 2.627631466803034, + "grad_norm": 39.013681075691466, + "learning_rate": 4.6222341772051373e-07, + "loss": 1.5805, + "step": 30830 + }, + { + "epoch": 2.6277166964970595, + "grad_norm": 82.84126949879351, + "learning_rate": 4.620152173920395e-07, + "loss": 1.7308, + "step": 30831 + }, + { + "epoch": 2.627801926191085, + "grad_norm": 19.99198981171686, + "learning_rate": 4.618070616928416e-07, + "loss": 0.5912, + "step": 30832 + }, + { + "epoch": 2.6278871558851105, + "grad_norm": 31.441614785938775, + "learning_rate": 4.615989506249691e-07, + "loss": 1.1904, + "step": 30833 + }, + { + "epoch": 2.627972385579136, + "grad_norm": 72.69638947329675, + "learning_rate": 4.613908841904674e-07, + "loss": 1.4582, + "step": 30834 + }, + { + "epoch": 2.6280576152731614, + "grad_norm": 33.232568353117486, + "learning_rate": 4.611828623913822e-07, + "loss": 1.0909, + "step": 30835 + }, + { + "epoch": 2.6281428449671864, + "grad_norm": 42.234077042240784, + "learning_rate": 4.6097488522975966e-07, + "loss": 1.4133, + "step": 30836 + }, + { + "epoch": 2.628228074661212, + "grad_norm": 49.156342805142955, + "learning_rate": 4.607669527076464e-07, + "loss": 1.3903, + "step": 30837 + }, + { + "epoch": 2.6283133043552374, + "grad_norm": 51.85640346683266, + "learning_rate": 4.605590648270869e-07, + "loss": 1.2689, + "step": 30838 + }, + { + "epoch": 2.628398534049263, + "grad_norm": 65.96794649720755, + "learning_rate": 4.6035122159012457e-07, + "loss": 1.9153, + "step": 30839 + }, + { + "epoch": 2.6284837637432883, + "grad_norm": 55.11038401154729, + "learning_rate": 4.601434229988039e-07, + "loss": 1.3216, + "step": 30840 + }, + { + "epoch": 2.6285689934373133, + "grad_norm": 30.121704257513198, + "learning_rate": 4.599356690551693e-07, + "loss": 1.1979, + "step": 30841 + }, + { + "epoch": 2.6286542231313392, + "grad_norm": 39.7889001667588, + "learning_rate": 4.5972795976126304e-07, + "loss": 1.2136, + "step": 30842 + }, + { + "epoch": 2.6287394528253643, + "grad_norm": 52.328793703178626, + "learning_rate": 4.5952029511912744e-07, + "loss": 1.8036, + "step": 30843 + }, + { + "epoch": 2.6288246825193897, + "grad_norm": 53.83054919205305, + "learning_rate": 4.5931267513080636e-07, + "loss": 1.2391, + "step": 30844 + }, + { + "epoch": 2.628909912213415, + "grad_norm": 52.20188334047297, + "learning_rate": 4.591050997983404e-07, + "loss": 1.2304, + "step": 30845 + }, + { + "epoch": 2.6289951419074407, + "grad_norm": 30.264245446546813, + "learning_rate": 4.588975691237718e-07, + "loss": 0.5991, + "step": 30846 + }, + { + "epoch": 2.629080371601466, + "grad_norm": 38.24098362979902, + "learning_rate": 4.586900831091412e-07, + "loss": 1.0609, + "step": 30847 + }, + { + "epoch": 2.629165601295491, + "grad_norm": 47.92719990442975, + "learning_rate": 4.5848264175648916e-07, + "loss": 1.2631, + "step": 30848 + }, + { + "epoch": 2.6292508309895166, + "grad_norm": 32.47626172512369, + "learning_rate": 4.5827524506785513e-07, + "loss": 0.8782, + "step": 30849 + }, + { + "epoch": 2.629336060683542, + "grad_norm": 52.68288670241478, + "learning_rate": 4.5806789304527974e-07, + "loss": 1.4731, + "step": 30850 + }, + { + "epoch": 2.6294212903775676, + "grad_norm": 60.76979867195316, + "learning_rate": 4.578605856908014e-07, + "loss": 1.889, + "step": 30851 + }, + { + "epoch": 2.629506520071593, + "grad_norm": 75.5925740402431, + "learning_rate": 4.5765332300646e-07, + "loss": 1.4889, + "step": 30852 + }, + { + "epoch": 2.6295917497656185, + "grad_norm": 72.00955266410415, + "learning_rate": 4.5744610499429244e-07, + "loss": 1.9633, + "step": 30853 + }, + { + "epoch": 2.629676979459644, + "grad_norm": 42.500685915423716, + "learning_rate": 4.5723893165633806e-07, + "loss": 1.1941, + "step": 30854 + }, + { + "epoch": 2.629762209153669, + "grad_norm": 59.06309384880901, + "learning_rate": 4.5703180299463414e-07, + "loss": 1.1572, + "step": 30855 + }, + { + "epoch": 2.6298474388476945, + "grad_norm": 51.846176103861275, + "learning_rate": 4.5682471901121684e-07, + "loss": 1.3741, + "step": 30856 + }, + { + "epoch": 2.62993266854172, + "grad_norm": 55.662345034844996, + "learning_rate": 4.566176797081223e-07, + "loss": 0.9468, + "step": 30857 + }, + { + "epoch": 2.6300178982357454, + "grad_norm": 65.48768097777378, + "learning_rate": 4.564106850873884e-07, + "loss": 1.8734, + "step": 30858 + }, + { + "epoch": 2.630103127929771, + "grad_norm": 58.797769363919315, + "learning_rate": 4.5620373515104897e-07, + "loss": 2.1863, + "step": 30859 + }, + { + "epoch": 2.630188357623796, + "grad_norm": 13.99542497277476, + "learning_rate": 4.5599682990114135e-07, + "loss": 0.4715, + "step": 30860 + }, + { + "epoch": 2.630273587317822, + "grad_norm": 63.12665418011753, + "learning_rate": 4.557899693396994e-07, + "loss": 1.8155, + "step": 30861 + }, + { + "epoch": 2.630358817011847, + "grad_norm": 26.007565985139262, + "learning_rate": 4.5558315346875604e-07, + "loss": 0.4035, + "step": 30862 + }, + { + "epoch": 2.6304440467058723, + "grad_norm": 37.130668127705285, + "learning_rate": 4.5537638229034786e-07, + "loss": 1.0282, + "step": 30863 + }, + { + "epoch": 2.6305292763998978, + "grad_norm": 71.65977618173571, + "learning_rate": 4.5516965580650717e-07, + "loss": 1.886, + "step": 30864 + }, + { + "epoch": 2.630614506093923, + "grad_norm": 41.08260571330744, + "learning_rate": 4.549629740192668e-07, + "loss": 1.1783, + "step": 30865 + }, + { + "epoch": 2.6306997357879487, + "grad_norm": 24.96366722946035, + "learning_rate": 4.54756336930659e-07, + "loss": 1.3402, + "step": 30866 + }, + { + "epoch": 2.6307849654819737, + "grad_norm": 64.23420569927025, + "learning_rate": 4.5454974454271606e-07, + "loss": 1.0802, + "step": 30867 + }, + { + "epoch": 2.630870195175999, + "grad_norm": 65.36459580657058, + "learning_rate": 4.543431968574713e-07, + "loss": 1.3461, + "step": 30868 + }, + { + "epoch": 2.6309554248700247, + "grad_norm": 54.211975334502576, + "learning_rate": 4.541366938769548e-07, + "loss": 1.3462, + "step": 30869 + }, + { + "epoch": 2.63104065456405, + "grad_norm": 58.97158119836368, + "learning_rate": 4.5393023560319714e-07, + "loss": 1.6363, + "step": 30870 + }, + { + "epoch": 2.6311258842580756, + "grad_norm": 49.51232564722282, + "learning_rate": 4.537238220382295e-07, + "loss": 1.8545, + "step": 30871 + }, + { + "epoch": 2.631211113952101, + "grad_norm": 90.53318355066251, + "learning_rate": 4.5351745318408194e-07, + "loss": 2.5298, + "step": 30872 + }, + { + "epoch": 2.6312963436461265, + "grad_norm": 49.71740204909495, + "learning_rate": 4.533111290427833e-07, + "loss": 1.5461, + "step": 30873 + }, + { + "epoch": 2.6313815733401515, + "grad_norm": 61.00048367118334, + "learning_rate": 4.531048496163626e-07, + "loss": 1.8749, + "step": 30874 + }, + { + "epoch": 2.631466803034177, + "grad_norm": 24.09027096268817, + "learning_rate": 4.528986149068482e-07, + "loss": 1.1444, + "step": 30875 + }, + { + "epoch": 2.6315520327282025, + "grad_norm": 59.5565148496274, + "learning_rate": 4.5269242491627066e-07, + "loss": 1.2809, + "step": 30876 + }, + { + "epoch": 2.631637262422228, + "grad_norm": 49.52326063591186, + "learning_rate": 4.5248627964665613e-07, + "loss": 1.4854, + "step": 30877 + }, + { + "epoch": 2.6317224921162534, + "grad_norm": 65.89804387471342, + "learning_rate": 4.52280179100032e-07, + "loss": 1.1213, + "step": 30878 + }, + { + "epoch": 2.6318077218102784, + "grad_norm": 50.35767056557356, + "learning_rate": 4.520741232784237e-07, + "loss": 1.2974, + "step": 30879 + }, + { + "epoch": 2.6318929515043044, + "grad_norm": 66.5284926684651, + "learning_rate": 4.5186811218386085e-07, + "loss": 2.0562, + "step": 30880 + }, + { + "epoch": 2.6319781811983294, + "grad_norm": 44.92143774587364, + "learning_rate": 4.516621458183679e-07, + "loss": 0.8161, + "step": 30881 + }, + { + "epoch": 2.632063410892355, + "grad_norm": 30.963804091865462, + "learning_rate": 4.5145622418396926e-07, + "loss": 1.0113, + "step": 30882 + }, + { + "epoch": 2.6321486405863803, + "grad_norm": 66.25671919195045, + "learning_rate": 4.512503472826912e-07, + "loss": 1.482, + "step": 30883 + }, + { + "epoch": 2.632233870280406, + "grad_norm": 34.484006350051004, + "learning_rate": 4.510445151165593e-07, + "loss": 0.9454, + "step": 30884 + }, + { + "epoch": 2.6323190999744313, + "grad_norm": 52.96395190474283, + "learning_rate": 4.5083872768759696e-07, + "loss": 1.2824, + "step": 30885 + }, + { + "epoch": 2.6324043296684563, + "grad_norm": 49.368387488510116, + "learning_rate": 4.506329849978286e-07, + "loss": 1.1048, + "step": 30886 + }, + { + "epoch": 2.6324895593624817, + "grad_norm": 42.9667728486533, + "learning_rate": 4.504272870492754e-07, + "loss": 1.2462, + "step": 30887 + }, + { + "epoch": 2.632574789056507, + "grad_norm": 41.187171270672955, + "learning_rate": 4.502216338439636e-07, + "loss": 1.2349, + "step": 30888 + }, + { + "epoch": 2.6326600187505327, + "grad_norm": 38.68813493186054, + "learning_rate": 4.5001602538391364e-07, + "loss": 1.3247, + "step": 30889 + }, + { + "epoch": 2.632745248444558, + "grad_norm": 54.69644991862668, + "learning_rate": 4.4981046167114794e-07, + "loss": 1.3642, + "step": 30890 + }, + { + "epoch": 2.6328304781385836, + "grad_norm": 20.865073198455566, + "learning_rate": 4.496049427076882e-07, + "loss": 0.8488, + "step": 30891 + }, + { + "epoch": 2.632915707832609, + "grad_norm": 37.51083191984703, + "learning_rate": 4.493994684955555e-07, + "loss": 1.494, + "step": 30892 + }, + { + "epoch": 2.633000937526634, + "grad_norm": 30.126366473527796, + "learning_rate": 4.491940390367716e-07, + "loss": 1.045, + "step": 30893 + }, + { + "epoch": 2.6330861672206596, + "grad_norm": 36.02014597031724, + "learning_rate": 4.4898865433335547e-07, + "loss": 1.3078, + "step": 30894 + }, + { + "epoch": 2.633171396914685, + "grad_norm": 84.35322411210676, + "learning_rate": 4.4878331438732824e-07, + "loss": 2.0543, + "step": 30895 + }, + { + "epoch": 2.6332566266087105, + "grad_norm": 23.407276182892236, + "learning_rate": 4.485780192007072e-07, + "loss": 0.8566, + "step": 30896 + }, + { + "epoch": 2.633341856302736, + "grad_norm": 58.194989221198576, + "learning_rate": 4.4837276877551406e-07, + "loss": 1.1238, + "step": 30897 + }, + { + "epoch": 2.633427085996761, + "grad_norm": 58.18638130185443, + "learning_rate": 4.4816756311376497e-07, + "loss": 1.6614, + "step": 30898 + }, + { + "epoch": 2.633512315690787, + "grad_norm": 49.74085534447649, + "learning_rate": 4.4796240221748055e-07, + "loss": 1.1312, + "step": 30899 + }, + { + "epoch": 2.633597545384812, + "grad_norm": 55.59652291829649, + "learning_rate": 4.477572860886753e-07, + "loss": 1.2777, + "step": 30900 + }, + { + "epoch": 2.6336827750788374, + "grad_norm": 37.15032960391167, + "learning_rate": 4.4755221472936985e-07, + "loss": 1.1232, + "step": 30901 + }, + { + "epoch": 2.633768004772863, + "grad_norm": 58.52879613079016, + "learning_rate": 4.473471881415792e-07, + "loss": 1.5588, + "step": 30902 + }, + { + "epoch": 2.6338532344668883, + "grad_norm": 27.256874661569483, + "learning_rate": 4.471422063273201e-07, + "loss": 0.757, + "step": 30903 + }, + { + "epoch": 2.633938464160914, + "grad_norm": 57.84611295069294, + "learning_rate": 4.4693726928860816e-07, + "loss": 1.9536, + "step": 30904 + }, + { + "epoch": 2.634023693854939, + "grad_norm": 68.35892043054449, + "learning_rate": 4.4673237702745787e-07, + "loss": 1.6544, + "step": 30905 + }, + { + "epoch": 2.6341089235489643, + "grad_norm": 35.47855759527435, + "learning_rate": 4.465275295458854e-07, + "loss": 1.9084, + "step": 30906 + }, + { + "epoch": 2.6341941532429898, + "grad_norm": 40.61007727320058, + "learning_rate": 4.4632272684590637e-07, + "loss": 1.3147, + "step": 30907 + }, + { + "epoch": 2.6342793829370152, + "grad_norm": 81.34389708951079, + "learning_rate": 4.461179689295342e-07, + "loss": 1.5609, + "step": 30908 + }, + { + "epoch": 2.6343646126310407, + "grad_norm": 29.610295526519277, + "learning_rate": 4.4591325579878055e-07, + "loss": 0.6956, + "step": 30909 + }, + { + "epoch": 2.634449842325066, + "grad_norm": 41.9348574894118, + "learning_rate": 4.4570858745566216e-07, + "loss": 1.1388, + "step": 30910 + }, + { + "epoch": 2.6345350720190917, + "grad_norm": 37.04116629901645, + "learning_rate": 4.4550396390218966e-07, + "loss": 1.0471, + "step": 30911 + }, + { + "epoch": 2.6346203017131167, + "grad_norm": 68.59542537326764, + "learning_rate": 4.4529938514037585e-07, + "loss": 1.6677, + "step": 30912 + }, + { + "epoch": 2.634705531407142, + "grad_norm": 52.801209943546496, + "learning_rate": 4.4509485117223197e-07, + "loss": 1.3634, + "step": 30913 + }, + { + "epoch": 2.6347907611011676, + "grad_norm": 41.88256395117136, + "learning_rate": 4.448903619997702e-07, + "loss": 1.3454, + "step": 30914 + }, + { + "epoch": 2.634875990795193, + "grad_norm": 54.84720328386401, + "learning_rate": 4.446859176250023e-07, + "loss": 1.7623, + "step": 30915 + }, + { + "epoch": 2.6349612204892185, + "grad_norm": 54.40577110830462, + "learning_rate": 4.44481518049939e-07, + "loss": 1.1481, + "step": 30916 + }, + { + "epoch": 2.6350464501832436, + "grad_norm": 67.89764035089839, + "learning_rate": 4.442771632765891e-07, + "loss": 1.1794, + "step": 30917 + }, + { + "epoch": 2.6351316798772695, + "grad_norm": 39.46512487915562, + "learning_rate": 4.440728533069627e-07, + "loss": 1.4002, + "step": 30918 + }, + { + "epoch": 2.6352169095712945, + "grad_norm": 32.67925960159256, + "learning_rate": 4.438685881430699e-07, + "loss": 1.3202, + "step": 30919 + }, + { + "epoch": 2.63530213926532, + "grad_norm": 33.641174875309076, + "learning_rate": 4.43664367786919e-07, + "loss": 1.2885, + "step": 30920 + }, + { + "epoch": 2.6353873689593454, + "grad_norm": 65.71743551611853, + "learning_rate": 4.4346019224051795e-07, + "loss": 1.397, + "step": 30921 + }, + { + "epoch": 2.635472598653371, + "grad_norm": 48.010794433252904, + "learning_rate": 4.432560615058756e-07, + "loss": 1.7005, + "step": 30922 + }, + { + "epoch": 2.6355578283473964, + "grad_norm": 33.330165543603734, + "learning_rate": 4.430519755849999e-07, + "loss": 0.9386, + "step": 30923 + }, + { + "epoch": 2.6356430580414214, + "grad_norm": 61.707258024414465, + "learning_rate": 4.4284793447989694e-07, + "loss": 1.9235, + "step": 30924 + }, + { + "epoch": 2.635728287735447, + "grad_norm": 22.042911927785575, + "learning_rate": 4.42643938192574e-07, + "loss": 0.8668, + "step": 30925 + }, + { + "epoch": 2.6358135174294723, + "grad_norm": 16.956924277712638, + "learning_rate": 4.424399867250362e-07, + "loss": 0.3856, + "step": 30926 + }, + { + "epoch": 2.635898747123498, + "grad_norm": 63.69629759301333, + "learning_rate": 4.4223608007929074e-07, + "loss": 1.5911, + "step": 30927 + }, + { + "epoch": 2.6359839768175233, + "grad_norm": 38.89489304262828, + "learning_rate": 4.420322182573422e-07, + "loss": 0.9919, + "step": 30928 + }, + { + "epoch": 2.6360692065115487, + "grad_norm": 56.13802504402829, + "learning_rate": 4.418284012611951e-07, + "loss": 1.4829, + "step": 30929 + }, + { + "epoch": 2.636154436205574, + "grad_norm": 55.18192758593673, + "learning_rate": 4.416246290928544e-07, + "loss": 1.2849, + "step": 30930 + }, + { + "epoch": 2.6362396658995992, + "grad_norm": 33.40982545754679, + "learning_rate": 4.414209017543253e-07, + "loss": 0.6703, + "step": 30931 + }, + { + "epoch": 2.6363248955936247, + "grad_norm": 23.5253736038134, + "learning_rate": 4.4121721924760996e-07, + "loss": 0.6475, + "step": 30932 + }, + { + "epoch": 2.63641012528765, + "grad_norm": 32.79184886833058, + "learning_rate": 4.410135815747113e-07, + "loss": 1.1173, + "step": 30933 + }, + { + "epoch": 2.6364953549816756, + "grad_norm": 23.85766703185424, + "learning_rate": 4.4080998873763324e-07, + "loss": 0.9438, + "step": 30934 + }, + { + "epoch": 2.636580584675701, + "grad_norm": 34.77297461935203, + "learning_rate": 4.406064407383759e-07, + "loss": 0.7957, + "step": 30935 + }, + { + "epoch": 2.636665814369726, + "grad_norm": 49.27041963436334, + "learning_rate": 4.404029375789437e-07, + "loss": 0.9376, + "step": 30936 + }, + { + "epoch": 2.636751044063752, + "grad_norm": 51.278322331367335, + "learning_rate": 4.4019947926133566e-07, + "loss": 1.457, + "step": 30937 + }, + { + "epoch": 2.636836273757777, + "grad_norm": 22.547109182146972, + "learning_rate": 4.399960657875552e-07, + "loss": 0.6746, + "step": 30938 + }, + { + "epoch": 2.6369215034518025, + "grad_norm": 27.43799413051278, + "learning_rate": 4.3979269715960005e-07, + "loss": 0.9014, + "step": 30939 + }, + { + "epoch": 2.637006733145828, + "grad_norm": 38.829901075705045, + "learning_rate": 4.395893733794732e-07, + "loss": 1.2182, + "step": 30940 + }, + { + "epoch": 2.6370919628398535, + "grad_norm": 62.41930301560272, + "learning_rate": 4.3938609444917235e-07, + "loss": 1.7228, + "step": 30941 + }, + { + "epoch": 2.637177192533879, + "grad_norm": 88.37303831871742, + "learning_rate": 4.3918286037069655e-07, + "loss": 1.6004, + "step": 30942 + }, + { + "epoch": 2.637262422227904, + "grad_norm": 22.237773350275894, + "learning_rate": 4.389796711460448e-07, + "loss": 0.7587, + "step": 30943 + }, + { + "epoch": 2.6373476519219294, + "grad_norm": 75.55323337059586, + "learning_rate": 4.387765267772165e-07, + "loss": 1.4643, + "step": 30944 + }, + { + "epoch": 2.637432881615955, + "grad_norm": 54.067347052448994, + "learning_rate": 4.3857342726620735e-07, + "loss": 1.7534, + "step": 30945 + }, + { + "epoch": 2.6375181113099804, + "grad_norm": 59.66173959366531, + "learning_rate": 4.3837037261501734e-07, + "loss": 1.5935, + "step": 30946 + }, + { + "epoch": 2.637603341004006, + "grad_norm": 48.66702330900031, + "learning_rate": 4.3816736282564163e-07, + "loss": 1.3303, + "step": 30947 + }, + { + "epoch": 2.6376885706980313, + "grad_norm": 39.42111787521964, + "learning_rate": 4.379643979000764e-07, + "loss": 0.8356, + "step": 30948 + }, + { + "epoch": 2.6377738003920568, + "grad_norm": 59.23972503371458, + "learning_rate": 4.3776147784032e-07, + "loss": 1.2967, + "step": 30949 + }, + { + "epoch": 2.637859030086082, + "grad_norm": 44.913001925774395, + "learning_rate": 4.375586026483658e-07, + "loss": 1.4653, + "step": 30950 + }, + { + "epoch": 2.6379442597801073, + "grad_norm": 59.48662053813711, + "learning_rate": 4.373557723262106e-07, + "loss": 1.1508, + "step": 30951 + }, + { + "epoch": 2.6380294894741327, + "grad_norm": 38.463194404685126, + "learning_rate": 4.3715298687584673e-07, + "loss": 0.9138, + "step": 30952 + }, + { + "epoch": 2.638114719168158, + "grad_norm": 53.487557445303814, + "learning_rate": 4.3695024629927087e-07, + "loss": 1.5561, + "step": 30953 + }, + { + "epoch": 2.6381999488621837, + "grad_norm": 37.470975840648464, + "learning_rate": 4.367475505984764e-07, + "loss": 1.5963, + "step": 30954 + }, + { + "epoch": 2.638285178556209, + "grad_norm": 60.543789282572604, + "learning_rate": 4.3654489977545676e-07, + "loss": 1.6907, + "step": 30955 + }, + { + "epoch": 2.6383704082502346, + "grad_norm": 82.75226073669977, + "learning_rate": 4.3634229383220374e-07, + "loss": 3.0647, + "step": 30956 + }, + { + "epoch": 2.6384556379442596, + "grad_norm": 45.63955881310409, + "learning_rate": 4.3613973277071175e-07, + "loss": 1.2857, + "step": 30957 + }, + { + "epoch": 2.638540867638285, + "grad_norm": 59.13108864617921, + "learning_rate": 4.3593721659297207e-07, + "loss": 1.067, + "step": 30958 + }, + { + "epoch": 2.6386260973323106, + "grad_norm": 29.146433509085682, + "learning_rate": 4.357347453009764e-07, + "loss": 0.5836, + "step": 30959 + }, + { + "epoch": 2.638711327026336, + "grad_norm": 49.79041868779433, + "learning_rate": 4.355323188967148e-07, + "loss": 1.0917, + "step": 30960 + }, + { + "epoch": 2.6387965567203615, + "grad_norm": 30.04336414740391, + "learning_rate": 4.35329937382179e-07, + "loss": 1.3171, + "step": 30961 + }, + { + "epoch": 2.6388817864143865, + "grad_norm": 37.428862312859486, + "learning_rate": 4.351276007593602e-07, + "loss": 0.9168, + "step": 30962 + }, + { + "epoch": 2.6389670161084124, + "grad_norm": 61.1483442713139, + "learning_rate": 4.34925309030248e-07, + "loss": 1.4226, + "step": 30963 + }, + { + "epoch": 2.6390522458024375, + "grad_norm": 77.01857530680434, + "learning_rate": 4.3472306219683125e-07, + "loss": 1.8267, + "step": 30964 + }, + { + "epoch": 2.639137475496463, + "grad_norm": 47.75367066666423, + "learning_rate": 4.3452086026109843e-07, + "loss": 1.4257, + "step": 30965 + }, + { + "epoch": 2.6392227051904884, + "grad_norm": 59.02979589651727, + "learning_rate": 4.34318703225039e-07, + "loss": 1.3399, + "step": 30966 + }, + { + "epoch": 2.639307934884514, + "grad_norm": 65.08169143269264, + "learning_rate": 4.341165910906414e-07, + "loss": 1.3727, + "step": 30967 + }, + { + "epoch": 2.6393931645785393, + "grad_norm": 52.04200157856267, + "learning_rate": 4.3391452385989186e-07, + "loss": 1.4029, + "step": 30968 + }, + { + "epoch": 2.6394783942725644, + "grad_norm": 61.539250280084744, + "learning_rate": 4.3371250153477875e-07, + "loss": 1.3, + "step": 30969 + }, + { + "epoch": 2.63956362396659, + "grad_norm": 25.93539438900073, + "learning_rate": 4.3351052411728936e-07, + "loss": 0.8871, + "step": 30970 + }, + { + "epoch": 2.6396488536606153, + "grad_norm": 70.55705151257844, + "learning_rate": 4.3330859160940985e-07, + "loss": 1.9447, + "step": 30971 + }, + { + "epoch": 2.6397340833546408, + "grad_norm": 21.189184893873524, + "learning_rate": 4.3310670401312536e-07, + "loss": 0.7397, + "step": 30972 + }, + { + "epoch": 2.6398193130486662, + "grad_norm": 41.59565745305429, + "learning_rate": 4.3290486133042043e-07, + "loss": 1.0373, + "step": 30973 + }, + { + "epoch": 2.6399045427426917, + "grad_norm": 210.25570704137624, + "learning_rate": 4.3270306356328284e-07, + "loss": 1.3987, + "step": 30974 + }, + { + "epoch": 2.639989772436717, + "grad_norm": 67.43604311377317, + "learning_rate": 4.325013107136955e-07, + "loss": 1.6852, + "step": 30975 + }, + { + "epoch": 2.640075002130742, + "grad_norm": 24.793957248923032, + "learning_rate": 4.3229960278364234e-07, + "loss": 0.9597, + "step": 30976 + }, + { + "epoch": 2.6401602318247677, + "grad_norm": 45.592735752581774, + "learning_rate": 4.3209793977510794e-07, + "loss": 1.5483, + "step": 30977 + }, + { + "epoch": 2.640245461518793, + "grad_norm": 45.31406480448245, + "learning_rate": 4.3189632169007456e-07, + "loss": 1.2438, + "step": 30978 + }, + { + "epoch": 2.6403306912128186, + "grad_norm": 69.69973887504291, + "learning_rate": 4.3169474853052674e-07, + "loss": 1.4334, + "step": 30979 + }, + { + "epoch": 2.640415920906844, + "grad_norm": 50.6693054753426, + "learning_rate": 4.3149322029844564e-07, + "loss": 1.9047, + "step": 30980 + }, + { + "epoch": 2.640501150600869, + "grad_norm": 37.69917615700617, + "learning_rate": 4.3129173699581364e-07, + "loss": 0.8801, + "step": 30981 + }, + { + "epoch": 2.640586380294895, + "grad_norm": 24.631687044670763, + "learning_rate": 4.310902986246107e-07, + "loss": 1.2297, + "step": 30982 + }, + { + "epoch": 2.64067160998892, + "grad_norm": 47.92903349432796, + "learning_rate": 4.308889051868198e-07, + "loss": 1.5477, + "step": 30983 + }, + { + "epoch": 2.6407568396829455, + "grad_norm": 60.412348969672095, + "learning_rate": 4.3068755668442044e-07, + "loss": 1.4065, + "step": 30984 + }, + { + "epoch": 2.640842069376971, + "grad_norm": 61.349477826148004, + "learning_rate": 4.3048625311939427e-07, + "loss": 1.042, + "step": 30985 + }, + { + "epoch": 2.6409272990709964, + "grad_norm": 33.790381795477686, + "learning_rate": 4.302849944937193e-07, + "loss": 0.9491, + "step": 30986 + }, + { + "epoch": 2.641012528765022, + "grad_norm": 23.68168355532604, + "learning_rate": 4.300837808093761e-07, + "loss": 0.7795, + "step": 30987 + }, + { + "epoch": 2.641097758459047, + "grad_norm": 86.30219750249863, + "learning_rate": 4.2988261206834305e-07, + "loss": 2.2553, + "step": 30988 + }, + { + "epoch": 2.6411829881530724, + "grad_norm": 42.11786637816829, + "learning_rate": 4.2968148827259857e-07, + "loss": 1.3023, + "step": 30989 + }, + { + "epoch": 2.641268217847098, + "grad_norm": 62.84080396933827, + "learning_rate": 4.294804094241206e-07, + "loss": 1.4261, + "step": 30990 + }, + { + "epoch": 2.6413534475411233, + "grad_norm": 42.80477305312138, + "learning_rate": 4.2927937552488583e-07, + "loss": 1.3402, + "step": 30991 + }, + { + "epoch": 2.641438677235149, + "grad_norm": 27.14242162111682, + "learning_rate": 4.2907838657687216e-07, + "loss": 1.0073, + "step": 30992 + }, + { + "epoch": 2.6415239069291743, + "grad_norm": 64.10496423506893, + "learning_rate": 4.288774425820569e-07, + "loss": 1.9694, + "step": 30993 + }, + { + "epoch": 2.6416091366231997, + "grad_norm": 50.50457207627986, + "learning_rate": 4.286765435424162e-07, + "loss": 0.5654, + "step": 30994 + }, + { + "epoch": 2.6416943663172248, + "grad_norm": 37.89938640452328, + "learning_rate": 4.2847568945992345e-07, + "loss": 1.3239, + "step": 30995 + }, + { + "epoch": 2.6417795960112502, + "grad_norm": 42.168784533314756, + "learning_rate": 4.282748803365572e-07, + "loss": 1.0307, + "step": 30996 + }, + { + "epoch": 2.6418648257052757, + "grad_norm": 56.469729066899504, + "learning_rate": 4.2807411617429074e-07, + "loss": 1.5353, + "step": 30997 + }, + { + "epoch": 2.641950055399301, + "grad_norm": 36.275151700358094, + "learning_rate": 4.278733969750992e-07, + "loss": 0.9041, + "step": 30998 + }, + { + "epoch": 2.6420352850933266, + "grad_norm": 24.49062668128755, + "learning_rate": 4.276727227409544e-07, + "loss": 0.9264, + "step": 30999 + }, + { + "epoch": 2.6421205147873517, + "grad_norm": 62.636615230133046, + "learning_rate": 4.274720934738319e-07, + "loss": 1.0008, + "step": 31000 + }, + { + "epoch": 2.6422057444813776, + "grad_norm": 43.37994306341752, + "learning_rate": 4.2727150917570513e-07, + "loss": 1.1399, + "step": 31001 + }, + { + "epoch": 2.6422909741754026, + "grad_norm": 23.485948181545474, + "learning_rate": 4.270709698485459e-07, + "loss": 0.7278, + "step": 31002 + }, + { + "epoch": 2.642376203869428, + "grad_norm": 27.07605883178261, + "learning_rate": 4.268704754943259e-07, + "loss": 0.6077, + "step": 31003 + }, + { + "epoch": 2.6424614335634535, + "grad_norm": 46.14044744446332, + "learning_rate": 4.2667002611501864e-07, + "loss": 1.3108, + "step": 31004 + }, + { + "epoch": 2.642546663257479, + "grad_norm": 49.5334262870384, + "learning_rate": 4.264696217125941e-07, + "loss": 1.1762, + "step": 31005 + }, + { + "epoch": 2.6426318929515045, + "grad_norm": 25.60916423177588, + "learning_rate": 4.2626926228902356e-07, + "loss": 0.8394, + "step": 31006 + }, + { + "epoch": 2.6427171226455295, + "grad_norm": 55.056570942024166, + "learning_rate": 4.260689478462765e-07, + "loss": 1.5117, + "step": 31007 + }, + { + "epoch": 2.642802352339555, + "grad_norm": 54.43441468059017, + "learning_rate": 4.2586867838632364e-07, + "loss": 1.2527, + "step": 31008 + }, + { + "epoch": 2.6428875820335804, + "grad_norm": 46.28477106115654, + "learning_rate": 4.256684539111361e-07, + "loss": 0.8153, + "step": 31009 + }, + { + "epoch": 2.642972811727606, + "grad_norm": 28.34189431650742, + "learning_rate": 4.2546827442268124e-07, + "loss": 0.7591, + "step": 31010 + }, + { + "epoch": 2.6430580414216314, + "grad_norm": 37.68890346794905, + "learning_rate": 4.252681399229286e-07, + "loss": 1.421, + "step": 31011 + }, + { + "epoch": 2.643143271115657, + "grad_norm": 45.13336911039465, + "learning_rate": 4.250680504138449e-07, + "loss": 1.4276, + "step": 31012 + }, + { + "epoch": 2.6432285008096823, + "grad_norm": 30.91180263946696, + "learning_rate": 4.2486800589739973e-07, + "loss": 1.0445, + "step": 31013 + }, + { + "epoch": 2.6433137305037073, + "grad_norm": 42.83550077868071, + "learning_rate": 4.246680063755598e-07, + "loss": 1.3207, + "step": 31014 + }, + { + "epoch": 2.643398960197733, + "grad_norm": 37.55275004241029, + "learning_rate": 4.244680518502914e-07, + "loss": 1.3208, + "step": 31015 + }, + { + "epoch": 2.6434841898917583, + "grad_norm": 58.13692851714552, + "learning_rate": 4.242681423235612e-07, + "loss": 1.2327, + "step": 31016 + }, + { + "epoch": 2.6435694195857837, + "grad_norm": 21.772273238947676, + "learning_rate": 4.2406827779733653e-07, + "loss": 0.6176, + "step": 31017 + }, + { + "epoch": 2.643654649279809, + "grad_norm": 65.52073173188069, + "learning_rate": 4.238684582735825e-07, + "loss": 2.0592, + "step": 31018 + }, + { + "epoch": 2.6437398789738342, + "grad_norm": 51.156605852404745, + "learning_rate": 4.2366868375426306e-07, + "loss": 1.1232, + "step": 31019 + }, + { + "epoch": 2.64382510866786, + "grad_norm": 57.02523929956853, + "learning_rate": 4.234689542413439e-07, + "loss": 1.4435, + "step": 31020 + }, + { + "epoch": 2.643910338361885, + "grad_norm": 28.91191143283373, + "learning_rate": 4.232692697367885e-07, + "loss": 1.3627, + "step": 31021 + }, + { + "epoch": 2.6439955680559106, + "grad_norm": 27.28355084641643, + "learning_rate": 4.230696302425619e-07, + "loss": 0.9469, + "step": 31022 + }, + { + "epoch": 2.644080797749936, + "grad_norm": 44.3422395473367, + "learning_rate": 4.228700357606258e-07, + "loss": 1.2404, + "step": 31023 + }, + { + "epoch": 2.6441660274439616, + "grad_norm": 41.77059337024502, + "learning_rate": 4.2267048629294537e-07, + "loss": 1.1113, + "step": 31024 + }, + { + "epoch": 2.644251257137987, + "grad_norm": 24.410197791296618, + "learning_rate": 4.224709818414807e-07, + "loss": 0.6833, + "step": 31025 + }, + { + "epoch": 2.644336486832012, + "grad_norm": 87.77405878151124, + "learning_rate": 4.222715224081958e-07, + "loss": 1.9449, + "step": 31026 + }, + { + "epoch": 2.6444217165260375, + "grad_norm": 78.10588447837182, + "learning_rate": 4.2207210799505184e-07, + "loss": 1.7487, + "step": 31027 + }, + { + "epoch": 2.644506946220063, + "grad_norm": 70.34002985263459, + "learning_rate": 4.218727386040089e-07, + "loss": 1.916, + "step": 31028 + }, + { + "epoch": 2.6445921759140885, + "grad_norm": 66.79788542236123, + "learning_rate": 4.2167341423702823e-07, + "loss": 1.68, + "step": 31029 + }, + { + "epoch": 2.644677405608114, + "grad_norm": 39.13047109480358, + "learning_rate": 4.21474134896071e-07, + "loss": 1.2356, + "step": 31030 + }, + { + "epoch": 2.6447626353021394, + "grad_norm": 67.29857077276507, + "learning_rate": 4.2127490058309515e-07, + "loss": 1.1526, + "step": 31031 + }, + { + "epoch": 2.644847864996165, + "grad_norm": 76.88280300462526, + "learning_rate": 4.2107571130006244e-07, + "loss": 2.1171, + "step": 31032 + }, + { + "epoch": 2.64493309469019, + "grad_norm": 38.8029432593333, + "learning_rate": 4.2087656704892953e-07, + "loss": 0.8553, + "step": 31033 + }, + { + "epoch": 2.6450183243842154, + "grad_norm": 61.86774515925369, + "learning_rate": 4.206774678316572e-07, + "loss": 1.8914, + "step": 31034 + }, + { + "epoch": 2.645103554078241, + "grad_norm": 47.07200013983464, + "learning_rate": 4.2047841365020216e-07, + "loss": 1.5556, + "step": 31035 + }, + { + "epoch": 2.6451887837722663, + "grad_norm": 77.05232759420161, + "learning_rate": 4.202794045065217e-07, + "loss": 1.5205, + "step": 31036 + }, + { + "epoch": 2.6452740134662918, + "grad_norm": 72.78773040156626, + "learning_rate": 4.200804404025743e-07, + "loss": 1.5917, + "step": 31037 + }, + { + "epoch": 2.645359243160317, + "grad_norm": 56.21111462235904, + "learning_rate": 4.198815213403146e-07, + "loss": 1.3789, + "step": 31038 + }, + { + "epoch": 2.6454444728543427, + "grad_norm": 22.195437597541563, + "learning_rate": 4.196826473217003e-07, + "loss": 0.7403, + "step": 31039 + }, + { + "epoch": 2.6455297025483677, + "grad_norm": 41.2635585246909, + "learning_rate": 4.1948381834868765e-07, + "loss": 1.4159, + "step": 31040 + }, + { + "epoch": 2.645614932242393, + "grad_norm": 66.89820389719412, + "learning_rate": 4.192850344232319e-07, + "loss": 1.7274, + "step": 31041 + }, + { + "epoch": 2.6457001619364187, + "grad_norm": 41.08648985898381, + "learning_rate": 4.1908629554728687e-07, + "loss": 0.8368, + "step": 31042 + }, + { + "epoch": 2.645785391630444, + "grad_norm": 63.64573300302305, + "learning_rate": 4.188876017228083e-07, + "loss": 2.1771, + "step": 31043 + }, + { + "epoch": 2.6458706213244696, + "grad_norm": 24.193223956824717, + "learning_rate": 4.1868895295175016e-07, + "loss": 1.1151, + "step": 31044 + }, + { + "epoch": 2.6459558510184946, + "grad_norm": 42.98546056071392, + "learning_rate": 4.1849034923606536e-07, + "loss": 1.1221, + "step": 31045 + }, + { + "epoch": 2.64604108071252, + "grad_norm": 48.045144843071014, + "learning_rate": 4.182917905777073e-07, + "loss": 0.8011, + "step": 31046 + }, + { + "epoch": 2.6461263104065456, + "grad_norm": 67.04684173392874, + "learning_rate": 4.1809327697862834e-07, + "loss": 1.591, + "step": 31047 + }, + { + "epoch": 2.646211540100571, + "grad_norm": 64.0639591694567, + "learning_rate": 4.178948084407819e-07, + "loss": 1.788, + "step": 31048 + }, + { + "epoch": 2.6462967697945965, + "grad_norm": 81.01184937840111, + "learning_rate": 4.1769638496611974e-07, + "loss": 1.8726, + "step": 31049 + }, + { + "epoch": 2.646381999488622, + "grad_norm": 69.7144317723455, + "learning_rate": 4.174980065565931e-07, + "loss": 1.6751, + "step": 31050 + }, + { + "epoch": 2.6464672291826474, + "grad_norm": 31.627515936965178, + "learning_rate": 4.17299673214151e-07, + "loss": 1.1524, + "step": 31051 + }, + { + "epoch": 2.6465524588766725, + "grad_norm": 74.80105501761507, + "learning_rate": 4.171013849407474e-07, + "loss": 1.9463, + "step": 31052 + }, + { + "epoch": 2.646637688570698, + "grad_norm": 52.97953255166933, + "learning_rate": 4.1690314173832955e-07, + "loss": 1.2036, + "step": 31053 + }, + { + "epoch": 2.6467229182647234, + "grad_norm": 41.737263723058824, + "learning_rate": 4.167049436088483e-07, + "loss": 1.3302, + "step": 31054 + }, + { + "epoch": 2.646808147958749, + "grad_norm": 72.22236138734746, + "learning_rate": 4.165067905542519e-07, + "loss": 2.0058, + "step": 31055 + }, + { + "epoch": 2.6468933776527743, + "grad_norm": 69.49234070676816, + "learning_rate": 4.1630868257649114e-07, + "loss": 1.536, + "step": 31056 + }, + { + "epoch": 2.6469786073467994, + "grad_norm": 84.21674798285686, + "learning_rate": 4.1611061967751334e-07, + "loss": 1.8, + "step": 31057 + }, + { + "epoch": 2.6470638370408253, + "grad_norm": 21.863686148466694, + "learning_rate": 4.159126018592657e-07, + "loss": 0.4701, + "step": 31058 + }, + { + "epoch": 2.6471490667348503, + "grad_norm": 64.10413991077134, + "learning_rate": 4.1571462912369573e-07, + "loss": 1.2203, + "step": 31059 + }, + { + "epoch": 2.6472342964288758, + "grad_norm": 53.59648680352824, + "learning_rate": 4.1551670147275127e-07, + "loss": 1.2733, + "step": 31060 + }, + { + "epoch": 2.6473195261229012, + "grad_norm": 44.967590746245406, + "learning_rate": 4.1531881890837845e-07, + "loss": 1.4724, + "step": 31061 + }, + { + "epoch": 2.6474047558169267, + "grad_norm": 71.04368828943568, + "learning_rate": 4.1512098143252243e-07, + "loss": 1.9373, + "step": 31062 + }, + { + "epoch": 2.647489985510952, + "grad_norm": 29.427618300500168, + "learning_rate": 4.149231890471306e-07, + "loss": 0.8405, + "step": 31063 + }, + { + "epoch": 2.647575215204977, + "grad_norm": 70.39552905849622, + "learning_rate": 4.147254417541463e-07, + "loss": 2.2099, + "step": 31064 + }, + { + "epoch": 2.6476604448990027, + "grad_norm": 52.94348391595071, + "learning_rate": 4.1452773955551586e-07, + "loss": 1.1462, + "step": 31065 + }, + { + "epoch": 2.647745674593028, + "grad_norm": 53.94652675651794, + "learning_rate": 4.143300824531832e-07, + "loss": 1.4137, + "step": 31066 + }, + { + "epoch": 2.6478309042870536, + "grad_norm": 26.78000259555376, + "learning_rate": 4.1413247044909235e-07, + "loss": 1.1533, + "step": 31067 + }, + { + "epoch": 2.647916133981079, + "grad_norm": 36.635973337185035, + "learning_rate": 4.139349035451851e-07, + "loss": 1.3118, + "step": 31068 + }, + { + "epoch": 2.6480013636751045, + "grad_norm": 72.41208030822283, + "learning_rate": 4.1373738174340707e-07, + "loss": 2.4304, + "step": 31069 + }, + { + "epoch": 2.64808659336913, + "grad_norm": 55.71544547011106, + "learning_rate": 4.135399050456984e-07, + "loss": 1.347, + "step": 31070 + }, + { + "epoch": 2.648171823063155, + "grad_norm": 54.019772221331884, + "learning_rate": 4.1334247345400314e-07, + "loss": 1.0835, + "step": 31071 + }, + { + "epoch": 2.6482570527571805, + "grad_norm": 45.79341123604108, + "learning_rate": 4.1314508697026134e-07, + "loss": 1.2598, + "step": 31072 + }, + { + "epoch": 2.648342282451206, + "grad_norm": 43.874856447301475, + "learning_rate": 4.1294774559641535e-07, + "loss": 1.1205, + "step": 31073 + }, + { + "epoch": 2.6484275121452314, + "grad_norm": 67.2442745482433, + "learning_rate": 4.1275044933440646e-07, + "loss": 1.4939, + "step": 31074 + }, + { + "epoch": 2.648512741839257, + "grad_norm": 18.01657391843503, + "learning_rate": 4.125531981861736e-07, + "loss": 0.7253, + "step": 31075 + }, + { + "epoch": 2.6485979715332824, + "grad_norm": 78.04460689976942, + "learning_rate": 4.1235599215365573e-07, + "loss": 1.8803, + "step": 31076 + }, + { + "epoch": 2.648683201227308, + "grad_norm": 26.772715452262446, + "learning_rate": 4.121588312387953e-07, + "loss": 0.5503, + "step": 31077 + }, + { + "epoch": 2.648768430921333, + "grad_norm": 31.773048264700105, + "learning_rate": 4.11961715443529e-07, + "loss": 0.8998, + "step": 31078 + }, + { + "epoch": 2.6488536606153583, + "grad_norm": 47.33020974357834, + "learning_rate": 4.117646447697965e-07, + "loss": 1.4112, + "step": 31079 + }, + { + "epoch": 2.648938890309384, + "grad_norm": 39.285039768778375, + "learning_rate": 4.115676192195356e-07, + "loss": 1.0884, + "step": 31080 + }, + { + "epoch": 2.6490241200034093, + "grad_norm": 19.12462812497856, + "learning_rate": 4.113706387946831e-07, + "loss": 0.5574, + "step": 31081 + }, + { + "epoch": 2.6491093496974347, + "grad_norm": 58.30108260462725, + "learning_rate": 4.11173703497178e-07, + "loss": 1.7079, + "step": 31082 + }, + { + "epoch": 2.6491945793914597, + "grad_norm": 76.22552847783822, + "learning_rate": 4.10976813328956e-07, + "loss": 1.7626, + "step": 31083 + }, + { + "epoch": 2.6492798090854857, + "grad_norm": 27.243317565357668, + "learning_rate": 4.1077996829195385e-07, + "loss": 1.1385, + "step": 31084 + }, + { + "epoch": 2.6493650387795107, + "grad_norm": 30.177869424728335, + "learning_rate": 4.105831683881062e-07, + "loss": 1.0812, + "step": 31085 + }, + { + "epoch": 2.649450268473536, + "grad_norm": 66.37204753300128, + "learning_rate": 4.103864136193492e-07, + "loss": 1.4738, + "step": 31086 + }, + { + "epoch": 2.6495354981675616, + "grad_norm": 29.814110685497727, + "learning_rate": 4.1018970398761906e-07, + "loss": 1.1785, + "step": 31087 + }, + { + "epoch": 2.649620727861587, + "grad_norm": 39.80018260040512, + "learning_rate": 4.0999303949484924e-07, + "loss": 0.8287, + "step": 31088 + }, + { + "epoch": 2.6497059575556126, + "grad_norm": 61.33406919478943, + "learning_rate": 4.097964201429733e-07, + "loss": 1.4528, + "step": 31089 + }, + { + "epoch": 2.6497911872496376, + "grad_norm": 76.56838036840576, + "learning_rate": 4.0959984593392623e-07, + "loss": 2.136, + "step": 31090 + }, + { + "epoch": 2.649876416943663, + "grad_norm": 47.97021351606051, + "learning_rate": 4.0940331686964096e-07, + "loss": 1.3959, + "step": 31091 + }, + { + "epoch": 2.6499616466376885, + "grad_norm": 69.20756002565133, + "learning_rate": 4.092068329520493e-07, + "loss": 2.5928, + "step": 31092 + }, + { + "epoch": 2.650046876331714, + "grad_norm": 50.96833575141718, + "learning_rate": 4.090103941830842e-07, + "loss": 1.5362, + "step": 31093 + }, + { + "epoch": 2.6501321060257395, + "grad_norm": 46.54997971133869, + "learning_rate": 4.0881400056467734e-07, + "loss": 0.9731, + "step": 31094 + }, + { + "epoch": 2.650217335719765, + "grad_norm": 36.497253063249424, + "learning_rate": 4.0861765209876115e-07, + "loss": 0.8236, + "step": 31095 + }, + { + "epoch": 2.6503025654137904, + "grad_norm": 49.62345151915003, + "learning_rate": 4.084213487872657e-07, + "loss": 1.7772, + "step": 31096 + }, + { + "epoch": 2.6503877951078154, + "grad_norm": 67.92760865933452, + "learning_rate": 4.0822509063212224e-07, + "loss": 1.8484, + "step": 31097 + }, + { + "epoch": 2.650473024801841, + "grad_norm": 59.74387315334782, + "learning_rate": 4.0802887763525924e-07, + "loss": 1.4287, + "step": 31098 + }, + { + "epoch": 2.6505582544958664, + "grad_norm": 86.84811769912596, + "learning_rate": 4.0783270979860844e-07, + "loss": 1.533, + "step": 31099 + }, + { + "epoch": 2.650643484189892, + "grad_norm": 34.33601623491245, + "learning_rate": 4.076365871240984e-07, + "loss": 0.8138, + "step": 31100 + }, + { + "epoch": 2.6507287138839173, + "grad_norm": 48.26658510559747, + "learning_rate": 4.074405096136563e-07, + "loss": 1.3489, + "step": 31101 + }, + { + "epoch": 2.6508139435779423, + "grad_norm": 78.23031777752246, + "learning_rate": 4.0724447726921246e-07, + "loss": 1.9502, + "step": 31102 + }, + { + "epoch": 2.6508991732719682, + "grad_norm": 53.86348877327979, + "learning_rate": 4.0704849009269464e-07, + "loss": 1.5881, + "step": 31103 + }, + { + "epoch": 2.6509844029659932, + "grad_norm": 65.19946919008854, + "learning_rate": 4.068525480860303e-07, + "loss": 2.1092, + "step": 31104 + }, + { + "epoch": 2.6510696326600187, + "grad_norm": 28.326852499837894, + "learning_rate": 4.0665665125114605e-07, + "loss": 0.6159, + "step": 31105 + }, + { + "epoch": 2.651154862354044, + "grad_norm": 62.03740108809413, + "learning_rate": 4.064607995899672e-07, + "loss": 1.7224, + "step": 31106 + }, + { + "epoch": 2.6512400920480697, + "grad_norm": 81.33653498372495, + "learning_rate": 4.062649931044221e-07, + "loss": 2.1662, + "step": 31107 + }, + { + "epoch": 2.651325321742095, + "grad_norm": 37.20519417501118, + "learning_rate": 4.0606923179643543e-07, + "loss": 1.17, + "step": 31108 + }, + { + "epoch": 2.65141055143612, + "grad_norm": 76.2103629985212, + "learning_rate": 4.0587351566793165e-07, + "loss": 1.8597, + "step": 31109 + }, + { + "epoch": 2.6514957811301456, + "grad_norm": 88.40676218822111, + "learning_rate": 4.0567784472083703e-07, + "loss": 2.3206, + "step": 31110 + }, + { + "epoch": 2.651581010824171, + "grad_norm": 72.9234194617059, + "learning_rate": 4.0548221895707396e-07, + "loss": 2.0754, + "step": 31111 + }, + { + "epoch": 2.6516662405181965, + "grad_norm": 48.93118539427556, + "learning_rate": 4.052866383785692e-07, + "loss": 1.0067, + "step": 31112 + }, + { + "epoch": 2.651751470212222, + "grad_norm": 55.01009950097861, + "learning_rate": 4.0509110298724396e-07, + "loss": 1.8364, + "step": 31113 + }, + { + "epoch": 2.6518366999062475, + "grad_norm": 22.22964094533915, + "learning_rate": 4.048956127850223e-07, + "loss": 1.0853, + "step": 31114 + }, + { + "epoch": 2.651921929600273, + "grad_norm": 70.36754602943118, + "learning_rate": 4.0470016777382547e-07, + "loss": 1.9101, + "step": 31115 + }, + { + "epoch": 2.652007159294298, + "grad_norm": 79.55282809366967, + "learning_rate": 4.0450476795557745e-07, + "loss": 1.4179, + "step": 31116 + }, + { + "epoch": 2.6520923889883234, + "grad_norm": 31.555090928299222, + "learning_rate": 4.0430941333219784e-07, + "loss": 0.8368, + "step": 31117 + }, + { + "epoch": 2.652177618682349, + "grad_norm": 35.55595233618404, + "learning_rate": 4.0411410390560955e-07, + "loss": 1.4413, + "step": 31118 + }, + { + "epoch": 2.6522628483763744, + "grad_norm": 50.29142721381971, + "learning_rate": 4.039188396777327e-07, + "loss": 1.2144, + "step": 31119 + }, + { + "epoch": 2.6523480780704, + "grad_norm": 66.46951863999514, + "learning_rate": 4.0372362065048797e-07, + "loss": 1.8983, + "step": 31120 + }, + { + "epoch": 2.652433307764425, + "grad_norm": 70.82266408037783, + "learning_rate": 4.035284468257955e-07, + "loss": 1.9201, + "step": 31121 + }, + { + "epoch": 2.652518537458451, + "grad_norm": 65.79646507709478, + "learning_rate": 4.0333331820557433e-07, + "loss": 1.0034, + "step": 31122 + }, + { + "epoch": 2.652603767152476, + "grad_norm": 56.13635004310554, + "learning_rate": 4.0313823479174287e-07, + "loss": 1.8447, + "step": 31123 + }, + { + "epoch": 2.6526889968465013, + "grad_norm": 45.60391217221984, + "learning_rate": 4.0294319658622015e-07, + "loss": 1.4005, + "step": 31124 + }, + { + "epoch": 2.6527742265405267, + "grad_norm": 58.279439534617765, + "learning_rate": 4.0274820359092414e-07, + "loss": 1.1247, + "step": 31125 + }, + { + "epoch": 2.652859456234552, + "grad_norm": 44.898433683788426, + "learning_rate": 4.0255325580777325e-07, + "loss": 1.3869, + "step": 31126 + }, + { + "epoch": 2.6529446859285777, + "grad_norm": 35.561925426075575, + "learning_rate": 4.023583532386843e-07, + "loss": 1.3874, + "step": 31127 + }, + { + "epoch": 2.6530299156226027, + "grad_norm": 33.444629481016406, + "learning_rate": 4.0216349588557355e-07, + "loss": 1.2266, + "step": 31128 + }, + { + "epoch": 2.653115145316628, + "grad_norm": 28.247931348032214, + "learning_rate": 4.019686837503589e-07, + "loss": 0.9179, + "step": 31129 + }, + { + "epoch": 2.6532003750106536, + "grad_norm": 77.38850528301393, + "learning_rate": 4.0177391683495436e-07, + "loss": 1.5771, + "step": 31130 + }, + { + "epoch": 2.653285604704679, + "grad_norm": 56.5877843910127, + "learning_rate": 4.0157919514127675e-07, + "loss": 1.5117, + "step": 31131 + }, + { + "epoch": 2.6533708343987046, + "grad_norm": 50.7345024057637, + "learning_rate": 4.013845186712395e-07, + "loss": 1.7142, + "step": 31132 + }, + { + "epoch": 2.65345606409273, + "grad_norm": 45.988854068961906, + "learning_rate": 4.0118988742675835e-07, + "loss": 1.1194, + "step": 31133 + }, + { + "epoch": 2.6535412937867555, + "grad_norm": 70.38451192812407, + "learning_rate": 4.0099530140974785e-07, + "loss": 1.3515, + "step": 31134 + }, + { + "epoch": 2.6536265234807805, + "grad_norm": 34.44762794166864, + "learning_rate": 4.008007606221215e-07, + "loss": 1.4585, + "step": 31135 + }, + { + "epoch": 2.653711753174806, + "grad_norm": 53.126329659976804, + "learning_rate": 4.006062650657916e-07, + "loss": 1.6662, + "step": 31136 + }, + { + "epoch": 2.6537969828688315, + "grad_norm": 56.02245911894887, + "learning_rate": 4.004118147426711e-07, + "loss": 1.2608, + "step": 31137 + }, + { + "epoch": 2.653882212562857, + "grad_norm": 41.69105538540664, + "learning_rate": 4.002174096546735e-07, + "loss": 1.4992, + "step": 31138 + }, + { + "epoch": 2.6539674422568824, + "grad_norm": 55.86172451689059, + "learning_rate": 4.0002304980370944e-07, + "loss": 1.8398, + "step": 31139 + }, + { + "epoch": 2.6540526719509074, + "grad_norm": 51.20733553317704, + "learning_rate": 3.998287351916902e-07, + "loss": 1.6362, + "step": 31140 + }, + { + "epoch": 2.6541379016449334, + "grad_norm": 27.127348666628, + "learning_rate": 3.996344658205276e-07, + "loss": 0.8925, + "step": 31141 + }, + { + "epoch": 2.6542231313389584, + "grad_norm": 109.54025507653682, + "learning_rate": 3.994402416921328e-07, + "loss": 1.8229, + "step": 31142 + }, + { + "epoch": 2.654308361032984, + "grad_norm": 84.54839110505579, + "learning_rate": 3.9924606280841496e-07, + "loss": 2.182, + "step": 31143 + }, + { + "epoch": 2.6543935907270093, + "grad_norm": 62.962956999064914, + "learning_rate": 3.990519291712841e-07, + "loss": 1.4547, + "step": 31144 + }, + { + "epoch": 2.654478820421035, + "grad_norm": 39.303366253567816, + "learning_rate": 3.9885784078264877e-07, + "loss": 1.1187, + "step": 31145 + }, + { + "epoch": 2.6545640501150602, + "grad_norm": 74.44952382114002, + "learning_rate": 3.9866379764441854e-07, + "loss": 2.105, + "step": 31146 + }, + { + "epoch": 2.6546492798090853, + "grad_norm": 27.118926006606888, + "learning_rate": 3.9846979975850186e-07, + "loss": 0.8966, + "step": 31147 + }, + { + "epoch": 2.6547345095031107, + "grad_norm": 27.902659927814984, + "learning_rate": 3.9827584712680503e-07, + "loss": 0.9375, + "step": 31148 + }, + { + "epoch": 2.654819739197136, + "grad_norm": 72.48277389823144, + "learning_rate": 3.98081939751237e-07, + "loss": 1.8634, + "step": 31149 + }, + { + "epoch": 2.6549049688911617, + "grad_norm": 72.78874554539118, + "learning_rate": 3.978880776337052e-07, + "loss": 1.809, + "step": 31150 + }, + { + "epoch": 2.654990198585187, + "grad_norm": 42.68926249881189, + "learning_rate": 3.976942607761153e-07, + "loss": 1.591, + "step": 31151 + }, + { + "epoch": 2.6550754282792126, + "grad_norm": 29.0913971247257, + "learning_rate": 3.975004891803741e-07, + "loss": 0.8318, + "step": 31152 + }, + { + "epoch": 2.655160657973238, + "grad_norm": 61.67230790488089, + "learning_rate": 3.973067628483862e-07, + "loss": 1.9155, + "step": 31153 + }, + { + "epoch": 2.655245887667263, + "grad_norm": 51.0693979904192, + "learning_rate": 3.971130817820568e-07, + "loss": 1.3345, + "step": 31154 + }, + { + "epoch": 2.6553311173612886, + "grad_norm": 77.48301203246515, + "learning_rate": 3.96919445983292e-07, + "loss": 1.9256, + "step": 31155 + }, + { + "epoch": 2.655416347055314, + "grad_norm": 30.013006246216836, + "learning_rate": 3.9672585545399434e-07, + "loss": 0.8973, + "step": 31156 + }, + { + "epoch": 2.6555015767493395, + "grad_norm": 27.439777137467313, + "learning_rate": 3.965323101960694e-07, + "loss": 0.8563, + "step": 31157 + }, + { + "epoch": 2.655586806443365, + "grad_norm": 72.50720415631272, + "learning_rate": 3.963388102114196e-07, + "loss": 1.2615, + "step": 31158 + }, + { + "epoch": 2.65567203613739, + "grad_norm": 55.76095509853346, + "learning_rate": 3.9614535550194897e-07, + "loss": 1.4332, + "step": 31159 + }, + { + "epoch": 2.655757265831416, + "grad_norm": 45.96924734592255, + "learning_rate": 3.959519460695593e-07, + "loss": 1.2595, + "step": 31160 + }, + { + "epoch": 2.655842495525441, + "grad_norm": 36.27286111266017, + "learning_rate": 3.9575858191615304e-07, + "loss": 1.3389, + "step": 31161 + }, + { + "epoch": 2.6559277252194664, + "grad_norm": 34.31849747486268, + "learning_rate": 3.955652630436302e-07, + "loss": 1.2155, + "step": 31162 + }, + { + "epoch": 2.656012954913492, + "grad_norm": 52.84978468035622, + "learning_rate": 3.953719894538949e-07, + "loss": 1.8055, + "step": 31163 + }, + { + "epoch": 2.6560981846075173, + "grad_norm": 27.464350614099168, + "learning_rate": 3.95178761148845e-07, + "loss": 1.5801, + "step": 31164 + }, + { + "epoch": 2.656183414301543, + "grad_norm": 45.339424487508225, + "learning_rate": 3.9498557813038295e-07, + "loss": 1.3083, + "step": 31165 + }, + { + "epoch": 2.656268643995568, + "grad_norm": 49.37353764230447, + "learning_rate": 3.947924404004083e-07, + "loss": 1.4963, + "step": 31166 + }, + { + "epoch": 2.6563538736895933, + "grad_norm": 48.52264678111755, + "learning_rate": 3.94599347960819e-07, + "loss": 1.5901, + "step": 31167 + }, + { + "epoch": 2.6564391033836188, + "grad_norm": 38.96765939716739, + "learning_rate": 3.944063008135163e-07, + "loss": 0.9359, + "step": 31168 + }, + { + "epoch": 2.6565243330776442, + "grad_norm": 52.34933503497798, + "learning_rate": 3.942132989603975e-07, + "loss": 1.4453, + "step": 31169 + }, + { + "epoch": 2.6566095627716697, + "grad_norm": 56.45387479339442, + "learning_rate": 3.9402034240336005e-07, + "loss": 1.1835, + "step": 31170 + }, + { + "epoch": 2.656694792465695, + "grad_norm": 43.411855402496236, + "learning_rate": 3.938274311443019e-07, + "loss": 1.2501, + "step": 31171 + }, + { + "epoch": 2.6567800221597206, + "grad_norm": 53.28032389621711, + "learning_rate": 3.936345651851209e-07, + "loss": 1.6672, + "step": 31172 + }, + { + "epoch": 2.6568652518537457, + "grad_norm": 55.264109474744984, + "learning_rate": 3.934417445277144e-07, + "loss": 1.4456, + "step": 31173 + }, + { + "epoch": 2.656950481547771, + "grad_norm": 42.425178454718875, + "learning_rate": 3.932489691739777e-07, + "loss": 1.1505, + "step": 31174 + }, + { + "epoch": 2.6570357112417966, + "grad_norm": 60.336273063306464, + "learning_rate": 3.930562391258058e-07, + "loss": 1.2963, + "step": 31175 + }, + { + "epoch": 2.657120940935822, + "grad_norm": 50.81543575276745, + "learning_rate": 3.9286355438509613e-07, + "loss": 1.2932, + "step": 31176 + }, + { + "epoch": 2.6572061706298475, + "grad_norm": 23.270119855428813, + "learning_rate": 3.9267091495374276e-07, + "loss": 0.9624, + "step": 31177 + }, + { + "epoch": 2.6572914003238726, + "grad_norm": 59.10566470375418, + "learning_rate": 3.924783208336402e-07, + "loss": 1.5297, + "step": 31178 + }, + { + "epoch": 2.6573766300178985, + "grad_norm": 14.833523246435467, + "learning_rate": 3.922857720266815e-07, + "loss": 0.5776, + "step": 31179 + }, + { + "epoch": 2.6574618597119235, + "grad_norm": 60.391169978311375, + "learning_rate": 3.920932685347617e-07, + "loss": 1.4482, + "step": 31180 + }, + { + "epoch": 2.657547089405949, + "grad_norm": 68.87397315010784, + "learning_rate": 3.919008103597738e-07, + "loss": 1.8347, + "step": 31181 + }, + { + "epoch": 2.6576323190999744, + "grad_norm": 25.463221989353983, + "learning_rate": 3.9170839750361065e-07, + "loss": 0.9504, + "step": 31182 + }, + { + "epoch": 2.657717548794, + "grad_norm": 25.597049611400372, + "learning_rate": 3.9151602996816474e-07, + "loss": 1.165, + "step": 31183 + }, + { + "epoch": 2.6578027784880254, + "grad_norm": 49.01857137308349, + "learning_rate": 3.913237077553261e-07, + "loss": 1.3832, + "step": 31184 + }, + { + "epoch": 2.6578880081820504, + "grad_norm": 23.29415056946315, + "learning_rate": 3.9113143086698834e-07, + "loss": 0.9341, + "step": 31185 + }, + { + "epoch": 2.657973237876076, + "grad_norm": 28.34829701283488, + "learning_rate": 3.9093919930504154e-07, + "loss": 0.9087, + "step": 31186 + }, + { + "epoch": 2.6580584675701013, + "grad_norm": 39.66782426156824, + "learning_rate": 3.9074701307137586e-07, + "loss": 0.9967, + "step": 31187 + }, + { + "epoch": 2.658143697264127, + "grad_norm": 50.9961523781805, + "learning_rate": 3.9055487216788146e-07, + "loss": 0.7548, + "step": 31188 + }, + { + "epoch": 2.6582289269581523, + "grad_norm": 41.27247347673216, + "learning_rate": 3.9036277659644907e-07, + "loss": 1.5175, + "step": 31189 + }, + { + "epoch": 2.6583141566521777, + "grad_norm": 20.966732421045958, + "learning_rate": 3.9017072635896716e-07, + "loss": 1.2539, + "step": 31190 + }, + { + "epoch": 2.658399386346203, + "grad_norm": 42.75196495510461, + "learning_rate": 3.899787214573242e-07, + "loss": 1.2845, + "step": 31191 + }, + { + "epoch": 2.6584846160402282, + "grad_norm": 71.63546850859964, + "learning_rate": 3.897867618934076e-07, + "loss": 1.8193, + "step": 31192 + }, + { + "epoch": 2.6585698457342537, + "grad_norm": 65.42396524629694, + "learning_rate": 3.8959484766910747e-07, + "loss": 1.3512, + "step": 31193 + }, + { + "epoch": 2.658655075428279, + "grad_norm": 32.20486473323547, + "learning_rate": 3.894029787863102e-07, + "loss": 1.3111, + "step": 31194 + }, + { + "epoch": 2.6587403051223046, + "grad_norm": 61.53048423119674, + "learning_rate": 3.892111552469019e-07, + "loss": 1.1993, + "step": 31195 + }, + { + "epoch": 2.65882553481633, + "grad_norm": 56.36407716073231, + "learning_rate": 3.8901937705277004e-07, + "loss": 1.2793, + "step": 31196 + }, + { + "epoch": 2.6589107645103556, + "grad_norm": 101.91250478768363, + "learning_rate": 3.888276442057998e-07, + "loss": 1.5023, + "step": 31197 + }, + { + "epoch": 2.658995994204381, + "grad_norm": 28.353524090409085, + "learning_rate": 3.886359567078779e-07, + "loss": 0.7534, + "step": 31198 + }, + { + "epoch": 2.659081223898406, + "grad_norm": 40.47100167908866, + "learning_rate": 3.884443145608896e-07, + "loss": 1.1879, + "step": 31199 + }, + { + "epoch": 2.6591664535924315, + "grad_norm": 58.87694574919344, + "learning_rate": 3.8825271776671835e-07, + "loss": 1.4985, + "step": 31200 + }, + { + "epoch": 2.659251683286457, + "grad_norm": 44.17361527685851, + "learning_rate": 3.880611663272488e-07, + "loss": 0.8046, + "step": 31201 + }, + { + "epoch": 2.6593369129804825, + "grad_norm": 72.94621147726588, + "learning_rate": 3.8786966024436543e-07, + "loss": 2.1561, + "step": 31202 + }, + { + "epoch": 2.659422142674508, + "grad_norm": 62.351422167491556, + "learning_rate": 3.8767819951995077e-07, + "loss": 1.5744, + "step": 31203 + }, + { + "epoch": 2.659507372368533, + "grad_norm": 37.60205105634506, + "learning_rate": 3.874867841558888e-07, + "loss": 1.0013, + "step": 31204 + }, + { + "epoch": 2.6595926020625584, + "grad_norm": 67.73717566457556, + "learning_rate": 3.8729541415406134e-07, + "loss": 1.3286, + "step": 31205 + }, + { + "epoch": 2.659677831756584, + "grad_norm": 60.46925025450741, + "learning_rate": 3.8710408951635084e-07, + "loss": 1.9821, + "step": 31206 + }, + { + "epoch": 2.6597630614506094, + "grad_norm": 42.4524729214202, + "learning_rate": 3.8691281024463846e-07, + "loss": 1.471, + "step": 31207 + }, + { + "epoch": 2.659848291144635, + "grad_norm": 47.145139125808335, + "learning_rate": 3.867215763408061e-07, + "loss": 1.4684, + "step": 31208 + }, + { + "epoch": 2.6599335208386603, + "grad_norm": 66.02507969201828, + "learning_rate": 3.8653038780673227e-07, + "loss": 1.3755, + "step": 31209 + }, + { + "epoch": 2.6600187505326858, + "grad_norm": 36.31609599833254, + "learning_rate": 3.863392446443004e-07, + "loss": 1.0843, + "step": 31210 + }, + { + "epoch": 2.660103980226711, + "grad_norm": 35.76587971654532, + "learning_rate": 3.861481468553879e-07, + "loss": 0.7147, + "step": 31211 + }, + { + "epoch": 2.6601892099207363, + "grad_norm": 21.134802274299126, + "learning_rate": 3.859570944418756e-07, + "loss": 0.9421, + "step": 31212 + }, + { + "epoch": 2.6602744396147617, + "grad_norm": 86.05985633200005, + "learning_rate": 3.857660874056418e-07, + "loss": 1.5395, + "step": 31213 + }, + { + "epoch": 2.660359669308787, + "grad_norm": 50.62708244694074, + "learning_rate": 3.8557512574856403e-07, + "loss": 1.6238, + "step": 31214 + }, + { + "epoch": 2.6604448990028127, + "grad_norm": 79.45610860439707, + "learning_rate": 3.853842094725224e-07, + "loss": 1.5611, + "step": 31215 + }, + { + "epoch": 2.660530128696838, + "grad_norm": 24.77977352138142, + "learning_rate": 3.851933385793938e-07, + "loss": 1.1412, + "step": 31216 + }, + { + "epoch": 2.6606153583908636, + "grad_norm": 77.79847892490423, + "learning_rate": 3.850025130710544e-07, + "loss": 1.832, + "step": 31217 + }, + { + "epoch": 2.6607005880848886, + "grad_norm": 81.27897998132326, + "learning_rate": 3.8481173294938057e-07, + "loss": 2.5753, + "step": 31218 + }, + { + "epoch": 2.660785817778914, + "grad_norm": 52.8355044446397, + "learning_rate": 3.8462099821624966e-07, + "loss": 0.9585, + "step": 31219 + }, + { + "epoch": 2.6608710474729396, + "grad_norm": 60.913206927733164, + "learning_rate": 3.8443030887353795e-07, + "loss": 1.307, + "step": 31220 + }, + { + "epoch": 2.660956277166965, + "grad_norm": 61.58748083602446, + "learning_rate": 3.842396649231206e-07, + "loss": 1.438, + "step": 31221 + }, + { + "epoch": 2.6610415068609905, + "grad_norm": 32.75348302098778, + "learning_rate": 3.8404906636687056e-07, + "loss": 1.053, + "step": 31222 + }, + { + "epoch": 2.6611267365550155, + "grad_norm": 35.98423480749681, + "learning_rate": 3.8385851320666524e-07, + "loss": 1.3625, + "step": 31223 + }, + { + "epoch": 2.6612119662490414, + "grad_norm": 66.72133110628266, + "learning_rate": 3.836680054443764e-07, + "loss": 2.0445, + "step": 31224 + }, + { + "epoch": 2.6612971959430665, + "grad_norm": 78.78114104904708, + "learning_rate": 3.834775430818788e-07, + "loss": 1.8786, + "step": 31225 + }, + { + "epoch": 2.661382425637092, + "grad_norm": 70.14735130226461, + "learning_rate": 3.832871261210447e-07, + "loss": 1.4977, + "step": 31226 + }, + { + "epoch": 2.6614676553311174, + "grad_norm": 72.40607904950839, + "learning_rate": 3.830967545637465e-07, + "loss": 2.368, + "step": 31227 + }, + { + "epoch": 2.661552885025143, + "grad_norm": 53.068033496266665, + "learning_rate": 3.8290642841185833e-07, + "loss": 1.4881, + "step": 31228 + }, + { + "epoch": 2.6616381147191683, + "grad_norm": 51.61665971431255, + "learning_rate": 3.8271614766725087e-07, + "loss": 1.2437, + "step": 31229 + }, + { + "epoch": 2.6617233444131934, + "grad_norm": 57.29174592989663, + "learning_rate": 3.825259123317948e-07, + "loss": 1.3077, + "step": 31230 + }, + { + "epoch": 2.661808574107219, + "grad_norm": 64.06495476405514, + "learning_rate": 3.823357224073615e-07, + "loss": 1.6601, + "step": 31231 + }, + { + "epoch": 2.6618938038012443, + "grad_norm": 27.875511263021643, + "learning_rate": 3.8214557789582164e-07, + "loss": 0.9699, + "step": 31232 + }, + { + "epoch": 2.6619790334952698, + "grad_norm": 56.25601701375053, + "learning_rate": 3.819554787990459e-07, + "loss": 1.5404, + "step": 31233 + }, + { + "epoch": 2.6620642631892952, + "grad_norm": 58.4412063634285, + "learning_rate": 3.8176542511890123e-07, + "loss": 1.406, + "step": 31234 + }, + { + "epoch": 2.6621494928833207, + "grad_norm": 45.15408997972541, + "learning_rate": 3.815754168572594e-07, + "loss": 1.6386, + "step": 31235 + }, + { + "epoch": 2.662234722577346, + "grad_norm": 37.97385311177762, + "learning_rate": 3.8138545401598836e-07, + "loss": 1.128, + "step": 31236 + }, + { + "epoch": 2.662319952271371, + "grad_norm": 66.15341909885163, + "learning_rate": 3.8119553659695606e-07, + "loss": 1.2477, + "step": 31237 + }, + { + "epoch": 2.6624051819653967, + "grad_norm": 24.28212171133741, + "learning_rate": 3.8100566460203046e-07, + "loss": 0.884, + "step": 31238 + }, + { + "epoch": 2.662490411659422, + "grad_norm": 34.0293525103312, + "learning_rate": 3.80815838033079e-07, + "loss": 1.5251, + "step": 31239 + }, + { + "epoch": 2.6625756413534476, + "grad_norm": 31.794475508759998, + "learning_rate": 3.806260568919673e-07, + "loss": 1.2613, + "step": 31240 + }, + { + "epoch": 2.662660871047473, + "grad_norm": 65.85730596047364, + "learning_rate": 3.8043632118056395e-07, + "loss": 1.4486, + "step": 31241 + }, + { + "epoch": 2.662746100741498, + "grad_norm": 60.966138338802715, + "learning_rate": 3.8024663090073244e-07, + "loss": 1.3089, + "step": 31242 + }, + { + "epoch": 2.662831330435524, + "grad_norm": 53.232129207955566, + "learning_rate": 3.800569860543407e-07, + "loss": 1.2618, + "step": 31243 + }, + { + "epoch": 2.662916560129549, + "grad_norm": 27.547964675698474, + "learning_rate": 3.798673866432517e-07, + "loss": 1.2897, + "step": 31244 + }, + { + "epoch": 2.6630017898235745, + "grad_norm": 25.68765720194189, + "learning_rate": 3.796778326693318e-07, + "loss": 0.7094, + "step": 31245 + }, + { + "epoch": 2.6630870195176, + "grad_norm": 39.632318920907856, + "learning_rate": 3.794883241344444e-07, + "loss": 0.9482, + "step": 31246 + }, + { + "epoch": 2.6631722492116254, + "grad_norm": 72.29059868086492, + "learning_rate": 3.7929886104045356e-07, + "loss": 2.0308, + "step": 31247 + }, + { + "epoch": 2.663257478905651, + "grad_norm": 23.425316199718313, + "learning_rate": 3.7910944338922175e-07, + "loss": 0.6378, + "step": 31248 + }, + { + "epoch": 2.663342708599676, + "grad_norm": 41.15136539640946, + "learning_rate": 3.78920071182613e-07, + "loss": 1.2063, + "step": 31249 + }, + { + "epoch": 2.6634279382937014, + "grad_norm": 53.06445966620664, + "learning_rate": 3.78730744422488e-07, + "loss": 1.4019, + "step": 31250 + }, + { + "epoch": 2.663513167987727, + "grad_norm": 27.46795702695352, + "learning_rate": 3.785414631107115e-07, + "loss": 0.7238, + "step": 31251 + }, + { + "epoch": 2.6635983976817523, + "grad_norm": 55.73097857960396, + "learning_rate": 3.783522272491419e-07, + "loss": 1.3562, + "step": 31252 + }, + { + "epoch": 2.663683627375778, + "grad_norm": 49.29517491882245, + "learning_rate": 3.7816303683964216e-07, + "loss": 1.2467, + "step": 31253 + }, + { + "epoch": 2.6637688570698033, + "grad_norm": 63.2419901033686, + "learning_rate": 3.779738918840731e-07, + "loss": 1.5779, + "step": 31254 + }, + { + "epoch": 2.6638540867638287, + "grad_norm": 37.21036017371096, + "learning_rate": 3.7778479238429366e-07, + "loss": 0.6595, + "step": 31255 + }, + { + "epoch": 2.6639393164578538, + "grad_norm": 62.056791895954326, + "learning_rate": 3.7759573834216356e-07, + "loss": 1.6509, + "step": 31256 + }, + { + "epoch": 2.6640245461518792, + "grad_norm": 48.941129020080695, + "learning_rate": 3.7740672975954353e-07, + "loss": 1.2725, + "step": 31257 + }, + { + "epoch": 2.6641097758459047, + "grad_norm": 39.08824014614678, + "learning_rate": 3.7721776663829034e-07, + "loss": 1.1275, + "step": 31258 + }, + { + "epoch": 2.66419500553993, + "grad_norm": 56.78776166636238, + "learning_rate": 3.770288489802643e-07, + "loss": 1.4084, + "step": 31259 + }, + { + "epoch": 2.6642802352339556, + "grad_norm": 52.464209414292384, + "learning_rate": 3.768399767873232e-07, + "loss": 1.0074, + "step": 31260 + }, + { + "epoch": 2.6643654649279807, + "grad_norm": 46.639062694699895, + "learning_rate": 3.7665115006132234e-07, + "loss": 1.1377, + "step": 31261 + }, + { + "epoch": 2.6644506946220066, + "grad_norm": 50.33894267089654, + "learning_rate": 3.7646236880412134e-07, + "loss": 1.5375, + "step": 31262 + }, + { + "epoch": 2.6645359243160316, + "grad_norm": 19.278900259647905, + "learning_rate": 3.762736330175759e-07, + "loss": 0.8719, + "step": 31263 + }, + { + "epoch": 2.664621154010057, + "grad_norm": 47.01818470854221, + "learning_rate": 3.760849427035418e-07, + "loss": 0.9581, + "step": 31264 + }, + { + "epoch": 2.6647063837040825, + "grad_norm": 60.609641704646, + "learning_rate": 3.7589629786387417e-07, + "loss": 2.2127, + "step": 31265 + }, + { + "epoch": 2.664791613398108, + "grad_norm": 43.961100858817325, + "learning_rate": 3.7570769850042874e-07, + "loss": 1.1762, + "step": 31266 + }, + { + "epoch": 2.6648768430921335, + "grad_norm": 49.54552995272636, + "learning_rate": 3.7551914461506136e-07, + "loss": 1.0509, + "step": 31267 + }, + { + "epoch": 2.6649620727861585, + "grad_norm": 42.57203594926915, + "learning_rate": 3.7533063620962593e-07, + "loss": 1.4329, + "step": 31268 + }, + { + "epoch": 2.665047302480184, + "grad_norm": 37.466004294774976, + "learning_rate": 3.751421732859761e-07, + "loss": 0.7969, + "step": 31269 + }, + { + "epoch": 2.6651325321742094, + "grad_norm": 32.303076782343425, + "learning_rate": 3.749537558459643e-07, + "loss": 1.5354, + "step": 31270 + }, + { + "epoch": 2.665217761868235, + "grad_norm": 106.84627529434685, + "learning_rate": 3.747653838914456e-07, + "loss": 2.8122, + "step": 31271 + }, + { + "epoch": 2.6653029915622604, + "grad_norm": 44.61402558107979, + "learning_rate": 3.7457705742427073e-07, + "loss": 1.2579, + "step": 31272 + }, + { + "epoch": 2.665388221256286, + "grad_norm": 60.089664366854315, + "learning_rate": 3.7438877644629225e-07, + "loss": 1.8217, + "step": 31273 + }, + { + "epoch": 2.6654734509503113, + "grad_norm": 81.36260686928283, + "learning_rate": 3.7420054095936185e-07, + "loss": 1.5453, + "step": 31274 + }, + { + "epoch": 2.6655586806443363, + "grad_norm": 33.3898285728948, + "learning_rate": 3.7401235096533206e-07, + "loss": 1.3991, + "step": 31275 + }, + { + "epoch": 2.665643910338362, + "grad_norm": 93.22430352792783, + "learning_rate": 3.73824206466053e-07, + "loss": 2.1848, + "step": 31276 + }, + { + "epoch": 2.6657291400323873, + "grad_norm": 60.317677373402766, + "learning_rate": 3.7363610746337373e-07, + "loss": 1.4714, + "step": 31277 + }, + { + "epoch": 2.6658143697264127, + "grad_norm": 50.64235167523301, + "learning_rate": 3.734480539591451e-07, + "loss": 1.4608, + "step": 31278 + }, + { + "epoch": 2.665899599420438, + "grad_norm": 80.30855593951927, + "learning_rate": 3.732600459552166e-07, + "loss": 1.6741, + "step": 31279 + }, + { + "epoch": 2.665984829114463, + "grad_norm": 30.045940187162262, + "learning_rate": 3.7307208345343736e-07, + "loss": 1.1501, + "step": 31280 + }, + { + "epoch": 2.666070058808489, + "grad_norm": 40.86585224784278, + "learning_rate": 3.728841664556548e-07, + "loss": 1.1924, + "step": 31281 + }, + { + "epoch": 2.666155288502514, + "grad_norm": 28.362420588393846, + "learning_rate": 3.726962949637175e-07, + "loss": 0.8483, + "step": 31282 + }, + { + "epoch": 2.6662405181965396, + "grad_norm": 42.01018492305474, + "learning_rate": 3.725084689794745e-07, + "loss": 1.1326, + "step": 31283 + }, + { + "epoch": 2.666325747890565, + "grad_norm": 78.00995615773724, + "learning_rate": 3.7232068850477197e-07, + "loss": 1.252, + "step": 31284 + }, + { + "epoch": 2.6664109775845906, + "grad_norm": 58.414445916304224, + "learning_rate": 3.721329535414564e-07, + "loss": 1.958, + "step": 31285 + }, + { + "epoch": 2.666496207278616, + "grad_norm": 48.98164122212186, + "learning_rate": 3.7194526409137454e-07, + "loss": 1.3602, + "step": 31286 + }, + { + "epoch": 2.666581436972641, + "grad_norm": 56.082402189883254, + "learning_rate": 3.7175762015637105e-07, + "loss": 1.2937, + "step": 31287 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 55.565641757562695, + "learning_rate": 3.7157002173829335e-07, + "loss": 1.9254, + "step": 31288 + }, + { + "epoch": 2.666751896360692, + "grad_norm": 17.041181557922982, + "learning_rate": 3.713824688389839e-07, + "loss": 0.5367, + "step": 31289 + }, + { + "epoch": 2.6668371260547175, + "grad_norm": 26.652072119785505, + "learning_rate": 3.7119496146029e-07, + "loss": 0.9449, + "step": 31290 + }, + { + "epoch": 2.666922355748743, + "grad_norm": 26.044428407078282, + "learning_rate": 3.7100749960405357e-07, + "loss": 0.7557, + "step": 31291 + }, + { + "epoch": 2.6670075854427684, + "grad_norm": 36.886418604543344, + "learning_rate": 3.7082008327211925e-07, + "loss": 0.8688, + "step": 31292 + }, + { + "epoch": 2.667092815136794, + "grad_norm": 48.35671449924875, + "learning_rate": 3.7063271246633057e-07, + "loss": 1.516, + "step": 31293 + }, + { + "epoch": 2.667178044830819, + "grad_norm": 22.541271915553637, + "learning_rate": 3.704453871885294e-07, + "loss": 0.8701, + "step": 31294 + }, + { + "epoch": 2.6672632745248444, + "grad_norm": 47.414840242686125, + "learning_rate": 3.7025810744055756e-07, + "loss": 1.8035, + "step": 31295 + }, + { + "epoch": 2.66734850421887, + "grad_norm": 130.01231750424074, + "learning_rate": 3.7007087322425803e-07, + "loss": 3.3392, + "step": 31296 + }, + { + "epoch": 2.6674337339128953, + "grad_norm": 87.49413611939015, + "learning_rate": 3.6988368454147105e-07, + "loss": 1.5709, + "step": 31297 + }, + { + "epoch": 2.6675189636069208, + "grad_norm": 35.57812710424999, + "learning_rate": 3.6969654139403897e-07, + "loss": 1.2666, + "step": 31298 + }, + { + "epoch": 2.667604193300946, + "grad_norm": 79.44895056136869, + "learning_rate": 3.6950944378380207e-07, + "loss": 2.175, + "step": 31299 + }, + { + "epoch": 2.6676894229949717, + "grad_norm": 75.84223782040861, + "learning_rate": 3.693223917125982e-07, + "loss": 2.1641, + "step": 31300 + }, + { + "epoch": 2.6677746526889967, + "grad_norm": 33.53394696277542, + "learning_rate": 3.6913538518226987e-07, + "loss": 0.6508, + "step": 31301 + }, + { + "epoch": 2.667859882383022, + "grad_norm": 22.946324085638242, + "learning_rate": 3.689484241946545e-07, + "loss": 1.1555, + "step": 31302 + }, + { + "epoch": 2.6679451120770477, + "grad_norm": 64.50190736620532, + "learning_rate": 3.6876150875159054e-07, + "loss": 1.4875, + "step": 31303 + }, + { + "epoch": 2.668030341771073, + "grad_norm": 57.903172835911064, + "learning_rate": 3.685746388549177e-07, + "loss": 1.8306, + "step": 31304 + }, + { + "epoch": 2.6681155714650986, + "grad_norm": 80.49017881261946, + "learning_rate": 3.6838781450647233e-07, + "loss": 2.0627, + "step": 31305 + }, + { + "epoch": 2.6682008011591236, + "grad_norm": 35.479308909848925, + "learning_rate": 3.6820103570809283e-07, + "loss": 1.5298, + "step": 31306 + }, + { + "epoch": 2.668286030853149, + "grad_norm": 86.35060677189337, + "learning_rate": 3.680143024616162e-07, + "loss": 1.972, + "step": 31307 + }, + { + "epoch": 2.6683712605471746, + "grad_norm": 44.62430946633242, + "learning_rate": 3.6782761476887696e-07, + "loss": 0.8877, + "step": 31308 + }, + { + "epoch": 2.6684564902412, + "grad_norm": 55.26462606173499, + "learning_rate": 3.6764097263171374e-07, + "loss": 1.2938, + "step": 31309 + }, + { + "epoch": 2.6685417199352255, + "grad_norm": 34.17461040239957, + "learning_rate": 3.6745437605196056e-07, + "loss": 1.0325, + "step": 31310 + }, + { + "epoch": 2.668626949629251, + "grad_norm": 61.816340324194876, + "learning_rate": 3.6726782503145264e-07, + "loss": 1.4886, + "step": 31311 + }, + { + "epoch": 2.6687121793232764, + "grad_norm": 35.62923590733297, + "learning_rate": 3.670813195720241e-07, + "loss": 0.9341, + "step": 31312 + }, + { + "epoch": 2.6687974090173014, + "grad_norm": 49.68377537960331, + "learning_rate": 3.6689485967551007e-07, + "loss": 1.4853, + "step": 31313 + }, + { + "epoch": 2.668882638711327, + "grad_norm": 31.9629735878341, + "learning_rate": 3.667084453437447e-07, + "loss": 0.7615, + "step": 31314 + }, + { + "epoch": 2.6689678684053524, + "grad_norm": 20.600460451234078, + "learning_rate": 3.6652207657856097e-07, + "loss": 0.7721, + "step": 31315 + }, + { + "epoch": 2.669053098099378, + "grad_norm": 199.28959186595864, + "learning_rate": 3.663357533817907e-07, + "loss": 2.1514, + "step": 31316 + }, + { + "epoch": 2.6691383277934033, + "grad_norm": 45.31093145647154, + "learning_rate": 3.661494757552669e-07, + "loss": 1.5461, + "step": 31317 + }, + { + "epoch": 2.6692235574874283, + "grad_norm": 43.50742606084352, + "learning_rate": 3.659632437008226e-07, + "loss": 1.5305, + "step": 31318 + }, + { + "epoch": 2.6693087871814543, + "grad_norm": 32.514325516720206, + "learning_rate": 3.657770572202879e-07, + "loss": 0.7921, + "step": 31319 + }, + { + "epoch": 2.6693940168754793, + "grad_norm": 64.93617938103664, + "learning_rate": 3.6559091631549417e-07, + "loss": 1.5354, + "step": 31320 + }, + { + "epoch": 2.6694792465695047, + "grad_norm": 28.553215423984025, + "learning_rate": 3.6540482098827155e-07, + "loss": 0.7929, + "step": 31321 + }, + { + "epoch": 2.66956447626353, + "grad_norm": 53.06590800517833, + "learning_rate": 3.6521877124045203e-07, + "loss": 1.6657, + "step": 31322 + }, + { + "epoch": 2.6696497059575557, + "grad_norm": 23.194826028903492, + "learning_rate": 3.6503276707386457e-07, + "loss": 0.8084, + "step": 31323 + }, + { + "epoch": 2.669734935651581, + "grad_norm": 34.55497229284115, + "learning_rate": 3.6484680849033785e-07, + "loss": 1.2273, + "step": 31324 + }, + { + "epoch": 2.669820165345606, + "grad_norm": 31.63811510232149, + "learning_rate": 3.646608954916997e-07, + "loss": 0.9906, + "step": 31325 + }, + { + "epoch": 2.6699053950396316, + "grad_norm": 17.897057727694207, + "learning_rate": 3.64475028079781e-07, + "loss": 0.589, + "step": 31326 + }, + { + "epoch": 2.669990624733657, + "grad_norm": 41.837207025210844, + "learning_rate": 3.642892062564085e-07, + "loss": 1.1961, + "step": 31327 + }, + { + "epoch": 2.6700758544276826, + "grad_norm": 32.35626424407137, + "learning_rate": 3.6410343002340864e-07, + "loss": 0.8102, + "step": 31328 + }, + { + "epoch": 2.670161084121708, + "grad_norm": 73.41535381611025, + "learning_rate": 3.6391769938261046e-07, + "loss": 1.8582, + "step": 31329 + }, + { + "epoch": 2.6702463138157335, + "grad_norm": 66.8745881218047, + "learning_rate": 3.6373201433583807e-07, + "loss": 1.9116, + "step": 31330 + }, + { + "epoch": 2.670331543509759, + "grad_norm": 35.38219129765943, + "learning_rate": 3.635463748849205e-07, + "loss": 1.1465, + "step": 31331 + }, + { + "epoch": 2.670416773203784, + "grad_norm": 67.53271832300088, + "learning_rate": 3.633607810316814e-07, + "loss": 1.369, + "step": 31332 + }, + { + "epoch": 2.6705020028978095, + "grad_norm": 82.84685169922533, + "learning_rate": 3.63175232777947e-07, + "loss": 1.5345, + "step": 31333 + }, + { + "epoch": 2.670587232591835, + "grad_norm": 47.193581237669136, + "learning_rate": 3.6298973012554084e-07, + "loss": 1.2495, + "step": 31334 + }, + { + "epoch": 2.6706724622858604, + "grad_norm": 43.53343079799229, + "learning_rate": 3.628042730762893e-07, + "loss": 1.3294, + "step": 31335 + }, + { + "epoch": 2.670757691979886, + "grad_norm": 26.393556817613327, + "learning_rate": 3.6261886163201363e-07, + "loss": 1.047, + "step": 31336 + }, + { + "epoch": 2.6708429216739114, + "grad_norm": 24.57438304449387, + "learning_rate": 3.624334957945397e-07, + "loss": 0.885, + "step": 31337 + }, + { + "epoch": 2.670928151367937, + "grad_norm": 50.107497770987095, + "learning_rate": 3.622481755656887e-07, + "loss": 1.7974, + "step": 31338 + }, + { + "epoch": 2.671013381061962, + "grad_norm": 50.56990103592644, + "learning_rate": 3.6206290094728537e-07, + "loss": 1.2139, + "step": 31339 + }, + { + "epoch": 2.6710986107559873, + "grad_norm": 43.187581078434434, + "learning_rate": 3.618776719411499e-07, + "loss": 1.0451, + "step": 31340 + }, + { + "epoch": 2.671183840450013, + "grad_norm": 69.88077293364064, + "learning_rate": 3.6169248854910465e-07, + "loss": 1.9436, + "step": 31341 + }, + { + "epoch": 2.6712690701440382, + "grad_norm": 36.49631847549663, + "learning_rate": 3.6150735077296996e-07, + "loss": 1.2436, + "step": 31342 + }, + { + "epoch": 2.6713542998380637, + "grad_norm": 31.112395123897024, + "learning_rate": 3.6132225861456814e-07, + "loss": 0.9344, + "step": 31343 + }, + { + "epoch": 2.6714395295320887, + "grad_norm": 62.16863015528, + "learning_rate": 3.611372120757178e-07, + "loss": 1.7762, + "step": 31344 + }, + { + "epoch": 2.6715247592261147, + "grad_norm": 57.85290923660646, + "learning_rate": 3.609522111582409e-07, + "loss": 1.4654, + "step": 31345 + }, + { + "epoch": 2.6716099889201397, + "grad_norm": 61.51038189322336, + "learning_rate": 3.607672558639552e-07, + "loss": 1.8336, + "step": 31346 + }, + { + "epoch": 2.671695218614165, + "grad_norm": 92.24475199018887, + "learning_rate": 3.6058234619467893e-07, + "loss": 2.4641, + "step": 31347 + }, + { + "epoch": 2.6717804483081906, + "grad_norm": 55.79097217611061, + "learning_rate": 3.6039748215223326e-07, + "loss": 1.4587, + "step": 31348 + }, + { + "epoch": 2.671865678002216, + "grad_norm": 27.450840071588168, + "learning_rate": 3.6021266373843453e-07, + "loss": 0.8046, + "step": 31349 + }, + { + "epoch": 2.6719509076962416, + "grad_norm": 54.55738248568212, + "learning_rate": 3.6002789095509974e-07, + "loss": 1.9376, + "step": 31350 + }, + { + "epoch": 2.6720361373902666, + "grad_norm": 84.25167636419803, + "learning_rate": 3.5984316380404784e-07, + "loss": 1.9433, + "step": 31351 + }, + { + "epoch": 2.672121367084292, + "grad_norm": 84.13933931441443, + "learning_rate": 3.5965848228709354e-07, + "loss": 2.1105, + "step": 31352 + }, + { + "epoch": 2.6722065967783175, + "grad_norm": 55.97760144607406, + "learning_rate": 3.594738464060549e-07, + "loss": 1.6281, + "step": 31353 + }, + { + "epoch": 2.672291826472343, + "grad_norm": 36.42048146563534, + "learning_rate": 3.59289256162747e-07, + "loss": 1.2014, + "step": 31354 + }, + { + "epoch": 2.6723770561663684, + "grad_norm": 68.63162264634109, + "learning_rate": 3.5910471155898465e-07, + "loss": 1.5444, + "step": 31355 + }, + { + "epoch": 2.672462285860394, + "grad_norm": 61.72321272599498, + "learning_rate": 3.5892021259658406e-07, + "loss": 1.7557, + "step": 31356 + }, + { + "epoch": 2.6725475155544194, + "grad_norm": 36.38813768436808, + "learning_rate": 3.5873575927735937e-07, + "loss": 0.9297, + "step": 31357 + }, + { + "epoch": 2.6726327452484444, + "grad_norm": 26.87952429985456, + "learning_rate": 3.5855135160312417e-07, + "loss": 0.8764, + "step": 31358 + }, + { + "epoch": 2.67271797494247, + "grad_norm": 64.05756103668813, + "learning_rate": 3.5836698957569085e-07, + "loss": 1.5295, + "step": 31359 + }, + { + "epoch": 2.6728032046364953, + "grad_norm": 58.95626892386097, + "learning_rate": 3.5818267319687407e-07, + "loss": 1.4296, + "step": 31360 + }, + { + "epoch": 2.672888434330521, + "grad_norm": 98.27810652603318, + "learning_rate": 3.579984024684868e-07, + "loss": 2.3688, + "step": 31361 + }, + { + "epoch": 2.6729736640245463, + "grad_norm": 67.93396410589041, + "learning_rate": 3.57814177392341e-07, + "loss": 1.71, + "step": 31362 + }, + { + "epoch": 2.6730588937185713, + "grad_norm": 28.585380838245452, + "learning_rate": 3.57629997970248e-07, + "loss": 0.7099, + "step": 31363 + }, + { + "epoch": 2.673144123412597, + "grad_norm": 34.05773144000385, + "learning_rate": 3.5744586420401794e-07, + "loss": 1.1603, + "step": 31364 + }, + { + "epoch": 2.6732293531066222, + "grad_norm": 58.59154633822939, + "learning_rate": 3.572617760954644e-07, + "loss": 1.5368, + "step": 31365 + }, + { + "epoch": 2.6733145828006477, + "grad_norm": 18.708036631953473, + "learning_rate": 3.570777336463965e-07, + "loss": 0.3788, + "step": 31366 + }, + { + "epoch": 2.673399812494673, + "grad_norm": 50.084676018142034, + "learning_rate": 3.568937368586234e-07, + "loss": 1.7287, + "step": 31367 + }, + { + "epoch": 2.6734850421886986, + "grad_norm": 49.74311226519796, + "learning_rate": 3.567097857339552e-07, + "loss": 1.5045, + "step": 31368 + }, + { + "epoch": 2.673570271882724, + "grad_norm": 22.693052980857438, + "learning_rate": 3.565258802742022e-07, + "loss": 0.7177, + "step": 31369 + }, + { + "epoch": 2.673655501576749, + "grad_norm": 66.33049302249334, + "learning_rate": 3.5634202048117183e-07, + "loss": 1.086, + "step": 31370 + }, + { + "epoch": 2.6737407312707746, + "grad_norm": 31.955279726851458, + "learning_rate": 3.5615820635667264e-07, + "loss": 1.2165, + "step": 31371 + }, + { + "epoch": 2.6738259609648, + "grad_norm": 50.30939629034729, + "learning_rate": 3.5597443790251205e-07, + "loss": 1.2363, + "step": 31372 + }, + { + "epoch": 2.6739111906588255, + "grad_norm": 27.249195933347917, + "learning_rate": 3.5579071512049646e-07, + "loss": 0.9138, + "step": 31373 + }, + { + "epoch": 2.673996420352851, + "grad_norm": 67.57431241448757, + "learning_rate": 3.556070380124349e-07, + "loss": 2.3799, + "step": 31374 + }, + { + "epoch": 2.6740816500468765, + "grad_norm": 35.87648477654913, + "learning_rate": 3.554234065801321e-07, + "loss": 1.0523, + "step": 31375 + }, + { + "epoch": 2.674166879740902, + "grad_norm": 59.28755331311668, + "learning_rate": 3.552398208253949e-07, + "loss": 1.2042, + "step": 31376 + }, + { + "epoch": 2.674252109434927, + "grad_norm": 28.837411013727106, + "learning_rate": 3.550562807500274e-07, + "loss": 0.83, + "step": 31377 + }, + { + "epoch": 2.6743373391289524, + "grad_norm": 12.347981571390351, + "learning_rate": 3.548727863558371e-07, + "loss": 0.5353, + "step": 31378 + }, + { + "epoch": 2.674422568822978, + "grad_norm": 71.67106014806023, + "learning_rate": 3.54689337644627e-07, + "loss": 2.2308, + "step": 31379 + }, + { + "epoch": 2.6745077985170034, + "grad_norm": 58.29366223542497, + "learning_rate": 3.5450593461820116e-07, + "loss": 2.2196, + "step": 31380 + }, + { + "epoch": 2.674593028211029, + "grad_norm": 37.51201784910917, + "learning_rate": 3.543225772783626e-07, + "loss": 1.5588, + "step": 31381 + }, + { + "epoch": 2.674678257905054, + "grad_norm": 72.2504592686506, + "learning_rate": 3.5413926562691656e-07, + "loss": 1.7008, + "step": 31382 + }, + { + "epoch": 2.67476348759908, + "grad_norm": 37.85685030839128, + "learning_rate": 3.5395599966566384e-07, + "loss": 0.9953, + "step": 31383 + }, + { + "epoch": 2.674848717293105, + "grad_norm": 31.578125295879396, + "learning_rate": 3.5377277939640855e-07, + "loss": 0.9579, + "step": 31384 + }, + { + "epoch": 2.6749339469871303, + "grad_norm": 73.73585230772815, + "learning_rate": 3.5358960482095193e-07, + "loss": 1.8916, + "step": 31385 + }, + { + "epoch": 2.6750191766811557, + "grad_norm": 59.25649219921381, + "learning_rate": 3.534064759410938e-07, + "loss": 1.2283, + "step": 31386 + }, + { + "epoch": 2.675104406375181, + "grad_norm": 20.137729396409465, + "learning_rate": 3.5322339275863813e-07, + "loss": 1.2243, + "step": 31387 + }, + { + "epoch": 2.6751896360692067, + "grad_norm": 34.516867237852345, + "learning_rate": 3.530403552753836e-07, + "loss": 1.0826, + "step": 31388 + }, + { + "epoch": 2.6752748657632317, + "grad_norm": 30.474258063734027, + "learning_rate": 3.5285736349312984e-07, + "loss": 0.8508, + "step": 31389 + }, + { + "epoch": 2.675360095457257, + "grad_norm": 66.68649002279143, + "learning_rate": 3.5267441741367816e-07, + "loss": 1.699, + "step": 31390 + }, + { + "epoch": 2.6754453251512826, + "grad_norm": 70.17661394625773, + "learning_rate": 3.5249151703882656e-07, + "loss": 1.9511, + "step": 31391 + }, + { + "epoch": 2.675530554845308, + "grad_norm": 60.33902874205558, + "learning_rate": 3.523086623703742e-07, + "loss": 1.4902, + "step": 31392 + }, + { + "epoch": 2.6756157845393336, + "grad_norm": 32.19295152776669, + "learning_rate": 3.521258534101196e-07, + "loss": 1.5229, + "step": 31393 + }, + { + "epoch": 2.675701014233359, + "grad_norm": 66.6170630035815, + "learning_rate": 3.5194309015985973e-07, + "loss": 1.6423, + "step": 31394 + }, + { + "epoch": 2.6757862439273845, + "grad_norm": 75.00351977048072, + "learning_rate": 3.5176037262139363e-07, + "loss": 1.7014, + "step": 31395 + }, + { + "epoch": 2.6758714736214095, + "grad_norm": 72.66455216348109, + "learning_rate": 3.5157770079651653e-07, + "loss": 2.2388, + "step": 31396 + }, + { + "epoch": 2.675956703315435, + "grad_norm": 35.27633170705497, + "learning_rate": 3.5139507468702536e-07, + "loss": 1.1396, + "step": 31397 + }, + { + "epoch": 2.6760419330094605, + "grad_norm": 48.1134801251705, + "learning_rate": 3.5121249429471704e-07, + "loss": 1.5711, + "step": 31398 + }, + { + "epoch": 2.676127162703486, + "grad_norm": 27.54613818367064, + "learning_rate": 3.5102995962138617e-07, + "loss": 0.771, + "step": 31399 + }, + { + "epoch": 2.6762123923975114, + "grad_norm": 52.77039405137833, + "learning_rate": 3.508474706688286e-07, + "loss": 1.5849, + "step": 31400 + }, + { + "epoch": 2.6762976220915364, + "grad_norm": 32.160794419720276, + "learning_rate": 3.50665027438839e-07, + "loss": 1.2229, + "step": 31401 + }, + { + "epoch": 2.6763828517855623, + "grad_norm": 86.292640275834, + "learning_rate": 3.5048262993321146e-07, + "loss": 1.4516, + "step": 31402 + }, + { + "epoch": 2.6764680814795874, + "grad_norm": 37.84406798597133, + "learning_rate": 3.5030027815373903e-07, + "loss": 0.5751, + "step": 31403 + }, + { + "epoch": 2.676553311173613, + "grad_norm": 32.97645420253007, + "learning_rate": 3.501179721022163e-07, + "loss": 1.0539, + "step": 31404 + }, + { + "epoch": 2.6766385408676383, + "grad_norm": 54.24133974015043, + "learning_rate": 3.4993571178043474e-07, + "loss": 1.4749, + "step": 31405 + }, + { + "epoch": 2.6767237705616638, + "grad_norm": 29.790229928656338, + "learning_rate": 3.497534971901889e-07, + "loss": 0.9865, + "step": 31406 + }, + { + "epoch": 2.6768090002556892, + "grad_norm": 49.59028591238323, + "learning_rate": 3.495713283332686e-07, + "loss": 1.0508, + "step": 31407 + }, + { + "epoch": 2.6768942299497143, + "grad_norm": 48.042818062658924, + "learning_rate": 3.493892052114667e-07, + "loss": 1.4416, + "step": 31408 + }, + { + "epoch": 2.6769794596437397, + "grad_norm": 62.532498027474844, + "learning_rate": 3.492071278265746e-07, + "loss": 2.1951, + "step": 31409 + }, + { + "epoch": 2.677064689337765, + "grad_norm": 46.014631055759416, + "learning_rate": 3.4902509618038207e-07, + "loss": 0.8125, + "step": 31410 + }, + { + "epoch": 2.6771499190317907, + "grad_norm": 52.337890560740654, + "learning_rate": 3.4884311027467923e-07, + "loss": 1.6513, + "step": 31411 + }, + { + "epoch": 2.677235148725816, + "grad_norm": 73.19572947665294, + "learning_rate": 3.486611701112563e-07, + "loss": 2.0558, + "step": 31412 + }, + { + "epoch": 2.6773203784198416, + "grad_norm": 63.44286189069744, + "learning_rate": 3.4847927569190307e-07, + "loss": 2.1285, + "step": 31413 + }, + { + "epoch": 2.677405608113867, + "grad_norm": 31.205283984865364, + "learning_rate": 3.48297427018407e-07, + "loss": 1.0559, + "step": 31414 + }, + { + "epoch": 2.677490837807892, + "grad_norm": 64.49238058454938, + "learning_rate": 3.481156240925582e-07, + "loss": 2.2309, + "step": 31415 + }, + { + "epoch": 2.6775760675019176, + "grad_norm": 73.69847923213698, + "learning_rate": 3.479338669161425e-07, + "loss": 1.6429, + "step": 31416 + }, + { + "epoch": 2.677661297195943, + "grad_norm": 51.078208190362076, + "learning_rate": 3.477521554909497e-07, + "loss": 1.3774, + "step": 31417 + }, + { + "epoch": 2.6777465268899685, + "grad_norm": 46.38908099824463, + "learning_rate": 3.4757048981876593e-07, + "loss": 1.2124, + "step": 31418 + }, + { + "epoch": 2.677831756583994, + "grad_norm": 25.867362448371562, + "learning_rate": 3.4738886990137775e-07, + "loss": 0.8811, + "step": 31419 + }, + { + "epoch": 2.677916986278019, + "grad_norm": 76.38962948923204, + "learning_rate": 3.472072957405703e-07, + "loss": 2.0438, + "step": 31420 + }, + { + "epoch": 2.678002215972045, + "grad_norm": 57.636929526023444, + "learning_rate": 3.470257673381311e-07, + "loss": 1.9122, + "step": 31421 + }, + { + "epoch": 2.67808744566607, + "grad_norm": 58.00199285879867, + "learning_rate": 3.4684428469584363e-07, + "loss": 1.3679, + "step": 31422 + }, + { + "epoch": 2.6781726753600954, + "grad_norm": 40.65121533041634, + "learning_rate": 3.4666284781549485e-07, + "loss": 1.2512, + "step": 31423 + }, + { + "epoch": 2.678257905054121, + "grad_norm": 61.145913836192285, + "learning_rate": 3.4648145669886667e-07, + "loss": 1.2196, + "step": 31424 + }, + { + "epoch": 2.6783431347481463, + "grad_norm": 75.24528166520354, + "learning_rate": 3.4630011134774546e-07, + "loss": 2.1395, + "step": 31425 + }, + { + "epoch": 2.678428364442172, + "grad_norm": 26.49300982865669, + "learning_rate": 3.4611881176391303e-07, + "loss": 0.8975, + "step": 31426 + }, + { + "epoch": 2.678513594136197, + "grad_norm": 56.53583179110339, + "learning_rate": 3.4593755794915306e-07, + "loss": 1.2717, + "step": 31427 + }, + { + "epoch": 2.6785988238302223, + "grad_norm": 86.67354083799337, + "learning_rate": 3.4575634990524686e-07, + "loss": 2.0058, + "step": 31428 + }, + { + "epoch": 2.6786840535242478, + "grad_norm": 33.998943516586216, + "learning_rate": 3.4557518763397855e-07, + "loss": 1.1675, + "step": 31429 + }, + { + "epoch": 2.6787692832182732, + "grad_norm": 36.12236397994376, + "learning_rate": 3.4539407113712785e-07, + "loss": 0.9281, + "step": 31430 + }, + { + "epoch": 2.6788545129122987, + "grad_norm": 59.20099844911186, + "learning_rate": 3.4521300041647777e-07, + "loss": 1.5826, + "step": 31431 + }, + { + "epoch": 2.678939742606324, + "grad_norm": 51.270321457862636, + "learning_rate": 3.45031975473808e-07, + "loss": 1.5807, + "step": 31432 + }, + { + "epoch": 2.6790249723003496, + "grad_norm": 88.62997876304519, + "learning_rate": 3.448509963108987e-07, + "loss": 2.1883, + "step": 31433 + }, + { + "epoch": 2.6791102019943747, + "grad_norm": 42.25232723708644, + "learning_rate": 3.446700629295302e-07, + "loss": 1.7075, + "step": 31434 + }, + { + "epoch": 2.6791954316884, + "grad_norm": 35.276431481385245, + "learning_rate": 3.4448917533148217e-07, + "loss": 1.1052, + "step": 31435 + }, + { + "epoch": 2.6792806613824256, + "grad_norm": 18.850842921055587, + "learning_rate": 3.4430833351853264e-07, + "loss": 0.6098, + "step": 31436 + }, + { + "epoch": 2.679365891076451, + "grad_norm": 219.35410126383238, + "learning_rate": 3.441275374924613e-07, + "loss": 1.9083, + "step": 31437 + }, + { + "epoch": 2.6794511207704765, + "grad_norm": 75.923506677555, + "learning_rate": 3.439467872550445e-07, + "loss": 1.6811, + "step": 31438 + }, + { + "epoch": 2.6795363504645016, + "grad_norm": 57.907982321172696, + "learning_rate": 3.437660828080619e-07, + "loss": 1.3999, + "step": 31439 + }, + { + "epoch": 2.6796215801585275, + "grad_norm": 49.794747306112725, + "learning_rate": 3.435854241532899e-07, + "loss": 1.7356, + "step": 31440 + }, + { + "epoch": 2.6797068098525525, + "grad_norm": 38.32970955716453, + "learning_rate": 3.434048112925037e-07, + "loss": 0.9813, + "step": 31441 + }, + { + "epoch": 2.679792039546578, + "grad_norm": 71.31429565808021, + "learning_rate": 3.432242442274819e-07, + "loss": 1.4627, + "step": 31442 + }, + { + "epoch": 2.6798772692406034, + "grad_norm": 57.00658863016333, + "learning_rate": 3.430437229599992e-07, + "loss": 1.3046, + "step": 31443 + }, + { + "epoch": 2.679962498934629, + "grad_norm": 50.43274250170577, + "learning_rate": 3.4286324749183033e-07, + "loss": 1.2246, + "step": 31444 + }, + { + "epoch": 2.6800477286286544, + "grad_norm": 86.71124211205739, + "learning_rate": 3.4268281782475153e-07, + "loss": 1.8225, + "step": 31445 + }, + { + "epoch": 2.6801329583226794, + "grad_norm": 29.59819003182499, + "learning_rate": 3.4250243396053593e-07, + "loss": 1.6044, + "step": 31446 + }, + { + "epoch": 2.680218188016705, + "grad_norm": 55.420733506672214, + "learning_rate": 3.4232209590095876e-07, + "loss": 1.5307, + "step": 31447 + }, + { + "epoch": 2.6803034177107303, + "grad_norm": 55.782536602243624, + "learning_rate": 3.4214180364779306e-07, + "loss": 1.333, + "step": 31448 + }, + { + "epoch": 2.680388647404756, + "grad_norm": 70.26498028249388, + "learning_rate": 3.419615572028118e-07, + "loss": 2.5671, + "step": 31449 + }, + { + "epoch": 2.6804738770987813, + "grad_norm": 110.93574205237451, + "learning_rate": 3.4178135656778686e-07, + "loss": 2.6827, + "step": 31450 + }, + { + "epoch": 2.6805591067928067, + "grad_norm": 77.99011013980288, + "learning_rate": 3.416012017444925e-07, + "loss": 1.7772, + "step": 31451 + }, + { + "epoch": 2.680644336486832, + "grad_norm": 47.75072023565417, + "learning_rate": 3.4142109273469784e-07, + "loss": 1.388, + "step": 31452 + }, + { + "epoch": 2.6807295661808572, + "grad_norm": 45.00443501772117, + "learning_rate": 3.412410295401769e-07, + "loss": 0.8474, + "step": 31453 + }, + { + "epoch": 2.6808147958748827, + "grad_norm": 56.92964881535972, + "learning_rate": 3.410610121626984e-07, + "loss": 1.3586, + "step": 31454 + }, + { + "epoch": 2.680900025568908, + "grad_norm": 62.766476959554666, + "learning_rate": 3.4088104060403416e-07, + "loss": 1.5166, + "step": 31455 + }, + { + "epoch": 2.6809852552629336, + "grad_norm": 59.23622056421151, + "learning_rate": 3.407011148659534e-07, + "loss": 1.4083, + "step": 31456 + }, + { + "epoch": 2.681070484956959, + "grad_norm": 33.470051251166865, + "learning_rate": 3.4052123495022624e-07, + "loss": 1.374, + "step": 31457 + }, + { + "epoch": 2.6811557146509846, + "grad_norm": 47.499975698055295, + "learning_rate": 3.4034140085862087e-07, + "loss": 1.1368, + "step": 31458 + }, + { + "epoch": 2.68124094434501, + "grad_norm": 77.11744753934639, + "learning_rate": 3.4016161259290524e-07, + "loss": 1.6913, + "step": 31459 + }, + { + "epoch": 2.681326174039035, + "grad_norm": 94.5032410245213, + "learning_rate": 3.399818701548496e-07, + "loss": 1.8153, + "step": 31460 + }, + { + "epoch": 2.6814114037330605, + "grad_norm": 55.80870506869594, + "learning_rate": 3.398021735462198e-07, + "loss": 1.3029, + "step": 31461 + }, + { + "epoch": 2.681496633427086, + "grad_norm": 54.87817763946615, + "learning_rate": 3.3962252276878495e-07, + "loss": 1.4866, + "step": 31462 + }, + { + "epoch": 2.6815818631211115, + "grad_norm": 31.834065318787804, + "learning_rate": 3.3944291782430914e-07, + "loss": 1.2899, + "step": 31463 + }, + { + "epoch": 2.681667092815137, + "grad_norm": 60.15999558926179, + "learning_rate": 3.3926335871456164e-07, + "loss": 1.1795, + "step": 31464 + }, + { + "epoch": 2.681752322509162, + "grad_norm": 67.67433902308701, + "learning_rate": 3.3908384544130704e-07, + "loss": 1.4233, + "step": 31465 + }, + { + "epoch": 2.6818375522031874, + "grad_norm": 62.91750994241397, + "learning_rate": 3.389043780063106e-07, + "loss": 1.9275, + "step": 31466 + }, + { + "epoch": 2.681922781897213, + "grad_norm": 73.45744534473693, + "learning_rate": 3.387249564113365e-07, + "loss": 1.4392, + "step": 31467 + }, + { + "epoch": 2.6820080115912384, + "grad_norm": 79.29716314595832, + "learning_rate": 3.3854558065815167e-07, + "loss": 1.7363, + "step": 31468 + }, + { + "epoch": 2.682093241285264, + "grad_norm": 64.98670902092594, + "learning_rate": 3.383662507485175e-07, + "loss": 1.2329, + "step": 31469 + }, + { + "epoch": 2.6821784709792893, + "grad_norm": 46.562454783671704, + "learning_rate": 3.381869666841997e-07, + "loss": 1.8012, + "step": 31470 + }, + { + "epoch": 2.6822637006733148, + "grad_norm": 36.3952694620604, + "learning_rate": 3.3800772846695975e-07, + "loss": 1.2451, + "step": 31471 + }, + { + "epoch": 2.68234893036734, + "grad_norm": 28.865017483475242, + "learning_rate": 3.378285360985623e-07, + "loss": 1.5332, + "step": 31472 + }, + { + "epoch": 2.6824341600613653, + "grad_norm": 67.69705634254647, + "learning_rate": 3.3764938958076863e-07, + "loss": 1.4559, + "step": 31473 + }, + { + "epoch": 2.6825193897553907, + "grad_norm": 44.736389215178704, + "learning_rate": 3.3747028891534086e-07, + "loss": 1.598, + "step": 31474 + }, + { + "epoch": 2.682604619449416, + "grad_norm": 46.50546245730672, + "learning_rate": 3.372912341040385e-07, + "loss": 1.2946, + "step": 31475 + }, + { + "epoch": 2.6826898491434417, + "grad_norm": 80.08035750479395, + "learning_rate": 3.371122251486253e-07, + "loss": 1.7551, + "step": 31476 + }, + { + "epoch": 2.682775078837467, + "grad_norm": 29.321883624250436, + "learning_rate": 3.3693326205085974e-07, + "loss": 1.1197, + "step": 31477 + }, + { + "epoch": 2.6828603085314926, + "grad_norm": 38.84269510827598, + "learning_rate": 3.367543448125038e-07, + "loss": 1.3226, + "step": 31478 + }, + { + "epoch": 2.6829455382255176, + "grad_norm": 54.20524167327473, + "learning_rate": 3.365754734353155e-07, + "loss": 1.5202, + "step": 31479 + }, + { + "epoch": 2.683030767919543, + "grad_norm": 75.99964801038315, + "learning_rate": 3.3639664792105344e-07, + "loss": 1.849, + "step": 31480 + }, + { + "epoch": 2.6831159976135686, + "grad_norm": 46.08655843049052, + "learning_rate": 3.3621786827147837e-07, + "loss": 1.0653, + "step": 31481 + }, + { + "epoch": 2.683201227307594, + "grad_norm": 25.880642313701, + "learning_rate": 3.360391344883468e-07, + "loss": 0.8576, + "step": 31482 + }, + { + "epoch": 2.6832864570016195, + "grad_norm": 47.30741739536863, + "learning_rate": 3.3586044657341666e-07, + "loss": 1.1752, + "step": 31483 + }, + { + "epoch": 2.6833716866956445, + "grad_norm": 59.00903436030165, + "learning_rate": 3.356818045284466e-07, + "loss": 1.3234, + "step": 31484 + }, + { + "epoch": 2.6834569163896704, + "grad_norm": 32.21601127813432, + "learning_rate": 3.355032083551918e-07, + "loss": 0.6293, + "step": 31485 + }, + { + "epoch": 2.6835421460836955, + "grad_norm": 72.12182902782043, + "learning_rate": 3.353246580554098e-07, + "loss": 2.0841, + "step": 31486 + }, + { + "epoch": 2.683627375777721, + "grad_norm": 13.416608630169712, + "learning_rate": 3.3514615363085643e-07, + "loss": 0.3296, + "step": 31487 + }, + { + "epoch": 2.6837126054717464, + "grad_norm": 41.818077776093304, + "learning_rate": 3.349676950832875e-07, + "loss": 0.741, + "step": 31488 + }, + { + "epoch": 2.683797835165772, + "grad_norm": 55.56479318823965, + "learning_rate": 3.3478928241445606e-07, + "loss": 1.9864, + "step": 31489 + }, + { + "epoch": 2.6838830648597973, + "grad_norm": 32.005575540026946, + "learning_rate": 3.346109156261196e-07, + "loss": 1.32, + "step": 31490 + }, + { + "epoch": 2.6839682945538224, + "grad_norm": 44.5837754733725, + "learning_rate": 3.3443259472003e-07, + "loss": 1.1675, + "step": 31491 + }, + { + "epoch": 2.684053524247848, + "grad_norm": 53.61944284721183, + "learning_rate": 3.342543196979425e-07, + "loss": 0.6985, + "step": 31492 + }, + { + "epoch": 2.6841387539418733, + "grad_norm": 31.808974788005383, + "learning_rate": 3.3407609056160914e-07, + "loss": 0.8549, + "step": 31493 + }, + { + "epoch": 2.6842239836358988, + "grad_norm": 45.867421631183305, + "learning_rate": 3.3389790731278404e-07, + "loss": 1.345, + "step": 31494 + }, + { + "epoch": 2.6843092133299242, + "grad_norm": 62.230285410999755, + "learning_rate": 3.3371976995321907e-07, + "loss": 1.7425, + "step": 31495 + }, + { + "epoch": 2.6843944430239497, + "grad_norm": 58.3908122734688, + "learning_rate": 3.335416784846657e-07, + "loss": 1.2793, + "step": 31496 + }, + { + "epoch": 2.684479672717975, + "grad_norm": 42.13932804328958, + "learning_rate": 3.333636329088752e-07, + "loss": 1.167, + "step": 31497 + }, + { + "epoch": 2.684564902412, + "grad_norm": 59.839528339629624, + "learning_rate": 3.331856332275996e-07, + "loss": 1.942, + "step": 31498 + }, + { + "epoch": 2.6846501321060257, + "grad_norm": 65.04508349173699, + "learning_rate": 3.3300767944258794e-07, + "loss": 1.4805, + "step": 31499 + }, + { + "epoch": 2.684735361800051, + "grad_norm": 34.77836160987068, + "learning_rate": 3.3282977155559227e-07, + "loss": 1.0276, + "step": 31500 + }, + { + "epoch": 2.6848205914940766, + "grad_norm": 70.35829133825733, + "learning_rate": 3.326519095683606e-07, + "loss": 1.861, + "step": 31501 + }, + { + "epoch": 2.684905821188102, + "grad_norm": 44.17296687404392, + "learning_rate": 3.3247409348264317e-07, + "loss": 0.7953, + "step": 31502 + }, + { + "epoch": 2.684991050882127, + "grad_norm": 20.288125323430133, + "learning_rate": 3.3229632330018856e-07, + "loss": 0.6971, + "step": 31503 + }, + { + "epoch": 2.685076280576153, + "grad_norm": 47.59489701958262, + "learning_rate": 3.321185990227449e-07, + "loss": 1.5465, + "step": 31504 + }, + { + "epoch": 2.685161510270178, + "grad_norm": 44.882962778962636, + "learning_rate": 3.319409206520602e-07, + "loss": 0.7726, + "step": 31505 + }, + { + "epoch": 2.6852467399642035, + "grad_norm": 42.1320329307962, + "learning_rate": 3.3176328818988077e-07, + "loss": 1.4501, + "step": 31506 + }, + { + "epoch": 2.685331969658229, + "grad_norm": 23.420374661143413, + "learning_rate": 3.3158570163795477e-07, + "loss": 0.903, + "step": 31507 + }, + { + "epoch": 2.6854171993522544, + "grad_norm": 52.782726844062836, + "learning_rate": 3.31408160998028e-07, + "loss": 1.8526, + "step": 31508 + }, + { + "epoch": 2.68550242904628, + "grad_norm": 35.15892074434403, + "learning_rate": 3.312306662718473e-07, + "loss": 0.7374, + "step": 31509 + }, + { + "epoch": 2.685587658740305, + "grad_norm": 72.01697623408236, + "learning_rate": 3.3105321746115747e-07, + "loss": 1.6385, + "step": 31510 + }, + { + "epoch": 2.6856728884343304, + "grad_norm": 63.93020951012568, + "learning_rate": 3.3087581456770437e-07, + "loss": 1.0544, + "step": 31511 + }, + { + "epoch": 2.685758118128356, + "grad_norm": 112.79531502636891, + "learning_rate": 3.306984575932326e-07, + "loss": 2.6732, + "step": 31512 + }, + { + "epoch": 2.6858433478223813, + "grad_norm": 37.529628299434464, + "learning_rate": 3.3052114653948587e-07, + "loss": 0.8269, + "step": 31513 + }, + { + "epoch": 2.685928577516407, + "grad_norm": 82.15406869851164, + "learning_rate": 3.303438814082077e-07, + "loss": 1.6841, + "step": 31514 + }, + { + "epoch": 2.6860138072104323, + "grad_norm": 46.04902335366334, + "learning_rate": 3.3016666220114233e-07, + "loss": 1.323, + "step": 31515 + }, + { + "epoch": 2.6860990369044577, + "grad_norm": 80.08691601829916, + "learning_rate": 3.299894889200317e-07, + "loss": 2.0628, + "step": 31516 + }, + { + "epoch": 2.6861842665984828, + "grad_norm": 82.46039015837815, + "learning_rate": 3.2981236156661936e-07, + "loss": 2.0455, + "step": 31517 + }, + { + "epoch": 2.686269496292508, + "grad_norm": 39.452958201036594, + "learning_rate": 3.2963528014264666e-07, + "loss": 0.8967, + "step": 31518 + }, + { + "epoch": 2.6863547259865337, + "grad_norm": 33.67146896607558, + "learning_rate": 3.2945824464985455e-07, + "loss": 0.7678, + "step": 31519 + }, + { + "epoch": 2.686439955680559, + "grad_norm": 44.25316224855603, + "learning_rate": 3.2928125508998544e-07, + "loss": 1.6926, + "step": 31520 + }, + { + "epoch": 2.6865251853745846, + "grad_norm": 86.73112871985138, + "learning_rate": 3.291043114647796e-07, + "loss": 2.7006, + "step": 31521 + }, + { + "epoch": 2.6866104150686096, + "grad_norm": 60.15981099349364, + "learning_rate": 3.289274137759757e-07, + "loss": 1.7349, + "step": 31522 + }, + { + "epoch": 2.6866956447626356, + "grad_norm": 29.78986839404792, + "learning_rate": 3.2875056202531506e-07, + "loss": 0.821, + "step": 31523 + }, + { + "epoch": 2.6867808744566606, + "grad_norm": 20.010320253316934, + "learning_rate": 3.285737562145358e-07, + "loss": 0.5377, + "step": 31524 + }, + { + "epoch": 2.686866104150686, + "grad_norm": 19.477642059307932, + "learning_rate": 3.283969963453787e-07, + "loss": 0.648, + "step": 31525 + }, + { + "epoch": 2.6869513338447115, + "grad_norm": 26.596806476834903, + "learning_rate": 3.282202824195807e-07, + "loss": 0.7949, + "step": 31526 + }, + { + "epoch": 2.687036563538737, + "grad_norm": 27.343941104421706, + "learning_rate": 3.280436144388788e-07, + "loss": 0.702, + "step": 31527 + }, + { + "epoch": 2.6871217932327625, + "grad_norm": 39.41269343298929, + "learning_rate": 3.2786699240501206e-07, + "loss": 1.0958, + "step": 31528 + }, + { + "epoch": 2.6872070229267875, + "grad_norm": 16.578560916870646, + "learning_rate": 3.27690416319717e-07, + "loss": 0.5357, + "step": 31529 + }, + { + "epoch": 2.687292252620813, + "grad_norm": 70.94385090699065, + "learning_rate": 3.275138861847299e-07, + "loss": 2.2406, + "step": 31530 + }, + { + "epoch": 2.6873774823148384, + "grad_norm": 48.38677254557662, + "learning_rate": 3.273374020017872e-07, + "loss": 0.7693, + "step": 31531 + }, + { + "epoch": 2.687462712008864, + "grad_norm": 54.1501266347821, + "learning_rate": 3.271609637726242e-07, + "loss": 1.068, + "step": 31532 + }, + { + "epoch": 2.6875479417028894, + "grad_norm": 54.359200040976205, + "learning_rate": 3.2698457149897667e-07, + "loss": 1.7107, + "step": 31533 + }, + { + "epoch": 2.687633171396915, + "grad_norm": 25.351353807525637, + "learning_rate": 3.268082251825794e-07, + "loss": 0.7516, + "step": 31534 + }, + { + "epoch": 2.6877184010909403, + "grad_norm": 94.17815017089033, + "learning_rate": 3.2663192482516594e-07, + "loss": 2.4807, + "step": 31535 + }, + { + "epoch": 2.6878036307849653, + "grad_norm": 81.90082062905012, + "learning_rate": 3.264556704284694e-07, + "loss": 2.4077, + "step": 31536 + }, + { + "epoch": 2.687888860478991, + "grad_norm": 100.43607214959373, + "learning_rate": 3.2627946199422555e-07, + "loss": 1.915, + "step": 31537 + }, + { + "epoch": 2.6879740901730163, + "grad_norm": 25.794752716968418, + "learning_rate": 3.261032995241653e-07, + "loss": 0.8496, + "step": 31538 + }, + { + "epoch": 2.6880593198670417, + "grad_norm": 35.80706628833317, + "learning_rate": 3.259271830200222e-07, + "loss": 0.8697, + "step": 31539 + }, + { + "epoch": 2.688144549561067, + "grad_norm": 52.829928931228174, + "learning_rate": 3.257511124835278e-07, + "loss": 1.7226, + "step": 31540 + }, + { + "epoch": 2.688229779255092, + "grad_norm": 43.877411613393, + "learning_rate": 3.2557508791641437e-07, + "loss": 1.2592, + "step": 31541 + }, + { + "epoch": 2.688315008949118, + "grad_norm": 29.204166967741394, + "learning_rate": 3.253991093204123e-07, + "loss": 0.7599, + "step": 31542 + }, + { + "epoch": 2.688400238643143, + "grad_norm": 66.738665803551, + "learning_rate": 3.2522317669725246e-07, + "loss": 1.4907, + "step": 31543 + }, + { + "epoch": 2.6884854683371686, + "grad_norm": 77.10564326766514, + "learning_rate": 3.250472900486645e-07, + "loss": 1.3015, + "step": 31544 + }, + { + "epoch": 2.688570698031194, + "grad_norm": 78.14476086768539, + "learning_rate": 3.248714493763799e-07, + "loss": 2.5244, + "step": 31545 + }, + { + "epoch": 2.6886559277252196, + "grad_norm": 27.676735054096696, + "learning_rate": 3.246956546821256e-07, + "loss": 0.8937, + "step": 31546 + }, + { + "epoch": 2.688741157419245, + "grad_norm": 22.57529758132879, + "learning_rate": 3.245199059676324e-07, + "loss": 0.809, + "step": 31547 + }, + { + "epoch": 2.68882638711327, + "grad_norm": 42.166577608555464, + "learning_rate": 3.2434420323462887e-07, + "loss": 1.4437, + "step": 31548 + }, + { + "epoch": 2.6889116168072955, + "grad_norm": 24.967709243235202, + "learning_rate": 3.2416854648484043e-07, + "loss": 0.8401, + "step": 31549 + }, + { + "epoch": 2.688996846501321, + "grad_norm": 52.59369933184799, + "learning_rate": 3.239929357199978e-07, + "loss": 1.5389, + "step": 31550 + }, + { + "epoch": 2.6890820761953464, + "grad_norm": 40.513520511379255, + "learning_rate": 3.238173709418263e-07, + "loss": 0.9819, + "step": 31551 + }, + { + "epoch": 2.689167305889372, + "grad_norm": 31.219430155319095, + "learning_rate": 3.236418521520529e-07, + "loss": 1.1412, + "step": 31552 + }, + { + "epoch": 2.6892525355833974, + "grad_norm": 33.870706761616695, + "learning_rate": 3.234663793524029e-07, + "loss": 1.3344, + "step": 31553 + }, + { + "epoch": 2.689337765277423, + "grad_norm": 49.50705448143784, + "learning_rate": 3.2329095254460263e-07, + "loss": 0.9888, + "step": 31554 + }, + { + "epoch": 2.689422994971448, + "grad_norm": 36.794436556984984, + "learning_rate": 3.2311557173037854e-07, + "loss": 0.8491, + "step": 31555 + }, + { + "epoch": 2.6895082246654733, + "grad_norm": 45.79557966931206, + "learning_rate": 3.2294023691145426e-07, + "loss": 1.7155, + "step": 31556 + }, + { + "epoch": 2.689593454359499, + "grad_norm": 71.85217794352447, + "learning_rate": 3.2276494808955395e-07, + "loss": 1.3715, + "step": 31557 + }, + { + "epoch": 2.6896786840535243, + "grad_norm": 86.37052896861546, + "learning_rate": 3.225897052664018e-07, + "loss": 1.9942, + "step": 31558 + }, + { + "epoch": 2.6897639137475498, + "grad_norm": 25.642050255397887, + "learning_rate": 3.224145084437219e-07, + "loss": 0.6541, + "step": 31559 + }, + { + "epoch": 2.6898491434415748, + "grad_norm": 33.38616748244417, + "learning_rate": 3.222393576232363e-07, + "loss": 0.6969, + "step": 31560 + }, + { + "epoch": 2.6899343731356007, + "grad_norm": 13.734621775217763, + "learning_rate": 3.22064252806667e-07, + "loss": 0.5384, + "step": 31561 + }, + { + "epoch": 2.6900196028296257, + "grad_norm": 54.76599501159423, + "learning_rate": 3.21889193995738e-07, + "loss": 1.3536, + "step": 31562 + }, + { + "epoch": 2.690104832523651, + "grad_norm": 28.561780574104016, + "learning_rate": 3.2171418119216924e-07, + "loss": 1.0178, + "step": 31563 + }, + { + "epoch": 2.6901900622176766, + "grad_norm": 47.402394585145856, + "learning_rate": 3.2153921439768256e-07, + "loss": 1.4776, + "step": 31564 + }, + { + "epoch": 2.690275291911702, + "grad_norm": 50.875574302704976, + "learning_rate": 3.2136429361399933e-07, + "loss": 0.7242, + "step": 31565 + }, + { + "epoch": 2.6903605216057276, + "grad_norm": 60.50870494884436, + "learning_rate": 3.2118941884283827e-07, + "loss": 1.2838, + "step": 31566 + }, + { + "epoch": 2.6904457512997526, + "grad_norm": 81.10226155176836, + "learning_rate": 3.210145900859207e-07, + "loss": 1.8611, + "step": 31567 + }, + { + "epoch": 2.690530980993778, + "grad_norm": 30.83096220149953, + "learning_rate": 3.208398073449659e-07, + "loss": 0.8829, + "step": 31568 + }, + { + "epoch": 2.6906162106878035, + "grad_norm": 74.19577844507923, + "learning_rate": 3.206650706216907e-07, + "loss": 1.5555, + "step": 31569 + }, + { + "epoch": 2.690701440381829, + "grad_norm": 74.57069667455856, + "learning_rate": 3.204903799178166e-07, + "loss": 1.4735, + "step": 31570 + }, + { + "epoch": 2.6907866700758545, + "grad_norm": 17.199842006617235, + "learning_rate": 3.203157352350589e-07, + "loss": 0.4168, + "step": 31571 + }, + { + "epoch": 2.69087189976988, + "grad_norm": 41.87243772809271, + "learning_rate": 3.201411365751378e-07, + "loss": 1.1456, + "step": 31572 + }, + { + "epoch": 2.6909571294639054, + "grad_norm": 63.18462628922865, + "learning_rate": 3.199665839397687e-07, + "loss": 0.902, + "step": 31573 + }, + { + "epoch": 2.6910423591579304, + "grad_norm": 39.9466080351977, + "learning_rate": 3.197920773306673e-07, + "loss": 1.0467, + "step": 31574 + }, + { + "epoch": 2.691127588851956, + "grad_norm": 57.13005062333456, + "learning_rate": 3.196176167495524e-07, + "loss": 1.9506, + "step": 31575 + }, + { + "epoch": 2.6912128185459814, + "grad_norm": 59.647902376077674, + "learning_rate": 3.194432021981386e-07, + "loss": 1.3937, + "step": 31576 + }, + { + "epoch": 2.691298048240007, + "grad_norm": 35.15125413427031, + "learning_rate": 3.1926883367813963e-07, + "loss": 0.9851, + "step": 31577 + }, + { + "epoch": 2.6913832779340323, + "grad_norm": 44.35202534851033, + "learning_rate": 3.19094511191273e-07, + "loss": 1.1831, + "step": 31578 + }, + { + "epoch": 2.6914685076280573, + "grad_norm": 21.385869984868254, + "learning_rate": 3.1892023473925113e-07, + "loss": 0.6912, + "step": 31579 + }, + { + "epoch": 2.6915537373220833, + "grad_norm": 72.6562063671607, + "learning_rate": 3.1874600432378944e-07, + "loss": 1.7407, + "step": 31580 + }, + { + "epoch": 2.6916389670161083, + "grad_norm": 61.85264666714461, + "learning_rate": 3.1857181994660035e-07, + "loss": 1.4045, + "step": 31581 + }, + { + "epoch": 2.6917241967101337, + "grad_norm": 56.86328284509796, + "learning_rate": 3.1839768160939753e-07, + "loss": 1.4172, + "step": 31582 + }, + { + "epoch": 2.691809426404159, + "grad_norm": 28.098107224497028, + "learning_rate": 3.182235893138919e-07, + "loss": 0.6989, + "step": 31583 + }, + { + "epoch": 2.6918946560981847, + "grad_norm": 36.585630531866826, + "learning_rate": 3.1804954306179804e-07, + "loss": 1.2163, + "step": 31584 + }, + { + "epoch": 2.69197988579221, + "grad_norm": 47.12324486934823, + "learning_rate": 3.178755428548258e-07, + "loss": 2.2097, + "step": 31585 + }, + { + "epoch": 2.692065115486235, + "grad_norm": 38.5401961199888, + "learning_rate": 3.177015886946877e-07, + "loss": 1.1267, + "step": 31586 + }, + { + "epoch": 2.6921503451802606, + "grad_norm": 40.14160230588976, + "learning_rate": 3.1752768058309347e-07, + "loss": 1.1445, + "step": 31587 + }, + { + "epoch": 2.692235574874286, + "grad_norm": 30.232149142125788, + "learning_rate": 3.173538185217545e-07, + "loss": 1.2065, + "step": 31588 + }, + { + "epoch": 2.6923208045683116, + "grad_norm": 55.79697750737312, + "learning_rate": 3.1718000251238e-07, + "loss": 1.5077, + "step": 31589 + }, + { + "epoch": 2.692406034262337, + "grad_norm": 30.23764609131685, + "learning_rate": 3.1700623255667915e-07, + "loss": 0.9545, + "step": 31590 + }, + { + "epoch": 2.6924912639563625, + "grad_norm": 23.97739228436842, + "learning_rate": 3.168325086563612e-07, + "loss": 0.5097, + "step": 31591 + }, + { + "epoch": 2.692576493650388, + "grad_norm": 34.859869446037884, + "learning_rate": 3.166588308131341e-07, + "loss": 0.6104, + "step": 31592 + }, + { + "epoch": 2.692661723344413, + "grad_norm": 44.25595355127011, + "learning_rate": 3.164851990287066e-07, + "loss": 1.4176, + "step": 31593 + }, + { + "epoch": 2.6927469530384385, + "grad_norm": 51.306565739440394, + "learning_rate": 3.163116133047867e-07, + "loss": 1.366, + "step": 31594 + }, + { + "epoch": 2.692832182732464, + "grad_norm": 53.67585682592396, + "learning_rate": 3.161380736430808e-07, + "loss": 1.2879, + "step": 31595 + }, + { + "epoch": 2.6929174124264894, + "grad_norm": 27.217611101688153, + "learning_rate": 3.1596458004529484e-07, + "loss": 0.7953, + "step": 31596 + }, + { + "epoch": 2.693002642120515, + "grad_norm": 46.00565082428786, + "learning_rate": 3.157911325131369e-07, + "loss": 1.4728, + "step": 31597 + }, + { + "epoch": 2.6930878718145403, + "grad_norm": 33.04030033621484, + "learning_rate": 3.1561773104831216e-07, + "loss": 1.0455, + "step": 31598 + }, + { + "epoch": 2.693173101508566, + "grad_norm": 27.177259096367965, + "learning_rate": 3.154443756525255e-07, + "loss": 0.9438, + "step": 31599 + }, + { + "epoch": 2.693258331202591, + "grad_norm": 62.4216033544056, + "learning_rate": 3.1527106632748104e-07, + "loss": 1.4809, + "step": 31600 + }, + { + "epoch": 2.6933435608966163, + "grad_norm": 24.969512781346182, + "learning_rate": 3.1509780307488415e-07, + "loss": 0.7428, + "step": 31601 + }, + { + "epoch": 2.6934287905906418, + "grad_norm": 32.74722046254484, + "learning_rate": 3.149245858964395e-07, + "loss": 1.243, + "step": 31602 + }, + { + "epoch": 2.6935140202846672, + "grad_norm": 36.88029225530488, + "learning_rate": 3.1475141479385016e-07, + "loss": 1.1646, + "step": 31603 + }, + { + "epoch": 2.6935992499786927, + "grad_norm": 77.66824784051015, + "learning_rate": 3.1457828976881876e-07, + "loss": 2.2764, + "step": 31604 + }, + { + "epoch": 2.6936844796727177, + "grad_norm": 51.94276438683605, + "learning_rate": 3.144052108230472e-07, + "loss": 1.7761, + "step": 31605 + }, + { + "epoch": 2.6937697093667436, + "grad_norm": 99.0724357251607, + "learning_rate": 3.1423217795823915e-07, + "loss": 2.6668, + "step": 31606 + }, + { + "epoch": 2.6938549390607687, + "grad_norm": 117.59989281782813, + "learning_rate": 3.1405919117609595e-07, + "loss": 2.2092, + "step": 31607 + }, + { + "epoch": 2.693940168754794, + "grad_norm": 67.1344289684475, + "learning_rate": 3.13886250478318e-07, + "loss": 1.7097, + "step": 31608 + }, + { + "epoch": 2.6940253984488196, + "grad_norm": 52.77771361331349, + "learning_rate": 3.137133558666072e-07, + "loss": 1.6657, + "step": 31609 + }, + { + "epoch": 2.694110628142845, + "grad_norm": 65.47688488836431, + "learning_rate": 3.135405073426623e-07, + "loss": 1.9924, + "step": 31610 + }, + { + "epoch": 2.6941958578368705, + "grad_norm": 61.79510846094506, + "learning_rate": 3.133677049081857e-07, + "loss": 2.0969, + "step": 31611 + }, + { + "epoch": 2.6942810875308956, + "grad_norm": 38.62708048566623, + "learning_rate": 3.1319494856487496e-07, + "loss": 1.0634, + "step": 31612 + }, + { + "epoch": 2.694366317224921, + "grad_norm": 46.77858818071846, + "learning_rate": 3.130222383144294e-07, + "loss": 1.2186, + "step": 31613 + }, + { + "epoch": 2.6944515469189465, + "grad_norm": 54.22334820122316, + "learning_rate": 3.12849574158548e-07, + "loss": 1.2566, + "step": 31614 + }, + { + "epoch": 2.694536776612972, + "grad_norm": 49.07236652695199, + "learning_rate": 3.1267695609892846e-07, + "loss": 0.8828, + "step": 31615 + }, + { + "epoch": 2.6946220063069974, + "grad_norm": 58.668910073489705, + "learning_rate": 3.1250438413726766e-07, + "loss": 1.7394, + "step": 31616 + }, + { + "epoch": 2.694707236001023, + "grad_norm": 75.85338818746744, + "learning_rate": 3.123318582752649e-07, + "loss": 2.1571, + "step": 31617 + }, + { + "epoch": 2.6947924656950484, + "grad_norm": 40.49494470094305, + "learning_rate": 3.121593785146143e-07, + "loss": 1.0423, + "step": 31618 + }, + { + "epoch": 2.6948776953890734, + "grad_norm": 41.726108951909, + "learning_rate": 3.1198694485701455e-07, + "loss": 1.0443, + "step": 31619 + }, + { + "epoch": 2.694962925083099, + "grad_norm": 74.07881319806216, + "learning_rate": 3.118145573041603e-07, + "loss": 1.8303, + "step": 31620 + }, + { + "epoch": 2.6950481547771243, + "grad_norm": 58.06158635339031, + "learning_rate": 3.11642215857747e-07, + "loss": 1.3484, + "step": 31621 + }, + { + "epoch": 2.69513338447115, + "grad_norm": 30.968345899559615, + "learning_rate": 3.114699205194688e-07, + "loss": 0.6731, + "step": 31622 + }, + { + "epoch": 2.6952186141651753, + "grad_norm": 26.882587296991126, + "learning_rate": 3.112976712910215e-07, + "loss": 0.789, + "step": 31623 + }, + { + "epoch": 2.6953038438592003, + "grad_norm": 62.62525531245671, + "learning_rate": 3.111254681740977e-07, + "loss": 1.3551, + "step": 31624 + }, + { + "epoch": 2.695389073553226, + "grad_norm": 71.82770715304925, + "learning_rate": 3.1095331117039276e-07, + "loss": 1.9409, + "step": 31625 + }, + { + "epoch": 2.6954743032472512, + "grad_norm": 54.63553329103335, + "learning_rate": 3.107812002815974e-07, + "loss": 1.4964, + "step": 31626 + }, + { + "epoch": 2.6955595329412767, + "grad_norm": 28.225112131322916, + "learning_rate": 3.106091355094071e-07, + "loss": 1.1346, + "step": 31627 + }, + { + "epoch": 2.695644762635302, + "grad_norm": 65.10838877180586, + "learning_rate": 3.1043711685551205e-07, + "loss": 1.3061, + "step": 31628 + }, + { + "epoch": 2.6957299923293276, + "grad_norm": 25.810957531767354, + "learning_rate": 3.102651443216048e-07, + "loss": 0.9481, + "step": 31629 + }, + { + "epoch": 2.695815222023353, + "grad_norm": 66.69062970860816, + "learning_rate": 3.1009321790937575e-07, + "loss": 1.2803, + "step": 31630 + }, + { + "epoch": 2.695900451717378, + "grad_norm": 28.317690195170016, + "learning_rate": 3.099213376205168e-07, + "loss": 1.0355, + "step": 31631 + }, + { + "epoch": 2.6959856814114036, + "grad_norm": 63.28523423653497, + "learning_rate": 3.097495034567172e-07, + "loss": 1.9489, + "step": 31632 + }, + { + "epoch": 2.696070911105429, + "grad_norm": 35.43376694966286, + "learning_rate": 3.095777154196683e-07, + "loss": 0.7937, + "step": 31633 + }, + { + "epoch": 2.6961561407994545, + "grad_norm": 34.74105136313267, + "learning_rate": 3.094059735110583e-07, + "loss": 1.176, + "step": 31634 + }, + { + "epoch": 2.69624137049348, + "grad_norm": 63.289351617945705, + "learning_rate": 3.092342777325763e-07, + "loss": 1.6446, + "step": 31635 + }, + { + "epoch": 2.6963266001875055, + "grad_norm": 20.94256680593323, + "learning_rate": 3.0906262808591216e-07, + "loss": 0.7284, + "step": 31636 + }, + { + "epoch": 2.696411829881531, + "grad_norm": 63.8104768851017, + "learning_rate": 3.088910245727533e-07, + "loss": 1.4854, + "step": 31637 + }, + { + "epoch": 2.696497059575556, + "grad_norm": 59.68684538897947, + "learning_rate": 3.0871946719478687e-07, + "loss": 1.2137, + "step": 31638 + }, + { + "epoch": 2.6965822892695814, + "grad_norm": 23.569826679096305, + "learning_rate": 3.085479559536991e-07, + "loss": 0.8472, + "step": 31639 + }, + { + "epoch": 2.696667518963607, + "grad_norm": 74.77601785826054, + "learning_rate": 3.0837649085117826e-07, + "loss": 1.8107, + "step": 31640 + }, + { + "epoch": 2.6967527486576324, + "grad_norm": 63.1802069641266, + "learning_rate": 3.082050718889118e-07, + "loss": 1.5575, + "step": 31641 + }, + { + "epoch": 2.696837978351658, + "grad_norm": 31.593677789159443, + "learning_rate": 3.0803369906858337e-07, + "loss": 0.7773, + "step": 31642 + }, + { + "epoch": 2.696923208045683, + "grad_norm": 51.14351658754193, + "learning_rate": 3.0786237239187834e-07, + "loss": 1.458, + "step": 31643 + }, + { + "epoch": 2.6970084377397088, + "grad_norm": 53.24551444062537, + "learning_rate": 3.0769109186048363e-07, + "loss": 1.1781, + "step": 31644 + }, + { + "epoch": 2.697093667433734, + "grad_norm": 68.2676672181104, + "learning_rate": 3.075198574760824e-07, + "loss": 2.1409, + "step": 31645 + }, + { + "epoch": 2.6971788971277593, + "grad_norm": 56.141310152036276, + "learning_rate": 3.073486692403582e-07, + "loss": 1.0939, + "step": 31646 + }, + { + "epoch": 2.6972641268217847, + "grad_norm": 47.63940659871277, + "learning_rate": 3.071775271549948e-07, + "loss": 1.2644, + "step": 31647 + }, + { + "epoch": 2.69734935651581, + "grad_norm": 76.32632170577908, + "learning_rate": 3.0700643122167585e-07, + "loss": 1.8882, + "step": 31648 + }, + { + "epoch": 2.6974345862098357, + "grad_norm": 36.619038589075465, + "learning_rate": 3.068353814420844e-07, + "loss": 1.4971, + "step": 31649 + }, + { + "epoch": 2.6975198159038607, + "grad_norm": 61.95553583578542, + "learning_rate": 3.066643778179024e-07, + "loss": 1.2977, + "step": 31650 + }, + { + "epoch": 2.697605045597886, + "grad_norm": 30.745595864481242, + "learning_rate": 3.0649342035081077e-07, + "loss": 1.0908, + "step": 31651 + }, + { + "epoch": 2.6976902752919116, + "grad_norm": 20.74815246930881, + "learning_rate": 3.0632250904249093e-07, + "loss": 0.7187, + "step": 31652 + }, + { + "epoch": 2.697775504985937, + "grad_norm": 31.71881184349499, + "learning_rate": 3.061516438946249e-07, + "loss": 0.7433, + "step": 31653 + }, + { + "epoch": 2.6978607346799626, + "grad_norm": 45.28712752746882, + "learning_rate": 3.0598082490889194e-07, + "loss": 1.2975, + "step": 31654 + }, + { + "epoch": 2.697945964373988, + "grad_norm": 66.14587198387002, + "learning_rate": 3.058100520869717e-07, + "loss": 1.6255, + "step": 31655 + }, + { + "epoch": 2.6980311940680135, + "grad_norm": 68.14880042721657, + "learning_rate": 3.0563932543054565e-07, + "loss": 2.2356, + "step": 31656 + }, + { + "epoch": 2.6981164237620385, + "grad_norm": 53.801516658948685, + "learning_rate": 3.0546864494129024e-07, + "loss": 1.4377, + "step": 31657 + }, + { + "epoch": 2.698201653456064, + "grad_norm": 36.388052192843205, + "learning_rate": 3.0529801062088584e-07, + "loss": 1.2273, + "step": 31658 + }, + { + "epoch": 2.6982868831500895, + "grad_norm": 104.01698168553199, + "learning_rate": 3.0512742247101046e-07, + "loss": 2.2145, + "step": 31659 + }, + { + "epoch": 2.698372112844115, + "grad_norm": 59.999098210275996, + "learning_rate": 3.0495688049334006e-07, + "loss": 1.7988, + "step": 31660 + }, + { + "epoch": 2.6984573425381404, + "grad_norm": 32.100294980852006, + "learning_rate": 3.0478638468955436e-07, + "loss": 0.8107, + "step": 31661 + }, + { + "epoch": 2.6985425722321654, + "grad_norm": 79.51855427528632, + "learning_rate": 3.046159350613287e-07, + "loss": 1.5405, + "step": 31662 + }, + { + "epoch": 2.6986278019261913, + "grad_norm": 67.53276422150348, + "learning_rate": 3.044455316103384e-07, + "loss": 2.0136, + "step": 31663 + }, + { + "epoch": 2.6987130316202164, + "grad_norm": 86.3728018488855, + "learning_rate": 3.042751743382616e-07, + "loss": 1.6772, + "step": 31664 + }, + { + "epoch": 2.698798261314242, + "grad_norm": 62.39302619665839, + "learning_rate": 3.041048632467719e-07, + "loss": 1.5723, + "step": 31665 + }, + { + "epoch": 2.6988834910082673, + "grad_norm": 79.61656093961676, + "learning_rate": 3.039345983375458e-07, + "loss": 2.1559, + "step": 31666 + }, + { + "epoch": 2.6989687207022928, + "grad_norm": 34.2147561854488, + "learning_rate": 3.0376437961225634e-07, + "loss": 0.9834, + "step": 31667 + }, + { + "epoch": 2.6990539503963182, + "grad_norm": 80.82632393856186, + "learning_rate": 3.035942070725789e-07, + "loss": 2.0898, + "step": 31668 + }, + { + "epoch": 2.6991391800903433, + "grad_norm": 64.89455975601769, + "learning_rate": 3.034240807201849e-07, + "loss": 1.5497, + "step": 31669 + }, + { + "epoch": 2.6992244097843687, + "grad_norm": 51.53301179729206, + "learning_rate": 3.032540005567497e-07, + "loss": 1.331, + "step": 31670 + }, + { + "epoch": 2.699309639478394, + "grad_norm": 50.222095468174956, + "learning_rate": 3.030839665839441e-07, + "loss": 1.3882, + "step": 31671 + }, + { + "epoch": 2.6993948691724197, + "grad_norm": 33.92920759182306, + "learning_rate": 3.0291397880344244e-07, + "loss": 0.6215, + "step": 31672 + }, + { + "epoch": 2.699480098866445, + "grad_norm": 68.76907515048006, + "learning_rate": 3.0274403721691434e-07, + "loss": 1.6312, + "step": 31673 + }, + { + "epoch": 2.6995653285604706, + "grad_norm": 65.51882613651563, + "learning_rate": 3.0257414182603307e-07, + "loss": 1.8983, + "step": 31674 + }, + { + "epoch": 2.699650558254496, + "grad_norm": 68.73897279093724, + "learning_rate": 3.024042926324688e-07, + "loss": 1.8024, + "step": 31675 + }, + { + "epoch": 2.699735787948521, + "grad_norm": 66.09309685547485, + "learning_rate": 3.0223448963789147e-07, + "loss": 1.4048, + "step": 31676 + }, + { + "epoch": 2.6998210176425466, + "grad_norm": 47.038990586445244, + "learning_rate": 3.0206473284397077e-07, + "loss": 2.1042, + "step": 31677 + }, + { + "epoch": 2.699906247336572, + "grad_norm": 75.23183118300396, + "learning_rate": 3.0189502225237645e-07, + "loss": 2.0908, + "step": 31678 + }, + { + "epoch": 2.6999914770305975, + "grad_norm": 29.696568840608055, + "learning_rate": 3.0172535786477774e-07, + "loss": 0.954, + "step": 31679 + }, + { + "epoch": 2.700076706724623, + "grad_norm": 27.882875808370542, + "learning_rate": 3.0155573968284335e-07, + "loss": 0.9445, + "step": 31680 + }, + { + "epoch": 2.700161936418648, + "grad_norm": 19.95738794353074, + "learning_rate": 3.0138616770824193e-07, + "loss": 0.8477, + "step": 31681 + }, + { + "epoch": 2.700247166112674, + "grad_norm": 28.49307605649284, + "learning_rate": 3.0121664194263934e-07, + "loss": 1.1262, + "step": 31682 + }, + { + "epoch": 2.700332395806699, + "grad_norm": 44.44825550958213, + "learning_rate": 3.010471623877054e-07, + "loss": 1.2988, + "step": 31683 + }, + { + "epoch": 2.7004176255007244, + "grad_norm": 56.37951011720542, + "learning_rate": 3.008777290451048e-07, + "loss": 1.5756, + "step": 31684 + }, + { + "epoch": 2.70050285519475, + "grad_norm": 62.32782761265777, + "learning_rate": 3.0070834191650465e-07, + "loss": 1.6828, + "step": 31685 + }, + { + "epoch": 2.7005880848887753, + "grad_norm": 70.52683278226242, + "learning_rate": 3.005390010035697e-07, + "loss": 1.6202, + "step": 31686 + }, + { + "epoch": 2.700673314582801, + "grad_norm": 29.908252498869032, + "learning_rate": 3.0036970630796633e-07, + "loss": 1.198, + "step": 31687 + }, + { + "epoch": 2.700758544276826, + "grad_norm": 22.373142930377636, + "learning_rate": 3.0020045783136053e-07, + "loss": 0.6966, + "step": 31688 + }, + { + "epoch": 2.7008437739708513, + "grad_norm": 57.056693842602364, + "learning_rate": 3.0003125557541534e-07, + "loss": 1.6851, + "step": 31689 + }, + { + "epoch": 2.7009290036648768, + "grad_norm": 49.45272681403148, + "learning_rate": 2.99862099541795e-07, + "loss": 0.9515, + "step": 31690 + }, + { + "epoch": 2.7010142333589022, + "grad_norm": 46.23778131085286, + "learning_rate": 2.996929897321632e-07, + "loss": 1.2143, + "step": 31691 + }, + { + "epoch": 2.7010994630529277, + "grad_norm": 77.74938341203614, + "learning_rate": 2.995239261481836e-07, + "loss": 2.1699, + "step": 31692 + }, + { + "epoch": 2.701184692746953, + "grad_norm": 59.971415955128705, + "learning_rate": 2.9935490879151873e-07, + "loss": 1.4247, + "step": 31693 + }, + { + "epoch": 2.7012699224409786, + "grad_norm": 53.52255172369073, + "learning_rate": 2.99185937663829e-07, + "loss": 1.1939, + "step": 31694 + }, + { + "epoch": 2.7013551521350037, + "grad_norm": 54.271205936283295, + "learning_rate": 2.9901701276677795e-07, + "loss": 1.3598, + "step": 31695 + }, + { + "epoch": 2.701440381829029, + "grad_norm": 67.10096309017004, + "learning_rate": 2.988481341020277e-07, + "loss": 2.1815, + "step": 31696 + }, + { + "epoch": 2.7015256115230546, + "grad_norm": 42.25764387076699, + "learning_rate": 2.986793016712375e-07, + "loss": 1.4515, + "step": 31697 + }, + { + "epoch": 2.70161084121708, + "grad_norm": 24.922256073223494, + "learning_rate": 2.985105154760687e-07, + "loss": 0.6268, + "step": 31698 + }, + { + "epoch": 2.7016960709111055, + "grad_norm": 48.80039680053757, + "learning_rate": 2.9834177551817944e-07, + "loss": 1.6125, + "step": 31699 + }, + { + "epoch": 2.7017813006051306, + "grad_norm": 93.0378576502127, + "learning_rate": 2.981730817992318e-07, + "loss": 2.3702, + "step": 31700 + }, + { + "epoch": 2.7018665302991565, + "grad_norm": 40.229262558606635, + "learning_rate": 2.980044343208832e-07, + "loss": 1.0885, + "step": 31701 + }, + { + "epoch": 2.7019517599931815, + "grad_norm": 24.58961960430567, + "learning_rate": 2.978358330847925e-07, + "loss": 0.7248, + "step": 31702 + }, + { + "epoch": 2.702036989687207, + "grad_norm": 80.47970922217218, + "learning_rate": 2.976672780926182e-07, + "loss": 1.8349, + "step": 31703 + }, + { + "epoch": 2.7021222193812324, + "grad_norm": 49.560577762528716, + "learning_rate": 2.9749876934601687e-07, + "loss": 1.8335, + "step": 31704 + }, + { + "epoch": 2.702207449075258, + "grad_norm": 34.56609464645848, + "learning_rate": 2.973303068466471e-07, + "loss": 0.708, + "step": 31705 + }, + { + "epoch": 2.7022926787692834, + "grad_norm": 38.08713767674136, + "learning_rate": 2.971618905961654e-07, + "loss": 0.8693, + "step": 31706 + }, + { + "epoch": 2.7023779084633084, + "grad_norm": 50.28457287267791, + "learning_rate": 2.969935205962282e-07, + "loss": 1.3198, + "step": 31707 + }, + { + "epoch": 2.702463138157334, + "grad_norm": 64.70817583564687, + "learning_rate": 2.9682519684848977e-07, + "loss": 1.2728, + "step": 31708 + }, + { + "epoch": 2.7025483678513593, + "grad_norm": 78.24452259281357, + "learning_rate": 2.9665691935460707e-07, + "loss": 2.3371, + "step": 31709 + }, + { + "epoch": 2.702633597545385, + "grad_norm": 41.813385047423246, + "learning_rate": 2.9648868811623434e-07, + "loss": 0.9148, + "step": 31710 + }, + { + "epoch": 2.7027188272394103, + "grad_norm": 86.39408916171398, + "learning_rate": 2.963205031350269e-07, + "loss": 1.5424, + "step": 31711 + }, + { + "epoch": 2.7028040569334357, + "grad_norm": 92.59760210490565, + "learning_rate": 2.9615236441263794e-07, + "loss": 2.1478, + "step": 31712 + }, + { + "epoch": 2.702889286627461, + "grad_norm": 47.78030815151587, + "learning_rate": 2.9598427195072165e-07, + "loss": 1.3694, + "step": 31713 + }, + { + "epoch": 2.702974516321486, + "grad_norm": 44.8360249729892, + "learning_rate": 2.9581622575093116e-07, + "loss": 1.1997, + "step": 31714 + }, + { + "epoch": 2.7030597460155117, + "grad_norm": 52.677462803612926, + "learning_rate": 2.956482258149185e-07, + "loss": 1.8082, + "step": 31715 + }, + { + "epoch": 2.703144975709537, + "grad_norm": 78.40024322332451, + "learning_rate": 2.954802721443356e-07, + "loss": 1.6089, + "step": 31716 + }, + { + "epoch": 2.7032302054035626, + "grad_norm": 67.55297750560891, + "learning_rate": 2.953123647408357e-07, + "loss": 1.5114, + "step": 31717 + }, + { + "epoch": 2.703315435097588, + "grad_norm": 100.40350580385994, + "learning_rate": 2.951445036060685e-07, + "loss": 2.6535, + "step": 31718 + }, + { + "epoch": 2.7034006647916136, + "grad_norm": 18.035969046469287, + "learning_rate": 2.9497668874168604e-07, + "loss": 0.5295, + "step": 31719 + }, + { + "epoch": 2.703485894485639, + "grad_norm": 66.37688292748945, + "learning_rate": 2.9480892014933813e-07, + "loss": 1.6196, + "step": 31720 + }, + { + "epoch": 2.703571124179664, + "grad_norm": 46.737981331297135, + "learning_rate": 2.9464119783067515e-07, + "loss": 1.4411, + "step": 31721 + }, + { + "epoch": 2.7036563538736895, + "grad_norm": 40.509939821389764, + "learning_rate": 2.944735217873462e-07, + "loss": 0.9169, + "step": 31722 + }, + { + "epoch": 2.703741583567715, + "grad_norm": 58.819620904915666, + "learning_rate": 2.9430589202100014e-07, + "loss": 1.9901, + "step": 31723 + }, + { + "epoch": 2.7038268132617405, + "grad_norm": 32.49231387236733, + "learning_rate": 2.941383085332861e-07, + "loss": 1.2349, + "step": 31724 + }, + { + "epoch": 2.703912042955766, + "grad_norm": 68.7712962259532, + "learning_rate": 2.9397077132585106e-07, + "loss": 1.2529, + "step": 31725 + }, + { + "epoch": 2.703997272649791, + "grad_norm": 84.83617971172195, + "learning_rate": 2.938032804003438e-07, + "loss": 2.2332, + "step": 31726 + }, + { + "epoch": 2.704082502343817, + "grad_norm": 53.05377331551314, + "learning_rate": 2.936358357584118e-07, + "loss": 1.6051, + "step": 31727 + }, + { + "epoch": 2.704167732037842, + "grad_norm": 40.575565038476476, + "learning_rate": 2.934684374017011e-07, + "loss": 1.2666, + "step": 31728 + }, + { + "epoch": 2.7042529617318674, + "grad_norm": 50.52709624768665, + "learning_rate": 2.9330108533185743e-07, + "loss": 1.5876, + "step": 31729 + }, + { + "epoch": 2.704338191425893, + "grad_norm": 86.01429411853009, + "learning_rate": 2.9313377955052793e-07, + "loss": 2.1546, + "step": 31730 + }, + { + "epoch": 2.7044234211199183, + "grad_norm": 61.598289013686724, + "learning_rate": 2.9296652005935735e-07, + "loss": 2.1171, + "step": 31731 + }, + { + "epoch": 2.7045086508139438, + "grad_norm": 61.37629281512164, + "learning_rate": 2.92799306859991e-07, + "loss": 0.8836, + "step": 31732 + }, + { + "epoch": 2.704593880507969, + "grad_norm": 25.820763251282337, + "learning_rate": 2.9263213995407215e-07, + "loss": 0.95, + "step": 31733 + }, + { + "epoch": 2.7046791102019943, + "grad_norm": 66.78885008884528, + "learning_rate": 2.9246501934324545e-07, + "loss": 1.5722, + "step": 31734 + }, + { + "epoch": 2.7047643398960197, + "grad_norm": 25.744127528361865, + "learning_rate": 2.922979450291552e-07, + "loss": 0.8482, + "step": 31735 + }, + { + "epoch": 2.704849569590045, + "grad_norm": 95.99304006364635, + "learning_rate": 2.9213091701344453e-07, + "loss": 1.9365, + "step": 31736 + }, + { + "epoch": 2.7049347992840707, + "grad_norm": 15.440331507092925, + "learning_rate": 2.919639352977555e-07, + "loss": 0.6837, + "step": 31737 + }, + { + "epoch": 2.705020028978096, + "grad_norm": 39.90202261678087, + "learning_rate": 2.9179699988372946e-07, + "loss": 1.3753, + "step": 31738 + }, + { + "epoch": 2.7051052586721216, + "grad_norm": 71.15427495498189, + "learning_rate": 2.9163011077301016e-07, + "loss": 1.3819, + "step": 31739 + }, + { + "epoch": 2.7051904883661466, + "grad_norm": 54.15139854711823, + "learning_rate": 2.9146326796723744e-07, + "loss": 1.5313, + "step": 31740 + }, + { + "epoch": 2.705275718060172, + "grad_norm": 46.56571003915383, + "learning_rate": 2.9129647146805164e-07, + "loss": 0.9035, + "step": 31741 + }, + { + "epoch": 2.7053609477541976, + "grad_norm": 80.40489212587246, + "learning_rate": 2.9112972127709416e-07, + "loss": 1.9377, + "step": 31742 + }, + { + "epoch": 2.705446177448223, + "grad_norm": 28.342599615631563, + "learning_rate": 2.9096301739600595e-07, + "loss": 0.7903, + "step": 31743 + }, + { + "epoch": 2.7055314071422485, + "grad_norm": 17.53012441985523, + "learning_rate": 2.9079635982642465e-07, + "loss": 0.56, + "step": 31744 + }, + { + "epoch": 2.7056166368362735, + "grad_norm": 46.606536825624595, + "learning_rate": 2.9062974856998995e-07, + "loss": 1.66, + "step": 31745 + }, + { + "epoch": 2.7057018665302994, + "grad_norm": 39.86510599140281, + "learning_rate": 2.9046318362834e-07, + "loss": 1.1117, + "step": 31746 + }, + { + "epoch": 2.7057870962243245, + "grad_norm": 71.85023727337722, + "learning_rate": 2.902966650031136e-07, + "loss": 1.6581, + "step": 31747 + }, + { + "epoch": 2.70587232591835, + "grad_norm": 55.32550986430544, + "learning_rate": 2.9013019269594824e-07, + "loss": 1.2659, + "step": 31748 + }, + { + "epoch": 2.7059575556123754, + "grad_norm": 87.08731472131345, + "learning_rate": 2.8996376670848037e-07, + "loss": 1.9154, + "step": 31749 + }, + { + "epoch": 2.706042785306401, + "grad_norm": 37.32274032445159, + "learning_rate": 2.8979738704234764e-07, + "loss": 1.9871, + "step": 31750 + }, + { + "epoch": 2.7061280150004263, + "grad_norm": 39.0452891366724, + "learning_rate": 2.896310536991853e-07, + "loss": 1.0146, + "step": 31751 + }, + { + "epoch": 2.7062132446944513, + "grad_norm": 44.367349751558805, + "learning_rate": 2.894647666806305e-07, + "loss": 1.1661, + "step": 31752 + }, + { + "epoch": 2.706298474388477, + "grad_norm": 48.60011238207334, + "learning_rate": 2.8929852598831796e-07, + "loss": 1.3847, + "step": 31753 + }, + { + "epoch": 2.7063837040825023, + "grad_norm": 64.92089287851438, + "learning_rate": 2.891323316238831e-07, + "loss": 1.4815, + "step": 31754 + }, + { + "epoch": 2.7064689337765278, + "grad_norm": 35.99724675086525, + "learning_rate": 2.8896618358895847e-07, + "loss": 1.2519, + "step": 31755 + }, + { + "epoch": 2.706554163470553, + "grad_norm": 28.111015410431506, + "learning_rate": 2.888000818851805e-07, + "loss": 1.0897, + "step": 31756 + }, + { + "epoch": 2.7066393931645787, + "grad_norm": 23.247412158059248, + "learning_rate": 2.886340265141807e-07, + "loss": 0.886, + "step": 31757 + }, + { + "epoch": 2.706724622858604, + "grad_norm": 27.602124266968985, + "learning_rate": 2.884680174775939e-07, + "loss": 0.8657, + "step": 31758 + }, + { + "epoch": 2.706809852552629, + "grad_norm": 74.5505885225113, + "learning_rate": 2.8830205477705156e-07, + "loss": 1.5149, + "step": 31759 + }, + { + "epoch": 2.7068950822466546, + "grad_norm": 52.34480860792931, + "learning_rate": 2.8813613841418673e-07, + "loss": 1.9383, + "step": 31760 + }, + { + "epoch": 2.70698031194068, + "grad_norm": 72.15522750638895, + "learning_rate": 2.879702683906305e-07, + "loss": 2.0004, + "step": 31761 + }, + { + "epoch": 2.7070655416347056, + "grad_norm": 44.32133900540048, + "learning_rate": 2.878044447080147e-07, + "loss": 1.5542, + "step": 31762 + }, + { + "epoch": 2.707150771328731, + "grad_norm": 23.13516800027218, + "learning_rate": 2.876386673679687e-07, + "loss": 1.1582, + "step": 31763 + }, + { + "epoch": 2.707236001022756, + "grad_norm": 27.519072274570767, + "learning_rate": 2.8747293637212516e-07, + "loss": 0.9664, + "step": 31764 + }, + { + "epoch": 2.707321230716782, + "grad_norm": 30.35136531494907, + "learning_rate": 2.873072517221115e-07, + "loss": 0.8749, + "step": 31765 + }, + { + "epoch": 2.707406460410807, + "grad_norm": 95.85434600094194, + "learning_rate": 2.871416134195593e-07, + "loss": 1.71, + "step": 31766 + }, + { + "epoch": 2.7074916901048325, + "grad_norm": 66.04030489337478, + "learning_rate": 2.8697602146609604e-07, + "loss": 1.854, + "step": 31767 + }, + { + "epoch": 2.707576919798858, + "grad_norm": 56.64296369546077, + "learning_rate": 2.8681047586335055e-07, + "loss": 1.8717, + "step": 31768 + }, + { + "epoch": 2.7076621494928834, + "grad_norm": 38.09772582551197, + "learning_rate": 2.86644976612952e-07, + "loss": 1.1722, + "step": 31769 + }, + { + "epoch": 2.707747379186909, + "grad_norm": 82.60262352294957, + "learning_rate": 2.8647952371652743e-07, + "loss": 1.6984, + "step": 31770 + }, + { + "epoch": 2.707832608880934, + "grad_norm": 50.24381216286922, + "learning_rate": 2.8631411717570336e-07, + "loss": 0.9427, + "step": 31771 + }, + { + "epoch": 2.7079178385749594, + "grad_norm": 46.81683965966984, + "learning_rate": 2.861487569921062e-07, + "loss": 1.2604, + "step": 31772 + }, + { + "epoch": 2.708003068268985, + "grad_norm": 65.85668822690286, + "learning_rate": 2.8598344316736304e-07, + "loss": 1.8589, + "step": 31773 + }, + { + "epoch": 2.7080882979630103, + "grad_norm": 94.96044738048849, + "learning_rate": 2.8581817570310034e-07, + "loss": 1.6616, + "step": 31774 + }, + { + "epoch": 2.708173527657036, + "grad_norm": 70.70018284183786, + "learning_rate": 2.8565295460094287e-07, + "loss": 1.5581, + "step": 31775 + }, + { + "epoch": 2.7082587573510613, + "grad_norm": 115.24849520172711, + "learning_rate": 2.854877798625144e-07, + "loss": 2.9105, + "step": 31776 + }, + { + "epoch": 2.7083439870450867, + "grad_norm": 43.266727459690955, + "learning_rate": 2.8532265148944083e-07, + "loss": 1.1498, + "step": 31777 + }, + { + "epoch": 2.7084292167391117, + "grad_norm": 19.70354760470321, + "learning_rate": 2.851575694833453e-07, + "loss": 0.9007, + "step": 31778 + }, + { + "epoch": 2.708514446433137, + "grad_norm": 50.19403064241771, + "learning_rate": 2.84992533845852e-07, + "loss": 1.1233, + "step": 31779 + }, + { + "epoch": 2.7085996761271627, + "grad_norm": 42.508996932139596, + "learning_rate": 2.848275445785825e-07, + "loss": 1.2528, + "step": 31780 + }, + { + "epoch": 2.708684905821188, + "grad_norm": 55.305062279997344, + "learning_rate": 2.8466260168316105e-07, + "loss": 1.6653, + "step": 31781 + }, + { + "epoch": 2.7087701355152136, + "grad_norm": 44.624797134422785, + "learning_rate": 2.8449770516120967e-07, + "loss": 1.0878, + "step": 31782 + }, + { + "epoch": 2.7088553652092386, + "grad_norm": 36.77501110085709, + "learning_rate": 2.8433285501434925e-07, + "loss": 0.674, + "step": 31783 + }, + { + "epoch": 2.7089405949032646, + "grad_norm": 62.8628774932624, + "learning_rate": 2.841680512442019e-07, + "loss": 1.859, + "step": 31784 + }, + { + "epoch": 2.7090258245972896, + "grad_norm": 78.7611502092218, + "learning_rate": 2.8400329385238676e-07, + "loss": 1.6694, + "step": 31785 + }, + { + "epoch": 2.709111054291315, + "grad_norm": 29.981672026052266, + "learning_rate": 2.83838582840526e-07, + "loss": 0.8771, + "step": 31786 + }, + { + "epoch": 2.7091962839853405, + "grad_norm": 71.85461839173512, + "learning_rate": 2.8367391821023884e-07, + "loss": 1.6387, + "step": 31787 + }, + { + "epoch": 2.709281513679366, + "grad_norm": 62.94720964356793, + "learning_rate": 2.8350929996314393e-07, + "loss": 1.5983, + "step": 31788 + }, + { + "epoch": 2.7093667433733915, + "grad_norm": 34.89186996106942, + "learning_rate": 2.8334472810086113e-07, + "loss": 0.5422, + "step": 31789 + }, + { + "epoch": 2.7094519730674165, + "grad_norm": 40.9878070146162, + "learning_rate": 2.831802026250091e-07, + "loss": 1.1691, + "step": 31790 + }, + { + "epoch": 2.709537202761442, + "grad_norm": 81.97441513101089, + "learning_rate": 2.830157235372055e-07, + "loss": 1.666, + "step": 31791 + }, + { + "epoch": 2.7096224324554674, + "grad_norm": 63.181856481081795, + "learning_rate": 2.828512908390679e-07, + "loss": 1.4842, + "step": 31792 + }, + { + "epoch": 2.709707662149493, + "grad_norm": 56.03279317307461, + "learning_rate": 2.826869045322128e-07, + "loss": 1.4683, + "step": 31793 + }, + { + "epoch": 2.7097928918435183, + "grad_norm": 66.47936153830089, + "learning_rate": 2.8252256461825835e-07, + "loss": 1.9297, + "step": 31794 + }, + { + "epoch": 2.709878121537544, + "grad_norm": 60.278416499259485, + "learning_rate": 2.8235827109881986e-07, + "loss": 1.758, + "step": 31795 + }, + { + "epoch": 2.7099633512315693, + "grad_norm": 54.656800708240105, + "learning_rate": 2.8219402397551275e-07, + "loss": 1.3011, + "step": 31796 + }, + { + "epoch": 2.7100485809255943, + "grad_norm": 48.868899683553664, + "learning_rate": 2.82029823249953e-07, + "loss": 1.6058, + "step": 31797 + }, + { + "epoch": 2.7101338106196198, + "grad_norm": 55.68039767506797, + "learning_rate": 2.818656689237548e-07, + "loss": 1.5099, + "step": 31798 + }, + { + "epoch": 2.7102190403136452, + "grad_norm": 25.837594513748584, + "learning_rate": 2.817015609985335e-07, + "loss": 1.0799, + "step": 31799 + }, + { + "epoch": 2.7103042700076707, + "grad_norm": 39.37165245236175, + "learning_rate": 2.815374994759024e-07, + "loss": 0.9573, + "step": 31800 + }, + { + "epoch": 2.710389499701696, + "grad_norm": 62.291244015782254, + "learning_rate": 2.8137348435747503e-07, + "loss": 1.5267, + "step": 31801 + }, + { + "epoch": 2.710474729395721, + "grad_norm": 54.49449013448384, + "learning_rate": 2.812095156448641e-07, + "loss": 1.2679, + "step": 31802 + }, + { + "epoch": 2.710559959089747, + "grad_norm": 33.88920980483397, + "learning_rate": 2.810455933396833e-07, + "loss": 0.8434, + "step": 31803 + }, + { + "epoch": 2.710645188783772, + "grad_norm": 51.95725071037786, + "learning_rate": 2.8088171744354244e-07, + "loss": 1.6706, + "step": 31804 + }, + { + "epoch": 2.7107304184777976, + "grad_norm": 61.91157323017589, + "learning_rate": 2.807178879580563e-07, + "loss": 1.2535, + "step": 31805 + }, + { + "epoch": 2.710815648171823, + "grad_norm": 25.90202393924797, + "learning_rate": 2.8055410488483313e-07, + "loss": 0.9659, + "step": 31806 + }, + { + "epoch": 2.7109008778658485, + "grad_norm": 64.04662904066934, + "learning_rate": 2.8039036822548604e-07, + "loss": 1.0231, + "step": 31807 + }, + { + "epoch": 2.710986107559874, + "grad_norm": 23.593098997321878, + "learning_rate": 2.802266779816243e-07, + "loss": 0.8296, + "step": 31808 + }, + { + "epoch": 2.711071337253899, + "grad_norm": 27.064473956760526, + "learning_rate": 2.800630341548577e-07, + "loss": 0.8555, + "step": 31809 + }, + { + "epoch": 2.7111565669479245, + "grad_norm": 41.15702014983717, + "learning_rate": 2.798994367467955e-07, + "loss": 0.8944, + "step": 31810 + }, + { + "epoch": 2.71124179664195, + "grad_norm": 68.74513511650893, + "learning_rate": 2.7973588575904596e-07, + "loss": 1.5081, + "step": 31811 + }, + { + "epoch": 2.7113270263359754, + "grad_norm": 31.704816620072773, + "learning_rate": 2.795723811932188e-07, + "loss": 0.8754, + "step": 31812 + }, + { + "epoch": 2.711412256030001, + "grad_norm": 73.34913791065827, + "learning_rate": 2.794089230509217e-07, + "loss": 1.8907, + "step": 31813 + }, + { + "epoch": 2.7114974857240264, + "grad_norm": 116.22056737899074, + "learning_rate": 2.792455113337622e-07, + "loss": 2.8371, + "step": 31814 + }, + { + "epoch": 2.711582715418052, + "grad_norm": 77.50295890406085, + "learning_rate": 2.7908214604334684e-07, + "loss": 1.9467, + "step": 31815 + }, + { + "epoch": 2.711667945112077, + "grad_norm": 22.665177352625197, + "learning_rate": 2.7891882718128315e-07, + "loss": 0.7815, + "step": 31816 + }, + { + "epoch": 2.7117531748061023, + "grad_norm": 45.41346769926715, + "learning_rate": 2.7875555474917714e-07, + "loss": 1.3592, + "step": 31817 + }, + { + "epoch": 2.711838404500128, + "grad_norm": 77.21985679125237, + "learning_rate": 2.7859232874863364e-07, + "loss": 2.244, + "step": 31818 + }, + { + "epoch": 2.7119236341941533, + "grad_norm": 66.36944303013337, + "learning_rate": 2.784291491812585e-07, + "loss": 1.7209, + "step": 31819 + }, + { + "epoch": 2.7120088638881787, + "grad_norm": 53.273641589116465, + "learning_rate": 2.7826601604865555e-07, + "loss": 1.6879, + "step": 31820 + }, + { + "epoch": 2.7120940935822038, + "grad_norm": 72.6650190100367, + "learning_rate": 2.781029293524318e-07, + "loss": 1.6779, + "step": 31821 + }, + { + "epoch": 2.7121793232762297, + "grad_norm": 76.49874686451989, + "learning_rate": 2.7793988909418867e-07, + "loss": 1.6532, + "step": 31822 + }, + { + "epoch": 2.7122645529702547, + "grad_norm": 32.53952894094851, + "learning_rate": 2.7777689527553e-07, + "loss": 1.1484, + "step": 31823 + }, + { + "epoch": 2.71234978266428, + "grad_norm": 63.74989930102081, + "learning_rate": 2.7761394789805994e-07, + "loss": 1.3027, + "step": 31824 + }, + { + "epoch": 2.7124350123583056, + "grad_norm": 72.81978518161785, + "learning_rate": 2.774510469633801e-07, + "loss": 1.3465, + "step": 31825 + }, + { + "epoch": 2.712520242052331, + "grad_norm": 35.97652606964, + "learning_rate": 2.772881924730925e-07, + "loss": 0.926, + "step": 31826 + }, + { + "epoch": 2.7126054717463566, + "grad_norm": 75.67329882604066, + "learning_rate": 2.7712538442879855e-07, + "loss": 2.0283, + "step": 31827 + }, + { + "epoch": 2.7126907014403816, + "grad_norm": 28.971405576544722, + "learning_rate": 2.769626228320993e-07, + "loss": 0.8513, + "step": 31828 + }, + { + "epoch": 2.712775931134407, + "grad_norm": 56.65887378349615, + "learning_rate": 2.767999076845973e-07, + "loss": 1.5964, + "step": 31829 + }, + { + "epoch": 2.7128611608284325, + "grad_norm": 44.418242143332286, + "learning_rate": 2.766372389878907e-07, + "loss": 1.325, + "step": 31830 + }, + { + "epoch": 2.712946390522458, + "grad_norm": 46.077139732426204, + "learning_rate": 2.764746167435806e-07, + "loss": 1.3067, + "step": 31831 + }, + { + "epoch": 2.7130316202164835, + "grad_norm": 41.31933985431292, + "learning_rate": 2.763120409532644e-07, + "loss": 1.3066, + "step": 31832 + }, + { + "epoch": 2.713116849910509, + "grad_norm": 65.46736599005226, + "learning_rate": 2.761495116185431e-07, + "loss": 1.4387, + "step": 31833 + }, + { + "epoch": 2.7132020796045344, + "grad_norm": 27.389253797469415, + "learning_rate": 2.759870287410144e-07, + "loss": 1.0595, + "step": 31834 + }, + { + "epoch": 2.7132873092985594, + "grad_norm": 31.944238630216052, + "learning_rate": 2.758245923222758e-07, + "loss": 0.7536, + "step": 31835 + }, + { + "epoch": 2.713372538992585, + "grad_norm": 23.012954039512692, + "learning_rate": 2.756622023639244e-07, + "loss": 0.7851, + "step": 31836 + }, + { + "epoch": 2.7134577686866104, + "grad_norm": 29.746543822657834, + "learning_rate": 2.754998588675589e-07, + "loss": 0.7195, + "step": 31837 + }, + { + "epoch": 2.713542998380636, + "grad_norm": 44.55410007061586, + "learning_rate": 2.7533756183477534e-07, + "loss": 1.4915, + "step": 31838 + }, + { + "epoch": 2.7136282280746613, + "grad_norm": 54.252888016019064, + "learning_rate": 2.75175311267169e-07, + "loss": 1.1158, + "step": 31839 + }, + { + "epoch": 2.7137134577686868, + "grad_norm": 42.70133739245172, + "learning_rate": 2.7501310716633643e-07, + "loss": 1.1983, + "step": 31840 + }, + { + "epoch": 2.7137986874627122, + "grad_norm": 59.2905140139606, + "learning_rate": 2.7485094953387137e-07, + "loss": 1.8302, + "step": 31841 + }, + { + "epoch": 2.7138839171567373, + "grad_norm": 32.85992859174147, + "learning_rate": 2.7468883837137027e-07, + "loss": 1.3561, + "step": 31842 + }, + { + "epoch": 2.7139691468507627, + "grad_norm": 70.25838723840761, + "learning_rate": 2.7452677368042634e-07, + "loss": 1.5014, + "step": 31843 + }, + { + "epoch": 2.714054376544788, + "grad_norm": 51.57593409499749, + "learning_rate": 2.7436475546263496e-07, + "loss": 1.5281, + "step": 31844 + }, + { + "epoch": 2.7141396062388137, + "grad_norm": 45.76100927892878, + "learning_rate": 2.742027837195871e-07, + "loss": 1.2392, + "step": 31845 + }, + { + "epoch": 2.714224835932839, + "grad_norm": 35.040406272893605, + "learning_rate": 2.740408584528781e-07, + "loss": 0.9039, + "step": 31846 + }, + { + "epoch": 2.714310065626864, + "grad_norm": 56.34587969789538, + "learning_rate": 2.7387897966409894e-07, + "loss": 1.9706, + "step": 31847 + }, + { + "epoch": 2.7143952953208896, + "grad_norm": 33.44141071862671, + "learning_rate": 2.7371714735484224e-07, + "loss": 1.3042, + "step": 31848 + }, + { + "epoch": 2.714480525014915, + "grad_norm": 28.042122057108376, + "learning_rate": 2.7355536152669835e-07, + "loss": 1.1201, + "step": 31849 + }, + { + "epoch": 2.7145657547089406, + "grad_norm": 28.93004534117915, + "learning_rate": 2.7339362218126055e-07, + "loss": 0.8456, + "step": 31850 + }, + { + "epoch": 2.714650984402966, + "grad_norm": 16.492431393644868, + "learning_rate": 2.7323192932011744e-07, + "loss": 0.5713, + "step": 31851 + }, + { + "epoch": 2.7147362140969915, + "grad_norm": 39.68757493942157, + "learning_rate": 2.7307028294486114e-07, + "loss": 0.7654, + "step": 31852 + }, + { + "epoch": 2.714821443791017, + "grad_norm": 88.93616208339279, + "learning_rate": 2.7290868305707987e-07, + "loss": 2.3131, + "step": 31853 + }, + { + "epoch": 2.714906673485042, + "grad_norm": 67.50027529941325, + "learning_rate": 2.727471296583628e-07, + "loss": 2.2628, + "step": 31854 + }, + { + "epoch": 2.7149919031790675, + "grad_norm": 36.48117759785528, + "learning_rate": 2.7258562275029986e-07, + "loss": 0.8024, + "step": 31855 + }, + { + "epoch": 2.715077132873093, + "grad_norm": 83.51857713963072, + "learning_rate": 2.7242416233447866e-07, + "loss": 2.2384, + "step": 31856 + }, + { + "epoch": 2.7151623625671184, + "grad_norm": 77.47698039913018, + "learning_rate": 2.7226274841248736e-07, + "loss": 1.6418, + "step": 31857 + }, + { + "epoch": 2.715247592261144, + "grad_norm": 32.44517385838184, + "learning_rate": 2.72101380985913e-07, + "loss": 0.965, + "step": 31858 + }, + { + "epoch": 2.7153328219551693, + "grad_norm": 57.8203919228273, + "learning_rate": 2.7194006005634266e-07, + "loss": 0.9343, + "step": 31859 + }, + { + "epoch": 2.715418051649195, + "grad_norm": 44.267495238109966, + "learning_rate": 2.717787856253634e-07, + "loss": 1.1215, + "step": 31860 + }, + { + "epoch": 2.71550328134322, + "grad_norm": 94.52124195500743, + "learning_rate": 2.716175576945612e-07, + "loss": 1.7122, + "step": 31861 + }, + { + "epoch": 2.7155885110372453, + "grad_norm": 34.70167505381896, + "learning_rate": 2.714563762655209e-07, + "loss": 0.878, + "step": 31862 + }, + { + "epoch": 2.7156737407312708, + "grad_norm": 75.02357787205288, + "learning_rate": 2.71295241339829e-07, + "loss": 1.8225, + "step": 31863 + }, + { + "epoch": 2.7157589704252962, + "grad_norm": 35.15920291158412, + "learning_rate": 2.711341529190692e-07, + "loss": 0.739, + "step": 31864 + }, + { + "epoch": 2.7158442001193217, + "grad_norm": 47.377275286689155, + "learning_rate": 2.7097311100482635e-07, + "loss": 1.9347, + "step": 31865 + }, + { + "epoch": 2.7159294298133467, + "grad_norm": 62.341223802690365, + "learning_rate": 2.708121155986826e-07, + "loss": 0.9827, + "step": 31866 + }, + { + "epoch": 2.7160146595073726, + "grad_norm": 22.638412295291253, + "learning_rate": 2.706511667022227e-07, + "loss": 0.6363, + "step": 31867 + }, + { + "epoch": 2.7160998892013977, + "grad_norm": 66.36838618942501, + "learning_rate": 2.7049026431702986e-07, + "loss": 1.1177, + "step": 31868 + }, + { + "epoch": 2.716185118895423, + "grad_norm": 64.75186549261467, + "learning_rate": 2.7032940844468614e-07, + "loss": 1.7021, + "step": 31869 + }, + { + "epoch": 2.7162703485894486, + "grad_norm": 60.565754080879756, + "learning_rate": 2.7016859908677306e-07, + "loss": 1.3788, + "step": 31870 + }, + { + "epoch": 2.716355578283474, + "grad_norm": 58.39843073274982, + "learning_rate": 2.7000783624487157e-07, + "loss": 1.9593, + "step": 31871 + }, + { + "epoch": 2.7164408079774995, + "grad_norm": 32.086838135231275, + "learning_rate": 2.698471199205649e-07, + "loss": 1.037, + "step": 31872 + }, + { + "epoch": 2.7165260376715246, + "grad_norm": 66.1508593764463, + "learning_rate": 2.696864501154317e-07, + "loss": 1.8664, + "step": 31873 + }, + { + "epoch": 2.71661126736555, + "grad_norm": 42.80641831835334, + "learning_rate": 2.695258268310519e-07, + "loss": 1.4661, + "step": 31874 + }, + { + "epoch": 2.7166964970595755, + "grad_norm": 96.35697203031813, + "learning_rate": 2.693652500690058e-07, + "loss": 1.9503, + "step": 31875 + }, + { + "epoch": 2.716781726753601, + "grad_norm": 22.89090155283091, + "learning_rate": 2.692047198308739e-07, + "loss": 0.8437, + "step": 31876 + }, + { + "epoch": 2.7168669564476264, + "grad_norm": 52.61823176780746, + "learning_rate": 2.6904423611823325e-07, + "loss": 1.2981, + "step": 31877 + }, + { + "epoch": 2.716952186141652, + "grad_norm": 69.36863174945599, + "learning_rate": 2.688837989326626e-07, + "loss": 1.7516, + "step": 31878 + }, + { + "epoch": 2.7170374158356774, + "grad_norm": 66.96186417139864, + "learning_rate": 2.687234082757395e-07, + "loss": 1.2574, + "step": 31879 + }, + { + "epoch": 2.7171226455297024, + "grad_norm": 62.56801575142721, + "learning_rate": 2.685630641490422e-07, + "loss": 1.4887, + "step": 31880 + }, + { + "epoch": 2.717207875223728, + "grad_norm": 41.52450240893604, + "learning_rate": 2.684027665541472e-07, + "loss": 0.988, + "step": 31881 + }, + { + "epoch": 2.7172931049177533, + "grad_norm": 18.150018729207044, + "learning_rate": 2.682425154926299e-07, + "loss": 0.4839, + "step": 31882 + }, + { + "epoch": 2.717378334611779, + "grad_norm": 43.44953807558812, + "learning_rate": 2.6808231096606793e-07, + "loss": 1.1701, + "step": 31883 + }, + { + "epoch": 2.7174635643058043, + "grad_norm": 71.78675872128692, + "learning_rate": 2.6792215297603565e-07, + "loss": 1.9718, + "step": 31884 + }, + { + "epoch": 2.7175487939998293, + "grad_norm": 44.98014254741501, + "learning_rate": 2.6776204152410947e-07, + "loss": 0.9309, + "step": 31885 + }, + { + "epoch": 2.717634023693855, + "grad_norm": 85.9163299486438, + "learning_rate": 2.6760197661186315e-07, + "loss": 1.6155, + "step": 31886 + }, + { + "epoch": 2.7177192533878802, + "grad_norm": 30.495420813480784, + "learning_rate": 2.6744195824087047e-07, + "loss": 1.4195, + "step": 31887 + }, + { + "epoch": 2.7178044830819057, + "grad_norm": 42.66252836811539, + "learning_rate": 2.672819864127052e-07, + "loss": 1.1247, + "step": 31888 + }, + { + "epoch": 2.717889712775931, + "grad_norm": 55.12562396541542, + "learning_rate": 2.671220611289416e-07, + "loss": 1.7904, + "step": 31889 + }, + { + "epoch": 2.7179749424699566, + "grad_norm": 30.768935452094, + "learning_rate": 2.669621823911517e-07, + "loss": 1.2145, + "step": 31890 + }, + { + "epoch": 2.718060172163982, + "grad_norm": 27.250444080354615, + "learning_rate": 2.6680235020090815e-07, + "loss": 0.6296, + "step": 31891 + }, + { + "epoch": 2.718145401858007, + "grad_norm": 71.23875002895053, + "learning_rate": 2.66642564559782e-07, + "loss": 1.3749, + "step": 31892 + }, + { + "epoch": 2.7182306315520326, + "grad_norm": 64.26781003090785, + "learning_rate": 2.664828254693463e-07, + "loss": 1.4041, + "step": 31893 + }, + { + "epoch": 2.718315861246058, + "grad_norm": 50.39172127434292, + "learning_rate": 2.6632313293117106e-07, + "loss": 1.4337, + "step": 31894 + }, + { + "epoch": 2.7184010909400835, + "grad_norm": 43.983727073780344, + "learning_rate": 2.661634869468266e-07, + "loss": 1.7917, + "step": 31895 + }, + { + "epoch": 2.718486320634109, + "grad_norm": 33.78422742623237, + "learning_rate": 2.6600388751788276e-07, + "loss": 1.4414, + "step": 31896 + }, + { + "epoch": 2.7185715503281345, + "grad_norm": 54.279907323815884, + "learning_rate": 2.6584433464590995e-07, + "loss": 1.1968, + "step": 31897 + }, + { + "epoch": 2.71865678002216, + "grad_norm": 43.05085643562801, + "learning_rate": 2.6568482833247644e-07, + "loss": 1.2022, + "step": 31898 + }, + { + "epoch": 2.718742009716185, + "grad_norm": 73.98872694543819, + "learning_rate": 2.655253685791526e-07, + "loss": 2.0647, + "step": 31899 + }, + { + "epoch": 2.7188272394102104, + "grad_norm": 19.46635143584652, + "learning_rate": 2.6536595538750486e-07, + "loss": 0.4228, + "step": 31900 + }, + { + "epoch": 2.718912469104236, + "grad_norm": 50.90998804924644, + "learning_rate": 2.65206588759101e-07, + "loss": 1.6269, + "step": 31901 + }, + { + "epoch": 2.7189976987982614, + "grad_norm": 57.858858259028004, + "learning_rate": 2.6504726869550966e-07, + "loss": 1.3818, + "step": 31902 + }, + { + "epoch": 2.719082928492287, + "grad_norm": 53.419561873494615, + "learning_rate": 2.648879951982969e-07, + "loss": 1.499, + "step": 31903 + }, + { + "epoch": 2.719168158186312, + "grad_norm": 77.75502693440303, + "learning_rate": 2.6472876826902915e-07, + "loss": 1.7908, + "step": 31904 + }, + { + "epoch": 2.7192533878803378, + "grad_norm": 29.13801611681021, + "learning_rate": 2.645695879092719e-07, + "loss": 0.7622, + "step": 31905 + }, + { + "epoch": 2.719338617574363, + "grad_norm": 70.92031020540607, + "learning_rate": 2.6441045412059105e-07, + "loss": 2.3004, + "step": 31906 + }, + { + "epoch": 2.7194238472683883, + "grad_norm": 51.93304928869744, + "learning_rate": 2.6425136690455213e-07, + "loss": 1.2558, + "step": 31907 + }, + { + "epoch": 2.7195090769624137, + "grad_norm": 64.5669504619214, + "learning_rate": 2.640923262627193e-07, + "loss": 1.3685, + "step": 31908 + }, + { + "epoch": 2.719594306656439, + "grad_norm": 57.44860038172311, + "learning_rate": 2.639333321966558e-07, + "loss": 1.1855, + "step": 31909 + }, + { + "epoch": 2.7196795363504647, + "grad_norm": 62.20961326105871, + "learning_rate": 2.637743847079266e-07, + "loss": 1.0193, + "step": 31910 + }, + { + "epoch": 2.7197647660444897, + "grad_norm": 44.29218795688538, + "learning_rate": 2.6361548379809477e-07, + "loss": 1.0466, + "step": 31911 + }, + { + "epoch": 2.719849995738515, + "grad_norm": 38.42370667414485, + "learning_rate": 2.634566294687224e-07, + "loss": 1.2276, + "step": 31912 + }, + { + "epoch": 2.7199352254325406, + "grad_norm": 45.76745287551531, + "learning_rate": 2.632978217213711e-07, + "loss": 1.256, + "step": 31913 + }, + { + "epoch": 2.720020455126566, + "grad_norm": 40.42635810032731, + "learning_rate": 2.631390605576034e-07, + "loss": 1.3611, + "step": 31914 + }, + { + "epoch": 2.7201056848205916, + "grad_norm": 57.41119688049438, + "learning_rate": 2.62980345978982e-07, + "loss": 1.5928, + "step": 31915 + }, + { + "epoch": 2.720190914514617, + "grad_norm": 23.371581792002477, + "learning_rate": 2.6282167798706625e-07, + "loss": 1.0321, + "step": 31916 + }, + { + "epoch": 2.7202761442086425, + "grad_norm": 45.30499405424217, + "learning_rate": 2.6266305658341704e-07, + "loss": 1.3186, + "step": 31917 + }, + { + "epoch": 2.7203613739026675, + "grad_norm": 33.703797382020724, + "learning_rate": 2.6250448176959373e-07, + "loss": 0.8347, + "step": 31918 + }, + { + "epoch": 2.720446603596693, + "grad_norm": 84.68405194577113, + "learning_rate": 2.6234595354715676e-07, + "loss": 2.857, + "step": 31919 + }, + { + "epoch": 2.7205318332907185, + "grad_norm": 38.70384306851863, + "learning_rate": 2.621874719176648e-07, + "loss": 1.4133, + "step": 31920 + }, + { + "epoch": 2.720617062984744, + "grad_norm": 38.364497077759516, + "learning_rate": 2.620290368826761e-07, + "loss": 1.3188, + "step": 31921 + }, + { + "epoch": 2.7207022926787694, + "grad_norm": 33.48532891344172, + "learning_rate": 2.618706484437483e-07, + "loss": 0.9989, + "step": 31922 + }, + { + "epoch": 2.7207875223727944, + "grad_norm": 53.43180946126337, + "learning_rate": 2.617123066024413e-07, + "loss": 0.9404, + "step": 31923 + }, + { + "epoch": 2.7208727520668203, + "grad_norm": 80.19451676145073, + "learning_rate": 2.61554011360311e-07, + "loss": 1.6234, + "step": 31924 + }, + { + "epoch": 2.7209579817608454, + "grad_norm": 82.77754341373567, + "learning_rate": 2.61395762718914e-07, + "loss": 1.944, + "step": 31925 + }, + { + "epoch": 2.721043211454871, + "grad_norm": 66.5752618593826, + "learning_rate": 2.6123756067980674e-07, + "loss": 1.3839, + "step": 31926 + }, + { + "epoch": 2.7211284411488963, + "grad_norm": 60.46385837405798, + "learning_rate": 2.610794052445442e-07, + "loss": 1.3535, + "step": 31927 + }, + { + "epoch": 2.7212136708429218, + "grad_norm": 36.52083989901463, + "learning_rate": 2.6092129641468343e-07, + "loss": 0.9082, + "step": 31928 + }, + { + "epoch": 2.7212989005369472, + "grad_norm": 30.160148012807625, + "learning_rate": 2.607632341917776e-07, + "loss": 0.6634, + "step": 31929 + }, + { + "epoch": 2.7213841302309723, + "grad_norm": 83.16068900840129, + "learning_rate": 2.606052185773833e-07, + "loss": 2.3929, + "step": 31930 + }, + { + "epoch": 2.7214693599249977, + "grad_norm": 48.99992301657398, + "learning_rate": 2.604472495730526e-07, + "loss": 1.4236, + "step": 31931 + }, + { + "epoch": 2.721554589619023, + "grad_norm": 57.61511103582239, + "learning_rate": 2.6028932718034084e-07, + "loss": 1.4947, + "step": 31932 + }, + { + "epoch": 2.7216398193130487, + "grad_norm": 48.222166145738946, + "learning_rate": 2.6013145140079964e-07, + "loss": 1.1617, + "step": 31933 + }, + { + "epoch": 2.721725049007074, + "grad_norm": 75.66859064103232, + "learning_rate": 2.5997362223598277e-07, + "loss": 1.6822, + "step": 31934 + }, + { + "epoch": 2.7218102787010996, + "grad_norm": 42.29723352225282, + "learning_rate": 2.598158396874406e-07, + "loss": 1.2971, + "step": 31935 + }, + { + "epoch": 2.721895508395125, + "grad_norm": 36.12556045106928, + "learning_rate": 2.5965810375672694e-07, + "loss": 0.9408, + "step": 31936 + }, + { + "epoch": 2.72198073808915, + "grad_norm": 62.56619304998683, + "learning_rate": 2.5950041444539166e-07, + "loss": 1.6728, + "step": 31937 + }, + { + "epoch": 2.7220659677831756, + "grad_norm": 76.38806035755975, + "learning_rate": 2.593427717549868e-07, + "loss": 1.9718, + "step": 31938 + }, + { + "epoch": 2.722151197477201, + "grad_norm": 57.09405011700289, + "learning_rate": 2.5918517568706114e-07, + "loss": 1.6137, + "step": 31939 + }, + { + "epoch": 2.7222364271712265, + "grad_norm": 30.25298305450084, + "learning_rate": 2.5902762624316625e-07, + "loss": 0.7044, + "step": 31940 + }, + { + "epoch": 2.722321656865252, + "grad_norm": 42.93616130734437, + "learning_rate": 2.588701234248508e-07, + "loss": 0.9414, + "step": 31941 + }, + { + "epoch": 2.722406886559277, + "grad_norm": 41.995315879931525, + "learning_rate": 2.5871266723366373e-07, + "loss": 1.3502, + "step": 31942 + }, + { + "epoch": 2.722492116253303, + "grad_norm": 97.07008162068087, + "learning_rate": 2.5855525767115366e-07, + "loss": 2.2325, + "step": 31943 + }, + { + "epoch": 2.722577345947328, + "grad_norm": 20.612621693740287, + "learning_rate": 2.5839789473886766e-07, + "loss": 0.5623, + "step": 31944 + }, + { + "epoch": 2.7226625756413534, + "grad_norm": 77.71700927389071, + "learning_rate": 2.582405784383546e-07, + "loss": 1.3875, + "step": 31945 + }, + { + "epoch": 2.722747805335379, + "grad_norm": 43.258327884467164, + "learning_rate": 2.580833087711615e-07, + "loss": 1.9774, + "step": 31946 + }, + { + "epoch": 2.7228330350294043, + "grad_norm": 36.57541944413457, + "learning_rate": 2.5792608573883494e-07, + "loss": 1.1209, + "step": 31947 + }, + { + "epoch": 2.72291826472343, + "grad_norm": 66.52322771297361, + "learning_rate": 2.577689093429203e-07, + "loss": 2.2489, + "step": 31948 + }, + { + "epoch": 2.723003494417455, + "grad_norm": 40.96415689590652, + "learning_rate": 2.5761177958496533e-07, + "loss": 0.8373, + "step": 31949 + }, + { + "epoch": 2.7230887241114803, + "grad_norm": 62.25834396574913, + "learning_rate": 2.574546964665131e-07, + "loss": 1.1892, + "step": 31950 + }, + { + "epoch": 2.7231739538055058, + "grad_norm": 55.868422860707305, + "learning_rate": 2.5729765998911027e-07, + "loss": 1.2975, + "step": 31951 + }, + { + "epoch": 2.723259183499531, + "grad_norm": 88.58163456565775, + "learning_rate": 2.571406701542994e-07, + "loss": 1.9016, + "step": 31952 + }, + { + "epoch": 2.7233444131935567, + "grad_norm": 49.620084723719884, + "learning_rate": 2.569837269636255e-07, + "loss": 1.6319, + "step": 31953 + }, + { + "epoch": 2.723429642887582, + "grad_norm": 70.3425774916854, + "learning_rate": 2.568268304186322e-07, + "loss": 1.2292, + "step": 31954 + }, + { + "epoch": 2.7235148725816076, + "grad_norm": 54.40569582365979, + "learning_rate": 2.566699805208628e-07, + "loss": 1.3697, + "step": 31955 + }, + { + "epoch": 2.7236001022756327, + "grad_norm": 52.944904294241255, + "learning_rate": 2.5651317727185877e-07, + "loss": 1.8429, + "step": 31956 + }, + { + "epoch": 2.723685331969658, + "grad_norm": 31.85185302745973, + "learning_rate": 2.563564206731628e-07, + "loss": 1.1448, + "step": 31957 + }, + { + "epoch": 2.7237705616636836, + "grad_norm": 30.864038332717207, + "learning_rate": 2.5619971072631644e-07, + "loss": 0.6216, + "step": 31958 + }, + { + "epoch": 2.723855791357709, + "grad_norm": 83.72790423677786, + "learning_rate": 2.5604304743286126e-07, + "loss": 2.4866, + "step": 31959 + }, + { + "epoch": 2.7239410210517345, + "grad_norm": 61.379948039221375, + "learning_rate": 2.558864307943371e-07, + "loss": 1.8309, + "step": 31960 + }, + { + "epoch": 2.7240262507457595, + "grad_norm": 32.091794733900585, + "learning_rate": 2.557298608122849e-07, + "loss": 0.6624, + "step": 31961 + }, + { + "epoch": 2.7241114804397855, + "grad_norm": 44.83680132410878, + "learning_rate": 2.5557333748824465e-07, + "loss": 1.1013, + "step": 31962 + }, + { + "epoch": 2.7241967101338105, + "grad_norm": 53.40054432447245, + "learning_rate": 2.554168608237556e-07, + "loss": 1.2092, + "step": 31963 + }, + { + "epoch": 2.724281939827836, + "grad_norm": 22.767130035406378, + "learning_rate": 2.55260430820356e-07, + "loss": 0.7642, + "step": 31964 + }, + { + "epoch": 2.7243671695218614, + "grad_norm": 53.71778038183396, + "learning_rate": 2.5510404747958404e-07, + "loss": 1.8189, + "step": 31965 + }, + { + "epoch": 2.724452399215887, + "grad_norm": 35.4939411946216, + "learning_rate": 2.5494771080297966e-07, + "loss": 1.2482, + "step": 31966 + }, + { + "epoch": 2.7245376289099124, + "grad_norm": 48.87840150795612, + "learning_rate": 2.5479142079207876e-07, + "loss": 1.3768, + "step": 31967 + }, + { + "epoch": 2.7246228586039374, + "grad_norm": 101.88866255441496, + "learning_rate": 2.546351774484179e-07, + "loss": 2.322, + "step": 31968 + }, + { + "epoch": 2.724708088297963, + "grad_norm": 56.93637029118722, + "learning_rate": 2.544789807735343e-07, + "loss": 1.9189, + "step": 31969 + }, + { + "epoch": 2.7247933179919883, + "grad_norm": 40.618870945009924, + "learning_rate": 2.543228307689649e-07, + "loss": 1.4522, + "step": 31970 + }, + { + "epoch": 2.724878547686014, + "grad_norm": 80.52786866421528, + "learning_rate": 2.5416672743624525e-07, + "loss": 1.9134, + "step": 31971 + }, + { + "epoch": 2.7249637773800393, + "grad_norm": 63.717875807881626, + "learning_rate": 2.5401067077690966e-07, + "loss": 2.1918, + "step": 31972 + }, + { + "epoch": 2.7250490070740647, + "grad_norm": 50.865322539414215, + "learning_rate": 2.53854660792493e-07, + "loss": 1.2975, + "step": 31973 + }, + { + "epoch": 2.72513423676809, + "grad_norm": 38.33158854471479, + "learning_rate": 2.5369869748452957e-07, + "loss": 1.1104, + "step": 31974 + }, + { + "epoch": 2.725219466462115, + "grad_norm": 55.148360723304855, + "learning_rate": 2.5354278085455376e-07, + "loss": 1.2891, + "step": 31975 + }, + { + "epoch": 2.7253046961561407, + "grad_norm": 29.75004940940876, + "learning_rate": 2.533869109040982e-07, + "loss": 0.9881, + "step": 31976 + }, + { + "epoch": 2.725389925850166, + "grad_norm": 40.391519901854664, + "learning_rate": 2.5323108763469663e-07, + "loss": 1.2451, + "step": 31977 + }, + { + "epoch": 2.7254751555441916, + "grad_norm": 39.3994526776418, + "learning_rate": 2.530753110478812e-07, + "loss": 1.2344, + "step": 31978 + }, + { + "epoch": 2.725560385238217, + "grad_norm": 36.08031366493545, + "learning_rate": 2.52919581145184e-07, + "loss": 1.2979, + "step": 31979 + }, + { + "epoch": 2.7256456149322426, + "grad_norm": 57.420722646207246, + "learning_rate": 2.5276389792813603e-07, + "loss": 1.2949, + "step": 31980 + }, + { + "epoch": 2.725730844626268, + "grad_norm": 47.64565021991106, + "learning_rate": 2.526082613982694e-07, + "loss": 1.2273, + "step": 31981 + }, + { + "epoch": 2.725816074320293, + "grad_norm": 17.7813384002317, + "learning_rate": 2.5245267155711296e-07, + "loss": 0.6489, + "step": 31982 + }, + { + "epoch": 2.7259013040143185, + "grad_norm": 72.65111052763939, + "learning_rate": 2.5229712840619927e-07, + "loss": 1.7186, + "step": 31983 + }, + { + "epoch": 2.725986533708344, + "grad_norm": 67.57206894612243, + "learning_rate": 2.5214163194705544e-07, + "loss": 2.2398, + "step": 31984 + }, + { + "epoch": 2.7260717634023695, + "grad_norm": 65.62172103754276, + "learning_rate": 2.5198618218121305e-07, + "loss": 1.558, + "step": 31985 + }, + { + "epoch": 2.726156993096395, + "grad_norm": 62.02107808233576, + "learning_rate": 2.518307791101998e-07, + "loss": 1.5173, + "step": 31986 + }, + { + "epoch": 2.72624222279042, + "grad_norm": 39.621949878280255, + "learning_rate": 2.5167542273554335e-07, + "loss": 1.4239, + "step": 31987 + }, + { + "epoch": 2.726327452484446, + "grad_norm": 71.5495892667207, + "learning_rate": 2.5152011305877356e-07, + "loss": 2.1623, + "step": 31988 + }, + { + "epoch": 2.726412682178471, + "grad_norm": 51.33710085939394, + "learning_rate": 2.5136485008141587e-07, + "loss": 1.7552, + "step": 31989 + }, + { + "epoch": 2.7264979118724963, + "grad_norm": 33.74355734040979, + "learning_rate": 2.5120963380499854e-07, + "loss": 0.9067, + "step": 31990 + }, + { + "epoch": 2.726583141566522, + "grad_norm": 75.37566549894457, + "learning_rate": 2.510544642310464e-07, + "loss": 2.0845, + "step": 31991 + }, + { + "epoch": 2.7266683712605473, + "grad_norm": 45.320527284239844, + "learning_rate": 2.5089934136108665e-07, + "loss": 1.3559, + "step": 31992 + }, + { + "epoch": 2.7267536009545728, + "grad_norm": 51.046174441384174, + "learning_rate": 2.507442651966452e-07, + "loss": 1.3062, + "step": 31993 + }, + { + "epoch": 2.7268388306485978, + "grad_norm": 50.16353255554237, + "learning_rate": 2.50589235739247e-07, + "loss": 1.3029, + "step": 31994 + }, + { + "epoch": 2.7269240603426232, + "grad_norm": 60.9204112400986, + "learning_rate": 2.504342529904158e-07, + "loss": 1.3408, + "step": 31995 + }, + { + "epoch": 2.7270092900366487, + "grad_norm": 77.34265879924216, + "learning_rate": 2.502793169516771e-07, + "loss": 1.4939, + "step": 31996 + }, + { + "epoch": 2.727094519730674, + "grad_norm": 83.70851718629775, + "learning_rate": 2.50124427624554e-07, + "loss": 1.9699, + "step": 31997 + }, + { + "epoch": 2.7271797494246997, + "grad_norm": 54.517040401780605, + "learning_rate": 2.499695850105693e-07, + "loss": 1.2423, + "step": 31998 + }, + { + "epoch": 2.727264979118725, + "grad_norm": 72.97808943276466, + "learning_rate": 2.4981478911124623e-07, + "loss": 1.4011, + "step": 31999 + }, + { + "epoch": 2.7273502088127506, + "grad_norm": 48.17274086836681, + "learning_rate": 2.496600399281068e-07, + "loss": 1.7143, + "step": 32000 + }, + { + "epoch": 2.7274354385067756, + "grad_norm": 37.74725857131365, + "learning_rate": 2.4950533746267437e-07, + "loss": 1.0944, + "step": 32001 + }, + { + "epoch": 2.727520668200801, + "grad_norm": 64.2154141037848, + "learning_rate": 2.493506817164687e-07, + "loss": 1.1679, + "step": 32002 + }, + { + "epoch": 2.7276058978948265, + "grad_norm": 107.00819137874318, + "learning_rate": 2.491960726910114e-07, + "loss": 2.382, + "step": 32003 + }, + { + "epoch": 2.727691127588852, + "grad_norm": 29.89859619799014, + "learning_rate": 2.4904151038782244e-07, + "loss": 1.2714, + "step": 32004 + }, + { + "epoch": 2.7277763572828775, + "grad_norm": 77.6202942016407, + "learning_rate": 2.488869948084233e-07, + "loss": 1.9527, + "step": 32005 + }, + { + "epoch": 2.7278615869769025, + "grad_norm": 54.835278633651725, + "learning_rate": 2.4873252595433215e-07, + "loss": 1.2743, + "step": 32006 + }, + { + "epoch": 2.7279468166709284, + "grad_norm": 26.569618884431126, + "learning_rate": 2.485781038270685e-07, + "loss": 0.7794, + "step": 32007 + }, + { + "epoch": 2.7280320463649534, + "grad_norm": 54.22716418984011, + "learning_rate": 2.48423728428151e-07, + "loss": 1.1119, + "step": 32008 + }, + { + "epoch": 2.728117276058979, + "grad_norm": 82.25992413677305, + "learning_rate": 2.4826939975909846e-07, + "loss": 2.3429, + "step": 32009 + }, + { + "epoch": 2.7282025057530044, + "grad_norm": 53.13679205426248, + "learning_rate": 2.481151178214286e-07, + "loss": 1.517, + "step": 32010 + }, + { + "epoch": 2.72828773544703, + "grad_norm": 52.39740176960583, + "learning_rate": 2.4796088261665796e-07, + "loss": 1.9203, + "step": 32011 + }, + { + "epoch": 2.7283729651410553, + "grad_norm": 68.34779991443854, + "learning_rate": 2.478066941463031e-07, + "loss": 1.4373, + "step": 32012 + }, + { + "epoch": 2.7284581948350803, + "grad_norm": 50.225376312209946, + "learning_rate": 2.4765255241188223e-07, + "loss": 1.6114, + "step": 32013 + }, + { + "epoch": 2.728543424529106, + "grad_norm": 57.6446663735206, + "learning_rate": 2.4749845741490975e-07, + "loss": 1.6111, + "step": 32014 + }, + { + "epoch": 2.7286286542231313, + "grad_norm": 43.34397234708724, + "learning_rate": 2.473444091569005e-07, + "loss": 1.5602, + "step": 32015 + }, + { + "epoch": 2.7287138839171567, + "grad_norm": 71.1148196819743, + "learning_rate": 2.471904076393716e-07, + "loss": 2.2217, + "step": 32016 + }, + { + "epoch": 2.728799113611182, + "grad_norm": 44.531157135001784, + "learning_rate": 2.470364528638358e-07, + "loss": 1.4221, + "step": 32017 + }, + { + "epoch": 2.7288843433052077, + "grad_norm": 66.3906793161612, + "learning_rate": 2.468825448318085e-07, + "loss": 1.9183, + "step": 32018 + }, + { + "epoch": 2.728969572999233, + "grad_norm": 42.221641937161834, + "learning_rate": 2.467286835448024e-07, + "loss": 1.2035, + "step": 32019 + }, + { + "epoch": 2.729054802693258, + "grad_norm": 31.717975443047575, + "learning_rate": 2.4657486900433125e-07, + "loss": 1.03, + "step": 32020 + }, + { + "epoch": 2.7291400323872836, + "grad_norm": 54.51982507562305, + "learning_rate": 2.4642110121190666e-07, + "loss": 1.5122, + "step": 32021 + }, + { + "epoch": 2.729225262081309, + "grad_norm": 26.503455884992203, + "learning_rate": 2.462673801690424e-07, + "loss": 0.7733, + "step": 32022 + }, + { + "epoch": 2.7293104917753346, + "grad_norm": 40.91322097691977, + "learning_rate": 2.461137058772484e-07, + "loss": 1.7394, + "step": 32023 + }, + { + "epoch": 2.72939572146936, + "grad_norm": 21.640988829048673, + "learning_rate": 2.459600783380384e-07, + "loss": 0.7241, + "step": 32024 + }, + { + "epoch": 2.729480951163385, + "grad_norm": 64.77662927641657, + "learning_rate": 2.458064975529206e-07, + "loss": 1.1665, + "step": 32025 + }, + { + "epoch": 2.729566180857411, + "grad_norm": 31.456981912480128, + "learning_rate": 2.4565296352340784e-07, + "loss": 0.6149, + "step": 32026 + }, + { + "epoch": 2.729651410551436, + "grad_norm": 73.21357314903145, + "learning_rate": 2.4549947625100876e-07, + "loss": 2.0685, + "step": 32027 + }, + { + "epoch": 2.7297366402454615, + "grad_norm": 34.77950219800528, + "learning_rate": 2.4534603573723336e-07, + "loss": 0.9553, + "step": 32028 + }, + { + "epoch": 2.729821869939487, + "grad_norm": 65.15822939225487, + "learning_rate": 2.4519264198359037e-07, + "loss": 1.2007, + "step": 32029 + }, + { + "epoch": 2.7299070996335124, + "grad_norm": 57.33498553402907, + "learning_rate": 2.450392949915875e-07, + "loss": 1.4271, + "step": 32030 + }, + { + "epoch": 2.729992329327538, + "grad_norm": 56.33390136563408, + "learning_rate": 2.448859947627341e-07, + "loss": 1.3647, + "step": 32031 + }, + { + "epoch": 2.730077559021563, + "grad_norm": 75.90198331416275, + "learning_rate": 2.4473274129853786e-07, + "loss": 1.8929, + "step": 32032 + }, + { + "epoch": 2.7301627887155884, + "grad_norm": 50.68699650871808, + "learning_rate": 2.445795346005053e-07, + "loss": 1.4328, + "step": 32033 + }, + { + "epoch": 2.730248018409614, + "grad_norm": 67.50193779429426, + "learning_rate": 2.444263746701431e-07, + "loss": 1.9603, + "step": 32034 + }, + { + "epoch": 2.7303332481036393, + "grad_norm": 24.362627094435677, + "learning_rate": 2.4427326150895825e-07, + "loss": 0.8088, + "step": 32035 + }, + { + "epoch": 2.730418477797665, + "grad_norm": 136.60818281450076, + "learning_rate": 2.441201951184563e-07, + "loss": 3.1201, + "step": 32036 + }, + { + "epoch": 2.7305037074916902, + "grad_norm": 42.478534270766446, + "learning_rate": 2.4396717550014273e-07, + "loss": 0.948, + "step": 32037 + }, + { + "epoch": 2.7305889371857157, + "grad_norm": 57.491233614503614, + "learning_rate": 2.438142026555207e-07, + "loss": 1.3308, + "step": 32038 + }, + { + "epoch": 2.7306741668797407, + "grad_norm": 90.3860122307523, + "learning_rate": 2.4366127658609685e-07, + "loss": 1.3562, + "step": 32039 + }, + { + "epoch": 2.730759396573766, + "grad_norm": 50.538961449932, + "learning_rate": 2.435083972933744e-07, + "loss": 1.03, + "step": 32040 + }, + { + "epoch": 2.7308446262677917, + "grad_norm": 45.514370564676206, + "learning_rate": 2.4335556477885715e-07, + "loss": 1.041, + "step": 32041 + }, + { + "epoch": 2.730929855961817, + "grad_norm": 30.399003293682355, + "learning_rate": 2.432027790440472e-07, + "loss": 0.8265, + "step": 32042 + }, + { + "epoch": 2.7310150856558426, + "grad_norm": 53.55866926693143, + "learning_rate": 2.4305004009044844e-07, + "loss": 1.6293, + "step": 32043 + }, + { + "epoch": 2.7311003153498676, + "grad_norm": 31.66483586067248, + "learning_rate": 2.428973479195618e-07, + "loss": 0.9569, + "step": 32044 + }, + { + "epoch": 2.7311855450438935, + "grad_norm": 44.97208320667362, + "learning_rate": 2.4274470253288995e-07, + "loss": 1.3088, + "step": 32045 + }, + { + "epoch": 2.7312707747379186, + "grad_norm": 58.16089360568536, + "learning_rate": 2.4259210393193287e-07, + "loss": 1.6907, + "step": 32046 + }, + { + "epoch": 2.731356004431944, + "grad_norm": 58.40619625576149, + "learning_rate": 2.424395521181916e-07, + "loss": 2.0983, + "step": 32047 + }, + { + "epoch": 2.7314412341259695, + "grad_norm": 56.41814319171557, + "learning_rate": 2.422870470931682e-07, + "loss": 1.4659, + "step": 32048 + }, + { + "epoch": 2.731526463819995, + "grad_norm": 27.42505340483739, + "learning_rate": 2.4213458885836093e-07, + "loss": 1.0378, + "step": 32049 + }, + { + "epoch": 2.7316116935140204, + "grad_norm": 82.1044508491005, + "learning_rate": 2.419821774152692e-07, + "loss": 1.6591, + "step": 32050 + }, + { + "epoch": 2.7316969232080455, + "grad_norm": 118.18364188822073, + "learning_rate": 2.418298127653917e-07, + "loss": 2.6398, + "step": 32051 + }, + { + "epoch": 2.731782152902071, + "grad_norm": 41.31038414299602, + "learning_rate": 2.4167749491022795e-07, + "loss": 1.115, + "step": 32052 + }, + { + "epoch": 2.7318673825960964, + "grad_norm": 28.82584290233402, + "learning_rate": 2.41525223851275e-07, + "loss": 1.2811, + "step": 32053 + }, + { + "epoch": 2.731952612290122, + "grad_norm": 77.1456641999002, + "learning_rate": 2.413729995900299e-07, + "loss": 1.9855, + "step": 32054 + }, + { + "epoch": 2.7320378419841473, + "grad_norm": 30.691514390660082, + "learning_rate": 2.41220822127991e-07, + "loss": 0.8341, + "step": 32055 + }, + { + "epoch": 2.732123071678173, + "grad_norm": 82.83252012596586, + "learning_rate": 2.4106869146665437e-07, + "loss": 2.0761, + "step": 32056 + }, + { + "epoch": 2.7322083013721983, + "grad_norm": 69.91100440427952, + "learning_rate": 2.409166076075164e-07, + "loss": 1.8124, + "step": 32057 + }, + { + "epoch": 2.7322935310662233, + "grad_norm": 82.26858615185363, + "learning_rate": 2.407645705520728e-07, + "loss": 1.8815, + "step": 32058 + }, + { + "epoch": 2.7323787607602488, + "grad_norm": 63.46265088216446, + "learning_rate": 2.4061258030181835e-07, + "loss": 2.1184, + "step": 32059 + }, + { + "epoch": 2.7324639904542742, + "grad_norm": 29.542269982511208, + "learning_rate": 2.404606368582474e-07, + "loss": 0.9308, + "step": 32060 + }, + { + "epoch": 2.7325492201482997, + "grad_norm": 58.7873934669168, + "learning_rate": 2.403087402228549e-07, + "loss": 1.3933, + "step": 32061 + }, + { + "epoch": 2.732634449842325, + "grad_norm": 55.18292438802712, + "learning_rate": 2.401568903971346e-07, + "loss": 1.1021, + "step": 32062 + }, + { + "epoch": 2.73271967953635, + "grad_norm": 73.14230301293175, + "learning_rate": 2.4000508738258044e-07, + "loss": 1.4069, + "step": 32063 + }, + { + "epoch": 2.732804909230376, + "grad_norm": 68.69415066148215, + "learning_rate": 2.3985333118068386e-07, + "loss": 1.1656, + "step": 32064 + }, + { + "epoch": 2.732890138924401, + "grad_norm": 94.76713807920807, + "learning_rate": 2.397016217929393e-07, + "loss": 1.7312, + "step": 32065 + }, + { + "epoch": 2.7329753686184266, + "grad_norm": 26.857778400682868, + "learning_rate": 2.395499592208378e-07, + "loss": 0.8726, + "step": 32066 + }, + { + "epoch": 2.733060598312452, + "grad_norm": 46.23619442632048, + "learning_rate": 2.393983434658703e-07, + "loss": 1.0761, + "step": 32067 + }, + { + "epoch": 2.7331458280064775, + "grad_norm": 50.36102364688959, + "learning_rate": 2.392467745295285e-07, + "loss": 1.6345, + "step": 32068 + }, + { + "epoch": 2.733231057700503, + "grad_norm": 73.73834833955178, + "learning_rate": 2.390952524133028e-07, + "loss": 2.1797, + "step": 32069 + }, + { + "epoch": 2.733316287394528, + "grad_norm": 59.72316189543769, + "learning_rate": 2.3894377711868365e-07, + "loss": 1.5693, + "step": 32070 + }, + { + "epoch": 2.7334015170885535, + "grad_norm": 50.9449550114828, + "learning_rate": 2.387923486471605e-07, + "loss": 1.3288, + "step": 32071 + }, + { + "epoch": 2.733486746782579, + "grad_norm": 24.341308941911013, + "learning_rate": 2.3864096700022323e-07, + "loss": 0.8055, + "step": 32072 + }, + { + "epoch": 2.7335719764766044, + "grad_norm": 35.70813048585192, + "learning_rate": 2.3848963217935896e-07, + "loss": 1.0608, + "step": 32073 + }, + { + "epoch": 2.73365720617063, + "grad_norm": 60.26051772105008, + "learning_rate": 2.3833834418605818e-07, + "loss": 1.0992, + "step": 32074 + }, + { + "epoch": 2.7337424358646554, + "grad_norm": 57.383376387738636, + "learning_rate": 2.3818710302180746e-07, + "loss": 1.3805, + "step": 32075 + }, + { + "epoch": 2.733827665558681, + "grad_norm": 36.45262808459182, + "learning_rate": 2.3803590868809456e-07, + "loss": 1.3376, + "step": 32076 + }, + { + "epoch": 2.733912895252706, + "grad_norm": 32.717340955514125, + "learning_rate": 2.3788476118640548e-07, + "loss": 1.0107, + "step": 32077 + }, + { + "epoch": 2.7339981249467313, + "grad_norm": 21.264747391689927, + "learning_rate": 2.3773366051822732e-07, + "loss": 0.4468, + "step": 32078 + }, + { + "epoch": 2.734083354640757, + "grad_norm": 25.464768295254533, + "learning_rate": 2.3758260668504728e-07, + "loss": 0.7649, + "step": 32079 + }, + { + "epoch": 2.7341685843347823, + "grad_norm": 56.777710879337086, + "learning_rate": 2.3743159968834972e-07, + "loss": 1.4283, + "step": 32080 + }, + { + "epoch": 2.7342538140288077, + "grad_norm": 27.89326731608914, + "learning_rate": 2.3728063952961956e-07, + "loss": 1.1377, + "step": 32081 + }, + { + "epoch": 2.7343390437228328, + "grad_norm": 69.71361634582819, + "learning_rate": 2.3712972621034225e-07, + "loss": 1.1551, + "step": 32082 + }, + { + "epoch": 2.7344242734168587, + "grad_norm": 66.70294480087297, + "learning_rate": 2.3697885973200107e-07, + "loss": 1.4533, + "step": 32083 + }, + { + "epoch": 2.7345095031108837, + "grad_norm": 40.37786476424537, + "learning_rate": 2.3682804009608095e-07, + "loss": 1.2345, + "step": 32084 + }, + { + "epoch": 2.734594732804909, + "grad_norm": 61.46498082384935, + "learning_rate": 2.3667726730406292e-07, + "loss": 1.4711, + "step": 32085 + }, + { + "epoch": 2.7346799624989346, + "grad_norm": 59.89901245337552, + "learning_rate": 2.365265413574319e-07, + "loss": 1.1626, + "step": 32086 + }, + { + "epoch": 2.73476519219296, + "grad_norm": 33.5933208004137, + "learning_rate": 2.3637586225766951e-07, + "loss": 0.8428, + "step": 32087 + }, + { + "epoch": 2.7348504218869856, + "grad_norm": 59.75320525568884, + "learning_rate": 2.362252300062584e-07, + "loss": 1.616, + "step": 32088 + }, + { + "epoch": 2.7349356515810106, + "grad_norm": 29.870438837687566, + "learning_rate": 2.3607464460467854e-07, + "loss": 0.8143, + "step": 32089 + }, + { + "epoch": 2.735020881275036, + "grad_norm": 22.599430836855127, + "learning_rate": 2.3592410605441152e-07, + "loss": 1.1166, + "step": 32090 + }, + { + "epoch": 2.7351061109690615, + "grad_norm": 19.842557402511268, + "learning_rate": 2.3577361435693778e-07, + "loss": 0.7953, + "step": 32091 + }, + { + "epoch": 2.735191340663087, + "grad_norm": 41.023069098597055, + "learning_rate": 2.3562316951373787e-07, + "loss": 1.4523, + "step": 32092 + }, + { + "epoch": 2.7352765703571125, + "grad_norm": 19.3662690981179, + "learning_rate": 2.3547277152629055e-07, + "loss": 1.1235, + "step": 32093 + }, + { + "epoch": 2.735361800051138, + "grad_norm": 42.99052433826005, + "learning_rate": 2.3532242039607468e-07, + "loss": 1.1111, + "step": 32094 + }, + { + "epoch": 2.7354470297451634, + "grad_norm": 64.46382945500065, + "learning_rate": 2.3517211612457068e-07, + "loss": 2.2776, + "step": 32095 + }, + { + "epoch": 2.7355322594391884, + "grad_norm": 34.26123551540289, + "learning_rate": 2.3502185871325522e-07, + "loss": 1.2295, + "step": 32096 + }, + { + "epoch": 2.735617489133214, + "grad_norm": 47.34986526069649, + "learning_rate": 2.348716481636065e-07, + "loss": 1.3457, + "step": 32097 + }, + { + "epoch": 2.7357027188272394, + "grad_norm": 43.14640974457521, + "learning_rate": 2.3472148447710063e-07, + "loss": 1.314, + "step": 32098 + }, + { + "epoch": 2.735787948521265, + "grad_norm": 31.299021369565935, + "learning_rate": 2.3457136765521638e-07, + "loss": 0.8392, + "step": 32099 + }, + { + "epoch": 2.7358731782152903, + "grad_norm": 69.99107813190457, + "learning_rate": 2.3442129769942922e-07, + "loss": 2.2181, + "step": 32100 + }, + { + "epoch": 2.7359584079093158, + "grad_norm": 57.25974075761775, + "learning_rate": 2.3427127461121414e-07, + "loss": 1.7662, + "step": 32101 + }, + { + "epoch": 2.7360436376033412, + "grad_norm": 21.21206390941023, + "learning_rate": 2.3412129839204823e-07, + "loss": 0.9549, + "step": 32102 + }, + { + "epoch": 2.7361288672973663, + "grad_norm": 56.379548343262734, + "learning_rate": 2.339713690434048e-07, + "loss": 1.5868, + "step": 32103 + }, + { + "epoch": 2.7362140969913917, + "grad_norm": 84.10096287654397, + "learning_rate": 2.3382148656675986e-07, + "loss": 2.5727, + "step": 32104 + }, + { + "epoch": 2.736299326685417, + "grad_norm": 86.57469234183134, + "learning_rate": 2.3367165096358614e-07, + "loss": 1.6814, + "step": 32105 + }, + { + "epoch": 2.7363845563794427, + "grad_norm": 27.299671614419626, + "learning_rate": 2.3352186223535855e-07, + "loss": 0.9358, + "step": 32106 + }, + { + "epoch": 2.736469786073468, + "grad_norm": 50.86487950615378, + "learning_rate": 2.3337212038354818e-07, + "loss": 1.5251, + "step": 32107 + }, + { + "epoch": 2.736555015767493, + "grad_norm": 65.24785044662849, + "learning_rate": 2.3322242540962993e-07, + "loss": 1.8154, + "step": 32108 + }, + { + "epoch": 2.7366402454615186, + "grad_norm": 70.14577161497762, + "learning_rate": 2.3307277731507428e-07, + "loss": 1.9446, + "step": 32109 + }, + { + "epoch": 2.736725475155544, + "grad_norm": 51.12959444128104, + "learning_rate": 2.329231761013545e-07, + "loss": 1.1875, + "step": 32110 + }, + { + "epoch": 2.7368107048495696, + "grad_norm": 32.3284537262367, + "learning_rate": 2.3277362176994e-07, + "loss": 0.8726, + "step": 32111 + }, + { + "epoch": 2.736895934543595, + "grad_norm": 39.43910834595878, + "learning_rate": 2.3262411432230347e-07, + "loss": 0.7439, + "step": 32112 + }, + { + "epoch": 2.7369811642376205, + "grad_norm": 70.26675261506969, + "learning_rate": 2.3247465375991485e-07, + "loss": 1.8864, + "step": 32113 + }, + { + "epoch": 2.737066393931646, + "grad_norm": 23.385619055748357, + "learning_rate": 2.3232524008424294e-07, + "loss": 0.8499, + "step": 32114 + }, + { + "epoch": 2.737151623625671, + "grad_norm": 92.5190610434723, + "learning_rate": 2.3217587329675716e-07, + "loss": 2.371, + "step": 32115 + }, + { + "epoch": 2.7372368533196965, + "grad_norm": 83.820741777412, + "learning_rate": 2.32026553398928e-07, + "loss": 2.1438, + "step": 32116 + }, + { + "epoch": 2.737322083013722, + "grad_norm": 59.07721709249669, + "learning_rate": 2.3187728039222257e-07, + "loss": 1.553, + "step": 32117 + }, + { + "epoch": 2.7374073127077474, + "grad_norm": 47.75832172248951, + "learning_rate": 2.3172805427810975e-07, + "loss": 1.7173, + "step": 32118 + }, + { + "epoch": 2.737492542401773, + "grad_norm": 54.80892999933311, + "learning_rate": 2.315788750580572e-07, + "loss": 1.5255, + "step": 32119 + }, + { + "epoch": 2.7375777720957983, + "grad_norm": 78.9230225360453, + "learning_rate": 2.3142974273353047e-07, + "loss": 1.5787, + "step": 32120 + }, + { + "epoch": 2.737663001789824, + "grad_norm": 59.96707945706303, + "learning_rate": 2.3128065730599835e-07, + "loss": 1.4625, + "step": 32121 + }, + { + "epoch": 2.737748231483849, + "grad_norm": 54.57667075928018, + "learning_rate": 2.311316187769258e-07, + "loss": 1.4139, + "step": 32122 + }, + { + "epoch": 2.7378334611778743, + "grad_norm": 29.192105075433016, + "learning_rate": 2.3098262714777942e-07, + "loss": 0.6573, + "step": 32123 + }, + { + "epoch": 2.7379186908718998, + "grad_norm": 56.24174812722704, + "learning_rate": 2.3083368242002246e-07, + "loss": 1.3772, + "step": 32124 + }, + { + "epoch": 2.7380039205659252, + "grad_norm": 21.552140431092788, + "learning_rate": 2.3068478459512155e-07, + "loss": 0.8768, + "step": 32125 + }, + { + "epoch": 2.7380891502599507, + "grad_norm": 24.613840689293927, + "learning_rate": 2.3053593367454108e-07, + "loss": 0.6129, + "step": 32126 + }, + { + "epoch": 2.7381743799539757, + "grad_norm": 56.31078627325932, + "learning_rate": 2.3038712965974486e-07, + "loss": 1.5488, + "step": 32127 + }, + { + "epoch": 2.7382596096480016, + "grad_norm": 68.34704097160844, + "learning_rate": 2.3023837255219505e-07, + "loss": 1.7898, + "step": 32128 + }, + { + "epoch": 2.7383448393420267, + "grad_norm": 55.77740735523459, + "learning_rate": 2.300896623533566e-07, + "loss": 1.3832, + "step": 32129 + }, + { + "epoch": 2.738430069036052, + "grad_norm": 37.580663086467595, + "learning_rate": 2.2994099906469058e-07, + "loss": 0.8674, + "step": 32130 + }, + { + "epoch": 2.7385152987300776, + "grad_norm": 133.52534279059623, + "learning_rate": 2.2979238268765913e-07, + "loss": 2.1147, + "step": 32131 + }, + { + "epoch": 2.738600528424103, + "grad_norm": 40.70869270849691, + "learning_rate": 2.2964381322372387e-07, + "loss": 1.3377, + "step": 32132 + }, + { + "epoch": 2.7386857581181285, + "grad_norm": 57.966167742528334, + "learning_rate": 2.2949529067434585e-07, + "loss": 1.3589, + "step": 32133 + }, + { + "epoch": 2.7387709878121536, + "grad_norm": 38.92667546101277, + "learning_rate": 2.2934681504098722e-07, + "loss": 1.106, + "step": 32134 + }, + { + "epoch": 2.738856217506179, + "grad_norm": 91.0752568350563, + "learning_rate": 2.2919838632510626e-07, + "loss": 2.1686, + "step": 32135 + }, + { + "epoch": 2.7389414472002045, + "grad_norm": 37.68630560238795, + "learning_rate": 2.2905000452816405e-07, + "loss": 0.8769, + "step": 32136 + }, + { + "epoch": 2.73902667689423, + "grad_norm": 42.1226417695898, + "learning_rate": 2.289016696516183e-07, + "loss": 0.6789, + "step": 32137 + }, + { + "epoch": 2.7391119065882554, + "grad_norm": 58.800321978654246, + "learning_rate": 2.2875338169692951e-07, + "loss": 1.5137, + "step": 32138 + }, + { + "epoch": 2.739197136282281, + "grad_norm": 31.219878647249665, + "learning_rate": 2.286051406655554e-07, + "loss": 1.1042, + "step": 32139 + }, + { + "epoch": 2.7392823659763064, + "grad_norm": 36.66084455228446, + "learning_rate": 2.284569465589531e-07, + "loss": 0.7293, + "step": 32140 + }, + { + "epoch": 2.7393675956703314, + "grad_norm": 32.51036451787139, + "learning_rate": 2.2830879937858042e-07, + "loss": 0.9433, + "step": 32141 + }, + { + "epoch": 2.739452825364357, + "grad_norm": 52.776187591619454, + "learning_rate": 2.2816069912589556e-07, + "loss": 1.5453, + "step": 32142 + }, + { + "epoch": 2.7395380550583823, + "grad_norm": 64.59761922335345, + "learning_rate": 2.2801264580235405e-07, + "loss": 1.4081, + "step": 32143 + }, + { + "epoch": 2.739623284752408, + "grad_norm": 84.89742790127046, + "learning_rate": 2.2786463940941194e-07, + "loss": 2.2968, + "step": 32144 + }, + { + "epoch": 2.7397085144464333, + "grad_norm": 73.98258025680325, + "learning_rate": 2.2771667994852475e-07, + "loss": 1.3751, + "step": 32145 + }, + { + "epoch": 2.7397937441404583, + "grad_norm": 93.2833782069388, + "learning_rate": 2.2756876742114687e-07, + "loss": 1.5253, + "step": 32146 + }, + { + "epoch": 2.739878973834484, + "grad_norm": 51.34772261513395, + "learning_rate": 2.2742090182873489e-07, + "loss": 1.4023, + "step": 32147 + }, + { + "epoch": 2.7399642035285092, + "grad_norm": 63.025383734904125, + "learning_rate": 2.27273083172741e-07, + "loss": 1.4039, + "step": 32148 + }, + { + "epoch": 2.7400494332225347, + "grad_norm": 16.124249548199284, + "learning_rate": 2.2712531145462014e-07, + "loss": 0.5773, + "step": 32149 + }, + { + "epoch": 2.74013466291656, + "grad_norm": 20.873340167714847, + "learning_rate": 2.2697758667582503e-07, + "loss": 0.772, + "step": 32150 + }, + { + "epoch": 2.7402198926105856, + "grad_norm": 51.44212701689132, + "learning_rate": 2.2682990883780954e-07, + "loss": 1.3082, + "step": 32151 + }, + { + "epoch": 2.740305122304611, + "grad_norm": 94.97592755303702, + "learning_rate": 2.2668227794202525e-07, + "loss": 2.9946, + "step": 32152 + }, + { + "epoch": 2.740390351998636, + "grad_norm": 57.91065864384258, + "learning_rate": 2.265346939899238e-07, + "loss": 1.5904, + "step": 32153 + }, + { + "epoch": 2.7404755816926616, + "grad_norm": 33.38824272155114, + "learning_rate": 2.263871569829562e-07, + "loss": 0.8461, + "step": 32154 + }, + { + "epoch": 2.740560811386687, + "grad_norm": 40.024004697181475, + "learning_rate": 2.262396669225747e-07, + "loss": 1.6583, + "step": 32155 + }, + { + "epoch": 2.7406460410807125, + "grad_norm": 60.777895464584894, + "learning_rate": 2.260922238102281e-07, + "loss": 1.7752, + "step": 32156 + }, + { + "epoch": 2.740731270774738, + "grad_norm": 127.811393095276, + "learning_rate": 2.2594482764736913e-07, + "loss": 1.3896, + "step": 32157 + }, + { + "epoch": 2.7408165004687635, + "grad_norm": 44.99942124777164, + "learning_rate": 2.2579747843544443e-07, + "loss": 0.9448, + "step": 32158 + }, + { + "epoch": 2.740901730162789, + "grad_norm": 58.41581644522297, + "learning_rate": 2.2565017617590502e-07, + "loss": 1.9518, + "step": 32159 + }, + { + "epoch": 2.740986959856814, + "grad_norm": 19.407257709190795, + "learning_rate": 2.255029208701992e-07, + "loss": 0.9356, + "step": 32160 + }, + { + "epoch": 2.7410721895508394, + "grad_norm": 37.201560099430786, + "learning_rate": 2.253557125197753e-07, + "loss": 1.0268, + "step": 32161 + }, + { + "epoch": 2.741157419244865, + "grad_norm": 40.1762282632087, + "learning_rate": 2.2520855112608043e-07, + "loss": 1.1712, + "step": 32162 + }, + { + "epoch": 2.7412426489388904, + "grad_norm": 84.37405269090249, + "learning_rate": 2.2506143669056124e-07, + "loss": 1.5924, + "step": 32163 + }, + { + "epoch": 2.741327878632916, + "grad_norm": 73.51997409127455, + "learning_rate": 2.2491436921466548e-07, + "loss": 1.755, + "step": 32164 + }, + { + "epoch": 2.741413108326941, + "grad_norm": 23.37540365376467, + "learning_rate": 2.2476734869984028e-07, + "loss": 0.7066, + "step": 32165 + }, + { + "epoch": 2.7414983380209668, + "grad_norm": 41.147781615626755, + "learning_rate": 2.2462037514753122e-07, + "loss": 0.8593, + "step": 32166 + }, + { + "epoch": 2.741583567714992, + "grad_norm": 32.93897589893823, + "learning_rate": 2.2447344855918207e-07, + "loss": 1.0673, + "step": 32167 + }, + { + "epoch": 2.7416687974090173, + "grad_norm": 51.37902336650072, + "learning_rate": 2.243265689362395e-07, + "loss": 1.5519, + "step": 32168 + }, + { + "epoch": 2.7417540271030427, + "grad_norm": 44.17954518485304, + "learning_rate": 2.2417973628014734e-07, + "loss": 1.0519, + "step": 32169 + }, + { + "epoch": 2.741839256797068, + "grad_norm": 68.17911873731614, + "learning_rate": 2.2403295059234997e-07, + "loss": 2.0547, + "step": 32170 + }, + { + "epoch": 2.7419244864910937, + "grad_norm": 75.90432114278616, + "learning_rate": 2.2388621187429017e-07, + "loss": 2.1072, + "step": 32171 + }, + { + "epoch": 2.7420097161851187, + "grad_norm": 63.9467602218919, + "learning_rate": 2.2373952012741172e-07, + "loss": 1.7591, + "step": 32172 + }, + { + "epoch": 2.742094945879144, + "grad_norm": 29.31756728490202, + "learning_rate": 2.2359287535315742e-07, + "loss": 1.0742, + "step": 32173 + }, + { + "epoch": 2.7421801755731696, + "grad_norm": 40.86358727180412, + "learning_rate": 2.2344627755297e-07, + "loss": 0.7325, + "step": 32174 + }, + { + "epoch": 2.742265405267195, + "grad_norm": 50.579681889877776, + "learning_rate": 2.2329972672828992e-07, + "loss": 1.2852, + "step": 32175 + }, + { + "epoch": 2.7423506349612206, + "grad_norm": 49.41560089167958, + "learning_rate": 2.231532228805583e-07, + "loss": 1.2629, + "step": 32176 + }, + { + "epoch": 2.742435864655246, + "grad_norm": 29.67589534386352, + "learning_rate": 2.2300676601121673e-07, + "loss": 1.1646, + "step": 32177 + }, + { + "epoch": 2.7425210943492715, + "grad_norm": 59.46161059000935, + "learning_rate": 2.2286035612170632e-07, + "loss": 1.2586, + "step": 32178 + }, + { + "epoch": 2.7426063240432965, + "grad_norm": 69.89511329801847, + "learning_rate": 2.2271399321346476e-07, + "loss": 1.3576, + "step": 32179 + }, + { + "epoch": 2.742691553737322, + "grad_norm": 39.00283127755545, + "learning_rate": 2.2256767728793316e-07, + "loss": 1.2271, + "step": 32180 + }, + { + "epoch": 2.7427767834313475, + "grad_norm": 58.07176253366549, + "learning_rate": 2.224214083465509e-07, + "loss": 1.5315, + "step": 32181 + }, + { + "epoch": 2.742862013125373, + "grad_norm": 28.014819513342204, + "learning_rate": 2.2227518639075518e-07, + "loss": 0.6673, + "step": 32182 + }, + { + "epoch": 2.7429472428193984, + "grad_norm": 45.93207115101723, + "learning_rate": 2.2212901142198483e-07, + "loss": 1.6635, + "step": 32183 + }, + { + "epoch": 2.7430324725134234, + "grad_norm": 38.4532833022373, + "learning_rate": 2.219828834416765e-07, + "loss": 1.5148, + "step": 32184 + }, + { + "epoch": 2.7431177022074493, + "grad_norm": 59.779033492943654, + "learning_rate": 2.2183680245126848e-07, + "loss": 1.8366, + "step": 32185 + }, + { + "epoch": 2.7432029319014744, + "grad_norm": 23.510047632402326, + "learning_rate": 2.2169076845219683e-07, + "loss": 0.981, + "step": 32186 + }, + { + "epoch": 2.7432881615955, + "grad_norm": 45.46844130360374, + "learning_rate": 2.2154478144589709e-07, + "loss": 0.9786, + "step": 32187 + }, + { + "epoch": 2.7433733912895253, + "grad_norm": 49.12601421197126, + "learning_rate": 2.213988414338053e-07, + "loss": 1.5067, + "step": 32188 + }, + { + "epoch": 2.7434586209835508, + "grad_norm": 40.86866554613958, + "learning_rate": 2.2125294841735812e-07, + "loss": 0.8374, + "step": 32189 + }, + { + "epoch": 2.7435438506775762, + "grad_norm": 16.65909160494494, + "learning_rate": 2.2110710239798938e-07, + "loss": 0.468, + "step": 32190 + }, + { + "epoch": 2.7436290803716012, + "grad_norm": 46.34477633173873, + "learning_rate": 2.2096130337713295e-07, + "loss": 1.6563, + "step": 32191 + }, + { + "epoch": 2.7437143100656267, + "grad_norm": 69.12143685247784, + "learning_rate": 2.2081555135622378e-07, + "loss": 1.4743, + "step": 32192 + }, + { + "epoch": 2.743799539759652, + "grad_norm": 41.77903758038381, + "learning_rate": 2.2066984633669353e-07, + "loss": 0.9793, + "step": 32193 + }, + { + "epoch": 2.7438847694536777, + "grad_norm": 20.34563028227446, + "learning_rate": 2.2052418831997659e-07, + "loss": 0.7416, + "step": 32194 + }, + { + "epoch": 2.743969999147703, + "grad_norm": 43.13589715767083, + "learning_rate": 2.2037857730750456e-07, + "loss": 1.0961, + "step": 32195 + }, + { + "epoch": 2.7440552288417286, + "grad_norm": 29.854753660487606, + "learning_rate": 2.2023301330071078e-07, + "loss": 0.8556, + "step": 32196 + }, + { + "epoch": 2.744140458535754, + "grad_norm": 63.023446966164265, + "learning_rate": 2.2008749630102577e-07, + "loss": 1.294, + "step": 32197 + }, + { + "epoch": 2.744225688229779, + "grad_norm": 82.45827235713891, + "learning_rate": 2.1994202630988114e-07, + "loss": 1.4705, + "step": 32198 + }, + { + "epoch": 2.7443109179238045, + "grad_norm": 57.34224818661976, + "learning_rate": 2.1979660332870745e-07, + "loss": 1.4055, + "step": 32199 + }, + { + "epoch": 2.74439614761783, + "grad_norm": 51.4532212972953, + "learning_rate": 2.1965122735893406e-07, + "loss": 0.9666, + "step": 32200 + }, + { + "epoch": 2.7444813773118555, + "grad_norm": 29.04090286775567, + "learning_rate": 2.1950589840199155e-07, + "loss": 0.8551, + "step": 32201 + }, + { + "epoch": 2.744566607005881, + "grad_norm": 23.31197036639203, + "learning_rate": 2.1936061645930874e-07, + "loss": 0.6836, + "step": 32202 + }, + { + "epoch": 2.744651836699906, + "grad_norm": 45.980860375758965, + "learning_rate": 2.1921538153231447e-07, + "loss": 1.4037, + "step": 32203 + }, + { + "epoch": 2.744737066393932, + "grad_norm": 76.50725598056701, + "learning_rate": 2.190701936224382e-07, + "loss": 1.4425, + "step": 32204 + }, + { + "epoch": 2.744822296087957, + "grad_norm": 71.77115371277537, + "learning_rate": 2.1892505273110652e-07, + "loss": 1.8397, + "step": 32205 + }, + { + "epoch": 2.7449075257819824, + "grad_norm": 35.65249340666942, + "learning_rate": 2.1877995885974613e-07, + "loss": 0.9298, + "step": 32206 + }, + { + "epoch": 2.744992755476008, + "grad_norm": 40.89621237647521, + "learning_rate": 2.1863491200978582e-07, + "loss": 0.8117, + "step": 32207 + }, + { + "epoch": 2.7450779851700333, + "grad_norm": 65.71119457630806, + "learning_rate": 2.184899121826517e-07, + "loss": 1.4082, + "step": 32208 + }, + { + "epoch": 2.745163214864059, + "grad_norm": 42.46029141805433, + "learning_rate": 2.1834495937976873e-07, + "loss": 0.7582, + "step": 32209 + }, + { + "epoch": 2.745248444558084, + "grad_norm": 70.64970205578203, + "learning_rate": 2.1820005360256247e-07, + "loss": 1.2325, + "step": 32210 + }, + { + "epoch": 2.7453336742521093, + "grad_norm": 51.59532117995758, + "learning_rate": 2.1805519485245895e-07, + "loss": 0.7073, + "step": 32211 + }, + { + "epoch": 2.7454189039461347, + "grad_norm": 92.68255809984862, + "learning_rate": 2.1791038313088264e-07, + "loss": 1.4247, + "step": 32212 + }, + { + "epoch": 2.74550413364016, + "grad_norm": 30.5759805254912, + "learning_rate": 2.1776561843925793e-07, + "loss": 0.6693, + "step": 32213 + }, + { + "epoch": 2.7455893633341857, + "grad_norm": 45.38282101346989, + "learning_rate": 2.1762090077900699e-07, + "loss": 1.2666, + "step": 32214 + }, + { + "epoch": 2.745674593028211, + "grad_norm": 54.51842585423954, + "learning_rate": 2.1747623015155484e-07, + "loss": 1.2559, + "step": 32215 + }, + { + "epoch": 2.7457598227222366, + "grad_norm": 48.089690066937536, + "learning_rate": 2.173316065583242e-07, + "loss": 0.8503, + "step": 32216 + }, + { + "epoch": 2.7458450524162616, + "grad_norm": 49.95586458618827, + "learning_rate": 2.1718703000073615e-07, + "loss": 1.6383, + "step": 32217 + }, + { + "epoch": 2.745930282110287, + "grad_norm": 65.86527952236746, + "learning_rate": 2.1704250048021237e-07, + "loss": 1.62, + "step": 32218 + }, + { + "epoch": 2.7460155118043126, + "grad_norm": 54.48283266392871, + "learning_rate": 2.1689801799817557e-07, + "loss": 1.245, + "step": 32219 + }, + { + "epoch": 2.746100741498338, + "grad_norm": 22.460588382674175, + "learning_rate": 2.1675358255604685e-07, + "loss": 0.8531, + "step": 32220 + }, + { + "epoch": 2.7461859711923635, + "grad_norm": 53.87681598071316, + "learning_rate": 2.1660919415524562e-07, + "loss": 1.1263, + "step": 32221 + }, + { + "epoch": 2.7462712008863885, + "grad_norm": 26.449676686056407, + "learning_rate": 2.1646485279719188e-07, + "loss": 0.7809, + "step": 32222 + }, + { + "epoch": 2.7463564305804145, + "grad_norm": 34.748585234443325, + "learning_rate": 2.1632055848330557e-07, + "loss": 1.1854, + "step": 32223 + }, + { + "epoch": 2.7464416602744395, + "grad_norm": 49.629847082950796, + "learning_rate": 2.1617631121500616e-07, + "loss": 1.8899, + "step": 32224 + }, + { + "epoch": 2.746526889968465, + "grad_norm": 33.477322815906845, + "learning_rate": 2.1603211099371135e-07, + "loss": 1.0827, + "step": 32225 + }, + { + "epoch": 2.7466121196624904, + "grad_norm": 38.89588918506569, + "learning_rate": 2.1588795782084005e-07, + "loss": 1.1435, + "step": 32226 + }, + { + "epoch": 2.746697349356516, + "grad_norm": 49.45826972105984, + "learning_rate": 2.1574385169780887e-07, + "loss": 1.1212, + "step": 32227 + }, + { + "epoch": 2.7467825790505414, + "grad_norm": 56.20681543963568, + "learning_rate": 2.155997926260367e-07, + "loss": 1.4719, + "step": 32228 + }, + { + "epoch": 2.7468678087445664, + "grad_norm": 64.93508411662285, + "learning_rate": 2.1545578060693962e-07, + "loss": 1.2858, + "step": 32229 + }, + { + "epoch": 2.746953038438592, + "grad_norm": 74.42053074314151, + "learning_rate": 2.1531181564193372e-07, + "loss": 1.7139, + "step": 32230 + }, + { + "epoch": 2.7470382681326173, + "grad_norm": 111.09827193801398, + "learning_rate": 2.1516789773243395e-07, + "loss": 2.5284, + "step": 32231 + }, + { + "epoch": 2.747123497826643, + "grad_norm": 52.98127174029985, + "learning_rate": 2.1502402687985756e-07, + "loss": 0.8843, + "step": 32232 + }, + { + "epoch": 2.7472087275206682, + "grad_norm": 61.696259112084434, + "learning_rate": 2.148802030856184e-07, + "loss": 1.5622, + "step": 32233 + }, + { + "epoch": 2.7472939572146937, + "grad_norm": 49.278917807926895, + "learning_rate": 2.1473642635113034e-07, + "loss": 1.7795, + "step": 32234 + }, + { + "epoch": 2.747379186908719, + "grad_norm": 36.32565710770829, + "learning_rate": 2.1459269667780834e-07, + "loss": 0.9227, + "step": 32235 + }, + { + "epoch": 2.747464416602744, + "grad_norm": 63.416105559239234, + "learning_rate": 2.144490140670652e-07, + "loss": 1.9029, + "step": 32236 + }, + { + "epoch": 2.7475496462967697, + "grad_norm": 37.290839339763586, + "learning_rate": 2.143053785203153e-07, + "loss": 1.0118, + "step": 32237 + }, + { + "epoch": 2.747634875990795, + "grad_norm": 37.40344548381446, + "learning_rate": 2.141617900389703e-07, + "loss": 1.2608, + "step": 32238 + }, + { + "epoch": 2.7477201056848206, + "grad_norm": 59.18489283798356, + "learning_rate": 2.1401824862444243e-07, + "loss": 1.1009, + "step": 32239 + }, + { + "epoch": 2.747805335378846, + "grad_norm": 30.00134563376655, + "learning_rate": 2.1387475427814276e-07, + "loss": 1.1895, + "step": 32240 + }, + { + "epoch": 2.7478905650728715, + "grad_norm": 24.819536200194463, + "learning_rate": 2.1373130700148347e-07, + "loss": 0.7281, + "step": 32241 + }, + { + "epoch": 2.747975794766897, + "grad_norm": 48.34485678066763, + "learning_rate": 2.1358790679587403e-07, + "loss": 0.9949, + "step": 32242 + }, + { + "epoch": 2.748061024460922, + "grad_norm": 32.689031627517615, + "learning_rate": 2.1344455366272666e-07, + "loss": 1.2586, + "step": 32243 + }, + { + "epoch": 2.7481462541549475, + "grad_norm": 57.47709766645931, + "learning_rate": 2.1330124760344907e-07, + "loss": 0.9656, + "step": 32244 + }, + { + "epoch": 2.748231483848973, + "grad_norm": 23.15869583925166, + "learning_rate": 2.1315798861945292e-07, + "loss": 0.8491, + "step": 32245 + }, + { + "epoch": 2.7483167135429984, + "grad_norm": 36.402527527515225, + "learning_rate": 2.1301477671214543e-07, + "loss": 0.9629, + "step": 32246 + }, + { + "epoch": 2.748401943237024, + "grad_norm": 80.48773470229004, + "learning_rate": 2.128716118829355e-07, + "loss": 1.948, + "step": 32247 + }, + { + "epoch": 2.748487172931049, + "grad_norm": 74.21013024263469, + "learning_rate": 2.1272849413323083e-07, + "loss": 1.757, + "step": 32248 + }, + { + "epoch": 2.748572402625075, + "grad_norm": 85.37204817283204, + "learning_rate": 2.1258542346443867e-07, + "loss": 1.8577, + "step": 32249 + }, + { + "epoch": 2.7486576323191, + "grad_norm": 55.60944278850983, + "learning_rate": 2.1244239987796678e-07, + "loss": 1.4837, + "step": 32250 + }, + { + "epoch": 2.7487428620131253, + "grad_norm": 23.84651546886714, + "learning_rate": 2.122994233752218e-07, + "loss": 1.2803, + "step": 32251 + }, + { + "epoch": 2.748828091707151, + "grad_norm": 36.5970814981765, + "learning_rate": 2.1215649395760984e-07, + "loss": 1.4204, + "step": 32252 + }, + { + "epoch": 2.7489133214011763, + "grad_norm": 60.91428707753444, + "learning_rate": 2.1201361162653534e-07, + "loss": 1.6525, + "step": 32253 + }, + { + "epoch": 2.7489985510952017, + "grad_norm": 29.545511142009428, + "learning_rate": 2.1187077638340548e-07, + "loss": 0.5586, + "step": 32254 + }, + { + "epoch": 2.7490837807892268, + "grad_norm": 37.96807182166114, + "learning_rate": 2.117279882296236e-07, + "loss": 1.2539, + "step": 32255 + }, + { + "epoch": 2.7491690104832522, + "grad_norm": 56.55339107193713, + "learning_rate": 2.1158524716659412e-07, + "loss": 1.0592, + "step": 32256 + }, + { + "epoch": 2.7492542401772777, + "grad_norm": 60.63507456312592, + "learning_rate": 2.1144255319572093e-07, + "loss": 1.6592, + "step": 32257 + }, + { + "epoch": 2.749339469871303, + "grad_norm": 51.157997832081804, + "learning_rate": 2.112999063184068e-07, + "loss": 1.3631, + "step": 32258 + }, + { + "epoch": 2.7494246995653286, + "grad_norm": 77.37306315955732, + "learning_rate": 2.1115730653605614e-07, + "loss": 1.418, + "step": 32259 + }, + { + "epoch": 2.749509929259354, + "grad_norm": 40.890234666079614, + "learning_rate": 2.1101475385007064e-07, + "loss": 0.8906, + "step": 32260 + }, + { + "epoch": 2.7495951589533796, + "grad_norm": 61.56960118528588, + "learning_rate": 2.1087224826185138e-07, + "loss": 1.8418, + "step": 32261 + }, + { + "epoch": 2.7496803886474046, + "grad_norm": 30.49912061468326, + "learning_rate": 2.107297897728011e-07, + "loss": 0.9592, + "step": 32262 + }, + { + "epoch": 2.74976561834143, + "grad_norm": 66.05232073261355, + "learning_rate": 2.1058737838431986e-07, + "loss": 1.9377, + "step": 32263 + }, + { + "epoch": 2.7498508480354555, + "grad_norm": 24.059364458587737, + "learning_rate": 2.1044501409780925e-07, + "loss": 0.6336, + "step": 32264 + }, + { + "epoch": 2.749936077729481, + "grad_norm": 51.46009624815199, + "learning_rate": 2.1030269691466764e-07, + "loss": 1.4791, + "step": 32265 + }, + { + "epoch": 2.7500213074235065, + "grad_norm": 32.54084909740569, + "learning_rate": 2.1016042683629612e-07, + "loss": 1.0639, + "step": 32266 + }, + { + "epoch": 2.7501065371175315, + "grad_norm": 30.149299155627915, + "learning_rate": 2.1001820386409411e-07, + "loss": 1.0922, + "step": 32267 + }, + { + "epoch": 2.7501917668115574, + "grad_norm": 45.63250549823788, + "learning_rate": 2.0987602799945995e-07, + "loss": 1.0334, + "step": 32268 + }, + { + "epoch": 2.7502769965055824, + "grad_norm": 64.67910115888421, + "learning_rate": 2.097338992437914e-07, + "loss": 1.9007, + "step": 32269 + }, + { + "epoch": 2.750362226199608, + "grad_norm": 53.61227339953482, + "learning_rate": 2.0959181759848567e-07, + "loss": 1.8096, + "step": 32270 + }, + { + "epoch": 2.7504474558936334, + "grad_norm": 43.16431694190081, + "learning_rate": 2.094497830649417e-07, + "loss": 1.3647, + "step": 32271 + }, + { + "epoch": 2.750532685587659, + "grad_norm": 32.82820628253865, + "learning_rate": 2.093077956445555e-07, + "loss": 0.7715, + "step": 32272 + }, + { + "epoch": 2.7506179152816843, + "grad_norm": 27.283050018478328, + "learning_rate": 2.091658553387227e-07, + "loss": 0.7472, + "step": 32273 + }, + { + "epoch": 2.7507031449757093, + "grad_norm": 24.497596000730486, + "learning_rate": 2.0902396214883992e-07, + "loss": 0.7914, + "step": 32274 + }, + { + "epoch": 2.750788374669735, + "grad_norm": 50.90355454668879, + "learning_rate": 2.0888211607630383e-07, + "loss": 1.4836, + "step": 32275 + }, + { + "epoch": 2.7508736043637603, + "grad_norm": 55.799634483150214, + "learning_rate": 2.087403171225083e-07, + "loss": 1.4972, + "step": 32276 + }, + { + "epoch": 2.7509588340577857, + "grad_norm": 39.5231832446641, + "learning_rate": 2.0859856528884725e-07, + "loss": 1.1901, + "step": 32277 + }, + { + "epoch": 2.751044063751811, + "grad_norm": 35.98534485238829, + "learning_rate": 2.0845686057671622e-07, + "loss": 1.1297, + "step": 32278 + }, + { + "epoch": 2.7511292934458367, + "grad_norm": 61.8734947114932, + "learning_rate": 2.0831520298750684e-07, + "loss": 1.6316, + "step": 32279 + }, + { + "epoch": 2.751214523139862, + "grad_norm": 66.51364956007285, + "learning_rate": 2.0817359252261416e-07, + "loss": 1.713, + "step": 32280 + }, + { + "epoch": 2.751299752833887, + "grad_norm": 47.49693492813449, + "learning_rate": 2.0803202918342924e-07, + "loss": 0.9604, + "step": 32281 + }, + { + "epoch": 2.7513849825279126, + "grad_norm": 40.93700564651979, + "learning_rate": 2.0789051297134654e-07, + "loss": 0.6242, + "step": 32282 + }, + { + "epoch": 2.751470212221938, + "grad_norm": 74.30909449684805, + "learning_rate": 2.0774904388775497e-07, + "loss": 1.6852, + "step": 32283 + }, + { + "epoch": 2.7515554419159636, + "grad_norm": 86.45891853064029, + "learning_rate": 2.0760762193404837e-07, + "loss": 1.8773, + "step": 32284 + }, + { + "epoch": 2.751640671609989, + "grad_norm": 33.518823316863376, + "learning_rate": 2.0746624711161621e-07, + "loss": 1.0724, + "step": 32285 + }, + { + "epoch": 2.751725901304014, + "grad_norm": 37.86718617736807, + "learning_rate": 2.0732491942184963e-07, + "loss": 1.1031, + "step": 32286 + }, + { + "epoch": 2.75181113099804, + "grad_norm": 51.19121598625111, + "learning_rate": 2.071836388661369e-07, + "loss": 1.2511, + "step": 32287 + }, + { + "epoch": 2.751896360692065, + "grad_norm": 81.56953758053892, + "learning_rate": 2.0704240544586972e-07, + "loss": 1.5666, + "step": 32288 + }, + { + "epoch": 2.7519815903860905, + "grad_norm": 44.39598164192587, + "learning_rate": 2.0690121916243533e-07, + "loss": 1.2676, + "step": 32289 + }, + { + "epoch": 2.752066820080116, + "grad_norm": 55.709944792044276, + "learning_rate": 2.0676008001722315e-07, + "loss": 0.9762, + "step": 32290 + }, + { + "epoch": 2.7521520497741414, + "grad_norm": 43.24018668407733, + "learning_rate": 2.0661898801162094e-07, + "loss": 0.8806, + "step": 32291 + }, + { + "epoch": 2.752237279468167, + "grad_norm": 41.17367600344396, + "learning_rate": 2.0647794314701597e-07, + "loss": 1.6102, + "step": 32292 + }, + { + "epoch": 2.752322509162192, + "grad_norm": 60.22199380913848, + "learning_rate": 2.0633694542479599e-07, + "loss": 1.8728, + "step": 32293 + }, + { + "epoch": 2.7524077388562174, + "grad_norm": 53.957128223594786, + "learning_rate": 2.0619599484634712e-07, + "loss": 1.293, + "step": 32294 + }, + { + "epoch": 2.752492968550243, + "grad_norm": 44.28403093405931, + "learning_rate": 2.0605509141305603e-07, + "loss": 1.1287, + "step": 32295 + }, + { + "epoch": 2.7525781982442683, + "grad_norm": 43.22458341968722, + "learning_rate": 2.059142351263077e-07, + "loss": 0.8976, + "step": 32296 + }, + { + "epoch": 2.7526634279382938, + "grad_norm": 25.77162768460853, + "learning_rate": 2.0577342598748772e-07, + "loss": 0.8552, + "step": 32297 + }, + { + "epoch": 2.7527486576323192, + "grad_norm": 77.53763070282373, + "learning_rate": 2.0563266399798165e-07, + "loss": 2.6457, + "step": 32298 + }, + { + "epoch": 2.7528338873263447, + "grad_norm": 85.78937257483594, + "learning_rate": 2.0549194915917337e-07, + "loss": 2.5135, + "step": 32299 + }, + { + "epoch": 2.7529191170203697, + "grad_norm": 67.95438220185875, + "learning_rate": 2.0535128147244622e-07, + "loss": 2.2154, + "step": 32300 + }, + { + "epoch": 2.753004346714395, + "grad_norm": 22.145641068388155, + "learning_rate": 2.0521066093918408e-07, + "loss": 0.8281, + "step": 32301 + }, + { + "epoch": 2.7530895764084207, + "grad_norm": 65.17574373711089, + "learning_rate": 2.050700875607703e-07, + "loss": 2.4723, + "step": 32302 + }, + { + "epoch": 2.753174806102446, + "grad_norm": 87.64379494419352, + "learning_rate": 2.0492956133858654e-07, + "loss": 2.1482, + "step": 32303 + }, + { + "epoch": 2.7532600357964716, + "grad_norm": 65.06919303746658, + "learning_rate": 2.0478908227401505e-07, + "loss": 1.7007, + "step": 32304 + }, + { + "epoch": 2.7533452654904966, + "grad_norm": 62.46539263915804, + "learning_rate": 2.0464865036843696e-07, + "loss": 1.5308, + "step": 32305 + }, + { + "epoch": 2.7534304951845225, + "grad_norm": 49.2186784841747, + "learning_rate": 2.0450826562323446e-07, + "loss": 1.2723, + "step": 32306 + }, + { + "epoch": 2.7535157248785476, + "grad_norm": 40.51458835559976, + "learning_rate": 2.043679280397881e-07, + "loss": 1.2527, + "step": 32307 + }, + { + "epoch": 2.753600954572573, + "grad_norm": 55.501575674146764, + "learning_rate": 2.042276376194774e-07, + "loss": 1.712, + "step": 32308 + }, + { + "epoch": 2.7536861842665985, + "grad_norm": 57.447702515031835, + "learning_rate": 2.0408739436368174e-07, + "loss": 1.1075, + "step": 32309 + }, + { + "epoch": 2.753771413960624, + "grad_norm": 46.625276746307286, + "learning_rate": 2.039471982737812e-07, + "loss": 1.832, + "step": 32310 + }, + { + "epoch": 2.7538566436546494, + "grad_norm": 61.01870109402504, + "learning_rate": 2.038070493511546e-07, + "loss": 1.503, + "step": 32311 + }, + { + "epoch": 2.7539418733486745, + "grad_norm": 75.31984093571756, + "learning_rate": 2.0366694759717865e-07, + "loss": 1.7888, + "step": 32312 + }, + { + "epoch": 2.7540271030427, + "grad_norm": 38.48245874777142, + "learning_rate": 2.035268930132328e-07, + "loss": 0.9875, + "step": 32313 + }, + { + "epoch": 2.7541123327367254, + "grad_norm": 57.031425978370194, + "learning_rate": 2.033868856006943e-07, + "loss": 1.384, + "step": 32314 + }, + { + "epoch": 2.754197562430751, + "grad_norm": 49.77533968475468, + "learning_rate": 2.0324692536094037e-07, + "loss": 1.6948, + "step": 32315 + }, + { + "epoch": 2.7542827921247763, + "grad_norm": 45.537760771838386, + "learning_rate": 2.0310701229534658e-07, + "loss": 1.7189, + "step": 32316 + }, + { + "epoch": 2.754368021818802, + "grad_norm": 66.7360249528321, + "learning_rate": 2.0296714640528848e-07, + "loss": 1.5192, + "step": 32317 + }, + { + "epoch": 2.7544532515128273, + "grad_norm": 52.6990421555818, + "learning_rate": 2.0282732769214274e-07, + "loss": 1.3601, + "step": 32318 + }, + { + "epoch": 2.7545384812068523, + "grad_norm": 90.72573593549326, + "learning_rate": 2.026875561572844e-07, + "loss": 1.7017, + "step": 32319 + }, + { + "epoch": 2.7546237109008778, + "grad_norm": 30.964738865856447, + "learning_rate": 2.025478318020868e-07, + "loss": 0.7751, + "step": 32320 + }, + { + "epoch": 2.7547089405949032, + "grad_norm": 56.19205272875374, + "learning_rate": 2.0240815462792606e-07, + "loss": 1.6328, + "step": 32321 + }, + { + "epoch": 2.7547941702889287, + "grad_norm": 48.104035160329396, + "learning_rate": 2.0226852463617385e-07, + "loss": 1.3994, + "step": 32322 + }, + { + "epoch": 2.754879399982954, + "grad_norm": 29.044745843175114, + "learning_rate": 2.0212894182820464e-07, + "loss": 1.044, + "step": 32323 + }, + { + "epoch": 2.754964629676979, + "grad_norm": 78.93910340692342, + "learning_rate": 2.019894062053912e-07, + "loss": 2.6122, + "step": 32324 + }, + { + "epoch": 2.755049859371005, + "grad_norm": 74.90418950238195, + "learning_rate": 2.0184991776910524e-07, + "loss": 1.5787, + "step": 32325 + }, + { + "epoch": 2.75513508906503, + "grad_norm": 100.60777849084708, + "learning_rate": 2.0171047652071785e-07, + "loss": 1.2339, + "step": 32326 + }, + { + "epoch": 2.7552203187590556, + "grad_norm": 59.32865792202208, + "learning_rate": 2.015710824616024e-07, + "loss": 1.5161, + "step": 32327 + }, + { + "epoch": 2.755305548453081, + "grad_norm": 19.722923523521693, + "learning_rate": 2.014317355931278e-07, + "loss": 0.7871, + "step": 32328 + }, + { + "epoch": 2.7553907781471065, + "grad_norm": 71.54599362923108, + "learning_rate": 2.012924359166657e-07, + "loss": 1.6821, + "step": 32329 + }, + { + "epoch": 2.755476007841132, + "grad_norm": 49.79463972208526, + "learning_rate": 2.0115318343358558e-07, + "loss": 1.0521, + "step": 32330 + }, + { + "epoch": 2.755561237535157, + "grad_norm": 26.659879220489486, + "learning_rate": 2.0101397814525747e-07, + "loss": 0.8919, + "step": 32331 + }, + { + "epoch": 2.7556464672291825, + "grad_norm": 31.581129982868955, + "learning_rate": 2.008748200530497e-07, + "loss": 0.9843, + "step": 32332 + }, + { + "epoch": 2.755731696923208, + "grad_norm": 42.50654851097171, + "learning_rate": 2.007357091583312e-07, + "loss": 1.2269, + "step": 32333 + }, + { + "epoch": 2.7558169266172334, + "grad_norm": 39.85365213082162, + "learning_rate": 2.0059664546246915e-07, + "loss": 0.8109, + "step": 32334 + }, + { + "epoch": 2.755902156311259, + "grad_norm": 59.156731974868954, + "learning_rate": 2.0045762896683306e-07, + "loss": 1.6594, + "step": 32335 + }, + { + "epoch": 2.7559873860052844, + "grad_norm": 69.04801755194467, + "learning_rate": 2.0031865967278796e-07, + "loss": 1.8919, + "step": 32336 + }, + { + "epoch": 2.75607261569931, + "grad_norm": 24.999948447659474, + "learning_rate": 2.0017973758170272e-07, + "loss": 0.9045, + "step": 32337 + }, + { + "epoch": 2.756157845393335, + "grad_norm": 34.19872708890088, + "learning_rate": 2.0004086269494237e-07, + "loss": 0.6238, + "step": 32338 + }, + { + "epoch": 2.7562430750873603, + "grad_norm": 44.89557220864634, + "learning_rate": 1.9990203501387194e-07, + "loss": 0.6522, + "step": 32339 + }, + { + "epoch": 2.756328304781386, + "grad_norm": 65.12793708452463, + "learning_rate": 1.9976325453985812e-07, + "loss": 2.137, + "step": 32340 + }, + { + "epoch": 2.7564135344754113, + "grad_norm": 75.15665886882908, + "learning_rate": 1.996245212742659e-07, + "loss": 1.9683, + "step": 32341 + }, + { + "epoch": 2.7564987641694367, + "grad_norm": 27.587685401523622, + "learning_rate": 1.9948583521845867e-07, + "loss": 1.1905, + "step": 32342 + }, + { + "epoch": 2.7565839938634618, + "grad_norm": 55.65149406530005, + "learning_rate": 1.9934719637379973e-07, + "loss": 1.5085, + "step": 32343 + }, + { + "epoch": 2.7566692235574877, + "grad_norm": 100.29403449017762, + "learning_rate": 1.9920860474165415e-07, + "loss": 2.469, + "step": 32344 + }, + { + "epoch": 2.7567544532515127, + "grad_norm": 47.4291760068356, + "learning_rate": 1.990700603233847e-07, + "loss": 1.4432, + "step": 32345 + }, + { + "epoch": 2.756839682945538, + "grad_norm": 58.03096768405117, + "learning_rate": 1.989315631203531e-07, + "loss": 1.7165, + "step": 32346 + }, + { + "epoch": 2.7569249126395636, + "grad_norm": 67.92947570897034, + "learning_rate": 1.9879311313392159e-07, + "loss": 2.084, + "step": 32347 + }, + { + "epoch": 2.757010142333589, + "grad_norm": 60.23479579411014, + "learning_rate": 1.9865471036545293e-07, + "loss": 1.2427, + "step": 32348 + }, + { + "epoch": 2.7570953720276146, + "grad_norm": 55.21077547275458, + "learning_rate": 1.985163548163066e-07, + "loss": 1.733, + "step": 32349 + }, + { + "epoch": 2.7571806017216396, + "grad_norm": 52.05807143007601, + "learning_rate": 1.9837804648784432e-07, + "loss": 1.6237, + "step": 32350 + }, + { + "epoch": 2.757265831415665, + "grad_norm": 38.97091404235042, + "learning_rate": 1.9823978538142552e-07, + "loss": 1.0629, + "step": 32351 + }, + { + "epoch": 2.7573510611096905, + "grad_norm": 41.67897812544289, + "learning_rate": 1.9810157149841024e-07, + "loss": 1.3054, + "step": 32352 + }, + { + "epoch": 2.757436290803716, + "grad_norm": 48.81460795189078, + "learning_rate": 1.9796340484015853e-07, + "loss": 1.2271, + "step": 32353 + }, + { + "epoch": 2.7575215204977415, + "grad_norm": 25.86904628224569, + "learning_rate": 1.978252854080287e-07, + "loss": 0.8956, + "step": 32354 + }, + { + "epoch": 2.757606750191767, + "grad_norm": 28.7865018420193, + "learning_rate": 1.9768721320337914e-07, + "loss": 0.7231, + "step": 32355 + }, + { + "epoch": 2.7576919798857924, + "grad_norm": 57.97200083156978, + "learning_rate": 1.9754918822756653e-07, + "loss": 1.5498, + "step": 32356 + }, + { + "epoch": 2.7577772095798174, + "grad_norm": 76.68392569310413, + "learning_rate": 1.9741121048195033e-07, + "loss": 1.3135, + "step": 32357 + }, + { + "epoch": 2.757862439273843, + "grad_norm": 67.48322558728044, + "learning_rate": 1.9727327996788615e-07, + "loss": 1.8816, + "step": 32358 + }, + { + "epoch": 2.7579476689678684, + "grad_norm": 46.0601209961262, + "learning_rate": 1.9713539668673064e-07, + "loss": 1.4166, + "step": 32359 + }, + { + "epoch": 2.758032898661894, + "grad_norm": 49.42806921610446, + "learning_rate": 1.9699756063984e-07, + "loss": 2.2612, + "step": 32360 + }, + { + "epoch": 2.7581181283559193, + "grad_norm": 75.21925093878724, + "learning_rate": 1.968597718285703e-07, + "loss": 2.1805, + "step": 32361 + }, + { + "epoch": 2.7582033580499448, + "grad_norm": 98.50962085781664, + "learning_rate": 1.9672203025427604e-07, + "loss": 2.1325, + "step": 32362 + }, + { + "epoch": 2.7582885877439702, + "grad_norm": 38.899862350782115, + "learning_rate": 1.9658433591831228e-07, + "loss": 0.9257, + "step": 32363 + }, + { + "epoch": 2.7583738174379953, + "grad_norm": 56.82463609648236, + "learning_rate": 1.9644668882203234e-07, + "loss": 2.0361, + "step": 32364 + }, + { + "epoch": 2.7584590471320207, + "grad_norm": 28.413575700679637, + "learning_rate": 1.9630908896679013e-07, + "loss": 0.8315, + "step": 32365 + }, + { + "epoch": 2.758544276826046, + "grad_norm": 31.943459978546848, + "learning_rate": 1.9617153635394016e-07, + "loss": 1.2459, + "step": 32366 + }, + { + "epoch": 2.7586295065200717, + "grad_norm": 26.4436049263082, + "learning_rate": 1.9603403098483353e-07, + "loss": 1.0142, + "step": 32367 + }, + { + "epoch": 2.758714736214097, + "grad_norm": 57.21631175368987, + "learning_rate": 1.9589657286082363e-07, + "loss": 1.6475, + "step": 32368 + }, + { + "epoch": 2.758799965908122, + "grad_norm": 92.98485688982888, + "learning_rate": 1.957591619832616e-07, + "loss": 1.6304, + "step": 32369 + }, + { + "epoch": 2.7588851956021476, + "grad_norm": 60.45989485348568, + "learning_rate": 1.956217983534997e-07, + "loss": 1.5029, + "step": 32370 + }, + { + "epoch": 2.758970425296173, + "grad_norm": 41.86028904922002, + "learning_rate": 1.954844819728885e-07, + "loss": 1.8841, + "step": 32371 + }, + { + "epoch": 2.7590556549901986, + "grad_norm": 29.462775448154208, + "learning_rate": 1.9534721284277802e-07, + "loss": 0.9923, + "step": 32372 + }, + { + "epoch": 2.759140884684224, + "grad_norm": 43.325985229434124, + "learning_rate": 1.9520999096451833e-07, + "loss": 1.3461, + "step": 32373 + }, + { + "epoch": 2.7592261143782495, + "grad_norm": 87.8914850899276, + "learning_rate": 1.950728163394594e-07, + "loss": 2.6977, + "step": 32374 + }, + { + "epoch": 2.759311344072275, + "grad_norm": 31.815892187835495, + "learning_rate": 1.9493568896894965e-07, + "loss": 0.9261, + "step": 32375 + }, + { + "epoch": 2.7593965737663, + "grad_norm": 31.08489007469884, + "learning_rate": 1.9479860885433856e-07, + "loss": 1.4135, + "step": 32376 + }, + { + "epoch": 2.7594818034603255, + "grad_norm": 57.62988035496829, + "learning_rate": 1.9466157599697278e-07, + "loss": 1.4507, + "step": 32377 + }, + { + "epoch": 2.759567033154351, + "grad_norm": 66.70264499653129, + "learning_rate": 1.945245903982018e-07, + "loss": 1.165, + "step": 32378 + }, + { + "epoch": 2.7596522628483764, + "grad_norm": 37.53236362799081, + "learning_rate": 1.9438765205937237e-07, + "loss": 1.1641, + "step": 32379 + }, + { + "epoch": 2.759737492542402, + "grad_norm": 30.05733230036994, + "learning_rate": 1.942507609818306e-07, + "loss": 0.8784, + "step": 32380 + }, + { + "epoch": 2.7598227222364273, + "grad_norm": 78.47628239744135, + "learning_rate": 1.9411391716692262e-07, + "loss": 1.8333, + "step": 32381 + }, + { + "epoch": 2.759907951930453, + "grad_norm": 41.65277469793946, + "learning_rate": 1.9397712061599405e-07, + "loss": 1.168, + "step": 32382 + }, + { + "epoch": 2.759993181624478, + "grad_norm": 83.45409876344341, + "learning_rate": 1.9384037133039101e-07, + "loss": 1.7667, + "step": 32383 + }, + { + "epoch": 2.7600784113185033, + "grad_norm": 45.084360752513106, + "learning_rate": 1.9370366931145856e-07, + "loss": 1.2282, + "step": 32384 + }, + { + "epoch": 2.7601636410125288, + "grad_norm": 61.25517154061776, + "learning_rate": 1.9356701456054118e-07, + "loss": 1.6689, + "step": 32385 + }, + { + "epoch": 2.7602488707065542, + "grad_norm": 41.515298560292905, + "learning_rate": 1.9343040707898108e-07, + "loss": 1.5104, + "step": 32386 + }, + { + "epoch": 2.7603341004005797, + "grad_norm": 73.66474013524656, + "learning_rate": 1.9329384686812392e-07, + "loss": 1.8277, + "step": 32387 + }, + { + "epoch": 2.7604193300946047, + "grad_norm": 17.548957234821998, + "learning_rate": 1.931573339293119e-07, + "loss": 0.7567, + "step": 32388 + }, + { + "epoch": 2.7605045597886306, + "grad_norm": 82.5148320648467, + "learning_rate": 1.9302086826388733e-07, + "loss": 2.156, + "step": 32389 + }, + { + "epoch": 2.7605897894826557, + "grad_norm": 38.845526889966884, + "learning_rate": 1.9288444987319188e-07, + "loss": 0.8674, + "step": 32390 + }, + { + "epoch": 2.760675019176681, + "grad_norm": 37.107257081929475, + "learning_rate": 1.9274807875856728e-07, + "loss": 0.8901, + "step": 32391 + }, + { + "epoch": 2.7607602488707066, + "grad_norm": 153.64937593261465, + "learning_rate": 1.9261175492135632e-07, + "loss": 1.9824, + "step": 32392 + }, + { + "epoch": 2.760845478564732, + "grad_norm": 24.177141853183134, + "learning_rate": 1.9247547836289792e-07, + "loss": 0.7865, + "step": 32393 + }, + { + "epoch": 2.7609307082587575, + "grad_norm": 50.473937427129286, + "learning_rate": 1.9233924908453327e-07, + "loss": 1.4513, + "step": 32394 + }, + { + "epoch": 2.7610159379527826, + "grad_norm": 79.04953448020439, + "learning_rate": 1.9220306708760073e-07, + "loss": 2.3402, + "step": 32395 + }, + { + "epoch": 2.761101167646808, + "grad_norm": 68.61036487909608, + "learning_rate": 1.9206693237344143e-07, + "loss": 1.8885, + "step": 32396 + }, + { + "epoch": 2.7611863973408335, + "grad_norm": 52.52176366250045, + "learning_rate": 1.9193084494339376e-07, + "loss": 1.6629, + "step": 32397 + }, + { + "epoch": 2.761271627034859, + "grad_norm": 36.578528585156846, + "learning_rate": 1.9179480479879443e-07, + "loss": 1.0158, + "step": 32398 + }, + { + "epoch": 2.7613568567288844, + "grad_norm": 57.639521061460684, + "learning_rate": 1.9165881194098292e-07, + "loss": 1.7628, + "step": 32399 + }, + { + "epoch": 2.76144208642291, + "grad_norm": 59.69556309780665, + "learning_rate": 1.9152286637129703e-07, + "loss": 1.1583, + "step": 32400 + }, + { + "epoch": 2.7615273161169354, + "grad_norm": 85.9184063577351, + "learning_rate": 1.9138696809107294e-07, + "loss": 1.7288, + "step": 32401 + }, + { + "epoch": 2.7616125458109604, + "grad_norm": 56.39770417286039, + "learning_rate": 1.9125111710164734e-07, + "loss": 1.7677, + "step": 32402 + }, + { + "epoch": 2.761697775504986, + "grad_norm": 37.368471866028756, + "learning_rate": 1.911153134043553e-07, + "loss": 1.1269, + "step": 32403 + }, + { + "epoch": 2.7617830051990113, + "grad_norm": 72.84583175281183, + "learning_rate": 1.9097955700053404e-07, + "loss": 1.6779, + "step": 32404 + }, + { + "epoch": 2.761868234893037, + "grad_norm": 71.38033369246722, + "learning_rate": 1.908438478915181e-07, + "loss": 1.7893, + "step": 32405 + }, + { + "epoch": 2.7619534645870623, + "grad_norm": 43.13981949995626, + "learning_rate": 1.907081860786414e-07, + "loss": 1.7038, + "step": 32406 + }, + { + "epoch": 2.7620386942810873, + "grad_norm": 48.287383117928215, + "learning_rate": 1.905725715632384e-07, + "loss": 2.0011, + "step": 32407 + }, + { + "epoch": 2.762123923975113, + "grad_norm": 52.34831025036737, + "learning_rate": 1.904370043466436e-07, + "loss": 0.9431, + "step": 32408 + }, + { + "epoch": 2.762209153669138, + "grad_norm": 26.933018707766312, + "learning_rate": 1.903014844301898e-07, + "loss": 0.7539, + "step": 32409 + }, + { + "epoch": 2.7622943833631637, + "grad_norm": 45.587037755056286, + "learning_rate": 1.901660118152099e-07, + "loss": 1.4283, + "step": 32410 + }, + { + "epoch": 2.762379613057189, + "grad_norm": 48.73274193169446, + "learning_rate": 1.9003058650303608e-07, + "loss": 1.5917, + "step": 32411 + }, + { + "epoch": 2.7624648427512146, + "grad_norm": 66.44186701301868, + "learning_rate": 1.8989520849499955e-07, + "loss": 1.7179, + "step": 32412 + }, + { + "epoch": 2.76255007244524, + "grad_norm": 17.689936504848113, + "learning_rate": 1.8975987779243255e-07, + "loss": 0.6795, + "step": 32413 + }, + { + "epoch": 2.762635302139265, + "grad_norm": 34.93942662847705, + "learning_rate": 1.8962459439666515e-07, + "loss": 1.5662, + "step": 32414 + }, + { + "epoch": 2.7627205318332906, + "grad_norm": 45.58045032623305, + "learning_rate": 1.8948935830902958e-07, + "loss": 0.9847, + "step": 32415 + }, + { + "epoch": 2.762805761527316, + "grad_norm": 54.042649214475134, + "learning_rate": 1.8935416953085373e-07, + "loss": 0.8734, + "step": 32416 + }, + { + "epoch": 2.7628909912213415, + "grad_norm": 33.49787024083863, + "learning_rate": 1.892190280634687e-07, + "loss": 1.0851, + "step": 32417 + }, + { + "epoch": 2.762976220915367, + "grad_norm": 48.99672222347029, + "learning_rate": 1.8908393390820235e-07, + "loss": 1.1677, + "step": 32418 + }, + { + "epoch": 2.7630614506093925, + "grad_norm": 70.86567045194887, + "learning_rate": 1.889488870663847e-07, + "loss": 2.708, + "step": 32419 + }, + { + "epoch": 2.763146680303418, + "grad_norm": 44.243395315534094, + "learning_rate": 1.8881388753934193e-07, + "loss": 1.4434, + "step": 32420 + }, + { + "epoch": 2.763231909997443, + "grad_norm": 34.71780204446781, + "learning_rate": 1.8867893532840354e-07, + "loss": 1.1587, + "step": 32421 + }, + { + "epoch": 2.7633171396914684, + "grad_norm": 22.51345316894865, + "learning_rate": 1.8854403043489512e-07, + "loss": 0.753, + "step": 32422 + }, + { + "epoch": 2.763402369385494, + "grad_norm": 41.96030946002281, + "learning_rate": 1.884091728601456e-07, + "loss": 1.509, + "step": 32423 + }, + { + "epoch": 2.7634875990795194, + "grad_norm": 49.992466460501, + "learning_rate": 1.8827436260547892e-07, + "loss": 1.342, + "step": 32424 + }, + { + "epoch": 2.763572828773545, + "grad_norm": 49.51834216858762, + "learning_rate": 1.8813959967222183e-07, + "loss": 1.6581, + "step": 32425 + }, + { + "epoch": 2.76365805846757, + "grad_norm": 60.06166726616189, + "learning_rate": 1.8800488406170047e-07, + "loss": 1.8879, + "step": 32426 + }, + { + "epoch": 2.7637432881615958, + "grad_norm": 61.94491832498381, + "learning_rate": 1.878702157752388e-07, + "loss": 1.0834, + "step": 32427 + }, + { + "epoch": 2.763828517855621, + "grad_norm": 29.78048314621151, + "learning_rate": 1.877355948141618e-07, + "loss": 0.8775, + "step": 32428 + }, + { + "epoch": 2.7639137475496462, + "grad_norm": 24.94753542184764, + "learning_rate": 1.8760102117979183e-07, + "loss": 0.7622, + "step": 32429 + }, + { + "epoch": 2.7639989772436717, + "grad_norm": 37.3194813368663, + "learning_rate": 1.8746649487345393e-07, + "loss": 0.8525, + "step": 32430 + }, + { + "epoch": 2.764084206937697, + "grad_norm": 31.890656561800917, + "learning_rate": 1.8733201589647087e-07, + "loss": 1.0059, + "step": 32431 + }, + { + "epoch": 2.7641694366317227, + "grad_norm": 60.85175412900264, + "learning_rate": 1.8719758425016555e-07, + "loss": 1.7955, + "step": 32432 + }, + { + "epoch": 2.7642546663257477, + "grad_norm": 62.10294727039784, + "learning_rate": 1.870631999358591e-07, + "loss": 1.5224, + "step": 32433 + }, + { + "epoch": 2.764339896019773, + "grad_norm": 88.32453373007695, + "learning_rate": 1.869288629548738e-07, + "loss": 1.5435, + "step": 32434 + }, + { + "epoch": 2.7644251257137986, + "grad_norm": 47.32174523031627, + "learning_rate": 1.8679457330853078e-07, + "loss": 1.6988, + "step": 32435 + }, + { + "epoch": 2.764510355407824, + "grad_norm": 70.43142560967684, + "learning_rate": 1.8666033099815017e-07, + "loss": 1.1586, + "step": 32436 + }, + { + "epoch": 2.7645955851018496, + "grad_norm": 67.82368844226404, + "learning_rate": 1.8652613602505254e-07, + "loss": 2.2191, + "step": 32437 + }, + { + "epoch": 2.764680814795875, + "grad_norm": 54.42956212447459, + "learning_rate": 1.863919883905574e-07, + "loss": 1.7436, + "step": 32438 + }, + { + "epoch": 2.7647660444899005, + "grad_norm": 29.054944543688922, + "learning_rate": 1.8625788809598478e-07, + "loss": 0.9314, + "step": 32439 + }, + { + "epoch": 2.7648512741839255, + "grad_norm": 23.955981353124113, + "learning_rate": 1.861238351426531e-07, + "loss": 0.7043, + "step": 32440 + }, + { + "epoch": 2.764936503877951, + "grad_norm": 57.16581760331423, + "learning_rate": 1.859898295318807e-07, + "loss": 1.7729, + "step": 32441 + }, + { + "epoch": 2.7650217335719764, + "grad_norm": 54.949605948772565, + "learning_rate": 1.8585587126498494e-07, + "loss": 1.5835, + "step": 32442 + }, + { + "epoch": 2.765106963266002, + "grad_norm": 60.21982713642947, + "learning_rate": 1.8572196034328415e-07, + "loss": 1.4762, + "step": 32443 + }, + { + "epoch": 2.7651921929600274, + "grad_norm": 19.354699133006363, + "learning_rate": 1.8558809676809507e-07, + "loss": 0.7572, + "step": 32444 + }, + { + "epoch": 2.7652774226540524, + "grad_norm": 71.76938862484272, + "learning_rate": 1.8545428054073334e-07, + "loss": 1.9976, + "step": 32445 + }, + { + "epoch": 2.7653626523480783, + "grad_norm": 40.57626388174706, + "learning_rate": 1.8532051166251563e-07, + "loss": 0.8139, + "step": 32446 + }, + { + "epoch": 2.7654478820421033, + "grad_norm": 65.31071846343788, + "learning_rate": 1.8518679013475816e-07, + "loss": 1.5965, + "step": 32447 + }, + { + "epoch": 2.765533111736129, + "grad_norm": 80.3428475328164, + "learning_rate": 1.850531159587754e-07, + "loss": 2.3011, + "step": 32448 + }, + { + "epoch": 2.7656183414301543, + "grad_norm": 52.000459927217676, + "learning_rate": 1.8491948913588186e-07, + "loss": 1.4033, + "step": 32449 + }, + { + "epoch": 2.7657035711241797, + "grad_norm": 61.63835418032478, + "learning_rate": 1.847859096673915e-07, + "loss": 1.7291, + "step": 32450 + }, + { + "epoch": 2.765788800818205, + "grad_norm": 28.079012927223, + "learning_rate": 1.8465237755461885e-07, + "loss": 0.7614, + "step": 32451 + }, + { + "epoch": 2.7658740305122302, + "grad_norm": 86.860590996666, + "learning_rate": 1.8451889279887612e-07, + "loss": 2.0757, + "step": 32452 + }, + { + "epoch": 2.7659592602062557, + "grad_norm": 34.71539839445956, + "learning_rate": 1.8438545540147678e-07, + "loss": 1.1741, + "step": 32453 + }, + { + "epoch": 2.766044489900281, + "grad_norm": 68.6779995003519, + "learning_rate": 1.8425206536373307e-07, + "loss": 2.0528, + "step": 32454 + }, + { + "epoch": 2.7661297195943066, + "grad_norm": 28.9463857884984, + "learning_rate": 1.841187226869562e-07, + "loss": 0.8459, + "step": 32455 + }, + { + "epoch": 2.766214949288332, + "grad_norm": 34.40774239813663, + "learning_rate": 1.839854273724584e-07, + "loss": 1.0368, + "step": 32456 + }, + { + "epoch": 2.7663001789823576, + "grad_norm": 66.24386018395782, + "learning_rate": 1.838521794215503e-07, + "loss": 1.4786, + "step": 32457 + }, + { + "epoch": 2.766385408676383, + "grad_norm": 65.12334721852001, + "learning_rate": 1.8371897883554258e-07, + "loss": 1.9237, + "step": 32458 + }, + { + "epoch": 2.766470638370408, + "grad_norm": 52.20828043559161, + "learning_rate": 1.8358582561574413e-07, + "loss": 1.3137, + "step": 32459 + }, + { + "epoch": 2.7665558680644335, + "grad_norm": 71.71630420588365, + "learning_rate": 1.8345271976346611e-07, + "loss": 1.53, + "step": 32460 + }, + { + "epoch": 2.766641097758459, + "grad_norm": 40.91891611070374, + "learning_rate": 1.8331966128001587e-07, + "loss": 1.03, + "step": 32461 + }, + { + "epoch": 2.7667263274524845, + "grad_norm": 67.02906342642927, + "learning_rate": 1.831866501667029e-07, + "loss": 1.8444, + "step": 32462 + }, + { + "epoch": 2.76681155714651, + "grad_norm": 40.468842841596285, + "learning_rate": 1.8305368642483556e-07, + "loss": 0.9163, + "step": 32463 + }, + { + "epoch": 2.766896786840535, + "grad_norm": 48.6162653487929, + "learning_rate": 1.8292077005572118e-07, + "loss": 1.1442, + "step": 32464 + }, + { + "epoch": 2.766982016534561, + "grad_norm": 49.869132345645895, + "learning_rate": 1.82787901060667e-07, + "loss": 1.565, + "step": 32465 + }, + { + "epoch": 2.767067246228586, + "grad_norm": 58.021929344257956, + "learning_rate": 1.8265507944097926e-07, + "loss": 1.2234, + "step": 32466 + }, + { + "epoch": 2.7671524759226114, + "grad_norm": 37.838580934685815, + "learning_rate": 1.825223051979652e-07, + "loss": 0.8595, + "step": 32467 + }, + { + "epoch": 2.767237705616637, + "grad_norm": 54.83286637815103, + "learning_rate": 1.823895783329288e-07, + "loss": 1.1123, + "step": 32468 + }, + { + "epoch": 2.7673229353106623, + "grad_norm": 59.98064681423721, + "learning_rate": 1.8225689884717736e-07, + "loss": 1.7318, + "step": 32469 + }, + { + "epoch": 2.767408165004688, + "grad_norm": 67.43034166965697, + "learning_rate": 1.8212426674201477e-07, + "loss": 1.5513, + "step": 32470 + }, + { + "epoch": 2.767493394698713, + "grad_norm": 29.483641910327883, + "learning_rate": 1.8199168201874618e-07, + "loss": 1.4225, + "step": 32471 + }, + { + "epoch": 2.7675786243927383, + "grad_norm": 46.630440301569465, + "learning_rate": 1.8185914467867383e-07, + "loss": 1.5583, + "step": 32472 + }, + { + "epoch": 2.7676638540867637, + "grad_norm": 28.654589352437476, + "learning_rate": 1.8172665472310336e-07, + "loss": 0.9696, + "step": 32473 + }, + { + "epoch": 2.767749083780789, + "grad_norm": 58.860784678219474, + "learning_rate": 1.8159421215333595e-07, + "loss": 1.5543, + "step": 32474 + }, + { + "epoch": 2.7678343134748147, + "grad_norm": 60.405820824908965, + "learning_rate": 1.8146181697067555e-07, + "loss": 1.0308, + "step": 32475 + }, + { + "epoch": 2.76791954316884, + "grad_norm": 65.11708923780294, + "learning_rate": 1.8132946917642225e-07, + "loss": 1.9257, + "step": 32476 + }, + { + "epoch": 2.7680047728628656, + "grad_norm": 39.78796569218236, + "learning_rate": 1.8119716877187942e-07, + "loss": 0.9509, + "step": 32477 + }, + { + "epoch": 2.7680900025568906, + "grad_norm": 31.425154411019285, + "learning_rate": 1.8106491575834772e-07, + "loss": 0.7387, + "step": 32478 + }, + { + "epoch": 2.768175232250916, + "grad_norm": 24.880997549469644, + "learning_rate": 1.8093271013712776e-07, + "loss": 0.6971, + "step": 32479 + }, + { + "epoch": 2.7682604619449416, + "grad_norm": 51.533972088109195, + "learning_rate": 1.8080055190951962e-07, + "loss": 1.4088, + "step": 32480 + }, + { + "epoch": 2.768345691638967, + "grad_norm": 61.03749284615673, + "learning_rate": 1.8066844107682335e-07, + "loss": 1.5003, + "step": 32481 + }, + { + "epoch": 2.7684309213329925, + "grad_norm": 37.7521758412504, + "learning_rate": 1.8053637764033794e-07, + "loss": 1.3084, + "step": 32482 + }, + { + "epoch": 2.7685161510270175, + "grad_norm": 51.95731143185099, + "learning_rate": 1.804043616013623e-07, + "loss": 1.5538, + "step": 32483 + }, + { + "epoch": 2.7686013807210434, + "grad_norm": 35.22765633056103, + "learning_rate": 1.802723929611938e-07, + "loss": 1.1151, + "step": 32484 + }, + { + "epoch": 2.7686866104150685, + "grad_norm": 46.81404030559976, + "learning_rate": 1.8014047172113136e-07, + "loss": 1.9081, + "step": 32485 + }, + { + "epoch": 2.768771840109094, + "grad_norm": 36.744816271959884, + "learning_rate": 1.8000859788247228e-07, + "loss": 0.7806, + "step": 32486 + }, + { + "epoch": 2.7688570698031194, + "grad_norm": 74.2564955858317, + "learning_rate": 1.7987677144651382e-07, + "loss": 1.5725, + "step": 32487 + }, + { + "epoch": 2.768942299497145, + "grad_norm": 36.289186659516304, + "learning_rate": 1.7974499241455224e-07, + "loss": 0.9248, + "step": 32488 + }, + { + "epoch": 2.7690275291911703, + "grad_norm": 86.09107941723194, + "learning_rate": 1.79613260787882e-07, + "loss": 1.7258, + "step": 32489 + }, + { + "epoch": 2.7691127588851954, + "grad_norm": 34.69415076888775, + "learning_rate": 1.7948157656780096e-07, + "loss": 0.7544, + "step": 32490 + }, + { + "epoch": 2.769197988579221, + "grad_norm": 60.051628866047004, + "learning_rate": 1.7934993975560312e-07, + "loss": 1.3296, + "step": 32491 + }, + { + "epoch": 2.7692832182732463, + "grad_norm": 29.042885720682452, + "learning_rate": 1.792183503525824e-07, + "loss": 0.9628, + "step": 32492 + }, + { + "epoch": 2.7693684479672718, + "grad_norm": 55.448168520359445, + "learning_rate": 1.7908680836003388e-07, + "loss": 1.7689, + "step": 32493 + }, + { + "epoch": 2.7694536776612972, + "grad_norm": 84.86246503988774, + "learning_rate": 1.7895531377925102e-07, + "loss": 1.0114, + "step": 32494 + }, + { + "epoch": 2.7695389073553227, + "grad_norm": 34.97353006821601, + "learning_rate": 1.7882386661152717e-07, + "loss": 1.1046, + "step": 32495 + }, + { + "epoch": 2.769624137049348, + "grad_norm": 43.552381013128894, + "learning_rate": 1.7869246685815522e-07, + "loss": 1.1854, + "step": 32496 + }, + { + "epoch": 2.769709366743373, + "grad_norm": 51.734805618483385, + "learning_rate": 1.7856111452042636e-07, + "loss": 1.1476, + "step": 32497 + }, + { + "epoch": 2.7697945964373987, + "grad_norm": 41.54126127720078, + "learning_rate": 1.7842980959963285e-07, + "loss": 1.0209, + "step": 32498 + }, + { + "epoch": 2.769879826131424, + "grad_norm": 22.631233911798628, + "learning_rate": 1.7829855209706703e-07, + "loss": 0.5533, + "step": 32499 + }, + { + "epoch": 2.7699650558254496, + "grad_norm": 36.89432659959593, + "learning_rate": 1.7816734201401843e-07, + "loss": 1.4716, + "step": 32500 + }, + { + "epoch": 2.770050285519475, + "grad_norm": 72.66721546494267, + "learning_rate": 1.780361793517782e-07, + "loss": 1.4636, + "step": 32501 + }, + { + "epoch": 2.7701355152135005, + "grad_norm": 35.11297481643788, + "learning_rate": 1.779050641116359e-07, + "loss": 1.044, + "step": 32502 + }, + { + "epoch": 2.770220744907526, + "grad_norm": 63.50290093476003, + "learning_rate": 1.7777399629488158e-07, + "loss": 1.421, + "step": 32503 + }, + { + "epoch": 2.770305974601551, + "grad_norm": 27.533712919882106, + "learning_rate": 1.7764297590280365e-07, + "loss": 0.8748, + "step": 32504 + }, + { + "epoch": 2.7703912042955765, + "grad_norm": 59.505063187050915, + "learning_rate": 1.7751200293669114e-07, + "loss": 1.0922, + "step": 32505 + }, + { + "epoch": 2.770476433989602, + "grad_norm": 38.76617330740256, + "learning_rate": 1.773810773978313e-07, + "loss": 1.1636, + "step": 32506 + }, + { + "epoch": 2.7705616636836274, + "grad_norm": 27.716181835512394, + "learning_rate": 1.7725019928751253e-07, + "loss": 0.8382, + "step": 32507 + }, + { + "epoch": 2.770646893377653, + "grad_norm": 30.19612081274235, + "learning_rate": 1.7711936860702105e-07, + "loss": 0.5966, + "step": 32508 + }, + { + "epoch": 2.770732123071678, + "grad_norm": 45.74117006650509, + "learning_rate": 1.769885853576453e-07, + "loss": 1.6579, + "step": 32509 + }, + { + "epoch": 2.770817352765704, + "grad_norm": 76.17784997439833, + "learning_rate": 1.768578495406692e-07, + "loss": 1.3052, + "step": 32510 + }, + { + "epoch": 2.770902582459729, + "grad_norm": 45.59955744997045, + "learning_rate": 1.767271611573801e-07, + "loss": 1.2494, + "step": 32511 + }, + { + "epoch": 2.7709878121537543, + "grad_norm": 44.53425405328368, + "learning_rate": 1.7659652020906304e-07, + "loss": 1.0734, + "step": 32512 + }, + { + "epoch": 2.77107304184778, + "grad_norm": 28.36927679726814, + "learning_rate": 1.7646592669700256e-07, + "loss": 1.025, + "step": 32513 + }, + { + "epoch": 2.7711582715418053, + "grad_norm": 62.319663746727215, + "learning_rate": 1.7633538062248324e-07, + "loss": 1.3957, + "step": 32514 + }, + { + "epoch": 2.7712435012358307, + "grad_norm": 20.857627390735004, + "learning_rate": 1.7620488198678788e-07, + "loss": 0.5143, + "step": 32515 + }, + { + "epoch": 2.7713287309298558, + "grad_norm": 44.812171311588706, + "learning_rate": 1.76074430791201e-07, + "loss": 1.0688, + "step": 32516 + }, + { + "epoch": 2.7714139606238812, + "grad_norm": 57.6542619973495, + "learning_rate": 1.7594402703700552e-07, + "loss": 2.0355, + "step": 32517 + }, + { + "epoch": 2.7714991903179067, + "grad_norm": 48.90256445530424, + "learning_rate": 1.7581367072548373e-07, + "loss": 1.1073, + "step": 32518 + }, + { + "epoch": 2.771584420011932, + "grad_norm": 99.42455394332218, + "learning_rate": 1.756833618579168e-07, + "loss": 1.3627, + "step": 32519 + }, + { + "epoch": 2.7716696497059576, + "grad_norm": 17.002585745511393, + "learning_rate": 1.755531004355876e-07, + "loss": 0.4928, + "step": 32520 + }, + { + "epoch": 2.771754879399983, + "grad_norm": 73.59562680007726, + "learning_rate": 1.754228864597768e-07, + "loss": 1.9703, + "step": 32521 + }, + { + "epoch": 2.7718401090940086, + "grad_norm": 63.32598079962508, + "learning_rate": 1.75292719931765e-07, + "loss": 1.27, + "step": 32522 + }, + { + "epoch": 2.7719253387880336, + "grad_norm": 54.48573146562681, + "learning_rate": 1.751626008528312e-07, + "loss": 1.3939, + "step": 32523 + }, + { + "epoch": 2.772010568482059, + "grad_norm": 59.69508882066386, + "learning_rate": 1.7503252922425606e-07, + "loss": 1.2655, + "step": 32524 + }, + { + "epoch": 2.7720957981760845, + "grad_norm": 36.43013851917586, + "learning_rate": 1.7490250504731965e-07, + "loss": 0.7333, + "step": 32525 + }, + { + "epoch": 2.77218102787011, + "grad_norm": 59.063665337757065, + "learning_rate": 1.7477252832329928e-07, + "loss": 1.7155, + "step": 32526 + }, + { + "epoch": 2.7722662575641355, + "grad_norm": 65.7249942857977, + "learning_rate": 1.7464259905347392e-07, + "loss": 1.2459, + "step": 32527 + }, + { + "epoch": 2.7723514872581605, + "grad_norm": 66.23958437549736, + "learning_rate": 1.745127172391209e-07, + "loss": 1.9244, + "step": 32528 + }, + { + "epoch": 2.7724367169521864, + "grad_norm": 67.62251465093146, + "learning_rate": 1.7438288288151804e-07, + "loss": 1.5695, + "step": 32529 + }, + { + "epoch": 2.7725219466462114, + "grad_norm": 57.66870906741994, + "learning_rate": 1.7425309598194218e-07, + "loss": 2.0233, + "step": 32530 + }, + { + "epoch": 2.772607176340237, + "grad_norm": 52.13533696416697, + "learning_rate": 1.7412335654166944e-07, + "loss": 1.5002, + "step": 32531 + }, + { + "epoch": 2.7726924060342624, + "grad_norm": 63.842773429782554, + "learning_rate": 1.7399366456197498e-07, + "loss": 0.8921, + "step": 32532 + }, + { + "epoch": 2.772777635728288, + "grad_norm": 29.95732521367521, + "learning_rate": 1.738640200441366e-07, + "loss": 1.5585, + "step": 32533 + }, + { + "epoch": 2.7728628654223133, + "grad_norm": 60.56096495773538, + "learning_rate": 1.7373442298942721e-07, + "loss": 1.4948, + "step": 32534 + }, + { + "epoch": 2.7729480951163383, + "grad_norm": 43.84572238946354, + "learning_rate": 1.7360487339912247e-07, + "loss": 1.1423, + "step": 32535 + }, + { + "epoch": 2.773033324810364, + "grad_norm": 70.23702472280586, + "learning_rate": 1.7347537127449522e-07, + "loss": 1.6565, + "step": 32536 + }, + { + "epoch": 2.7731185545043893, + "grad_norm": 29.75720872678906, + "learning_rate": 1.7334591661682e-07, + "loss": 0.6783, + "step": 32537 + }, + { + "epoch": 2.7732037841984147, + "grad_norm": 44.73787124842417, + "learning_rate": 1.7321650942737024e-07, + "loss": 1.1396, + "step": 32538 + }, + { + "epoch": 2.77328901389244, + "grad_norm": 52.76807026171607, + "learning_rate": 1.730871497074177e-07, + "loss": 0.9615, + "step": 32539 + }, + { + "epoch": 2.7733742435864657, + "grad_norm": 77.21088767631582, + "learning_rate": 1.7295783745823524e-07, + "loss": 1.6703, + "step": 32540 + }, + { + "epoch": 2.773459473280491, + "grad_norm": 40.610519155861994, + "learning_rate": 1.7282857268109353e-07, + "loss": 1.031, + "step": 32541 + }, + { + "epoch": 2.773544702974516, + "grad_norm": 51.793836614256065, + "learning_rate": 1.72699355377266e-07, + "loss": 1.0933, + "step": 32542 + }, + { + "epoch": 2.7736299326685416, + "grad_norm": 32.78477155011798, + "learning_rate": 1.7257018554802163e-07, + "loss": 1.2803, + "step": 32543 + }, + { + "epoch": 2.773715162362567, + "grad_norm": 38.96831964591627, + "learning_rate": 1.7244106319463162e-07, + "loss": 1.1215, + "step": 32544 + }, + { + "epoch": 2.7738003920565926, + "grad_norm": 41.92751230441757, + "learning_rate": 1.7231198831836437e-07, + "loss": 1.0241, + "step": 32545 + }, + { + "epoch": 2.773885621750618, + "grad_norm": 57.55463546996448, + "learning_rate": 1.7218296092049113e-07, + "loss": 1.4112, + "step": 32546 + }, + { + "epoch": 2.773970851444643, + "grad_norm": 48.53706569178308, + "learning_rate": 1.7205398100227976e-07, + "loss": 1.4793, + "step": 32547 + }, + { + "epoch": 2.774056081138669, + "grad_norm": 90.36026333542156, + "learning_rate": 1.7192504856499926e-07, + "loss": 2.3892, + "step": 32548 + }, + { + "epoch": 2.774141310832694, + "grad_norm": 53.28869965048588, + "learning_rate": 1.7179616360991748e-07, + "loss": 1.5676, + "step": 32549 + }, + { + "epoch": 2.7742265405267195, + "grad_norm": 64.91840567422786, + "learning_rate": 1.7166732613830173e-07, + "loss": 1.548, + "step": 32550 + }, + { + "epoch": 2.774311770220745, + "grad_norm": 65.89923335908875, + "learning_rate": 1.7153853615141936e-07, + "loss": 1.7239, + "step": 32551 + }, + { + "epoch": 2.7743969999147704, + "grad_norm": 60.48780938987223, + "learning_rate": 1.7140979365053712e-07, + "loss": 1.5811, + "step": 32552 + }, + { + "epoch": 2.774482229608796, + "grad_norm": 48.283610925510196, + "learning_rate": 1.712810986369201e-07, + "loss": 1.3343, + "step": 32553 + }, + { + "epoch": 2.774567459302821, + "grad_norm": 40.745849733659384, + "learning_rate": 1.7115245111183566e-07, + "loss": 1.052, + "step": 32554 + }, + { + "epoch": 2.7746526889968464, + "grad_norm": 67.57412759200503, + "learning_rate": 1.710238510765472e-07, + "loss": 1.1268, + "step": 32555 + }, + { + "epoch": 2.774737918690872, + "grad_norm": 71.34863808725144, + "learning_rate": 1.708952985323209e-07, + "loss": 1.574, + "step": 32556 + }, + { + "epoch": 2.7748231483848973, + "grad_norm": 76.42606648627405, + "learning_rate": 1.7076679348042025e-07, + "loss": 2.0091, + "step": 32557 + }, + { + "epoch": 2.7749083780789228, + "grad_norm": 29.415296564536074, + "learning_rate": 1.706383359221092e-07, + "loss": 0.71, + "step": 32558 + }, + { + "epoch": 2.7749936077729482, + "grad_norm": 37.38163033141656, + "learning_rate": 1.7050992585865122e-07, + "loss": 1.2692, + "step": 32559 + }, + { + "epoch": 2.7750788374669737, + "grad_norm": 25.36956247142767, + "learning_rate": 1.7038156329130916e-07, + "loss": 0.6891, + "step": 32560 + }, + { + "epoch": 2.7751640671609987, + "grad_norm": 37.879443506054514, + "learning_rate": 1.7025324822134536e-07, + "loss": 0.9775, + "step": 32561 + }, + { + "epoch": 2.775249296855024, + "grad_norm": 29.211975263358994, + "learning_rate": 1.7012498065002158e-07, + "loss": 0.6895, + "step": 32562 + }, + { + "epoch": 2.7753345265490497, + "grad_norm": 24.32060312297115, + "learning_rate": 1.6999676057859903e-07, + "loss": 0.6212, + "step": 32563 + }, + { + "epoch": 2.775419756243075, + "grad_norm": 40.692108832735826, + "learning_rate": 1.698685880083395e-07, + "loss": 0.9131, + "step": 32564 + }, + { + "epoch": 2.7755049859371006, + "grad_norm": 59.00715244225067, + "learning_rate": 1.6974046294050305e-07, + "loss": 1.5564, + "step": 32565 + }, + { + "epoch": 2.7755902156311256, + "grad_norm": 37.68587275425248, + "learning_rate": 1.6961238537634928e-07, + "loss": 1.0707, + "step": 32566 + }, + { + "epoch": 2.7756754453251515, + "grad_norm": 58.866289259953, + "learning_rate": 1.6948435531713935e-07, + "loss": 1.761, + "step": 32567 + }, + { + "epoch": 2.7757606750191766, + "grad_norm": 26.102013485211838, + "learning_rate": 1.6935637276413063e-07, + "loss": 0.9081, + "step": 32568 + }, + { + "epoch": 2.775845904713202, + "grad_norm": 35.767167600931785, + "learning_rate": 1.6922843771858266e-07, + "loss": 0.9528, + "step": 32569 + }, + { + "epoch": 2.7759311344072275, + "grad_norm": 57.096038599673626, + "learning_rate": 1.6910055018175332e-07, + "loss": 1.0918, + "step": 32570 + }, + { + "epoch": 2.776016364101253, + "grad_norm": 76.60653328935965, + "learning_rate": 1.689727101548999e-07, + "loss": 1.6749, + "step": 32571 + }, + { + "epoch": 2.7761015937952784, + "grad_norm": 88.26834895857812, + "learning_rate": 1.6884491763928146e-07, + "loss": 2.0963, + "step": 32572 + }, + { + "epoch": 2.7761868234893035, + "grad_norm": 24.554906989000486, + "learning_rate": 1.6871717263615306e-07, + "loss": 0.6517, + "step": 32573 + }, + { + "epoch": 2.776272053183329, + "grad_norm": 25.478316296397036, + "learning_rate": 1.685894751467715e-07, + "loss": 0.7559, + "step": 32574 + }, + { + "epoch": 2.7763572828773544, + "grad_norm": 31.274982564360485, + "learning_rate": 1.6846182517239184e-07, + "loss": 0.6624, + "step": 32575 + }, + { + "epoch": 2.77644251257138, + "grad_norm": 45.31238795511677, + "learning_rate": 1.6833422271427148e-07, + "loss": 1.3523, + "step": 32576 + }, + { + "epoch": 2.7765277422654053, + "grad_norm": 78.20828776157447, + "learning_rate": 1.682066677736638e-07, + "loss": 1.1587, + "step": 32577 + }, + { + "epoch": 2.776612971959431, + "grad_norm": 85.86586095380729, + "learning_rate": 1.6807916035182281e-07, + "loss": 2.4919, + "step": 32578 + }, + { + "epoch": 2.7766982016534563, + "grad_norm": 30.950725546370116, + "learning_rate": 1.6795170045000308e-07, + "loss": 1.0479, + "step": 32579 + }, + { + "epoch": 2.7767834313474813, + "grad_norm": 38.128176909316906, + "learning_rate": 1.678242880694597e-07, + "loss": 1.2792, + "step": 32580 + }, + { + "epoch": 2.7768686610415068, + "grad_norm": 27.507008439217895, + "learning_rate": 1.6769692321144339e-07, + "loss": 0.9903, + "step": 32581 + }, + { + "epoch": 2.7769538907355322, + "grad_norm": 70.27560497698718, + "learning_rate": 1.675696058772086e-07, + "loss": 1.87, + "step": 32582 + }, + { + "epoch": 2.7770391204295577, + "grad_norm": 39.80657671175216, + "learning_rate": 1.6744233606800497e-07, + "loss": 1.3741, + "step": 32583 + }, + { + "epoch": 2.777124350123583, + "grad_norm": 82.5719988408399, + "learning_rate": 1.6731511378508702e-07, + "loss": 2.5982, + "step": 32584 + }, + { + "epoch": 2.777209579817608, + "grad_norm": 65.24834097146118, + "learning_rate": 1.6718793902970433e-07, + "loss": 1.9318, + "step": 32585 + }, + { + "epoch": 2.777294809511634, + "grad_norm": 33.435320053940906, + "learning_rate": 1.670608118031075e-07, + "loss": 0.9598, + "step": 32586 + }, + { + "epoch": 2.777380039205659, + "grad_norm": 67.1849925875008, + "learning_rate": 1.6693373210654728e-07, + "loss": 1.5432, + "step": 32587 + }, + { + "epoch": 2.7774652688996846, + "grad_norm": 93.23188737223396, + "learning_rate": 1.6680669994127318e-07, + "loss": 1.9222, + "step": 32588 + }, + { + "epoch": 2.77755049859371, + "grad_norm": 73.50746107617775, + "learning_rate": 1.6667971530853533e-07, + "loss": 1.7286, + "step": 32589 + }, + { + "epoch": 2.7776357282877355, + "grad_norm": 30.706736978429202, + "learning_rate": 1.665527782095816e-07, + "loss": 0.7746, + "step": 32590 + }, + { + "epoch": 2.777720957981761, + "grad_norm": 63.35394086102564, + "learning_rate": 1.6642588864566045e-07, + "loss": 1.5064, + "step": 32591 + }, + { + "epoch": 2.777806187675786, + "grad_norm": 49.40587597279369, + "learning_rate": 1.6629904661801976e-07, + "loss": 1.1323, + "step": 32592 + }, + { + "epoch": 2.7778914173698115, + "grad_norm": 51.59208827891936, + "learning_rate": 1.6617225212790799e-07, + "loss": 1.5557, + "step": 32593 + }, + { + "epoch": 2.777976647063837, + "grad_norm": 29.504578322917766, + "learning_rate": 1.660455051765708e-07, + "loss": 1.2168, + "step": 32594 + }, + { + "epoch": 2.7780618767578624, + "grad_norm": 49.21642687376267, + "learning_rate": 1.6591880576525554e-07, + "loss": 1.4337, + "step": 32595 + }, + { + "epoch": 2.778147106451888, + "grad_norm": 64.30424360884442, + "learning_rate": 1.6579215389520732e-07, + "loss": 1.95, + "step": 32596 + }, + { + "epoch": 2.7782323361459134, + "grad_norm": 61.178620699628794, + "learning_rate": 1.6566554956767288e-07, + "loss": 1.9325, + "step": 32597 + }, + { + "epoch": 2.778317565839939, + "grad_norm": 36.891193190721914, + "learning_rate": 1.6553899278389684e-07, + "loss": 0.8662, + "step": 32598 + }, + { + "epoch": 2.778402795533964, + "grad_norm": 28.977339487968777, + "learning_rate": 1.6541248354512428e-07, + "loss": 1.0806, + "step": 32599 + }, + { + "epoch": 2.7784880252279893, + "grad_norm": 50.65351114409147, + "learning_rate": 1.6528602185259812e-07, + "loss": 1.6934, + "step": 32600 + }, + { + "epoch": 2.778573254922015, + "grad_norm": 19.99005532370981, + "learning_rate": 1.651596077075629e-07, + "loss": 0.701, + "step": 32601 + }, + { + "epoch": 2.7786584846160403, + "grad_norm": 63.6153877497906, + "learning_rate": 1.6503324111126095e-07, + "loss": 1.6925, + "step": 32602 + }, + { + "epoch": 2.7787437143100657, + "grad_norm": 46.11111988148652, + "learning_rate": 1.6490692206493687e-07, + "loss": 1.4425, + "step": 32603 + }, + { + "epoch": 2.7788289440040908, + "grad_norm": 35.54803902455211, + "learning_rate": 1.647806505698324e-07, + "loss": 0.94, + "step": 32604 + }, + { + "epoch": 2.7789141736981167, + "grad_norm": 59.895938063126536, + "learning_rate": 1.6465442662718766e-07, + "loss": 1.2089, + "step": 32605 + }, + { + "epoch": 2.7789994033921417, + "grad_norm": 60.42333596663211, + "learning_rate": 1.6452825023824616e-07, + "loss": 1.3073, + "step": 32606 + }, + { + "epoch": 2.779084633086167, + "grad_norm": 31.966857073964533, + "learning_rate": 1.6440212140424794e-07, + "loss": 1.0599, + "step": 32607 + }, + { + "epoch": 2.7791698627801926, + "grad_norm": 57.53344796814128, + "learning_rate": 1.6427604012643316e-07, + "loss": 1.1564, + "step": 32608 + }, + { + "epoch": 2.779255092474218, + "grad_norm": 64.82838735496121, + "learning_rate": 1.6415000640604138e-07, + "loss": 1.5349, + "step": 32609 + }, + { + "epoch": 2.7793403221682436, + "grad_norm": 47.40397297424176, + "learning_rate": 1.6402402024431274e-07, + "loss": 1.3212, + "step": 32610 + }, + { + "epoch": 2.7794255518622686, + "grad_norm": 34.35930238762877, + "learning_rate": 1.6389808164248678e-07, + "loss": 1.053, + "step": 32611 + }, + { + "epoch": 2.779510781556294, + "grad_norm": 37.22265370419941, + "learning_rate": 1.6377219060180195e-07, + "loss": 0.8266, + "step": 32612 + }, + { + "epoch": 2.7795960112503195, + "grad_norm": 51.354146611686396, + "learning_rate": 1.6364634712349557e-07, + "loss": 1.4879, + "step": 32613 + }, + { + "epoch": 2.779681240944345, + "grad_norm": 81.67611259217465, + "learning_rate": 1.6352055120880507e-07, + "loss": 2.1486, + "step": 32614 + }, + { + "epoch": 2.7797664706383705, + "grad_norm": 27.74307163433309, + "learning_rate": 1.6339480285896826e-07, + "loss": 0.8141, + "step": 32615 + }, + { + "epoch": 2.779851700332396, + "grad_norm": 35.833589752918726, + "learning_rate": 1.6326910207522196e-07, + "loss": 0.9753, + "step": 32616 + }, + { + "epoch": 2.7799369300264214, + "grad_norm": 38.997983641922346, + "learning_rate": 1.6314344885880184e-07, + "loss": 1.049, + "step": 32617 + }, + { + "epoch": 2.7800221597204464, + "grad_norm": 28.718449809772864, + "learning_rate": 1.630178432109436e-07, + "loss": 0.8184, + "step": 32618 + }, + { + "epoch": 2.780107389414472, + "grad_norm": 70.98710423388866, + "learning_rate": 1.6289228513288346e-07, + "loss": 1.0897, + "step": 32619 + }, + { + "epoch": 2.7801926191084974, + "grad_norm": 34.700611371411064, + "learning_rate": 1.62766774625856e-07, + "loss": 0.9548, + "step": 32620 + }, + { + "epoch": 2.780277848802523, + "grad_norm": 70.40097238719711, + "learning_rate": 1.6264131169109466e-07, + "loss": 2.1961, + "step": 32621 + }, + { + "epoch": 2.7803630784965483, + "grad_norm": 51.68262954270843, + "learning_rate": 1.6251589632983346e-07, + "loss": 1.5039, + "step": 32622 + }, + { + "epoch": 2.7804483081905738, + "grad_norm": 50.435471801882855, + "learning_rate": 1.6239052854330694e-07, + "loss": 1.6263, + "step": 32623 + }, + { + "epoch": 2.7805335378845992, + "grad_norm": 19.086359903535694, + "learning_rate": 1.6226520833274696e-07, + "loss": 0.7665, + "step": 32624 + }, + { + "epoch": 2.7806187675786243, + "grad_norm": 52.79272621840856, + "learning_rate": 1.6213993569938634e-07, + "loss": 1.7371, + "step": 32625 + }, + { + "epoch": 2.7807039972726497, + "grad_norm": 48.688010512342025, + "learning_rate": 1.620147106444564e-07, + "loss": 1.3501, + "step": 32626 + }, + { + "epoch": 2.780789226966675, + "grad_norm": 41.30500201360605, + "learning_rate": 1.6188953316919e-07, + "loss": 1.3812, + "step": 32627 + }, + { + "epoch": 2.7808744566607007, + "grad_norm": 32.48537912589252, + "learning_rate": 1.617644032748178e-07, + "loss": 0.8913, + "step": 32628 + }, + { + "epoch": 2.780959686354726, + "grad_norm": 31.673046971764773, + "learning_rate": 1.6163932096257052e-07, + "loss": 1.2512, + "step": 32629 + }, + { + "epoch": 2.781044916048751, + "grad_norm": 55.779426396702696, + "learning_rate": 1.6151428623367715e-07, + "loss": 1.3849, + "step": 32630 + }, + { + "epoch": 2.781130145742777, + "grad_norm": 29.729559942078723, + "learning_rate": 1.613892990893684e-07, + "loss": 0.7282, + "step": 32631 + }, + { + "epoch": 2.781215375436802, + "grad_norm": 61.089406968360684, + "learning_rate": 1.6126435953087328e-07, + "loss": 1.4641, + "step": 32632 + }, + { + "epoch": 2.7813006051308276, + "grad_norm": 25.369988100525255, + "learning_rate": 1.6113946755941968e-07, + "loss": 0.4684, + "step": 32633 + }, + { + "epoch": 2.781385834824853, + "grad_norm": 47.84182060211298, + "learning_rate": 1.6101462317623772e-07, + "loss": 1.4055, + "step": 32634 + }, + { + "epoch": 2.7814710645188785, + "grad_norm": 44.965185709584254, + "learning_rate": 1.6088982638255312e-07, + "loss": 1.1902, + "step": 32635 + }, + { + "epoch": 2.781556294212904, + "grad_norm": 40.18924222774304, + "learning_rate": 1.6076507717959543e-07, + "loss": 1.3356, + "step": 32636 + }, + { + "epoch": 2.781641523906929, + "grad_norm": 28.085869297289136, + "learning_rate": 1.606403755685898e-07, + "loss": 0.8227, + "step": 32637 + }, + { + "epoch": 2.7817267536009544, + "grad_norm": 40.78875970299404, + "learning_rate": 1.60515721550763e-07, + "loss": 1.1414, + "step": 32638 + }, + { + "epoch": 2.78181198329498, + "grad_norm": 76.13164159542039, + "learning_rate": 1.6039111512734073e-07, + "loss": 1.5257, + "step": 32639 + }, + { + "epoch": 2.7818972129890054, + "grad_norm": 66.87551481503084, + "learning_rate": 1.6026655629954922e-07, + "loss": 1.5647, + "step": 32640 + }, + { + "epoch": 2.781982442683031, + "grad_norm": 42.97280043205349, + "learning_rate": 1.6014204506861254e-07, + "loss": 1.1777, + "step": 32641 + }, + { + "epoch": 2.7820676723770563, + "grad_norm": 20.943492908703714, + "learning_rate": 1.600175814357563e-07, + "loss": 0.6635, + "step": 32642 + }, + { + "epoch": 2.782152902071082, + "grad_norm": 73.86198414019016, + "learning_rate": 1.5989316540220346e-07, + "loss": 1.1939, + "step": 32643 + }, + { + "epoch": 2.782238131765107, + "grad_norm": 23.836278860996426, + "learning_rate": 1.5976879696917802e-07, + "loss": 0.7671, + "step": 32644 + }, + { + "epoch": 2.7823233614591323, + "grad_norm": 55.42790798917162, + "learning_rate": 1.5964447613790345e-07, + "loss": 1.1912, + "step": 32645 + }, + { + "epoch": 2.7824085911531578, + "grad_norm": 46.190266612762066, + "learning_rate": 1.5952020290960214e-07, + "loss": 1.5034, + "step": 32646 + }, + { + "epoch": 2.782493820847183, + "grad_norm": 31.741459944536775, + "learning_rate": 1.5939597728549638e-07, + "loss": 1.7696, + "step": 32647 + }, + { + "epoch": 2.7825790505412087, + "grad_norm": 45.19971484159298, + "learning_rate": 1.5927179926680692e-07, + "loss": 1.1424, + "step": 32648 + }, + { + "epoch": 2.7826642802352337, + "grad_norm": 47.33985786979613, + "learning_rate": 1.5914766885475606e-07, + "loss": 1.3981, + "step": 32649 + }, + { + "epoch": 2.7827495099292596, + "grad_norm": 77.74574911872982, + "learning_rate": 1.5902358605056455e-07, + "loss": 1.4473, + "step": 32650 + }, + { + "epoch": 2.7828347396232846, + "grad_norm": 24.50111701471504, + "learning_rate": 1.588995508554525e-07, + "loss": 1.311, + "step": 32651 + }, + { + "epoch": 2.78291996931731, + "grad_norm": 22.802925173637096, + "learning_rate": 1.587755632706395e-07, + "loss": 1.0657, + "step": 32652 + }, + { + "epoch": 2.7830051990113356, + "grad_norm": 58.426380818477796, + "learning_rate": 1.5865162329734506e-07, + "loss": 1.1732, + "step": 32653 + }, + { + "epoch": 2.783090428705361, + "grad_norm": 65.48881121979397, + "learning_rate": 1.5852773093678885e-07, + "loss": 1.519, + "step": 32654 + }, + { + "epoch": 2.7831756583993865, + "grad_norm": 52.684620035178526, + "learning_rate": 1.584038861901882e-07, + "loss": 1.5377, + "step": 32655 + }, + { + "epoch": 2.7832608880934115, + "grad_norm": 61.45880127890999, + "learning_rate": 1.5828008905876102e-07, + "loss": 1.9981, + "step": 32656 + }, + { + "epoch": 2.783346117787437, + "grad_norm": 68.26332115523059, + "learning_rate": 1.5815633954372523e-07, + "loss": 1.4675, + "step": 32657 + }, + { + "epoch": 2.7834313474814625, + "grad_norm": 51.72474492196898, + "learning_rate": 1.5803263764629816e-07, + "loss": 1.5805, + "step": 32658 + }, + { + "epoch": 2.783516577175488, + "grad_norm": 65.95050851184065, + "learning_rate": 1.5790898336769666e-07, + "loss": 1.6321, + "step": 32659 + }, + { + "epoch": 2.7836018068695134, + "grad_norm": 74.82334389091689, + "learning_rate": 1.5778537670913584e-07, + "loss": 1.6994, + "step": 32660 + }, + { + "epoch": 2.783687036563539, + "grad_norm": 37.16395683517569, + "learning_rate": 1.576618176718314e-07, + "loss": 1.0769, + "step": 32661 + }, + { + "epoch": 2.7837722662575644, + "grad_norm": 65.73577927129979, + "learning_rate": 1.5753830625699905e-07, + "loss": 1.4306, + "step": 32662 + }, + { + "epoch": 2.7838574959515894, + "grad_norm": 67.3542537329063, + "learning_rate": 1.5741484246585337e-07, + "loss": 1.2211, + "step": 32663 + }, + { + "epoch": 2.783942725645615, + "grad_norm": 60.000511258541806, + "learning_rate": 1.5729142629960782e-07, + "loss": 1.6482, + "step": 32664 + }, + { + "epoch": 2.7840279553396403, + "grad_norm": 44.283950924478006, + "learning_rate": 1.5716805775947697e-07, + "loss": 0.8879, + "step": 32665 + }, + { + "epoch": 2.784113185033666, + "grad_norm": 118.44666313805001, + "learning_rate": 1.5704473684667487e-07, + "loss": 3.0949, + "step": 32666 + }, + { + "epoch": 2.7841984147276913, + "grad_norm": 42.78487699391615, + "learning_rate": 1.5692146356241277e-07, + "loss": 1.3463, + "step": 32667 + }, + { + "epoch": 2.7842836444217163, + "grad_norm": 32.95135854880605, + "learning_rate": 1.567982379079036e-07, + "loss": 0.9797, + "step": 32668 + }, + { + "epoch": 2.784368874115742, + "grad_norm": 33.4015233337561, + "learning_rate": 1.5667505988435917e-07, + "loss": 1.1058, + "step": 32669 + }, + { + "epoch": 2.784454103809767, + "grad_norm": 47.89326987082792, + "learning_rate": 1.5655192949299125e-07, + "loss": 1.4644, + "step": 32670 + }, + { + "epoch": 2.7845393335037927, + "grad_norm": 69.90613781192893, + "learning_rate": 1.5642884673501056e-07, + "loss": 1.1036, + "step": 32671 + }, + { + "epoch": 2.784624563197818, + "grad_norm": 40.901640697015324, + "learning_rate": 1.5630581161162671e-07, + "loss": 1.2325, + "step": 32672 + }, + { + "epoch": 2.7847097928918436, + "grad_norm": 32.31796880080219, + "learning_rate": 1.561828241240515e-07, + "loss": 1.1753, + "step": 32673 + }, + { + "epoch": 2.784795022585869, + "grad_norm": 31.60959857143002, + "learning_rate": 1.5605988427349284e-07, + "loss": 0.5723, + "step": 32674 + }, + { + "epoch": 2.784880252279894, + "grad_norm": 48.790923076573954, + "learning_rate": 1.5593699206116086e-07, + "loss": 1.1576, + "step": 32675 + }, + { + "epoch": 2.7849654819739196, + "grad_norm": 46.62197627874736, + "learning_rate": 1.558141474882635e-07, + "loss": 1.7803, + "step": 32676 + }, + { + "epoch": 2.785050711667945, + "grad_norm": 38.12697735288808, + "learning_rate": 1.5569135055600925e-07, + "loss": 1.2712, + "step": 32677 + }, + { + "epoch": 2.7851359413619705, + "grad_norm": 42.12476858544892, + "learning_rate": 1.5556860126560546e-07, + "loss": 1.2502, + "step": 32678 + }, + { + "epoch": 2.785221171055996, + "grad_norm": 80.3428521708318, + "learning_rate": 1.5544589961825952e-07, + "loss": 1.7659, + "step": 32679 + }, + { + "epoch": 2.7853064007500214, + "grad_norm": 88.92121321956886, + "learning_rate": 1.553232456151782e-07, + "loss": 2.1378, + "step": 32680 + }, + { + "epoch": 2.785391630444047, + "grad_norm": 77.9653855533364, + "learning_rate": 1.552006392575678e-07, + "loss": 1.8869, + "step": 32681 + }, + { + "epoch": 2.785476860138072, + "grad_norm": 35.35819373641846, + "learning_rate": 1.5507808054663343e-07, + "loss": 1.4358, + "step": 32682 + }, + { + "epoch": 2.7855620898320974, + "grad_norm": 58.22996937632533, + "learning_rate": 1.5495556948358138e-07, + "loss": 1.1309, + "step": 32683 + }, + { + "epoch": 2.785647319526123, + "grad_norm": 48.902323079749, + "learning_rate": 1.5483310606961622e-07, + "loss": 1.1383, + "step": 32684 + }, + { + "epoch": 2.7857325492201483, + "grad_norm": 69.45647353828042, + "learning_rate": 1.54710690305942e-07, + "loss": 1.4525, + "step": 32685 + }, + { + "epoch": 2.785817778914174, + "grad_norm": 64.85367700377748, + "learning_rate": 1.545883221937633e-07, + "loss": 1.7846, + "step": 32686 + }, + { + "epoch": 2.785903008608199, + "grad_norm": 63.984824469826385, + "learning_rate": 1.5446600173428194e-07, + "loss": 1.3242, + "step": 32687 + }, + { + "epoch": 2.7859882383022248, + "grad_norm": 76.73950633337971, + "learning_rate": 1.5434372892870253e-07, + "loss": 1.7122, + "step": 32688 + }, + { + "epoch": 2.7860734679962498, + "grad_norm": 53.862243722336125, + "learning_rate": 1.5422150377822742e-07, + "loss": 0.9719, + "step": 32689 + }, + { + "epoch": 2.7861586976902752, + "grad_norm": 47.262031462862176, + "learning_rate": 1.5409932628405787e-07, + "loss": 1.347, + "step": 32690 + }, + { + "epoch": 2.7862439273843007, + "grad_norm": 32.1021645393397, + "learning_rate": 1.5397719644739573e-07, + "loss": 0.9481, + "step": 32691 + }, + { + "epoch": 2.786329157078326, + "grad_norm": 95.39309658183845, + "learning_rate": 1.5385511426944278e-07, + "loss": 1.4669, + "step": 32692 + }, + { + "epoch": 2.7864143867723516, + "grad_norm": 39.214989242092926, + "learning_rate": 1.5373307975139918e-07, + "loss": 1.0321, + "step": 32693 + }, + { + "epoch": 2.7864996164663767, + "grad_norm": 44.72794277443534, + "learning_rate": 1.5361109289446507e-07, + "loss": 0.9339, + "step": 32694 + }, + { + "epoch": 2.786584846160402, + "grad_norm": 44.667050101321095, + "learning_rate": 1.534891536998395e-07, + "loss": 1.4951, + "step": 32695 + }, + { + "epoch": 2.7866700758544276, + "grad_norm": 31.382877621961537, + "learning_rate": 1.533672621687221e-07, + "loss": 0.8295, + "step": 32696 + }, + { + "epoch": 2.786755305548453, + "grad_norm": 80.75009075611388, + "learning_rate": 1.5324541830231244e-07, + "loss": 2.2782, + "step": 32697 + }, + { + "epoch": 2.7868405352424785, + "grad_norm": 32.311668236067554, + "learning_rate": 1.5312362210180843e-07, + "loss": 1.1444, + "step": 32698 + }, + { + "epoch": 2.786925764936504, + "grad_norm": 51.017015061893126, + "learning_rate": 1.530018735684069e-07, + "loss": 2.09, + "step": 32699 + }, + { + "epoch": 2.7870109946305295, + "grad_norm": 40.715663631917174, + "learning_rate": 1.5288017270330635e-07, + "loss": 1.4849, + "step": 32700 + }, + { + "epoch": 2.7870962243245545, + "grad_norm": 37.70982473873574, + "learning_rate": 1.5275851950770305e-07, + "loss": 1.5143, + "step": 32701 + }, + { + "epoch": 2.78718145401858, + "grad_norm": 68.39634967110999, + "learning_rate": 1.5263691398279435e-07, + "loss": 1.3353, + "step": 32702 + }, + { + "epoch": 2.7872666837126054, + "grad_norm": 103.1106611423458, + "learning_rate": 1.525153561297743e-07, + "loss": 1.7245, + "step": 32703 + }, + { + "epoch": 2.787351913406631, + "grad_norm": 60.27091598122537, + "learning_rate": 1.5239384594983975e-07, + "loss": 1.5967, + "step": 32704 + }, + { + "epoch": 2.7874371431006564, + "grad_norm": 36.70869218873294, + "learning_rate": 1.522723834441858e-07, + "loss": 1.1371, + "step": 32705 + }, + { + "epoch": 2.7875223727946814, + "grad_norm": 57.87563693672114, + "learning_rate": 1.5215096861400657e-07, + "loss": 1.7045, + "step": 32706 + }, + { + "epoch": 2.7876076024887073, + "grad_norm": 39.68385608199964, + "learning_rate": 1.520296014604966e-07, + "loss": 1.5637, + "step": 32707 + }, + { + "epoch": 2.7876928321827323, + "grad_norm": 19.306512701716876, + "learning_rate": 1.519082819848483e-07, + "loss": 0.2861, + "step": 32708 + }, + { + "epoch": 2.787778061876758, + "grad_norm": 52.79137947244023, + "learning_rate": 1.5178701018825627e-07, + "loss": 2.0513, + "step": 32709 + }, + { + "epoch": 2.7878632915707833, + "grad_norm": 80.50465694037493, + "learning_rate": 1.5166578607191285e-07, + "loss": 1.9587, + "step": 32710 + }, + { + "epoch": 2.7879485212648087, + "grad_norm": 61.770946115890766, + "learning_rate": 1.5154460963700878e-07, + "loss": 1.145, + "step": 32711 + }, + { + "epoch": 2.788033750958834, + "grad_norm": 31.287685972309646, + "learning_rate": 1.5142348088473702e-07, + "loss": 1.1239, + "step": 32712 + }, + { + "epoch": 2.7881189806528592, + "grad_norm": 52.146643679648115, + "learning_rate": 1.5130239981628936e-07, + "loss": 1.9045, + "step": 32713 + }, + { + "epoch": 2.7882042103468847, + "grad_norm": 67.68058704941606, + "learning_rate": 1.5118136643285598e-07, + "loss": 1.8822, + "step": 32714 + }, + { + "epoch": 2.78828944004091, + "grad_norm": 34.69859893735041, + "learning_rate": 1.51060380735627e-07, + "loss": 0.9854, + "step": 32715 + }, + { + "epoch": 2.7883746697349356, + "grad_norm": 46.819388975090064, + "learning_rate": 1.5093944272579264e-07, + "loss": 1.6833, + "step": 32716 + }, + { + "epoch": 2.788459899428961, + "grad_norm": 37.761716157332096, + "learning_rate": 1.508185524045408e-07, + "loss": 1.0911, + "step": 32717 + }, + { + "epoch": 2.7885451291229866, + "grad_norm": 48.84141605422984, + "learning_rate": 1.5069770977306274e-07, + "loss": 1.0893, + "step": 32718 + }, + { + "epoch": 2.788630358817012, + "grad_norm": 68.15809311762196, + "learning_rate": 1.5057691483254534e-07, + "loss": 1.263, + "step": 32719 + }, + { + "epoch": 2.788715588511037, + "grad_norm": 57.465206562430204, + "learning_rate": 1.5045616758417703e-07, + "loss": 1.351, + "step": 32720 + }, + { + "epoch": 2.7888008182050625, + "grad_norm": 76.42078977063512, + "learning_rate": 1.5033546802914522e-07, + "loss": 1.3373, + "step": 32721 + }, + { + "epoch": 2.788886047899088, + "grad_norm": 27.634260529431987, + "learning_rate": 1.5021481616863732e-07, + "loss": 0.6401, + "step": 32722 + }, + { + "epoch": 2.7889712775931135, + "grad_norm": 37.663046409707796, + "learning_rate": 1.500942120038401e-07, + "loss": 1.2627, + "step": 32723 + }, + { + "epoch": 2.789056507287139, + "grad_norm": 54.292352494053674, + "learning_rate": 1.4997365553593824e-07, + "loss": 1.8046, + "step": 32724 + }, + { + "epoch": 2.789141736981164, + "grad_norm": 29.233619002238505, + "learning_rate": 1.498531467661185e-07, + "loss": 0.5634, + "step": 32725 + }, + { + "epoch": 2.78922696667519, + "grad_norm": 59.968256654947, + "learning_rate": 1.497326856955661e-07, + "loss": 1.7562, + "step": 32726 + }, + { + "epoch": 2.789312196369215, + "grad_norm": 23.092089881986357, + "learning_rate": 1.4961227232546448e-07, + "loss": 0.9347, + "step": 32727 + }, + { + "epoch": 2.7893974260632404, + "grad_norm": 73.88102209502823, + "learning_rate": 1.4949190665699997e-07, + "loss": 1.9326, + "step": 32728 + }, + { + "epoch": 2.789482655757266, + "grad_norm": 78.68985416858035, + "learning_rate": 1.4937158869135436e-07, + "loss": 2.4154, + "step": 32729 + }, + { + "epoch": 2.7895678854512913, + "grad_norm": 28.026342836950004, + "learning_rate": 1.492513184297123e-07, + "loss": 1.1534, + "step": 32730 + }, + { + "epoch": 2.7896531151453168, + "grad_norm": 60.41021801596204, + "learning_rate": 1.491310958732556e-07, + "loss": 2.1313, + "step": 32731 + }, + { + "epoch": 2.789738344839342, + "grad_norm": 65.65197322359838, + "learning_rate": 1.4901092102316772e-07, + "loss": 1.4945, + "step": 32732 + }, + { + "epoch": 2.7898235745333673, + "grad_norm": 57.354869688773356, + "learning_rate": 1.4889079388062945e-07, + "loss": 1.0403, + "step": 32733 + }, + { + "epoch": 2.7899088042273927, + "grad_norm": 68.75637727680858, + "learning_rate": 1.48770714446822e-07, + "loss": 1.5076, + "step": 32734 + }, + { + "epoch": 2.789994033921418, + "grad_norm": 47.99298674453463, + "learning_rate": 1.4865068272292725e-07, + "loss": 1.1394, + "step": 32735 + }, + { + "epoch": 2.7900792636154437, + "grad_norm": 28.979290030478815, + "learning_rate": 1.4853069871012538e-07, + "loss": 0.9011, + "step": 32736 + }, + { + "epoch": 2.790164493309469, + "grad_norm": 30.424279287949485, + "learning_rate": 1.4841076240959652e-07, + "loss": 0.7986, + "step": 32737 + }, + { + "epoch": 2.7902497230034946, + "grad_norm": 71.70720218597363, + "learning_rate": 1.4829087382251916e-07, + "loss": 2.1481, + "step": 32738 + }, + { + "epoch": 2.7903349526975196, + "grad_norm": 67.88385615026023, + "learning_rate": 1.4817103295007408e-07, + "loss": 1.8428, + "step": 32739 + }, + { + "epoch": 2.790420182391545, + "grad_norm": 73.45220463441296, + "learning_rate": 1.4805123979343916e-07, + "loss": 1.4761, + "step": 32740 + }, + { + "epoch": 2.7905054120855706, + "grad_norm": 43.95053899089651, + "learning_rate": 1.4793149435379184e-07, + "loss": 1.345, + "step": 32741 + }, + { + "epoch": 2.790590641779596, + "grad_norm": 64.55945266770436, + "learning_rate": 1.4781179663231004e-07, + "loss": 1.8088, + "step": 32742 + }, + { + "epoch": 2.7906758714736215, + "grad_norm": 41.52968376505811, + "learning_rate": 1.4769214663017062e-07, + "loss": 1.0217, + "step": 32743 + }, + { + "epoch": 2.790761101167647, + "grad_norm": 57.11694554094945, + "learning_rate": 1.4757254434855205e-07, + "loss": 1.5145, + "step": 32744 + }, + { + "epoch": 2.7908463308616724, + "grad_norm": 59.766200609767466, + "learning_rate": 1.47452989788629e-07, + "loss": 1.438, + "step": 32745 + }, + { + "epoch": 2.7909315605556975, + "grad_norm": 37.879343829101515, + "learning_rate": 1.4733348295157769e-07, + "loss": 1.3395, + "step": 32746 + }, + { + "epoch": 2.791016790249723, + "grad_norm": 18.555229295949378, + "learning_rate": 1.4721402383857274e-07, + "loss": 0.5821, + "step": 32747 + }, + { + "epoch": 2.7911020199437484, + "grad_norm": 35.3291599657577, + "learning_rate": 1.4709461245078993e-07, + "loss": 1.1829, + "step": 32748 + }, + { + "epoch": 2.791187249637774, + "grad_norm": 70.77959223739009, + "learning_rate": 1.469752487894033e-07, + "loss": 1.7373, + "step": 32749 + }, + { + "epoch": 2.7912724793317993, + "grad_norm": 54.732475771580425, + "learning_rate": 1.468559328555863e-07, + "loss": 1.2511, + "step": 32750 + }, + { + "epoch": 2.7913577090258244, + "grad_norm": 55.56457876092769, + "learning_rate": 1.4673666465051251e-07, + "loss": 1.0906, + "step": 32751 + }, + { + "epoch": 2.79144293871985, + "grad_norm": 67.90825640448102, + "learning_rate": 1.4661744417535595e-07, + "loss": 2.0503, + "step": 32752 + }, + { + "epoch": 2.7915281684138753, + "grad_norm": 40.14338400606422, + "learning_rate": 1.4649827143128792e-07, + "loss": 1.1862, + "step": 32753 + }, + { + "epoch": 2.7916133981079008, + "grad_norm": 53.0075522600455, + "learning_rate": 1.4637914641948136e-07, + "loss": 1.4102, + "step": 32754 + }, + { + "epoch": 2.7916986278019262, + "grad_norm": 51.783660841046355, + "learning_rate": 1.4626006914110648e-07, + "loss": 1.843, + "step": 32755 + }, + { + "epoch": 2.7917838574959517, + "grad_norm": 45.119267214378254, + "learning_rate": 1.4614103959733562e-07, + "loss": 1.5685, + "step": 32756 + }, + { + "epoch": 2.791869087189977, + "grad_norm": 73.7378506384688, + "learning_rate": 1.4602205778933841e-07, + "loss": 1.838, + "step": 32757 + }, + { + "epoch": 2.791954316884002, + "grad_norm": 18.389708551936614, + "learning_rate": 1.4590312371828564e-07, + "loss": 0.5957, + "step": 32758 + }, + { + "epoch": 2.7920395465780277, + "grad_norm": 67.23551939367239, + "learning_rate": 1.4578423738534686e-07, + "loss": 1.2957, + "step": 32759 + }, + { + "epoch": 2.792124776272053, + "grad_norm": 65.37064393140136, + "learning_rate": 1.4566539879169117e-07, + "loss": 1.5972, + "step": 32760 + }, + { + "epoch": 2.7922100059660786, + "grad_norm": 56.82187179555083, + "learning_rate": 1.4554660793848763e-07, + "loss": 1.8669, + "step": 32761 + }, + { + "epoch": 2.792295235660104, + "grad_norm": 39.57492405961905, + "learning_rate": 1.454278648269042e-07, + "loss": 0.7938, + "step": 32762 + }, + { + "epoch": 2.7923804653541295, + "grad_norm": 55.19059003488656, + "learning_rate": 1.453091694581088e-07, + "loss": 1.5626, + "step": 32763 + }, + { + "epoch": 2.792465695048155, + "grad_norm": 38.92025656044793, + "learning_rate": 1.4519052183326832e-07, + "loss": 1.3018, + "step": 32764 + }, + { + "epoch": 2.79255092474218, + "grad_norm": 31.16790312674039, + "learning_rate": 1.4507192195355014e-07, + "loss": 1.1296, + "step": 32765 + }, + { + "epoch": 2.7926361544362055, + "grad_norm": 34.60400369676922, + "learning_rate": 1.4495336982012e-07, + "loss": 1.1774, + "step": 32766 + }, + { + "epoch": 2.792721384130231, + "grad_norm": 45.1827176691528, + "learning_rate": 1.448348654341447e-07, + "loss": 1.3168, + "step": 32767 + }, + { + "epoch": 2.7928066138242564, + "grad_norm": 47.818471617137234, + "learning_rate": 1.4471640879678895e-07, + "loss": 1.0686, + "step": 32768 + }, + { + "epoch": 2.792891843518282, + "grad_norm": 42.144156087567175, + "learning_rate": 1.445979999092184e-07, + "loss": 1.0219, + "step": 32769 + }, + { + "epoch": 2.792977073212307, + "grad_norm": 61.20064258792317, + "learning_rate": 1.444796387725972e-07, + "loss": 1.9242, + "step": 32770 + }, + { + "epoch": 2.793062302906333, + "grad_norm": 34.36870890719631, + "learning_rate": 1.4436132538808934e-07, + "loss": 0.8612, + "step": 32771 + }, + { + "epoch": 2.793147532600358, + "grad_norm": 41.12940064693164, + "learning_rate": 1.4424305975685838e-07, + "loss": 0.9338, + "step": 32772 + }, + { + "epoch": 2.7932327622943833, + "grad_norm": 64.56378191722706, + "learning_rate": 1.4412484188006727e-07, + "loss": 1.2364, + "step": 32773 + }, + { + "epoch": 2.793317991988409, + "grad_norm": 42.0317814175536, + "learning_rate": 1.440066717588784e-07, + "loss": 1.225, + "step": 32774 + }, + { + "epoch": 2.7934032216824343, + "grad_norm": 51.413274390865226, + "learning_rate": 1.4388854939445528e-07, + "loss": 1.7916, + "step": 32775 + }, + { + "epoch": 2.7934884513764597, + "grad_norm": 52.94455200553481, + "learning_rate": 1.4377047478795814e-07, + "loss": 1.2834, + "step": 32776 + }, + { + "epoch": 2.7935736810704848, + "grad_norm": 24.57358095956515, + "learning_rate": 1.4365244794054822e-07, + "loss": 0.7678, + "step": 32777 + }, + { + "epoch": 2.7936589107645102, + "grad_norm": 36.828355515862505, + "learning_rate": 1.4353446885338739e-07, + "loss": 1.2923, + "step": 32778 + }, + { + "epoch": 2.7937441404585357, + "grad_norm": 69.51815482008466, + "learning_rate": 1.4341653752763528e-07, + "loss": 1.9876, + "step": 32779 + }, + { + "epoch": 2.793829370152561, + "grad_norm": 88.10325805488063, + "learning_rate": 1.4329865396445207e-07, + "loss": 1.8872, + "step": 32780 + }, + { + "epoch": 2.7939145998465866, + "grad_norm": 24.69441051518444, + "learning_rate": 1.4318081816499574e-07, + "loss": 1.0467, + "step": 32781 + }, + { + "epoch": 2.793999829540612, + "grad_norm": 53.304065531238564, + "learning_rate": 1.4306303013042644e-07, + "loss": 2.1024, + "step": 32782 + }, + { + "epoch": 2.7940850592346376, + "grad_norm": 50.87959850405776, + "learning_rate": 1.429452898619027e-07, + "loss": 1.7426, + "step": 32783 + }, + { + "epoch": 2.7941702889286626, + "grad_norm": 36.15458423570573, + "learning_rate": 1.4282759736058193e-07, + "loss": 1.1136, + "step": 32784 + }, + { + "epoch": 2.794255518622688, + "grad_norm": 76.08088658030594, + "learning_rate": 1.42709952627621e-07, + "loss": 1.924, + "step": 32785 + }, + { + "epoch": 2.7943407483167135, + "grad_norm": 71.5968403902747, + "learning_rate": 1.425923556641784e-07, + "loss": 1.9602, + "step": 32786 + }, + { + "epoch": 2.794425978010739, + "grad_norm": 49.078156146780195, + "learning_rate": 1.4247480647140988e-07, + "loss": 1.4846, + "step": 32787 + }, + { + "epoch": 2.7945112077047645, + "grad_norm": 32.89600683801349, + "learning_rate": 1.4235730505047118e-07, + "loss": 1.088, + "step": 32788 + }, + { + "epoch": 2.7945964373987895, + "grad_norm": 55.896771603752526, + "learning_rate": 1.4223985140251805e-07, + "loss": 1.7241, + "step": 32789 + }, + { + "epoch": 2.7946816670928154, + "grad_norm": 25.746345958453503, + "learning_rate": 1.4212244552870513e-07, + "loss": 0.929, + "step": 32790 + }, + { + "epoch": 2.7947668967868404, + "grad_norm": 21.395934612575683, + "learning_rate": 1.4200508743018872e-07, + "loss": 0.4809, + "step": 32791 + }, + { + "epoch": 2.794852126480866, + "grad_norm": 20.156527106776092, + "learning_rate": 1.4188777710812174e-07, + "loss": 0.7334, + "step": 32792 + }, + { + "epoch": 2.7949373561748914, + "grad_norm": 31.696296194002027, + "learning_rate": 1.4177051456365775e-07, + "loss": 0.9545, + "step": 32793 + }, + { + "epoch": 2.795022585868917, + "grad_norm": 25.234332809118374, + "learning_rate": 1.4165329979794972e-07, + "loss": 0.8182, + "step": 32794 + }, + { + "epoch": 2.7951078155629423, + "grad_norm": 58.99707792672027, + "learning_rate": 1.4153613281215174e-07, + "loss": 1.479, + "step": 32795 + }, + { + "epoch": 2.7951930452569673, + "grad_norm": 85.83401690706127, + "learning_rate": 1.414190136074156e-07, + "loss": 2.3311, + "step": 32796 + }, + { + "epoch": 2.795278274950993, + "grad_norm": 40.002194086938566, + "learning_rate": 1.4130194218489213e-07, + "loss": 1.4538, + "step": 32797 + }, + { + "epoch": 2.7953635046450183, + "grad_norm": 61.77003143092881, + "learning_rate": 1.4118491854573313e-07, + "loss": 1.7893, + "step": 32798 + }, + { + "epoch": 2.7954487343390437, + "grad_norm": 76.67110577304724, + "learning_rate": 1.4106794269109048e-07, + "loss": 1.7355, + "step": 32799 + }, + { + "epoch": 2.795533964033069, + "grad_norm": 27.669602593846644, + "learning_rate": 1.409510146221138e-07, + "loss": 1.131, + "step": 32800 + }, + { + "epoch": 2.7956191937270947, + "grad_norm": 57.62307383837653, + "learning_rate": 1.408341343399533e-07, + "loss": 1.339, + "step": 32801 + }, + { + "epoch": 2.79570442342112, + "grad_norm": 52.8839178170278, + "learning_rate": 1.4071730184575748e-07, + "loss": 1.6834, + "step": 32802 + }, + { + "epoch": 2.795789653115145, + "grad_norm": 45.04307247990224, + "learning_rate": 1.4060051714067658e-07, + "loss": 1.2606, + "step": 32803 + }, + { + "epoch": 2.7958748828091706, + "grad_norm": 77.87148146672382, + "learning_rate": 1.4048378022585906e-07, + "loss": 1.8149, + "step": 32804 + }, + { + "epoch": 2.795960112503196, + "grad_norm": 33.4634002963693, + "learning_rate": 1.4036709110245183e-07, + "loss": 0.6868, + "step": 32805 + }, + { + "epoch": 2.7960453421972216, + "grad_norm": 72.42467619864048, + "learning_rate": 1.402504497716034e-07, + "loss": 1.3761, + "step": 32806 + }, + { + "epoch": 2.796130571891247, + "grad_norm": 78.21988010692976, + "learning_rate": 1.4013385623446064e-07, + "loss": 1.64, + "step": 32807 + }, + { + "epoch": 2.796215801585272, + "grad_norm": 55.29864512827694, + "learning_rate": 1.400173104921704e-07, + "loss": 1.54, + "step": 32808 + }, + { + "epoch": 2.796301031279298, + "grad_norm": 66.38757092880755, + "learning_rate": 1.399008125458795e-07, + "loss": 1.7449, + "step": 32809 + }, + { + "epoch": 2.796386260973323, + "grad_norm": 56.89387113703049, + "learning_rate": 1.397843623967321e-07, + "loss": 1.9426, + "step": 32810 + }, + { + "epoch": 2.7964714906673485, + "grad_norm": 38.9763735314252, + "learning_rate": 1.3966796004587392e-07, + "loss": 1.2747, + "step": 32811 + }, + { + "epoch": 2.796556720361374, + "grad_norm": 54.49885252306184, + "learning_rate": 1.395516054944507e-07, + "loss": 1.4712, + "step": 32812 + }, + { + "epoch": 2.7966419500553994, + "grad_norm": 49.19925000119395, + "learning_rate": 1.394352987436054e-07, + "loss": 1.2727, + "step": 32813 + }, + { + "epoch": 2.796727179749425, + "grad_norm": 26.360143898904738, + "learning_rate": 1.3931903979448325e-07, + "loss": 0.8222, + "step": 32814 + }, + { + "epoch": 2.79681240944345, + "grad_norm": 42.537282408351764, + "learning_rate": 1.3920282864822666e-07, + "loss": 1.064, + "step": 32815 + }, + { + "epoch": 2.7968976391374754, + "grad_norm": 41.15688946600148, + "learning_rate": 1.390866653059786e-07, + "loss": 1.4516, + "step": 32816 + }, + { + "epoch": 2.796982868831501, + "grad_norm": 77.96737570246754, + "learning_rate": 1.3897054976888202e-07, + "loss": 1.7163, + "step": 32817 + }, + { + "epoch": 2.7970680985255263, + "grad_norm": 29.956110266745398, + "learning_rate": 1.3885448203807882e-07, + "loss": 1.2904, + "step": 32818 + }, + { + "epoch": 2.7971533282195518, + "grad_norm": 63.704501339452726, + "learning_rate": 1.3873846211470977e-07, + "loss": 1.1696, + "step": 32819 + }, + { + "epoch": 2.7972385579135772, + "grad_norm": 43.929288590322926, + "learning_rate": 1.3862248999991557e-07, + "loss": 1.0272, + "step": 32820 + }, + { + "epoch": 2.7973237876076027, + "grad_norm": 40.862226384492125, + "learning_rate": 1.3850656569483756e-07, + "loss": 1.0337, + "step": 32821 + }, + { + "epoch": 2.7974090173016277, + "grad_norm": 53.494548584010346, + "learning_rate": 1.383906892006165e-07, + "loss": 1.0305, + "step": 32822 + }, + { + "epoch": 2.797494246995653, + "grad_norm": 25.43361715469469, + "learning_rate": 1.382748605183909e-07, + "loss": 0.806, + "step": 32823 + }, + { + "epoch": 2.7975794766896787, + "grad_norm": 67.59785163892553, + "learning_rate": 1.3815907964929986e-07, + "loss": 1.7538, + "step": 32824 + }, + { + "epoch": 2.797664706383704, + "grad_norm": 45.356176604620806, + "learning_rate": 1.3804334659448303e-07, + "loss": 0.969, + "step": 32825 + }, + { + "epoch": 2.7977499360777296, + "grad_norm": 38.9135602117824, + "learning_rate": 1.3792766135507786e-07, + "loss": 1.4913, + "step": 32826 + }, + { + "epoch": 2.7978351657717546, + "grad_norm": 63.98443355059911, + "learning_rate": 1.3781202393222172e-07, + "loss": 1.6948, + "step": 32827 + }, + { + "epoch": 2.7979203954657805, + "grad_norm": 57.98260318685057, + "learning_rate": 1.3769643432705205e-07, + "loss": 1.6163, + "step": 32828 + }, + { + "epoch": 2.7980056251598056, + "grad_norm": 49.8191761199584, + "learning_rate": 1.3758089254070629e-07, + "loss": 1.1767, + "step": 32829 + }, + { + "epoch": 2.798090854853831, + "grad_norm": 37.40587495443354, + "learning_rate": 1.3746539857432016e-07, + "loss": 0.8026, + "step": 32830 + }, + { + "epoch": 2.7981760845478565, + "grad_norm": 68.7969985956198, + "learning_rate": 1.3734995242903003e-07, + "loss": 2.2708, + "step": 32831 + }, + { + "epoch": 2.798261314241882, + "grad_norm": 31.12080514385399, + "learning_rate": 1.3723455410597054e-07, + "loss": 0.7138, + "step": 32832 + }, + { + "epoch": 2.7983465439359074, + "grad_norm": 77.68192782206673, + "learning_rate": 1.3711920360627685e-07, + "loss": 1.7515, + "step": 32833 + }, + { + "epoch": 2.7984317736299325, + "grad_norm": 75.87119257594703, + "learning_rate": 1.370039009310836e-07, + "loss": 2.3457, + "step": 32834 + }, + { + "epoch": 2.798517003323958, + "grad_norm": 43.71879955065575, + "learning_rate": 1.3688864608152498e-07, + "loss": 1.0937, + "step": 32835 + }, + { + "epoch": 2.7986022330179834, + "grad_norm": 64.97899667188469, + "learning_rate": 1.367734390587333e-07, + "loss": 1.5479, + "step": 32836 + }, + { + "epoch": 2.798687462712009, + "grad_norm": 49.4076681007955, + "learning_rate": 1.3665827986384272e-07, + "loss": 1.1512, + "step": 32837 + }, + { + "epoch": 2.7987726924060343, + "grad_norm": 64.02490786553993, + "learning_rate": 1.3654316849798566e-07, + "loss": 1.9621, + "step": 32838 + }, + { + "epoch": 2.79885792210006, + "grad_norm": 53.3922176596453, + "learning_rate": 1.3642810496229398e-07, + "loss": 1.4789, + "step": 32839 + }, + { + "epoch": 2.7989431517940853, + "grad_norm": 50.34115936956428, + "learning_rate": 1.3631308925789955e-07, + "loss": 1.5275, + "step": 32840 + }, + { + "epoch": 2.7990283814881103, + "grad_norm": 68.63715614263866, + "learning_rate": 1.361981213859326e-07, + "loss": 1.6811, + "step": 32841 + }, + { + "epoch": 2.7991136111821358, + "grad_norm": 30.748640284552863, + "learning_rate": 1.3608320134752494e-07, + "loss": 1.0449, + "step": 32842 + }, + { + "epoch": 2.799198840876161, + "grad_norm": 25.422719990286474, + "learning_rate": 1.359683291438063e-07, + "loss": 0.8622, + "step": 32843 + }, + { + "epoch": 2.7992840705701867, + "grad_norm": 78.71370346657757, + "learning_rate": 1.3585350477590576e-07, + "loss": 1.7092, + "step": 32844 + }, + { + "epoch": 2.799369300264212, + "grad_norm": 51.71681177450485, + "learning_rate": 1.357387282449535e-07, + "loss": 1.4611, + "step": 32845 + }, + { + "epoch": 2.799454529958237, + "grad_norm": 60.478514295970605, + "learning_rate": 1.3562399955207807e-07, + "loss": 1.9606, + "step": 32846 + }, + { + "epoch": 2.799539759652263, + "grad_norm": 37.95668956884628, + "learning_rate": 1.35509318698408e-07, + "loss": 1.0328, + "step": 32847 + }, + { + "epoch": 2.799624989346288, + "grad_norm": 67.80295736083333, + "learning_rate": 1.3539468568507074e-07, + "loss": 1.3664, + "step": 32848 + }, + { + "epoch": 2.7997102190403136, + "grad_norm": 27.929170753015633, + "learning_rate": 1.3528010051319317e-07, + "loss": 1.0807, + "step": 32849 + }, + { + "epoch": 2.799795448734339, + "grad_norm": 57.357676650949074, + "learning_rate": 1.3516556318390328e-07, + "loss": 1.063, + "step": 32850 + }, + { + "epoch": 2.7998806784283645, + "grad_norm": 81.19303400847235, + "learning_rate": 1.3505107369832682e-07, + "loss": 2.4547, + "step": 32851 + }, + { + "epoch": 2.79996590812239, + "grad_norm": 68.45228693819942, + "learning_rate": 1.3493663205758956e-07, + "loss": 1.5231, + "step": 32852 + }, + { + "epoch": 2.800051137816415, + "grad_norm": 32.70593068612961, + "learning_rate": 1.348222382628178e-07, + "loss": 0.8338, + "step": 32853 + }, + { + "epoch": 2.8001363675104405, + "grad_norm": 24.533911297125087, + "learning_rate": 1.347078923151357e-07, + "loss": 0.7613, + "step": 32854 + }, + { + "epoch": 2.800221597204466, + "grad_norm": 41.33637662666905, + "learning_rate": 1.345935942156684e-07, + "loss": 1.3516, + "step": 32855 + }, + { + "epoch": 2.8003068268984914, + "grad_norm": 26.002974040507112, + "learning_rate": 1.3447934396554008e-07, + "loss": 0.7303, + "step": 32856 + }, + { + "epoch": 2.800392056592517, + "grad_norm": 45.392308635885335, + "learning_rate": 1.343651415658742e-07, + "loss": 1.3201, + "step": 32857 + }, + { + "epoch": 2.8004772862865424, + "grad_norm": 32.89436413098764, + "learning_rate": 1.342509870177927e-07, + "loss": 1.7418, + "step": 32858 + }, + { + "epoch": 2.800562515980568, + "grad_norm": 44.29014712156723, + "learning_rate": 1.3413688032242023e-07, + "loss": 1.1536, + "step": 32859 + }, + { + "epoch": 2.800647745674593, + "grad_norm": 29.09956702846694, + "learning_rate": 1.3402282148087754e-07, + "loss": 0.884, + "step": 32860 + }, + { + "epoch": 2.8007329753686183, + "grad_norm": 89.47684403427836, + "learning_rate": 1.3390881049428707e-07, + "loss": 2.083, + "step": 32861 + }, + { + "epoch": 2.800818205062644, + "grad_norm": 64.82112328861484, + "learning_rate": 1.3379484736377012e-07, + "loss": 2.4221, + "step": 32862 + }, + { + "epoch": 2.8009034347566693, + "grad_norm": 41.146790948272304, + "learning_rate": 1.3368093209044643e-07, + "loss": 1.1526, + "step": 32863 + }, + { + "epoch": 2.8009886644506947, + "grad_norm": 40.7050505876167, + "learning_rate": 1.335670646754378e-07, + "loss": 1.2851, + "step": 32864 + }, + { + "epoch": 2.8010738941447197, + "grad_norm": 22.691557334936753, + "learning_rate": 1.3345324511986335e-07, + "loss": 1.0756, + "step": 32865 + }, + { + "epoch": 2.8011591238387457, + "grad_norm": 28.160487219840036, + "learning_rate": 1.333394734248422e-07, + "loss": 0.6598, + "step": 32866 + }, + { + "epoch": 2.8012443535327707, + "grad_norm": 55.09377372034172, + "learning_rate": 1.3322574959149348e-07, + "loss": 1.0106, + "step": 32867 + }, + { + "epoch": 2.801329583226796, + "grad_norm": 64.56985440436264, + "learning_rate": 1.3311207362093514e-07, + "loss": 1.9915, + "step": 32868 + }, + { + "epoch": 2.8014148129208216, + "grad_norm": 39.657941921652245, + "learning_rate": 1.329984455142863e-07, + "loss": 1.2876, + "step": 32869 + }, + { + "epoch": 2.801500042614847, + "grad_norm": 60.81991356261512, + "learning_rate": 1.3288486527266388e-07, + "loss": 0.9938, + "step": 32870 + }, + { + "epoch": 2.8015852723088726, + "grad_norm": 63.639291283048784, + "learning_rate": 1.327713328971847e-07, + "loss": 1.7513, + "step": 32871 + }, + { + "epoch": 2.8016705020028976, + "grad_norm": 58.35100831931679, + "learning_rate": 1.3265784838896513e-07, + "loss": 1.46, + "step": 32872 + }, + { + "epoch": 2.801755731696923, + "grad_norm": 63.023705422851506, + "learning_rate": 1.3254441174912203e-07, + "loss": 1.3841, + "step": 32873 + }, + { + "epoch": 2.8018409613909485, + "grad_norm": 20.147631029996123, + "learning_rate": 1.3243102297877064e-07, + "loss": 0.5755, + "step": 32874 + }, + { + "epoch": 2.801926191084974, + "grad_norm": 58.09736670235344, + "learning_rate": 1.3231768207902507e-07, + "loss": 1.9214, + "step": 32875 + }, + { + "epoch": 2.8020114207789995, + "grad_norm": 39.67520692818434, + "learning_rate": 1.322043890510011e-07, + "loss": 1.5534, + "step": 32876 + }, + { + "epoch": 2.802096650473025, + "grad_norm": 55.8101661520802, + "learning_rate": 1.3209114389581278e-07, + "loss": 1.2406, + "step": 32877 + }, + { + "epoch": 2.8021818801670504, + "grad_norm": 39.947678645233516, + "learning_rate": 1.319779466145743e-07, + "loss": 1.3662, + "step": 32878 + }, + { + "epoch": 2.8022671098610754, + "grad_norm": 51.55123568969516, + "learning_rate": 1.3186479720839806e-07, + "loss": 1.4327, + "step": 32879 + }, + { + "epoch": 2.802352339555101, + "grad_norm": 17.85535759772843, + "learning_rate": 1.3175169567839708e-07, + "loss": 0.7169, + "step": 32880 + }, + { + "epoch": 2.8024375692491263, + "grad_norm": 40.98538783233637, + "learning_rate": 1.3163864202568376e-07, + "loss": 1.0342, + "step": 32881 + }, + { + "epoch": 2.802522798943152, + "grad_norm": 59.88691934696124, + "learning_rate": 1.3152563625137004e-07, + "loss": 1.6516, + "step": 32882 + }, + { + "epoch": 2.8026080286371773, + "grad_norm": 55.82805485701295, + "learning_rate": 1.3141267835656613e-07, + "loss": 1.699, + "step": 32883 + }, + { + "epoch": 2.8026932583312028, + "grad_norm": 35.08178855919117, + "learning_rate": 1.3129976834238445e-07, + "loss": 0.8053, + "step": 32884 + }, + { + "epoch": 2.802778488025228, + "grad_norm": 33.620632201809094, + "learning_rate": 1.3118690620993524e-07, + "loss": 1.2605, + "step": 32885 + }, + { + "epoch": 2.8028637177192532, + "grad_norm": 66.54845166337819, + "learning_rate": 1.3107409196032872e-07, + "loss": 1.9862, + "step": 32886 + }, + { + "epoch": 2.8029489474132787, + "grad_norm": 28.88679963541151, + "learning_rate": 1.309613255946729e-07, + "loss": 0.8546, + "step": 32887 + }, + { + "epoch": 2.803034177107304, + "grad_norm": 105.13763744496943, + "learning_rate": 1.30848607114078e-07, + "loss": 2.5299, + "step": 32888 + }, + { + "epoch": 2.8031194068013296, + "grad_norm": 48.49858740755733, + "learning_rate": 1.30735936519652e-07, + "loss": 1.4202, + "step": 32889 + }, + { + "epoch": 2.803204636495355, + "grad_norm": 45.83620192465593, + "learning_rate": 1.3062331381250348e-07, + "loss": 1.186, + "step": 32890 + }, + { + "epoch": 2.80328986618938, + "grad_norm": 65.69942094617834, + "learning_rate": 1.3051073899373934e-07, + "loss": 1.3152, + "step": 32891 + }, + { + "epoch": 2.803375095883406, + "grad_norm": 52.939304065309756, + "learning_rate": 1.3039821206446757e-07, + "loss": 1.3611, + "step": 32892 + }, + { + "epoch": 2.803460325577431, + "grad_norm": 50.83163089652294, + "learning_rate": 1.3028573302579394e-07, + "loss": 1.3363, + "step": 32893 + }, + { + "epoch": 2.8035455552714565, + "grad_norm": 48.6801973066322, + "learning_rate": 1.3017330187882592e-07, + "loss": 0.8233, + "step": 32894 + }, + { + "epoch": 2.803630784965482, + "grad_norm": 68.60723870863856, + "learning_rate": 1.300609186246682e-07, + "loss": 1.7093, + "step": 32895 + }, + { + "epoch": 2.8037160146595075, + "grad_norm": 34.60193807241021, + "learning_rate": 1.2994858326442594e-07, + "loss": 1.611, + "step": 32896 + }, + { + "epoch": 2.803801244353533, + "grad_norm": 35.91408661883115, + "learning_rate": 1.2983629579920387e-07, + "loss": 0.9941, + "step": 32897 + }, + { + "epoch": 2.803886474047558, + "grad_norm": 20.495466610881152, + "learning_rate": 1.297240562301072e-07, + "loss": 0.6317, + "step": 32898 + }, + { + "epoch": 2.8039717037415834, + "grad_norm": 48.26771728497638, + "learning_rate": 1.2961186455823838e-07, + "loss": 1.1023, + "step": 32899 + }, + { + "epoch": 2.804056933435609, + "grad_norm": 37.43449516332102, + "learning_rate": 1.2949972078470207e-07, + "loss": 0.9758, + "step": 32900 + }, + { + "epoch": 2.8041421631296344, + "grad_norm": 28.053075192799014, + "learning_rate": 1.2938762491060074e-07, + "loss": 0.753, + "step": 32901 + }, + { + "epoch": 2.80422739282366, + "grad_norm": 72.60980708058277, + "learning_rate": 1.2927557693703684e-07, + "loss": 1.6901, + "step": 32902 + }, + { + "epoch": 2.8043126225176853, + "grad_norm": 72.08646177843785, + "learning_rate": 1.2916357686511173e-07, + "loss": 1.9306, + "step": 32903 + }, + { + "epoch": 2.804397852211711, + "grad_norm": 37.760149853579755, + "learning_rate": 1.2905162469592835e-07, + "loss": 0.692, + "step": 32904 + }, + { + "epoch": 2.804483081905736, + "grad_norm": 26.256968457665163, + "learning_rate": 1.289397204305859e-07, + "loss": 0.6936, + "step": 32905 + }, + { + "epoch": 2.8045683115997613, + "grad_norm": 88.36720982531811, + "learning_rate": 1.2882786407018566e-07, + "loss": 1.6546, + "step": 32906 + }, + { + "epoch": 2.8046535412937867, + "grad_norm": 46.09827471889836, + "learning_rate": 1.2871605561582735e-07, + "loss": 1.2625, + "step": 32907 + }, + { + "epoch": 2.804738770987812, + "grad_norm": 27.04721044144788, + "learning_rate": 1.2860429506861172e-07, + "loss": 1.1988, + "step": 32908 + }, + { + "epoch": 2.8048240006818377, + "grad_norm": 28.58872442672095, + "learning_rate": 1.2849258242963735e-07, + "loss": 0.8859, + "step": 32909 + }, + { + "epoch": 2.8049092303758627, + "grad_norm": 59.02758935367024, + "learning_rate": 1.2838091770000227e-07, + "loss": 1.494, + "step": 32910 + }, + { + "epoch": 2.8049944600698886, + "grad_norm": 22.206080759008863, + "learning_rate": 1.2826930088080502e-07, + "loss": 0.8628, + "step": 32911 + }, + { + "epoch": 2.8050796897639136, + "grad_norm": 82.77380855744107, + "learning_rate": 1.2815773197314364e-07, + "loss": 1.4769, + "step": 32912 + }, + { + "epoch": 2.805164919457939, + "grad_norm": 32.327780437636164, + "learning_rate": 1.2804621097811553e-07, + "loss": 1.0369, + "step": 32913 + }, + { + "epoch": 2.8052501491519646, + "grad_norm": 44.658992119673975, + "learning_rate": 1.27934737896816e-07, + "loss": 1.3491, + "step": 32914 + }, + { + "epoch": 2.80533537884599, + "grad_norm": 41.476365327619945, + "learning_rate": 1.2782331273034242e-07, + "loss": 1.4706, + "step": 32915 + }, + { + "epoch": 2.8054206085400155, + "grad_norm": 37.61709450464902, + "learning_rate": 1.2771193547979123e-07, + "loss": 1.4085, + "step": 32916 + }, + { + "epoch": 2.8055058382340405, + "grad_norm": 69.10996007528709, + "learning_rate": 1.2760060614625647e-07, + "loss": 1.6736, + "step": 32917 + }, + { + "epoch": 2.805591067928066, + "grad_norm": 51.20251171438637, + "learning_rate": 1.2748932473083397e-07, + "loss": 1.4315, + "step": 32918 + }, + { + "epoch": 2.8056762976220915, + "grad_norm": 47.91092417043602, + "learning_rate": 1.2737809123461786e-07, + "loss": 1.5885, + "step": 32919 + }, + { + "epoch": 2.805761527316117, + "grad_norm": 57.41764530156431, + "learning_rate": 1.2726690565870226e-07, + "loss": 1.748, + "step": 32920 + }, + { + "epoch": 2.8058467570101424, + "grad_norm": 37.68549027148186, + "learning_rate": 1.2715576800418017e-07, + "loss": 0.7371, + "step": 32921 + }, + { + "epoch": 2.805931986704168, + "grad_norm": 68.64063209405064, + "learning_rate": 1.2704467827214463e-07, + "loss": 1.9757, + "step": 32922 + }, + { + "epoch": 2.8060172163981933, + "grad_norm": 68.30801258545011, + "learning_rate": 1.2693363646368806e-07, + "loss": 1.8147, + "step": 32923 + }, + { + "epoch": 2.8061024460922184, + "grad_norm": 86.10249996271013, + "learning_rate": 1.268226425799035e-07, + "loss": 2.281, + "step": 32924 + }, + { + "epoch": 2.806187675786244, + "grad_norm": 31.401237668625402, + "learning_rate": 1.2671169662188176e-07, + "loss": 1.2477, + "step": 32925 + }, + { + "epoch": 2.8062729054802693, + "grad_norm": 89.04122646456271, + "learning_rate": 1.2660079859071362e-07, + "loss": 0.6209, + "step": 32926 + }, + { + "epoch": 2.8063581351742948, + "grad_norm": 30.172391825099496, + "learning_rate": 1.2648994848749042e-07, + "loss": 1.3334, + "step": 32927 + }, + { + "epoch": 2.8064433648683202, + "grad_norm": 51.135645288395075, + "learning_rate": 1.2637914631330184e-07, + "loss": 1.0874, + "step": 32928 + }, + { + "epoch": 2.8065285945623453, + "grad_norm": 65.4309636938649, + "learning_rate": 1.2626839206923814e-07, + "loss": 1.6573, + "step": 32929 + }, + { + "epoch": 2.806613824256371, + "grad_norm": 23.26505754802358, + "learning_rate": 1.2615768575638733e-07, + "loss": 0.5845, + "step": 32930 + }, + { + "epoch": 2.806699053950396, + "grad_norm": 22.31850223505485, + "learning_rate": 1.2604702737583962e-07, + "loss": 0.6354, + "step": 32931 + }, + { + "epoch": 2.8067842836444217, + "grad_norm": 36.10421048828226, + "learning_rate": 1.2593641692868253e-07, + "loss": 1.2051, + "step": 32932 + }, + { + "epoch": 2.806869513338447, + "grad_norm": 66.11972623292405, + "learning_rate": 1.258258544160046e-07, + "loss": 1.429, + "step": 32933 + }, + { + "epoch": 2.8069547430324726, + "grad_norm": 28.693820813908644, + "learning_rate": 1.2571533983889216e-07, + "loss": 0.9946, + "step": 32934 + }, + { + "epoch": 2.807039972726498, + "grad_norm": 49.4322794251091, + "learning_rate": 1.2560487319843272e-07, + "loss": 1.4967, + "step": 32935 + }, + { + "epoch": 2.807125202420523, + "grad_norm": 31.702034545623015, + "learning_rate": 1.2549445449571206e-07, + "loss": 1.1641, + "step": 32936 + }, + { + "epoch": 2.8072104321145486, + "grad_norm": 53.12572643972412, + "learning_rate": 1.2538408373181655e-07, + "loss": 1.1869, + "step": 32937 + }, + { + "epoch": 2.807295661808574, + "grad_norm": 54.84736479684966, + "learning_rate": 1.252737609078314e-07, + "loss": 1.1107, + "step": 32938 + }, + { + "epoch": 2.8073808915025995, + "grad_norm": 23.14934391082563, + "learning_rate": 1.251634860248424e-07, + "loss": 1.2086, + "step": 32939 + }, + { + "epoch": 2.807466121196625, + "grad_norm": 65.66647974487597, + "learning_rate": 1.2505325908393262e-07, + "loss": 2.1605, + "step": 32940 + }, + { + "epoch": 2.8075513508906504, + "grad_norm": 56.662750736665, + "learning_rate": 1.2494308008618783e-07, + "loss": 1.5793, + "step": 32941 + }, + { + "epoch": 2.807636580584676, + "grad_norm": 22.975532615281157, + "learning_rate": 1.2483294903269049e-07, + "loss": 0.9355, + "step": 32942 + }, + { + "epoch": 2.807721810278701, + "grad_norm": 49.40353670306691, + "learning_rate": 1.2472286592452366e-07, + "loss": 1.7204, + "step": 32943 + }, + { + "epoch": 2.8078070399727264, + "grad_norm": 47.08936606900722, + "learning_rate": 1.2461283076276976e-07, + "loss": 1.4947, + "step": 32944 + }, + { + "epoch": 2.807892269666752, + "grad_norm": 41.09761272291218, + "learning_rate": 1.2450284354851238e-07, + "loss": 1.2724, + "step": 32945 + }, + { + "epoch": 2.8079774993607773, + "grad_norm": 31.123854289491117, + "learning_rate": 1.2439290428283123e-07, + "loss": 0.9448, + "step": 32946 + }, + { + "epoch": 2.808062729054803, + "grad_norm": 62.45493406002965, + "learning_rate": 1.2428301296680933e-07, + "loss": 2.0402, + "step": 32947 + }, + { + "epoch": 2.808147958748828, + "grad_norm": 92.46551562732508, + "learning_rate": 1.241731696015258e-07, + "loss": 1.9308, + "step": 32948 + }, + { + "epoch": 2.8082331884428537, + "grad_norm": 39.34772323572009, + "learning_rate": 1.2406337418806204e-07, + "loss": 1.2489, + "step": 32949 + }, + { + "epoch": 2.8083184181368788, + "grad_norm": 83.86880686220925, + "learning_rate": 1.2395362672749767e-07, + "loss": 1.9161, + "step": 32950 + }, + { + "epoch": 2.8084036478309042, + "grad_norm": 29.988325397164665, + "learning_rate": 1.2384392722091186e-07, + "loss": 0.8317, + "step": 32951 + }, + { + "epoch": 2.8084888775249297, + "grad_norm": 34.06111952051973, + "learning_rate": 1.2373427566938378e-07, + "loss": 1.2272, + "step": 32952 + }, + { + "epoch": 2.808574107218955, + "grad_norm": 44.701373007432885, + "learning_rate": 1.236246720739903e-07, + "loss": 1.6084, + "step": 32953 + }, + { + "epoch": 2.8086593369129806, + "grad_norm": 46.911688957777685, + "learning_rate": 1.235151164358106e-07, + "loss": 1.042, + "step": 32954 + }, + { + "epoch": 2.8087445666070057, + "grad_norm": 59.42013381372818, + "learning_rate": 1.2340560875592266e-07, + "loss": 1.7021, + "step": 32955 + }, + { + "epoch": 2.808829796301031, + "grad_norm": 31.668648883099547, + "learning_rate": 1.2329614903540289e-07, + "loss": 0.4829, + "step": 32956 + }, + { + "epoch": 2.8089150259950566, + "grad_norm": 52.33770622746853, + "learning_rate": 1.2318673727532759e-07, + "loss": 1.6982, + "step": 32957 + }, + { + "epoch": 2.809000255689082, + "grad_norm": 38.63778295162713, + "learning_rate": 1.2307737347677263e-07, + "loss": 0.9089, + "step": 32958 + }, + { + "epoch": 2.8090854853831075, + "grad_norm": 74.17373863516941, + "learning_rate": 1.2296805764081432e-07, + "loss": 2.1148, + "step": 32959 + }, + { + "epoch": 2.809170715077133, + "grad_norm": 29.930254411507605, + "learning_rate": 1.2285878976852684e-07, + "loss": 0.8551, + "step": 32960 + }, + { + "epoch": 2.8092559447711585, + "grad_norm": 51.0859173507982, + "learning_rate": 1.2274956986098486e-07, + "loss": 1.5951, + "step": 32961 + }, + { + "epoch": 2.8093411744651835, + "grad_norm": 106.16249145949853, + "learning_rate": 1.2264039791926252e-07, + "loss": 1.8259, + "step": 32962 + }, + { + "epoch": 2.809426404159209, + "grad_norm": 115.04254614924771, + "learning_rate": 1.2253127394443455e-07, + "loss": 2.4949, + "step": 32963 + }, + { + "epoch": 2.8095116338532344, + "grad_norm": 61.62580749454187, + "learning_rate": 1.2242219793757337e-07, + "loss": 1.5021, + "step": 32964 + }, + { + "epoch": 2.80959686354726, + "grad_norm": 30.048329224259984, + "learning_rate": 1.2231316989975206e-07, + "loss": 0.8761, + "step": 32965 + }, + { + "epoch": 2.8096820932412854, + "grad_norm": 33.859924473388084, + "learning_rate": 1.2220418983204141e-07, + "loss": 1.1199, + "step": 32966 + }, + { + "epoch": 2.8097673229353104, + "grad_norm": 27.160603741793043, + "learning_rate": 1.22095257735515e-07, + "loss": 0.8105, + "step": 32967 + }, + { + "epoch": 2.8098525526293363, + "grad_norm": 31.666852012742762, + "learning_rate": 1.2198637361124367e-07, + "loss": 0.7717, + "step": 32968 + }, + { + "epoch": 2.8099377823233613, + "grad_norm": 40.13649243791047, + "learning_rate": 1.2187753746029762e-07, + "loss": 1.7626, + "step": 32969 + }, + { + "epoch": 2.810023012017387, + "grad_norm": 27.297078286830818, + "learning_rate": 1.2176874928374717e-07, + "loss": 0.6254, + "step": 32970 + }, + { + "epoch": 2.8101082417114123, + "grad_norm": 35.172578221543844, + "learning_rate": 1.216600090826636e-07, + "loss": 0.9003, + "step": 32971 + }, + { + "epoch": 2.8101934714054377, + "grad_norm": 44.38635605236938, + "learning_rate": 1.2155131685811506e-07, + "loss": 1.2166, + "step": 32972 + }, + { + "epoch": 2.810278701099463, + "grad_norm": 50.45536866138469, + "learning_rate": 1.214426726111706e-07, + "loss": 1.3247, + "step": 32973 + }, + { + "epoch": 2.8103639307934882, + "grad_norm": 79.44123545025322, + "learning_rate": 1.2133407634289885e-07, + "loss": 1.8418, + "step": 32974 + }, + { + "epoch": 2.8104491604875137, + "grad_norm": 69.52456232417013, + "learning_rate": 1.2122552805436838e-07, + "loss": 2.026, + "step": 32975 + }, + { + "epoch": 2.810534390181539, + "grad_norm": 41.114689860791565, + "learning_rate": 1.2111702774664613e-07, + "loss": 1.0851, + "step": 32976 + }, + { + "epoch": 2.8106196198755646, + "grad_norm": 68.98795277134317, + "learning_rate": 1.2100857542079847e-07, + "loss": 1.6448, + "step": 32977 + }, + { + "epoch": 2.81070484956959, + "grad_norm": 58.547743644078174, + "learning_rate": 1.2090017107789343e-07, + "loss": 1.5861, + "step": 32978 + }, + { + "epoch": 2.8107900792636156, + "grad_norm": 41.72612264604981, + "learning_rate": 1.207918147189957e-07, + "loss": 1.6215, + "step": 32979 + }, + { + "epoch": 2.810875308957641, + "grad_norm": 58.96044988826812, + "learning_rate": 1.206835063451728e-07, + "loss": 1.3475, + "step": 32980 + }, + { + "epoch": 2.810960538651666, + "grad_norm": 48.416612941927696, + "learning_rate": 1.2057524595748826e-07, + "loss": 1.588, + "step": 32981 + }, + { + "epoch": 2.8110457683456915, + "grad_norm": 137.2598297515842, + "learning_rate": 1.204670335570074e-07, + "loss": 3.5, + "step": 32982 + }, + { + "epoch": 2.811130998039717, + "grad_norm": 49.38873258824562, + "learning_rate": 1.2035886914479377e-07, + "loss": 1.5015, + "step": 32983 + }, + { + "epoch": 2.8112162277337425, + "grad_norm": 43.966781584622964, + "learning_rate": 1.202507527219121e-07, + "loss": 1.236, + "step": 32984 + }, + { + "epoch": 2.811301457427768, + "grad_norm": 18.051789839932134, + "learning_rate": 1.2014268428942487e-07, + "loss": 0.5276, + "step": 32985 + }, + { + "epoch": 2.811386687121793, + "grad_norm": 46.194520832988616, + "learning_rate": 1.2003466384839624e-07, + "loss": 1.9049, + "step": 32986 + }, + { + "epoch": 2.811471916815819, + "grad_norm": 52.93958468577675, + "learning_rate": 1.1992669139988643e-07, + "loss": 1.2756, + "step": 32987 + }, + { + "epoch": 2.811557146509844, + "grad_norm": 81.42725056311014, + "learning_rate": 1.1981876694495908e-07, + "loss": 1.8924, + "step": 32988 + }, + { + "epoch": 2.8116423762038694, + "grad_norm": 70.98448610744921, + "learning_rate": 1.1971089048467498e-07, + "loss": 1.737, + "step": 32989 + }, + { + "epoch": 2.811727605897895, + "grad_norm": 55.56112789597831, + "learning_rate": 1.1960306202009499e-07, + "loss": 1.5982, + "step": 32990 + }, + { + "epoch": 2.8118128355919203, + "grad_norm": 101.85868883471841, + "learning_rate": 1.1949528155227928e-07, + "loss": 1.4096, + "step": 32991 + }, + { + "epoch": 2.8118980652859458, + "grad_norm": 81.3737219238357, + "learning_rate": 1.193875490822888e-07, + "loss": 2.1194, + "step": 32992 + }, + { + "epoch": 2.811983294979971, + "grad_norm": 72.45303279284956, + "learning_rate": 1.1927986461118147e-07, + "loss": 2.4535, + "step": 32993 + }, + { + "epoch": 2.8120685246739963, + "grad_norm": 55.38082502918245, + "learning_rate": 1.1917222814001817e-07, + "loss": 1.6846, + "step": 32994 + }, + { + "epoch": 2.8121537543680217, + "grad_norm": 57.84652121302719, + "learning_rate": 1.1906463966985638e-07, + "loss": 1.8079, + "step": 32995 + }, + { + "epoch": 2.812238984062047, + "grad_norm": 42.9432996444665, + "learning_rate": 1.1895709920175413e-07, + "loss": 1.1016, + "step": 32996 + }, + { + "epoch": 2.8123242137560727, + "grad_norm": 57.34285430527219, + "learning_rate": 1.1884960673676949e-07, + "loss": 1.5039, + "step": 32997 + }, + { + "epoch": 2.812409443450098, + "grad_norm": 50.79983025206823, + "learning_rate": 1.1874216227595936e-07, + "loss": 0.6335, + "step": 32998 + }, + { + "epoch": 2.8124946731441236, + "grad_norm": 72.89827148267848, + "learning_rate": 1.1863476582038014e-07, + "loss": 1.796, + "step": 32999 + }, + { + "epoch": 2.8125799028381486, + "grad_norm": 45.64707400118129, + "learning_rate": 1.1852741737108875e-07, + "loss": 1.6206, + "step": 33000 + }, + { + "epoch": 2.812665132532174, + "grad_norm": 52.28344639935393, + "learning_rate": 1.1842011692913991e-07, + "loss": 1.5719, + "step": 33001 + }, + { + "epoch": 2.8127503622261996, + "grad_norm": 61.381307239086695, + "learning_rate": 1.1831286449558998e-07, + "loss": 1.5138, + "step": 33002 + }, + { + "epoch": 2.812835591920225, + "grad_norm": 39.25852388951342, + "learning_rate": 1.1820566007149314e-07, + "loss": 1.0657, + "step": 33003 + }, + { + "epoch": 2.8129208216142505, + "grad_norm": 36.205228833061824, + "learning_rate": 1.1809850365790354e-07, + "loss": 1.1974, + "step": 33004 + }, + { + "epoch": 2.813006051308276, + "grad_norm": 45.81105360271845, + "learning_rate": 1.1799139525587533e-07, + "loss": 1.295, + "step": 33005 + }, + { + "epoch": 2.8130912810023014, + "grad_norm": 75.15594082501443, + "learning_rate": 1.1788433486646212e-07, + "loss": 1.7145, + "step": 33006 + }, + { + "epoch": 2.8131765106963265, + "grad_norm": 58.36195693597181, + "learning_rate": 1.1777732249071694e-07, + "loss": 1.299, + "step": 33007 + }, + { + "epoch": 2.813261740390352, + "grad_norm": 34.392489014143464, + "learning_rate": 1.1767035812969063e-07, + "loss": 0.9619, + "step": 33008 + }, + { + "epoch": 2.8133469700843774, + "grad_norm": 40.12256741104666, + "learning_rate": 1.1756344178443679e-07, + "loss": 1.1128, + "step": 33009 + }, + { + "epoch": 2.813432199778403, + "grad_norm": 56.20227610536551, + "learning_rate": 1.1745657345600681e-07, + "loss": 1.9208, + "step": 33010 + }, + { + "epoch": 2.8135174294724283, + "grad_norm": 61.99718152775243, + "learning_rate": 1.1734975314545094e-07, + "loss": 1.9748, + "step": 33011 + }, + { + "epoch": 2.8136026591664534, + "grad_norm": 60.394476900940056, + "learning_rate": 1.1724298085382002e-07, + "loss": 2.1139, + "step": 33012 + }, + { + "epoch": 2.813687888860479, + "grad_norm": 66.69337166240044, + "learning_rate": 1.1713625658216431e-07, + "loss": 1.9993, + "step": 33013 + }, + { + "epoch": 2.8137731185545043, + "grad_norm": 40.73740716822439, + "learning_rate": 1.1702958033153356e-07, + "loss": 0.9333, + "step": 33014 + }, + { + "epoch": 2.8138583482485298, + "grad_norm": 43.164506930060206, + "learning_rate": 1.1692295210297633e-07, + "loss": 1.0435, + "step": 33015 + }, + { + "epoch": 2.8139435779425552, + "grad_norm": 20.876270233787395, + "learning_rate": 1.1681637189754069e-07, + "loss": 0.6951, + "step": 33016 + }, + { + "epoch": 2.8140288076365807, + "grad_norm": 42.23519173336843, + "learning_rate": 1.1670983971627637e-07, + "loss": 1.1845, + "step": 33017 + }, + { + "epoch": 2.814114037330606, + "grad_norm": 87.12233792970873, + "learning_rate": 1.1660335556023028e-07, + "loss": 1.136, + "step": 33018 + }, + { + "epoch": 2.814199267024631, + "grad_norm": 50.1288537239062, + "learning_rate": 1.164969194304505e-07, + "loss": 1.4372, + "step": 33019 + }, + { + "epoch": 2.8142844967186567, + "grad_norm": 30.727252194916876, + "learning_rate": 1.1639053132798228e-07, + "loss": 0.7589, + "step": 33020 + }, + { + "epoch": 2.814369726412682, + "grad_norm": 58.70706234672355, + "learning_rate": 1.1628419125387258e-07, + "loss": 1.3687, + "step": 33021 + }, + { + "epoch": 2.8144549561067076, + "grad_norm": 92.2044333055512, + "learning_rate": 1.1617789920916722e-07, + "loss": 2.0156, + "step": 33022 + }, + { + "epoch": 2.814540185800733, + "grad_norm": 78.01063360708466, + "learning_rate": 1.1607165519491204e-07, + "loss": 1.9712, + "step": 33023 + }, + { + "epoch": 2.8146254154947585, + "grad_norm": 57.246622783002024, + "learning_rate": 1.1596545921215063e-07, + "loss": 1.8876, + "step": 33024 + }, + { + "epoch": 2.814710645188784, + "grad_norm": 69.49968039378726, + "learning_rate": 1.1585931126192884e-07, + "loss": 1.5145, + "step": 33025 + }, + { + "epoch": 2.814795874882809, + "grad_norm": 57.1639851438328, + "learning_rate": 1.1575321134528971e-07, + "loss": 1.5354, + "step": 33026 + }, + { + "epoch": 2.8148811045768345, + "grad_norm": 33.997843505607825, + "learning_rate": 1.1564715946327687e-07, + "loss": 0.9021, + "step": 33027 + }, + { + "epoch": 2.81496633427086, + "grad_norm": 38.02539753147941, + "learning_rate": 1.1554115561693391e-07, + "loss": 1.3388, + "step": 33028 + }, + { + "epoch": 2.8150515639648854, + "grad_norm": 24.66540157343916, + "learning_rate": 1.1543519980730278e-07, + "loss": 1.0412, + "step": 33029 + }, + { + "epoch": 2.815136793658911, + "grad_norm": 41.62293042894613, + "learning_rate": 1.1532929203542486e-07, + "loss": 1.7169, + "step": 33030 + }, + { + "epoch": 2.815222023352936, + "grad_norm": 30.09472409959588, + "learning_rate": 1.1522343230234267e-07, + "loss": 1.0745, + "step": 33031 + }, + { + "epoch": 2.815307253046962, + "grad_norm": 33.59074539147772, + "learning_rate": 1.1511762060909648e-07, + "loss": 1.0641, + "step": 33032 + }, + { + "epoch": 2.815392482740987, + "grad_norm": 30.776131187609415, + "learning_rate": 1.1501185695672823e-07, + "loss": 1.1055, + "step": 33033 + }, + { + "epoch": 2.8154777124350123, + "grad_norm": 61.231439493697145, + "learning_rate": 1.1490614134627654e-07, + "loss": 1.7037, + "step": 33034 + }, + { + "epoch": 2.815562942129038, + "grad_norm": 23.575030221079523, + "learning_rate": 1.1480047377878278e-07, + "loss": 0.5848, + "step": 33035 + }, + { + "epoch": 2.8156481718230633, + "grad_norm": 54.10257593362635, + "learning_rate": 1.1469485425528448e-07, + "loss": 1.2123, + "step": 33036 + }, + { + "epoch": 2.8157334015170887, + "grad_norm": 48.40158617948443, + "learning_rate": 1.1458928277682135e-07, + "loss": 0.989, + "step": 33037 + }, + { + "epoch": 2.8158186312111138, + "grad_norm": 75.93218062186976, + "learning_rate": 1.1448375934443145e-07, + "loss": 2.2998, + "step": 33038 + }, + { + "epoch": 2.8159038609051392, + "grad_norm": 63.88865300560579, + "learning_rate": 1.1437828395915229e-07, + "loss": 1.8166, + "step": 33039 + }, + { + "epoch": 2.8159890905991647, + "grad_norm": 64.50182824186385, + "learning_rate": 1.1427285662202082e-07, + "loss": 1.2943, + "step": 33040 + }, + { + "epoch": 2.81607432029319, + "grad_norm": 47.787838423841364, + "learning_rate": 1.1416747733407508e-07, + "loss": 1.4098, + "step": 33041 + }, + { + "epoch": 2.8161595499872156, + "grad_norm": 27.700864765873146, + "learning_rate": 1.1406214609635092e-07, + "loss": 1.0965, + "step": 33042 + }, + { + "epoch": 2.816244779681241, + "grad_norm": 21.69158401036553, + "learning_rate": 1.1395686290988306e-07, + "loss": 0.7347, + "step": 33043 + }, + { + "epoch": 2.8163300093752666, + "grad_norm": 33.02366355600586, + "learning_rate": 1.1385162777570902e-07, + "loss": 1.0587, + "step": 33044 + }, + { + "epoch": 2.8164152390692916, + "grad_norm": 30.958728166634337, + "learning_rate": 1.137464406948624e-07, + "loss": 0.7267, + "step": 33045 + }, + { + "epoch": 2.816500468763317, + "grad_norm": 39.139630993799514, + "learning_rate": 1.1364130166837795e-07, + "loss": 1.3961, + "step": 33046 + }, + { + "epoch": 2.8165856984573425, + "grad_norm": 64.52287869713281, + "learning_rate": 1.135362106972887e-07, + "loss": 1.0971, + "step": 33047 + }, + { + "epoch": 2.816670928151368, + "grad_norm": 47.183377022345326, + "learning_rate": 1.1343116778262996e-07, + "loss": 1.5926, + "step": 33048 + }, + { + "epoch": 2.8167561578453935, + "grad_norm": 76.32566479219116, + "learning_rate": 1.1332617292543369e-07, + "loss": 1.6759, + "step": 33049 + }, + { + "epoch": 2.8168413875394185, + "grad_norm": 36.31687817025377, + "learning_rate": 1.1322122612673347e-07, + "loss": 1.3859, + "step": 33050 + }, + { + "epoch": 2.8169266172334444, + "grad_norm": 57.322874794898645, + "learning_rate": 1.1311632738755962e-07, + "loss": 1.534, + "step": 33051 + }, + { + "epoch": 2.8170118469274694, + "grad_norm": 59.55196089166976, + "learning_rate": 1.1301147670894574e-07, + "loss": 1.0677, + "step": 33052 + }, + { + "epoch": 2.817097076621495, + "grad_norm": 35.10815755930866, + "learning_rate": 1.1290667409192158e-07, + "loss": 1.0094, + "step": 33053 + }, + { + "epoch": 2.8171823063155204, + "grad_norm": 49.08690965895288, + "learning_rate": 1.1280191953751851e-07, + "loss": 1.2353, + "step": 33054 + }, + { + "epoch": 2.817267536009546, + "grad_norm": 32.33637448531164, + "learning_rate": 1.1269721304676629e-07, + "loss": 0.9906, + "step": 33055 + }, + { + "epoch": 2.8173527657035713, + "grad_norm": 59.347418737373545, + "learning_rate": 1.1259255462069519e-07, + "loss": 1.5563, + "step": 33056 + }, + { + "epoch": 2.8174379953975963, + "grad_norm": 86.25008310311004, + "learning_rate": 1.1248794426033438e-07, + "loss": 2.1161, + "step": 33057 + }, + { + "epoch": 2.817523225091622, + "grad_norm": 60.770496372251785, + "learning_rate": 1.123833819667125e-07, + "loss": 1.577, + "step": 33058 + }, + { + "epoch": 2.8176084547856473, + "grad_norm": 16.618570270056882, + "learning_rate": 1.1227886774085761e-07, + "loss": 0.6047, + "step": 33059 + }, + { + "epoch": 2.8176936844796727, + "grad_norm": 24.236955714939906, + "learning_rate": 1.1217440158379777e-07, + "loss": 0.9479, + "step": 33060 + }, + { + "epoch": 2.817778914173698, + "grad_norm": 81.66722355906617, + "learning_rate": 1.1206998349656106e-07, + "loss": 1.4516, + "step": 33061 + }, + { + "epoch": 2.8178641438677237, + "grad_norm": 49.57988221855908, + "learning_rate": 1.1196561348017387e-07, + "loss": 1.2294, + "step": 33062 + }, + { + "epoch": 2.817949373561749, + "grad_norm": 52.41914069835725, + "learning_rate": 1.118612915356615e-07, + "loss": 1.4951, + "step": 33063 + }, + { + "epoch": 2.818034603255774, + "grad_norm": 56.24894627889167, + "learning_rate": 1.1175701766405145e-07, + "loss": 1.7867, + "step": 33064 + }, + { + "epoch": 2.8181198329497996, + "grad_norm": 31.225350851655264, + "learning_rate": 1.1165279186636901e-07, + "loss": 1.0962, + "step": 33065 + }, + { + "epoch": 2.818205062643825, + "grad_norm": 36.90525828679013, + "learning_rate": 1.1154861414363894e-07, + "loss": 1.3349, + "step": 33066 + }, + { + "epoch": 2.8182902923378506, + "grad_norm": 88.44770855427615, + "learning_rate": 1.1144448449688594e-07, + "loss": 1.9409, + "step": 33067 + }, + { + "epoch": 2.818375522031876, + "grad_norm": 68.24480065678206, + "learning_rate": 1.1134040292713366e-07, + "loss": 1.6234, + "step": 33068 + }, + { + "epoch": 2.818460751725901, + "grad_norm": 29.374262488892107, + "learning_rate": 1.1123636943540573e-07, + "loss": 0.7926, + "step": 33069 + }, + { + "epoch": 2.818545981419927, + "grad_norm": 61.94387111472493, + "learning_rate": 1.1113238402272574e-07, + "loss": 1.4288, + "step": 33070 + }, + { + "epoch": 2.818631211113952, + "grad_norm": 74.26174845113975, + "learning_rate": 1.1102844669011625e-07, + "loss": 1.2296, + "step": 33071 + }, + { + "epoch": 2.8187164408079775, + "grad_norm": 78.667212987667, + "learning_rate": 1.109245574385992e-07, + "loss": 1.9536, + "step": 33072 + }, + { + "epoch": 2.818801670502003, + "grad_norm": 45.91945066655647, + "learning_rate": 1.1082071626919599e-07, + "loss": 1.5283, + "step": 33073 + }, + { + "epoch": 2.8188869001960284, + "grad_norm": 22.50898555747113, + "learning_rate": 1.107169231829286e-07, + "loss": 0.6242, + "step": 33074 + }, + { + "epoch": 2.818972129890054, + "grad_norm": 31.13328035937042, + "learning_rate": 1.1061317818081785e-07, + "loss": 0.9748, + "step": 33075 + }, + { + "epoch": 2.819057359584079, + "grad_norm": 24.617168390027413, + "learning_rate": 1.105094812638835e-07, + "loss": 0.6791, + "step": 33076 + }, + { + "epoch": 2.8191425892781043, + "grad_norm": 65.91051311966254, + "learning_rate": 1.1040583243314474e-07, + "loss": 1.8428, + "step": 33077 + }, + { + "epoch": 2.81922781897213, + "grad_norm": 49.04865741948707, + "learning_rate": 1.1030223168962239e-07, + "loss": 1.1079, + "step": 33078 + }, + { + "epoch": 2.8193130486661553, + "grad_norm": 55.36549265036446, + "learning_rate": 1.1019867903433401e-07, + "loss": 1.495, + "step": 33079 + }, + { + "epoch": 2.8193982783601808, + "grad_norm": 40.58518282058415, + "learning_rate": 1.1009517446829932e-07, + "loss": 1.0482, + "step": 33080 + }, + { + "epoch": 2.8194835080542062, + "grad_norm": 39.92764006897829, + "learning_rate": 1.0999171799253528e-07, + "loss": 0.5904, + "step": 33081 + }, + { + "epoch": 2.8195687377482317, + "grad_norm": 49.144860364337, + "learning_rate": 1.098883096080594e-07, + "loss": 1.3539, + "step": 33082 + }, + { + "epoch": 2.8196539674422567, + "grad_norm": 59.91847721122574, + "learning_rate": 1.0978494931588924e-07, + "loss": 1.5379, + "step": 33083 + }, + { + "epoch": 2.819739197136282, + "grad_norm": 58.87347314767605, + "learning_rate": 1.0968163711704061e-07, + "loss": 1.512, + "step": 33084 + }, + { + "epoch": 2.8198244268303077, + "grad_norm": 34.75119169268351, + "learning_rate": 1.0957837301252994e-07, + "loss": 1.3065, + "step": 33085 + }, + { + "epoch": 2.819909656524333, + "grad_norm": 41.84925809988869, + "learning_rate": 1.0947515700337252e-07, + "loss": 0.8167, + "step": 33086 + }, + { + "epoch": 2.8199948862183586, + "grad_norm": 85.59908996213971, + "learning_rate": 1.0937198909058366e-07, + "loss": 2.236, + "step": 33087 + }, + { + "epoch": 2.8200801159123836, + "grad_norm": 59.87395165332228, + "learning_rate": 1.0926886927517811e-07, + "loss": 1.5863, + "step": 33088 + }, + { + "epoch": 2.8201653456064095, + "grad_norm": 80.815548924665, + "learning_rate": 1.0916579755817003e-07, + "loss": 2.2595, + "step": 33089 + }, + { + "epoch": 2.8202505753004345, + "grad_norm": 61.463254288536916, + "learning_rate": 1.0906277394057252e-07, + "loss": 1.4931, + "step": 33090 + }, + { + "epoch": 2.82033580499446, + "grad_norm": 54.374039896097706, + "learning_rate": 1.0895979842339977e-07, + "loss": 1.425, + "step": 33091 + }, + { + "epoch": 2.8204210346884855, + "grad_norm": 54.519183312404856, + "learning_rate": 1.0885687100766318e-07, + "loss": 2.1377, + "step": 33092 + }, + { + "epoch": 2.820506264382511, + "grad_norm": 38.27389453004994, + "learning_rate": 1.087539916943764e-07, + "loss": 1.1679, + "step": 33093 + }, + { + "epoch": 2.8205914940765364, + "grad_norm": 49.54112457146369, + "learning_rate": 1.086511604845497e-07, + "loss": 1.0007, + "step": 33094 + }, + { + "epoch": 2.8206767237705614, + "grad_norm": 40.8792733520906, + "learning_rate": 1.0854837737919565e-07, + "loss": 1.1702, + "step": 33095 + }, + { + "epoch": 2.820761953464587, + "grad_norm": 22.375817310287236, + "learning_rate": 1.0844564237932453e-07, + "loss": 0.5322, + "step": 33096 + }, + { + "epoch": 2.8208471831586124, + "grad_norm": 33.88589328644041, + "learning_rate": 1.083429554859472e-07, + "loss": 1.3464, + "step": 33097 + }, + { + "epoch": 2.820932412852638, + "grad_norm": 67.93912979703805, + "learning_rate": 1.0824031670007284e-07, + "loss": 1.8929, + "step": 33098 + }, + { + "epoch": 2.8210176425466633, + "grad_norm": 67.83507165345189, + "learning_rate": 1.0813772602271067e-07, + "loss": 1.6137, + "step": 33099 + }, + { + "epoch": 2.821102872240689, + "grad_norm": 28.05931989051182, + "learning_rate": 1.0803518345487096e-07, + "loss": 1.2413, + "step": 33100 + }, + { + "epoch": 2.8211881019347143, + "grad_norm": 49.71776968397753, + "learning_rate": 1.0793268899756126e-07, + "loss": 1.1857, + "step": 33101 + }, + { + "epoch": 2.8212733316287393, + "grad_norm": 45.07011614999822, + "learning_rate": 1.078302426517891e-07, + "loss": 1.1779, + "step": 33102 + }, + { + "epoch": 2.8213585613227647, + "grad_norm": 61.804117982785726, + "learning_rate": 1.0772784441856255e-07, + "loss": 1.5177, + "step": 33103 + }, + { + "epoch": 2.82144379101679, + "grad_norm": 53.17374828806536, + "learning_rate": 1.0762549429888913e-07, + "loss": 1.144, + "step": 33104 + }, + { + "epoch": 2.8215290207108157, + "grad_norm": 64.36042910618082, + "learning_rate": 1.0752319229377473e-07, + "loss": 1.3933, + "step": 33105 + }, + { + "epoch": 2.821614250404841, + "grad_norm": 52.85114743368239, + "learning_rate": 1.0742093840422574e-07, + "loss": 1.8534, + "step": 33106 + }, + { + "epoch": 2.821699480098866, + "grad_norm": 32.52328828170399, + "learning_rate": 1.0731873263124748e-07, + "loss": 1.0624, + "step": 33107 + }, + { + "epoch": 2.821784709792892, + "grad_norm": 28.532904198593265, + "learning_rate": 1.0721657497584582e-07, + "loss": 0.8123, + "step": 33108 + }, + { + "epoch": 2.821869939486917, + "grad_norm": 72.16106915015102, + "learning_rate": 1.0711446543902493e-07, + "loss": 1.9991, + "step": 33109 + }, + { + "epoch": 2.8219551691809426, + "grad_norm": 82.12252636940077, + "learning_rate": 1.0701240402178791e-07, + "loss": 1.408, + "step": 33110 + }, + { + "epoch": 2.822040398874968, + "grad_norm": 62.04325162837873, + "learning_rate": 1.0691039072514064e-07, + "loss": 1.4317, + "step": 33111 + }, + { + "epoch": 2.8221256285689935, + "grad_norm": 49.17290624744381, + "learning_rate": 1.0680842555008508e-07, + "loss": 1.5187, + "step": 33112 + }, + { + "epoch": 2.822210858263019, + "grad_norm": 77.43231572473663, + "learning_rate": 1.0670650849762432e-07, + "loss": 1.1148, + "step": 33113 + }, + { + "epoch": 2.822296087957044, + "grad_norm": 21.29191832842061, + "learning_rate": 1.0660463956876089e-07, + "loss": 0.5831, + "step": 33114 + }, + { + "epoch": 2.8223813176510695, + "grad_norm": 21.34367508371551, + "learning_rate": 1.0650281876449619e-07, + "loss": 0.6984, + "step": 33115 + }, + { + "epoch": 2.822466547345095, + "grad_norm": 61.95101040288438, + "learning_rate": 1.0640104608583168e-07, + "loss": 1.6972, + "step": 33116 + }, + { + "epoch": 2.8225517770391204, + "grad_norm": 52.13422180518063, + "learning_rate": 1.0629932153376876e-07, + "loss": 1.7165, + "step": 33117 + }, + { + "epoch": 2.822637006733146, + "grad_norm": 38.13587397238921, + "learning_rate": 1.0619764510930663e-07, + "loss": 1.1426, + "step": 33118 + }, + { + "epoch": 2.8227222364271713, + "grad_norm": 37.1646074448038, + "learning_rate": 1.0609601681344672e-07, + "loss": 0.7055, + "step": 33119 + }, + { + "epoch": 2.822807466121197, + "grad_norm": 30.679037286052285, + "learning_rate": 1.0599443664718711e-07, + "loss": 0.8631, + "step": 33120 + }, + { + "epoch": 2.822892695815222, + "grad_norm": 59.187700072043484, + "learning_rate": 1.0589290461152812e-07, + "loss": 1.8448, + "step": 33121 + }, + { + "epoch": 2.8229779255092473, + "grad_norm": 43.44075794560508, + "learning_rate": 1.0579142070746784e-07, + "loss": 1.0939, + "step": 33122 + }, + { + "epoch": 2.823063155203273, + "grad_norm": 31.032482329414062, + "learning_rate": 1.0568998493600436e-07, + "loss": 1.0258, + "step": 33123 + }, + { + "epoch": 2.8231483848972982, + "grad_norm": 58.228240734301515, + "learning_rate": 1.0558859729813409e-07, + "loss": 1.4258, + "step": 33124 + }, + { + "epoch": 2.8232336145913237, + "grad_norm": 76.94570111355262, + "learning_rate": 1.0548725779485625e-07, + "loss": 1.332, + "step": 33125 + }, + { + "epoch": 2.8233188442853487, + "grad_norm": 54.79221335000499, + "learning_rate": 1.0538596642716558e-07, + "loss": 1.5518, + "step": 33126 + }, + { + "epoch": 2.8234040739793747, + "grad_norm": 85.03421956691766, + "learning_rate": 1.052847231960591e-07, + "loss": 2.3775, + "step": 33127 + }, + { + "epoch": 2.8234893036733997, + "grad_norm": 40.293641017181116, + "learning_rate": 1.0518352810253263e-07, + "loss": 1.2907, + "step": 33128 + }, + { + "epoch": 2.823574533367425, + "grad_norm": 37.89413457332628, + "learning_rate": 1.0508238114758095e-07, + "loss": 0.8485, + "step": 33129 + }, + { + "epoch": 2.8236597630614506, + "grad_norm": 51.29995102683286, + "learning_rate": 1.049812823321994e-07, + "loss": 1.5726, + "step": 33130 + }, + { + "epoch": 2.823744992755476, + "grad_norm": 43.84052804439095, + "learning_rate": 1.048802316573816e-07, + "loss": 0.9816, + "step": 33131 + }, + { + "epoch": 2.8238302224495015, + "grad_norm": 34.172405562108196, + "learning_rate": 1.0477922912412174e-07, + "loss": 1.2408, + "step": 33132 + }, + { + "epoch": 2.8239154521435266, + "grad_norm": 39.016804807135436, + "learning_rate": 1.046782747334124e-07, + "loss": 0.7072, + "step": 33133 + }, + { + "epoch": 2.824000681837552, + "grad_norm": 49.36822102091164, + "learning_rate": 1.0457736848624723e-07, + "loss": 1.2398, + "step": 33134 + }, + { + "epoch": 2.8240859115315775, + "grad_norm": 104.2402273309572, + "learning_rate": 1.0447651038361872e-07, + "loss": 2.6362, + "step": 33135 + }, + { + "epoch": 2.824171141225603, + "grad_norm": 78.58133857841341, + "learning_rate": 1.0437570042651835e-07, + "loss": 2.1274, + "step": 33136 + }, + { + "epoch": 2.8242563709196284, + "grad_norm": 28.21436158290817, + "learning_rate": 1.0427493861593696e-07, + "loss": 0.964, + "step": 33137 + }, + { + "epoch": 2.824341600613654, + "grad_norm": 54.940557401919705, + "learning_rate": 1.0417422495286712e-07, + "loss": 1.4484, + "step": 33138 + }, + { + "epoch": 2.8244268303076794, + "grad_norm": 45.823545816325094, + "learning_rate": 1.04073559438298e-07, + "loss": 1.6233, + "step": 33139 + }, + { + "epoch": 2.8245120600017044, + "grad_norm": 61.84844318262533, + "learning_rate": 1.0397294207322051e-07, + "loss": 1.8982, + "step": 33140 + }, + { + "epoch": 2.82459728969573, + "grad_norm": 40.74347696911114, + "learning_rate": 1.0387237285862273e-07, + "loss": 0.9578, + "step": 33141 + }, + { + "epoch": 2.8246825193897553, + "grad_norm": 64.55946832121256, + "learning_rate": 1.0377185179549443e-07, + "loss": 1.6534, + "step": 33142 + }, + { + "epoch": 2.824767749083781, + "grad_norm": 31.775105146835028, + "learning_rate": 1.0367137888482537e-07, + "loss": 0.7617, + "step": 33143 + }, + { + "epoch": 2.8248529787778063, + "grad_norm": 45.790119963069756, + "learning_rate": 1.0357095412760254e-07, + "loss": 1.4016, + "step": 33144 + }, + { + "epoch": 2.8249382084718317, + "grad_norm": 32.6416658787481, + "learning_rate": 1.0347057752481349e-07, + "loss": 0.7712, + "step": 33145 + }, + { + "epoch": 2.825023438165857, + "grad_norm": 30.534095436287945, + "learning_rate": 1.0337024907744575e-07, + "loss": 0.822, + "step": 33146 + }, + { + "epoch": 2.8251086678598822, + "grad_norm": 53.61027198424055, + "learning_rate": 1.0326996878648576e-07, + "loss": 1.0227, + "step": 33147 + }, + { + "epoch": 2.8251938975539077, + "grad_norm": 80.11555266991637, + "learning_rate": 1.0316973665291997e-07, + "loss": 1.9914, + "step": 33148 + }, + { + "epoch": 2.825279127247933, + "grad_norm": 58.26704445286284, + "learning_rate": 1.0306955267773366e-07, + "loss": 0.9921, + "step": 33149 + }, + { + "epoch": 2.8253643569419586, + "grad_norm": 67.57663271236892, + "learning_rate": 1.029694168619122e-07, + "loss": 1.7473, + "step": 33150 + }, + { + "epoch": 2.825449586635984, + "grad_norm": 47.21002322989734, + "learning_rate": 1.0286932920644143e-07, + "loss": 1.4758, + "step": 33151 + }, + { + "epoch": 2.825534816330009, + "grad_norm": 87.42212713951993, + "learning_rate": 1.0276928971230504e-07, + "loss": 2.1797, + "step": 33152 + }, + { + "epoch": 2.825620046024035, + "grad_norm": 78.62676854659459, + "learning_rate": 1.0266929838048611e-07, + "loss": 1.632, + "step": 33153 + }, + { + "epoch": 2.82570527571806, + "grad_norm": 56.067216158243596, + "learning_rate": 1.0256935521196887e-07, + "loss": 1.3915, + "step": 33154 + }, + { + "epoch": 2.8257905054120855, + "grad_norm": 27.799315448886166, + "learning_rate": 1.0246946020773585e-07, + "loss": 0.6363, + "step": 33155 + }, + { + "epoch": 2.825875735106111, + "grad_norm": 77.16475682264203, + "learning_rate": 1.023696133687696e-07, + "loss": 1.901, + "step": 33156 + }, + { + "epoch": 2.8259609648001365, + "grad_norm": 53.68498365773865, + "learning_rate": 1.0226981469605213e-07, + "loss": 1.2576, + "step": 33157 + }, + { + "epoch": 2.826046194494162, + "grad_norm": 102.57680375629789, + "learning_rate": 1.0217006419056485e-07, + "loss": 2.3601, + "step": 33158 + }, + { + "epoch": 2.826131424188187, + "grad_norm": 41.35263869280337, + "learning_rate": 1.0207036185328867e-07, + "loss": 1.0454, + "step": 33159 + }, + { + "epoch": 2.8262166538822124, + "grad_norm": 51.450083073816856, + "learning_rate": 1.0197070768520445e-07, + "loss": 1.4425, + "step": 33160 + }, + { + "epoch": 2.826301883576238, + "grad_norm": 45.99501487753904, + "learning_rate": 1.0187110168729198e-07, + "loss": 1.4182, + "step": 33161 + }, + { + "epoch": 2.8263871132702634, + "grad_norm": 62.80498122642701, + "learning_rate": 1.01771543860531e-07, + "loss": 1.4089, + "step": 33162 + }, + { + "epoch": 2.826472342964289, + "grad_norm": 41.94745846197605, + "learning_rate": 1.0167203420590021e-07, + "loss": 1.235, + "step": 33163 + }, + { + "epoch": 2.8265575726583143, + "grad_norm": 37.09200644687947, + "learning_rate": 1.0157257272437882e-07, + "loss": 0.809, + "step": 33164 + }, + { + "epoch": 2.82664280235234, + "grad_norm": 66.03193710983182, + "learning_rate": 1.0147315941694435e-07, + "loss": 1.3815, + "step": 33165 + }, + { + "epoch": 2.826728032046365, + "grad_norm": 55.8683061392652, + "learning_rate": 1.0137379428457495e-07, + "loss": 1.0044, + "step": 33166 + }, + { + "epoch": 2.8268132617403903, + "grad_norm": 52.41995466917005, + "learning_rate": 1.0127447732824758e-07, + "loss": 1.1996, + "step": 33167 + }, + { + "epoch": 2.8268984914344157, + "grad_norm": 48.866481375242024, + "learning_rate": 1.0117520854893981e-07, + "loss": 1.6414, + "step": 33168 + }, + { + "epoch": 2.826983721128441, + "grad_norm": 71.18152412291559, + "learning_rate": 1.0107598794762752e-07, + "loss": 1.1359, + "step": 33169 + }, + { + "epoch": 2.8270689508224667, + "grad_norm": 80.10363718647469, + "learning_rate": 1.0097681552528549e-07, + "loss": 1.6608, + "step": 33170 + }, + { + "epoch": 2.8271541805164917, + "grad_norm": 73.23960126578143, + "learning_rate": 1.0087769128289016e-07, + "loss": 1.3646, + "step": 33171 + }, + { + "epoch": 2.8272394102105176, + "grad_norm": 36.6313957945858, + "learning_rate": 1.0077861522141575e-07, + "loss": 0.8436, + "step": 33172 + }, + { + "epoch": 2.8273246399045426, + "grad_norm": 24.84138624228086, + "learning_rate": 1.0067958734183703e-07, + "loss": 0.7965, + "step": 33173 + }, + { + "epoch": 2.827409869598568, + "grad_norm": 49.59364540774316, + "learning_rate": 1.0058060764512767e-07, + "loss": 1.2254, + "step": 33174 + }, + { + "epoch": 2.8274950992925936, + "grad_norm": 16.61983776483737, + "learning_rate": 1.0048167613226134e-07, + "loss": 0.5769, + "step": 33175 + }, + { + "epoch": 2.827580328986619, + "grad_norm": 33.74541614822654, + "learning_rate": 1.003827928042106e-07, + "loss": 0.8356, + "step": 33176 + }, + { + "epoch": 2.8276655586806445, + "grad_norm": 71.39716043433648, + "learning_rate": 1.0028395766194854e-07, + "loss": 1.855, + "step": 33177 + }, + { + "epoch": 2.8277507883746695, + "grad_norm": 33.127097674144274, + "learning_rate": 1.0018517070644662e-07, + "loss": 1.0053, + "step": 33178 + }, + { + "epoch": 2.827836018068695, + "grad_norm": 87.44584395886947, + "learning_rate": 1.0008643193867629e-07, + "loss": 1.9821, + "step": 33179 + }, + { + "epoch": 2.8279212477627205, + "grad_norm": 70.88081914485208, + "learning_rate": 9.998774135960954e-08, + "loss": 1.648, + "step": 33180 + }, + { + "epoch": 2.828006477456746, + "grad_norm": 59.78215159964689, + "learning_rate": 9.988909897021559e-08, + "loss": 0.9279, + "step": 33181 + }, + { + "epoch": 2.8280917071507714, + "grad_norm": 32.20834793611908, + "learning_rate": 9.97905047714659e-08, + "loss": 0.908, + "step": 33182 + }, + { + "epoch": 2.828176936844797, + "grad_norm": 80.02240891752437, + "learning_rate": 9.969195876432914e-08, + "loss": 1.8269, + "step": 33183 + }, + { + "epoch": 2.8282621665388223, + "grad_norm": 13.980548001680324, + "learning_rate": 9.959346094977451e-08, + "loss": 0.6342, + "step": 33184 + }, + { + "epoch": 2.8283473962328474, + "grad_norm": 29.314623099967612, + "learning_rate": 9.949501132877126e-08, + "loss": 0.8681, + "step": 33185 + }, + { + "epoch": 2.828432625926873, + "grad_norm": 44.07501967184331, + "learning_rate": 9.93966099022875e-08, + "loss": 0.8695, + "step": 33186 + }, + { + "epoch": 2.8285178556208983, + "grad_norm": 73.5380018866734, + "learning_rate": 9.929825667129023e-08, + "loss": 1.9434, + "step": 33187 + }, + { + "epoch": 2.8286030853149238, + "grad_norm": 33.08100293893417, + "learning_rate": 9.919995163674756e-08, + "loss": 1.2379, + "step": 33188 + }, + { + "epoch": 2.8286883150089492, + "grad_norm": 76.66304347668242, + "learning_rate": 9.91016947996254e-08, + "loss": 1.8032, + "step": 33189 + }, + { + "epoch": 2.8287735447029743, + "grad_norm": 22.180813644316217, + "learning_rate": 9.900348616089128e-08, + "loss": 0.5736, + "step": 33190 + }, + { + "epoch": 2.828858774397, + "grad_norm": 21.484200926317683, + "learning_rate": 9.890532572151056e-08, + "loss": 0.814, + "step": 33191 + }, + { + "epoch": 2.828944004091025, + "grad_norm": 52.085274003581375, + "learning_rate": 9.880721348244859e-08, + "loss": 1.6616, + "step": 33192 + }, + { + "epoch": 2.8290292337850507, + "grad_norm": 48.15072024367599, + "learning_rate": 9.870914944466903e-08, + "loss": 1.5626, + "step": 33193 + }, + { + "epoch": 2.829114463479076, + "grad_norm": 65.75584536200387, + "learning_rate": 9.861113360913832e-08, + "loss": 1.3592, + "step": 33194 + }, + { + "epoch": 2.8291996931731016, + "grad_norm": 55.50890941830528, + "learning_rate": 9.851316597681959e-08, + "loss": 1.0503, + "step": 33195 + }, + { + "epoch": 2.829284922867127, + "grad_norm": 41.27254728319242, + "learning_rate": 9.841524654867596e-08, + "loss": 1.0531, + "step": 33196 + }, + { + "epoch": 2.829370152561152, + "grad_norm": 71.4333103922073, + "learning_rate": 9.831737532566998e-08, + "loss": 1.6116, + "step": 33197 + }, + { + "epoch": 2.8294553822551776, + "grad_norm": 32.089041532955015, + "learning_rate": 9.82195523087659e-08, + "loss": 0.7725, + "step": 33198 + }, + { + "epoch": 2.829540611949203, + "grad_norm": 51.47066646542839, + "learning_rate": 9.81217774989246e-08, + "loss": 1.098, + "step": 33199 + }, + { + "epoch": 2.8296258416432285, + "grad_norm": 32.16431101482888, + "learning_rate": 9.802405089710754e-08, + "loss": 1.5158, + "step": 33200 + }, + { + "epoch": 2.829711071337254, + "grad_norm": 63.168163208258676, + "learning_rate": 9.792637250427617e-08, + "loss": 1.4266, + "step": 33201 + }, + { + "epoch": 2.8297963010312794, + "grad_norm": 89.24540660904484, + "learning_rate": 9.782874232139084e-08, + "loss": 1.9692, + "step": 33202 + }, + { + "epoch": 2.829881530725305, + "grad_norm": 76.26240853989754, + "learning_rate": 9.773116034941243e-08, + "loss": 2.1023, + "step": 33203 + }, + { + "epoch": 2.82996676041933, + "grad_norm": 54.219259247652985, + "learning_rate": 9.763362658929909e-08, + "loss": 1.3011, + "step": 33204 + }, + { + "epoch": 2.8300519901133554, + "grad_norm": 52.46660516759684, + "learning_rate": 9.75361410420117e-08, + "loss": 1.158, + "step": 33205 + }, + { + "epoch": 2.830137219807381, + "grad_norm": 43.340182333818944, + "learning_rate": 9.743870370850838e-08, + "loss": 1.4817, + "step": 33206 + }, + { + "epoch": 2.8302224495014063, + "grad_norm": 43.604095644298916, + "learning_rate": 9.734131458974727e-08, + "loss": 1.3038, + "step": 33207 + }, + { + "epoch": 2.830307679195432, + "grad_norm": 30.339177080682404, + "learning_rate": 9.724397368668592e-08, + "loss": 0.7681, + "step": 33208 + }, + { + "epoch": 2.830392908889457, + "grad_norm": 47.946333981460654, + "learning_rate": 9.714668100028246e-08, + "loss": 1.3556, + "step": 33209 + }, + { + "epoch": 2.8304781385834827, + "grad_norm": 42.47384451026807, + "learning_rate": 9.70494365314928e-08, + "loss": 1.4064, + "step": 33210 + }, + { + "epoch": 2.8305633682775078, + "grad_norm": 28.09246928993054, + "learning_rate": 9.695224028127393e-08, + "loss": 0.9791, + "step": 33211 + }, + { + "epoch": 2.8306485979715332, + "grad_norm": 40.33533967416566, + "learning_rate": 9.685509225058065e-08, + "loss": 1.345, + "step": 33212 + }, + { + "epoch": 2.8307338276655587, + "grad_norm": 37.17716093012574, + "learning_rate": 9.675799244036999e-08, + "loss": 1.0127, + "step": 33213 + }, + { + "epoch": 2.830819057359584, + "grad_norm": 63.85902601544281, + "learning_rate": 9.666094085159617e-08, + "loss": 1.7845, + "step": 33214 + }, + { + "epoch": 2.8309042870536096, + "grad_norm": 73.06025277638321, + "learning_rate": 9.656393748521343e-08, + "loss": 1.6961, + "step": 33215 + }, + { + "epoch": 2.8309895167476347, + "grad_norm": 45.6317723020432, + "learning_rate": 9.646698234217599e-08, + "loss": 1.1234, + "step": 33216 + }, + { + "epoch": 2.83107474644166, + "grad_norm": 60.104210203160434, + "learning_rate": 9.637007542343757e-08, + "loss": 1.8878, + "step": 33217 + }, + { + "epoch": 2.8311599761356856, + "grad_norm": 67.2829629024859, + "learning_rate": 9.627321672995016e-08, + "loss": 1.6701, + "step": 33218 + }, + { + "epoch": 2.831245205829711, + "grad_norm": 24.192730025324884, + "learning_rate": 9.6176406262668e-08, + "loss": 0.8087, + "step": 33219 + }, + { + "epoch": 2.8313304355237365, + "grad_norm": 43.30841570848549, + "learning_rate": 9.607964402254144e-08, + "loss": 1.9132, + "step": 33220 + }, + { + "epoch": 2.831415665217762, + "grad_norm": 48.13905674467095, + "learning_rate": 9.598293001052361e-08, + "loss": 1.2636, + "step": 33221 + }, + { + "epoch": 2.8315008949117875, + "grad_norm": 55.713193782789965, + "learning_rate": 9.588626422756486e-08, + "loss": 1.3831, + "step": 33222 + }, + { + "epoch": 2.8315861246058125, + "grad_norm": 28.85456222953655, + "learning_rate": 9.578964667461554e-08, + "loss": 0.9016, + "step": 33223 + }, + { + "epoch": 2.831671354299838, + "grad_norm": 23.890339150755235, + "learning_rate": 9.569307735262712e-08, + "loss": 0.7859, + "step": 33224 + }, + { + "epoch": 2.8317565839938634, + "grad_norm": 48.84717376697891, + "learning_rate": 9.559655626254827e-08, + "loss": 0.9373, + "step": 33225 + }, + { + "epoch": 2.831841813687889, + "grad_norm": 30.233400618606492, + "learning_rate": 9.550008340532768e-08, + "loss": 0.8784, + "step": 33226 + }, + { + "epoch": 2.8319270433819144, + "grad_norm": 68.26255149902224, + "learning_rate": 9.54036587819157e-08, + "loss": 1.6636, + "step": 33227 + }, + { + "epoch": 2.8320122730759394, + "grad_norm": 63.78293625246965, + "learning_rate": 9.530728239325937e-08, + "loss": 1.0331, + "step": 33228 + }, + { + "epoch": 2.8320975027699653, + "grad_norm": 60.23215200666138, + "learning_rate": 9.52109542403079e-08, + "loss": 1.78, + "step": 33229 + }, + { + "epoch": 2.8321827324639903, + "grad_norm": 38.63701993306785, + "learning_rate": 9.511467432400723e-08, + "loss": 1.3525, + "step": 33230 + }, + { + "epoch": 2.832267962158016, + "grad_norm": 54.726137751625835, + "learning_rate": 9.501844264530435e-08, + "loss": 1.2425, + "step": 33231 + }, + { + "epoch": 2.8323531918520413, + "grad_norm": 52.33163973044807, + "learning_rate": 9.49222592051463e-08, + "loss": 1.455, + "step": 33232 + }, + { + "epoch": 2.8324384215460667, + "grad_norm": 40.98197712729694, + "learning_rate": 9.482612400447844e-08, + "loss": 1.3724, + "step": 33233 + }, + { + "epoch": 2.832523651240092, + "grad_norm": 23.283890904242813, + "learning_rate": 9.473003704424722e-08, + "loss": 0.84, + "step": 33234 + }, + { + "epoch": 2.8326088809341172, + "grad_norm": 57.616789749910154, + "learning_rate": 9.463399832539633e-08, + "loss": 1.2049, + "step": 33235 + }, + { + "epoch": 2.8326941106281427, + "grad_norm": 23.430670786618588, + "learning_rate": 9.453800784887057e-08, + "loss": 1.0342, + "step": 33236 + }, + { + "epoch": 2.832779340322168, + "grad_norm": 46.610951603705374, + "learning_rate": 9.444206561561475e-08, + "loss": 1.3722, + "step": 33237 + }, + { + "epoch": 2.8328645700161936, + "grad_norm": 66.53143636857617, + "learning_rate": 9.434617162657256e-08, + "loss": 2.2166, + "step": 33238 + }, + { + "epoch": 2.832949799710219, + "grad_norm": 42.945115641531594, + "learning_rate": 9.425032588268601e-08, + "loss": 1.3628, + "step": 33239 + }, + { + "epoch": 2.8330350294042446, + "grad_norm": 39.11922640562134, + "learning_rate": 9.415452838489714e-08, + "loss": 1.112, + "step": 33240 + }, + { + "epoch": 2.83312025909827, + "grad_norm": 67.4833280454524, + "learning_rate": 9.405877913415018e-08, + "loss": 1.6075, + "step": 33241 + }, + { + "epoch": 2.833205488792295, + "grad_norm": 28.37111829071228, + "learning_rate": 9.39630781313855e-08, + "loss": 0.6655, + "step": 33242 + }, + { + "epoch": 2.8332907184863205, + "grad_norm": 49.17644704108271, + "learning_rate": 9.386742537754456e-08, + "loss": 1.2328, + "step": 33243 + }, + { + "epoch": 2.833375948180346, + "grad_norm": 56.505824664531275, + "learning_rate": 9.377182087356773e-08, + "loss": 1.5667, + "step": 33244 + }, + { + "epoch": 2.8334611778743715, + "grad_norm": 45.40772536648055, + "learning_rate": 9.367626462039536e-08, + "loss": 1.2865, + "step": 33245 + }, + { + "epoch": 2.833546407568397, + "grad_norm": 56.661860087133796, + "learning_rate": 9.358075661896782e-08, + "loss": 1.3635, + "step": 33246 + }, + { + "epoch": 2.833631637262422, + "grad_norm": 44.256091204753254, + "learning_rate": 9.348529687022379e-08, + "loss": 0.909, + "step": 33247 + }, + { + "epoch": 2.833716866956448, + "grad_norm": 68.001693188584, + "learning_rate": 9.338988537510252e-08, + "loss": 1.9368, + "step": 33248 + }, + { + "epoch": 2.833802096650473, + "grad_norm": 72.35781947226177, + "learning_rate": 9.329452213454105e-08, + "loss": 1.8078, + "step": 33249 + }, + { + "epoch": 2.8338873263444984, + "grad_norm": 62.103145086218234, + "learning_rate": 9.319920714947917e-08, + "loss": 1.9095, + "step": 33250 + }, + { + "epoch": 2.833972556038524, + "grad_norm": 50.712954914539324, + "learning_rate": 9.310394042085224e-08, + "loss": 0.9761, + "step": 33251 + }, + { + "epoch": 2.8340577857325493, + "grad_norm": 68.46704164311552, + "learning_rate": 9.300872194959953e-08, + "loss": 1.8015, + "step": 33252 + }, + { + "epoch": 2.8341430154265748, + "grad_norm": 25.575743303800817, + "learning_rate": 9.291355173665528e-08, + "loss": 1.0179, + "step": 33253 + }, + { + "epoch": 2.8342282451206, + "grad_norm": 77.64723693207634, + "learning_rate": 9.281842978295652e-08, + "loss": 2.202, + "step": 33254 + }, + { + "epoch": 2.8343134748146253, + "grad_norm": 27.682256083902125, + "learning_rate": 9.272335608943916e-08, + "loss": 0.9414, + "step": 33255 + }, + { + "epoch": 2.8343987045086507, + "grad_norm": 29.766791600977438, + "learning_rate": 9.262833065703747e-08, + "loss": 0.8541, + "step": 33256 + }, + { + "epoch": 2.834483934202676, + "grad_norm": 44.72213517148409, + "learning_rate": 9.253335348668568e-08, + "loss": 1.4548, + "step": 33257 + }, + { + "epoch": 2.8345691638967017, + "grad_norm": 47.935632378277816, + "learning_rate": 9.243842457931862e-08, + "loss": 0.9986, + "step": 33258 + }, + { + "epoch": 2.834654393590727, + "grad_norm": 51.88534872120769, + "learning_rate": 9.234354393586887e-08, + "loss": 1.3022, + "step": 33259 + }, + { + "epoch": 2.8347396232847526, + "grad_norm": 58.945286595198574, + "learning_rate": 9.224871155727123e-08, + "loss": 1.8126, + "step": 33260 + }, + { + "epoch": 2.8348248529787776, + "grad_norm": 54.323432098180945, + "learning_rate": 9.215392744445717e-08, + "loss": 1.2013, + "step": 33261 + }, + { + "epoch": 2.834910082672803, + "grad_norm": 40.54533526768611, + "learning_rate": 9.205919159835875e-08, + "loss": 1.6212, + "step": 33262 + }, + { + "epoch": 2.8349953123668286, + "grad_norm": 44.65809092334851, + "learning_rate": 9.196450401990798e-08, + "loss": 1.5459, + "step": 33263 + }, + { + "epoch": 2.835080542060854, + "grad_norm": 89.68365765280124, + "learning_rate": 9.186986471003634e-08, + "loss": 2.2465, + "step": 33264 + }, + { + "epoch": 2.8351657717548795, + "grad_norm": 47.31241054894074, + "learning_rate": 9.17752736696742e-08, + "loss": 0.9097, + "step": 33265 + }, + { + "epoch": 2.835251001448905, + "grad_norm": 40.32329666643216, + "learning_rate": 9.168073089975193e-08, + "loss": 1.3694, + "step": 33266 + }, + { + "epoch": 2.8353362311429304, + "grad_norm": 25.223208351336467, + "learning_rate": 9.158623640119879e-08, + "loss": 0.869, + "step": 33267 + }, + { + "epoch": 2.8354214608369555, + "grad_norm": 65.94065328977038, + "learning_rate": 9.149179017494514e-08, + "loss": 1.2912, + "step": 33268 + }, + { + "epoch": 2.835506690530981, + "grad_norm": 71.79528404826667, + "learning_rate": 9.139739222191913e-08, + "loss": 1.4459, + "step": 33269 + }, + { + "epoch": 2.8355919202250064, + "grad_norm": 52.73368107594912, + "learning_rate": 9.130304254304889e-08, + "loss": 1.149, + "step": 33270 + }, + { + "epoch": 2.835677149919032, + "grad_norm": 28.85337164895145, + "learning_rate": 9.12087411392637e-08, + "loss": 0.9072, + "step": 33271 + }, + { + "epoch": 2.8357623796130573, + "grad_norm": 34.52907769702929, + "learning_rate": 9.111448801148947e-08, + "loss": 1.1331, + "step": 33272 + }, + { + "epoch": 2.8358476093070824, + "grad_norm": 51.565456457725375, + "learning_rate": 9.102028316065326e-08, + "loss": 1.7482, + "step": 33273 + }, + { + "epoch": 2.8359328390011083, + "grad_norm": 31.365352737969758, + "learning_rate": 9.092612658768207e-08, + "loss": 1.3129, + "step": 33274 + }, + { + "epoch": 2.8360180686951333, + "grad_norm": 72.90962247443674, + "learning_rate": 9.083201829350185e-08, + "loss": 1.9783, + "step": 33275 + }, + { + "epoch": 2.8361032983891588, + "grad_norm": 79.62461199066648, + "learning_rate": 9.073795827903798e-08, + "loss": 2.1094, + "step": 33276 + }, + { + "epoch": 2.8361885280831842, + "grad_norm": 84.58350304766293, + "learning_rate": 9.06439465452158e-08, + "loss": 2.2383, + "step": 33277 + }, + { + "epoch": 2.8362737577772097, + "grad_norm": 36.87897135597708, + "learning_rate": 9.054998309296015e-08, + "loss": 1.2705, + "step": 33278 + }, + { + "epoch": 2.836358987471235, + "grad_norm": 31.49995437334572, + "learning_rate": 9.045606792319361e-08, + "loss": 0.6576, + "step": 33279 + }, + { + "epoch": 2.83644421716526, + "grad_norm": 16.757724654722555, + "learning_rate": 9.036220103684101e-08, + "loss": 0.6056, + "step": 33280 + }, + { + "epoch": 2.8365294468592857, + "grad_norm": 36.95455521224887, + "learning_rate": 9.02683824348255e-08, + "loss": 1.2457, + "step": 33281 + }, + { + "epoch": 2.836614676553311, + "grad_norm": 28.44019115007082, + "learning_rate": 9.017461211806911e-08, + "loss": 0.8893, + "step": 33282 + }, + { + "epoch": 2.8366999062473366, + "grad_norm": 80.7309851050987, + "learning_rate": 9.008089008749444e-08, + "loss": 1.3925, + "step": 33283 + }, + { + "epoch": 2.836785135941362, + "grad_norm": 37.34096773187565, + "learning_rate": 8.998721634402352e-08, + "loss": 1.2139, + "step": 33284 + }, + { + "epoch": 2.8368703656353875, + "grad_norm": 22.501047812114315, + "learning_rate": 8.98935908885773e-08, + "loss": 1.0312, + "step": 33285 + }, + { + "epoch": 2.836955595329413, + "grad_norm": 53.750990373721855, + "learning_rate": 8.980001372207669e-08, + "loss": 1.5599, + "step": 33286 + }, + { + "epoch": 2.837040825023438, + "grad_norm": 33.1144417083373, + "learning_rate": 8.970648484544154e-08, + "loss": 1.2369, + "step": 33287 + }, + { + "epoch": 2.8371260547174635, + "grad_norm": 57.93623106123928, + "learning_rate": 8.961300425959163e-08, + "loss": 1.3585, + "step": 33288 + }, + { + "epoch": 2.837211284411489, + "grad_norm": 63.645344666487375, + "learning_rate": 8.951957196544736e-08, + "loss": 1.6535, + "step": 33289 + }, + { + "epoch": 2.8372965141055144, + "grad_norm": 49.798049744875534, + "learning_rate": 8.942618796392633e-08, + "loss": 0.9832, + "step": 33290 + }, + { + "epoch": 2.83738174379954, + "grad_norm": 47.358840262609434, + "learning_rate": 8.933285225594723e-08, + "loss": 0.9772, + "step": 33291 + }, + { + "epoch": 2.837466973493565, + "grad_norm": 52.38901417422244, + "learning_rate": 8.92395648424288e-08, + "loss": 1.2297, + "step": 33292 + }, + { + "epoch": 2.837552203187591, + "grad_norm": 60.02993369833231, + "learning_rate": 8.91463257242875e-08, + "loss": 1.4762, + "step": 33293 + }, + { + "epoch": 2.837637432881616, + "grad_norm": 74.06669215743422, + "learning_rate": 8.905313490244094e-08, + "loss": 1.1122, + "step": 33294 + }, + { + "epoch": 2.8377226625756413, + "grad_norm": 69.5970924409067, + "learning_rate": 8.895999237780562e-08, + "loss": 1.9602, + "step": 33295 + }, + { + "epoch": 2.837807892269667, + "grad_norm": 46.02133855788244, + "learning_rate": 8.88668981512969e-08, + "loss": 1.173, + "step": 33296 + }, + { + "epoch": 2.8378931219636923, + "grad_norm": 42.471030726711625, + "learning_rate": 8.877385222383128e-08, + "loss": 1.6181, + "step": 33297 + }, + { + "epoch": 2.8379783516577177, + "grad_norm": 59.7157995225459, + "learning_rate": 8.868085459632247e-08, + "loss": 1.9284, + "step": 33298 + }, + { + "epoch": 2.8380635813517427, + "grad_norm": 85.48437984959746, + "learning_rate": 8.858790526968641e-08, + "loss": 2.0049, + "step": 33299 + }, + { + "epoch": 2.838148811045768, + "grad_norm": 40.8843067705087, + "learning_rate": 8.849500424483682e-08, + "loss": 0.9598, + "step": 33300 + }, + { + "epoch": 2.8382340407397937, + "grad_norm": 33.42128878978, + "learning_rate": 8.840215152268683e-08, + "loss": 0.7871, + "step": 33301 + }, + { + "epoch": 2.838319270433819, + "grad_norm": 78.3784703000429, + "learning_rate": 8.830934710415018e-08, + "loss": 2.0036, + "step": 33302 + }, + { + "epoch": 2.8384045001278446, + "grad_norm": 71.23785415163452, + "learning_rate": 8.821659099013946e-08, + "loss": 1.6567, + "step": 33303 + }, + { + "epoch": 2.83848972982187, + "grad_norm": 77.30525740202081, + "learning_rate": 8.812388318156673e-08, + "loss": 1.5357, + "step": 33304 + }, + { + "epoch": 2.8385749595158956, + "grad_norm": 46.53165839067865, + "learning_rate": 8.803122367934403e-08, + "loss": 1.2204, + "step": 33305 + }, + { + "epoch": 2.8386601892099206, + "grad_norm": 62.564159390022624, + "learning_rate": 8.793861248438173e-08, + "loss": 1.8013, + "step": 33306 + }, + { + "epoch": 2.838745418903946, + "grad_norm": 91.47630922162443, + "learning_rate": 8.78460495975919e-08, + "loss": 1.7382, + "step": 33307 + }, + { + "epoch": 2.8388306485979715, + "grad_norm": 31.861407944331326, + "learning_rate": 8.775353501988438e-08, + "loss": 1.0251, + "step": 33308 + }, + { + "epoch": 2.838915878291997, + "grad_norm": 39.7863518808259, + "learning_rate": 8.76610687521684e-08, + "loss": 0.8663, + "step": 33309 + }, + { + "epoch": 2.8390011079860225, + "grad_norm": 24.593953245995728, + "learning_rate": 8.756865079535436e-08, + "loss": 0.7038, + "step": 33310 + }, + { + "epoch": 2.8390863376800475, + "grad_norm": 28.81314084912753, + "learning_rate": 8.747628115035045e-08, + "loss": 1.0553, + "step": 33311 + }, + { + "epoch": 2.8391715673740734, + "grad_norm": 56.62137446507719, + "learning_rate": 8.738395981806535e-08, + "loss": 1.5281, + "step": 33312 + }, + { + "epoch": 2.8392567970680984, + "grad_norm": 38.497371318226875, + "learning_rate": 8.729168679940669e-08, + "loss": 0.9028, + "step": 33313 + }, + { + "epoch": 2.839342026762124, + "grad_norm": 1111.7931519105364, + "learning_rate": 8.719946209528207e-08, + "loss": 3.7338, + "step": 33314 + }, + { + "epoch": 2.8394272564561494, + "grad_norm": 59.37699268752646, + "learning_rate": 8.71072857065991e-08, + "loss": 1.4498, + "step": 33315 + }, + { + "epoch": 2.839512486150175, + "grad_norm": 57.412971139901956, + "learning_rate": 8.701515763426371e-08, + "loss": 1.3887, + "step": 33316 + }, + { + "epoch": 2.8395977158442003, + "grad_norm": 32.8215569748193, + "learning_rate": 8.692307787918186e-08, + "loss": 0.8975, + "step": 33317 + }, + { + "epoch": 2.8396829455382253, + "grad_norm": 67.86409764427621, + "learning_rate": 8.683104644225893e-08, + "loss": 1.2878, + "step": 33318 + }, + { + "epoch": 2.839768175232251, + "grad_norm": 113.28661190904798, + "learning_rate": 8.673906332440086e-08, + "loss": 1.7452, + "step": 33319 + }, + { + "epoch": 2.8398534049262762, + "grad_norm": 50.2750514439882, + "learning_rate": 8.664712852651136e-08, + "loss": 1.6191, + "step": 33320 + }, + { + "epoch": 2.8399386346203017, + "grad_norm": 56.630710817625705, + "learning_rate": 8.655524204949529e-08, + "loss": 1.1996, + "step": 33321 + }, + { + "epoch": 2.840023864314327, + "grad_norm": 52.52784072679495, + "learning_rate": 8.646340389425578e-08, + "loss": 1.2017, + "step": 33322 + }, + { + "epoch": 2.8401090940083527, + "grad_norm": 42.86444643782325, + "learning_rate": 8.637161406169603e-08, + "loss": 1.2463, + "step": 33323 + }, + { + "epoch": 2.840194323702378, + "grad_norm": 49.24232264523916, + "learning_rate": 8.627987255271974e-08, + "loss": 1.061, + "step": 33324 + }, + { + "epoch": 2.840279553396403, + "grad_norm": 30.188113852360765, + "learning_rate": 8.618817936822787e-08, + "loss": 0.5594, + "step": 33325 + }, + { + "epoch": 2.8403647830904286, + "grad_norm": 40.069144119941434, + "learning_rate": 8.609653450912248e-08, + "loss": 0.7768, + "step": 33326 + }, + { + "epoch": 2.840450012784454, + "grad_norm": 48.40318503085808, + "learning_rate": 8.60049379763056e-08, + "loss": 1.325, + "step": 33327 + }, + { + "epoch": 2.8405352424784795, + "grad_norm": 20.96684972594448, + "learning_rate": 8.591338977067764e-08, + "loss": 0.8788, + "step": 33328 + }, + { + "epoch": 2.840620472172505, + "grad_norm": 66.2634017543654, + "learning_rate": 8.582188989313788e-08, + "loss": 1.8795, + "step": 33329 + }, + { + "epoch": 2.84070570186653, + "grad_norm": 58.84524869746857, + "learning_rate": 8.573043834458783e-08, + "loss": 1.8762, + "step": 33330 + }, + { + "epoch": 2.840790931560556, + "grad_norm": 89.72956264947473, + "learning_rate": 8.563903512592619e-08, + "loss": 2.2332, + "step": 33331 + }, + { + "epoch": 2.840876161254581, + "grad_norm": 57.361012431088554, + "learning_rate": 8.554768023805171e-08, + "loss": 1.3676, + "step": 33332 + }, + { + "epoch": 2.8409613909486064, + "grad_norm": 56.40582362055845, + "learning_rate": 8.545637368186311e-08, + "loss": 1.6826, + "step": 33333 + }, + { + "epoch": 2.841046620642632, + "grad_norm": 32.065933111888, + "learning_rate": 8.536511545825854e-08, + "loss": 1.5491, + "step": 33334 + }, + { + "epoch": 2.8411318503366574, + "grad_norm": 56.45323132061251, + "learning_rate": 8.527390556813452e-08, + "loss": 1.3639, + "step": 33335 + }, + { + "epoch": 2.841217080030683, + "grad_norm": 50.33914622580918, + "learning_rate": 8.518274401238869e-08, + "loss": 2.1279, + "step": 33336 + }, + { + "epoch": 2.841302309724708, + "grad_norm": 43.19456170529957, + "learning_rate": 8.509163079191806e-08, + "loss": 1.2371, + "step": 33337 + }, + { + "epoch": 2.8413875394187333, + "grad_norm": 61.27875635440303, + "learning_rate": 8.500056590761807e-08, + "loss": 1.5493, + "step": 33338 + }, + { + "epoch": 2.841472769112759, + "grad_norm": 27.053504013315102, + "learning_rate": 8.490954936038409e-08, + "loss": 0.8617, + "step": 33339 + }, + { + "epoch": 2.8415579988067843, + "grad_norm": 36.200651486193, + "learning_rate": 8.481858115111208e-08, + "loss": 1.1071, + "step": 33340 + }, + { + "epoch": 2.8416432285008097, + "grad_norm": 55.42069271925058, + "learning_rate": 8.472766128069632e-08, + "loss": 1.3928, + "step": 33341 + }, + { + "epoch": 2.841728458194835, + "grad_norm": 70.2426083919843, + "learning_rate": 8.463678975003109e-08, + "loss": 1.7847, + "step": 33342 + }, + { + "epoch": 2.8418136878888607, + "grad_norm": 45.85133209253329, + "learning_rate": 8.454596656000902e-08, + "loss": 1.0064, + "step": 33343 + }, + { + "epoch": 2.8418989175828857, + "grad_norm": 30.862936772452052, + "learning_rate": 8.445519171152494e-08, + "loss": 0.7466, + "step": 33344 + }, + { + "epoch": 2.841984147276911, + "grad_norm": 73.87131441044781, + "learning_rate": 8.436446520547037e-08, + "loss": 1.5441, + "step": 33345 + }, + { + "epoch": 2.8420693769709366, + "grad_norm": 49.03558673395405, + "learning_rate": 8.427378704273792e-08, + "loss": 2.2077, + "step": 33346 + }, + { + "epoch": 2.842154606664962, + "grad_norm": 64.45398405992461, + "learning_rate": 8.418315722422022e-08, + "loss": 1.2551, + "step": 33347 + }, + { + "epoch": 2.8422398363589876, + "grad_norm": 55.70236086954076, + "learning_rate": 8.409257575080654e-08, + "loss": 1.591, + "step": 33348 + }, + { + "epoch": 2.8423250660530126, + "grad_norm": 58.11853693347738, + "learning_rate": 8.400204262339006e-08, + "loss": 1.491, + "step": 33349 + }, + { + "epoch": 2.8424102957470385, + "grad_norm": 49.013129187066674, + "learning_rate": 8.391155784286009e-08, + "loss": 1.1047, + "step": 33350 + }, + { + "epoch": 2.8424955254410635, + "grad_norm": 55.8652957444086, + "learning_rate": 8.38211214101059e-08, + "loss": 1.7409, + "step": 33351 + }, + { + "epoch": 2.842580755135089, + "grad_norm": 52.46840102827162, + "learning_rate": 8.373073332601734e-08, + "loss": 1.1688, + "step": 33352 + }, + { + "epoch": 2.8426659848291145, + "grad_norm": 89.8526994510861, + "learning_rate": 8.364039359148369e-08, + "loss": 2.341, + "step": 33353 + }, + { + "epoch": 2.84275121452314, + "grad_norm": 47.37638497542816, + "learning_rate": 8.355010220739368e-08, + "loss": 1.131, + "step": 33354 + }, + { + "epoch": 2.8428364442171654, + "grad_norm": 12.555291956660781, + "learning_rate": 8.34598591746344e-08, + "loss": 0.165, + "step": 33355 + }, + { + "epoch": 2.8429216739111904, + "grad_norm": 49.02873752533751, + "learning_rate": 8.336966449409401e-08, + "loss": 1.7267, + "step": 33356 + }, + { + "epoch": 2.843006903605216, + "grad_norm": 58.97342603689268, + "learning_rate": 8.327951816665903e-08, + "loss": 0.951, + "step": 33357 + }, + { + "epoch": 2.8430921332992414, + "grad_norm": 77.11157005857454, + "learning_rate": 8.318942019321652e-08, + "loss": 1.6374, + "step": 33358 + }, + { + "epoch": 2.843177362993267, + "grad_norm": 32.93627610500653, + "learning_rate": 8.309937057465189e-08, + "loss": 0.9697, + "step": 33359 + }, + { + "epoch": 2.8432625926872923, + "grad_norm": 77.5864194254415, + "learning_rate": 8.300936931185166e-08, + "loss": 1.7315, + "step": 33360 + }, + { + "epoch": 2.843347822381318, + "grad_norm": 55.38268168115143, + "learning_rate": 8.291941640570012e-08, + "loss": 1.5554, + "step": 33361 + }, + { + "epoch": 2.8434330520753432, + "grad_norm": 31.459874811541518, + "learning_rate": 8.28295118570821e-08, + "loss": 0.7762, + "step": 33362 + }, + { + "epoch": 2.8435182817693683, + "grad_norm": 65.78756969851185, + "learning_rate": 8.273965566688246e-08, + "loss": 1.3414, + "step": 33363 + }, + { + "epoch": 2.8436035114633937, + "grad_norm": 56.2666989750278, + "learning_rate": 8.26498478359844e-08, + "loss": 1.4778, + "step": 33364 + }, + { + "epoch": 2.843688741157419, + "grad_norm": 55.31286945133719, + "learning_rate": 8.256008836527051e-08, + "loss": 1.2857, + "step": 33365 + }, + { + "epoch": 2.8437739708514447, + "grad_norm": 39.145647339411006, + "learning_rate": 8.247037725562457e-08, + "loss": 1.5206, + "step": 33366 + }, + { + "epoch": 2.84385920054547, + "grad_norm": 65.81192197709167, + "learning_rate": 8.238071450792805e-08, + "loss": 1.387, + "step": 33367 + }, + { + "epoch": 2.843944430239495, + "grad_norm": 45.346389197175256, + "learning_rate": 8.229110012306363e-08, + "loss": 1.5435, + "step": 33368 + }, + { + "epoch": 2.844029659933521, + "grad_norm": 41.32725831513363, + "learning_rate": 8.220153410191167e-08, + "loss": 1.1956, + "step": 33369 + }, + { + "epoch": 2.844114889627546, + "grad_norm": 68.13140249805191, + "learning_rate": 8.211201644535427e-08, + "loss": 1.7393, + "step": 33370 + }, + { + "epoch": 2.8442001193215716, + "grad_norm": 55.87187925042128, + "learning_rate": 8.202254715427071e-08, + "loss": 1.3067, + "step": 33371 + }, + { + "epoch": 2.844285349015597, + "grad_norm": 48.474899156848416, + "learning_rate": 8.193312622954086e-08, + "loss": 1.4684, + "step": 33372 + }, + { + "epoch": 2.8443705787096225, + "grad_norm": 56.20979547348076, + "learning_rate": 8.184375367204512e-08, + "loss": 1.6792, + "step": 33373 + }, + { + "epoch": 2.844455808403648, + "grad_norm": 36.022798645790566, + "learning_rate": 8.175442948266055e-08, + "loss": 0.9601, + "step": 33374 + }, + { + "epoch": 2.844541038097673, + "grad_norm": 47.10207034073056, + "learning_rate": 8.166515366226757e-08, + "loss": 1.3529, + "step": 33375 + }, + { + "epoch": 2.8446262677916985, + "grad_norm": 58.826766385712695, + "learning_rate": 8.157592621174327e-08, + "loss": 1.5173, + "step": 33376 + }, + { + "epoch": 2.844711497485724, + "grad_norm": 71.19626886304951, + "learning_rate": 8.14867471319658e-08, + "loss": 1.2513, + "step": 33377 + }, + { + "epoch": 2.8447967271797494, + "grad_norm": 75.2539386224642, + "learning_rate": 8.139761642381117e-08, + "loss": 1.0893, + "step": 33378 + }, + { + "epoch": 2.844881956873775, + "grad_norm": 50.40620284611137, + "learning_rate": 8.130853408815697e-08, + "loss": 1.5478, + "step": 33379 + }, + { + "epoch": 2.8449671865678003, + "grad_norm": 41.05409543493291, + "learning_rate": 8.121950012587865e-08, + "loss": 0.7689, + "step": 33380 + }, + { + "epoch": 2.845052416261826, + "grad_norm": 40.23582983102756, + "learning_rate": 8.113051453785214e-08, + "loss": 1.1475, + "step": 33381 + }, + { + "epoch": 2.845137645955851, + "grad_norm": 36.69583470784604, + "learning_rate": 8.104157732495232e-08, + "loss": 1.3026, + "step": 33382 + }, + { + "epoch": 2.8452228756498763, + "grad_norm": 61.387078151347694, + "learning_rate": 8.095268848805405e-08, + "loss": 1.2568, + "step": 33383 + }, + { + "epoch": 2.8453081053439018, + "grad_norm": 40.131638875659505, + "learning_rate": 8.08638480280316e-08, + "loss": 1.1707, + "step": 33384 + }, + { + "epoch": 2.8453933350379272, + "grad_norm": 30.417510437536883, + "learning_rate": 8.077505594575818e-08, + "loss": 1.0529, + "step": 33385 + }, + { + "epoch": 2.8454785647319527, + "grad_norm": 60.976886426530925, + "learning_rate": 8.068631224210754e-08, + "loss": 1.0447, + "step": 33386 + }, + { + "epoch": 2.845563794425978, + "grad_norm": 29.05354268400005, + "learning_rate": 8.059761691795232e-08, + "loss": 0.9537, + "step": 33387 + }, + { + "epoch": 2.8456490241200036, + "grad_norm": 19.101125214036283, + "learning_rate": 8.050896997416458e-08, + "loss": 0.6376, + "step": 33388 + }, + { + "epoch": 2.8457342538140287, + "grad_norm": 69.482619499816, + "learning_rate": 8.042037141161696e-08, + "loss": 1.525, + "step": 33389 + }, + { + "epoch": 2.845819483508054, + "grad_norm": 72.34953945877777, + "learning_rate": 8.033182123117933e-08, + "loss": 1.5228, + "step": 33390 + }, + { + "epoch": 2.8459047132020796, + "grad_norm": 35.74728581360032, + "learning_rate": 8.024331943372377e-08, + "loss": 1.2694, + "step": 33391 + }, + { + "epoch": 2.845989942896105, + "grad_norm": 35.49627645175786, + "learning_rate": 8.015486602011957e-08, + "loss": 1.0143, + "step": 33392 + }, + { + "epoch": 2.8460751725901305, + "grad_norm": 59.94738621219054, + "learning_rate": 8.006646099123827e-08, + "loss": 1.5588, + "step": 33393 + }, + { + "epoch": 2.8461604022841556, + "grad_norm": 27.969686366150093, + "learning_rate": 7.997810434794806e-08, + "loss": 0.9625, + "step": 33394 + }, + { + "epoch": 2.846245631978181, + "grad_norm": 25.18268937622957, + "learning_rate": 7.988979609111769e-08, + "loss": 1.09, + "step": 33395 + }, + { + "epoch": 2.8463308616722065, + "grad_norm": 49.602257400615386, + "learning_rate": 7.980153622161646e-08, + "loss": 1.1939, + "step": 33396 + }, + { + "epoch": 2.846416091366232, + "grad_norm": 55.103343150618244, + "learning_rate": 7.971332474031257e-08, + "loss": 1.5263, + "step": 33397 + }, + { + "epoch": 2.8465013210602574, + "grad_norm": 64.01135614647143, + "learning_rate": 7.962516164807199e-08, + "loss": 1.3346, + "step": 33398 + }, + { + "epoch": 2.846586550754283, + "grad_norm": 57.474020310120764, + "learning_rate": 7.953704694576348e-08, + "loss": 1.9683, + "step": 33399 + }, + { + "epoch": 2.8466717804483084, + "grad_norm": 65.38896881904309, + "learning_rate": 7.944898063425244e-08, + "loss": 1.4507, + "step": 33400 + }, + { + "epoch": 2.8467570101423334, + "grad_norm": 63.51090619879618, + "learning_rate": 7.936096271440541e-08, + "loss": 1.6484, + "step": 33401 + }, + { + "epoch": 2.846842239836359, + "grad_norm": 52.611654196118174, + "learning_rate": 7.927299318708892e-08, + "loss": 1.5623, + "step": 33402 + }, + { + "epoch": 2.8469274695303843, + "grad_norm": 54.74589835114734, + "learning_rate": 7.918507205316672e-08, + "loss": 1.6356, + "step": 33403 + }, + { + "epoch": 2.84701269922441, + "grad_norm": 47.77339577826681, + "learning_rate": 7.909719931350368e-08, + "loss": 1.0366, + "step": 33404 + }, + { + "epoch": 2.8470979289184353, + "grad_norm": 39.52785872637205, + "learning_rate": 7.900937496896466e-08, + "loss": 0.7577, + "step": 33405 + }, + { + "epoch": 2.8471831586124607, + "grad_norm": 63.3874071201688, + "learning_rate": 7.89215990204123e-08, + "loss": 1.6649, + "step": 33406 + }, + { + "epoch": 2.847268388306486, + "grad_norm": 48.3117090153296, + "learning_rate": 7.883387146871146e-08, + "loss": 1.9059, + "step": 33407 + }, + { + "epoch": 2.8473536180005112, + "grad_norm": 53.745111137779524, + "learning_rate": 7.874619231472369e-08, + "loss": 0.872, + "step": 33408 + }, + { + "epoch": 2.8474388476945367, + "grad_norm": 70.53631421661555, + "learning_rate": 7.865856155931162e-08, + "loss": 1.4013, + "step": 33409 + }, + { + "epoch": 2.847524077388562, + "grad_norm": 48.611155802953895, + "learning_rate": 7.857097920333734e-08, + "loss": 1.2816, + "step": 33410 + }, + { + "epoch": 2.8476093070825876, + "grad_norm": 42.51068449391623, + "learning_rate": 7.848344524766182e-08, + "loss": 1.1126, + "step": 33411 + }, + { + "epoch": 2.847694536776613, + "grad_norm": 75.67819433178512, + "learning_rate": 7.83959596931455e-08, + "loss": 2.3732, + "step": 33412 + }, + { + "epoch": 2.847779766470638, + "grad_norm": 64.23163780856811, + "learning_rate": 7.830852254064935e-08, + "loss": 1.5819, + "step": 33413 + }, + { + "epoch": 2.847864996164664, + "grad_norm": 45.521052824445064, + "learning_rate": 7.822113379103325e-08, + "loss": 1.0611, + "step": 33414 + }, + { + "epoch": 2.847950225858689, + "grad_norm": 72.4333788472308, + "learning_rate": 7.813379344515703e-08, + "loss": 1.5306, + "step": 33415 + }, + { + "epoch": 2.8480354555527145, + "grad_norm": 71.60358477999664, + "learning_rate": 7.804650150387838e-08, + "loss": 1.4498, + "step": 33416 + }, + { + "epoch": 2.84812068524674, + "grad_norm": 84.78234314698011, + "learning_rate": 7.79592579680577e-08, + "loss": 1.9303, + "step": 33417 + }, + { + "epoch": 2.8482059149407655, + "grad_norm": 33.17525703533666, + "learning_rate": 7.787206283855098e-08, + "loss": 0.9341, + "step": 33418 + }, + { + "epoch": 2.848291144634791, + "grad_norm": 59.861393231756544, + "learning_rate": 7.778491611621752e-08, + "loss": 1.7154, + "step": 33419 + }, + { + "epoch": 2.848376374328816, + "grad_norm": 63.16272931238212, + "learning_rate": 7.769781780191277e-08, + "loss": 1.6347, + "step": 33420 + }, + { + "epoch": 2.8484616040228414, + "grad_norm": 53.636653891512516, + "learning_rate": 7.761076789649436e-08, + "loss": 1.4105, + "step": 33421 + }, + { + "epoch": 2.848546833716867, + "grad_norm": 62.13961196831699, + "learning_rate": 7.752376640081771e-08, + "loss": 1.8481, + "step": 33422 + }, + { + "epoch": 2.8486320634108924, + "grad_norm": 56.954639315267436, + "learning_rate": 7.743681331573938e-08, + "loss": 1.4064, + "step": 33423 + }, + { + "epoch": 2.848717293104918, + "grad_norm": 37.5775715843843, + "learning_rate": 7.734990864211367e-08, + "loss": 1.591, + "step": 33424 + }, + { + "epoch": 2.8488025227989433, + "grad_norm": 29.18987704238974, + "learning_rate": 7.72630523807949e-08, + "loss": 0.786, + "step": 33425 + }, + { + "epoch": 2.8488877524929688, + "grad_norm": 60.97647561018717, + "learning_rate": 7.717624453263905e-08, + "loss": 1.1596, + "step": 33426 + }, + { + "epoch": 2.848972982186994, + "grad_norm": 68.81149404136679, + "learning_rate": 7.708948509849823e-08, + "loss": 1.5711, + "step": 33427 + }, + { + "epoch": 2.8490582118810193, + "grad_norm": 30.072818965262556, + "learning_rate": 7.700277407922619e-08, + "loss": 0.763, + "step": 33428 + }, + { + "epoch": 2.8491434415750447, + "grad_norm": 63.962981959849756, + "learning_rate": 7.691611147567558e-08, + "loss": 1.8569, + "step": 33429 + }, + { + "epoch": 2.84922867126907, + "grad_norm": 61.31536601037547, + "learning_rate": 7.682949728869848e-08, + "loss": 1.2606, + "step": 33430 + }, + { + "epoch": 2.8493139009630957, + "grad_norm": 62.128344738713615, + "learning_rate": 7.674293151914702e-08, + "loss": 1.6533, + "step": 33431 + }, + { + "epoch": 2.8493991306571207, + "grad_norm": 91.66147639700304, + "learning_rate": 7.665641416787329e-08, + "loss": 2.3265, + "step": 33432 + }, + { + "epoch": 2.8494843603511466, + "grad_norm": 23.33503402709945, + "learning_rate": 7.656994523572714e-08, + "loss": 0.4717, + "step": 33433 + }, + { + "epoch": 2.8495695900451716, + "grad_norm": 26.23398008222656, + "learning_rate": 7.648352472355847e-08, + "loss": 0.9796, + "step": 33434 + }, + { + "epoch": 2.849654819739197, + "grad_norm": 46.50528937183281, + "learning_rate": 7.639715263221825e-08, + "loss": 0.8086, + "step": 33435 + }, + { + "epoch": 2.8497400494332226, + "grad_norm": 47.93873706591658, + "learning_rate": 7.631082896255637e-08, + "loss": 1.6986, + "step": 33436 + }, + { + "epoch": 2.849825279127248, + "grad_norm": 57.901500006129545, + "learning_rate": 7.622455371541992e-08, + "loss": 1.1548, + "step": 33437 + }, + { + "epoch": 2.8499105088212735, + "grad_norm": 60.38830042134806, + "learning_rate": 7.613832689165879e-08, + "loss": 1.5381, + "step": 33438 + }, + { + "epoch": 2.8499957385152985, + "grad_norm": 18.10214587708828, + "learning_rate": 7.605214849212062e-08, + "loss": 0.8178, + "step": 33439 + }, + { + "epoch": 2.850080968209324, + "grad_norm": 57.212599848339224, + "learning_rate": 7.596601851765362e-08, + "loss": 1.8239, + "step": 33440 + }, + { + "epoch": 2.8501661979033495, + "grad_norm": 58.536585654483915, + "learning_rate": 7.587993696910378e-08, + "loss": 1.6621, + "step": 33441 + }, + { + "epoch": 2.850251427597375, + "grad_norm": 15.496449586141344, + "learning_rate": 7.579390384731766e-08, + "loss": 0.6275, + "step": 33442 + }, + { + "epoch": 2.8503366572914004, + "grad_norm": 24.434953473038085, + "learning_rate": 7.570791915314236e-08, + "loss": 0.7038, + "step": 33443 + }, + { + "epoch": 2.850421886985426, + "grad_norm": 66.42641632610184, + "learning_rate": 7.562198288742274e-08, + "loss": 1.5661, + "step": 33444 + }, + { + "epoch": 2.8505071166794513, + "grad_norm": 23.58106987846831, + "learning_rate": 7.553609505100423e-08, + "loss": 0.762, + "step": 33445 + }, + { + "epoch": 2.8505923463734764, + "grad_norm": 74.64755692369152, + "learning_rate": 7.545025564473174e-08, + "loss": 1.9106, + "step": 33446 + }, + { + "epoch": 2.850677576067502, + "grad_norm": 41.83064339271147, + "learning_rate": 7.536446466944847e-08, + "loss": 0.9429, + "step": 33447 + }, + { + "epoch": 2.8507628057615273, + "grad_norm": 53.96100759292291, + "learning_rate": 7.52787221259993e-08, + "loss": 1.2668, + "step": 33448 + }, + { + "epoch": 2.8508480354555528, + "grad_norm": 69.39460218349483, + "learning_rate": 7.519302801522687e-08, + "loss": 1.3969, + "step": 33449 + }, + { + "epoch": 2.8509332651495782, + "grad_norm": 78.58547660128221, + "learning_rate": 7.510738233797444e-08, + "loss": 1.9688, + "step": 33450 + }, + { + "epoch": 2.8510184948436033, + "grad_norm": 41.49828656992859, + "learning_rate": 7.502178509508351e-08, + "loss": 0.9276, + "step": 33451 + }, + { + "epoch": 2.851103724537629, + "grad_norm": 75.97379165819515, + "learning_rate": 7.493623628739621e-08, + "loss": 1.8865, + "step": 33452 + }, + { + "epoch": 2.851188954231654, + "grad_norm": 43.57560463957272, + "learning_rate": 7.485073591575408e-08, + "loss": 1.2812, + "step": 33453 + }, + { + "epoch": 2.8512741839256797, + "grad_norm": 64.62917244908847, + "learning_rate": 7.476528398099813e-08, + "loss": 1.3358, + "step": 33454 + }, + { + "epoch": 2.851359413619705, + "grad_norm": 53.164751928668124, + "learning_rate": 7.467988048396824e-08, + "loss": 1.4275, + "step": 33455 + }, + { + "epoch": 2.8514446433137306, + "grad_norm": 90.32876945073309, + "learning_rate": 7.459452542550538e-08, + "loss": 2.418, + "step": 33456 + }, + { + "epoch": 2.851529873007756, + "grad_norm": 71.84777144127959, + "learning_rate": 7.450921880644779e-08, + "loss": 1.8507, + "step": 33457 + }, + { + "epoch": 2.851615102701781, + "grad_norm": 100.56543292746025, + "learning_rate": 7.44239606276348e-08, + "loss": 1.7503, + "step": 33458 + }, + { + "epoch": 2.8517003323958066, + "grad_norm": 81.07664924420425, + "learning_rate": 7.433875088990406e-08, + "loss": 1.7849, + "step": 33459 + }, + { + "epoch": 2.851785562089832, + "grad_norm": 30.488188732136653, + "learning_rate": 7.425358959409545e-08, + "loss": 0.932, + "step": 33460 + }, + { + "epoch": 2.8518707917838575, + "grad_norm": 51.63265007968864, + "learning_rate": 7.4168476741045e-08, + "loss": 1.4044, + "step": 33461 + }, + { + "epoch": 2.851956021477883, + "grad_norm": 30.14382106515191, + "learning_rate": 7.408341233159089e-08, + "loss": 0.7646, + "step": 33462 + }, + { + "epoch": 2.8520412511719084, + "grad_norm": 29.22089435171167, + "learning_rate": 7.399839636656857e-08, + "loss": 0.9209, + "step": 33463 + }, + { + "epoch": 2.852126480865934, + "grad_norm": 49.93546953252967, + "learning_rate": 7.391342884681462e-08, + "loss": 1.0966, + "step": 33464 + }, + { + "epoch": 2.852211710559959, + "grad_norm": 47.15932349054195, + "learning_rate": 7.382850977316447e-08, + "loss": 1.7912, + "step": 33465 + }, + { + "epoch": 2.8522969402539844, + "grad_norm": 40.62307899164014, + "learning_rate": 7.37436391464541e-08, + "loss": 1.4521, + "step": 33466 + }, + { + "epoch": 2.85238216994801, + "grad_norm": 34.32589062438223, + "learning_rate": 7.365881696751731e-08, + "loss": 1.2719, + "step": 33467 + }, + { + "epoch": 2.8524673996420353, + "grad_norm": 53.48800931261748, + "learning_rate": 7.357404323718786e-08, + "loss": 0.979, + "step": 33468 + }, + { + "epoch": 2.852552629336061, + "grad_norm": 63.98627627420327, + "learning_rate": 7.348931795630065e-08, + "loss": 1.2013, + "step": 33469 + }, + { + "epoch": 2.852637859030086, + "grad_norm": 64.84432241050449, + "learning_rate": 7.340464112568834e-08, + "loss": 1.9661, + "step": 33470 + }, + { + "epoch": 2.8527230887241117, + "grad_norm": 46.76436089718898, + "learning_rate": 7.332001274618416e-08, + "loss": 1.0101, + "step": 33471 + }, + { + "epoch": 2.8528083184181368, + "grad_norm": 55.33523007224317, + "learning_rate": 7.323543281861911e-08, + "loss": 1.9067, + "step": 33472 + }, + { + "epoch": 2.8528935481121622, + "grad_norm": 40.64722649991431, + "learning_rate": 7.315090134382641e-08, + "loss": 1.04, + "step": 33473 + }, + { + "epoch": 2.8529787778061877, + "grad_norm": 65.54015434281274, + "learning_rate": 7.306641832263705e-08, + "loss": 1.7235, + "step": 33474 + }, + { + "epoch": 2.853064007500213, + "grad_norm": 44.49006294842775, + "learning_rate": 7.29819837558815e-08, + "loss": 1.175, + "step": 33475 + }, + { + "epoch": 2.8531492371942386, + "grad_norm": 24.07051539850534, + "learning_rate": 7.289759764439019e-08, + "loss": 0.6241, + "step": 33476 + }, + { + "epoch": 2.8532344668882637, + "grad_norm": 58.03459823953995, + "learning_rate": 7.281325998899303e-08, + "loss": 2.1294, + "step": 33477 + }, + { + "epoch": 2.853319696582289, + "grad_norm": 31.499182658893897, + "learning_rate": 7.272897079051989e-08, + "loss": 0.9372, + "step": 33478 + }, + { + "epoch": 2.8534049262763146, + "grad_norm": 25.661922821397656, + "learning_rate": 7.264473004979955e-08, + "loss": 0.6303, + "step": 33479 + }, + { + "epoch": 2.85349015597034, + "grad_norm": 66.12187354183106, + "learning_rate": 7.256053776766025e-08, + "loss": 1.5057, + "step": 33480 + }, + { + "epoch": 2.8535753856643655, + "grad_norm": 61.97221439237505, + "learning_rate": 7.247639394492967e-08, + "loss": 1.9982, + "step": 33481 + }, + { + "epoch": 2.853660615358391, + "grad_norm": 76.41994808203984, + "learning_rate": 7.239229858243601e-08, + "loss": 1.8091, + "step": 33482 + }, + { + "epoch": 2.8537458450524165, + "grad_norm": 20.236324483585346, + "learning_rate": 7.230825168100641e-08, + "loss": 0.7103, + "step": 33483 + }, + { + "epoch": 2.8538310747464415, + "grad_norm": 38.60774680843712, + "learning_rate": 7.222425324146632e-08, + "loss": 1.0442, + "step": 33484 + }, + { + "epoch": 2.853916304440467, + "grad_norm": 59.57078384148614, + "learning_rate": 7.214030326464338e-08, + "loss": 1.3329, + "step": 33485 + }, + { + "epoch": 2.8540015341344924, + "grad_norm": 52.71338881773082, + "learning_rate": 7.205640175136197e-08, + "loss": 1.154, + "step": 33486 + }, + { + "epoch": 2.854086763828518, + "grad_norm": 44.708481353479144, + "learning_rate": 7.197254870244751e-08, + "loss": 1.216, + "step": 33487 + }, + { + "epoch": 2.8541719935225434, + "grad_norm": 66.60862507339117, + "learning_rate": 7.188874411872549e-08, + "loss": 1.6137, + "step": 33488 + }, + { + "epoch": 2.8542572232165684, + "grad_norm": 65.70832948343035, + "learning_rate": 7.18049880010191e-08, + "loss": 1.6212, + "step": 33489 + }, + { + "epoch": 2.8543424529105943, + "grad_norm": 59.136182609070346, + "learning_rate": 7.172128035015214e-08, + "loss": 1.5338, + "step": 33490 + }, + { + "epoch": 2.8544276826046193, + "grad_norm": 109.71462472615552, + "learning_rate": 7.163762116694839e-08, + "loss": 2.9044, + "step": 33491 + }, + { + "epoch": 2.854512912298645, + "grad_norm": 58.99788855464972, + "learning_rate": 7.155401045222998e-08, + "loss": 1.5343, + "step": 33492 + }, + { + "epoch": 2.8545981419926703, + "grad_norm": 75.08457288470014, + "learning_rate": 7.147044820682014e-08, + "loss": 1.8739, + "step": 33493 + }, + { + "epoch": 2.8546833716866957, + "grad_norm": 46.33740286950168, + "learning_rate": 7.138693443153932e-08, + "loss": 1.4939, + "step": 33494 + }, + { + "epoch": 2.854768601380721, + "grad_norm": 32.57256361204111, + "learning_rate": 7.130346912720965e-08, + "loss": 0.5411, + "step": 33495 + }, + { + "epoch": 2.854853831074746, + "grad_norm": 52.76897112302322, + "learning_rate": 7.122005229465213e-08, + "loss": 1.5769, + "step": 33496 + }, + { + "epoch": 2.8549390607687717, + "grad_norm": 52.62648938702725, + "learning_rate": 7.11366839346872e-08, + "loss": 1.4298, + "step": 33497 + }, + { + "epoch": 2.855024290462797, + "grad_norm": 17.874611427358644, + "learning_rate": 7.105336404813367e-08, + "loss": 0.5867, + "step": 33498 + }, + { + "epoch": 2.8551095201568226, + "grad_norm": 56.38507404210303, + "learning_rate": 7.097009263581256e-08, + "loss": 1.4182, + "step": 33499 + }, + { + "epoch": 2.855194749850848, + "grad_norm": 79.81978834594938, + "learning_rate": 7.088686969854153e-08, + "loss": 2.3059, + "step": 33500 + }, + { + "epoch": 2.8552799795448736, + "grad_norm": 63.66347511979275, + "learning_rate": 7.080369523713938e-08, + "loss": 1.5737, + "step": 33501 + }, + { + "epoch": 2.855365209238899, + "grad_norm": 82.85924015259987, + "learning_rate": 7.072056925242432e-08, + "loss": 0.9646, + "step": 33502 + }, + { + "epoch": 2.855450438932924, + "grad_norm": 38.31772745702064, + "learning_rate": 7.063749174521406e-08, + "loss": 1.1239, + "step": 33503 + }, + { + "epoch": 2.8555356686269495, + "grad_norm": 70.90778953828809, + "learning_rate": 7.055446271632516e-08, + "loss": 1.7259, + "step": 33504 + }, + { + "epoch": 2.855620898320975, + "grad_norm": 66.19965434113496, + "learning_rate": 7.047148216657418e-08, + "loss": 2.3531, + "step": 33505 + }, + { + "epoch": 2.8557061280150005, + "grad_norm": 58.94352423244016, + "learning_rate": 7.03885500967777e-08, + "loss": 2.0316, + "step": 33506 + }, + { + "epoch": 2.855791357709026, + "grad_norm": 79.4763424664958, + "learning_rate": 7.030566650775062e-08, + "loss": 1.3851, + "step": 33507 + }, + { + "epoch": 2.855876587403051, + "grad_norm": 67.99397149952742, + "learning_rate": 7.02228314003084e-08, + "loss": 1.3028, + "step": 33508 + }, + { + "epoch": 2.855961817097077, + "grad_norm": 41.97018152100846, + "learning_rate": 7.014004477526593e-08, + "loss": 1.2205, + "step": 33509 + }, + { + "epoch": 2.856047046791102, + "grad_norm": 51.16899605599597, + "learning_rate": 7.005730663343758e-08, + "loss": 1.1803, + "step": 33510 + }, + { + "epoch": 2.8561322764851274, + "grad_norm": 54.59954325093058, + "learning_rate": 6.997461697563601e-08, + "loss": 1.1491, + "step": 33511 + }, + { + "epoch": 2.856217506179153, + "grad_norm": 60.5658691062127, + "learning_rate": 6.989197580267504e-08, + "loss": 0.9911, + "step": 33512 + }, + { + "epoch": 2.8563027358731783, + "grad_norm": 45.433582778536206, + "learning_rate": 6.980938311536789e-08, + "loss": 1.0992, + "step": 33513 + }, + { + "epoch": 2.8563879655672038, + "grad_norm": 34.53333355975045, + "learning_rate": 6.97268389145267e-08, + "loss": 1.0731, + "step": 33514 + }, + { + "epoch": 2.856473195261229, + "grad_norm": 24.917231460435485, + "learning_rate": 6.964434320096191e-08, + "loss": 0.7064, + "step": 33515 + }, + { + "epoch": 2.8565584249552542, + "grad_norm": 78.59348509347626, + "learning_rate": 6.956189597548624e-08, + "loss": 1.419, + "step": 33516 + }, + { + "epoch": 2.8566436546492797, + "grad_norm": 42.12710336811412, + "learning_rate": 6.947949723891067e-08, + "loss": 1.0683, + "step": 33517 + }, + { + "epoch": 2.856728884343305, + "grad_norm": 43.05330079446823, + "learning_rate": 6.939714699204514e-08, + "loss": 1.3538, + "step": 33518 + }, + { + "epoch": 2.8568141140373307, + "grad_norm": 34.66008259330918, + "learning_rate": 6.931484523569898e-08, + "loss": 0.8448, + "step": 33519 + }, + { + "epoch": 2.856899343731356, + "grad_norm": 79.47253151772163, + "learning_rate": 6.923259197068211e-08, + "loss": 1.4288, + "step": 33520 + }, + { + "epoch": 2.8569845734253816, + "grad_norm": 39.74345084103205, + "learning_rate": 6.915038719780331e-08, + "loss": 1.2735, + "step": 33521 + }, + { + "epoch": 2.8570698031194066, + "grad_norm": 65.50706790156784, + "learning_rate": 6.90682309178714e-08, + "loss": 2.3327, + "step": 33522 + }, + { + "epoch": 2.857155032813432, + "grad_norm": 84.37740990046153, + "learning_rate": 6.898612313169351e-08, + "loss": 1.5362, + "step": 33523 + }, + { + "epoch": 2.8572402625074576, + "grad_norm": 50.730128878025084, + "learning_rate": 6.890406384007786e-08, + "loss": 1.3413, + "step": 33524 + }, + { + "epoch": 2.857325492201483, + "grad_norm": 66.88522951098673, + "learning_rate": 6.88220530438316e-08, + "loss": 1.6287, + "step": 33525 + }, + { + "epoch": 2.8574107218955085, + "grad_norm": 65.32560535193646, + "learning_rate": 6.874009074376131e-08, + "loss": 1.603, + "step": 33526 + }, + { + "epoch": 2.857495951589534, + "grad_norm": 47.82341929950518, + "learning_rate": 6.865817694067245e-08, + "loss": 1.5668, + "step": 33527 + }, + { + "epoch": 2.8575811812835594, + "grad_norm": 51.575514084282354, + "learning_rate": 6.857631163537049e-08, + "loss": 1.0691, + "step": 33528 + }, + { + "epoch": 2.8576664109775844, + "grad_norm": 77.43062030954961, + "learning_rate": 6.849449482866143e-08, + "loss": 2.0147, + "step": 33529 + }, + { + "epoch": 2.85775164067161, + "grad_norm": 56.78992860807975, + "learning_rate": 6.841272652134967e-08, + "loss": 1.7059, + "step": 33530 + }, + { + "epoch": 2.8578368703656354, + "grad_norm": 88.86542292424416, + "learning_rate": 6.833100671423842e-08, + "loss": 1.9133, + "step": 33531 + }, + { + "epoch": 2.857922100059661, + "grad_norm": 70.13376131435936, + "learning_rate": 6.824933540813261e-08, + "loss": 1.7723, + "step": 33532 + }, + { + "epoch": 2.8580073297536863, + "grad_norm": 68.61244935142317, + "learning_rate": 6.816771260383436e-08, + "loss": 1.5413, + "step": 33533 + }, + { + "epoch": 2.8580925594477113, + "grad_norm": 67.53740212567998, + "learning_rate": 6.80861383021475e-08, + "loss": 1.3167, + "step": 33534 + }, + { + "epoch": 2.8581777891417373, + "grad_norm": 54.12990827996653, + "learning_rate": 6.800461250387358e-08, + "loss": 1.2448, + "step": 33535 + }, + { + "epoch": 2.8582630188357623, + "grad_norm": 22.461836818583276, + "learning_rate": 6.792313520981475e-08, + "loss": 0.8173, + "step": 33536 + }, + { + "epoch": 2.8583482485297877, + "grad_norm": 41.89500828227752, + "learning_rate": 6.784170642077148e-08, + "loss": 1.1504, + "step": 33537 + }, + { + "epoch": 2.858433478223813, + "grad_norm": 30.68413242283715, + "learning_rate": 6.776032613754535e-08, + "loss": 0.9244, + "step": 33538 + }, + { + "epoch": 2.8585187079178387, + "grad_norm": 66.86863171518435, + "learning_rate": 6.767899436093627e-08, + "loss": 2.1568, + "step": 33539 + }, + { + "epoch": 2.858603937611864, + "grad_norm": 31.179574908793004, + "learning_rate": 6.759771109174474e-08, + "loss": 0.81, + "step": 33540 + }, + { + "epoch": 2.858689167305889, + "grad_norm": 41.32874360381456, + "learning_rate": 6.751647633076952e-08, + "loss": 1.2944, + "step": 33541 + }, + { + "epoch": 2.8587743969999146, + "grad_norm": 39.086183110158984, + "learning_rate": 6.743529007881055e-08, + "loss": 1.2577, + "step": 33542 + }, + { + "epoch": 2.85885962669394, + "grad_norm": 46.82283833240547, + "learning_rate": 6.735415233666498e-08, + "loss": 1.2249, + "step": 33543 + }, + { + "epoch": 2.8589448563879656, + "grad_norm": 53.53842437656296, + "learning_rate": 6.727306310513104e-08, + "loss": 1.3397, + "step": 33544 + }, + { + "epoch": 2.859030086081991, + "grad_norm": 38.454230778656026, + "learning_rate": 6.719202238500644e-08, + "loss": 1.3492, + "step": 33545 + }, + { + "epoch": 2.8591153157760165, + "grad_norm": 64.06191354607654, + "learning_rate": 6.711103017708886e-08, + "loss": 1.6567, + "step": 33546 + }, + { + "epoch": 2.859200545470042, + "grad_norm": 50.97902301619575, + "learning_rate": 6.703008648217323e-08, + "loss": 1.2943, + "step": 33547 + }, + { + "epoch": 2.859285775164067, + "grad_norm": 31.232061864509465, + "learning_rate": 6.694919130105726e-08, + "loss": 0.6853, + "step": 33548 + }, + { + "epoch": 2.8593710048580925, + "grad_norm": 57.92538161752102, + "learning_rate": 6.686834463453584e-08, + "loss": 1.549, + "step": 33549 + }, + { + "epoch": 2.859456234552118, + "grad_norm": 16.060101126451748, + "learning_rate": 6.678754648340335e-08, + "loss": 0.7926, + "step": 33550 + }, + { + "epoch": 2.8595414642461434, + "grad_norm": 64.11155062923767, + "learning_rate": 6.670679684845582e-08, + "loss": 1.8575, + "step": 33551 + }, + { + "epoch": 2.859626693940169, + "grad_norm": 72.37948060271692, + "learning_rate": 6.662609573048596e-08, + "loss": 1.4683, + "step": 33552 + }, + { + "epoch": 2.859711923634194, + "grad_norm": 51.842788895060075, + "learning_rate": 6.654544313028866e-08, + "loss": 1.0488, + "step": 33553 + }, + { + "epoch": 2.85979715332822, + "grad_norm": 47.692343291497444, + "learning_rate": 6.646483904865608e-08, + "loss": 0.9435, + "step": 33554 + }, + { + "epoch": 2.859882383022245, + "grad_norm": 23.32002264598768, + "learning_rate": 6.638428348638149e-08, + "loss": 0.6454, + "step": 33555 + }, + { + "epoch": 2.8599676127162703, + "grad_norm": 55.177323615360585, + "learning_rate": 6.630377644425756e-08, + "loss": 1.9043, + "step": 33556 + }, + { + "epoch": 2.860052842410296, + "grad_norm": 79.690006950084, + "learning_rate": 6.622331792307535e-08, + "loss": 1.8651, + "step": 33557 + }, + { + "epoch": 2.8601380721043212, + "grad_norm": 54.38766515849013, + "learning_rate": 6.614290792362588e-08, + "loss": 1.2411, + "step": 33558 + }, + { + "epoch": 2.8602233017983467, + "grad_norm": 75.83324598088896, + "learning_rate": 6.606254644670074e-08, + "loss": 2.316, + "step": 33559 + }, + { + "epoch": 2.8603085314923717, + "grad_norm": 30.47724220567604, + "learning_rate": 6.598223349308985e-08, + "loss": 0.9557, + "step": 33560 + }, + { + "epoch": 2.860393761186397, + "grad_norm": 42.09324066677041, + "learning_rate": 6.590196906358315e-08, + "loss": 1.0514, + "step": 33561 + }, + { + "epoch": 2.8604789908804227, + "grad_norm": 33.978145502335536, + "learning_rate": 6.582175315896999e-08, + "loss": 1.0477, + "step": 33562 + }, + { + "epoch": 2.860564220574448, + "grad_norm": 49.83895565155888, + "learning_rate": 6.574158578003865e-08, + "loss": 1.3284, + "step": 33563 + }, + { + "epoch": 2.8606494502684736, + "grad_norm": 48.99886794618424, + "learning_rate": 6.566146692757902e-08, + "loss": 0.7879, + "step": 33564 + }, + { + "epoch": 2.860734679962499, + "grad_norm": 72.81073096523808, + "learning_rate": 6.558139660237773e-08, + "loss": 1.4379, + "step": 33565 + }, + { + "epoch": 2.8608199096565246, + "grad_norm": 53.849673091424876, + "learning_rate": 6.550137480522301e-08, + "loss": 1.6705, + "step": 33566 + }, + { + "epoch": 2.8609051393505496, + "grad_norm": 27.69011142795136, + "learning_rate": 6.542140153690091e-08, + "loss": 0.5961, + "step": 33567 + }, + { + "epoch": 2.860990369044575, + "grad_norm": 64.12241551345556, + "learning_rate": 6.534147679819858e-08, + "loss": 1.4287, + "step": 33568 + }, + { + "epoch": 2.8610755987386005, + "grad_norm": 69.94197811028724, + "learning_rate": 6.526160058990261e-08, + "loss": 1.8179, + "step": 33569 + }, + { + "epoch": 2.861160828432626, + "grad_norm": 41.82756010276429, + "learning_rate": 6.518177291279737e-08, + "loss": 1.3094, + "step": 33570 + }, + { + "epoch": 2.8612460581266514, + "grad_norm": 32.942071999474244, + "learning_rate": 6.510199376766835e-08, + "loss": 1.0824, + "step": 33571 + }, + { + "epoch": 2.8613312878206765, + "grad_norm": 29.03171701702223, + "learning_rate": 6.502226315530047e-08, + "loss": 0.8462, + "step": 33572 + }, + { + "epoch": 2.8614165175147024, + "grad_norm": 52.35304913831195, + "learning_rate": 6.49425810764781e-08, + "loss": 1.3645, + "step": 33573 + }, + { + "epoch": 2.8615017472087274, + "grad_norm": 54.516943791708336, + "learning_rate": 6.486294753198396e-08, + "loss": 1.0676, + "step": 33574 + }, + { + "epoch": 2.861586976902753, + "grad_norm": 89.77465452467689, + "learning_rate": 6.478336252260187e-08, + "loss": 2.0291, + "step": 33575 + }, + { + "epoch": 2.8616722065967783, + "grad_norm": 72.96094812376639, + "learning_rate": 6.470382604911452e-08, + "loss": 1.6741, + "step": 33576 + }, + { + "epoch": 2.861757436290804, + "grad_norm": 30.121652514311677, + "learning_rate": 6.462433811230351e-08, + "loss": 0.631, + "step": 33577 + }, + { + "epoch": 2.8618426659848293, + "grad_norm": 42.08827598599452, + "learning_rate": 6.4544898712951e-08, + "loss": 1.146, + "step": 33578 + }, + { + "epoch": 2.8619278956788543, + "grad_norm": 87.00230763931476, + "learning_rate": 6.446550785183858e-08, + "loss": 2.06, + "step": 33579 + }, + { + "epoch": 2.8620131253728798, + "grad_norm": 39.77883884192945, + "learning_rate": 6.438616552974675e-08, + "loss": 1.3766, + "step": 33580 + }, + { + "epoch": 2.8620983550669052, + "grad_norm": 60.966490361774284, + "learning_rate": 6.430687174745543e-08, + "loss": 1.5853, + "step": 33581 + }, + { + "epoch": 2.8621835847609307, + "grad_norm": 44.86664036585382, + "learning_rate": 6.42276265057451e-08, + "loss": 1.2173, + "step": 33582 + }, + { + "epoch": 2.862268814454956, + "grad_norm": 68.74243242357902, + "learning_rate": 6.414842980539516e-08, + "loss": 1.6604, + "step": 33583 + }, + { + "epoch": 2.8623540441489816, + "grad_norm": 42.81926930606941, + "learning_rate": 6.40692816471833e-08, + "loss": 1.178, + "step": 33584 + }, + { + "epoch": 2.862439273843007, + "grad_norm": 42.05221382467417, + "learning_rate": 6.399018203188889e-08, + "loss": 1.4089, + "step": 33585 + }, + { + "epoch": 2.862524503537032, + "grad_norm": 27.957554740095667, + "learning_rate": 6.391113096028967e-08, + "loss": 0.7781, + "step": 33586 + }, + { + "epoch": 2.8626097332310576, + "grad_norm": 93.9454336867642, + "learning_rate": 6.383212843316333e-08, + "loss": 2.6926, + "step": 33587 + }, + { + "epoch": 2.862694962925083, + "grad_norm": 41.36769246038979, + "learning_rate": 6.375317445128593e-08, + "loss": 1.1255, + "step": 33588 + }, + { + "epoch": 2.8627801926191085, + "grad_norm": 58.36284061134314, + "learning_rate": 6.367426901543517e-08, + "loss": 1.1336, + "step": 33589 + }, + { + "epoch": 2.862865422313134, + "grad_norm": 60.12974974320021, + "learning_rate": 6.359541212638653e-08, + "loss": 1.6284, + "step": 33590 + }, + { + "epoch": 2.862950652007159, + "grad_norm": 33.688680484415194, + "learning_rate": 6.351660378491554e-08, + "loss": 1.0738, + "step": 33591 + }, + { + "epoch": 2.863035881701185, + "grad_norm": 40.63311796565703, + "learning_rate": 6.343784399179709e-08, + "loss": 1.2358, + "step": 33592 + }, + { + "epoch": 2.86312111139521, + "grad_norm": 59.08749217901608, + "learning_rate": 6.335913274780559e-08, + "loss": 0.9506, + "step": 33593 + }, + { + "epoch": 2.8632063410892354, + "grad_norm": 31.913372485604057, + "learning_rate": 6.328047005371485e-08, + "loss": 0.6825, + "step": 33594 + }, + { + "epoch": 2.863291570783261, + "grad_norm": 37.54698937913273, + "learning_rate": 6.320185591029981e-08, + "loss": 0.794, + "step": 33595 + }, + { + "epoch": 2.8633768004772864, + "grad_norm": 47.24873613771979, + "learning_rate": 6.31232903183332e-08, + "loss": 1.6509, + "step": 33596 + }, + { + "epoch": 2.863462030171312, + "grad_norm": 50.13962187019365, + "learning_rate": 6.30447732785866e-08, + "loss": 1.571, + "step": 33597 + }, + { + "epoch": 2.863547259865337, + "grad_norm": 54.935394904915604, + "learning_rate": 6.296630479183329e-08, + "loss": 1.0864, + "step": 33598 + }, + { + "epoch": 2.8636324895593623, + "grad_norm": 47.256262929273646, + "learning_rate": 6.288788485884434e-08, + "loss": 1.0929, + "step": 33599 + }, + { + "epoch": 2.863717719253388, + "grad_norm": 48.15506117302606, + "learning_rate": 6.280951348039133e-08, + "loss": 1.5181, + "step": 33600 + }, + { + "epoch": 2.8638029489474133, + "grad_norm": 32.034895968741985, + "learning_rate": 6.273119065724476e-08, + "loss": 0.9656, + "step": 33601 + }, + { + "epoch": 2.8638881786414387, + "grad_norm": 91.54523071013465, + "learning_rate": 6.265291639017512e-08, + "loss": 2.4553, + "step": 33602 + }, + { + "epoch": 2.863973408335464, + "grad_norm": 93.27592541880419, + "learning_rate": 6.257469067995237e-08, + "loss": 1.7058, + "step": 33603 + }, + { + "epoch": 2.8640586380294897, + "grad_norm": 68.83694266651197, + "learning_rate": 6.249651352734531e-08, + "loss": 1.9419, + "step": 33604 + }, + { + "epoch": 2.8641438677235147, + "grad_norm": 33.02896730200868, + "learning_rate": 6.24183849331228e-08, + "loss": 1.1424, + "step": 33605 + }, + { + "epoch": 2.86422909741754, + "grad_norm": 23.376149701238436, + "learning_rate": 6.234030489805421e-08, + "loss": 1.2264, + "step": 33606 + }, + { + "epoch": 2.8643143271115656, + "grad_norm": 44.29352592647227, + "learning_rate": 6.226227342290614e-08, + "loss": 0.7555, + "step": 33607 + }, + { + "epoch": 2.864399556805591, + "grad_norm": 36.42784723251476, + "learning_rate": 6.218429050844688e-08, + "loss": 1.309, + "step": 33608 + }, + { + "epoch": 2.8644847864996166, + "grad_norm": 102.8214900238089, + "learning_rate": 6.210635615544302e-08, + "loss": 2.1661, + "step": 33609 + }, + { + "epoch": 2.8645700161936416, + "grad_norm": 79.14321257046937, + "learning_rate": 6.202847036466064e-08, + "loss": 1.5674, + "step": 33610 + }, + { + "epoch": 2.8646552458876675, + "grad_norm": 24.75081107430361, + "learning_rate": 6.195063313686634e-08, + "loss": 0.4714, + "step": 33611 + }, + { + "epoch": 2.8647404755816925, + "grad_norm": 62.677765472091146, + "learning_rate": 6.18728444728256e-08, + "loss": 1.5235, + "step": 33612 + }, + { + "epoch": 2.864825705275718, + "grad_norm": 30.48094273619191, + "learning_rate": 6.179510437330282e-08, + "loss": 1.0113, + "step": 33613 + }, + { + "epoch": 2.8649109349697435, + "grad_norm": 50.86730983566938, + "learning_rate": 6.171741283906296e-08, + "loss": 0.9924, + "step": 33614 + }, + { + "epoch": 2.864996164663769, + "grad_norm": 64.24073542760867, + "learning_rate": 6.16397698708704e-08, + "loss": 1.7904, + "step": 33615 + }, + { + "epoch": 2.8650813943577944, + "grad_norm": 44.49065592087368, + "learning_rate": 6.15621754694884e-08, + "loss": 0.5523, + "step": 33616 + }, + { + "epoch": 2.8651666240518194, + "grad_norm": 86.47102430740667, + "learning_rate": 6.14846296356797e-08, + "loss": 1.8473, + "step": 33617 + }, + { + "epoch": 2.865251853745845, + "grad_norm": 66.12520376715173, + "learning_rate": 6.140713237020702e-08, + "loss": 1.9893, + "step": 33618 + }, + { + "epoch": 2.8653370834398704, + "grad_norm": 59.68786805428987, + "learning_rate": 6.132968367383308e-08, + "loss": 1.8413, + "step": 33619 + }, + { + "epoch": 2.865422313133896, + "grad_norm": 63.74578995945112, + "learning_rate": 6.125228354732005e-08, + "loss": 1.5891, + "step": 33620 + }, + { + "epoch": 2.8655075428279213, + "grad_norm": 65.9650551460473, + "learning_rate": 6.117493199142732e-08, + "loss": 1.6888, + "step": 33621 + }, + { + "epoch": 2.8655927725219468, + "grad_norm": 78.34578433732713, + "learning_rate": 6.109762900691763e-08, + "loss": 1.9268, + "step": 33622 + }, + { + "epoch": 2.8656780022159722, + "grad_norm": 57.88061837441028, + "learning_rate": 6.102037459454924e-08, + "loss": 1.3301, + "step": 33623 + }, + { + "epoch": 2.8657632319099973, + "grad_norm": 84.14316293950631, + "learning_rate": 6.094316875508321e-08, + "loss": 1.9131, + "step": 33624 + }, + { + "epoch": 2.8658484616040227, + "grad_norm": 81.01373987273507, + "learning_rate": 6.08660114892784e-08, + "loss": 1.9429, + "step": 33625 + }, + { + "epoch": 2.865933691298048, + "grad_norm": 113.09035964161764, + "learning_rate": 6.078890279789417e-08, + "loss": 2.6238, + "step": 33626 + }, + { + "epoch": 2.8660189209920737, + "grad_norm": 42.87103566564184, + "learning_rate": 6.071184268168773e-08, + "loss": 1.0618, + "step": 33627 + }, + { + "epoch": 2.866104150686099, + "grad_norm": 30.85802310928903, + "learning_rate": 6.063483114141843e-08, + "loss": 0.5855, + "step": 33628 + }, + { + "epoch": 2.866189380380124, + "grad_norm": 37.07505627161409, + "learning_rate": 6.055786817784292e-08, + "loss": 0.7686, + "step": 33629 + }, + { + "epoch": 2.86627461007415, + "grad_norm": 38.52125330547392, + "learning_rate": 6.048095379171782e-08, + "loss": 1.537, + "step": 33630 + }, + { + "epoch": 2.866359839768175, + "grad_norm": 75.54710578793832, + "learning_rate": 6.040408798379915e-08, + "loss": 2.051, + "step": 33631 + }, + { + "epoch": 2.8664450694622006, + "grad_norm": 46.81499787603392, + "learning_rate": 6.032727075484413e-08, + "loss": 1.1736, + "step": 33632 + }, + { + "epoch": 2.866530299156226, + "grad_norm": 53.18167631861514, + "learning_rate": 6.025050210560712e-08, + "loss": 1.5569, + "step": 33633 + }, + { + "epoch": 2.8666155288502515, + "grad_norm": 29.835160958814374, + "learning_rate": 6.017378203684365e-08, + "loss": 1.0373, + "step": 33634 + }, + { + "epoch": 2.866700758544277, + "grad_norm": 40.46753006106187, + "learning_rate": 6.00971105493081e-08, + "loss": 0.7991, + "step": 33635 + }, + { + "epoch": 2.866785988238302, + "grad_norm": 60.06971325720393, + "learning_rate": 6.00204876437549e-08, + "loss": 1.5007, + "step": 33636 + }, + { + "epoch": 2.8668712179323275, + "grad_norm": 64.77431139878662, + "learning_rate": 5.99439133209373e-08, + "loss": 1.616, + "step": 33637 + }, + { + "epoch": 2.866956447626353, + "grad_norm": 90.46987594125147, + "learning_rate": 5.986738758160804e-08, + "loss": 2.1204, + "step": 33638 + }, + { + "epoch": 2.8670416773203784, + "grad_norm": 44.05404008893817, + "learning_rate": 5.979091042652041e-08, + "loss": 0.7808, + "step": 33639 + }, + { + "epoch": 2.867126907014404, + "grad_norm": 32.06657587294091, + "learning_rate": 5.971448185642548e-08, + "loss": 1.3779, + "step": 33640 + }, + { + "epoch": 2.8672121367084293, + "grad_norm": 38.96227302255463, + "learning_rate": 5.963810187207597e-08, + "loss": 1.1051, + "step": 33641 + }, + { + "epoch": 2.867297366402455, + "grad_norm": 74.79727135820961, + "learning_rate": 5.95617704742224e-08, + "loss": 1.2156, + "step": 33642 + }, + { + "epoch": 2.86738259609648, + "grad_norm": 66.34844247709732, + "learning_rate": 5.9485487663615837e-08, + "loss": 0.9931, + "step": 33643 + }, + { + "epoch": 2.8674678257905053, + "grad_norm": 38.54366077669631, + "learning_rate": 5.940925344100623e-08, + "loss": 1.0808, + "step": 33644 + }, + { + "epoch": 2.8675530554845308, + "grad_norm": 32.19075933302666, + "learning_rate": 5.933306780714354e-08, + "loss": 1.2153, + "step": 33645 + }, + { + "epoch": 2.8676382851785562, + "grad_norm": 68.84772841047794, + "learning_rate": 5.925693076277717e-08, + "loss": 2.0591, + "step": 33646 + }, + { + "epoch": 2.8677235148725817, + "grad_norm": 70.32545301859089, + "learning_rate": 5.918084230865539e-08, + "loss": 2.0837, + "step": 33647 + }, + { + "epoch": 2.867808744566607, + "grad_norm": 83.02022340391404, + "learning_rate": 5.910480244552652e-08, + "loss": 1.8714, + "step": 33648 + }, + { + "epoch": 2.8678939742606326, + "grad_norm": 45.33897293921581, + "learning_rate": 5.9028811174138276e-08, + "loss": 1.8026, + "step": 33649 + }, + { + "epoch": 2.8679792039546577, + "grad_norm": 21.181701861427918, + "learning_rate": 5.895286849523896e-08, + "loss": 0.6297, + "step": 33650 + }, + { + "epoch": 2.868064433648683, + "grad_norm": 59.657907680917276, + "learning_rate": 5.8876974409574626e-08, + "loss": 1.129, + "step": 33651 + }, + { + "epoch": 2.8681496633427086, + "grad_norm": 60.54970565384735, + "learning_rate": 5.880112891789191e-08, + "loss": 0.9617, + "step": 33652 + }, + { + "epoch": 2.868234893036734, + "grad_norm": 49.710122784547046, + "learning_rate": 5.872533202093633e-08, + "loss": 1.3229, + "step": 33653 + }, + { + "epoch": 2.8683201227307595, + "grad_norm": 63.02374746033465, + "learning_rate": 5.864958371945395e-08, + "loss": 1.8658, + "step": 33654 + }, + { + "epoch": 2.8684053524247846, + "grad_norm": 57.42559289137388, + "learning_rate": 5.857388401418973e-08, + "loss": 1.7788, + "step": 33655 + }, + { + "epoch": 2.86849058211881, + "grad_norm": 13.929765902066583, + "learning_rate": 5.849823290588697e-08, + "loss": 0.5895, + "step": 33656 + }, + { + "epoch": 2.8685758118128355, + "grad_norm": 32.84329204092915, + "learning_rate": 5.8422630395291166e-08, + "loss": 0.812, + "step": 33657 + }, + { + "epoch": 2.868661041506861, + "grad_norm": 87.76783609840119, + "learning_rate": 5.834707648314508e-08, + "loss": 1.4945, + "step": 33658 + }, + { + "epoch": 2.8687462712008864, + "grad_norm": 42.01417997542549, + "learning_rate": 5.8271571170192e-08, + "loss": 0.7715, + "step": 33659 + }, + { + "epoch": 2.868831500894912, + "grad_norm": 61.22661436373194, + "learning_rate": 5.81961144571741e-08, + "loss": 1.7308, + "step": 33660 + }, + { + "epoch": 2.8689167305889374, + "grad_norm": 38.87054184138254, + "learning_rate": 5.812070634483358e-08, + "loss": 1.0136, + "step": 33661 + }, + { + "epoch": 2.8690019602829624, + "grad_norm": 40.83755329473525, + "learning_rate": 5.804534683391261e-08, + "loss": 1.2366, + "step": 33662 + }, + { + "epoch": 2.869087189976988, + "grad_norm": 37.64119334047872, + "learning_rate": 5.7970035925151716e-08, + "loss": 1.0608, + "step": 33663 + }, + { + "epoch": 2.8691724196710133, + "grad_norm": 53.12855417460172, + "learning_rate": 5.789477361929197e-08, + "loss": 1.7432, + "step": 33664 + }, + { + "epoch": 2.869257649365039, + "grad_norm": 73.12092973472517, + "learning_rate": 5.781955991707278e-08, + "loss": 1.792, + "step": 33665 + }, + { + "epoch": 2.8693428790590643, + "grad_norm": 88.23407907843348, + "learning_rate": 5.774439481923466e-08, + "loss": 1.1713, + "step": 33666 + }, + { + "epoch": 2.8694281087530897, + "grad_norm": 50.9268757140202, + "learning_rate": 5.766927832651703e-08, + "loss": 0.8716, + "step": 33667 + }, + { + "epoch": 2.869513338447115, + "grad_norm": 45.914753789860235, + "learning_rate": 5.7594210439657626e-08, + "loss": 1.3408, + "step": 33668 + }, + { + "epoch": 2.8695985681411402, + "grad_norm": 59.74252699907033, + "learning_rate": 5.751919115939586e-08, + "loss": 1.5011, + "step": 33669 + }, + { + "epoch": 2.8696837978351657, + "grad_norm": 30.353721921785294, + "learning_rate": 5.74442204864678e-08, + "loss": 0.8292, + "step": 33670 + }, + { + "epoch": 2.869769027529191, + "grad_norm": 39.33621492774071, + "learning_rate": 5.736929842161287e-08, + "loss": 0.6862, + "step": 33671 + }, + { + "epoch": 2.8698542572232166, + "grad_norm": 22.151110241388565, + "learning_rate": 5.729442496556603e-08, + "loss": 0.6565, + "step": 33672 + }, + { + "epoch": 2.869939486917242, + "grad_norm": 27.674571534195938, + "learning_rate": 5.721960011906558e-08, + "loss": 1.1532, + "step": 33673 + }, + { + "epoch": 2.870024716611267, + "grad_norm": 64.6203646669301, + "learning_rate": 5.714482388284537e-08, + "loss": 1.3716, + "step": 33674 + }, + { + "epoch": 2.870109946305293, + "grad_norm": 67.19696852022089, + "learning_rate": 5.7070096257642036e-08, + "loss": 2.1815, + "step": 33675 + }, + { + "epoch": 2.870195175999318, + "grad_norm": 20.055031125662016, + "learning_rate": 5.6995417244189996e-08, + "loss": 0.7254, + "step": 33676 + }, + { + "epoch": 2.8702804056933435, + "grad_norm": 52.98719346180617, + "learning_rate": 5.692078684322422e-08, + "loss": 1.5123, + "step": 33677 + }, + { + "epoch": 2.870365635387369, + "grad_norm": 65.4979711710829, + "learning_rate": 5.6846205055478e-08, + "loss": 1.7081, + "step": 33678 + }, + { + "epoch": 2.8704508650813945, + "grad_norm": 50.57666218223826, + "learning_rate": 5.677167188168575e-08, + "loss": 1.4261, + "step": 33679 + }, + { + "epoch": 2.87053609477542, + "grad_norm": 77.29554791962235, + "learning_rate": 5.6697187322579116e-08, + "loss": 1.4409, + "step": 33680 + }, + { + "epoch": 2.870621324469445, + "grad_norm": 24.55506619277949, + "learning_rate": 5.6622751378891397e-08, + "loss": 0.7936, + "step": 33681 + }, + { + "epoch": 2.8707065541634704, + "grad_norm": 37.24876789948057, + "learning_rate": 5.654836405135533e-08, + "loss": 1.0742, + "step": 33682 + }, + { + "epoch": 2.870791783857496, + "grad_norm": 77.42991966022532, + "learning_rate": 5.64740253407009e-08, + "loss": 1.7138, + "step": 33683 + }, + { + "epoch": 2.8708770135515214, + "grad_norm": 45.043250408804056, + "learning_rate": 5.639973524766029e-08, + "loss": 0.9268, + "step": 33684 + }, + { + "epoch": 2.870962243245547, + "grad_norm": 72.57221434361301, + "learning_rate": 5.632549377296459e-08, + "loss": 1.9554, + "step": 33685 + }, + { + "epoch": 2.8710474729395723, + "grad_norm": 66.72519721101862, + "learning_rate": 5.625130091734265e-08, + "loss": 2.0314, + "step": 33686 + }, + { + "epoch": 2.8711327026335978, + "grad_norm": 53.20240231854887, + "learning_rate": 5.617715668152446e-08, + "loss": 1.7982, + "step": 33687 + }, + { + "epoch": 2.871217932327623, + "grad_norm": 72.77697333811626, + "learning_rate": 5.610306106623942e-08, + "loss": 1.867, + "step": 33688 + }, + { + "epoch": 2.8713031620216483, + "grad_norm": 45.303588572050344, + "learning_rate": 5.6029014072216947e-08, + "loss": 1.4015, + "step": 33689 + }, + { + "epoch": 2.8713883917156737, + "grad_norm": 53.54569352153178, + "learning_rate": 5.5955015700184243e-08, + "loss": 1.9293, + "step": 33690 + }, + { + "epoch": 2.871473621409699, + "grad_norm": 34.853491812826334, + "learning_rate": 5.588106595086906e-08, + "loss": 0.9397, + "step": 33691 + }, + { + "epoch": 2.8715588511037247, + "grad_norm": 92.59199689540058, + "learning_rate": 5.5807164824999705e-08, + "loss": 2.0057, + "step": 33692 + }, + { + "epoch": 2.8716440807977497, + "grad_norm": 31.277103953604655, + "learning_rate": 5.57333123233017e-08, + "loss": 0.6834, + "step": 33693 + }, + { + "epoch": 2.8717293104917756, + "grad_norm": 52.980543300716405, + "learning_rate": 5.565950844650225e-08, + "loss": 1.683, + "step": 33694 + }, + { + "epoch": 2.8718145401858006, + "grad_norm": 49.13381413966347, + "learning_rate": 5.558575319532633e-08, + "loss": 1.1938, + "step": 33695 + }, + { + "epoch": 2.871899769879826, + "grad_norm": 49.637898249860164, + "learning_rate": 5.551204657050002e-08, + "loss": 1.3809, + "step": 33696 + }, + { + "epoch": 2.8719849995738516, + "grad_norm": 75.54520832682131, + "learning_rate": 5.54383885727483e-08, + "loss": 1.6275, + "step": 33697 + }, + { + "epoch": 2.872070229267877, + "grad_norm": 70.52005480671956, + "learning_rate": 5.536477920279504e-08, + "loss": 1.5062, + "step": 33698 + }, + { + "epoch": 2.8721554589619025, + "grad_norm": 53.94342539647541, + "learning_rate": 5.529121846136465e-08, + "loss": 1.1856, + "step": 33699 + }, + { + "epoch": 2.8722406886559275, + "grad_norm": 78.92862759559752, + "learning_rate": 5.521770634917989e-08, + "loss": 1.5372, + "step": 33700 + }, + { + "epoch": 2.872325918349953, + "grad_norm": 54.99750077313316, + "learning_rate": 5.5144242866964624e-08, + "loss": 1.1586, + "step": 33701 + }, + { + "epoch": 2.8724111480439785, + "grad_norm": 17.884137617106234, + "learning_rate": 5.5070828015440506e-08, + "loss": 0.3328, + "step": 33702 + }, + { + "epoch": 2.872496377738004, + "grad_norm": 37.5346832508397, + "learning_rate": 5.499746179532972e-08, + "loss": 1.2743, + "step": 33703 + }, + { + "epoch": 2.8725816074320294, + "grad_norm": 28.92219313372496, + "learning_rate": 5.492414420735337e-08, + "loss": 1.0067, + "step": 33704 + }, + { + "epoch": 2.872666837126055, + "grad_norm": 21.15192188427162, + "learning_rate": 5.4850875252234205e-08, + "loss": 1.0008, + "step": 33705 + }, + { + "epoch": 2.8727520668200803, + "grad_norm": 34.89764712424089, + "learning_rate": 5.4777654930691094e-08, + "loss": 0.9272, + "step": 33706 + }, + { + "epoch": 2.8728372965141054, + "grad_norm": 58.87961822612621, + "learning_rate": 5.470448324344513e-08, + "loss": 1.721, + "step": 33707 + }, + { + "epoch": 2.872922526208131, + "grad_norm": 40.9919703199848, + "learning_rate": 5.463136019121518e-08, + "loss": 1.1102, + "step": 33708 + }, + { + "epoch": 2.8730077559021563, + "grad_norm": 52.387030407071975, + "learning_rate": 5.4558285774720665e-08, + "loss": 1.5985, + "step": 33709 + }, + { + "epoch": 2.8730929855961818, + "grad_norm": 89.00702162784695, + "learning_rate": 5.448525999468046e-08, + "loss": 0.9495, + "step": 33710 + }, + { + "epoch": 2.8731782152902072, + "grad_norm": 77.06229723647506, + "learning_rate": 5.4412282851812326e-08, + "loss": 1.6735, + "step": 33711 + }, + { + "epoch": 2.8732634449842323, + "grad_norm": 29.780473195956407, + "learning_rate": 5.433935434683457e-08, + "loss": 1.0572, + "step": 33712 + }, + { + "epoch": 2.873348674678258, + "grad_norm": 59.59518257723901, + "learning_rate": 5.426647448046385e-08, + "loss": 1.4497, + "step": 33713 + }, + { + "epoch": 2.873433904372283, + "grad_norm": 67.15759505756486, + "learning_rate": 5.419364325341736e-08, + "loss": 1.3566, + "step": 33714 + }, + { + "epoch": 2.8735191340663087, + "grad_norm": 58.926734334506406, + "learning_rate": 5.412086066641065e-08, + "loss": 1.7697, + "step": 33715 + }, + { + "epoch": 2.873604363760334, + "grad_norm": 67.17226472459154, + "learning_rate": 5.404812672016035e-08, + "loss": 1.9692, + "step": 33716 + }, + { + "epoch": 2.8736895934543596, + "grad_norm": 60.227987218235086, + "learning_rate": 5.397544141538091e-08, + "loss": 1.9447, + "step": 33717 + }, + { + "epoch": 2.873774823148385, + "grad_norm": 39.94807006925382, + "learning_rate": 5.3902804752788415e-08, + "loss": 0.9646, + "step": 33718 + }, + { + "epoch": 2.87386005284241, + "grad_norm": 36.22083842136545, + "learning_rate": 5.383021673309563e-08, + "loss": 0.8679, + "step": 33719 + }, + { + "epoch": 2.8739452825364356, + "grad_norm": 20.70933290656024, + "learning_rate": 5.3757677357017535e-08, + "loss": 0.5055, + "step": 33720 + }, + { + "epoch": 2.874030512230461, + "grad_norm": 34.165994364477825, + "learning_rate": 5.3685186625267447e-08, + "loss": 0.9325, + "step": 33721 + }, + { + "epoch": 2.8741157419244865, + "grad_norm": 44.49332185432095, + "learning_rate": 5.3612744538558136e-08, + "loss": 0.5435, + "step": 33722 + }, + { + "epoch": 2.874200971618512, + "grad_norm": 62.98021581307865, + "learning_rate": 5.3540351097601804e-08, + "loss": 1.6549, + "step": 33723 + }, + { + "epoch": 2.8742862013125374, + "grad_norm": 47.00334231010442, + "learning_rate": 5.346800630311066e-08, + "loss": 1.1016, + "step": 33724 + }, + { + "epoch": 2.874371431006563, + "grad_norm": 72.95827627908068, + "learning_rate": 5.339571015579636e-08, + "loss": 1.945, + "step": 33725 + }, + { + "epoch": 2.874456660700588, + "grad_norm": 49.874568319164894, + "learning_rate": 5.33234626563689e-08, + "loss": 1.3622, + "step": 33726 + }, + { + "epoch": 2.8745418903946134, + "grad_norm": 35.184734693637424, + "learning_rate": 5.325126380553991e-08, + "loss": 1.6733, + "step": 33727 + }, + { + "epoch": 2.874627120088639, + "grad_norm": 52.469382841502494, + "learning_rate": 5.3179113604019395e-08, + "loss": 1.4751, + "step": 33728 + }, + { + "epoch": 2.8747123497826643, + "grad_norm": 56.77304701204819, + "learning_rate": 5.310701205251678e-08, + "loss": 1.2459, + "step": 33729 + }, + { + "epoch": 2.87479757947669, + "grad_norm": 41.65031550868161, + "learning_rate": 5.3034959151740395e-08, + "loss": 1.1236, + "step": 33730 + }, + { + "epoch": 2.874882809170715, + "grad_norm": 59.79247097687169, + "learning_rate": 5.296295490239967e-08, + "loss": 1.563, + "step": 33731 + }, + { + "epoch": 2.8749680388647407, + "grad_norm": 77.58283791874902, + "learning_rate": 5.2890999305202915e-08, + "loss": 2.2656, + "step": 33732 + }, + { + "epoch": 2.8750532685587658, + "grad_norm": 45.069217229309615, + "learning_rate": 5.28190923608568e-08, + "loss": 0.8467, + "step": 33733 + }, + { + "epoch": 2.875138498252791, + "grad_norm": 99.55923757827787, + "learning_rate": 5.27472340700691e-08, + "loss": 2.3374, + "step": 33734 + }, + { + "epoch": 2.8752237279468167, + "grad_norm": 46.959386977643106, + "learning_rate": 5.267542443354645e-08, + "loss": 1.7332, + "step": 33735 + }, + { + "epoch": 2.875308957640842, + "grad_norm": 75.07116809104431, + "learning_rate": 5.260366345199552e-08, + "loss": 1.3858, + "step": 33736 + }, + { + "epoch": 2.8753941873348676, + "grad_norm": 76.42752791331688, + "learning_rate": 5.253195112612186e-08, + "loss": 1.8041, + "step": 33737 + }, + { + "epoch": 2.8754794170288926, + "grad_norm": 64.07578122263055, + "learning_rate": 5.24602874566299e-08, + "loss": 1.3541, + "step": 33738 + }, + { + "epoch": 2.875564646722918, + "grad_norm": 53.41809063065556, + "learning_rate": 5.238867244422574e-08, + "loss": 1.5367, + "step": 33739 + }, + { + "epoch": 2.8756498764169436, + "grad_norm": 94.8001010018095, + "learning_rate": 5.231710608961271e-08, + "loss": 2.6927, + "step": 33740 + }, + { + "epoch": 2.875735106110969, + "grad_norm": 57.966346638782746, + "learning_rate": 5.224558839349525e-08, + "loss": 1.8473, + "step": 33741 + }, + { + "epoch": 2.8758203358049945, + "grad_norm": 78.23155451670075, + "learning_rate": 5.217411935657557e-08, + "loss": 1.2932, + "step": 33742 + }, + { + "epoch": 2.87590556549902, + "grad_norm": 49.55033083014221, + "learning_rate": 5.2102698979557555e-08, + "loss": 1.4255, + "step": 33743 + }, + { + "epoch": 2.8759907951930455, + "grad_norm": 51.51132111568178, + "learning_rate": 5.203132726314397e-08, + "loss": 1.2815, + "step": 33744 + }, + { + "epoch": 2.8760760248870705, + "grad_norm": 34.375319843313704, + "learning_rate": 5.196000420803593e-08, + "loss": 0.9981, + "step": 33745 + }, + { + "epoch": 2.876161254581096, + "grad_norm": 33.86217744515549, + "learning_rate": 5.188872981493509e-08, + "loss": 1.0776, + "step": 33746 + }, + { + "epoch": 2.8762464842751214, + "grad_norm": 25.488405559243386, + "learning_rate": 5.1817504084542e-08, + "loss": 0.9056, + "step": 33747 + }, + { + "epoch": 2.876331713969147, + "grad_norm": 47.579482185327485, + "learning_rate": 5.1746327017557776e-08, + "loss": 1.0157, + "step": 33748 + }, + { + "epoch": 2.8764169436631724, + "grad_norm": 79.18754525372314, + "learning_rate": 5.16751986146824e-08, + "loss": 2.0895, + "step": 33749 + }, + { + "epoch": 2.8765021733571974, + "grad_norm": 47.08716201763101, + "learning_rate": 5.160411887661476e-08, + "loss": 1.2936, + "step": 33750 + }, + { + "epoch": 2.8765874030512233, + "grad_norm": 47.61369655546877, + "learning_rate": 5.1533087804054304e-08, + "loss": 1.5869, + "step": 33751 + }, + { + "epoch": 2.8766726327452483, + "grad_norm": 69.25059557786388, + "learning_rate": 5.1462105397699913e-08, + "loss": 1.3611, + "step": 33752 + }, + { + "epoch": 2.876757862439274, + "grad_norm": 31.50536285408762, + "learning_rate": 5.139117165824936e-08, + "loss": 1.1256, + "step": 33753 + }, + { + "epoch": 2.8768430921332993, + "grad_norm": 92.76571640014272, + "learning_rate": 5.1320286586400424e-08, + "loss": 1.2326, + "step": 33754 + }, + { + "epoch": 2.8769283218273247, + "grad_norm": 55.82610116579449, + "learning_rate": 5.124945018284977e-08, + "loss": 1.2088, + "step": 33755 + }, + { + "epoch": 2.87701355152135, + "grad_norm": 39.87739183932359, + "learning_rate": 5.117866244829461e-08, + "loss": 0.7186, + "step": 33756 + }, + { + "epoch": 2.877098781215375, + "grad_norm": 49.76918961403761, + "learning_rate": 5.1107923383430516e-08, + "loss": 0.817, + "step": 33757 + }, + { + "epoch": 2.8771840109094007, + "grad_norm": 60.67890907976389, + "learning_rate": 5.103723298895358e-08, + "loss": 1.6262, + "step": 33758 + }, + { + "epoch": 2.877269240603426, + "grad_norm": 64.56626572985728, + "learning_rate": 5.0966591265558806e-08, + "loss": 1.3964, + "step": 33759 + }, + { + "epoch": 2.8773544702974516, + "grad_norm": 67.45373352559857, + "learning_rate": 5.08959982139412e-08, + "loss": 1.8426, + "step": 33760 + }, + { + "epoch": 2.877439699991477, + "grad_norm": 80.69821273654395, + "learning_rate": 5.082545383479465e-08, + "loss": 2.1432, + "step": 33761 + }, + { + "epoch": 2.8775249296855026, + "grad_norm": 68.03651592786572, + "learning_rate": 5.0754958128813594e-08, + "loss": 1.4737, + "step": 33762 + }, + { + "epoch": 2.877610159379528, + "grad_norm": 51.133064182550804, + "learning_rate": 5.0684511096690816e-08, + "loss": 1.3116, + "step": 33763 + }, + { + "epoch": 2.877695389073553, + "grad_norm": 76.12017077039937, + "learning_rate": 5.061411273911909e-08, + "loss": 1.4966, + "step": 33764 + }, + { + "epoch": 2.8777806187675785, + "grad_norm": 65.64530544110767, + "learning_rate": 5.0543763056790654e-08, + "loss": 1.3416, + "step": 33765 + }, + { + "epoch": 2.877865848461604, + "grad_norm": 25.00621052767325, + "learning_rate": 5.047346205039771e-08, + "loss": 0.6034, + "step": 33766 + }, + { + "epoch": 2.8779510781556294, + "grad_norm": 89.91027195525142, + "learning_rate": 5.0403209720631394e-08, + "loss": 2.0762, + "step": 33767 + }, + { + "epoch": 2.878036307849655, + "grad_norm": 29.55748215274722, + "learning_rate": 5.0333006068183366e-08, + "loss": 0.8541, + "step": 33768 + }, + { + "epoch": 2.87812153754368, + "grad_norm": 80.49217257057539, + "learning_rate": 5.026285109374251e-08, + "loss": 1.4667, + "step": 33769 + }, + { + "epoch": 2.878206767237706, + "grad_norm": 74.93851703757853, + "learning_rate": 5.0192744798000515e-08, + "loss": 1.9716, + "step": 33770 + }, + { + "epoch": 2.878291996931731, + "grad_norm": 43.57871983296585, + "learning_rate": 5.01226871816457e-08, + "loss": 1.1423, + "step": 33771 + }, + { + "epoch": 2.8783772266257563, + "grad_norm": 46.10922532046411, + "learning_rate": 5.0052678245367526e-08, + "loss": 1.1581, + "step": 33772 + }, + { + "epoch": 2.878462456319782, + "grad_norm": 57.43549765463646, + "learning_rate": 4.998271798985377e-08, + "loss": 1.9397, + "step": 33773 + }, + { + "epoch": 2.8785476860138073, + "grad_norm": 31.453797726208823, + "learning_rate": 4.991280641579332e-08, + "loss": 1.0642, + "step": 33774 + }, + { + "epoch": 2.8786329157078328, + "grad_norm": 99.35893691945611, + "learning_rate": 4.9842943523873424e-08, + "loss": 1.9097, + "step": 33775 + }, + { + "epoch": 2.8787181454018578, + "grad_norm": 30.385824911018165, + "learning_rate": 4.977312931478129e-08, + "loss": 0.689, + "step": 33776 + }, + { + "epoch": 2.8788033750958832, + "grad_norm": 44.20957167557523, + "learning_rate": 4.970336378920304e-08, + "loss": 1.2491, + "step": 33777 + }, + { + "epoch": 2.8788886047899087, + "grad_norm": 62.02869597699882, + "learning_rate": 4.963364694782591e-08, + "loss": 1.6394, + "step": 33778 + }, + { + "epoch": 2.878973834483934, + "grad_norm": 54.39028814765318, + "learning_rate": 4.956397879133434e-08, + "loss": 1.2788, + "step": 33779 + }, + { + "epoch": 2.8790590641779596, + "grad_norm": 88.4186942653817, + "learning_rate": 4.949435932041391e-08, + "loss": 2.5632, + "step": 33780 + }, + { + "epoch": 2.879144293871985, + "grad_norm": 81.92321264280748, + "learning_rate": 4.9424788535749616e-08, + "loss": 2.5458, + "step": 33781 + }, + { + "epoch": 2.8792295235660106, + "grad_norm": 57.653445444835, + "learning_rate": 4.93552664380248e-08, + "loss": 1.5532, + "step": 33782 + }, + { + "epoch": 2.8793147532600356, + "grad_norm": 50.99463702517139, + "learning_rate": 4.928579302792391e-08, + "loss": 1.5343, + "step": 33783 + }, + { + "epoch": 2.879399982954061, + "grad_norm": 25.253079447001294, + "learning_rate": 4.92163683061303e-08, + "loss": 0.5725, + "step": 33784 + }, + { + "epoch": 2.8794852126480865, + "grad_norm": 79.57241833114875, + "learning_rate": 4.914699227332675e-08, + "loss": 1.3876, + "step": 33785 + }, + { + "epoch": 2.879570442342112, + "grad_norm": 63.68911211687787, + "learning_rate": 4.9077664930194394e-08, + "loss": 1.482, + "step": 33786 + }, + { + "epoch": 2.8796556720361375, + "grad_norm": 60.010253976756466, + "learning_rate": 4.9008386277416553e-08, + "loss": 1.4123, + "step": 33787 + }, + { + "epoch": 2.879740901730163, + "grad_norm": 62.403102187714445, + "learning_rate": 4.893915631567381e-08, + "loss": 1.3968, + "step": 33788 + }, + { + "epoch": 2.8798261314241884, + "grad_norm": 66.00683004686083, + "learning_rate": 4.886997504564672e-08, + "loss": 1.7878, + "step": 33789 + }, + { + "epoch": 2.8799113611182134, + "grad_norm": 35.257230085211155, + "learning_rate": 4.880084246801586e-08, + "loss": 0.7551, + "step": 33790 + }, + { + "epoch": 2.879996590812239, + "grad_norm": 40.27660392852978, + "learning_rate": 4.873175858346235e-08, + "loss": 1.177, + "step": 33791 + }, + { + "epoch": 2.8800818205062644, + "grad_norm": 86.53458425298021, + "learning_rate": 4.866272339266398e-08, + "loss": 1.7454, + "step": 33792 + }, + { + "epoch": 2.88016705020029, + "grad_norm": 39.56376756935171, + "learning_rate": 4.85937368963002e-08, + "loss": 1.3227, + "step": 33793 + }, + { + "epoch": 2.8802522798943153, + "grad_norm": 33.8312374645425, + "learning_rate": 4.852479909504937e-08, + "loss": 0.8755, + "step": 33794 + }, + { + "epoch": 2.8803375095883403, + "grad_norm": 28.254771682146536, + "learning_rate": 4.8455909989589824e-08, + "loss": 0.8823, + "step": 33795 + }, + { + "epoch": 2.8804227392823663, + "grad_norm": 62.91856436181148, + "learning_rate": 4.83870695805988e-08, + "loss": 1.9884, + "step": 33796 + }, + { + "epoch": 2.8805079689763913, + "grad_norm": 24.389397948441513, + "learning_rate": 4.831827786875354e-08, + "loss": 0.8094, + "step": 33797 + }, + { + "epoch": 2.8805931986704167, + "grad_norm": 56.04447420040395, + "learning_rate": 4.824953485473016e-08, + "loss": 1.4607, + "step": 33798 + }, + { + "epoch": 2.880678428364442, + "grad_norm": 57.75779705483295, + "learning_rate": 4.81808405392048e-08, + "loss": 1.2254, + "step": 33799 + }, + { + "epoch": 2.8807636580584677, + "grad_norm": 69.0100769700015, + "learning_rate": 4.811219492285357e-08, + "loss": 1.671, + "step": 33800 + }, + { + "epoch": 2.880848887752493, + "grad_norm": 18.826772024561343, + "learning_rate": 4.80435980063515e-08, + "loss": 0.7774, + "step": 33801 + }, + { + "epoch": 2.880934117446518, + "grad_norm": 60.20786096244333, + "learning_rate": 4.797504979037249e-08, + "loss": 1.9889, + "step": 33802 + }, + { + "epoch": 2.8810193471405436, + "grad_norm": 27.497059447593713, + "learning_rate": 4.790655027559099e-08, + "loss": 0.7697, + "step": 33803 + }, + { + "epoch": 2.881104576834569, + "grad_norm": 36.59770412399763, + "learning_rate": 4.7838099462681474e-08, + "loss": 0.8807, + "step": 33804 + }, + { + "epoch": 2.8811898065285946, + "grad_norm": 33.30314000342573, + "learning_rate": 4.776969735231563e-08, + "loss": 1.081, + "step": 33805 + }, + { + "epoch": 2.88127503622262, + "grad_norm": 44.78536952807985, + "learning_rate": 4.7701343945167344e-08, + "loss": 0.8639, + "step": 33806 + }, + { + "epoch": 2.8813602659166455, + "grad_norm": 46.83819978821171, + "learning_rate": 4.763303924190832e-08, + "loss": 1.1306, + "step": 33807 + }, + { + "epoch": 2.881445495610671, + "grad_norm": 68.57792800020736, + "learning_rate": 4.756478324321079e-08, + "loss": 1.9171, + "step": 33808 + }, + { + "epoch": 2.881530725304696, + "grad_norm": 76.80130681075718, + "learning_rate": 4.749657594974588e-08, + "loss": 2.04, + "step": 33809 + }, + { + "epoch": 2.8816159549987215, + "grad_norm": 43.58702190052201, + "learning_rate": 4.742841736218362e-08, + "loss": 1.2281, + "step": 33810 + }, + { + "epoch": 2.881701184692747, + "grad_norm": 50.72026324996844, + "learning_rate": 4.736030748119513e-08, + "loss": 1.1549, + "step": 33811 + }, + { + "epoch": 2.8817864143867724, + "grad_norm": 51.95419138207267, + "learning_rate": 4.729224630745044e-08, + "loss": 0.9434, + "step": 33812 + }, + { + "epoch": 2.881871644080798, + "grad_norm": 61.30280111758649, + "learning_rate": 4.72242338416179e-08, + "loss": 1.769, + "step": 33813 + }, + { + "epoch": 2.881956873774823, + "grad_norm": 55.640621681704104, + "learning_rate": 4.7156270084367515e-08, + "loss": 1.4779, + "step": 33814 + }, + { + "epoch": 2.882042103468849, + "grad_norm": 40.16690640658632, + "learning_rate": 4.7088355036366554e-08, + "loss": 1.2451, + "step": 33815 + }, + { + "epoch": 2.882127333162874, + "grad_norm": 61.00976290813322, + "learning_rate": 4.702048869828391e-08, + "loss": 1.1957, + "step": 33816 + }, + { + "epoch": 2.8822125628568993, + "grad_norm": 29.551532400363037, + "learning_rate": 4.695267107078683e-08, + "loss": 1.4837, + "step": 33817 + }, + { + "epoch": 2.8822977925509248, + "grad_norm": 55.9842250581501, + "learning_rate": 4.688490215454145e-08, + "loss": 1.0174, + "step": 33818 + }, + { + "epoch": 2.8823830222449502, + "grad_norm": 35.456040557868164, + "learning_rate": 4.681718195021556e-08, + "loss": 0.7501, + "step": 33819 + }, + { + "epoch": 2.8824682519389757, + "grad_norm": 111.15388632625294, + "learning_rate": 4.67495104584742e-08, + "loss": 2.1355, + "step": 33820 + }, + { + "epoch": 2.8825534816330007, + "grad_norm": 26.4656759362084, + "learning_rate": 4.668188767998294e-08, + "loss": 0.7874, + "step": 33821 + }, + { + "epoch": 2.882638711327026, + "grad_norm": 55.508570263605876, + "learning_rate": 4.661431361540736e-08, + "loss": 1.737, + "step": 33822 + }, + { + "epoch": 2.8827239410210517, + "grad_norm": 30.48691507801764, + "learning_rate": 4.654678826541137e-08, + "loss": 0.8345, + "step": 33823 + }, + { + "epoch": 2.882809170715077, + "grad_norm": 70.85097839554312, + "learning_rate": 4.6479311630659994e-08, + "loss": 1.4008, + "step": 33824 + }, + { + "epoch": 2.8828944004091026, + "grad_norm": 31.400753935787794, + "learning_rate": 4.641188371181604e-08, + "loss": 0.6986, + "step": 33825 + }, + { + "epoch": 2.882979630103128, + "grad_norm": 59.61736331679763, + "learning_rate": 4.6344504509542863e-08, + "loss": 1.5363, + "step": 33826 + }, + { + "epoch": 2.8830648597971535, + "grad_norm": 68.50827364171714, + "learning_rate": 4.627717402450271e-08, + "loss": 1.8858, + "step": 33827 + }, + { + "epoch": 2.8831500894911786, + "grad_norm": 93.42496606397714, + "learning_rate": 4.620989225735839e-08, + "loss": 1.9, + "step": 33828 + }, + { + "epoch": 2.883235319185204, + "grad_norm": 54.779609777199056, + "learning_rate": 4.6142659208771035e-08, + "loss": 1.4836, + "step": 33829 + }, + { + "epoch": 2.8833205488792295, + "grad_norm": 68.68909843228664, + "learning_rate": 4.607547487940289e-08, + "loss": 1.9001, + "step": 33830 + }, + { + "epoch": 2.883405778573255, + "grad_norm": 33.33708542955776, + "learning_rate": 4.600833926991344e-08, + "loss": 0.9825, + "step": 33831 + }, + { + "epoch": 2.8834910082672804, + "grad_norm": 32.751792913709146, + "learning_rate": 4.594125238096381e-08, + "loss": 0.9819, + "step": 33832 + }, + { + "epoch": 2.8835762379613055, + "grad_norm": 74.40162307758386, + "learning_rate": 4.587421421321292e-08, + "loss": 1.6248, + "step": 33833 + }, + { + "epoch": 2.8836614676553314, + "grad_norm": 25.00781581780378, + "learning_rate": 4.58072247673208e-08, + "loss": 1.5102, + "step": 33834 + }, + { + "epoch": 2.8837466973493564, + "grad_norm": 28.280686597077523, + "learning_rate": 4.574028404394581e-08, + "loss": 0.9268, + "step": 33835 + }, + { + "epoch": 2.883831927043382, + "grad_norm": 53.38529890668816, + "learning_rate": 4.567339204374688e-08, + "loss": 1.3642, + "step": 33836 + }, + { + "epoch": 2.8839171567374073, + "grad_norm": 36.989871538393004, + "learning_rate": 4.560654876738069e-08, + "loss": 0.8131, + "step": 33837 + }, + { + "epoch": 2.884002386431433, + "grad_norm": 43.50500569126851, + "learning_rate": 4.553975421550616e-08, + "loss": 1.1845, + "step": 33838 + }, + { + "epoch": 2.8840876161254583, + "grad_norm": 54.8322986488724, + "learning_rate": 4.5473008388779436e-08, + "loss": 1.7214, + "step": 33839 + }, + { + "epoch": 2.8841728458194833, + "grad_norm": 46.210645258028705, + "learning_rate": 4.540631128785722e-08, + "loss": 1.2785, + "step": 33840 + }, + { + "epoch": 2.8842580755135088, + "grad_norm": 118.14958459533894, + "learning_rate": 4.533966291339453e-08, + "loss": 2.6079, + "step": 33841 + }, + { + "epoch": 2.8843433052075342, + "grad_norm": 35.96919875575105, + "learning_rate": 4.5273063266047525e-08, + "loss": 1.2574, + "step": 33842 + }, + { + "epoch": 2.8844285349015597, + "grad_norm": 51.85562569120383, + "learning_rate": 4.520651234647122e-08, + "loss": 2.7935, + "step": 33843 + }, + { + "epoch": 2.884513764595585, + "grad_norm": 52.85006756482283, + "learning_rate": 4.51400101553201e-08, + "loss": 1.3255, + "step": 33844 + }, + { + "epoch": 2.8845989942896106, + "grad_norm": 76.72834292332968, + "learning_rate": 4.507355669324809e-08, + "loss": 1.4287, + "step": 33845 + }, + { + "epoch": 2.884684223983636, + "grad_norm": 64.41874371965739, + "learning_rate": 4.5007151960908545e-08, + "loss": 2.1137, + "step": 33846 + }, + { + "epoch": 2.884769453677661, + "grad_norm": 67.48598501575445, + "learning_rate": 4.49407959589554e-08, + "loss": 1.4905, + "step": 33847 + }, + { + "epoch": 2.8848546833716866, + "grad_norm": 36.124611002616255, + "learning_rate": 4.487448868804034e-08, + "loss": 1.1764, + "step": 33848 + }, + { + "epoch": 2.884939913065712, + "grad_norm": 44.244210536092005, + "learning_rate": 4.4808230148815636e-08, + "loss": 0.8192, + "step": 33849 + }, + { + "epoch": 2.8850251427597375, + "grad_norm": 29.4778937984372, + "learning_rate": 4.474202034193242e-08, + "loss": 0.9704, + "step": 33850 + }, + { + "epoch": 2.885110372453763, + "grad_norm": 29.150236754188885, + "learning_rate": 4.467585926804297e-08, + "loss": 0.7789, + "step": 33851 + }, + { + "epoch": 2.885195602147788, + "grad_norm": 27.445883034595806, + "learning_rate": 4.460974692779729e-08, + "loss": 0.939, + "step": 33852 + }, + { + "epoch": 2.885280831841814, + "grad_norm": 32.69506983641503, + "learning_rate": 4.454368332184544e-08, + "loss": 0.7339, + "step": 33853 + }, + { + "epoch": 2.885366061535839, + "grad_norm": 41.56367065333718, + "learning_rate": 4.447766845083745e-08, + "loss": 1.1192, + "step": 33854 + }, + { + "epoch": 2.8854512912298644, + "grad_norm": 50.83519582893209, + "learning_rate": 4.44117023154228e-08, + "loss": 1.3942, + "step": 33855 + }, + { + "epoch": 2.88553652092389, + "grad_norm": 36.79677710446709, + "learning_rate": 4.4345784916249855e-08, + "loss": 1.4029, + "step": 33856 + }, + { + "epoch": 2.8856217506179154, + "grad_norm": 27.21047251222835, + "learning_rate": 4.4279916253966434e-08, + "loss": 0.7742, + "step": 33857 + }, + { + "epoch": 2.885706980311941, + "grad_norm": 53.53616829610731, + "learning_rate": 4.421409632922147e-08, + "loss": 1.093, + "step": 33858 + }, + { + "epoch": 2.885792210005966, + "grad_norm": 41.495533913437576, + "learning_rate": 4.4148325142661094e-08, + "loss": 1.0999, + "step": 33859 + }, + { + "epoch": 2.8858774396999913, + "grad_norm": 58.29369112715012, + "learning_rate": 4.408260269493259e-08, + "loss": 1.5135, + "step": 33860 + }, + { + "epoch": 2.885962669394017, + "grad_norm": 62.592128589003686, + "learning_rate": 4.4016928986683196e-08, + "loss": 1.7054, + "step": 33861 + }, + { + "epoch": 2.8860478990880423, + "grad_norm": 57.146825360223204, + "learning_rate": 4.3951304018557405e-08, + "loss": 1.6115, + "step": 33862 + }, + { + "epoch": 2.8861331287820677, + "grad_norm": 58.832912744387194, + "learning_rate": 4.388572779120082e-08, + "loss": 1.3555, + "step": 33863 + }, + { + "epoch": 2.886218358476093, + "grad_norm": 37.18276578691337, + "learning_rate": 4.382020030525957e-08, + "loss": 1.1117, + "step": 33864 + }, + { + "epoch": 2.8863035881701187, + "grad_norm": 62.22923840275012, + "learning_rate": 4.375472156137706e-08, + "loss": 1.2369, + "step": 33865 + }, + { + "epoch": 2.8863888178641437, + "grad_norm": 53.02455868491964, + "learning_rate": 4.368929156019719e-08, + "loss": 1.4941, + "step": 33866 + }, + { + "epoch": 2.886474047558169, + "grad_norm": 23.09764340390331, + "learning_rate": 4.36239103023639e-08, + "loss": 0.5295, + "step": 33867 + }, + { + "epoch": 2.8865592772521946, + "grad_norm": 21.563208185482807, + "learning_rate": 4.355857778851946e-08, + "loss": 0.8943, + "step": 33868 + }, + { + "epoch": 2.88664450694622, + "grad_norm": 56.46598809182192, + "learning_rate": 4.349329401930724e-08, + "loss": 1.3, + "step": 33869 + }, + { + "epoch": 2.8867297366402456, + "grad_norm": 57.45726825764487, + "learning_rate": 4.34280589953695e-08, + "loss": 1.6873, + "step": 33870 + }, + { + "epoch": 2.8868149663342706, + "grad_norm": 33.38091042350174, + "learning_rate": 4.336287271734685e-08, + "loss": 0.9885, + "step": 33871 + }, + { + "epoch": 2.8869001960282965, + "grad_norm": 71.51440079856292, + "learning_rate": 4.329773518588043e-08, + "loss": 1.4319, + "step": 33872 + }, + { + "epoch": 2.8869854257223215, + "grad_norm": 98.81111985213388, + "learning_rate": 4.32326464016114e-08, + "loss": 2.3474, + "step": 33873 + }, + { + "epoch": 2.887070655416347, + "grad_norm": 82.57744138721537, + "learning_rate": 4.3167606365179806e-08, + "loss": 2.2495, + "step": 33874 + }, + { + "epoch": 2.8871558851103725, + "grad_norm": 34.387897639739975, + "learning_rate": 4.310261507722513e-08, + "loss": 0.8871, + "step": 33875 + }, + { + "epoch": 2.887241114804398, + "grad_norm": 40.2578163786699, + "learning_rate": 4.303767253838631e-08, + "loss": 1.3042, + "step": 33876 + }, + { + "epoch": 2.8873263444984234, + "grad_norm": 36.6459384460631, + "learning_rate": 4.297277874930228e-08, + "loss": 1.3822, + "step": 33877 + }, + { + "epoch": 2.8874115741924484, + "grad_norm": 74.87842771114441, + "learning_rate": 4.290793371061197e-08, + "loss": 2.2361, + "step": 33878 + }, + { + "epoch": 2.887496803886474, + "grad_norm": 50.523518919870725, + "learning_rate": 4.2843137422951545e-08, + "loss": 1.4462, + "step": 33879 + }, + { + "epoch": 2.8875820335804994, + "grad_norm": 47.62480754246868, + "learning_rate": 4.277838988695937e-08, + "loss": 1.4437, + "step": 33880 + }, + { + "epoch": 2.887667263274525, + "grad_norm": 27.260720764404063, + "learning_rate": 4.271369110327217e-08, + "loss": 0.6576, + "step": 33881 + }, + { + "epoch": 2.8877524929685503, + "grad_norm": 37.29659585886815, + "learning_rate": 4.2649041072525545e-08, + "loss": 1.0334, + "step": 33882 + }, + { + "epoch": 2.8878377226625758, + "grad_norm": 59.50650579404024, + "learning_rate": 4.258443979535565e-08, + "loss": 1.5099, + "step": 33883 + }, + { + "epoch": 2.8879229523566012, + "grad_norm": 39.7433850220526, + "learning_rate": 4.2519887272398086e-08, + "loss": 0.9621, + "step": 33884 + }, + { + "epoch": 2.8880081820506263, + "grad_norm": 43.224513795108415, + "learning_rate": 4.2455383504287906e-08, + "loss": 1.3478, + "step": 33885 + }, + { + "epoch": 2.8880934117446517, + "grad_norm": 66.03455333411677, + "learning_rate": 4.2390928491659046e-08, + "loss": 1.5918, + "step": 33886 + }, + { + "epoch": 2.888178641438677, + "grad_norm": 50.421189299815914, + "learning_rate": 4.232652223514488e-08, + "loss": 1.2532, + "step": 33887 + }, + { + "epoch": 2.8882638711327027, + "grad_norm": 56.627125397022404, + "learning_rate": 4.226216473537992e-08, + "loss": 1.131, + "step": 33888 + }, + { + "epoch": 2.888349100826728, + "grad_norm": 51.08834978473176, + "learning_rate": 4.219785599299642e-08, + "loss": 1.0978, + "step": 33889 + }, + { + "epoch": 2.888434330520753, + "grad_norm": 66.39451355188132, + "learning_rate": 4.2133596008626656e-08, + "loss": 1.2139, + "step": 33890 + }, + { + "epoch": 2.888519560214779, + "grad_norm": 38.76715986378123, + "learning_rate": 4.2069384782903455e-08, + "loss": 1.0655, + "step": 33891 + }, + { + "epoch": 2.888604789908804, + "grad_norm": 90.4092341034573, + "learning_rate": 4.200522231645743e-08, + "loss": 1.9655, + "step": 33892 + }, + { + "epoch": 2.8886900196028296, + "grad_norm": 26.99609214735561, + "learning_rate": 4.194110860991973e-08, + "loss": 0.7291, + "step": 33893 + }, + { + "epoch": 2.888775249296855, + "grad_norm": 69.13070441469635, + "learning_rate": 4.187704366392153e-08, + "loss": 1.4885, + "step": 33894 + }, + { + "epoch": 2.8888604789908805, + "grad_norm": 56.372587944624655, + "learning_rate": 4.181302747909233e-08, + "loss": 1.4779, + "step": 33895 + }, + { + "epoch": 2.888945708684906, + "grad_norm": 43.46615103120331, + "learning_rate": 4.174906005606161e-08, + "loss": 1.4768, + "step": 33896 + }, + { + "epoch": 2.889030938378931, + "grad_norm": 53.30854472024312, + "learning_rate": 4.168514139545887e-08, + "loss": 1.1154, + "step": 33897 + }, + { + "epoch": 2.8891161680729565, + "grad_norm": 24.08957463280911, + "learning_rate": 4.1621271497912505e-08, + "loss": 0.9019, + "step": 33898 + }, + { + "epoch": 2.889201397766982, + "grad_norm": 72.33244924027198, + "learning_rate": 4.1557450364050344e-08, + "loss": 2.1616, + "step": 33899 + }, + { + "epoch": 2.8892866274610074, + "grad_norm": 66.475985148468, + "learning_rate": 4.149367799450077e-08, + "loss": 1.5534, + "step": 33900 + }, + { + "epoch": 2.889371857155033, + "grad_norm": 77.71415754007234, + "learning_rate": 4.14299543898905e-08, + "loss": 1.8267, + "step": 33901 + }, + { + "epoch": 2.8894570868490583, + "grad_norm": 40.020604979554236, + "learning_rate": 4.136627955084571e-08, + "loss": 0.982, + "step": 33902 + }, + { + "epoch": 2.889542316543084, + "grad_norm": 31.348367209139152, + "learning_rate": 4.130265347799367e-08, + "loss": 1.0692, + "step": 33903 + }, + { + "epoch": 2.889627546237109, + "grad_norm": 51.93197026885736, + "learning_rate": 4.123907617195999e-08, + "loss": 1.6393, + "step": 33904 + }, + { + "epoch": 2.8897127759311343, + "grad_norm": 32.87649010626801, + "learning_rate": 4.117554763336917e-08, + "loss": 0.8498, + "step": 33905 + }, + { + "epoch": 2.8897980056251598, + "grad_norm": 31.283378667190398, + "learning_rate": 4.1112067862846274e-08, + "loss": 0.8707, + "step": 33906 + }, + { + "epoch": 2.8898832353191852, + "grad_norm": 24.325021193200765, + "learning_rate": 4.104863686101579e-08, + "loss": 0.5159, + "step": 33907 + }, + { + "epoch": 2.8899684650132107, + "grad_norm": 18.515521780998466, + "learning_rate": 4.098525462850167e-08, + "loss": 0.3874, + "step": 33908 + }, + { + "epoch": 2.890053694707236, + "grad_norm": 79.44196843661638, + "learning_rate": 4.09219211659273e-08, + "loss": 1.4049, + "step": 33909 + }, + { + "epoch": 2.8901389244012616, + "grad_norm": 41.5005544334921, + "learning_rate": 4.0858636473914416e-08, + "loss": 1.1057, + "step": 33910 + }, + { + "epoch": 2.8902241540952867, + "grad_norm": 40.92550275311963, + "learning_rate": 4.079540055308695e-08, + "loss": 1.0612, + "step": 33911 + }, + { + "epoch": 2.890309383789312, + "grad_norm": 31.0412473232615, + "learning_rate": 4.073221340406608e-08, + "loss": 0.6448, + "step": 33912 + }, + { + "epoch": 2.8903946134833376, + "grad_norm": 39.50482843901557, + "learning_rate": 4.0669075027473526e-08, + "loss": 0.9903, + "step": 33913 + }, + { + "epoch": 2.890479843177363, + "grad_norm": 72.84106389917201, + "learning_rate": 4.060598542392935e-08, + "loss": 2.0118, + "step": 33914 + }, + { + "epoch": 2.8905650728713885, + "grad_norm": 62.18792737411806, + "learning_rate": 4.054294459405472e-08, + "loss": 1.256, + "step": 33915 + }, + { + "epoch": 2.8906503025654136, + "grad_norm": 51.868511309363775, + "learning_rate": 4.0479952538469704e-08, + "loss": 1.103, + "step": 33916 + }, + { + "epoch": 2.890735532259439, + "grad_norm": 69.43714437751233, + "learning_rate": 4.041700925779379e-08, + "loss": 1.968, + "step": 33917 + }, + { + "epoch": 2.8908207619534645, + "grad_norm": 26.27310912769231, + "learning_rate": 4.0354114752645944e-08, + "loss": 1.0215, + "step": 33918 + }, + { + "epoch": 2.89090599164749, + "grad_norm": 40.0220487808511, + "learning_rate": 4.0291269023643995e-08, + "loss": 0.9075, + "step": 33919 + }, + { + "epoch": 2.8909912213415154, + "grad_norm": 30.201247270139916, + "learning_rate": 4.022847207140745e-08, + "loss": 1.1547, + "step": 33920 + }, + { + "epoch": 2.891076451035541, + "grad_norm": 50.54929778651782, + "learning_rate": 4.016572389655249e-08, + "loss": 1.3243, + "step": 33921 + }, + { + "epoch": 2.8911616807295664, + "grad_norm": 63.29682159885728, + "learning_rate": 4.010302449969694e-08, + "loss": 1.446, + "step": 33922 + }, + { + "epoch": 2.8912469104235914, + "grad_norm": 81.93286057077765, + "learning_rate": 4.0040373881456985e-08, + "loss": 1.0512, + "step": 33923 + }, + { + "epoch": 2.891332140117617, + "grad_norm": 42.01798983225463, + "learning_rate": 3.997777204244935e-08, + "loss": 0.9173, + "step": 33924 + }, + { + "epoch": 2.8914173698116423, + "grad_norm": 31.864864307744725, + "learning_rate": 3.991521898328965e-08, + "loss": 0.7823, + "step": 33925 + }, + { + "epoch": 2.891502599505668, + "grad_norm": 60.33019642980827, + "learning_rate": 3.98527147045924e-08, + "loss": 1.1143, + "step": 33926 + }, + { + "epoch": 2.8915878291996933, + "grad_norm": 62.96157353684518, + "learning_rate": 3.979025920697266e-08, + "loss": 1.188, + "step": 33927 + }, + { + "epoch": 2.8916730588937187, + "grad_norm": 69.2813768949409, + "learning_rate": 3.9727852491045496e-08, + "loss": 1.4779, + "step": 33928 + }, + { + "epoch": 2.891758288587744, + "grad_norm": 28.637255356560374, + "learning_rate": 3.966549455742319e-08, + "loss": 0.6428, + "step": 33929 + }, + { + "epoch": 2.891843518281769, + "grad_norm": 90.82588428150079, + "learning_rate": 3.96031854067197e-08, + "loss": 1.6813, + "step": 33930 + }, + { + "epoch": 2.8919287479757947, + "grad_norm": 79.42412899144581, + "learning_rate": 3.954092503954787e-08, + "loss": 1.4376, + "step": 33931 + }, + { + "epoch": 2.89201397766982, + "grad_norm": 46.597985246542, + "learning_rate": 3.947871345651999e-08, + "loss": 1.2274, + "step": 33932 + }, + { + "epoch": 2.8920992073638456, + "grad_norm": 86.8602062158919, + "learning_rate": 3.9416550658248345e-08, + "loss": 1.9141, + "step": 33933 + }, + { + "epoch": 2.892184437057871, + "grad_norm": 29.03224687872201, + "learning_rate": 3.9354436645343e-08, + "loss": 0.9657, + "step": 33934 + }, + { + "epoch": 2.892269666751896, + "grad_norm": 50.063665081263, + "learning_rate": 3.9292371418416245e-08, + "loss": 1.5071, + "step": 33935 + }, + { + "epoch": 2.892354896445922, + "grad_norm": 43.380701574444075, + "learning_rate": 3.923035497807759e-08, + "loss": 1.3891, + "step": 33936 + }, + { + "epoch": 2.892440126139947, + "grad_norm": 49.733731507042116, + "learning_rate": 3.916838732493711e-08, + "loss": 1.6298, + "step": 33937 + }, + { + "epoch": 2.8925253558339725, + "grad_norm": 81.99319260542451, + "learning_rate": 3.910646845960375e-08, + "loss": 1.2909, + "step": 33938 + }, + { + "epoch": 2.892610585527998, + "grad_norm": 27.367208448147082, + "learning_rate": 3.904459838268815e-08, + "loss": 0.9689, + "step": 33939 + }, + { + "epoch": 2.8926958152220235, + "grad_norm": 55.80146099948256, + "learning_rate": 3.898277709479648e-08, + "loss": 1.2503, + "step": 33940 + }, + { + "epoch": 2.892781044916049, + "grad_norm": 41.646997857658185, + "learning_rate": 3.8921004596538805e-08, + "loss": 1.4673, + "step": 33941 + }, + { + "epoch": 2.892866274610074, + "grad_norm": 54.90344123586284, + "learning_rate": 3.885928088852131e-08, + "loss": 1.599, + "step": 33942 + }, + { + "epoch": 2.8929515043040994, + "grad_norm": 36.48995602767633, + "learning_rate": 3.879760597135185e-08, + "loss": 1.2239, + "step": 33943 + }, + { + "epoch": 2.893036733998125, + "grad_norm": 48.823967714319146, + "learning_rate": 3.873597984563604e-08, + "loss": 0.9619, + "step": 33944 + }, + { + "epoch": 2.8931219636921504, + "grad_norm": 64.04285006954747, + "learning_rate": 3.867440251198062e-08, + "loss": 2.0136, + "step": 33945 + }, + { + "epoch": 2.893207193386176, + "grad_norm": 80.40880268649532, + "learning_rate": 3.861287397099067e-08, + "loss": 2.2401, + "step": 33946 + }, + { + "epoch": 2.8932924230802013, + "grad_norm": 36.537279136741084, + "learning_rate": 3.855139422327181e-08, + "loss": 1.2942, + "step": 33947 + }, + { + "epoch": 2.8933776527742268, + "grad_norm": 58.929609028326766, + "learning_rate": 3.848996326942911e-08, + "loss": 1.4418, + "step": 33948 + }, + { + "epoch": 2.893462882468252, + "grad_norm": 65.55182339758176, + "learning_rate": 3.842858111006542e-08, + "loss": 1.741, + "step": 33949 + }, + { + "epoch": 2.8935481121622773, + "grad_norm": 65.84820967238677, + "learning_rate": 3.8367247745785264e-08, + "loss": 1.3518, + "step": 33950 + }, + { + "epoch": 2.8936333418563027, + "grad_norm": 75.42090288381395, + "learning_rate": 3.8305963177192044e-08, + "loss": 1.8999, + "step": 33951 + }, + { + "epoch": 2.893718571550328, + "grad_norm": 45.07522490586904, + "learning_rate": 3.824472740488749e-08, + "loss": 1.1971, + "step": 33952 + }, + { + "epoch": 2.8938038012443537, + "grad_norm": 43.22430161441904, + "learning_rate": 3.8183540429474476e-08, + "loss": 1.0298, + "step": 33953 + }, + { + "epoch": 2.8938890309383787, + "grad_norm": 53.0898208898965, + "learning_rate": 3.812240225155472e-08, + "loss": 1.7201, + "step": 33954 + }, + { + "epoch": 2.8939742606324046, + "grad_norm": 41.38504019536929, + "learning_rate": 3.8061312871729986e-08, + "loss": 1.1845, + "step": 33955 + }, + { + "epoch": 2.8940594903264296, + "grad_norm": 32.883144557909056, + "learning_rate": 3.8000272290600325e-08, + "loss": 1.1979, + "step": 33956 + }, + { + "epoch": 2.894144720020455, + "grad_norm": 43.8780085046792, + "learning_rate": 3.7939280508766386e-08, + "loss": 1.0679, + "step": 33957 + }, + { + "epoch": 2.8942299497144806, + "grad_norm": 32.81600367052898, + "learning_rate": 3.787833752682768e-08, + "loss": 0.6424, + "step": 33958 + }, + { + "epoch": 2.894315179408506, + "grad_norm": 57.72002773713933, + "learning_rate": 3.781744334538373e-08, + "loss": 1.0239, + "step": 33959 + }, + { + "epoch": 2.8944004091025315, + "grad_norm": 24.792223937252245, + "learning_rate": 3.775659796503406e-08, + "loss": 0.7916, + "step": 33960 + }, + { + "epoch": 2.8944856387965565, + "grad_norm": 35.62225282923631, + "learning_rate": 3.769580138637541e-08, + "loss": 0.8678, + "step": 33961 + }, + { + "epoch": 2.894570868490582, + "grad_norm": 28.599109848786618, + "learning_rate": 3.76350536100073e-08, + "loss": 0.9347, + "step": 33962 + }, + { + "epoch": 2.8946560981846075, + "grad_norm": 31.892132048945076, + "learning_rate": 3.7574354636526476e-08, + "loss": 0.7507, + "step": 33963 + }, + { + "epoch": 2.894741327878633, + "grad_norm": 62.377940238859296, + "learning_rate": 3.751370446653024e-08, + "loss": 1.2465, + "step": 33964 + }, + { + "epoch": 2.8948265575726584, + "grad_norm": 73.79548546785345, + "learning_rate": 3.745310310061478e-08, + "loss": 2.0438, + "step": 33965 + }, + { + "epoch": 2.894911787266684, + "grad_norm": 37.309982676336666, + "learning_rate": 3.7392550539375736e-08, + "loss": 0.8047, + "step": 33966 + }, + { + "epoch": 2.8949970169607093, + "grad_norm": 59.85551053221176, + "learning_rate": 3.733204678340929e-08, + "loss": 1.9882, + "step": 33967 + }, + { + "epoch": 2.8950822466547343, + "grad_norm": 59.00418554602897, + "learning_rate": 3.727159183331053e-08, + "loss": 1.0498, + "step": 33968 + }, + { + "epoch": 2.89516747634876, + "grad_norm": 75.68352733145086, + "learning_rate": 3.721118568967286e-08, + "loss": 2.0621, + "step": 33969 + }, + { + "epoch": 2.8952527060427853, + "grad_norm": 33.21404589880979, + "learning_rate": 3.715082835309136e-08, + "loss": 0.8738, + "step": 33970 + }, + { + "epoch": 2.8953379357368108, + "grad_norm": 72.88438743180454, + "learning_rate": 3.709051982416001e-08, + "loss": 1.6009, + "step": 33971 + }, + { + "epoch": 2.895423165430836, + "grad_norm": 49.540135474252146, + "learning_rate": 3.703026010347111e-08, + "loss": 1.3626, + "step": 33972 + }, + { + "epoch": 2.8955083951248612, + "grad_norm": 75.19553882350209, + "learning_rate": 3.6970049191617506e-08, + "loss": 2.2418, + "step": 33973 + }, + { + "epoch": 2.895593624818887, + "grad_norm": 74.25703998004136, + "learning_rate": 3.690988708919152e-08, + "loss": 1.8969, + "step": 33974 + }, + { + "epoch": 2.895678854512912, + "grad_norm": 44.178073082649824, + "learning_rate": 3.684977379678378e-08, + "loss": 1.4622, + "step": 33975 + }, + { + "epoch": 2.8957640842069376, + "grad_norm": 31.056248557239154, + "learning_rate": 3.678970931498715e-08, + "loss": 1.4157, + "step": 33976 + }, + { + "epoch": 2.895849313900963, + "grad_norm": 29.087638344809914, + "learning_rate": 3.6729693644391164e-08, + "loss": 1.2079, + "step": 33977 + }, + { + "epoch": 2.8959345435949886, + "grad_norm": 63.55248315786169, + "learning_rate": 3.6669726785587e-08, + "loss": 1.7366, + "step": 33978 + }, + { + "epoch": 2.896019773289014, + "grad_norm": 97.79694130497688, + "learning_rate": 3.660980873916309e-08, + "loss": 1.9566, + "step": 33979 + }, + { + "epoch": 2.896105002983039, + "grad_norm": 47.70224300915414, + "learning_rate": 3.6549939505710066e-08, + "loss": 1.8655, + "step": 33980 + }, + { + "epoch": 2.8961902326770645, + "grad_norm": 56.14726291000746, + "learning_rate": 3.649011908581579e-08, + "loss": 1.4762, + "step": 33981 + }, + { + "epoch": 2.89627546237109, + "grad_norm": 42.01868140013494, + "learning_rate": 3.643034748006924e-08, + "loss": 1.0099, + "step": 33982 + }, + { + "epoch": 2.8963606920651155, + "grad_norm": 29.408081709777793, + "learning_rate": 3.637062468905717e-08, + "loss": 0.9711, + "step": 33983 + }, + { + "epoch": 2.896445921759141, + "grad_norm": 81.49313600637844, + "learning_rate": 3.6310950713368545e-08, + "loss": 2.1283, + "step": 33984 + }, + { + "epoch": 2.8965311514531664, + "grad_norm": 21.591169547051493, + "learning_rate": 3.6251325553588454e-08, + "loss": 0.865, + "step": 33985 + }, + { + "epoch": 2.896616381147192, + "grad_norm": 29.04115718582102, + "learning_rate": 3.6191749210304775e-08, + "loss": 0.5738, + "step": 33986 + }, + { + "epoch": 2.896701610841217, + "grad_norm": 52.62324982307762, + "learning_rate": 3.6132221684102576e-08, + "loss": 1.3921, + "step": 33987 + }, + { + "epoch": 2.8967868405352424, + "grad_norm": 69.17070306416419, + "learning_rate": 3.607274297556751e-08, + "loss": 1.4706, + "step": 33988 + }, + { + "epoch": 2.896872070229268, + "grad_norm": 47.649140454024504, + "learning_rate": 3.601331308528466e-08, + "loss": 1.1162, + "step": 33989 + }, + { + "epoch": 2.8969572999232933, + "grad_norm": 64.29986182818764, + "learning_rate": 3.595393201383856e-08, + "loss": 1.664, + "step": 33990 + }, + { + "epoch": 2.897042529617319, + "grad_norm": 42.2138866180723, + "learning_rate": 3.58945997618132e-08, + "loss": 1.1438, + "step": 33991 + }, + { + "epoch": 2.897127759311344, + "grad_norm": 61.016337467092356, + "learning_rate": 3.583531632979087e-08, + "loss": 1.4119, + "step": 33992 + }, + { + "epoch": 2.8972129890053697, + "grad_norm": 30.419894483411777, + "learning_rate": 3.5776081718356114e-08, + "loss": 0.9865, + "step": 33993 + }, + { + "epoch": 2.8972982186993947, + "grad_norm": 58.40789100634594, + "learning_rate": 3.57168959280918e-08, + "loss": 1.1075, + "step": 33994 + }, + { + "epoch": 2.89738344839342, + "grad_norm": 34.775095384171756, + "learning_rate": 3.565775895957857e-08, + "loss": 1.1142, + "step": 33995 + }, + { + "epoch": 2.8974686780874457, + "grad_norm": 64.28200887008016, + "learning_rate": 3.559867081339818e-08, + "loss": 1.9349, + "step": 33996 + }, + { + "epoch": 2.897553907781471, + "grad_norm": 80.09048450773903, + "learning_rate": 3.553963149013295e-08, + "loss": 1.9119, + "step": 33997 + }, + { + "epoch": 2.8976391374754966, + "grad_norm": 27.72930093344673, + "learning_rate": 3.548064099036241e-08, + "loss": 0.8138, + "step": 33998 + }, + { + "epoch": 2.8977243671695216, + "grad_norm": 58.94114412779499, + "learning_rate": 3.542169931466721e-08, + "loss": 2.0373, + "step": 33999 + }, + { + "epoch": 2.897809596863547, + "grad_norm": 30.844785239156106, + "learning_rate": 3.5362806463626886e-08, + "loss": 0.7721, + "step": 34000 + }, + { + "epoch": 2.8978948265575726, + "grad_norm": 79.94570939359376, + "learning_rate": 3.5303962437820416e-08, + "loss": 2.2787, + "step": 34001 + }, + { + "epoch": 2.897980056251598, + "grad_norm": 60.14768399411513, + "learning_rate": 3.524516723782678e-08, + "loss": 1.3532, + "step": 34002 + }, + { + "epoch": 2.8980652859456235, + "grad_norm": 25.226492112877356, + "learning_rate": 3.518642086422441e-08, + "loss": 0.8727, + "step": 34003 + }, + { + "epoch": 2.898150515639649, + "grad_norm": 32.65796666368808, + "learning_rate": 3.5127723317590626e-08, + "loss": 0.9796, + "step": 34004 + }, + { + "epoch": 2.8982357453336745, + "grad_norm": 65.30406155607605, + "learning_rate": 3.5069074598502726e-08, + "loss": 1.5043, + "step": 34005 + }, + { + "epoch": 2.8983209750276995, + "grad_norm": 52.176205936147674, + "learning_rate": 3.501047470753749e-08, + "loss": 1.4104, + "step": 34006 + }, + { + "epoch": 2.898406204721725, + "grad_norm": 56.63781383721403, + "learning_rate": 3.495192364527167e-08, + "loss": 1.6827, + "step": 34007 + }, + { + "epoch": 2.8984914344157504, + "grad_norm": 77.6487912684529, + "learning_rate": 3.489342141228036e-08, + "loss": 1.9784, + "step": 34008 + }, + { + "epoch": 2.898576664109776, + "grad_norm": 26.650315875701665, + "learning_rate": 3.483496800913921e-08, + "loss": 0.9307, + "step": 34009 + }, + { + "epoch": 2.8986618938038013, + "grad_norm": 35.69185072014904, + "learning_rate": 3.477656343642333e-08, + "loss": 1.3124, + "step": 34010 + }, + { + "epoch": 2.8987471234978264, + "grad_norm": 27.008123494635928, + "learning_rate": 3.471820769470724e-08, + "loss": 0.9782, + "step": 34011 + }, + { + "epoch": 2.8988323531918523, + "grad_norm": 73.26287895579911, + "learning_rate": 3.465990078456494e-08, + "loss": 2.4274, + "step": 34012 + }, + { + "epoch": 2.8989175828858773, + "grad_norm": 56.71916841343347, + "learning_rate": 3.460164270656874e-08, + "loss": 1.2, + "step": 34013 + }, + { + "epoch": 2.8990028125799028, + "grad_norm": 59.65842595837226, + "learning_rate": 3.454343346129263e-08, + "loss": 1.0472, + "step": 34014 + }, + { + "epoch": 2.8990880422739282, + "grad_norm": 56.39984701926929, + "learning_rate": 3.448527304930893e-08, + "loss": 0.9812, + "step": 34015 + }, + { + "epoch": 2.8991732719679537, + "grad_norm": 32.54705164088666, + "learning_rate": 3.4427161471188854e-08, + "loss": 1.1548, + "step": 34016 + }, + { + "epoch": 2.899258501661979, + "grad_norm": 29.48083902242935, + "learning_rate": 3.436909872750527e-08, + "loss": 0.8101, + "step": 34017 + }, + { + "epoch": 2.899343731356004, + "grad_norm": 56.399610880821314, + "learning_rate": 3.4311084818827725e-08, + "loss": 1.5712, + "step": 34018 + }, + { + "epoch": 2.8994289610500297, + "grad_norm": 22.956777806706963, + "learning_rate": 3.425311974572798e-08, + "loss": 0.7938, + "step": 34019 + }, + { + "epoch": 2.899514190744055, + "grad_norm": 38.79713268202957, + "learning_rate": 3.4195203508775585e-08, + "loss": 1.32, + "step": 34020 + }, + { + "epoch": 2.8995994204380806, + "grad_norm": 37.4461892641206, + "learning_rate": 3.4137336108540083e-08, + "loss": 1.1849, + "step": 34021 + }, + { + "epoch": 2.899684650132106, + "grad_norm": 17.9611536155313, + "learning_rate": 3.407951754559047e-08, + "loss": 0.773, + "step": 34022 + }, + { + "epoch": 2.8997698798261315, + "grad_norm": 23.58898275705653, + "learning_rate": 3.402174782049572e-08, + "loss": 0.7473, + "step": 34023 + }, + { + "epoch": 2.899855109520157, + "grad_norm": 38.450410210414574, + "learning_rate": 3.396402693382372e-08, + "loss": 1.023, + "step": 34024 + }, + { + "epoch": 2.899940339214182, + "grad_norm": 57.775486156900186, + "learning_rate": 3.390635488614236e-08, + "loss": 1.9233, + "step": 34025 + }, + { + "epoch": 2.9000255689082075, + "grad_norm": 47.706407621026656, + "learning_rate": 3.3848731678018386e-08, + "loss": 1.1933, + "step": 34026 + }, + { + "epoch": 2.900110798602233, + "grad_norm": 57.6255797366932, + "learning_rate": 3.379115731001914e-08, + "loss": 1.4064, + "step": 34027 + }, + { + "epoch": 2.9001960282962584, + "grad_norm": 31.25451404175916, + "learning_rate": 3.373363178271027e-08, + "loss": 0.8593, + "step": 34028 + }, + { + "epoch": 2.900281257990284, + "grad_norm": 58.96949274043754, + "learning_rate": 3.3676155096658e-08, + "loss": 1.4454, + "step": 34029 + }, + { + "epoch": 2.900366487684309, + "grad_norm": 55.34949053062831, + "learning_rate": 3.3618727252426877e-08, + "loss": 1.955, + "step": 34030 + }, + { + "epoch": 2.900451717378335, + "grad_norm": 72.83192992492651, + "learning_rate": 3.3561348250581996e-08, + "loss": 1.7467, + "step": 34031 + }, + { + "epoch": 2.90053694707236, + "grad_norm": 76.02850314313153, + "learning_rate": 3.350401809168791e-08, + "loss": 2.2849, + "step": 34032 + }, + { + "epoch": 2.9006221767663853, + "grad_norm": 61.62196032780358, + "learning_rate": 3.344673677630861e-08, + "loss": 1.1996, + "step": 34033 + }, + { + "epoch": 2.900707406460411, + "grad_norm": 41.655457047063926, + "learning_rate": 3.3389504305007535e-08, + "loss": 1.4054, + "step": 34034 + }, + { + "epoch": 2.9007926361544363, + "grad_norm": 67.64179574170868, + "learning_rate": 3.333232067834646e-08, + "loss": 1.4539, + "step": 34035 + }, + { + "epoch": 2.9008778658484617, + "grad_norm": 73.80865288293059, + "learning_rate": 3.327518589688883e-08, + "loss": 1.6978, + "step": 34036 + }, + { + "epoch": 2.9009630955424868, + "grad_norm": 32.70556104803279, + "learning_rate": 3.321809996119585e-08, + "loss": 0.9829, + "step": 34037 + }, + { + "epoch": 2.9010483252365122, + "grad_norm": 69.95633015476282, + "learning_rate": 3.3161062871829296e-08, + "loss": 2.0273, + "step": 34038 + }, + { + "epoch": 2.9011335549305377, + "grad_norm": 78.05038454907847, + "learning_rate": 3.310407462935039e-08, + "loss": 1.5748, + "step": 34039 + }, + { + "epoch": 2.901218784624563, + "grad_norm": 41.384794539972965, + "learning_rate": 3.3047135234318686e-08, + "loss": 0.756, + "step": 34040 + }, + { + "epoch": 2.9013040143185886, + "grad_norm": 46.68229436820128, + "learning_rate": 3.299024468729484e-08, + "loss": 1.2085, + "step": 34041 + }, + { + "epoch": 2.901389244012614, + "grad_norm": 65.54671907695176, + "learning_rate": 3.293340298883896e-08, + "loss": 2.0318, + "step": 34042 + }, + { + "epoch": 2.9014744737066396, + "grad_norm": 42.19785364032697, + "learning_rate": 3.287661013950838e-08, + "loss": 1.2085, + "step": 34043 + }, + { + "epoch": 2.9015597034006646, + "grad_norm": 45.59204463024498, + "learning_rate": 3.281986613986321e-08, + "loss": 1.459, + "step": 34044 + }, + { + "epoch": 2.90164493309469, + "grad_norm": 63.687178029961196, + "learning_rate": 3.276317099046078e-08, + "loss": 1.3061, + "step": 34045 + }, + { + "epoch": 2.9017301627887155, + "grad_norm": 23.69907038802934, + "learning_rate": 3.2706524691858976e-08, + "loss": 0.7988, + "step": 34046 + }, + { + "epoch": 2.901815392482741, + "grad_norm": 46.89503168142665, + "learning_rate": 3.2649927244614576e-08, + "loss": 1.0133, + "step": 34047 + }, + { + "epoch": 2.9019006221767665, + "grad_norm": 32.643121249984645, + "learning_rate": 3.25933786492838e-08, + "loss": 1.0636, + "step": 34048 + }, + { + "epoch": 2.901985851870792, + "grad_norm": 68.95679482404128, + "learning_rate": 3.253687890642398e-08, + "loss": 1.9496, + "step": 34049 + }, + { + "epoch": 2.9020710815648174, + "grad_norm": 28.36433153019953, + "learning_rate": 3.2480428016589684e-08, + "loss": 0.8748, + "step": 34050 + }, + { + "epoch": 2.9021563112588424, + "grad_norm": 17.71406906431088, + "learning_rate": 3.242402598033656e-08, + "loss": 0.4293, + "step": 34051 + }, + { + "epoch": 2.902241540952868, + "grad_norm": 93.28236537609307, + "learning_rate": 3.236767279821918e-08, + "loss": 2.3926, + "step": 34052 + }, + { + "epoch": 2.9023267706468934, + "grad_norm": 48.45094297744217, + "learning_rate": 3.2311368470792084e-08, + "loss": 1.7009, + "step": 34053 + }, + { + "epoch": 2.902412000340919, + "grad_norm": 61.590649036407754, + "learning_rate": 3.225511299860873e-08, + "loss": 2.128, + "step": 34054 + }, + { + "epoch": 2.9024972300349443, + "grad_norm": 70.05081727418053, + "learning_rate": 3.219890638222145e-08, + "loss": 1.6961, + "step": 34055 + }, + { + "epoch": 2.9025824597289693, + "grad_norm": 34.35029423898341, + "learning_rate": 3.2142748622184807e-08, + "loss": 1.3833, + "step": 34056 + }, + { + "epoch": 2.9026676894229952, + "grad_norm": 72.32712968616963, + "learning_rate": 3.2086639719050017e-08, + "loss": 1.3533, + "step": 34057 + }, + { + "epoch": 2.9027529191170203, + "grad_norm": 42.78246929575709, + "learning_rate": 3.2030579673368865e-08, + "loss": 1.2667, + "step": 34058 + }, + { + "epoch": 2.9028381488110457, + "grad_norm": 24.234032329559753, + "learning_rate": 3.1974568485693134e-08, + "loss": 0.7296, + "step": 34059 + }, + { + "epoch": 2.902923378505071, + "grad_norm": 39.506080550455415, + "learning_rate": 3.1918606156573494e-08, + "loss": 1.0483, + "step": 34060 + }, + { + "epoch": 2.9030086081990967, + "grad_norm": 66.55310454064042, + "learning_rate": 3.186269268656006e-08, + "loss": 1.4902, + "step": 34061 + }, + { + "epoch": 2.903093837893122, + "grad_norm": 53.53304373158625, + "learning_rate": 3.180682807620294e-08, + "loss": 1.1973, + "step": 34062 + }, + { + "epoch": 2.903179067587147, + "grad_norm": 39.3000694250294, + "learning_rate": 3.175101232605115e-08, + "loss": 1.2157, + "step": 34063 + }, + { + "epoch": 2.9032642972811726, + "grad_norm": 35.86620030734252, + "learning_rate": 3.1695245436654255e-08, + "loss": 1.0822, + "step": 34064 + }, + { + "epoch": 2.903349526975198, + "grad_norm": 67.95541652311377, + "learning_rate": 3.1639527408560686e-08, + "loss": 1.4694, + "step": 34065 + }, + { + "epoch": 2.9034347566692236, + "grad_norm": 39.57337998169139, + "learning_rate": 3.15838582423178e-08, + "loss": 1.1438, + "step": 34066 + }, + { + "epoch": 2.903519986363249, + "grad_norm": 54.27179656938609, + "learning_rate": 3.1528237938473485e-08, + "loss": 1.9667, + "step": 34067 + }, + { + "epoch": 2.9036052160572745, + "grad_norm": 49.006115389016614, + "learning_rate": 3.147266649757508e-08, + "loss": 1.7719, + "step": 34068 + }, + { + "epoch": 2.9036904457513, + "grad_norm": 53.54688618390282, + "learning_rate": 3.14171439201677e-08, + "loss": 1.1493, + "step": 34069 + }, + { + "epoch": 2.903775675445325, + "grad_norm": 70.1345705746063, + "learning_rate": 3.1361670206799256e-08, + "loss": 1.6933, + "step": 34070 + }, + { + "epoch": 2.9038609051393505, + "grad_norm": 35.11342578447405, + "learning_rate": 3.1306245358014295e-08, + "loss": 1.2597, + "step": 34071 + }, + { + "epoch": 2.903946134833376, + "grad_norm": 51.80327177665731, + "learning_rate": 3.125086937435795e-08, + "loss": 1.2388, + "step": 34072 + }, + { + "epoch": 2.9040313645274014, + "grad_norm": 61.98787273681186, + "learning_rate": 3.119554225637478e-08, + "loss": 1.325, + "step": 34073 + }, + { + "epoch": 2.904116594221427, + "grad_norm": 98.3215196750334, + "learning_rate": 3.1140264004609346e-08, + "loss": 1.5739, + "step": 34074 + }, + { + "epoch": 2.904201823915452, + "grad_norm": 39.46891414973708, + "learning_rate": 3.108503461960455e-08, + "loss": 1.4546, + "step": 34075 + }, + { + "epoch": 2.904287053609478, + "grad_norm": 65.81902462717396, + "learning_rate": 3.10298541019044e-08, + "loss": 1.4593, + "step": 34076 + }, + { + "epoch": 2.904372283303503, + "grad_norm": 75.73474589219185, + "learning_rate": 3.097472245205124e-08, + "loss": 1.9157, + "step": 34077 + }, + { + "epoch": 2.9044575129975283, + "grad_norm": 42.50253874953505, + "learning_rate": 3.091963967058687e-08, + "loss": 1.239, + "step": 34078 + }, + { + "epoch": 2.9045427426915538, + "grad_norm": 54.285281535298914, + "learning_rate": 3.086460575805306e-08, + "loss": 0.9771, + "step": 34079 + }, + { + "epoch": 2.9046279723855792, + "grad_norm": 23.695960903505046, + "learning_rate": 3.080962071499161e-08, + "loss": 0.9544, + "step": 34080 + }, + { + "epoch": 2.9047132020796047, + "grad_norm": 46.316492790762645, + "learning_rate": 3.075468454194319e-08, + "loss": 0.7601, + "step": 34081 + }, + { + "epoch": 2.9047984317736297, + "grad_norm": 96.49229915890227, + "learning_rate": 3.069979723944738e-08, + "loss": 2.5809, + "step": 34082 + }, + { + "epoch": 2.904883661467655, + "grad_norm": 50.16518391234142, + "learning_rate": 3.0644958808044855e-08, + "loss": 1.2151, + "step": 34083 + }, + { + "epoch": 2.9049688911616807, + "grad_norm": 39.70672510243964, + "learning_rate": 3.0590169248274623e-08, + "loss": 1.0568, + "step": 34084 + }, + { + "epoch": 2.905054120855706, + "grad_norm": 49.06430533615679, + "learning_rate": 3.0535428560675704e-08, + "loss": 0.8022, + "step": 34085 + }, + { + "epoch": 2.9051393505497316, + "grad_norm": 49.02633519048249, + "learning_rate": 3.0480736745785447e-08, + "loss": 0.9664, + "step": 34086 + }, + { + "epoch": 2.905224580243757, + "grad_norm": 61.549987045276445, + "learning_rate": 3.04260938041423e-08, + "loss": 1.7669, + "step": 34087 + }, + { + "epoch": 2.9053098099377825, + "grad_norm": 44.892611932414866, + "learning_rate": 3.0371499736284724e-08, + "loss": 1.6444, + "step": 34088 + }, + { + "epoch": 2.9053950396318076, + "grad_norm": 74.42210987066284, + "learning_rate": 3.0316954542747853e-08, + "loss": 1.7144, + "step": 34089 + }, + { + "epoch": 2.905480269325833, + "grad_norm": 24.977982931788233, + "learning_rate": 3.026245822406959e-08, + "loss": 0.6194, + "step": 34090 + }, + { + "epoch": 2.9055654990198585, + "grad_norm": 71.62558769128492, + "learning_rate": 3.0208010780785056e-08, + "loss": 1.9287, + "step": 34091 + }, + { + "epoch": 2.905650728713884, + "grad_norm": 69.00323514957427, + "learning_rate": 3.015361221342994e-08, + "loss": 1.6559, + "step": 34092 + }, + { + "epoch": 2.9057359584079094, + "grad_norm": 48.931387774441426, + "learning_rate": 3.0099262522539364e-08, + "loss": 1.4017, + "step": 34093 + }, + { + "epoch": 2.9058211881019345, + "grad_norm": 31.95914565575666, + "learning_rate": 3.0044961708647344e-08, + "loss": 0.9575, + "step": 34094 + }, + { + "epoch": 2.9059064177959604, + "grad_norm": 20.91306939158375, + "learning_rate": 2.9990709772287905e-08, + "loss": 0.6528, + "step": 34095 + }, + { + "epoch": 2.9059916474899854, + "grad_norm": 25.68233982558696, + "learning_rate": 2.9936506713995617e-08, + "loss": 0.8378, + "step": 34096 + }, + { + "epoch": 2.906076877184011, + "grad_norm": 60.58307882977761, + "learning_rate": 2.9882352534302827e-08, + "loss": 1.2862, + "step": 34097 + }, + { + "epoch": 2.9061621068780363, + "grad_norm": 23.37209513692051, + "learning_rate": 2.9828247233742444e-08, + "loss": 1.0056, + "step": 34098 + }, + { + "epoch": 2.906247336572062, + "grad_norm": 56.579687182307595, + "learning_rate": 2.9774190812845715e-08, + "loss": 1.4075, + "step": 34099 + }, + { + "epoch": 2.9063325662660873, + "grad_norm": 50.363251495404164, + "learning_rate": 2.9720183272144985e-08, + "loss": 1.2257, + "step": 34100 + }, + { + "epoch": 2.9064177959601123, + "grad_norm": 49.70144188251021, + "learning_rate": 2.96662246121715e-08, + "loss": 1.311, + "step": 34101 + }, + { + "epoch": 2.9065030256541378, + "grad_norm": 69.41436801012864, + "learning_rate": 2.961231483345539e-08, + "loss": 1.7185, + "step": 34102 + }, + { + "epoch": 2.9065882553481632, + "grad_norm": 29.554573076006196, + "learning_rate": 2.9558453936527343e-08, + "loss": 0.8812, + "step": 34103 + }, + { + "epoch": 2.9066734850421887, + "grad_norm": 59.933934995705414, + "learning_rate": 2.9504641921916377e-08, + "loss": 0.9284, + "step": 34104 + }, + { + "epoch": 2.906758714736214, + "grad_norm": 83.45380274946832, + "learning_rate": 2.9450878790152624e-08, + "loss": 2.1817, + "step": 34105 + }, + { + "epoch": 2.9068439444302396, + "grad_norm": 38.01151345738629, + "learning_rate": 2.9397164541764557e-08, + "loss": 1.243, + "step": 34106 + }, + { + "epoch": 2.906929174124265, + "grad_norm": 50.488759151059355, + "learning_rate": 2.9343499177280075e-08, + "loss": 1.2111, + "step": 34107 + }, + { + "epoch": 2.90701440381829, + "grad_norm": 46.63730173422994, + "learning_rate": 2.9289882697227102e-08, + "loss": 1.2785, + "step": 34108 + }, + { + "epoch": 2.9070996335123156, + "grad_norm": 42.34380171190652, + "learning_rate": 2.9236315102132985e-08, + "loss": 0.8372, + "step": 34109 + }, + { + "epoch": 2.907184863206341, + "grad_norm": 32.167879171281186, + "learning_rate": 2.918279639252397e-08, + "loss": 0.801, + "step": 34110 + }, + { + "epoch": 2.9072700929003665, + "grad_norm": 49.979295703414564, + "learning_rate": 2.9129326568927973e-08, + "loss": 1.3807, + "step": 34111 + }, + { + "epoch": 2.907355322594392, + "grad_norm": 50.25301696200351, + "learning_rate": 2.9075905631869017e-08, + "loss": 1.1651, + "step": 34112 + }, + { + "epoch": 2.907440552288417, + "grad_norm": 132.50512721183856, + "learning_rate": 2.9022533581873347e-08, + "loss": 3.3125, + "step": 34113 + }, + { + "epoch": 2.907525781982443, + "grad_norm": 71.78216227125537, + "learning_rate": 2.8969210419466098e-08, + "loss": 1.1692, + "step": 34114 + }, + { + "epoch": 2.907611011676468, + "grad_norm": 80.09182527967872, + "learning_rate": 2.8915936145171298e-08, + "loss": 1.9028, + "step": 34115 + }, + { + "epoch": 2.9076962413704934, + "grad_norm": 69.45450868688248, + "learning_rate": 2.886271075951297e-08, + "loss": 1.5457, + "step": 34116 + }, + { + "epoch": 2.907781471064519, + "grad_norm": 22.825377921810443, + "learning_rate": 2.880953426301458e-08, + "loss": 0.7009, + "step": 34117 + }, + { + "epoch": 2.9078667007585444, + "grad_norm": 61.96386751674061, + "learning_rate": 2.87564066561985e-08, + "loss": 1.642, + "step": 34118 + }, + { + "epoch": 2.90795193045257, + "grad_norm": 58.23059274380744, + "learning_rate": 2.8703327939588744e-08, + "loss": 1.4713, + "step": 34119 + }, + { + "epoch": 2.908037160146595, + "grad_norm": 70.31115720818627, + "learning_rate": 2.865029811370601e-08, + "loss": 2.3313, + "step": 34120 + }, + { + "epoch": 2.9081223898406203, + "grad_norm": 37.52179133324672, + "learning_rate": 2.859731717907155e-08, + "loss": 1.7683, + "step": 34121 + }, + { + "epoch": 2.908207619534646, + "grad_norm": 51.480446424678156, + "learning_rate": 2.854438513620772e-08, + "loss": 1.3829, + "step": 34122 + }, + { + "epoch": 2.9082928492286713, + "grad_norm": 28.192450410708414, + "learning_rate": 2.8491501985634108e-08, + "loss": 0.7661, + "step": 34123 + }, + { + "epoch": 2.9083780789226967, + "grad_norm": 62.772335398053436, + "learning_rate": 2.8438667727871406e-08, + "loss": 1.3673, + "step": 34124 + }, + { + "epoch": 2.908463308616722, + "grad_norm": 86.62512886125141, + "learning_rate": 2.8385882363438643e-08, + "loss": 2.0481, + "step": 34125 + }, + { + "epoch": 2.9085485383107477, + "grad_norm": 37.43996404466274, + "learning_rate": 2.8333145892854852e-08, + "loss": 1.1029, + "step": 34126 + }, + { + "epoch": 2.9086337680047727, + "grad_norm": 73.98117206249515, + "learning_rate": 2.8280458316639613e-08, + "loss": 1.6714, + "step": 34127 + }, + { + "epoch": 2.908718997698798, + "grad_norm": 64.36343754730746, + "learning_rate": 2.8227819635310294e-08, + "loss": 1.848, + "step": 34128 + }, + { + "epoch": 2.9088042273928236, + "grad_norm": 67.16209774365984, + "learning_rate": 2.817522984938481e-08, + "loss": 1.752, + "step": 34129 + }, + { + "epoch": 2.908889457086849, + "grad_norm": 64.6832317598972, + "learning_rate": 2.8122688959379972e-08, + "loss": 1.5161, + "step": 34130 + }, + { + "epoch": 2.9089746867808746, + "grad_norm": 86.23131958455588, + "learning_rate": 2.8070196965813147e-08, + "loss": 1.9734, + "step": 34131 + }, + { + "epoch": 2.9090599164748996, + "grad_norm": 56.11287989552052, + "learning_rate": 2.801775386920058e-08, + "loss": 1.7057, + "step": 34132 + }, + { + "epoch": 2.9091451461689255, + "grad_norm": 64.6054773625774, + "learning_rate": 2.7965359670057425e-08, + "loss": 1.5725, + "step": 34133 + }, + { + "epoch": 2.9092303758629505, + "grad_norm": 66.03788964454328, + "learning_rate": 2.791301436889937e-08, + "loss": 1.8333, + "step": 34134 + }, + { + "epoch": 2.909315605556976, + "grad_norm": 103.06546968632856, + "learning_rate": 2.7860717966241012e-08, + "loss": 1.9736, + "step": 34135 + }, + { + "epoch": 2.9094008352510015, + "grad_norm": 64.71731087014577, + "learning_rate": 2.7808470462596937e-08, + "loss": 1.6131, + "step": 34136 + }, + { + "epoch": 2.909486064945027, + "grad_norm": 44.91165511954816, + "learning_rate": 2.775627185848062e-08, + "loss": 1.3398, + "step": 34137 + }, + { + "epoch": 2.9095712946390524, + "grad_norm": 58.527396752359635, + "learning_rate": 2.7704122154405544e-08, + "loss": 1.2724, + "step": 34138 + }, + { + "epoch": 2.9096565243330774, + "grad_norm": 59.68064796172194, + "learning_rate": 2.7652021350884628e-08, + "loss": 0.8628, + "step": 34139 + }, + { + "epoch": 2.909741754027103, + "grad_norm": 22.421065271379835, + "learning_rate": 2.7599969448430243e-08, + "loss": 0.6063, + "step": 34140 + }, + { + "epoch": 2.9098269837211284, + "grad_norm": 24.941472500695937, + "learning_rate": 2.75479664475542e-08, + "loss": 0.8206, + "step": 34141 + }, + { + "epoch": 2.909912213415154, + "grad_norm": 30.59197974202686, + "learning_rate": 2.7496012348768308e-08, + "loss": 1.1496, + "step": 34142 + }, + { + "epoch": 2.9099974431091793, + "grad_norm": 30.011687406662258, + "learning_rate": 2.7444107152583276e-08, + "loss": 0.7043, + "step": 34143 + }, + { + "epoch": 2.9100826728032048, + "grad_norm": 40.112785772833774, + "learning_rate": 2.7392250859509807e-08, + "loss": 1.1277, + "step": 34144 + }, + { + "epoch": 2.9101679024972302, + "grad_norm": 21.565675568841534, + "learning_rate": 2.7340443470057487e-08, + "loss": 0.7909, + "step": 34145 + }, + { + "epoch": 2.9102531321912553, + "grad_norm": 46.61369679583534, + "learning_rate": 2.7288684984735357e-08, + "loss": 1.007, + "step": 34146 + }, + { + "epoch": 2.9103383618852807, + "grad_norm": 37.684233263688405, + "learning_rate": 2.723697540405357e-08, + "loss": 0.7803, + "step": 34147 + }, + { + "epoch": 2.910423591579306, + "grad_norm": 33.523578667034336, + "learning_rate": 2.718531472852004e-08, + "loss": 0.7671, + "step": 34148 + }, + { + "epoch": 2.9105088212733317, + "grad_norm": 59.49152419720559, + "learning_rate": 2.7133702958642704e-08, + "loss": 0.8019, + "step": 34149 + }, + { + "epoch": 2.910594050967357, + "grad_norm": 42.76727092704881, + "learning_rate": 2.7082140094930042e-08, + "loss": 1.2455, + "step": 34150 + }, + { + "epoch": 2.910679280661382, + "grad_norm": 73.96515616226719, + "learning_rate": 2.703062613788776e-08, + "loss": 2.0626, + "step": 34151 + }, + { + "epoch": 2.910764510355408, + "grad_norm": 52.7062090429038, + "learning_rate": 2.6979161088023788e-08, + "loss": 1.4155, + "step": 34152 + }, + { + "epoch": 2.910849740049433, + "grad_norm": 45.36155194115197, + "learning_rate": 2.692774494584327e-08, + "loss": 1.1067, + "step": 34153 + }, + { + "epoch": 2.9109349697434586, + "grad_norm": 46.85666899543239, + "learning_rate": 2.6876377711852474e-08, + "loss": 1.2062, + "step": 34154 + }, + { + "epoch": 2.911020199437484, + "grad_norm": 64.91919696039119, + "learning_rate": 2.682505938655655e-08, + "loss": 1.6337, + "step": 34155 + }, + { + "epoch": 2.9111054291315095, + "grad_norm": 47.3121632539443, + "learning_rate": 2.6773789970459542e-08, + "loss": 1.3597, + "step": 34156 + }, + { + "epoch": 2.911190658825535, + "grad_norm": 29.653792246234243, + "learning_rate": 2.6722569464066593e-08, + "loss": 1.0891, + "step": 34157 + }, + { + "epoch": 2.91127588851956, + "grad_norm": 29.99301878848825, + "learning_rate": 2.6671397867880644e-08, + "loss": 1.222, + "step": 34158 + }, + { + "epoch": 2.9113611182135855, + "grad_norm": 31.422002393940378, + "learning_rate": 2.6620275182405175e-08, + "loss": 0.8556, + "step": 34159 + }, + { + "epoch": 2.911446347907611, + "grad_norm": 53.25252192922847, + "learning_rate": 2.6569201408143675e-08, + "loss": 1.4809, + "step": 34160 + }, + { + "epoch": 2.9115315776016364, + "grad_norm": 55.089612862914805, + "learning_rate": 2.6518176545597406e-08, + "loss": 0.9894, + "step": 34161 + }, + { + "epoch": 2.911616807295662, + "grad_norm": 77.81350715247534, + "learning_rate": 2.646720059526875e-08, + "loss": 1.5723, + "step": 34162 + }, + { + "epoch": 2.9117020369896873, + "grad_norm": 65.15773595710536, + "learning_rate": 2.6416273557658968e-08, + "loss": 1.9617, + "step": 34163 + }, + { + "epoch": 2.911787266683713, + "grad_norm": 14.331108773690284, + "learning_rate": 2.6365395433268213e-08, + "loss": 0.3869, + "step": 34164 + }, + { + "epoch": 2.911872496377738, + "grad_norm": 26.585351134181632, + "learning_rate": 2.6314566222598315e-08, + "loss": 0.9645, + "step": 34165 + }, + { + "epoch": 2.9119577260717633, + "grad_norm": 18.698200225777903, + "learning_rate": 2.6263785926147757e-08, + "loss": 0.6863, + "step": 34166 + }, + { + "epoch": 2.9120429557657888, + "grad_norm": 56.095368896069466, + "learning_rate": 2.6213054544416694e-08, + "loss": 1.6842, + "step": 34167 + }, + { + "epoch": 2.9121281854598142, + "grad_norm": 43.632704377807066, + "learning_rate": 2.616237207790362e-08, + "loss": 1.1982, + "step": 34168 + }, + { + "epoch": 2.9122134151538397, + "grad_norm": 45.09329591582645, + "learning_rate": 2.611173852710758e-08, + "loss": 1.4424, + "step": 34169 + }, + { + "epoch": 2.912298644847865, + "grad_norm": 67.177446419462, + "learning_rate": 2.606115389252595e-08, + "loss": 1.9795, + "step": 34170 + }, + { + "epoch": 2.9123838745418906, + "grad_norm": 38.47490235155596, + "learning_rate": 2.6010618174656666e-08, + "loss": 1.1712, + "step": 34171 + }, + { + "epoch": 2.9124691042359157, + "grad_norm": 38.75876395893743, + "learning_rate": 2.5960131373996e-08, + "loss": 1.0429, + "step": 34172 + }, + { + "epoch": 2.912554333929941, + "grad_norm": 26.19993783801743, + "learning_rate": 2.590969349104133e-08, + "loss": 0.7651, + "step": 34173 + }, + { + "epoch": 2.9126395636239666, + "grad_norm": 54.809543127140195, + "learning_rate": 2.5859304526288375e-08, + "loss": 1.0006, + "step": 34174 + }, + { + "epoch": 2.912724793317992, + "grad_norm": 25.00135476977374, + "learning_rate": 2.580896448023229e-08, + "loss": 0.4663, + "step": 34175 + }, + { + "epoch": 2.9128100230120175, + "grad_norm": 72.52569599812941, + "learning_rate": 2.575867335336879e-08, + "loss": 1.901, + "step": 34176 + }, + { + "epoch": 2.9128952527060425, + "grad_norm": 22.684745296360347, + "learning_rate": 2.5708431146191927e-08, + "loss": 0.7156, + "step": 34177 + }, + { + "epoch": 2.9129804824000685, + "grad_norm": 31.92687170142077, + "learning_rate": 2.56582378591963e-08, + "loss": 1.0632, + "step": 34178 + }, + { + "epoch": 2.9130657120940935, + "grad_norm": 64.84548321738151, + "learning_rate": 2.560809349287541e-08, + "loss": 1.3271, + "step": 34179 + }, + { + "epoch": 2.913150941788119, + "grad_norm": 23.39095175721065, + "learning_rate": 2.5557998047721633e-08, + "loss": 0.829, + "step": 34180 + }, + { + "epoch": 2.9132361714821444, + "grad_norm": 45.85693286014164, + "learning_rate": 2.5507951524229025e-08, + "loss": 1.3142, + "step": 34181 + }, + { + "epoch": 2.91332140117617, + "grad_norm": 39.80144721831509, + "learning_rate": 2.54579539228883e-08, + "loss": 1.3584, + "step": 34182 + }, + { + "epoch": 2.9134066308701954, + "grad_norm": 27.574696090307565, + "learning_rate": 2.5408005244192958e-08, + "loss": 1.1387, + "step": 34183 + }, + { + "epoch": 2.9134918605642204, + "grad_norm": 48.31230785569427, + "learning_rate": 2.53581054886326e-08, + "loss": 0.8557, + "step": 34184 + }, + { + "epoch": 2.913577090258246, + "grad_norm": 29.968713829338935, + "learning_rate": 2.530825465669795e-08, + "loss": 1.3463, + "step": 34185 + }, + { + "epoch": 2.9136623199522713, + "grad_norm": 27.29249048810255, + "learning_rate": 2.5258452748880836e-08, + "loss": 0.8316, + "step": 34186 + }, + { + "epoch": 2.913747549646297, + "grad_norm": 25.728508242849088, + "learning_rate": 2.5208699765669753e-08, + "loss": 0.8689, + "step": 34187 + }, + { + "epoch": 2.9138327793403223, + "grad_norm": 49.104182321765755, + "learning_rate": 2.5158995707553758e-08, + "loss": 1.1008, + "step": 34188 + }, + { + "epoch": 2.9139180090343477, + "grad_norm": 66.22220547207421, + "learning_rate": 2.5109340575023013e-08, + "loss": 1.0876, + "step": 34189 + }, + { + "epoch": 2.914003238728373, + "grad_norm": 44.897609707369035, + "learning_rate": 2.505973436856435e-08, + "loss": 1.1221, + "step": 34190 + }, + { + "epoch": 2.914088468422398, + "grad_norm": 86.06110879822262, + "learning_rate": 2.5010177088667376e-08, + "loss": 2.2422, + "step": 34191 + }, + { + "epoch": 2.9141736981164237, + "grad_norm": 62.708352344057765, + "learning_rate": 2.4960668735817817e-08, + "loss": 1.5324, + "step": 34192 + }, + { + "epoch": 2.914258927810449, + "grad_norm": 68.8679110946504, + "learning_rate": 2.491120931050306e-08, + "loss": 1.7512, + "step": 34193 + }, + { + "epoch": 2.9143441575044746, + "grad_norm": 27.610982451112783, + "learning_rate": 2.4861798813209935e-08, + "loss": 0.992, + "step": 34194 + }, + { + "epoch": 2.9144293871985, + "grad_norm": 37.68303570272867, + "learning_rate": 2.4812437244423614e-08, + "loss": 1.057, + "step": 34195 + }, + { + "epoch": 2.914514616892525, + "grad_norm": 63.51227527845353, + "learning_rate": 2.4763124604630373e-08, + "loss": 1.2782, + "step": 34196 + }, + { + "epoch": 2.914599846586551, + "grad_norm": 63.41073121342175, + "learning_rate": 2.4713860894314823e-08, + "loss": 1.5926, + "step": 34197 + }, + { + "epoch": 2.914685076280576, + "grad_norm": 48.957890208698615, + "learning_rate": 2.4664646113961575e-08, + "loss": 1.6703, + "step": 34198 + }, + { + "epoch": 2.9147703059746015, + "grad_norm": 34.892438177926564, + "learning_rate": 2.4615480264054693e-08, + "loss": 0.9579, + "step": 34199 + }, + { + "epoch": 2.914855535668627, + "grad_norm": 45.43961692031526, + "learning_rate": 2.4566363345077116e-08, + "loss": 1.3813, + "step": 34200 + }, + { + "epoch": 2.9149407653626525, + "grad_norm": 46.30537866106272, + "learning_rate": 2.451729535751235e-08, + "loss": 0.9489, + "step": 34201 + }, + { + "epoch": 2.915025995056678, + "grad_norm": 60.28222367816907, + "learning_rate": 2.446827630184334e-08, + "loss": 1.7908, + "step": 34202 + }, + { + "epoch": 2.915111224750703, + "grad_norm": 48.24868486603814, + "learning_rate": 2.4419306178551372e-08, + "loss": 1.0594, + "step": 34203 + }, + { + "epoch": 2.9151964544447284, + "grad_norm": 42.882433589407064, + "learning_rate": 2.4370384988118278e-08, + "loss": 1.174, + "step": 34204 + }, + { + "epoch": 2.915281684138754, + "grad_norm": 79.30844544126569, + "learning_rate": 2.4321512731025897e-08, + "loss": 1.8748, + "step": 34205 + }, + { + "epoch": 2.9153669138327793, + "grad_norm": 31.908380879830183, + "learning_rate": 2.4272689407753847e-08, + "loss": 1.3395, + "step": 34206 + }, + { + "epoch": 2.915452143526805, + "grad_norm": 61.45675071568464, + "learning_rate": 2.422391501878285e-08, + "loss": 1.1269, + "step": 34207 + }, + { + "epoch": 2.9155373732208303, + "grad_norm": 62.48000670731112, + "learning_rate": 2.4175189564592528e-08, + "loss": 1.6725, + "step": 34208 + }, + { + "epoch": 2.9156226029148558, + "grad_norm": 36.446666127586504, + "learning_rate": 2.4126513045661937e-08, + "loss": 0.815, + "step": 34209 + }, + { + "epoch": 2.915707832608881, + "grad_norm": 37.17759464767895, + "learning_rate": 2.407788546247014e-08, + "loss": 0.835, + "step": 34210 + }, + { + "epoch": 2.9157930623029062, + "grad_norm": 70.38437553528077, + "learning_rate": 2.4029306815494534e-08, + "loss": 1.8124, + "step": 34211 + }, + { + "epoch": 2.9158782919969317, + "grad_norm": 24.19096292424992, + "learning_rate": 2.3980777105213625e-08, + "loss": 0.6892, + "step": 34212 + }, + { + "epoch": 2.915963521690957, + "grad_norm": 51.1801558589941, + "learning_rate": 2.393229633210481e-08, + "loss": 0.7741, + "step": 34213 + }, + { + "epoch": 2.9160487513849827, + "grad_norm": 60.877605106695846, + "learning_rate": 2.3883864496644372e-08, + "loss": 1.469, + "step": 34214 + }, + { + "epoch": 2.9161339810790077, + "grad_norm": 40.51192758162728, + "learning_rate": 2.3835481599308595e-08, + "loss": 1.3449, + "step": 34215 + }, + { + "epoch": 2.9162192107730336, + "grad_norm": 21.869413752413248, + "learning_rate": 2.3787147640573772e-08, + "loss": 1.0872, + "step": 34216 + }, + { + "epoch": 2.9163044404670586, + "grad_norm": 59.40832065275373, + "learning_rate": 2.373886262091507e-08, + "loss": 1.4199, + "step": 34217 + }, + { + "epoch": 2.916389670161084, + "grad_norm": 47.552638660260016, + "learning_rate": 2.369062654080767e-08, + "loss": 1.3022, + "step": 34218 + }, + { + "epoch": 2.9164748998551095, + "grad_norm": 109.49962592115197, + "learning_rate": 2.3642439400725082e-08, + "loss": 2.7805, + "step": 34219 + }, + { + "epoch": 2.916560129549135, + "grad_norm": 63.5291419176517, + "learning_rate": 2.3594301201141366e-08, + "loss": 1.402, + "step": 34220 + }, + { + "epoch": 2.9166453592431605, + "grad_norm": 64.21953027704063, + "learning_rate": 2.3546211942530594e-08, + "loss": 0.9978, + "step": 34221 + }, + { + "epoch": 2.9167305889371855, + "grad_norm": 90.11661234004002, + "learning_rate": 2.3498171625365717e-08, + "loss": 1.9553, + "step": 34222 + }, + { + "epoch": 2.916815818631211, + "grad_norm": 24.008653083745095, + "learning_rate": 2.345018025011858e-08, + "loss": 0.9882, + "step": 34223 + }, + { + "epoch": 2.9169010483252364, + "grad_norm": 66.94136616233202, + "learning_rate": 2.3402237817261032e-08, + "loss": 2.031, + "step": 34224 + }, + { + "epoch": 2.916986278019262, + "grad_norm": 38.26445342211446, + "learning_rate": 2.3354344327265465e-08, + "loss": 1.4286, + "step": 34225 + }, + { + "epoch": 2.9170715077132874, + "grad_norm": 27.74622583232455, + "learning_rate": 2.3306499780602066e-08, + "loss": 0.8868, + "step": 34226 + }, + { + "epoch": 2.917156737407313, + "grad_norm": 68.86967417164072, + "learning_rate": 2.325870417774212e-08, + "loss": 1.1391, + "step": 34227 + }, + { + "epoch": 2.9172419671013383, + "grad_norm": 36.33760400591312, + "learning_rate": 2.3210957519154697e-08, + "loss": 0.9957, + "step": 34228 + }, + { + "epoch": 2.9173271967953633, + "grad_norm": 64.9841601194238, + "learning_rate": 2.3163259805310533e-08, + "loss": 1.3543, + "step": 34229 + }, + { + "epoch": 2.917412426489389, + "grad_norm": 77.66215570380932, + "learning_rate": 2.3115611036677587e-08, + "loss": 1.5659, + "step": 34230 + }, + { + "epoch": 2.9174976561834143, + "grad_norm": 61.39756794792044, + "learning_rate": 2.3068011213725483e-08, + "loss": 1.851, + "step": 34231 + }, + { + "epoch": 2.9175828858774397, + "grad_norm": 88.9433862539958, + "learning_rate": 2.3020460336921623e-08, + "loss": 2.2925, + "step": 34232 + }, + { + "epoch": 2.917668115571465, + "grad_norm": 34.10482315631374, + "learning_rate": 2.2972958406733968e-08, + "loss": 1.1363, + "step": 34233 + }, + { + "epoch": 2.9177533452654902, + "grad_norm": 52.10427794490419, + "learning_rate": 2.2925505423629368e-08, + "loss": 1.9251, + "step": 34234 + }, + { + "epoch": 2.917838574959516, + "grad_norm": 28.082704285266576, + "learning_rate": 2.287810138807467e-08, + "loss": 0.8927, + "step": 34235 + }, + { + "epoch": 2.917923804653541, + "grad_norm": 51.04353957584302, + "learning_rate": 2.2830746300536167e-08, + "loss": 1.3937, + "step": 34236 + }, + { + "epoch": 2.9180090343475666, + "grad_norm": 25.324503020570127, + "learning_rate": 2.2783440161479596e-08, + "loss": 0.8425, + "step": 34237 + }, + { + "epoch": 2.918094264041592, + "grad_norm": 59.79155041094692, + "learning_rate": 2.2736182971370145e-08, + "loss": 1.3557, + "step": 34238 + }, + { + "epoch": 2.9181794937356176, + "grad_norm": 42.88609002088125, + "learning_rate": 2.2688974730672998e-08, + "loss": 1.0699, + "step": 34239 + }, + { + "epoch": 2.918264723429643, + "grad_norm": 51.97133991379263, + "learning_rate": 2.2641815439851665e-08, + "loss": 1.6511, + "step": 34240 + }, + { + "epoch": 2.918349953123668, + "grad_norm": 39.23387993568269, + "learning_rate": 2.259470509937023e-08, + "loss": 0.8547, + "step": 34241 + }, + { + "epoch": 2.9184351828176935, + "grad_norm": 27.4053694918247, + "learning_rate": 2.2547643709691647e-08, + "loss": 0.9474, + "step": 34242 + }, + { + "epoch": 2.918520412511719, + "grad_norm": 59.11766204304711, + "learning_rate": 2.250063127127944e-08, + "loss": 2.1104, + "step": 34243 + }, + { + "epoch": 2.9186056422057445, + "grad_norm": 36.11574279394349, + "learning_rate": 2.245366778459601e-08, + "loss": 1.3564, + "step": 34244 + }, + { + "epoch": 2.91869087189977, + "grad_norm": 64.84073246219413, + "learning_rate": 2.2406753250102665e-08, + "loss": 1.6244, + "step": 34245 + }, + { + "epoch": 2.9187761015937954, + "grad_norm": 31.3707928826401, + "learning_rate": 2.2359887668260695e-08, + "loss": 0.8726, + "step": 34246 + }, + { + "epoch": 2.918861331287821, + "grad_norm": 60.9100038319579, + "learning_rate": 2.2313071039531953e-08, + "loss": 2.1119, + "step": 34247 + }, + { + "epoch": 2.918946560981846, + "grad_norm": 60.67282509734189, + "learning_rate": 2.2266303364376074e-08, + "loss": 0.8331, + "step": 34248 + }, + { + "epoch": 2.9190317906758714, + "grad_norm": 42.25191998823533, + "learning_rate": 2.2219584643252688e-08, + "loss": 1.4662, + "step": 34249 + }, + { + "epoch": 2.919117020369897, + "grad_norm": 31.17844279432678, + "learning_rate": 2.2172914876621987e-08, + "loss": 1.3147, + "step": 34250 + }, + { + "epoch": 2.9192022500639223, + "grad_norm": 49.1865099511709, + "learning_rate": 2.212629406494249e-08, + "loss": 0.9372, + "step": 34251 + }, + { + "epoch": 2.919287479757948, + "grad_norm": 76.95575848499809, + "learning_rate": 2.2079722208672715e-08, + "loss": 2.0908, + "step": 34252 + }, + { + "epoch": 2.919372709451973, + "grad_norm": 37.25643461406146, + "learning_rate": 2.2033199308271193e-08, + "loss": 1.2468, + "step": 34253 + }, + { + "epoch": 2.9194579391459987, + "grad_norm": 30.148338823643577, + "learning_rate": 2.1986725364195327e-08, + "loss": 0.8801, + "step": 34254 + }, + { + "epoch": 2.9195431688400237, + "grad_norm": 59.50970146779204, + "learning_rate": 2.1940300376901424e-08, + "loss": 1.5438, + "step": 34255 + }, + { + "epoch": 2.919628398534049, + "grad_norm": 41.433565660313995, + "learning_rate": 2.1893924346846896e-08, + "loss": 0.9836, + "step": 34256 + }, + { + "epoch": 2.9197136282280747, + "grad_norm": 24.726888615653756, + "learning_rate": 2.1847597274487487e-08, + "loss": 0.6959, + "step": 34257 + }, + { + "epoch": 2.9197988579221, + "grad_norm": 24.876681691704814, + "learning_rate": 2.1801319160278943e-08, + "loss": 0.608, + "step": 34258 + }, + { + "epoch": 2.9198840876161256, + "grad_norm": 77.2916400814847, + "learning_rate": 2.1755090004675904e-08, + "loss": 2.5208, + "step": 34259 + }, + { + "epoch": 2.9199693173101506, + "grad_norm": 97.80793326991417, + "learning_rate": 2.170890980813356e-08, + "loss": 1.1988, + "step": 34260 + }, + { + "epoch": 2.920054547004176, + "grad_norm": 32.29814463943268, + "learning_rate": 2.166277857110599e-08, + "loss": 1.0515, + "step": 34261 + }, + { + "epoch": 2.9201397766982016, + "grad_norm": 72.18109150374833, + "learning_rate": 2.1616696294046723e-08, + "loss": 1.4847, + "step": 34262 + }, + { + "epoch": 2.920225006392227, + "grad_norm": 68.95371169470947, + "learning_rate": 2.1570662977409284e-08, + "loss": 2.0369, + "step": 34263 + }, + { + "epoch": 2.9203102360862525, + "grad_norm": 70.43914219261288, + "learning_rate": 2.152467862164609e-08, + "loss": 1.1121, + "step": 34264 + }, + { + "epoch": 2.920395465780278, + "grad_norm": 68.16983135904249, + "learning_rate": 2.1478743227209e-08, + "loss": 2.1912, + "step": 34265 + }, + { + "epoch": 2.9204806954743034, + "grad_norm": 67.90074894801414, + "learning_rate": 2.1432856794550426e-08, + "loss": 1.5387, + "step": 34266 + }, + { + "epoch": 2.9205659251683285, + "grad_norm": 59.19660803243447, + "learning_rate": 2.138701932412168e-08, + "loss": 1.4312, + "step": 34267 + }, + { + "epoch": 2.920651154862354, + "grad_norm": 52.884552919422596, + "learning_rate": 2.134123081637296e-08, + "loss": 1.336, + "step": 34268 + }, + { + "epoch": 2.9207363845563794, + "grad_norm": 77.76492855877949, + "learning_rate": 2.1295491271755007e-08, + "loss": 1.3175, + "step": 34269 + }, + { + "epoch": 2.920821614250405, + "grad_norm": 37.6729399379197, + "learning_rate": 2.124980069071747e-08, + "loss": 1.0949, + "step": 34270 + }, + { + "epoch": 2.9209068439444303, + "grad_norm": 77.54784180991676, + "learning_rate": 2.1204159073709985e-08, + "loss": 1.9636, + "step": 34271 + }, + { + "epoch": 2.9209920736384554, + "grad_norm": 52.17529503059117, + "learning_rate": 2.1158566421181083e-08, + "loss": 1.32, + "step": 34272 + }, + { + "epoch": 2.9210773033324813, + "grad_norm": 44.0624105996811, + "learning_rate": 2.1113022733579292e-08, + "loss": 1.5011, + "step": 34273 + }, + { + "epoch": 2.9211625330265063, + "grad_norm": 73.77327759519643, + "learning_rate": 2.106752801135259e-08, + "loss": 1.98, + "step": 34274 + }, + { + "epoch": 2.9212477627205318, + "grad_norm": 26.031207122929025, + "learning_rate": 2.1022082254948395e-08, + "loss": 0.8799, + "step": 34275 + }, + { + "epoch": 2.9213329924145572, + "grad_norm": 61.42536438895077, + "learning_rate": 2.0976685464813574e-08, + "loss": 2.055, + "step": 34276 + }, + { + "epoch": 2.9214182221085827, + "grad_norm": 64.4583735929862, + "learning_rate": 2.0931337641394433e-08, + "loss": 1.275, + "step": 34277 + }, + { + "epoch": 2.921503451802608, + "grad_norm": 41.962312331616985, + "learning_rate": 2.088603878513673e-08, + "loss": 1.4587, + "step": 34278 + }, + { + "epoch": 2.921588681496633, + "grad_norm": 77.99917695310556, + "learning_rate": 2.0840788896486774e-08, + "loss": 1.5464, + "step": 34279 + }, + { + "epoch": 2.9216739111906587, + "grad_norm": 32.76869661634209, + "learning_rate": 2.0795587975889208e-08, + "loss": 1.074, + "step": 34280 + }, + { + "epoch": 2.921759140884684, + "grad_norm": 44.19631672948133, + "learning_rate": 2.075043602378868e-08, + "loss": 1.4217, + "step": 34281 + }, + { + "epoch": 2.9218443705787096, + "grad_norm": 38.66918374361011, + "learning_rate": 2.0705333040628163e-08, + "loss": 0.908, + "step": 34282 + }, + { + "epoch": 2.921929600272735, + "grad_norm": 41.96067957482825, + "learning_rate": 2.0660279026852858e-08, + "loss": 1.1619, + "step": 34283 + }, + { + "epoch": 2.9220148299667605, + "grad_norm": 44.8010555423551, + "learning_rate": 2.0615273982905192e-08, + "loss": 1.5101, + "step": 34284 + }, + { + "epoch": 2.922100059660786, + "grad_norm": 31.477828235017206, + "learning_rate": 2.0570317909227033e-08, + "loss": 0.7713, + "step": 34285 + }, + { + "epoch": 2.922185289354811, + "grad_norm": 57.699331327072834, + "learning_rate": 2.0525410806261915e-08, + "loss": 1.6047, + "step": 34286 + }, + { + "epoch": 2.9222705190488365, + "grad_norm": 53.84965193829773, + "learning_rate": 2.0480552674450592e-08, + "loss": 1.2117, + "step": 34287 + }, + { + "epoch": 2.922355748742862, + "grad_norm": 66.53346791226855, + "learning_rate": 2.043574351423383e-08, + "loss": 1.6076, + "step": 34288 + }, + { + "epoch": 2.9224409784368874, + "grad_norm": 48.56791413530065, + "learning_rate": 2.0390983326053493e-08, + "loss": 1.4295, + "step": 34289 + }, + { + "epoch": 2.922526208130913, + "grad_norm": 64.827465694647, + "learning_rate": 2.0346272110348676e-08, + "loss": 1.4344, + "step": 34290 + }, + { + "epoch": 2.9226114378249384, + "grad_norm": 22.054241746886625, + "learning_rate": 2.030160986755958e-08, + "loss": 0.5789, + "step": 34291 + }, + { + "epoch": 2.922696667518964, + "grad_norm": 80.67334214159597, + "learning_rate": 2.02569965981253e-08, + "loss": 1.8575, + "step": 34292 + }, + { + "epoch": 2.922781897212989, + "grad_norm": 45.96526668785599, + "learning_rate": 2.021243230248493e-08, + "loss": 1.4497, + "step": 34293 + }, + { + "epoch": 2.9228671269070143, + "grad_norm": 84.62383286900851, + "learning_rate": 2.016791698107701e-08, + "loss": 1.9314, + "step": 34294 + }, + { + "epoch": 2.92295235660104, + "grad_norm": 54.923404796957, + "learning_rate": 2.0123450634338404e-08, + "loss": 1.4055, + "step": 34295 + }, + { + "epoch": 2.9230375862950653, + "grad_norm": 45.92392704370269, + "learning_rate": 2.007903326270655e-08, + "loss": 1.1948, + "step": 34296 + }, + { + "epoch": 2.9231228159890907, + "grad_norm": 57.79670087989136, + "learning_rate": 2.003466486661887e-08, + "loss": 1.4021, + "step": 34297 + }, + { + "epoch": 2.9232080456831158, + "grad_norm": 73.78439727383744, + "learning_rate": 1.9990345446511128e-08, + "loss": 1.6111, + "step": 34298 + }, + { + "epoch": 2.9232932753771412, + "grad_norm": 51.1544326548817, + "learning_rate": 1.9946075002819643e-08, + "loss": 1.1976, + "step": 34299 + }, + { + "epoch": 2.9233785050711667, + "grad_norm": 85.79854070692673, + "learning_rate": 1.9901853535980176e-08, + "loss": 1.4712, + "step": 34300 + }, + { + "epoch": 2.923463734765192, + "grad_norm": 29.00470701932905, + "learning_rate": 1.985768104642627e-08, + "loss": 0.6689, + "step": 34301 + }, + { + "epoch": 2.9235489644592176, + "grad_norm": 58.82878456607888, + "learning_rate": 1.9813557534593687e-08, + "loss": 1.2561, + "step": 34302 + }, + { + "epoch": 2.923634194153243, + "grad_norm": 58.069105590814, + "learning_rate": 1.9769483000915413e-08, + "loss": 1.9728, + "step": 34303 + }, + { + "epoch": 2.9237194238472686, + "grad_norm": 59.49700635505229, + "learning_rate": 1.9725457445825548e-08, + "loss": 1.1447, + "step": 34304 + }, + { + "epoch": 2.9238046535412936, + "grad_norm": 97.35935411608213, + "learning_rate": 1.9681480869757075e-08, + "loss": 2.0261, + "step": 34305 + }, + { + "epoch": 2.923889883235319, + "grad_norm": 27.785888110177723, + "learning_rate": 1.9637553273141874e-08, + "loss": 0.9134, + "step": 34306 + }, + { + "epoch": 2.9239751129293445, + "grad_norm": 61.06203586569015, + "learning_rate": 1.9593674656412377e-08, + "loss": 1.3888, + "step": 34307 + }, + { + "epoch": 2.92406034262337, + "grad_norm": 54.784690397964354, + "learning_rate": 1.9549845020000456e-08, + "loss": 1.4527, + "step": 34308 + }, + { + "epoch": 2.9241455723173955, + "grad_norm": 30.803323007998994, + "learning_rate": 1.950606436433633e-08, + "loss": 0.9731, + "step": 34309 + }, + { + "epoch": 2.924230802011421, + "grad_norm": 76.06848674368578, + "learning_rate": 1.9462332689851315e-08, + "loss": 2.1992, + "step": 34310 + }, + { + "epoch": 2.9243160317054464, + "grad_norm": 67.31479748853297, + "learning_rate": 1.941864999697507e-08, + "loss": 2.3507, + "step": 34311 + }, + { + "epoch": 2.9244012613994714, + "grad_norm": 39.43463484277006, + "learning_rate": 1.937501628613725e-08, + "loss": 1.1459, + "step": 34312 + }, + { + "epoch": 2.924486491093497, + "grad_norm": 60.985562588337594, + "learning_rate": 1.9331431557766955e-08, + "loss": 1.4743, + "step": 34313 + }, + { + "epoch": 2.9245717207875224, + "grad_norm": 57.386029813623736, + "learning_rate": 1.9287895812292735e-08, + "loss": 1.1809, + "step": 34314 + }, + { + "epoch": 2.924656950481548, + "grad_norm": 44.47842331333864, + "learning_rate": 1.9244409050143687e-08, + "loss": 1.1093, + "step": 34315 + }, + { + "epoch": 2.9247421801755733, + "grad_norm": 29.78863353364442, + "learning_rate": 1.9200971271746138e-08, + "loss": 0.6575, + "step": 34316 + }, + { + "epoch": 2.9248274098695983, + "grad_norm": 92.19993332089001, + "learning_rate": 1.915758247752808e-08, + "loss": 2.1799, + "step": 34317 + }, + { + "epoch": 2.9249126395636242, + "grad_norm": 45.46617083856337, + "learning_rate": 1.911424266791584e-08, + "loss": 1.4994, + "step": 34318 + }, + { + "epoch": 2.9249978692576493, + "grad_norm": 110.9168235060995, + "learning_rate": 1.9070951843335738e-08, + "loss": 1.155, + "step": 34319 + }, + { + "epoch": 2.9250830989516747, + "grad_norm": 29.542182243693574, + "learning_rate": 1.9027710004214105e-08, + "loss": 1.1261, + "step": 34320 + }, + { + "epoch": 2.9251683286457, + "grad_norm": 33.648598377255915, + "learning_rate": 1.8984517150975046e-08, + "loss": 1.3255, + "step": 34321 + }, + { + "epoch": 2.9252535583397257, + "grad_norm": 24.32955873358939, + "learning_rate": 1.894137328404433e-08, + "loss": 1.0018, + "step": 34322 + }, + { + "epoch": 2.925338788033751, + "grad_norm": 79.34742424827661, + "learning_rate": 1.8898278403845504e-08, + "loss": 2.1789, + "step": 34323 + }, + { + "epoch": 2.925424017727776, + "grad_norm": 64.87619314087404, + "learning_rate": 1.8855232510803236e-08, + "loss": 1.5951, + "step": 34324 + }, + { + "epoch": 2.9255092474218016, + "grad_norm": 30.927506437486674, + "learning_rate": 1.8812235605340512e-08, + "loss": 1.6275, + "step": 34325 + }, + { + "epoch": 2.925594477115827, + "grad_norm": 120.49740825701791, + "learning_rate": 1.8769287687880333e-08, + "loss": 3.2898, + "step": 34326 + }, + { + "epoch": 2.9256797068098526, + "grad_norm": 27.707663904255693, + "learning_rate": 1.872638875884458e-08, + "loss": 1.0135, + "step": 34327 + }, + { + "epoch": 2.925764936503878, + "grad_norm": 47.17276213955129, + "learning_rate": 1.8683538818655145e-08, + "loss": 1.419, + "step": 34328 + }, + { + "epoch": 2.9258501661979035, + "grad_norm": 49.61400213674098, + "learning_rate": 1.8640737867733903e-08, + "loss": 1.4371, + "step": 34329 + }, + { + "epoch": 2.925935395891929, + "grad_norm": 19.833870728844847, + "learning_rate": 1.8597985906501636e-08, + "loss": 0.4171, + "step": 34330 + }, + { + "epoch": 2.926020625585954, + "grad_norm": 58.97860824764104, + "learning_rate": 1.855528293537856e-08, + "loss": 1.4809, + "step": 34331 + }, + { + "epoch": 2.9261058552799795, + "grad_norm": 59.454486013782436, + "learning_rate": 1.851262895478545e-08, + "loss": 1.241, + "step": 34332 + }, + { + "epoch": 2.926191084974005, + "grad_norm": 99.9457466284388, + "learning_rate": 1.847002396514086e-08, + "loss": 2.2158, + "step": 34333 + }, + { + "epoch": 2.9262763146680304, + "grad_norm": 45.75962412021554, + "learning_rate": 1.84274679668639e-08, + "loss": 0.9745, + "step": 34334 + }, + { + "epoch": 2.926361544362056, + "grad_norm": 44.7616420636042, + "learning_rate": 1.8384960960373678e-08, + "loss": 1.2836, + "step": 34335 + }, + { + "epoch": 2.926446774056081, + "grad_norm": 75.27767551522604, + "learning_rate": 1.8342502946087638e-08, + "loss": 1.9346, + "step": 34336 + }, + { + "epoch": 2.926532003750107, + "grad_norm": 89.41961690874116, + "learning_rate": 1.8300093924423223e-08, + "loss": 2.1802, + "step": 34337 + }, + { + "epoch": 2.926617233444132, + "grad_norm": 31.004951553386316, + "learning_rate": 1.8257733895798435e-08, + "loss": 1.1453, + "step": 34338 + }, + { + "epoch": 2.9267024631381573, + "grad_norm": 37.04827453254898, + "learning_rate": 1.821542286062905e-08, + "loss": 0.9781, + "step": 34339 + }, + { + "epoch": 2.9267876928321828, + "grad_norm": 35.45209934263173, + "learning_rate": 1.8173160819331404e-08, + "loss": 0.9956, + "step": 34340 + }, + { + "epoch": 2.9268729225262082, + "grad_norm": 83.20940572642937, + "learning_rate": 1.813094777232127e-08, + "loss": 1.5873, + "step": 34341 + }, + { + "epoch": 2.9269581522202337, + "grad_norm": 42.34639063014041, + "learning_rate": 1.808878372001388e-08, + "loss": 1.4652, + "step": 34342 + }, + { + "epoch": 2.9270433819142587, + "grad_norm": 81.5048511008768, + "learning_rate": 1.804666866282334e-08, + "loss": 1.6024, + "step": 34343 + }, + { + "epoch": 2.927128611608284, + "grad_norm": 27.951119195555126, + "learning_rate": 1.8004602601163767e-08, + "loss": 0.8053, + "step": 34344 + }, + { + "epoch": 2.9272138413023097, + "grad_norm": 30.57127792706728, + "learning_rate": 1.7962585535449826e-08, + "loss": 0.8743, + "step": 34345 + }, + { + "epoch": 2.927299070996335, + "grad_norm": 66.23287852281292, + "learning_rate": 1.792061746609397e-08, + "loss": 1.6674, + "step": 34346 + }, + { + "epoch": 2.9273843006903606, + "grad_norm": 81.79259943771963, + "learning_rate": 1.7878698393509753e-08, + "loss": 1.6528, + "step": 34347 + }, + { + "epoch": 2.927469530384386, + "grad_norm": 71.55289550806697, + "learning_rate": 1.7836828318107956e-08, + "loss": 2.1423, + "step": 34348 + }, + { + "epoch": 2.9275547600784115, + "grad_norm": 71.85117909952491, + "learning_rate": 1.7795007240301587e-08, + "loss": 1.5177, + "step": 34349 + }, + { + "epoch": 2.9276399897724366, + "grad_norm": 28.54539953268273, + "learning_rate": 1.775323516050198e-08, + "loss": 0.8366, + "step": 34350 + }, + { + "epoch": 2.927725219466462, + "grad_norm": 63.05309124299957, + "learning_rate": 1.7711512079119362e-08, + "loss": 1.8152, + "step": 34351 + }, + { + "epoch": 2.9278104491604875, + "grad_norm": 56.88221437424026, + "learning_rate": 1.7669837996563965e-08, + "loss": 1.52, + "step": 34352 + }, + { + "epoch": 2.927895678854513, + "grad_norm": 65.01558753633641, + "learning_rate": 1.762821291324601e-08, + "loss": 1.3708, + "step": 34353 + }, + { + "epoch": 2.9279809085485384, + "grad_norm": 33.43104717530512, + "learning_rate": 1.7586636829574622e-08, + "loss": 0.9375, + "step": 34354 + }, + { + "epoch": 2.9280661382425635, + "grad_norm": 67.1624598414627, + "learning_rate": 1.754510974595891e-08, + "loss": 1.9156, + "step": 34355 + }, + { + "epoch": 2.9281513679365894, + "grad_norm": 52.825322864192955, + "learning_rate": 1.750363166280744e-08, + "loss": 1.9209, + "step": 34356 + }, + { + "epoch": 2.9282365976306144, + "grad_norm": 63.03021031010059, + "learning_rate": 1.7462202580527665e-08, + "loss": 1.794, + "step": 34357 + }, + { + "epoch": 2.92832182732464, + "grad_norm": 44.76222194820639, + "learning_rate": 1.742082249952759e-08, + "loss": 1.3897, + "step": 34358 + }, + { + "epoch": 2.9284070570186653, + "grad_norm": 29.72788752232893, + "learning_rate": 1.737949142021411e-08, + "loss": 1.1268, + "step": 34359 + }, + { + "epoch": 2.928492286712691, + "grad_norm": 57.86573714656885, + "learning_rate": 1.7338209342992463e-08, + "loss": 1.7313, + "step": 34360 + }, + { + "epoch": 2.9285775164067163, + "grad_norm": 38.55901162910358, + "learning_rate": 1.7296976268270093e-08, + "loss": 1.3403, + "step": 34361 + }, + { + "epoch": 2.9286627461007413, + "grad_norm": 48.03861304443639, + "learning_rate": 1.7255792196452238e-08, + "loss": 1.3301, + "step": 34362 + }, + { + "epoch": 2.9287479757947668, + "grad_norm": 38.60959769031771, + "learning_rate": 1.7214657127943568e-08, + "loss": 0.785, + "step": 34363 + }, + { + "epoch": 2.9288332054887922, + "grad_norm": 31.47682249972393, + "learning_rate": 1.7173571063148766e-08, + "loss": 0.9056, + "step": 34364 + }, + { + "epoch": 2.9289184351828177, + "grad_norm": 28.777484220751226, + "learning_rate": 1.7132534002471946e-08, + "loss": 0.9432, + "step": 34365 + }, + { + "epoch": 2.929003664876843, + "grad_norm": 35.81759085232065, + "learning_rate": 1.709154594631668e-08, + "loss": 0.9023, + "step": 34366 + }, + { + "epoch": 2.9290888945708686, + "grad_norm": 28.82441167272143, + "learning_rate": 1.7050606895085975e-08, + "loss": 0.7715, + "step": 34367 + }, + { + "epoch": 2.929174124264894, + "grad_norm": 53.8006811249705, + "learning_rate": 1.700971684918229e-08, + "loss": 1.6972, + "step": 34368 + }, + { + "epoch": 2.929259353958919, + "grad_norm": 60.40651868594872, + "learning_rate": 1.6968875809008634e-08, + "loss": 1.3028, + "step": 34369 + }, + { + "epoch": 2.9293445836529446, + "grad_norm": 77.96757748144876, + "learning_rate": 1.692808377496524e-08, + "loss": 2.1839, + "step": 34370 + }, + { + "epoch": 2.92942981334697, + "grad_norm": 68.86987627057938, + "learning_rate": 1.6887340747454573e-08, + "loss": 1.7835, + "step": 34371 + }, + { + "epoch": 2.9295150430409955, + "grad_norm": 80.18635963644238, + "learning_rate": 1.684664672687686e-08, + "loss": 2.0858, + "step": 34372 + }, + { + "epoch": 2.929600272735021, + "grad_norm": 31.350491057816537, + "learning_rate": 1.6806001713632337e-08, + "loss": 1.0467, + "step": 34373 + }, + { + "epoch": 2.929685502429046, + "grad_norm": 83.98304502667538, + "learning_rate": 1.676540570812013e-08, + "loss": 2.2465, + "step": 34374 + }, + { + "epoch": 2.929770732123072, + "grad_norm": 83.75415944462975, + "learning_rate": 1.6724858710740476e-08, + "loss": 2.023, + "step": 34375 + }, + { + "epoch": 2.929855961817097, + "grad_norm": 31.256315258424216, + "learning_rate": 1.668436072189139e-08, + "loss": 0.8945, + "step": 34376 + }, + { + "epoch": 2.9299411915111224, + "grad_norm": 32.052259926891495, + "learning_rate": 1.6643911741971442e-08, + "loss": 1.2149, + "step": 34377 + }, + { + "epoch": 2.930026421205148, + "grad_norm": 48.18625705087151, + "learning_rate": 1.6603511771378645e-08, + "loss": 1.2661, + "step": 34378 + }, + { + "epoch": 2.9301116508991734, + "grad_norm": 68.11840520133316, + "learning_rate": 1.6563160810509905e-08, + "loss": 1.3798, + "step": 34379 + }, + { + "epoch": 2.930196880593199, + "grad_norm": 54.67188252456661, + "learning_rate": 1.652285885976268e-08, + "loss": 0.8722, + "step": 34380 + }, + { + "epoch": 2.930282110287224, + "grad_norm": 67.41846168274787, + "learning_rate": 1.6482605919532214e-08, + "loss": 1.66, + "step": 34381 + }, + { + "epoch": 2.9303673399812493, + "grad_norm": 33.22828632098219, + "learning_rate": 1.644240199021596e-08, + "loss": 1.1898, + "step": 34382 + }, + { + "epoch": 2.930452569675275, + "grad_norm": 45.41220743509901, + "learning_rate": 1.64022470722075e-08, + "loss": 1.4759, + "step": 34383 + }, + { + "epoch": 2.9305377993693003, + "grad_norm": 27.256166206267224, + "learning_rate": 1.6362141165903178e-08, + "loss": 1.1059, + "step": 34384 + }, + { + "epoch": 2.9306230290633257, + "grad_norm": 55.23955373712581, + "learning_rate": 1.632208427169657e-08, + "loss": 1.497, + "step": 34385 + }, + { + "epoch": 2.930708258757351, + "grad_norm": 93.25305365070713, + "learning_rate": 1.628207638998236e-08, + "loss": 2.3463, + "step": 34386 + }, + { + "epoch": 2.9307934884513767, + "grad_norm": 63.89183061650187, + "learning_rate": 1.624211752115301e-08, + "loss": 1.891, + "step": 34387 + }, + { + "epoch": 2.9308787181454017, + "grad_norm": 43.98381658553754, + "learning_rate": 1.6202207665602654e-08, + "loss": 1.2813, + "step": 34388 + }, + { + "epoch": 2.930963947839427, + "grad_norm": 102.41838732154176, + "learning_rate": 1.6162346823723197e-08, + "loss": 2.8945, + "step": 34389 + }, + { + "epoch": 2.9310491775334526, + "grad_norm": 35.78041501106428, + "learning_rate": 1.612253499590599e-08, + "loss": 1.2295, + "step": 34390 + }, + { + "epoch": 2.931134407227478, + "grad_norm": 72.90938168866168, + "learning_rate": 1.608277218254406e-08, + "loss": 1.7259, + "step": 34391 + }, + { + "epoch": 2.9312196369215036, + "grad_norm": 66.53763193029789, + "learning_rate": 1.6043058384027088e-08, + "loss": 1.9005, + "step": 34392 + }, + { + "epoch": 2.9313048666155286, + "grad_norm": 69.65565919127286, + "learning_rate": 1.6003393600746432e-08, + "loss": 1.7622, + "step": 34393 + }, + { + "epoch": 2.9313900963095545, + "grad_norm": 52.66568667528521, + "learning_rate": 1.5963777833091776e-08, + "loss": 1.0949, + "step": 34394 + }, + { + "epoch": 2.9314753260035795, + "grad_norm": 58.91147629920252, + "learning_rate": 1.5924211081453367e-08, + "loss": 1.2987, + "step": 34395 + }, + { + "epoch": 2.931560555697605, + "grad_norm": 41.173905821112264, + "learning_rate": 1.588469334621978e-08, + "loss": 1.3713, + "step": 34396 + }, + { + "epoch": 2.9316457853916305, + "grad_norm": 60.20096941071184, + "learning_rate": 1.58452246277796e-08, + "loss": 1.576, + "step": 34397 + }, + { + "epoch": 2.931731015085656, + "grad_norm": 43.681796640229706, + "learning_rate": 1.580580492652084e-08, + "loss": 1.1003, + "step": 34398 + }, + { + "epoch": 2.9318162447796814, + "grad_norm": 19.210906559913404, + "learning_rate": 1.576643424283153e-08, + "loss": 0.5945, + "step": 34399 + }, + { + "epoch": 2.9319014744737064, + "grad_norm": 22.872622074132344, + "learning_rate": 1.5727112577099136e-08, + "loss": 0.7831, + "step": 34400 + }, + { + "epoch": 2.931986704167732, + "grad_norm": 48.58538552176771, + "learning_rate": 1.5687839929710013e-08, + "loss": 1.8939, + "step": 34401 + }, + { + "epoch": 2.9320719338617574, + "grad_norm": 41.49687469241238, + "learning_rate": 1.564861630104997e-08, + "loss": 1.2231, + "step": 34402 + }, + { + "epoch": 2.932157163555783, + "grad_norm": 18.293676304462164, + "learning_rate": 1.5609441691505912e-08, + "loss": 0.7871, + "step": 34403 + }, + { + "epoch": 2.9322423932498083, + "grad_norm": 22.128642043028236, + "learning_rate": 1.5570316101461426e-08, + "loss": 0.7565, + "step": 34404 + }, + { + "epoch": 2.9323276229438338, + "grad_norm": 62.36512164787196, + "learning_rate": 1.5531239531302867e-08, + "loss": 1.8586, + "step": 34405 + }, + { + "epoch": 2.9324128526378592, + "grad_norm": 43.5540502351584, + "learning_rate": 1.5492211981414372e-08, + "loss": 0.8949, + "step": 34406 + }, + { + "epoch": 2.9324980823318842, + "grad_norm": 54.72017612718121, + "learning_rate": 1.545323345217842e-08, + "loss": 1.2242, + "step": 34407 + }, + { + "epoch": 2.9325833120259097, + "grad_norm": 34.51332250960384, + "learning_rate": 1.5414303943979692e-08, + "loss": 1.2107, + "step": 34408 + }, + { + "epoch": 2.932668541719935, + "grad_norm": 54.206482263291655, + "learning_rate": 1.5375423457200667e-08, + "loss": 1.2431, + "step": 34409 + }, + { + "epoch": 2.9327537714139607, + "grad_norm": 58.41498784551099, + "learning_rate": 1.5336591992223813e-08, + "loss": 1.5876, + "step": 34410 + }, + { + "epoch": 2.932839001107986, + "grad_norm": 55.00174630782766, + "learning_rate": 1.5297809549430496e-08, + "loss": 1.8949, + "step": 34411 + }, + { + "epoch": 2.932924230802011, + "grad_norm": 35.374249072642314, + "learning_rate": 1.5259076129202633e-08, + "loss": 0.7722, + "step": 34412 + }, + { + "epoch": 2.933009460496037, + "grad_norm": 48.901766383411186, + "learning_rate": 1.5220391731921026e-08, + "loss": 0.9558, + "step": 34413 + }, + { + "epoch": 2.933094690190062, + "grad_norm": 33.97039115080111, + "learning_rate": 1.5181756357965925e-08, + "loss": 0.8818, + "step": 34414 + }, + { + "epoch": 2.9331799198840875, + "grad_norm": 47.65688638301924, + "learning_rate": 1.514317000771759e-08, + "loss": 1.5132, + "step": 34415 + }, + { + "epoch": 2.933265149578113, + "grad_norm": 72.52406197627366, + "learning_rate": 1.5104632681555708e-08, + "loss": 1.5922, + "step": 34416 + }, + { + "epoch": 2.9333503792721385, + "grad_norm": 44.90232916455688, + "learning_rate": 1.5066144379858315e-08, + "loss": 1.2713, + "step": 34417 + }, + { + "epoch": 2.933435608966164, + "grad_norm": 31.738550627922063, + "learning_rate": 1.502770510300511e-08, + "loss": 0.8139, + "step": 34418 + }, + { + "epoch": 2.933520838660189, + "grad_norm": 51.422929715304605, + "learning_rate": 1.498931485137356e-08, + "loss": 1.578, + "step": 34419 + }, + { + "epoch": 2.9336060683542144, + "grad_norm": 43.0809943509533, + "learning_rate": 1.495097362534115e-08, + "loss": 0.9374, + "step": 34420 + }, + { + "epoch": 2.93369129804824, + "grad_norm": 85.28678942820837, + "learning_rate": 1.491268142528479e-08, + "loss": 1.3734, + "step": 34421 + }, + { + "epoch": 2.9337765277422654, + "grad_norm": 21.90190091985469, + "learning_rate": 1.4874438251581413e-08, + "loss": 0.8589, + "step": 34422 + }, + { + "epoch": 2.933861757436291, + "grad_norm": 45.078313874002795, + "learning_rate": 1.4836244104607378e-08, + "loss": 1.2748, + "step": 34423 + }, + { + "epoch": 2.9339469871303163, + "grad_norm": 68.84803266320849, + "learning_rate": 1.479809898473794e-08, + "loss": 1.6269, + "step": 34424 + }, + { + "epoch": 2.934032216824342, + "grad_norm": 34.414333369371654, + "learning_rate": 1.4760002892347802e-08, + "loss": 1.0944, + "step": 34425 + }, + { + "epoch": 2.934117446518367, + "grad_norm": 63.26430097138955, + "learning_rate": 1.4721955827812772e-08, + "loss": 1.4321, + "step": 34426 + }, + { + "epoch": 2.9342026762123923, + "grad_norm": 30.100382587338743, + "learning_rate": 1.4683957791505887e-08, + "loss": 1.0058, + "step": 34427 + }, + { + "epoch": 2.9342879059064177, + "grad_norm": 81.23317650014364, + "learning_rate": 1.4646008783801291e-08, + "loss": 1.8011, + "step": 34428 + }, + { + "epoch": 2.934373135600443, + "grad_norm": 65.15629742044511, + "learning_rate": 1.4608108805072573e-08, + "loss": 1.5207, + "step": 34429 + }, + { + "epoch": 2.9344583652944687, + "grad_norm": 103.07656603293815, + "learning_rate": 1.457025785569166e-08, + "loss": 1.8169, + "step": 34430 + }, + { + "epoch": 2.934543594988494, + "grad_norm": 65.58171441456946, + "learning_rate": 1.4532455936031586e-08, + "loss": 1.4928, + "step": 34431 + }, + { + "epoch": 2.9346288246825196, + "grad_norm": 28.877353829526623, + "learning_rate": 1.449470304646372e-08, + "loss": 0.9694, + "step": 34432 + }, + { + "epoch": 2.9347140543765446, + "grad_norm": 18.585465680122358, + "learning_rate": 1.4456999187358878e-08, + "loss": 0.9173, + "step": 34433 + }, + { + "epoch": 2.93479928407057, + "grad_norm": 53.20940291775558, + "learning_rate": 1.4419344359088983e-08, + "loss": 1.032, + "step": 34434 + }, + { + "epoch": 2.9348845137645956, + "grad_norm": 42.408648025431184, + "learning_rate": 1.438173856202374e-08, + "loss": 0.8642, + "step": 34435 + }, + { + "epoch": 2.934969743458621, + "grad_norm": 51.88035653005694, + "learning_rate": 1.4344181796532852e-08, + "loss": 1.2302, + "step": 34436 + }, + { + "epoch": 2.9350549731526465, + "grad_norm": 33.04379501341826, + "learning_rate": 1.4306674062986026e-08, + "loss": 1.0105, + "step": 34437 + }, + { + "epoch": 2.9351402028466715, + "grad_norm": 59.944120872794386, + "learning_rate": 1.4269215361751853e-08, + "loss": 1.2538, + "step": 34438 + }, + { + "epoch": 2.9352254325406975, + "grad_norm": 39.746591095995036, + "learning_rate": 1.4231805693198376e-08, + "loss": 1.1641, + "step": 34439 + }, + { + "epoch": 2.9353106622347225, + "grad_norm": 51.171346431901924, + "learning_rate": 1.4194445057694184e-08, + "loss": 1.6266, + "step": 34440 + }, + { + "epoch": 2.935395891928748, + "grad_norm": 59.61911970352049, + "learning_rate": 1.4157133455606764e-08, + "loss": 1.4383, + "step": 34441 + }, + { + "epoch": 2.9354811216227734, + "grad_norm": 37.79218309402335, + "learning_rate": 1.411987088730249e-08, + "loss": 1.4265, + "step": 34442 + }, + { + "epoch": 2.935566351316799, + "grad_norm": 50.041163252761116, + "learning_rate": 1.4082657353148288e-08, + "loss": 1.019, + "step": 34443 + }, + { + "epoch": 2.9356515810108244, + "grad_norm": 50.406579043432515, + "learning_rate": 1.4045492853509979e-08, + "loss": 1.876, + "step": 34444 + }, + { + "epoch": 2.9357368107048494, + "grad_norm": 79.85898785308316, + "learning_rate": 1.4008377388752825e-08, + "loss": 1.4073, + "step": 34445 + }, + { + "epoch": 2.935822040398875, + "grad_norm": 37.70951166259767, + "learning_rate": 1.3971310959242091e-08, + "loss": 0.9759, + "step": 34446 + }, + { + "epoch": 2.9359072700929003, + "grad_norm": 65.31052419326147, + "learning_rate": 1.3934293565341928e-08, + "loss": 1.1809, + "step": 34447 + }, + { + "epoch": 2.935992499786926, + "grad_norm": 76.26943375961363, + "learning_rate": 1.3897325207417045e-08, + "loss": 2.6524, + "step": 34448 + }, + { + "epoch": 2.9360777294809512, + "grad_norm": 119.24610292976016, + "learning_rate": 1.3860405885831041e-08, + "loss": 1.1966, + "step": 34449 + }, + { + "epoch": 2.9361629591749767, + "grad_norm": 35.9419992983396, + "learning_rate": 1.3823535600946403e-08, + "loss": 1.0726, + "step": 34450 + }, + { + "epoch": 2.936248188869002, + "grad_norm": 60.99357381697705, + "learning_rate": 1.3786714353125618e-08, + "loss": 1.3926, + "step": 34451 + }, + { + "epoch": 2.936333418563027, + "grad_norm": 70.54356585849402, + "learning_rate": 1.3749942142731176e-08, + "loss": 1.3626, + "step": 34452 + }, + { + "epoch": 2.9364186482570527, + "grad_norm": 35.01888448939499, + "learning_rate": 1.3713218970125008e-08, + "loss": 0.8423, + "step": 34453 + }, + { + "epoch": 2.936503877951078, + "grad_norm": 59.56375409585893, + "learning_rate": 1.3676544835667938e-08, + "loss": 1.7578, + "step": 34454 + }, + { + "epoch": 2.9365891076451036, + "grad_norm": 24.455787987252855, + "learning_rate": 1.3639919739720785e-08, + "loss": 0.5731, + "step": 34455 + }, + { + "epoch": 2.936674337339129, + "grad_norm": 40.311171570171155, + "learning_rate": 1.3603343682643266e-08, + "loss": 1.1943, + "step": 34456 + }, + { + "epoch": 2.936759567033154, + "grad_norm": 65.35702696921543, + "learning_rate": 1.3566816664795646e-08, + "loss": 2.0296, + "step": 34457 + }, + { + "epoch": 2.93684479672718, + "grad_norm": 33.74840186255849, + "learning_rate": 1.3530338686537637e-08, + "loss": 1.2068, + "step": 34458 + }, + { + "epoch": 2.936930026421205, + "grad_norm": 55.67830708071252, + "learning_rate": 1.349390974822673e-08, + "loss": 1.6234, + "step": 34459 + }, + { + "epoch": 2.9370152561152305, + "grad_norm": 56.20943894412949, + "learning_rate": 1.345752985022153e-08, + "loss": 1.2801, + "step": 34460 + }, + { + "epoch": 2.937100485809256, + "grad_norm": 33.64177758453608, + "learning_rate": 1.342119899288008e-08, + "loss": 1.0661, + "step": 34461 + }, + { + "epoch": 2.9371857155032814, + "grad_norm": 71.02516072755623, + "learning_rate": 1.3384917176559876e-08, + "loss": 1.9904, + "step": 34462 + }, + { + "epoch": 2.937270945197307, + "grad_norm": 43.59092182134056, + "learning_rate": 1.334868440161785e-08, + "loss": 1.5568, + "step": 34463 + }, + { + "epoch": 2.937356174891332, + "grad_norm": 23.438677274441716, + "learning_rate": 1.3312500668409279e-08, + "loss": 0.922, + "step": 34464 + }, + { + "epoch": 2.9374414045853574, + "grad_norm": 21.725601104931005, + "learning_rate": 1.3276365977291095e-08, + "loss": 0.7773, + "step": 34465 + }, + { + "epoch": 2.937526634279383, + "grad_norm": 57.95261649914879, + "learning_rate": 1.3240280328618571e-08, + "loss": 1.4504, + "step": 34466 + }, + { + "epoch": 2.9376118639734083, + "grad_norm": 43.624860111648665, + "learning_rate": 1.3204243722746423e-08, + "loss": 0.8209, + "step": 34467 + }, + { + "epoch": 2.937697093667434, + "grad_norm": 39.325893268859254, + "learning_rate": 1.3168256160028814e-08, + "loss": 1.1381, + "step": 34468 + }, + { + "epoch": 2.9377823233614593, + "grad_norm": 43.53863976068709, + "learning_rate": 1.3132317640819348e-08, + "loss": 1.2562, + "step": 34469 + }, + { + "epoch": 2.9378675530554847, + "grad_norm": 27.43935620590519, + "learning_rate": 1.3096428165472741e-08, + "loss": 0.9363, + "step": 34470 + }, + { + "epoch": 2.9379527827495098, + "grad_norm": 47.95341548111028, + "learning_rate": 1.306058773434038e-08, + "loss": 1.1411, + "step": 34471 + }, + { + "epoch": 2.9380380124435352, + "grad_norm": 45.795741143367685, + "learning_rate": 1.302479634777587e-08, + "loss": 1.092, + "step": 34472 + }, + { + "epoch": 2.9381232421375607, + "grad_norm": 77.18290622080181, + "learning_rate": 1.2989054006131152e-08, + "loss": 2.1266, + "step": 34473 + }, + { + "epoch": 2.938208471831586, + "grad_norm": 36.227433187367055, + "learning_rate": 1.2953360709757056e-08, + "loss": 1.1672, + "step": 34474 + }, + { + "epoch": 2.9382937015256116, + "grad_norm": 41.72724867231184, + "learning_rate": 1.2917716459005525e-08, + "loss": 1.1031, + "step": 34475 + }, + { + "epoch": 2.9383789312196367, + "grad_norm": 25.09088547489263, + "learning_rate": 1.2882121254226276e-08, + "loss": 0.7137, + "step": 34476 + }, + { + "epoch": 2.9384641609136626, + "grad_norm": 79.61992918438881, + "learning_rate": 1.2846575095769032e-08, + "loss": 1.936, + "step": 34477 + }, + { + "epoch": 2.9385493906076876, + "grad_norm": 54.799357221637564, + "learning_rate": 1.2811077983984622e-08, + "loss": 1.6574, + "step": 34478 + }, + { + "epoch": 2.938634620301713, + "grad_norm": 35.23232717747913, + "learning_rate": 1.2775629919221655e-08, + "loss": 1.1442, + "step": 34479 + }, + { + "epoch": 2.9387198499957385, + "grad_norm": 45.442127330224835, + "learning_rate": 1.2740230901828744e-08, + "loss": 1.274, + "step": 34480 + }, + { + "epoch": 2.938805079689764, + "grad_norm": 19.023288481762165, + "learning_rate": 1.2704880932153384e-08, + "loss": 0.5677, + "step": 34481 + }, + { + "epoch": 2.9388903093837895, + "grad_norm": 45.350264506554595, + "learning_rate": 1.266958001054419e-08, + "loss": 0.8444, + "step": 34482 + }, + { + "epoch": 2.9389755390778145, + "grad_norm": 35.58007536263555, + "learning_rate": 1.2634328137347552e-08, + "loss": 0.7614, + "step": 34483 + }, + { + "epoch": 2.93906076877184, + "grad_norm": 39.09361500184237, + "learning_rate": 1.2599125312910965e-08, + "loss": 0.9551, + "step": 34484 + }, + { + "epoch": 2.9391459984658654, + "grad_norm": 77.19081235754624, + "learning_rate": 1.256397153757971e-08, + "loss": 1.9223, + "step": 34485 + }, + { + "epoch": 2.939231228159891, + "grad_norm": 28.597207347437827, + "learning_rate": 1.252886681170018e-08, + "loss": 1.3864, + "step": 34486 + }, + { + "epoch": 2.9393164578539164, + "grad_norm": 46.501437885717515, + "learning_rate": 1.2493811135617651e-08, + "loss": 0.9674, + "step": 34487 + }, + { + "epoch": 2.939401687547942, + "grad_norm": 37.50255293101212, + "learning_rate": 1.2458804509676292e-08, + "loss": 1.0239, + "step": 34488 + }, + { + "epoch": 2.9394869172419673, + "grad_norm": 30.99852924278896, + "learning_rate": 1.242384693422083e-08, + "loss": 1.1248, + "step": 34489 + }, + { + "epoch": 2.9395721469359923, + "grad_norm": 66.55451067841337, + "learning_rate": 1.238893840959543e-08, + "loss": 1.5228, + "step": 34490 + }, + { + "epoch": 2.939657376630018, + "grad_norm": 43.52363927844761, + "learning_rate": 1.2354078936142044e-08, + "loss": 1.1554, + "step": 34491 + }, + { + "epoch": 2.9397426063240433, + "grad_norm": 40.064390969259776, + "learning_rate": 1.2319268514205397e-08, + "loss": 0.9075, + "step": 34492 + }, + { + "epoch": 2.9398278360180687, + "grad_norm": 41.05034880206905, + "learning_rate": 1.2284507144126323e-08, + "loss": 0.9062, + "step": 34493 + }, + { + "epoch": 2.939913065712094, + "grad_norm": 49.61495539629917, + "learning_rate": 1.2249794826246775e-08, + "loss": 1.2309, + "step": 34494 + }, + { + "epoch": 2.9399982954061192, + "grad_norm": 72.06839437487598, + "learning_rate": 1.2215131560909255e-08, + "loss": 1.8062, + "step": 34495 + }, + { + "epoch": 2.940083525100145, + "grad_norm": 40.688715395147845, + "learning_rate": 1.218051734845349e-08, + "loss": 0.7645, + "step": 34496 + }, + { + "epoch": 2.94016875479417, + "grad_norm": 28.949021562726237, + "learning_rate": 1.2145952189220877e-08, + "loss": 1.0996, + "step": 34497 + }, + { + "epoch": 2.9402539844881956, + "grad_norm": 22.801890365499883, + "learning_rate": 1.2111436083550033e-08, + "loss": 0.7951, + "step": 34498 + }, + { + "epoch": 2.940339214182221, + "grad_norm": 68.01517617253992, + "learning_rate": 1.2076969031781794e-08, + "loss": 2.021, + "step": 34499 + }, + { + "epoch": 2.9404244438762466, + "grad_norm": 27.514482477923544, + "learning_rate": 1.204255103425478e-08, + "loss": 0.6017, + "step": 34500 + }, + { + "epoch": 2.940509673570272, + "grad_norm": 65.26636150438141, + "learning_rate": 1.2008182091307052e-08, + "loss": 1.6607, + "step": 34501 + }, + { + "epoch": 2.940594903264297, + "grad_norm": 54.555885329302356, + "learning_rate": 1.1973862203276677e-08, + "loss": 0.823, + "step": 34502 + }, + { + "epoch": 2.9406801329583225, + "grad_norm": 64.60960901972595, + "learning_rate": 1.1939591370501158e-08, + "loss": 1.6309, + "step": 34503 + }, + { + "epoch": 2.940765362652348, + "grad_norm": 53.707906838898396, + "learning_rate": 1.1905369593318005e-08, + "loss": 1.2978, + "step": 34504 + }, + { + "epoch": 2.9408505923463735, + "grad_norm": 53.38194074848121, + "learning_rate": 1.1871196872063617e-08, + "loss": 1.8015, + "step": 34505 + }, + { + "epoch": 2.940935822040399, + "grad_norm": 26.071261485122744, + "learning_rate": 1.1837073207073834e-08, + "loss": 0.6314, + "step": 34506 + }, + { + "epoch": 2.9410210517344244, + "grad_norm": 58.74023176888783, + "learning_rate": 1.1802998598683945e-08, + "loss": 1.7398, + "step": 34507 + }, + { + "epoch": 2.94110628142845, + "grad_norm": 56.501910234671826, + "learning_rate": 1.1768973047229792e-08, + "loss": 1.2474, + "step": 34508 + }, + { + "epoch": 2.941191511122475, + "grad_norm": 25.629956404943023, + "learning_rate": 1.1734996553045552e-08, + "loss": 0.6696, + "step": 34509 + }, + { + "epoch": 2.9412767408165004, + "grad_norm": 63.65250837176512, + "learning_rate": 1.1701069116465403e-08, + "loss": 1.8277, + "step": 34510 + }, + { + "epoch": 2.941361970510526, + "grad_norm": 54.64673772806745, + "learning_rate": 1.1667190737822965e-08, + "loss": 1.4224, + "step": 34511 + }, + { + "epoch": 2.9414472002045513, + "grad_norm": 74.75313496161482, + "learning_rate": 1.163336141745186e-08, + "loss": 1.697, + "step": 34512 + }, + { + "epoch": 2.9415324298985768, + "grad_norm": 49.185797349820604, + "learning_rate": 1.1599581155684603e-08, + "loss": 1.309, + "step": 34513 + }, + { + "epoch": 2.941617659592602, + "grad_norm": 46.345773837599985, + "learning_rate": 1.1565849952852592e-08, + "loss": 1.0676, + "step": 34514 + }, + { + "epoch": 2.9417028892866277, + "grad_norm": 45.71196864047635, + "learning_rate": 1.1532167809288897e-08, + "loss": 1.2892, + "step": 34515 + }, + { + "epoch": 2.9417881189806527, + "grad_norm": 53.45933421423204, + "learning_rate": 1.1498534725323807e-08, + "loss": 1.3459, + "step": 34516 + }, + { + "epoch": 2.941873348674678, + "grad_norm": 46.32754648910586, + "learning_rate": 1.146495070128817e-08, + "loss": 1.1986, + "step": 34517 + }, + { + "epoch": 2.9419585783687037, + "grad_norm": 74.92862847257035, + "learning_rate": 1.143141573751283e-08, + "loss": 1.8271, + "step": 34518 + }, + { + "epoch": 2.942043808062729, + "grad_norm": 64.38841517918614, + "learning_rate": 1.1397929834327525e-08, + "loss": 1.6173, + "step": 34519 + }, + { + "epoch": 2.9421290377567546, + "grad_norm": 44.66929642168764, + "learning_rate": 1.1364492992060883e-08, + "loss": 1.264, + "step": 34520 + }, + { + "epoch": 2.9422142674507796, + "grad_norm": 25.234384235113204, + "learning_rate": 1.1331105211042081e-08, + "loss": 0.8452, + "step": 34521 + }, + { + "epoch": 2.942299497144805, + "grad_norm": 34.27252776711535, + "learning_rate": 1.1297766491599749e-08, + "loss": 1.0454, + "step": 34522 + }, + { + "epoch": 2.9423847268388306, + "grad_norm": 64.34110783104816, + "learning_rate": 1.1264476834061955e-08, + "loss": 1.3401, + "step": 34523 + }, + { + "epoch": 2.942469956532856, + "grad_norm": 61.34237908871786, + "learning_rate": 1.1231236238755105e-08, + "loss": 1.0938, + "step": 34524 + }, + { + "epoch": 2.9425551862268815, + "grad_norm": 25.096829813448938, + "learning_rate": 1.1198044706006717e-08, + "loss": 0.9617, + "step": 34525 + }, + { + "epoch": 2.942640415920907, + "grad_norm": 56.04545613920346, + "learning_rate": 1.1164902236143749e-08, + "loss": 1.0403, + "step": 34526 + }, + { + "epoch": 2.9427256456149324, + "grad_norm": 38.064213182027544, + "learning_rate": 1.1131808829490943e-08, + "loss": 1.2106, + "step": 34527 + }, + { + "epoch": 2.9428108753089575, + "grad_norm": 31.290784172310698, + "learning_rate": 1.1098764486374702e-08, + "loss": 0.6756, + "step": 34528 + }, + { + "epoch": 2.942896105002983, + "grad_norm": 65.08567762635165, + "learning_rate": 1.1065769207119769e-08, + "loss": 1.5738, + "step": 34529 + }, + { + "epoch": 2.9429813346970084, + "grad_norm": 39.68392777244082, + "learning_rate": 1.1032822992050884e-08, + "loss": 1.3366, + "step": 34530 + }, + { + "epoch": 2.943066564391034, + "grad_norm": 33.676326884672385, + "learning_rate": 1.0999925841491122e-08, + "loss": 0.8843, + "step": 34531 + }, + { + "epoch": 2.9431517940850593, + "grad_norm": 23.405208706947253, + "learning_rate": 1.0967077755765221e-08, + "loss": 0.7618, + "step": 34532 + }, + { + "epoch": 2.9432370237790844, + "grad_norm": 31.334481207651226, + "learning_rate": 1.0934278735195147e-08, + "loss": 1.2081, + "step": 34533 + }, + { + "epoch": 2.9433222534731103, + "grad_norm": 42.392995140772896, + "learning_rate": 1.0901528780103976e-08, + "loss": 1.1753, + "step": 34534 + }, + { + "epoch": 2.9434074831671353, + "grad_norm": 32.02537821992777, + "learning_rate": 1.0868827890814226e-08, + "loss": 1.0713, + "step": 34535 + }, + { + "epoch": 2.9434927128611608, + "grad_norm": 45.06844537311232, + "learning_rate": 1.0836176067647309e-08, + "loss": 1.1834, + "step": 34536 + }, + { + "epoch": 2.9435779425551862, + "grad_norm": 18.927834342476665, + "learning_rate": 1.080357331092352e-08, + "loss": 0.7035, + "step": 34537 + }, + { + "epoch": 2.9436631722492117, + "grad_norm": 37.846095647447214, + "learning_rate": 1.0771019620964274e-08, + "loss": 0.7264, + "step": 34538 + }, + { + "epoch": 2.943748401943237, + "grad_norm": 43.79458167582293, + "learning_rate": 1.073851499808931e-08, + "loss": 1.3739, + "step": 34539 + }, + { + "epoch": 2.943833631637262, + "grad_norm": 66.98084497953411, + "learning_rate": 1.0706059442618932e-08, + "loss": 1.8796, + "step": 34540 + }, + { + "epoch": 2.9439188613312877, + "grad_norm": 54.390330280188735, + "learning_rate": 1.0673652954871217e-08, + "loss": 1.3576, + "step": 34541 + }, + { + "epoch": 2.944004091025313, + "grad_norm": 70.24613945110816, + "learning_rate": 1.0641295535166463e-08, + "loss": 1.839, + "step": 34542 + }, + { + "epoch": 2.9440893207193386, + "grad_norm": 33.39268376040998, + "learning_rate": 1.060898718382164e-08, + "loss": 0.6517, + "step": 34543 + }, + { + "epoch": 2.944174550413364, + "grad_norm": 53.57886935306513, + "learning_rate": 1.0576727901154271e-08, + "loss": 1.4141, + "step": 34544 + }, + { + "epoch": 2.9442597801073895, + "grad_norm": 51.401714226653645, + "learning_rate": 1.0544517687482992e-08, + "loss": 2.1486, + "step": 34545 + }, + { + "epoch": 2.944345009801415, + "grad_norm": 48.238104428017174, + "learning_rate": 1.0512356543123104e-08, + "loss": 1.344, + "step": 34546 + }, + { + "epoch": 2.94443023949544, + "grad_norm": 22.464937889919796, + "learning_rate": 1.048024446839213e-08, + "loss": 0.6933, + "step": 34547 + }, + { + "epoch": 2.9445154691894655, + "grad_norm": 28.60346124102188, + "learning_rate": 1.0448181463604823e-08, + "loss": 1.2386, + "step": 34548 + }, + { + "epoch": 2.944600698883491, + "grad_norm": 61.31556813070188, + "learning_rate": 1.0416167529077591e-08, + "loss": 1.3994, + "step": 34549 + }, + { + "epoch": 2.9446859285775164, + "grad_norm": 18.9468103222231, + "learning_rate": 1.0384202665124077e-08, + "loss": 1.0294, + "step": 34550 + }, + { + "epoch": 2.944771158271542, + "grad_norm": 43.774145889266464, + "learning_rate": 1.035228687205958e-08, + "loss": 1.221, + "step": 34551 + }, + { + "epoch": 2.9448563879655674, + "grad_norm": 28.795006524353813, + "learning_rate": 1.032042015019774e-08, + "loss": 0.8492, + "step": 34552 + }, + { + "epoch": 2.944941617659593, + "grad_norm": 50.9893402260286, + "learning_rate": 1.0288602499851641e-08, + "loss": 1.5617, + "step": 34553 + }, + { + "epoch": 2.945026847353618, + "grad_norm": 56.50023432762556, + "learning_rate": 1.0256833921334363e-08, + "loss": 1.5539, + "step": 34554 + }, + { + "epoch": 2.9451120770476433, + "grad_norm": 69.80375689166817, + "learning_rate": 1.0225114414958437e-08, + "loss": 1.3809, + "step": 34555 + }, + { + "epoch": 2.945197306741669, + "grad_norm": 29.194362666891067, + "learning_rate": 1.0193443981035834e-08, + "loss": 1.0564, + "step": 34556 + }, + { + "epoch": 2.9452825364356943, + "grad_norm": 35.73139754564967, + "learning_rate": 1.016182261987797e-08, + "loss": 1.0439, + "step": 34557 + }, + { + "epoch": 2.9453677661297197, + "grad_norm": 30.957035506410318, + "learning_rate": 1.0130250331796266e-08, + "loss": 1.1213, + "step": 34558 + }, + { + "epoch": 2.9454529958237448, + "grad_norm": 86.61302541909986, + "learning_rate": 1.0098727117099915e-08, + "loss": 1.6895, + "step": 34559 + }, + { + "epoch": 2.9455382255177702, + "grad_norm": 33.49952985507592, + "learning_rate": 1.006725297610034e-08, + "loss": 1.1141, + "step": 34560 + }, + { + "epoch": 2.9456234552117957, + "grad_norm": 55.45395261576336, + "learning_rate": 1.0035827909106178e-08, + "loss": 1.362, + "step": 34561 + }, + { + "epoch": 2.945708684905821, + "grad_norm": 61.9963314767849, + "learning_rate": 1.0004451916427183e-08, + "loss": 0.8277, + "step": 34562 + }, + { + "epoch": 2.9457939145998466, + "grad_norm": 58.36756312676269, + "learning_rate": 9.973124998371441e-09, + "loss": 0.9883, + "step": 34563 + }, + { + "epoch": 2.945879144293872, + "grad_norm": 38.62246700031707, + "learning_rate": 9.941847155247042e-09, + "loss": 0.9093, + "step": 34564 + }, + { + "epoch": 2.9459643739878976, + "grad_norm": 38.60548943523829, + "learning_rate": 9.91061838736207e-09, + "loss": 1.1017, + "step": 34565 + }, + { + "epoch": 2.9460496036819226, + "grad_norm": 57.44779416192775, + "learning_rate": 9.879438695022946e-09, + "loss": 1.5096, + "step": 34566 + }, + { + "epoch": 2.946134833375948, + "grad_norm": 23.780322565859965, + "learning_rate": 9.84830807853665e-09, + "loss": 0.4945, + "step": 34567 + }, + { + "epoch": 2.9462200630699735, + "grad_norm": 45.29752126457939, + "learning_rate": 9.8172265382096e-09, + "loss": 1.7327, + "step": 34568 + }, + { + "epoch": 2.946305292763999, + "grad_norm": 28.255379751874017, + "learning_rate": 9.78619407434711e-09, + "loss": 0.9341, + "step": 34569 + }, + { + "epoch": 2.9463905224580245, + "grad_norm": 39.604081695159486, + "learning_rate": 9.75521068725449e-09, + "loss": 1.1385, + "step": 34570 + }, + { + "epoch": 2.94647575215205, + "grad_norm": 65.9387695976757, + "learning_rate": 9.724276377236496e-09, + "loss": 1.7358, + "step": 34571 + }, + { + "epoch": 2.9465609818460754, + "grad_norm": 53.84466248139836, + "learning_rate": 9.69339114459733e-09, + "loss": 1.6154, + "step": 34572 + }, + { + "epoch": 2.9466462115401004, + "grad_norm": 37.92380364754606, + "learning_rate": 9.662554989640637e-09, + "loss": 1.5469, + "step": 34573 + }, + { + "epoch": 2.946731441234126, + "grad_norm": 44.34211583396055, + "learning_rate": 9.631767912669509e-09, + "loss": 1.6988, + "step": 34574 + }, + { + "epoch": 2.9468166709281514, + "grad_norm": 84.43334365988436, + "learning_rate": 9.601029913987592e-09, + "loss": 2.1191, + "step": 34575 + }, + { + "epoch": 2.946901900622177, + "grad_norm": 67.60420739224665, + "learning_rate": 9.570340993895755e-09, + "loss": 1.2944, + "step": 34576 + }, + { + "epoch": 2.9469871303162023, + "grad_norm": 25.156428289402125, + "learning_rate": 9.53970115269709e-09, + "loss": 0.9668, + "step": 34577 + }, + { + "epoch": 2.9470723600102273, + "grad_norm": 49.6406406727614, + "learning_rate": 9.509110390692466e-09, + "loss": 1.3646, + "step": 34578 + }, + { + "epoch": 2.9471575897042532, + "grad_norm": 27.35511255334636, + "learning_rate": 9.478568708182757e-09, + "loss": 0.6078, + "step": 34579 + }, + { + "epoch": 2.9472428193982783, + "grad_norm": 53.501912848492815, + "learning_rate": 9.44807610546772e-09, + "loss": 0.9881, + "step": 34580 + }, + { + "epoch": 2.9473280490923037, + "grad_norm": 30.682740383759526, + "learning_rate": 9.417632582848224e-09, + "loss": 0.9651, + "step": 34581 + }, + { + "epoch": 2.947413278786329, + "grad_norm": 29.62595850051187, + "learning_rate": 9.387238140623478e-09, + "loss": 0.7743, + "step": 34582 + }, + { + "epoch": 2.9474985084803547, + "grad_norm": 25.797070703468805, + "learning_rate": 9.356892779092131e-09, + "loss": 0.9054, + "step": 34583 + }, + { + "epoch": 2.94758373817438, + "grad_norm": 65.51997447840321, + "learning_rate": 9.326596498552277e-09, + "loss": 1.94, + "step": 34584 + }, + { + "epoch": 2.947668967868405, + "grad_norm": 26.244258597954946, + "learning_rate": 9.296349299302566e-09, + "loss": 1.1088, + "step": 34585 + }, + { + "epoch": 2.9477541975624306, + "grad_norm": 53.44317984256214, + "learning_rate": 9.266151181639982e-09, + "loss": 1.1697, + "step": 34586 + }, + { + "epoch": 2.947839427256456, + "grad_norm": 82.94691027437571, + "learning_rate": 9.236002145861512e-09, + "loss": 1.7814, + "step": 34587 + }, + { + "epoch": 2.9479246569504816, + "grad_norm": 41.90541778044273, + "learning_rate": 9.205902192264138e-09, + "loss": 0.9056, + "step": 34588 + }, + { + "epoch": 2.948009886644507, + "grad_norm": 29.351221575486033, + "learning_rate": 9.175851321143181e-09, + "loss": 1.4296, + "step": 34589 + }, + { + "epoch": 2.9480951163385325, + "grad_norm": 58.40261933907104, + "learning_rate": 9.14584953279507e-09, + "loss": 1.4553, + "step": 34590 + }, + { + "epoch": 2.948180346032558, + "grad_norm": 41.38402688108127, + "learning_rate": 9.115896827514015e-09, + "loss": 0.986, + "step": 34591 + }, + { + "epoch": 2.948265575726583, + "grad_norm": 96.104953910285, + "learning_rate": 9.085993205595333e-09, + "loss": 1.9684, + "step": 34592 + }, + { + "epoch": 2.9483508054206085, + "grad_norm": 45.67503284673571, + "learning_rate": 9.056138667332126e-09, + "loss": 1.0591, + "step": 34593 + }, + { + "epoch": 2.948436035114634, + "grad_norm": 15.389788043060795, + "learning_rate": 9.026333213019156e-09, + "loss": 0.4704, + "step": 34594 + }, + { + "epoch": 2.9485212648086594, + "grad_norm": 38.161348710763946, + "learning_rate": 8.996576842948968e-09, + "loss": 1.0677, + "step": 34595 + }, + { + "epoch": 2.948606494502685, + "grad_norm": 24.818752408896202, + "learning_rate": 8.966869557413549e-09, + "loss": 0.4952, + "step": 34596 + }, + { + "epoch": 2.94869172419671, + "grad_norm": 49.18029335984259, + "learning_rate": 8.937211356706555e-09, + "loss": 1.4661, + "step": 34597 + }, + { + "epoch": 2.948776953890736, + "grad_norm": 42.04009726709128, + "learning_rate": 8.907602241118308e-09, + "loss": 1.0184, + "step": 34598 + }, + { + "epoch": 2.948862183584761, + "grad_norm": 70.24176678748988, + "learning_rate": 8.878042210940796e-09, + "loss": 1.4112, + "step": 34599 + }, + { + "epoch": 2.9489474132787863, + "grad_norm": 81.13787349575745, + "learning_rate": 8.848531266464344e-09, + "loss": 2.4098, + "step": 34600 + }, + { + "epoch": 2.9490326429728118, + "grad_norm": 47.60213580542043, + "learning_rate": 8.819069407979275e-09, + "loss": 0.9125, + "step": 34601 + }, + { + "epoch": 2.9491178726668372, + "grad_norm": 33.15262530401195, + "learning_rate": 8.789656635775357e-09, + "loss": 1.0764, + "step": 34602 + }, + { + "epoch": 2.9492031023608627, + "grad_norm": 46.14196073455015, + "learning_rate": 8.760292950141802e-09, + "loss": 1.2752, + "step": 34603 + }, + { + "epoch": 2.9492883320548877, + "grad_norm": 50.59717332754862, + "learning_rate": 8.73097835136727e-09, + "loss": 1.0746, + "step": 34604 + }, + { + "epoch": 2.949373561748913, + "grad_norm": 43.22225996251273, + "learning_rate": 8.701712839740417e-09, + "loss": 1.2656, + "step": 34605 + }, + { + "epoch": 2.9494587914429387, + "grad_norm": 42.90881643274918, + "learning_rate": 8.672496415548793e-09, + "loss": 1.1091, + "step": 34606 + }, + { + "epoch": 2.949544021136964, + "grad_norm": 60.31131231663546, + "learning_rate": 8.643329079079943e-09, + "loss": 1.2747, + "step": 34607 + }, + { + "epoch": 2.9496292508309896, + "grad_norm": 41.487121449914376, + "learning_rate": 8.614210830620861e-09, + "loss": 1.2211, + "step": 34608 + }, + { + "epoch": 2.949714480525015, + "grad_norm": 41.65714059090489, + "learning_rate": 8.58514167045743e-09, + "loss": 1.3685, + "step": 34609 + }, + { + "epoch": 2.9497997102190405, + "grad_norm": 123.15582421482361, + "learning_rate": 8.556121598875534e-09, + "loss": 1.1673, + "step": 34610 + }, + { + "epoch": 2.9498849399130656, + "grad_norm": 87.33560415200542, + "learning_rate": 8.527150616160496e-09, + "loss": 1.7357, + "step": 34611 + }, + { + "epoch": 2.949970169607091, + "grad_norm": 23.237665296839317, + "learning_rate": 8.4982287225982e-09, + "loss": 0.9717, + "step": 34612 + }, + { + "epoch": 2.9500553993011165, + "grad_norm": 50.649416034360826, + "learning_rate": 8.46935591847231e-09, + "loss": 1.3442, + "step": 34613 + }, + { + "epoch": 2.950140628995142, + "grad_norm": 34.999383625124196, + "learning_rate": 8.440532204066487e-09, + "loss": 0.9589, + "step": 34614 + }, + { + "epoch": 2.9502258586891674, + "grad_norm": 63.323335168646636, + "learning_rate": 8.411757579664392e-09, + "loss": 1.4115, + "step": 34615 + }, + { + "epoch": 2.9503110883831924, + "grad_norm": 47.97074612092938, + "learning_rate": 8.383032045549688e-09, + "loss": 0.9582, + "step": 34616 + }, + { + "epoch": 2.9503963180772184, + "grad_norm": 47.55716803096834, + "learning_rate": 8.354355602004372e-09, + "loss": 1.5862, + "step": 34617 + }, + { + "epoch": 2.9504815477712434, + "grad_norm": 53.58304603859567, + "learning_rate": 8.325728249310439e-09, + "loss": 1.3916, + "step": 34618 + }, + { + "epoch": 2.950566777465269, + "grad_norm": 59.52731366030604, + "learning_rate": 8.29714998774933e-09, + "loss": 1.3017, + "step": 34619 + }, + { + "epoch": 2.9506520071592943, + "grad_norm": 21.31224930001289, + "learning_rate": 8.268620817601936e-09, + "loss": 0.7697, + "step": 34620 + }, + { + "epoch": 2.95073723685332, + "grad_norm": 22.90091401616716, + "learning_rate": 8.240140739149694e-09, + "loss": 0.8496, + "step": 34621 + }, + { + "epoch": 2.9508224665473453, + "grad_norm": 52.98666622142189, + "learning_rate": 8.211709752672381e-09, + "loss": 1.3067, + "step": 34622 + }, + { + "epoch": 2.9509076962413703, + "grad_norm": 49.237281529373895, + "learning_rate": 8.183327858448665e-09, + "loss": 1.4702, + "step": 34623 + }, + { + "epoch": 2.9509929259353957, + "grad_norm": 55.031419137161386, + "learning_rate": 8.154995056758875e-09, + "loss": 1.485, + "step": 34624 + }, + { + "epoch": 2.951078155629421, + "grad_norm": 74.73026598998167, + "learning_rate": 8.126711347880567e-09, + "loss": 1.6308, + "step": 34625 + }, + { + "epoch": 2.9511633853234467, + "grad_norm": 34.92023632473804, + "learning_rate": 8.098476732092964e-09, + "loss": 0.7968, + "step": 34626 + }, + { + "epoch": 2.951248615017472, + "grad_norm": 49.82502559814395, + "learning_rate": 8.070291209673064e-09, + "loss": 1.5224, + "step": 34627 + }, + { + "epoch": 2.9513338447114976, + "grad_norm": 51.302980628952945, + "learning_rate": 8.042154780898426e-09, + "loss": 1.4267, + "step": 34628 + }, + { + "epoch": 2.951419074405523, + "grad_norm": 40.66322043600447, + "learning_rate": 8.014067446045493e-09, + "loss": 0.7764, + "step": 34629 + }, + { + "epoch": 2.951504304099548, + "grad_norm": 40.658224802581294, + "learning_rate": 7.986029205390156e-09, + "loss": 0.7842, + "step": 34630 + }, + { + "epoch": 2.9515895337935736, + "grad_norm": 68.5271230609741, + "learning_rate": 7.958040059209415e-09, + "loss": 1.756, + "step": 34631 + }, + { + "epoch": 2.951674763487599, + "grad_norm": 26.96950523848959, + "learning_rate": 7.930100007776941e-09, + "loss": 0.9232, + "step": 34632 + }, + { + "epoch": 2.9517599931816245, + "grad_norm": 40.050014200261565, + "learning_rate": 7.902209051368625e-09, + "loss": 1.2154, + "step": 34633 + }, + { + "epoch": 2.95184522287565, + "grad_norm": 63.371030537797985, + "learning_rate": 7.874367190258136e-09, + "loss": 1.5071, + "step": 34634 + }, + { + "epoch": 2.951930452569675, + "grad_norm": 29.268425258843404, + "learning_rate": 7.846574424720255e-09, + "loss": 0.8676, + "step": 34635 + }, + { + "epoch": 2.952015682263701, + "grad_norm": 47.03396912324989, + "learning_rate": 7.818830755026985e-09, + "loss": 1.2951, + "step": 34636 + }, + { + "epoch": 2.952100911957726, + "grad_norm": 27.503624930125, + "learning_rate": 7.791136181451997e-09, + "loss": 1.054, + "step": 34637 + }, + { + "epoch": 2.9521861416517514, + "grad_norm": 75.1107511547135, + "learning_rate": 7.76349070426785e-09, + "loss": 1.8275, + "step": 34638 + }, + { + "epoch": 2.952271371345777, + "grad_norm": 28.115114003285335, + "learning_rate": 7.73589432374544e-09, + "loss": 0.7814, + "step": 34639 + }, + { + "epoch": 2.9523566010398024, + "grad_norm": 69.71069374022915, + "learning_rate": 7.708347040157327e-09, + "loss": 1.9418, + "step": 34640 + }, + { + "epoch": 2.952441830733828, + "grad_norm": 71.32062309071277, + "learning_rate": 7.680848853773294e-09, + "loss": 1.9936, + "step": 34641 + }, + { + "epoch": 2.952527060427853, + "grad_norm": 44.77982356158198, + "learning_rate": 7.653399764864788e-09, + "loss": 1.3692, + "step": 34642 + }, + { + "epoch": 2.9526122901218783, + "grad_norm": 40.10785513401007, + "learning_rate": 7.625999773701043e-09, + "loss": 1.6821, + "step": 34643 + }, + { + "epoch": 2.952697519815904, + "grad_norm": 49.941702077176636, + "learning_rate": 7.59864888055184e-09, + "loss": 0.9519, + "step": 34644 + }, + { + "epoch": 2.9527827495099292, + "grad_norm": 49.71461308952477, + "learning_rate": 7.571347085686409e-09, + "loss": 1.0196, + "step": 34645 + }, + { + "epoch": 2.9528679792039547, + "grad_norm": 49.46141654017752, + "learning_rate": 7.54409438937287e-09, + "loss": 1.3637, + "step": 34646 + }, + { + "epoch": 2.95295320889798, + "grad_norm": 61.94311260430678, + "learning_rate": 7.516890791879338e-09, + "loss": 1.3729, + "step": 34647 + }, + { + "epoch": 2.9530384385920057, + "grad_norm": 37.4844715690413, + "learning_rate": 7.489736293473937e-09, + "loss": 0.9712, + "step": 34648 + }, + { + "epoch": 2.9531236682860307, + "grad_norm": 100.85153103840712, + "learning_rate": 7.462630894422007e-09, + "loss": 2.3345, + "step": 34649 + }, + { + "epoch": 2.953208897980056, + "grad_norm": 26.973474703255658, + "learning_rate": 7.435574594992223e-09, + "loss": 0.6604, + "step": 34650 + }, + { + "epoch": 2.9532941276740816, + "grad_norm": 108.40520686613054, + "learning_rate": 7.408567395449373e-09, + "loss": 2.3402, + "step": 34651 + }, + { + "epoch": 2.953379357368107, + "grad_norm": 36.058559579319116, + "learning_rate": 7.381609296059355e-09, + "loss": 0.9813, + "step": 34652 + }, + { + "epoch": 2.9534645870621326, + "grad_norm": 26.95817461976723, + "learning_rate": 7.3547002970869585e-09, + "loss": 0.9143, + "step": 34653 + }, + { + "epoch": 2.9535498167561576, + "grad_norm": 56.31048738778418, + "learning_rate": 7.327840398797526e-09, + "loss": 2.0248, + "step": 34654 + }, + { + "epoch": 2.9536350464501835, + "grad_norm": 60.26174684663834, + "learning_rate": 7.301029601455289e-09, + "loss": 1.4978, + "step": 34655 + }, + { + "epoch": 2.9537202761442085, + "grad_norm": 79.8333719526216, + "learning_rate": 7.2742679053233735e-09, + "loss": 2.1655, + "step": 34656 + }, + { + "epoch": 2.953805505838234, + "grad_norm": 44.46245284384738, + "learning_rate": 7.2475553106649e-09, + "loss": 0.8137, + "step": 34657 + }, + { + "epoch": 2.9538907355322594, + "grad_norm": 51.304310342232824, + "learning_rate": 7.220891817742437e-09, + "loss": 1.3731, + "step": 34658 + }, + { + "epoch": 2.953975965226285, + "grad_norm": 56.558320541374016, + "learning_rate": 7.1942774268191076e-09, + "loss": 1.725, + "step": 34659 + }, + { + "epoch": 2.9540611949203104, + "grad_norm": 43.15048902553169, + "learning_rate": 7.1677121381558135e-09, + "loss": 1.1382, + "step": 34660 + }, + { + "epoch": 2.9541464246143354, + "grad_norm": 34.84318140980366, + "learning_rate": 7.141195952014568e-09, + "loss": 0.7936, + "step": 34661 + }, + { + "epoch": 2.954231654308361, + "grad_norm": 61.953642217398325, + "learning_rate": 7.114728868655163e-09, + "loss": 1.9842, + "step": 34662 + }, + { + "epoch": 2.9543168840023863, + "grad_norm": 36.361987375779776, + "learning_rate": 7.088310888338501e-09, + "loss": 0.9664, + "step": 34663 + }, + { + "epoch": 2.954402113696412, + "grad_norm": 20.505910865676352, + "learning_rate": 7.061942011324929e-09, + "loss": 0.7194, + "step": 34664 + }, + { + "epoch": 2.9544873433904373, + "grad_norm": 65.93448168851965, + "learning_rate": 7.035622237872575e-09, + "loss": 1.0445, + "step": 34665 + }, + { + "epoch": 2.9545725730844627, + "grad_norm": 45.57904656571841, + "learning_rate": 7.009351568240675e-09, + "loss": 1.0459, + "step": 34666 + }, + { + "epoch": 2.954657802778488, + "grad_norm": 71.37897593999018, + "learning_rate": 6.983130002688465e-09, + "loss": 1.5178, + "step": 34667 + }, + { + "epoch": 2.9547430324725132, + "grad_norm": 60.520220827733915, + "learning_rate": 6.9569575414724085e-09, + "loss": 1.6145, + "step": 34668 + }, + { + "epoch": 2.9548282621665387, + "grad_norm": 77.71707278497892, + "learning_rate": 6.930834184851187e-09, + "loss": 1.8932, + "step": 34669 + }, + { + "epoch": 2.954913491860564, + "grad_norm": 20.463073305938202, + "learning_rate": 6.904759933080707e-09, + "loss": 0.8303, + "step": 34670 + }, + { + "epoch": 2.9549987215545896, + "grad_norm": 55.07396533284811, + "learning_rate": 6.8787347864185395e-09, + "loss": 1.2354, + "step": 34671 + }, + { + "epoch": 2.955083951248615, + "grad_norm": 51.129741047794965, + "learning_rate": 6.852758745119481e-09, + "loss": 1.0114, + "step": 34672 + }, + { + "epoch": 2.95516918094264, + "grad_norm": 63.659019550689315, + "learning_rate": 6.826831809439438e-09, + "loss": 1.7283, + "step": 34673 + }, + { + "epoch": 2.955254410636666, + "grad_norm": 30.82290646620276, + "learning_rate": 6.800953979633207e-09, + "loss": 1.2595, + "step": 34674 + }, + { + "epoch": 2.955339640330691, + "grad_norm": 54.716059974384834, + "learning_rate": 6.7751252559555835e-09, + "loss": 0.8775, + "step": 34675 + }, + { + "epoch": 2.9554248700247165, + "grad_norm": 65.95709898024404, + "learning_rate": 6.7493456386608096e-09, + "loss": 1.5752, + "step": 34676 + }, + { + "epoch": 2.955510099718742, + "grad_norm": 52.64200431748059, + "learning_rate": 6.723615128002014e-09, + "loss": 1.5371, + "step": 34677 + }, + { + "epoch": 2.9555953294127675, + "grad_norm": 38.149175725421536, + "learning_rate": 6.697933724232331e-09, + "loss": 1.1819, + "step": 34678 + }, + { + "epoch": 2.955680559106793, + "grad_norm": 64.9962480178447, + "learning_rate": 6.672301427604333e-09, + "loss": 1.5435, + "step": 34679 + }, + { + "epoch": 2.955765788800818, + "grad_norm": 63.237303087967724, + "learning_rate": 6.646718238370043e-09, + "loss": 1.1295, + "step": 34680 + }, + { + "epoch": 2.9558510184948434, + "grad_norm": 31.374261542434848, + "learning_rate": 6.621184156780924e-09, + "loss": 0.9751, + "step": 34681 + }, + { + "epoch": 2.955936248188869, + "grad_norm": 31.924993378998867, + "learning_rate": 6.595699183088445e-09, + "loss": 1.4468, + "step": 34682 + }, + { + "epoch": 2.9560214778828944, + "grad_norm": 31.304561342626872, + "learning_rate": 6.570263317542958e-09, + "loss": 1.1541, + "step": 34683 + }, + { + "epoch": 2.95610670757692, + "grad_norm": 88.7860805294951, + "learning_rate": 6.544876560394819e-09, + "loss": 1.9717, + "step": 34684 + }, + { + "epoch": 2.9561919372709453, + "grad_norm": 53.481942575054426, + "learning_rate": 6.51953891189383e-09, + "loss": 1.4039, + "step": 34685 + }, + { + "epoch": 2.956277166964971, + "grad_norm": 32.723499291177674, + "learning_rate": 6.49425037228868e-09, + "loss": 1.0074, + "step": 34686 + }, + { + "epoch": 2.956362396658996, + "grad_norm": 92.51833314940102, + "learning_rate": 6.469010941828613e-09, + "loss": 2.3111, + "step": 34687 + }, + { + "epoch": 2.9564476263530213, + "grad_norm": 26.682096248110973, + "learning_rate": 6.443820620761765e-09, + "loss": 0.4249, + "step": 34688 + }, + { + "epoch": 2.9565328560470467, + "grad_norm": 23.597648162303972, + "learning_rate": 6.41867940933516e-09, + "loss": 0.7976, + "step": 34689 + }, + { + "epoch": 2.956618085741072, + "grad_norm": 38.5559814050162, + "learning_rate": 6.393587307796933e-09, + "loss": 1.2218, + "step": 34690 + }, + { + "epoch": 2.9567033154350977, + "grad_norm": 54.86243955529205, + "learning_rate": 6.368544316392999e-09, + "loss": 2.0792, + "step": 34691 + }, + { + "epoch": 2.956788545129123, + "grad_norm": 33.55117333924415, + "learning_rate": 6.343550435370938e-09, + "loss": 1.1508, + "step": 34692 + }, + { + "epoch": 2.9568737748231486, + "grad_norm": 25.481115563853876, + "learning_rate": 6.318605664974997e-09, + "loss": 0.9969, + "step": 34693 + }, + { + "epoch": 2.9569590045171736, + "grad_norm": 71.91140698772102, + "learning_rate": 6.293710005451647e-09, + "loss": 1.4824, + "step": 34694 + }, + { + "epoch": 2.957044234211199, + "grad_norm": 28.494968593533965, + "learning_rate": 6.268863457045693e-09, + "loss": 0.8934, + "step": 34695 + }, + { + "epoch": 2.9571294639052246, + "grad_norm": 53.46796916017825, + "learning_rate": 6.2440660200008276e-09, + "loss": 1.5394, + "step": 34696 + }, + { + "epoch": 2.95721469359925, + "grad_norm": 63.59094248700841, + "learning_rate": 6.2193176945607445e-09, + "loss": 1.5812, + "step": 34697 + }, + { + "epoch": 2.9572999232932755, + "grad_norm": 33.6030919937773, + "learning_rate": 6.1946184809702495e-09, + "loss": 1.2591, + "step": 34698 + }, + { + "epoch": 2.9573851529873005, + "grad_norm": 34.26387529708484, + "learning_rate": 6.169968379470815e-09, + "loss": 1.1909, + "step": 34699 + }, + { + "epoch": 2.9574703826813264, + "grad_norm": 59.14818839318714, + "learning_rate": 6.145367390305579e-09, + "loss": 1.7987, + "step": 34700 + }, + { + "epoch": 2.9575556123753515, + "grad_norm": 31.59416895234193, + "learning_rate": 6.120815513716017e-09, + "loss": 0.9523, + "step": 34701 + }, + { + "epoch": 2.957640842069377, + "grad_norm": 43.91406627020254, + "learning_rate": 6.0963127499441554e-09, + "loss": 1.4096, + "step": 34702 + }, + { + "epoch": 2.9577260717634024, + "grad_norm": 50.178102952346045, + "learning_rate": 6.071859099230359e-09, + "loss": 1.4458, + "step": 34703 + }, + { + "epoch": 2.957811301457428, + "grad_norm": 68.22980870297296, + "learning_rate": 6.047454561816102e-09, + "loss": 1.7541, + "step": 34704 + }, + { + "epoch": 2.9578965311514533, + "grad_norm": 14.44506314481209, + "learning_rate": 6.02309913794008e-09, + "loss": 0.4781, + "step": 34705 + }, + { + "epoch": 2.9579817608454784, + "grad_norm": 85.15647779190071, + "learning_rate": 5.9987928278426585e-09, + "loss": 1.1492, + "step": 34706 + }, + { + "epoch": 2.958066990539504, + "grad_norm": 55.00284726131569, + "learning_rate": 5.974535631762535e-09, + "loss": 1.7008, + "step": 34707 + }, + { + "epoch": 2.9581522202335293, + "grad_norm": 65.51391430074824, + "learning_rate": 5.950327549938961e-09, + "loss": 1.7311, + "step": 34708 + }, + { + "epoch": 2.9582374499275548, + "grad_norm": 45.16342289204717, + "learning_rate": 5.926168582608971e-09, + "loss": 1.5049, + "step": 34709 + }, + { + "epoch": 2.9583226796215802, + "grad_norm": 50.054717949551794, + "learning_rate": 5.902058730010707e-09, + "loss": 1.1761, + "step": 34710 + }, + { + "epoch": 2.9584079093156057, + "grad_norm": 71.94754072122804, + "learning_rate": 5.877997992381202e-09, + "loss": 1.6807, + "step": 34711 + }, + { + "epoch": 2.958493139009631, + "grad_norm": 85.68723335641833, + "learning_rate": 5.853986369957487e-09, + "loss": 1.8346, + "step": 34712 + }, + { + "epoch": 2.958578368703656, + "grad_norm": 39.29591525543283, + "learning_rate": 5.830023862974932e-09, + "loss": 1.2133, + "step": 34713 + }, + { + "epoch": 2.9586635983976817, + "grad_norm": 62.228132742447066, + "learning_rate": 5.8061104716694574e-09, + "loss": 1.6947, + "step": 34714 + }, + { + "epoch": 2.958748828091707, + "grad_norm": 54.88541984777094, + "learning_rate": 5.782246196276986e-09, + "loss": 1.1309, + "step": 34715 + }, + { + "epoch": 2.9588340577857326, + "grad_norm": 48.62669651366595, + "learning_rate": 5.75843103703122e-09, + "loss": 1.5382, + "step": 34716 + }, + { + "epoch": 2.958919287479758, + "grad_norm": 43.32508618078852, + "learning_rate": 5.734664994166972e-09, + "loss": 1.139, + "step": 34717 + }, + { + "epoch": 2.959004517173783, + "grad_norm": 55.787716649547264, + "learning_rate": 5.710948067917388e-09, + "loss": 0.8211, + "step": 34718 + }, + { + "epoch": 2.959089746867809, + "grad_norm": 43.74952785111039, + "learning_rate": 5.687280258516725e-09, + "loss": 1.6754, + "step": 34719 + }, + { + "epoch": 2.959174976561834, + "grad_norm": 25.628639159733527, + "learning_rate": 5.663661566196465e-09, + "loss": 0.9258, + "step": 34720 + }, + { + "epoch": 2.9592602062558595, + "grad_norm": 19.69434943998878, + "learning_rate": 5.6400919911897555e-09, + "loss": 0.7123, + "step": 34721 + }, + { + "epoch": 2.959345435949885, + "grad_norm": 58.72586184759014, + "learning_rate": 5.616571533728077e-09, + "loss": 1.4173, + "step": 34722 + }, + { + "epoch": 2.9594306656439104, + "grad_norm": 22.977571465732506, + "learning_rate": 5.593100194043466e-09, + "loss": 0.5544, + "step": 34723 + }, + { + "epoch": 2.959515895337936, + "grad_norm": 60.15484590826773, + "learning_rate": 5.569677972365739e-09, + "loss": 1.4335, + "step": 34724 + }, + { + "epoch": 2.959601125031961, + "grad_norm": 36.98970056207887, + "learning_rate": 5.546304868925267e-09, + "loss": 1.4548, + "step": 34725 + }, + { + "epoch": 2.9596863547259864, + "grad_norm": 80.12457045890817, + "learning_rate": 5.522980883952422e-09, + "loss": 2.0944, + "step": 34726 + }, + { + "epoch": 2.959771584420012, + "grad_norm": 48.64780006171298, + "learning_rate": 5.49970601767702e-09, + "loss": 1.1196, + "step": 34727 + }, + { + "epoch": 2.9598568141140373, + "grad_norm": 59.400834311864564, + "learning_rate": 5.476480270326656e-09, + "loss": 1.6468, + "step": 34728 + }, + { + "epoch": 2.959942043808063, + "grad_norm": 28.000610552862916, + "learning_rate": 5.453303642131147e-09, + "loss": 0.6613, + "step": 34729 + }, + { + "epoch": 2.9600272735020883, + "grad_norm": 54.48361103648804, + "learning_rate": 5.430176133316978e-09, + "loss": 1.0473, + "step": 34730 + }, + { + "epoch": 2.9601125031961137, + "grad_norm": 78.8569460736759, + "learning_rate": 5.4070977441134095e-09, + "loss": 1.9524, + "step": 34731 + }, + { + "epoch": 2.9601977328901388, + "grad_norm": 49.310390060815664, + "learning_rate": 5.3840684747458185e-09, + "loss": 0.9661, + "step": 34732 + }, + { + "epoch": 2.9602829625841642, + "grad_norm": 37.90926330858271, + "learning_rate": 5.361088325441243e-09, + "loss": 1.1284, + "step": 34733 + }, + { + "epoch": 2.9603681922781897, + "grad_norm": 30.519484412799866, + "learning_rate": 5.338157296425617e-09, + "loss": 1.0994, + "step": 34734 + }, + { + "epoch": 2.960453421972215, + "grad_norm": 39.93107316890934, + "learning_rate": 5.315275387924312e-09, + "loss": 1.7128, + "step": 34735 + }, + { + "epoch": 2.9605386516662406, + "grad_norm": 34.6474827431791, + "learning_rate": 5.292442600162706e-09, + "loss": 1.0655, + "step": 34736 + }, + { + "epoch": 2.9606238813602657, + "grad_norm": 32.694580641460995, + "learning_rate": 5.269658933365618e-09, + "loss": 1.0572, + "step": 34737 + }, + { + "epoch": 2.9607091110542916, + "grad_norm": 33.11036305530105, + "learning_rate": 5.246924387756203e-09, + "loss": 0.8643, + "step": 34738 + }, + { + "epoch": 2.9607943407483166, + "grad_norm": 53.14571601819874, + "learning_rate": 5.2242389635592805e-09, + "loss": 1.5943, + "step": 34739 + }, + { + "epoch": 2.960879570442342, + "grad_norm": 47.49271808690477, + "learning_rate": 5.201602660996341e-09, + "loss": 1.0058, + "step": 34740 + }, + { + "epoch": 2.9609648001363675, + "grad_norm": 39.797238860602526, + "learning_rate": 5.179015480291649e-09, + "loss": 0.8266, + "step": 34741 + }, + { + "epoch": 2.961050029830393, + "grad_norm": 50.02499950403891, + "learning_rate": 5.15647742166614e-09, + "loss": 1.0181, + "step": 34742 + }, + { + "epoch": 2.9611352595244185, + "grad_norm": 30.420742552419245, + "learning_rate": 5.133988485342412e-09, + "loss": 0.9462, + "step": 34743 + }, + { + "epoch": 2.9612204892184435, + "grad_norm": 70.48064095254446, + "learning_rate": 5.111548671540845e-09, + "loss": 1.9034, + "step": 34744 + }, + { + "epoch": 2.961305718912469, + "grad_norm": 57.19787564948373, + "learning_rate": 5.089157980482373e-09, + "loss": 1.8128, + "step": 34745 + }, + { + "epoch": 2.9613909486064944, + "grad_norm": 24.328273221986123, + "learning_rate": 5.0668164123873764e-09, + "loss": 1.375, + "step": 34746 + }, + { + "epoch": 2.96147617830052, + "grad_norm": 37.77639144487417, + "learning_rate": 5.044523967475124e-09, + "loss": 1.3717, + "step": 34747 + }, + { + "epoch": 2.9615614079945454, + "grad_norm": 26.271549543396336, + "learning_rate": 5.0222806459654385e-09, + "loss": 0.6243, + "step": 34748 + }, + { + "epoch": 2.961646637688571, + "grad_norm": 40.4451951131436, + "learning_rate": 5.0000864480764795e-09, + "loss": 0.8656, + "step": 34749 + }, + { + "epoch": 2.9617318673825963, + "grad_norm": 50.2121119292467, + "learning_rate": 4.9779413740275176e-09, + "loss": 1.3776, + "step": 34750 + }, + { + "epoch": 2.9618170970766213, + "grad_norm": 45.817406201677656, + "learning_rate": 4.955845424035044e-09, + "loss": 1.4319, + "step": 34751 + }, + { + "epoch": 2.961902326770647, + "grad_norm": 41.07754962671774, + "learning_rate": 4.9337985983172185e-09, + "loss": 1.1928, + "step": 34752 + }, + { + "epoch": 2.9619875564646723, + "grad_norm": 54.87298509568277, + "learning_rate": 4.911800897090535e-09, + "loss": 1.5431, + "step": 34753 + }, + { + "epoch": 2.9620727861586977, + "grad_norm": 74.49369178438683, + "learning_rate": 4.889852320572042e-09, + "loss": 1.3454, + "step": 34754 + }, + { + "epoch": 2.962158015852723, + "grad_norm": 67.00982809438077, + "learning_rate": 4.867952868976566e-09, + "loss": 1.9965, + "step": 34755 + }, + { + "epoch": 2.9622432455467482, + "grad_norm": 43.74823751447533, + "learning_rate": 4.846102542519493e-09, + "loss": 1.3396, + "step": 34756 + }, + { + "epoch": 2.962328475240774, + "grad_norm": 48.54876522258802, + "learning_rate": 4.8243013414167595e-09, + "loss": 1.465, + "step": 34757 + }, + { + "epoch": 2.962413704934799, + "grad_norm": 27.197400703233306, + "learning_rate": 4.802549265881529e-09, + "loss": 1.0832, + "step": 34758 + }, + { + "epoch": 2.9624989346288246, + "grad_norm": 59.97742550309183, + "learning_rate": 4.780846316128629e-09, + "loss": 0.8559, + "step": 34759 + }, + { + "epoch": 2.96258416432285, + "grad_norm": 50.65216857931877, + "learning_rate": 4.7591924923712225e-09, + "loss": 1.5398, + "step": 34760 + }, + { + "epoch": 2.9626693940168756, + "grad_norm": 49.5287484181204, + "learning_rate": 4.737587794822473e-09, + "loss": 0.8889, + "step": 34761 + }, + { + "epoch": 2.962754623710901, + "grad_norm": 82.56030647950885, + "learning_rate": 4.716032223694433e-09, + "loss": 2.1024, + "step": 34762 + }, + { + "epoch": 2.962839853404926, + "grad_norm": 47.607708435619756, + "learning_rate": 4.694525779199155e-09, + "loss": 1.0831, + "step": 34763 + }, + { + "epoch": 2.9629250830989515, + "grad_norm": 43.59331176600096, + "learning_rate": 4.673068461548691e-09, + "loss": 1.3779, + "step": 34764 + }, + { + "epoch": 2.963010312792977, + "grad_norm": 70.42473767561867, + "learning_rate": 4.651660270952874e-09, + "loss": 0.7548, + "step": 34765 + }, + { + "epoch": 2.9630955424870025, + "grad_norm": 48.16498514786168, + "learning_rate": 4.630301207623756e-09, + "loss": 0.8258, + "step": 34766 + }, + { + "epoch": 2.963180772181028, + "grad_norm": 32.90077794726611, + "learning_rate": 4.608991271770058e-09, + "loss": 1.0945, + "step": 34767 + }, + { + "epoch": 2.9632660018750534, + "grad_norm": 74.04823905725023, + "learning_rate": 4.587730463602724e-09, + "loss": 1.5336, + "step": 34768 + }, + { + "epoch": 2.963351231569079, + "grad_norm": 62.71740382454746, + "learning_rate": 4.566518783329366e-09, + "loss": 1.6278, + "step": 34769 + }, + { + "epoch": 2.963436461263104, + "grad_norm": 59.95357189407169, + "learning_rate": 4.545356231159814e-09, + "loss": 1.1473, + "step": 34770 + }, + { + "epoch": 2.9635216909571294, + "grad_norm": 58.48310440230912, + "learning_rate": 4.524242807301127e-09, + "loss": 1.8623, + "step": 34771 + }, + { + "epoch": 2.963606920651155, + "grad_norm": 45.84130604702506, + "learning_rate": 4.503178511962026e-09, + "loss": 1.0538, + "step": 34772 + }, + { + "epoch": 2.9636921503451803, + "grad_norm": 34.10897954665832, + "learning_rate": 4.482163345349011e-09, + "loss": 1.3145, + "step": 34773 + }, + { + "epoch": 2.9637773800392058, + "grad_norm": 20.720691865486053, + "learning_rate": 4.461197307668586e-09, + "loss": 0.5524, + "step": 34774 + }, + { + "epoch": 2.963862609733231, + "grad_norm": 64.56543616531589, + "learning_rate": 4.4402803991278055e-09, + "loss": 1.4605, + "step": 34775 + }, + { + "epoch": 2.9639478394272567, + "grad_norm": 84.53999888622818, + "learning_rate": 4.419412619931507e-09, + "loss": 1.8756, + "step": 34776 + }, + { + "epoch": 2.9640330691212817, + "grad_norm": 44.66692117503709, + "learning_rate": 4.398593970285081e-09, + "loss": 1.3322, + "step": 34777 + }, + { + "epoch": 2.964118298815307, + "grad_norm": 63.6651257017101, + "learning_rate": 4.377824450393919e-09, + "loss": 1.7656, + "step": 34778 + }, + { + "epoch": 2.9642035285093327, + "grad_norm": 72.25407627145267, + "learning_rate": 4.3571040604611926e-09, + "loss": 2.0078, + "step": 34779 + }, + { + "epoch": 2.964288758203358, + "grad_norm": 37.31273092886994, + "learning_rate": 4.336432800691737e-09, + "loss": 1.017, + "step": 34780 + }, + { + "epoch": 2.9643739878973836, + "grad_norm": 40.62408134497479, + "learning_rate": 4.315810671288168e-09, + "loss": 0.859, + "step": 34781 + }, + { + "epoch": 2.9644592175914086, + "grad_norm": 37.98474056890387, + "learning_rate": 4.295237672453656e-09, + "loss": 0.9376, + "step": 34782 + }, + { + "epoch": 2.964544447285434, + "grad_norm": 41.5980691806159, + "learning_rate": 4.274713804390263e-09, + "loss": 1.2422, + "step": 34783 + }, + { + "epoch": 2.9646296769794596, + "grad_norm": 72.67797453005534, + "learning_rate": 4.254239067300048e-09, + "loss": 2.2718, + "step": 34784 + }, + { + "epoch": 2.964714906673485, + "grad_norm": 37.6548040189839, + "learning_rate": 4.2338134613839625e-09, + "loss": 1.4198, + "step": 34785 + }, + { + "epoch": 2.9648001363675105, + "grad_norm": 84.42840697008117, + "learning_rate": 4.213436986843511e-09, + "loss": 2.1614, + "step": 34786 + }, + { + "epoch": 2.964885366061536, + "grad_norm": 47.731569270717515, + "learning_rate": 4.193109643878534e-09, + "loss": 1.3521, + "step": 34787 + }, + { + "epoch": 2.9649705957555614, + "grad_norm": 66.91522078595946, + "learning_rate": 4.172831432689428e-09, + "loss": 1.4748, + "step": 34788 + }, + { + "epoch": 2.9650558254495865, + "grad_norm": 75.25340317248957, + "learning_rate": 4.152602353475477e-09, + "loss": 1.2762, + "step": 34789 + }, + { + "epoch": 2.965141055143612, + "grad_norm": 18.609955475102076, + "learning_rate": 4.13242240643541e-09, + "loss": 0.6131, + "step": 34790 + }, + { + "epoch": 2.9652262848376374, + "grad_norm": 75.18985080504167, + "learning_rate": 4.112291591767403e-09, + "loss": 1.8697, + "step": 34791 + }, + { + "epoch": 2.965311514531663, + "grad_norm": 55.37154481122216, + "learning_rate": 4.092209909670186e-09, + "loss": 1.5565, + "step": 34792 + }, + { + "epoch": 2.9653967442256883, + "grad_norm": 75.02076059476398, + "learning_rate": 4.072177360340823e-09, + "loss": 1.6362, + "step": 34793 + }, + { + "epoch": 2.9654819739197134, + "grad_norm": 44.16925146040079, + "learning_rate": 4.052193943976379e-09, + "loss": 1.2129, + "step": 34794 + }, + { + "epoch": 2.9655672036137393, + "grad_norm": 35.54653355886837, + "learning_rate": 4.0322596607739185e-09, + "loss": 0.6666, + "step": 34795 + }, + { + "epoch": 2.9656524333077643, + "grad_norm": 42.6748898613013, + "learning_rate": 4.012374510928285e-09, + "loss": 0.9548, + "step": 34796 + }, + { + "epoch": 2.9657376630017898, + "grad_norm": 30.179462871449303, + "learning_rate": 3.992538494635989e-09, + "loss": 1.0941, + "step": 34797 + }, + { + "epoch": 2.9658228926958152, + "grad_norm": 46.02547672915521, + "learning_rate": 3.972751612091874e-09, + "loss": 1.2398, + "step": 34798 + }, + { + "epoch": 2.9659081223898407, + "grad_norm": 25.768122572361303, + "learning_rate": 3.953013863490784e-09, + "loss": 1.1829, + "step": 34799 + }, + { + "epoch": 2.965993352083866, + "grad_norm": 33.59929692382669, + "learning_rate": 3.933325249026454e-09, + "loss": 0.8921, + "step": 34800 + }, + { + "epoch": 2.966078581777891, + "grad_norm": 44.0844303645669, + "learning_rate": 3.913685768892062e-09, + "loss": 0.9916, + "step": 34801 + }, + { + "epoch": 2.9661638114719167, + "grad_norm": 79.65007690509199, + "learning_rate": 3.8940954232818965e-09, + "loss": 2.3666, + "step": 34802 + }, + { + "epoch": 2.966249041165942, + "grad_norm": 32.48296512142663, + "learning_rate": 3.8745542123880265e-09, + "loss": 0.8539, + "step": 34803 + }, + { + "epoch": 2.9663342708599676, + "grad_norm": 48.63771213643773, + "learning_rate": 3.855062136402521e-09, + "loss": 1.6914, + "step": 34804 + }, + { + "epoch": 2.966419500553993, + "grad_norm": 32.987768677588086, + "learning_rate": 3.8356191955174485e-09, + "loss": 0.8549, + "step": 34805 + }, + { + "epoch": 2.9665047302480185, + "grad_norm": 53.969442321848604, + "learning_rate": 3.816225389923211e-09, + "loss": 1.3741, + "step": 34806 + }, + { + "epoch": 2.966589959942044, + "grad_norm": 41.084509885780655, + "learning_rate": 3.796880719811325e-09, + "loss": 1.0305, + "step": 34807 + }, + { + "epoch": 2.966675189636069, + "grad_norm": 36.90021159339829, + "learning_rate": 3.77758518537219e-09, + "loss": 1.0468, + "step": 34808 + }, + { + "epoch": 2.9667604193300945, + "grad_norm": 31.859659502624975, + "learning_rate": 3.758338786795101e-09, + "loss": 0.9902, + "step": 34809 + }, + { + "epoch": 2.96684564902412, + "grad_norm": 67.25787802227968, + "learning_rate": 3.739141524269352e-09, + "loss": 2.2629, + "step": 34810 + }, + { + "epoch": 2.9669308787181454, + "grad_norm": 45.7200011250725, + "learning_rate": 3.719993397983679e-09, + "loss": 0.8421, + "step": 34811 + }, + { + "epoch": 2.967016108412171, + "grad_norm": 100.70197503599874, + "learning_rate": 3.7008944081262655e-09, + "loss": 2.1781, + "step": 34812 + }, + { + "epoch": 2.9671013381061964, + "grad_norm": 49.141282330777365, + "learning_rate": 3.6818445548858494e-09, + "loss": 1.5536, + "step": 34813 + }, + { + "epoch": 2.967186567800222, + "grad_norm": 79.78319789490203, + "learning_rate": 3.6628438384489485e-09, + "loss": 1.6338, + "step": 34814 + }, + { + "epoch": 2.967271797494247, + "grad_norm": 66.53862552478955, + "learning_rate": 3.643892259002635e-09, + "loss": 1.8821, + "step": 34815 + }, + { + "epoch": 2.9673570271882723, + "grad_norm": 21.145049002089788, + "learning_rate": 3.624989816732871e-09, + "loss": 0.7006, + "step": 34816 + }, + { + "epoch": 2.967442256882298, + "grad_norm": 23.31863340972381, + "learning_rate": 3.606136511826175e-09, + "loss": 0.5411, + "step": 34817 + }, + { + "epoch": 2.9675274865763233, + "grad_norm": 29.56372191934531, + "learning_rate": 3.587332344467953e-09, + "loss": 0.9033, + "step": 34818 + }, + { + "epoch": 2.9676127162703487, + "grad_norm": 54.851637530022906, + "learning_rate": 3.568577314843058e-09, + "loss": 2.2945, + "step": 34819 + }, + { + "epoch": 2.9676979459643738, + "grad_norm": 26.43828317728636, + "learning_rate": 3.5498714231357867e-09, + "loss": 1.1257, + "step": 34820 + }, + { + "epoch": 2.9677831756583997, + "grad_norm": 37.72548766565474, + "learning_rate": 3.531214669529881e-09, + "loss": 1.1514, + "step": 34821 + }, + { + "epoch": 2.9678684053524247, + "grad_norm": 20.078872527363313, + "learning_rate": 3.5126070542090827e-09, + "loss": 0.6837, + "step": 34822 + }, + { + "epoch": 2.96795363504645, + "grad_norm": 68.21755240489811, + "learning_rate": 3.494048577356579e-09, + "loss": 1.3079, + "step": 34823 + }, + { + "epoch": 2.9680388647404756, + "grad_norm": 48.8120110250771, + "learning_rate": 3.475539239154446e-09, + "loss": 1.2948, + "step": 34824 + }, + { + "epoch": 2.968124094434501, + "grad_norm": 69.69021885830769, + "learning_rate": 3.457079039785316e-09, + "loss": 1.7223, + "step": 34825 + }, + { + "epoch": 2.9682093241285266, + "grad_norm": 49.48339486948963, + "learning_rate": 3.4386679794301546e-09, + "loss": 1.0248, + "step": 34826 + }, + { + "epoch": 2.9682945538225516, + "grad_norm": 85.40968877764233, + "learning_rate": 3.420306058270484e-09, + "loss": 2.1668, + "step": 34827 + }, + { + "epoch": 2.968379783516577, + "grad_norm": 46.79036421842804, + "learning_rate": 3.40199327648616e-09, + "loss": 1.169, + "step": 34828 + }, + { + "epoch": 2.9684650132106025, + "grad_norm": 36.09796595606458, + "learning_rate": 3.3837296342581484e-09, + "loss": 0.9941, + "step": 34829 + }, + { + "epoch": 2.968550242904628, + "grad_norm": 55.51696363380239, + "learning_rate": 3.365515131765751e-09, + "loss": 1.9303, + "step": 34830 + }, + { + "epoch": 2.9686354725986535, + "grad_norm": 60.73175352257239, + "learning_rate": 3.3473497691877132e-09, + "loss": 1.3344, + "step": 34831 + }, + { + "epoch": 2.968720702292679, + "grad_norm": 37.46920793101418, + "learning_rate": 3.3292335467033366e-09, + "loss": 1.46, + "step": 34832 + }, + { + "epoch": 2.9688059319867044, + "grad_norm": 48.40796945132686, + "learning_rate": 3.3111664644902563e-09, + "loss": 1.8088, + "step": 34833 + }, + { + "epoch": 2.9688911616807294, + "grad_norm": 34.33357496381126, + "learning_rate": 3.2931485227266635e-09, + "loss": 1.0447, + "step": 34834 + }, + { + "epoch": 2.968976391374755, + "grad_norm": 52.45298467902117, + "learning_rate": 3.2751797215890833e-09, + "loss": 1.4306, + "step": 34835 + }, + { + "epoch": 2.9690616210687804, + "grad_norm": 29.702290247758516, + "learning_rate": 3.2572600612545967e-09, + "loss": 0.9471, + "step": 34836 + }, + { + "epoch": 2.969146850762806, + "grad_norm": 30.205073792777064, + "learning_rate": 3.239389541899729e-09, + "loss": 1.1378, + "step": 34837 + }, + { + "epoch": 2.9692320804568313, + "grad_norm": 42.01282668795817, + "learning_rate": 3.221568163699895e-09, + "loss": 1.574, + "step": 34838 + }, + { + "epoch": 2.9693173101508563, + "grad_norm": 75.49618244195212, + "learning_rate": 3.203795926830511e-09, + "loss": 1.4969, + "step": 34839 + }, + { + "epoch": 2.9694025398448822, + "grad_norm": 54.221423529282504, + "learning_rate": 3.1860728314658806e-09, + "loss": 2.2656, + "step": 34840 + }, + { + "epoch": 2.9694877695389073, + "grad_norm": 82.51936290406732, + "learning_rate": 3.168398877780865e-09, + "loss": 2.5087, + "step": 34841 + }, + { + "epoch": 2.9695729992329327, + "grad_norm": 57.56807542842801, + "learning_rate": 3.150774065949214e-09, + "loss": 1.2329, + "step": 34842 + }, + { + "epoch": 2.969658228926958, + "grad_norm": 45.10461791463255, + "learning_rate": 3.1331983961441214e-09, + "loss": 1.2269, + "step": 34843 + }, + { + "epoch": 2.9697434586209837, + "grad_norm": 26.399390504453812, + "learning_rate": 3.115671868538228e-09, + "loss": 0.9073, + "step": 34844 + }, + { + "epoch": 2.969828688315009, + "grad_norm": 17.073114845568547, + "learning_rate": 3.098194483304173e-09, + "loss": 0.8079, + "step": 34845 + }, + { + "epoch": 2.969913918009034, + "grad_norm": 49.0157528259512, + "learning_rate": 3.0807662406140415e-09, + "loss": 1.1404, + "step": 34846 + }, + { + "epoch": 2.9699991477030596, + "grad_norm": 68.43194209236958, + "learning_rate": 3.063387140638807e-09, + "loss": 1.4544, + "step": 34847 + }, + { + "epoch": 2.970084377397085, + "grad_norm": 42.72454714177043, + "learning_rate": 3.04605718355e-09, + "loss": 1.3817, + "step": 34848 + }, + { + "epoch": 2.9701696070911106, + "grad_norm": 37.9171230464083, + "learning_rate": 3.0287763695174833e-09, + "loss": 1.2957, + "step": 34849 + }, + { + "epoch": 2.970254836785136, + "grad_norm": 42.11631724192159, + "learning_rate": 3.011544698711122e-09, + "loss": 1.3911, + "step": 34850 + }, + { + "epoch": 2.9703400664791615, + "grad_norm": 71.05852611210608, + "learning_rate": 2.9943621713002245e-09, + "loss": 1.6617, + "step": 34851 + }, + { + "epoch": 2.970425296173187, + "grad_norm": 48.9413407776276, + "learning_rate": 2.977228787454656e-09, + "loss": 1.9177, + "step": 34852 + }, + { + "epoch": 2.970510525867212, + "grad_norm": 106.7852281733113, + "learning_rate": 2.960144547342614e-09, + "loss": 1.7862, + "step": 34853 + }, + { + "epoch": 2.9705957555612374, + "grad_norm": 48.377882453171196, + "learning_rate": 2.9431094511317427e-09, + "loss": 1.7288, + "step": 34854 + }, + { + "epoch": 2.970680985255263, + "grad_norm": 58.96106787960884, + "learning_rate": 2.926123498989686e-09, + "loss": 1.4717, + "step": 34855 + }, + { + "epoch": 2.9707662149492884, + "grad_norm": 21.819701141287958, + "learning_rate": 2.909186691083532e-09, + "loss": 0.8586, + "step": 34856 + }, + { + "epoch": 2.970851444643314, + "grad_norm": 94.98615406586578, + "learning_rate": 2.8922990275798148e-09, + "loss": 2.2879, + "step": 34857 + }, + { + "epoch": 2.970936674337339, + "grad_norm": 40.05337747053969, + "learning_rate": 2.875460508644512e-09, + "loss": 1.3825, + "step": 34858 + }, + { + "epoch": 2.971021904031365, + "grad_norm": 82.9262467021314, + "learning_rate": 2.8586711344436025e-09, + "loss": 1.8081, + "step": 34859 + }, + { + "epoch": 2.97110713372539, + "grad_norm": 66.19146586179608, + "learning_rate": 2.8419309051419543e-09, + "loss": 1.905, + "step": 34860 + }, + { + "epoch": 2.9711923634194153, + "grad_norm": 64.61389081236372, + "learning_rate": 2.8252398209044352e-09, + "loss": 1.3159, + "step": 34861 + }, + { + "epoch": 2.9712775931134408, + "grad_norm": 62.35608524578121, + "learning_rate": 2.8085978818948033e-09, + "loss": 1.314, + "step": 34862 + }, + { + "epoch": 2.971362822807466, + "grad_norm": 27.207150140600177, + "learning_rate": 2.792005088277372e-09, + "loss": 1.1107, + "step": 34863 + }, + { + "epoch": 2.9714480525014917, + "grad_norm": 57.281287669404286, + "learning_rate": 2.7754614402142332e-09, + "loss": 1.2006, + "step": 34864 + }, + { + "epoch": 2.9715332821955167, + "grad_norm": 84.64602865368393, + "learning_rate": 2.758966937869145e-09, + "loss": 2.099, + "step": 34865 + }, + { + "epoch": 2.971618511889542, + "grad_norm": 126.39580691546668, + "learning_rate": 2.7425215814036454e-09, + "loss": 2.2368, + "step": 34866 + }, + { + "epoch": 2.9717037415835676, + "grad_norm": 45.870894880510136, + "learning_rate": 2.7261253709798263e-09, + "loss": 1.4057, + "step": 34867 + }, + { + "epoch": 2.971788971277593, + "grad_norm": 54.69097615578821, + "learning_rate": 2.7097783067592253e-09, + "loss": 1.249, + "step": 34868 + }, + { + "epoch": 2.9718742009716186, + "grad_norm": 22.54230230418883, + "learning_rate": 2.6934803889017146e-09, + "loss": 0.7526, + "step": 34869 + }, + { + "epoch": 2.971959430665644, + "grad_norm": 72.9530160160137, + "learning_rate": 2.677231617568277e-09, + "loss": 1.5706, + "step": 34870 + }, + { + "epoch": 2.9720446603596695, + "grad_norm": 72.09608189445387, + "learning_rate": 2.661031992918228e-09, + "loss": 1.2559, + "step": 34871 + }, + { + "epoch": 2.9721298900536945, + "grad_norm": 65.68846939996203, + "learning_rate": 2.6448815151114414e-09, + "loss": 1.776, + "step": 34872 + }, + { + "epoch": 2.97221511974772, + "grad_norm": 59.67378376820398, + "learning_rate": 2.6287801843066783e-09, + "loss": 1.3044, + "step": 34873 + }, + { + "epoch": 2.9723003494417455, + "grad_norm": 71.85894614433073, + "learning_rate": 2.6127280006615908e-09, + "loss": 1.7246, + "step": 34874 + }, + { + "epoch": 2.972385579135771, + "grad_norm": 136.99230961638128, + "learning_rate": 2.59672496433494e-09, + "loss": 1.6928, + "step": 34875 + }, + { + "epoch": 2.9724708088297964, + "grad_norm": 59.882280438653424, + "learning_rate": 2.5807710754838232e-09, + "loss": 1.3421, + "step": 34876 + }, + { + "epoch": 2.9725560385238214, + "grad_norm": 50.81542347154949, + "learning_rate": 2.5648663342647816e-09, + "loss": 1.389, + "step": 34877 + }, + { + "epoch": 2.9726412682178474, + "grad_norm": 64.41131264239215, + "learning_rate": 2.549010740834912e-09, + "loss": 2.0329, + "step": 34878 + }, + { + "epoch": 2.9727264979118724, + "grad_norm": 31.223041556378305, + "learning_rate": 2.5332042953490898e-09, + "loss": 1.006, + "step": 34879 + }, + { + "epoch": 2.972811727605898, + "grad_norm": 22.60017808982877, + "learning_rate": 2.517446997963857e-09, + "loss": 0.7164, + "step": 34880 + }, + { + "epoch": 2.9728969572999233, + "grad_norm": 43.73218361281966, + "learning_rate": 2.501738848833535e-09, + "loss": 1.7165, + "step": 34881 + }, + { + "epoch": 2.972982186993949, + "grad_norm": 35.788095241867794, + "learning_rate": 2.486079848112999e-09, + "loss": 0.9122, + "step": 34882 + }, + { + "epoch": 2.9730674166879743, + "grad_norm": 39.246819058406224, + "learning_rate": 2.4704699959560154e-09, + "loss": 1.1879, + "step": 34883 + }, + { + "epoch": 2.9731526463819993, + "grad_norm": 47.2406976923968, + "learning_rate": 2.4549092925157947e-09, + "loss": 1.6267, + "step": 34884 + }, + { + "epoch": 2.9732378760760247, + "grad_norm": 41.26905300101995, + "learning_rate": 2.439397737946103e-09, + "loss": 1.1831, + "step": 34885 + }, + { + "epoch": 2.97332310577005, + "grad_norm": 67.3371449162374, + "learning_rate": 2.4239353323990412e-09, + "loss": 1.5289, + "step": 34886 + }, + { + "epoch": 2.9734083354640757, + "grad_norm": 63.59839563768619, + "learning_rate": 2.408522076026709e-09, + "loss": 1.7431, + "step": 34887 + }, + { + "epoch": 2.973493565158101, + "grad_norm": 80.16762643233095, + "learning_rate": 2.393157968980653e-09, + "loss": 1.8373, + "step": 34888 + }, + { + "epoch": 2.9735787948521266, + "grad_norm": 39.60613371230656, + "learning_rate": 2.377843011411862e-09, + "loss": 1.2441, + "step": 34889 + }, + { + "epoch": 2.973664024546152, + "grad_norm": 63.862912909613456, + "learning_rate": 2.362577203471328e-09, + "loss": 1.6073, + "step": 34890 + }, + { + "epoch": 2.973749254240177, + "grad_norm": 35.97486991293508, + "learning_rate": 2.34736054530893e-09, + "loss": 0.8958, + "step": 34891 + }, + { + "epoch": 2.9738344839342026, + "grad_norm": 31.09856435116246, + "learning_rate": 2.3321930370745483e-09, + "loss": 1.2659, + "step": 34892 + }, + { + "epoch": 2.973919713628228, + "grad_norm": 26.475491884946376, + "learning_rate": 2.317074678916953e-09, + "loss": 0.8438, + "step": 34893 + }, + { + "epoch": 2.9740049433222535, + "grad_norm": 75.7977400225848, + "learning_rate": 2.302005470984914e-09, + "loss": 1.9203, + "step": 34894 + }, + { + "epoch": 2.974090173016279, + "grad_norm": 42.09270700280233, + "learning_rate": 2.2869854134266455e-09, + "loss": 1.1048, + "step": 34895 + }, + { + "epoch": 2.974175402710304, + "grad_norm": 63.742095072223, + "learning_rate": 2.2720145063903632e-09, + "loss": 2.2334, + "step": 34896 + }, + { + "epoch": 2.97426063240433, + "grad_norm": 26.447209774726993, + "learning_rate": 2.257092750023171e-09, + "loss": 0.9841, + "step": 34897 + }, + { + "epoch": 2.974345862098355, + "grad_norm": 54.86726474937679, + "learning_rate": 2.2422201444710634e-09, + "loss": 1.4233, + "step": 34898 + }, + { + "epoch": 2.9744310917923804, + "grad_norm": 64.83938607213112, + "learning_rate": 2.227396689881145e-09, + "loss": 1.7351, + "step": 34899 + }, + { + "epoch": 2.974516321486406, + "grad_norm": 21.922797032355746, + "learning_rate": 2.2126223863983e-09, + "loss": 0.7229, + "step": 34900 + }, + { + "epoch": 2.9746015511804313, + "grad_norm": 39.32997426741401, + "learning_rate": 2.1978972341690774e-09, + "loss": 0.9168, + "step": 34901 + }, + { + "epoch": 2.974686780874457, + "grad_norm": 33.75704191388286, + "learning_rate": 2.1832212333372516e-09, + "loss": 0.9432, + "step": 34902 + }, + { + "epoch": 2.974772010568482, + "grad_norm": 31.840764649032277, + "learning_rate": 2.1685943840477065e-09, + "loss": 1.1376, + "step": 34903 + }, + { + "epoch": 2.9748572402625073, + "grad_norm": 71.76479169079919, + "learning_rate": 2.1540166864436605e-09, + "loss": 1.7312, + "step": 34904 + }, + { + "epoch": 2.9749424699565328, + "grad_norm": 24.378722708614585, + "learning_rate": 2.1394881406694435e-09, + "loss": 0.8075, + "step": 34905 + }, + { + "epoch": 2.9750276996505582, + "grad_norm": 35.29407414963638, + "learning_rate": 2.1250087468671633e-09, + "loss": 0.9233, + "step": 34906 + }, + { + "epoch": 2.9751129293445837, + "grad_norm": 52.64573443021398, + "learning_rate": 2.1105785051794835e-09, + "loss": 1.4139, + "step": 34907 + }, + { + "epoch": 2.975198159038609, + "grad_norm": 35.89336883473324, + "learning_rate": 2.0961974157479582e-09, + "loss": 1.1018, + "step": 34908 + }, + { + "epoch": 2.9752833887326346, + "grad_norm": 44.22010093677748, + "learning_rate": 2.0818654787146954e-09, + "loss": 1.6701, + "step": 34909 + }, + { + "epoch": 2.9753686184266597, + "grad_norm": 68.99570924037901, + "learning_rate": 2.0675826942201384e-09, + "loss": 1.8735, + "step": 34910 + }, + { + "epoch": 2.975453848120685, + "grad_norm": 55.84252635655033, + "learning_rate": 2.0533490624052854e-09, + "loss": 1.5743, + "step": 34911 + }, + { + "epoch": 2.9755390778147106, + "grad_norm": 28.672304438588455, + "learning_rate": 2.0391645834094697e-09, + "loss": 0.9715, + "step": 34912 + }, + { + "epoch": 2.975624307508736, + "grad_norm": 19.908993685562635, + "learning_rate": 2.0250292573725795e-09, + "loss": 0.7114, + "step": 34913 + }, + { + "epoch": 2.9757095372027615, + "grad_norm": 24.218701801515532, + "learning_rate": 2.010943084433392e-09, + "loss": 0.5801, + "step": 34914 + }, + { + "epoch": 2.9757947668967866, + "grad_norm": 66.33489308971696, + "learning_rate": 1.9969060647306857e-09, + "loss": 2.2184, + "step": 34915 + }, + { + "epoch": 2.9758799965908125, + "grad_norm": 36.765825176341785, + "learning_rate": 1.9829181984021284e-09, + "loss": 1.1683, + "step": 34916 + }, + { + "epoch": 2.9759652262848375, + "grad_norm": 54.68682448986307, + "learning_rate": 1.968979485585942e-09, + "loss": 1.6777, + "step": 34917 + }, + { + "epoch": 2.976050455978863, + "grad_norm": 56.699977719537046, + "learning_rate": 1.955089926418685e-09, + "loss": 1.6951, + "step": 34918 + }, + { + "epoch": 2.9761356856728884, + "grad_norm": 82.56776104620576, + "learning_rate": 1.9412495210369143e-09, + "loss": 2.2442, + "step": 34919 + }, + { + "epoch": 2.976220915366914, + "grad_norm": 43.99173721842884, + "learning_rate": 1.927458269577187e-09, + "loss": 1.4349, + "step": 34920 + }, + { + "epoch": 2.9763061450609394, + "grad_norm": 49.91496615508185, + "learning_rate": 1.913716172174951e-09, + "loss": 1.5107, + "step": 34921 + }, + { + "epoch": 2.9763913747549644, + "grad_norm": 23.65315308030501, + "learning_rate": 1.9000232289650977e-09, + "loss": 0.8409, + "step": 34922 + }, + { + "epoch": 2.97647660444899, + "grad_norm": 54.208430163574356, + "learning_rate": 1.8863794400825198e-09, + "loss": 1.9296, + "step": 34923 + }, + { + "epoch": 2.9765618341430153, + "grad_norm": 35.51420005492157, + "learning_rate": 1.872784805660999e-09, + "loss": 0.9817, + "step": 34924 + }, + { + "epoch": 2.976647063837041, + "grad_norm": 27.520300572637016, + "learning_rate": 1.8592393258348718e-09, + "loss": 1.1368, + "step": 34925 + }, + { + "epoch": 2.9767322935310663, + "grad_norm": 41.41758795270151, + "learning_rate": 1.8457430007373656e-09, + "loss": 1.45, + "step": 34926 + }, + { + "epoch": 2.9768175232250917, + "grad_norm": 48.724789920893784, + "learning_rate": 1.8322958305005967e-09, + "loss": 1.3529, + "step": 34927 + }, + { + "epoch": 2.976902752919117, + "grad_norm": 23.924416034905214, + "learning_rate": 1.8188978152572367e-09, + "loss": 1.2166, + "step": 34928 + }, + { + "epoch": 2.9769879826131422, + "grad_norm": 50.39017359635066, + "learning_rate": 1.8055489551388473e-09, + "loss": 1.1096, + "step": 34929 + }, + { + "epoch": 2.9770732123071677, + "grad_norm": 27.57469826614466, + "learning_rate": 1.7922492502764343e-09, + "loss": 0.7319, + "step": 34930 + }, + { + "epoch": 2.977158442001193, + "grad_norm": 53.04921844036494, + "learning_rate": 1.7789987008015597e-09, + "loss": 1.2483, + "step": 34931 + }, + { + "epoch": 2.9772436716952186, + "grad_norm": 61.46035760966534, + "learning_rate": 1.7657973068441193e-09, + "loss": 1.2707, + "step": 34932 + }, + { + "epoch": 2.977328901389244, + "grad_norm": 23.053381309618054, + "learning_rate": 1.752645068534009e-09, + "loss": 1.1386, + "step": 34933 + }, + { + "epoch": 2.9774141310832696, + "grad_norm": 68.8169684412948, + "learning_rate": 1.739541986000015e-09, + "loss": 1.5948, + "step": 34934 + }, + { + "epoch": 2.977499360777295, + "grad_norm": 41.58448552449139, + "learning_rate": 1.7264880593720334e-09, + "loss": 1.5004, + "step": 34935 + }, + { + "epoch": 2.97758459047132, + "grad_norm": 41.45686041167331, + "learning_rate": 1.7134832887777397e-09, + "loss": 1.092, + "step": 34936 + }, + { + "epoch": 2.9776698201653455, + "grad_norm": 101.74483706110179, + "learning_rate": 1.7005276743453647e-09, + "loss": 1.6433, + "step": 34937 + }, + { + "epoch": 2.977755049859371, + "grad_norm": 49.69978007286574, + "learning_rate": 1.6876212162020288e-09, + "loss": 1.1854, + "step": 34938 + }, + { + "epoch": 2.9778402795533965, + "grad_norm": 22.138918218042072, + "learning_rate": 1.6747639144748529e-09, + "loss": 0.6051, + "step": 34939 + }, + { + "epoch": 2.977925509247422, + "grad_norm": 48.57026347712555, + "learning_rate": 1.661955769290402e-09, + "loss": 1.3741, + "step": 34940 + }, + { + "epoch": 2.978010738941447, + "grad_norm": 63.358948421368176, + "learning_rate": 1.6491967807741316e-09, + "loss": 1.5759, + "step": 34941 + }, + { + "epoch": 2.9780959686354724, + "grad_norm": 27.788417088649126, + "learning_rate": 1.636486949052052e-09, + "loss": 0.7734, + "step": 34942 + }, + { + "epoch": 2.978181198329498, + "grad_norm": 52.2785561328102, + "learning_rate": 1.623826274249063e-09, + "loss": 1.2584, + "step": 34943 + }, + { + "epoch": 2.9782664280235234, + "grad_norm": 50.92283723872245, + "learning_rate": 1.6112147564895098e-09, + "loss": 1.3188, + "step": 34944 + }, + { + "epoch": 2.978351657717549, + "grad_norm": 47.71152949962095, + "learning_rate": 1.5986523958977373e-09, + "loss": 1.402, + "step": 34945 + }, + { + "epoch": 2.9784368874115743, + "grad_norm": 82.90826854775712, + "learning_rate": 1.5861391925964254e-09, + "loss": 2.3386, + "step": 34946 + }, + { + "epoch": 2.9785221171055998, + "grad_norm": 46.61665089507925, + "learning_rate": 1.5736751467099187e-09, + "loss": 1.2103, + "step": 34947 + }, + { + "epoch": 2.978607346799625, + "grad_norm": 37.45604680181912, + "learning_rate": 1.5612602583597868e-09, + "loss": 1.3481, + "step": 34948 + }, + { + "epoch": 2.9786925764936503, + "grad_norm": 65.14007781170386, + "learning_rate": 1.5488945276681544e-09, + "loss": 1.7849, + "step": 34949 + }, + { + "epoch": 2.9787778061876757, + "grad_norm": 64.11526507647302, + "learning_rate": 1.536577954757701e-09, + "loss": 1.6699, + "step": 34950 + }, + { + "epoch": 2.978863035881701, + "grad_norm": 92.53274476417977, + "learning_rate": 1.5243105397483305e-09, + "loss": 2.4319, + "step": 34951 + }, + { + "epoch": 2.9789482655757267, + "grad_norm": 65.88830160116825, + "learning_rate": 1.5120922827610573e-09, + "loss": 1.4948, + "step": 34952 + }, + { + "epoch": 2.979033495269752, + "grad_norm": 63.73797645959684, + "learning_rate": 1.4999231839163408e-09, + "loss": 1.5987, + "step": 34953 + }, + { + "epoch": 2.9791187249637776, + "grad_norm": 72.70595884191874, + "learning_rate": 1.48780324333353e-09, + "loss": 2.3887, + "step": 34954 + }, + { + "epoch": 2.9792039546578026, + "grad_norm": 27.46322284650552, + "learning_rate": 1.4757324611325286e-09, + "loss": 1.0748, + "step": 34955 + }, + { + "epoch": 2.979289184351828, + "grad_norm": 30.31868228596559, + "learning_rate": 1.4637108374310205e-09, + "loss": 1.1569, + "step": 34956 + }, + { + "epoch": 2.9793744140458536, + "grad_norm": 41.41939951866274, + "learning_rate": 1.4517383723477995e-09, + "loss": 1.4336, + "step": 34957 + }, + { + "epoch": 2.979459643739879, + "grad_norm": 26.177009309728152, + "learning_rate": 1.4398150659999943e-09, + "loss": 0.8272, + "step": 34958 + }, + { + "epoch": 2.9795448734339045, + "grad_norm": 30.71670518592938, + "learning_rate": 1.4279409185058435e-09, + "loss": 1.0995, + "step": 34959 + }, + { + "epoch": 2.9796301031279295, + "grad_norm": 44.92570022308172, + "learning_rate": 1.4161159299819204e-09, + "loss": 1.4848, + "step": 34960 + }, + { + "epoch": 2.9797153328219554, + "grad_norm": 27.709108381226827, + "learning_rate": 1.4043401005436886e-09, + "loss": 0.7289, + "step": 34961 + }, + { + "epoch": 2.9798005625159805, + "grad_norm": 18.873551684230964, + "learning_rate": 1.3926134303082761e-09, + "loss": 0.5292, + "step": 34962 + }, + { + "epoch": 2.979885792210006, + "grad_norm": 52.12260038034931, + "learning_rate": 1.3809359193894812e-09, + "loss": 1.0048, + "step": 34963 + }, + { + "epoch": 2.9799710219040314, + "grad_norm": 28.09876913702537, + "learning_rate": 1.369307567903322e-09, + "loss": 1.1146, + "step": 34964 + }, + { + "epoch": 2.980056251598057, + "grad_norm": 34.14084955304309, + "learning_rate": 1.3577283759635963e-09, + "loss": 1.0696, + "step": 34965 + }, + { + "epoch": 2.9801414812920823, + "grad_norm": 58.575767856080844, + "learning_rate": 1.346198343684657e-09, + "loss": 1.7519, + "step": 34966 + }, + { + "epoch": 2.9802267109861074, + "grad_norm": 17.42748270929574, + "learning_rate": 1.334717471179192e-09, + "loss": 0.5554, + "step": 34967 + }, + { + "epoch": 2.980311940680133, + "grad_norm": 30.674213330145896, + "learning_rate": 1.3232857585604442e-09, + "loss": 1.0588, + "step": 34968 + }, + { + "epoch": 2.9803971703741583, + "grad_norm": 37.304386073127226, + "learning_rate": 1.3119032059411009e-09, + "loss": 1.5479, + "step": 34969 + }, + { + "epoch": 2.9804824000681838, + "grad_norm": 81.61231707132379, + "learning_rate": 1.3005698134332944e-09, + "loss": 1.596, + "step": 34970 + }, + { + "epoch": 2.9805676297622092, + "grad_norm": 44.989295346055364, + "learning_rate": 1.2892855811474924e-09, + "loss": 1.4418, + "step": 34971 + }, + { + "epoch": 2.9806528594562347, + "grad_norm": 34.841628177440576, + "learning_rate": 1.2780505091958274e-09, + "loss": 0.7987, + "step": 34972 + }, + { + "epoch": 2.98073808915026, + "grad_norm": 82.59761059974753, + "learning_rate": 1.2668645976876558e-09, + "loss": 1.1312, + "step": 34973 + }, + { + "epoch": 2.980823318844285, + "grad_norm": 45.08222050788898, + "learning_rate": 1.2557278467340007e-09, + "loss": 1.2332, + "step": 34974 + }, + { + "epoch": 2.9809085485383107, + "grad_norm": 74.65900973209516, + "learning_rate": 1.2446402564436632e-09, + "loss": 1.4887, + "step": 34975 + }, + { + "epoch": 2.980993778232336, + "grad_norm": 26.822339275764396, + "learning_rate": 1.2336018269260008e-09, + "loss": 0.7269, + "step": 34976 + }, + { + "epoch": 2.9810790079263616, + "grad_norm": 46.25973620986101, + "learning_rate": 1.2226125582898153e-09, + "loss": 1.3639, + "step": 34977 + }, + { + "epoch": 2.981164237620387, + "grad_norm": 43.37962960965179, + "learning_rate": 1.2116724506427979e-09, + "loss": 0.8909, + "step": 34978 + }, + { + "epoch": 2.981249467314412, + "grad_norm": 52.94250857519528, + "learning_rate": 1.2007815040926406e-09, + "loss": 1.1013, + "step": 34979 + }, + { + "epoch": 2.981334697008438, + "grad_norm": 23.29743477193672, + "learning_rate": 1.18993971874648e-09, + "loss": 1.0555, + "step": 34980 + }, + { + "epoch": 2.981419926702463, + "grad_norm": 64.0926059347287, + "learning_rate": 1.1791470947108975e-09, + "loss": 0.8434, + "step": 34981 + }, + { + "epoch": 2.9815051563964885, + "grad_norm": 52.455266884588, + "learning_rate": 1.168403632091919e-09, + "loss": 1.2267, + "step": 34982 + }, + { + "epoch": 2.981590386090514, + "grad_norm": 71.766427590031, + "learning_rate": 1.157709330995571e-09, + "loss": 1.5447, + "step": 34983 + }, + { + "epoch": 2.9816756157845394, + "grad_norm": 31.93298651872741, + "learning_rate": 1.1470641915267699e-09, + "loss": 1.1131, + "step": 34984 + }, + { + "epoch": 2.981760845478565, + "grad_norm": 21.559568870192845, + "learning_rate": 1.1364682137898764e-09, + "loss": 1.0008, + "step": 34985 + }, + { + "epoch": 2.98184607517259, + "grad_norm": 72.01731473021823, + "learning_rate": 1.1259213978898064e-09, + "loss": 1.3089, + "step": 34986 + }, + { + "epoch": 2.9819313048666154, + "grad_norm": 44.33844786289926, + "learning_rate": 1.115423743929811e-09, + "loss": 1.4389, + "step": 34987 + }, + { + "epoch": 2.982016534560641, + "grad_norm": 20.866498855563286, + "learning_rate": 1.104975252013696e-09, + "loss": 0.8525, + "step": 34988 + }, + { + "epoch": 2.9821017642546663, + "grad_norm": 71.30316790610024, + "learning_rate": 1.0945759222436015e-09, + "loss": 2.2397, + "step": 34989 + }, + { + "epoch": 2.982186993948692, + "grad_norm": 45.65879476435169, + "learning_rate": 1.0842257547216683e-09, + "loss": 1.7487, + "step": 34990 + }, + { + "epoch": 2.9822722236427173, + "grad_norm": 91.56675225374268, + "learning_rate": 1.0739247495500371e-09, + "loss": 1.2416, + "step": 34991 + }, + { + "epoch": 2.9823574533367427, + "grad_norm": 19.811420120707098, + "learning_rate": 1.0636729068302932e-09, + "loss": 0.4681, + "step": 34992 + }, + { + "epoch": 2.9824426830307678, + "grad_norm": 63.35156163496331, + "learning_rate": 1.0534702266629115e-09, + "loss": 1.7983, + "step": 34993 + }, + { + "epoch": 2.9825279127247932, + "grad_norm": 55.40823424359126, + "learning_rate": 1.0433167091478124e-09, + "loss": 1.4815, + "step": 34994 + }, + { + "epoch": 2.9826131424188187, + "grad_norm": 63.97008974676491, + "learning_rate": 1.033212354385471e-09, + "loss": 1.3261, + "step": 34995 + }, + { + "epoch": 2.982698372112844, + "grad_norm": 51.10409789966017, + "learning_rate": 1.0231571624752523e-09, + "loss": 0.9327, + "step": 34996 + }, + { + "epoch": 2.9827836018068696, + "grad_norm": 48.60999197688797, + "learning_rate": 1.013151133515966e-09, + "loss": 1.1495, + "step": 34997 + }, + { + "epoch": 2.9828688315008947, + "grad_norm": 50.76938243060187, + "learning_rate": 1.003194267605867e-09, + "loss": 1.402, + "step": 34998 + }, + { + "epoch": 2.9829540611949206, + "grad_norm": 65.97015507614788, + "learning_rate": 9.932865648432099e-10, + "loss": 1.5924, + "step": 34999 + }, + { + "epoch": 2.9830392908889456, + "grad_norm": 19.965902633967758, + "learning_rate": 9.834280253245842e-10, + "loss": 0.6866, + "step": 35000 + }, + { + "epoch": 2.983124520582971, + "grad_norm": 70.95642287899538, + "learning_rate": 9.736186491476896e-10, + "loss": 1.7672, + "step": 35001 + }, + { + "epoch": 2.9832097502769965, + "grad_norm": 30.68676125770765, + "learning_rate": 9.638584364091152e-10, + "loss": 1.2212, + "step": 35002 + }, + { + "epoch": 2.983294979971022, + "grad_norm": 54.437296333680784, + "learning_rate": 9.541473872043405e-10, + "loss": 1.4448, + "step": 35003 + }, + { + "epoch": 2.9833802096650475, + "grad_norm": 34.38175353405014, + "learning_rate": 9.444855016293997e-10, + "loss": 1.3322, + "step": 35004 + }, + { + "epoch": 2.9834654393590725, + "grad_norm": 32.33261734665293, + "learning_rate": 9.348727797786616e-10, + "loss": 1.0426, + "step": 35005 + }, + { + "epoch": 2.983550669053098, + "grad_norm": 80.5562604450742, + "learning_rate": 9.253092217470506e-10, + "loss": 1.9453, + "step": 35006 + }, + { + "epoch": 2.9836358987471234, + "grad_norm": 75.8711782474964, + "learning_rate": 9.157948276289353e-10, + "loss": 1.2737, + "step": 35007 + }, + { + "epoch": 2.983721128441149, + "grad_norm": 27.410351542015185, + "learning_rate": 9.063295975170195e-10, + "loss": 0.9524, + "step": 35008 + }, + { + "epoch": 2.9838063581351744, + "grad_norm": 36.16606453476186, + "learning_rate": 8.969135315051169e-10, + "loss": 1.2046, + "step": 35009 + }, + { + "epoch": 2.9838915878292, + "grad_norm": 81.01373982556139, + "learning_rate": 8.875466296859314e-10, + "loss": 0.919, + "step": 35010 + }, + { + "epoch": 2.9839768175232253, + "grad_norm": 45.68790002014326, + "learning_rate": 8.782288921510562e-10, + "loss": 1.0974, + "step": 35011 + }, + { + "epoch": 2.9840620472172503, + "grad_norm": 28.167304201477336, + "learning_rate": 8.689603189920848e-10, + "loss": 0.9813, + "step": 35012 + }, + { + "epoch": 2.984147276911276, + "grad_norm": 30.547159985611824, + "learning_rate": 8.597409103006105e-10, + "loss": 1.0108, + "step": 35013 + }, + { + "epoch": 2.9842325066053013, + "grad_norm": 47.47405508996172, + "learning_rate": 8.505706661671164e-10, + "loss": 1.2679, + "step": 35014 + }, + { + "epoch": 2.9843177362993267, + "grad_norm": 70.73281126134934, + "learning_rate": 8.41449586682086e-10, + "loss": 1.4058, + "step": 35015 + }, + { + "epoch": 2.984402965993352, + "grad_norm": 77.82809464947678, + "learning_rate": 8.323776719348919e-10, + "loss": 2.4, + "step": 35016 + }, + { + "epoch": 2.984488195687377, + "grad_norm": 89.8273726362191, + "learning_rate": 8.233549220143521e-10, + "loss": 1.635, + "step": 35017 + }, + { + "epoch": 2.984573425381403, + "grad_norm": 90.97079001089847, + "learning_rate": 8.143813370098397e-10, + "loss": 2.2827, + "step": 35018 + }, + { + "epoch": 2.984658655075428, + "grad_norm": 32.79041627855249, + "learning_rate": 8.054569170096171e-10, + "loss": 1.1607, + "step": 35019 + }, + { + "epoch": 2.9847438847694536, + "grad_norm": 66.10131828950529, + "learning_rate": 7.965816621008371e-10, + "loss": 2.0076, + "step": 35020 + }, + { + "epoch": 2.984829114463479, + "grad_norm": 42.04023179455758, + "learning_rate": 7.877555723717622e-10, + "loss": 0.8921, + "step": 35021 + }, + { + "epoch": 2.9849143441575046, + "grad_norm": 54.880011579589635, + "learning_rate": 7.789786479078798e-10, + "loss": 1.532, + "step": 35022 + }, + { + "epoch": 2.98499957385153, + "grad_norm": 52.4448352617034, + "learning_rate": 7.702508887968974e-10, + "loss": 1.3525, + "step": 35023 + }, + { + "epoch": 2.985084803545555, + "grad_norm": 38.72285272993238, + "learning_rate": 7.61572295123747e-10, + "loss": 0.8751, + "step": 35024 + }, + { + "epoch": 2.9851700332395805, + "grad_norm": 22.256766358394636, + "learning_rate": 7.529428669739158e-10, + "loss": 0.7017, + "step": 35025 + }, + { + "epoch": 2.985255262933606, + "grad_norm": 79.57064119774749, + "learning_rate": 7.44362604432336e-10, + "loss": 1.6379, + "step": 35026 + }, + { + "epoch": 2.9853404926276315, + "grad_norm": 67.99113843905918, + "learning_rate": 7.358315075833844e-10, + "loss": 1.8098, + "step": 35027 + }, + { + "epoch": 2.985425722321657, + "grad_norm": 28.343285891870856, + "learning_rate": 7.27349576511438e-10, + "loss": 0.7459, + "step": 35028 + }, + { + "epoch": 2.9855109520156824, + "grad_norm": 67.02440315058192, + "learning_rate": 7.189168112992084e-10, + "loss": 1.4278, + "step": 35029 + }, + { + "epoch": 2.985596181709708, + "grad_norm": 40.39349998735982, + "learning_rate": 7.105332120299624e-10, + "loss": 1.4419, + "step": 35030 + }, + { + "epoch": 2.985681411403733, + "grad_norm": 57.65774374198568, + "learning_rate": 7.021987787864115e-10, + "loss": 1.5114, + "step": 35031 + }, + { + "epoch": 2.9857666410977584, + "grad_norm": 59.92600692241741, + "learning_rate": 6.939135116496021e-10, + "loss": 1.2922, + "step": 35032 + }, + { + "epoch": 2.985851870791784, + "grad_norm": 41.26733030667441, + "learning_rate": 6.856774107022457e-10, + "loss": 1.1125, + "step": 35033 + }, + { + "epoch": 2.9859371004858093, + "grad_norm": 75.89790393439621, + "learning_rate": 6.774904760242785e-10, + "loss": 1.4883, + "step": 35034 + }, + { + "epoch": 2.9860223301798348, + "grad_norm": 28.008217308570828, + "learning_rate": 6.693527076967465e-10, + "loss": 0.9396, + "step": 35035 + }, + { + "epoch": 2.98610755987386, + "grad_norm": 18.923243346404547, + "learning_rate": 6.612641057995861e-10, + "loss": 0.5902, + "step": 35036 + }, + { + "epoch": 2.9861927895678857, + "grad_norm": 69.34760981402502, + "learning_rate": 6.532246704127332e-10, + "loss": 1.5018, + "step": 35037 + }, + { + "epoch": 2.9862780192619107, + "grad_norm": 49.8183095112308, + "learning_rate": 6.452344016144585e-10, + "loss": 1.8258, + "step": 35038 + }, + { + "epoch": 2.986363248955936, + "grad_norm": 54.326519632416066, + "learning_rate": 6.372932994835878e-10, + "loss": 1.8717, + "step": 35039 + }, + { + "epoch": 2.9864484786499617, + "grad_norm": 38.99570852638375, + "learning_rate": 6.294013640989472e-10, + "loss": 1.2618, + "step": 35040 + }, + { + "epoch": 2.986533708343987, + "grad_norm": 74.29957419716054, + "learning_rate": 6.215585955371417e-10, + "loss": 1.9166, + "step": 35041 + }, + { + "epoch": 2.9866189380380126, + "grad_norm": 43.08180935187237, + "learning_rate": 6.137649938758872e-10, + "loss": 0.7154, + "step": 35042 + }, + { + "epoch": 2.9867041677320376, + "grad_norm": 49.47990977623423, + "learning_rate": 6.06020559191789e-10, + "loss": 1.8933, + "step": 35043 + }, + { + "epoch": 2.986789397426063, + "grad_norm": 44.75892001332661, + "learning_rate": 5.983252915603422e-10, + "loss": 1.386, + "step": 35044 + }, + { + "epoch": 2.9868746271200886, + "grad_norm": 45.51033430509543, + "learning_rate": 5.906791910587073e-10, + "loss": 1.1779, + "step": 35045 + }, + { + "epoch": 2.986959856814114, + "grad_norm": 62.6371264604264, + "learning_rate": 5.830822577607143e-10, + "loss": 0.9639, + "step": 35046 + }, + { + "epoch": 2.9870450865081395, + "grad_norm": 80.17636639934986, + "learning_rate": 5.75534491741303e-10, + "loss": 1.5874, + "step": 35047 + }, + { + "epoch": 2.987130316202165, + "grad_norm": 78.63366112593977, + "learning_rate": 5.680358930754137e-10, + "loss": 1.2019, + "step": 35048 + }, + { + "epoch": 2.9872155458961904, + "grad_norm": 55.65462488049252, + "learning_rate": 5.605864618357659e-10, + "loss": 1.7147, + "step": 35049 + }, + { + "epoch": 2.9873007755902155, + "grad_norm": 25.90157421876659, + "learning_rate": 5.531861980967445e-10, + "loss": 0.8066, + "step": 35050 + }, + { + "epoch": 2.987386005284241, + "grad_norm": 55.127982243902544, + "learning_rate": 5.45835101929959e-10, + "loss": 1.9893, + "step": 35051 + }, + { + "epoch": 2.9874712349782664, + "grad_norm": 62.32777256886929, + "learning_rate": 5.385331734086841e-10, + "loss": 1.5363, + "step": 35052 + }, + { + "epoch": 2.987556464672292, + "grad_norm": 50.11158815896454, + "learning_rate": 5.31280412604529e-10, + "loss": 0.9527, + "step": 35053 + }, + { + "epoch": 2.9876416943663173, + "grad_norm": 34.71820466162808, + "learning_rate": 5.240768195885482e-10, + "loss": 0.9281, + "step": 35054 + }, + { + "epoch": 2.9877269240603423, + "grad_norm": 95.71971240819812, + "learning_rate": 5.169223944312407e-10, + "loss": 2.3086, + "step": 35055 + }, + { + "epoch": 2.9878121537543683, + "grad_norm": 53.7600279560289, + "learning_rate": 5.098171372036609e-10, + "loss": 0.9739, + "step": 35056 + }, + { + "epoch": 2.9878973834483933, + "grad_norm": 32.79673122658012, + "learning_rate": 5.027610479757527e-10, + "loss": 1.0216, + "step": 35057 + }, + { + "epoch": 2.9879826131424188, + "grad_norm": 22.889960481811247, + "learning_rate": 4.957541268157951e-10, + "loss": 0.9618, + "step": 35058 + }, + { + "epoch": 2.988067842836444, + "grad_norm": 51.61846140334499, + "learning_rate": 4.88796373794287e-10, + "loss": 1.2199, + "step": 35059 + }, + { + "epoch": 2.9881530725304697, + "grad_norm": 25.16679444619579, + "learning_rate": 4.818877889783968e-10, + "loss": 0.7611, + "step": 35060 + }, + { + "epoch": 2.988238302224495, + "grad_norm": 64.72096114177212, + "learning_rate": 4.750283724369586e-10, + "loss": 1.736, + "step": 35061 + }, + { + "epoch": 2.98832353191852, + "grad_norm": 47.113721788656385, + "learning_rate": 4.682181242365858e-10, + "loss": 0.9236, + "step": 35062 + }, + { + "epoch": 2.9884087616125456, + "grad_norm": 46.23131006094191, + "learning_rate": 4.614570444450017e-10, + "loss": 1.2216, + "step": 35063 + }, + { + "epoch": 2.988493991306571, + "grad_norm": 47.06429944919927, + "learning_rate": 4.547451331277097e-10, + "loss": 1.4127, + "step": 35064 + }, + { + "epoch": 2.9885792210005966, + "grad_norm": 68.26224835203257, + "learning_rate": 4.480823903518783e-10, + "loss": 1.7744, + "step": 35065 + }, + { + "epoch": 2.988664450694622, + "grad_norm": 52.903295352512615, + "learning_rate": 4.4146881618245543e-10, + "loss": 1.6579, + "step": 35066 + }, + { + "epoch": 2.9887496803886475, + "grad_norm": 65.96635386703254, + "learning_rate": 4.349044106849443e-10, + "loss": 1.325, + "step": 35067 + }, + { + "epoch": 2.988834910082673, + "grad_norm": 47.68149382921193, + "learning_rate": 4.283891739231827e-10, + "loss": 1.0106, + "step": 35068 + }, + { + "epoch": 2.988920139776698, + "grad_norm": 73.71350179587206, + "learning_rate": 4.219231059615636e-10, + "loss": 1.9223, + "step": 35069 + }, + { + "epoch": 2.9890053694707235, + "grad_norm": 20.071965782240724, + "learning_rate": 4.1550620686336975e-10, + "loss": 0.5524, + "step": 35070 + }, + { + "epoch": 2.989090599164749, + "grad_norm": 75.84247643077737, + "learning_rate": 4.091384766924389e-10, + "loss": 1.1192, + "step": 35071 + }, + { + "epoch": 2.9891758288587744, + "grad_norm": 54.55143211462416, + "learning_rate": 4.0281991551094356e-10, + "loss": 1.7915, + "step": 35072 + }, + { + "epoch": 2.9892610585528, + "grad_norm": 75.58996861482791, + "learning_rate": 3.9655052338105627e-10, + "loss": 1.7678, + "step": 35073 + }, + { + "epoch": 2.9893462882468254, + "grad_norm": 55.13159369639053, + "learning_rate": 3.9033030036439434e-10, + "loss": 1.7235, + "step": 35074 + }, + { + "epoch": 2.989431517940851, + "grad_norm": 39.14089708557057, + "learning_rate": 3.8415924652202007e-10, + "loss": 1.326, + "step": 35075 + }, + { + "epoch": 2.989516747634876, + "grad_norm": 60.98504064040095, + "learning_rate": 3.7803736191499575e-10, + "loss": 1.0894, + "step": 35076 + }, + { + "epoch": 2.9896019773289013, + "grad_norm": 50.08603921365286, + "learning_rate": 3.7196464660327335e-10, + "loss": 0.8838, + "step": 35077 + }, + { + "epoch": 2.989687207022927, + "grad_norm": 52.62474806500384, + "learning_rate": 3.65941100646805e-10, + "loss": 0.9667, + "step": 35078 + }, + { + "epoch": 2.9897724367169523, + "grad_norm": 61.5572625973797, + "learning_rate": 3.599667241044325e-10, + "loss": 1.8813, + "step": 35079 + }, + { + "epoch": 2.9898576664109777, + "grad_norm": 99.67838046347374, + "learning_rate": 3.5404151703499756e-10, + "loss": 2.0962, + "step": 35080 + }, + { + "epoch": 2.9899428961050027, + "grad_norm": 121.82100729372183, + "learning_rate": 3.481654794973421e-10, + "loss": 2.442, + "step": 35081 + }, + { + "epoch": 2.9900281257990287, + "grad_norm": 36.52102893143139, + "learning_rate": 3.423386115480876e-10, + "loss": 1.3612, + "step": 35082 + }, + { + "epoch": 2.9901133554930537, + "grad_norm": 48.259769155863005, + "learning_rate": 3.3656091324607566e-10, + "loss": 0.8755, + "step": 35083 + }, + { + "epoch": 2.990198585187079, + "grad_norm": 56.29685878827164, + "learning_rate": 3.3083238464681757e-10, + "loss": 2.1044, + "step": 35084 + }, + { + "epoch": 2.9902838148811046, + "grad_norm": 35.79583612279934, + "learning_rate": 3.251530258074898e-10, + "loss": 1.0798, + "step": 35085 + }, + { + "epoch": 2.99036904457513, + "grad_norm": 66.95305634000425, + "learning_rate": 3.195228367830483e-10, + "loss": 1.5763, + "step": 35086 + }, + { + "epoch": 2.9904542742691556, + "grad_norm": 38.29177707058357, + "learning_rate": 3.1394181763011456e-10, + "loss": 1.3656, + "step": 35087 + }, + { + "epoch": 2.9905395039631806, + "grad_norm": 44.201709904468345, + "learning_rate": 3.084099684025343e-10, + "loss": 0.7326, + "step": 35088 + }, + { + "epoch": 2.990624733657206, + "grad_norm": 66.4418958080838, + "learning_rate": 3.029272891547086e-10, + "loss": 2.0032, + "step": 35089 + }, + { + "epoch": 2.9907099633512315, + "grad_norm": 40.62320124271336, + "learning_rate": 2.9749377994159335e-10, + "loss": 1.0135, + "step": 35090 + }, + { + "epoch": 2.990795193045257, + "grad_norm": 33.637526231291446, + "learning_rate": 2.9210944081536907e-10, + "loss": 0.8428, + "step": 35091 + }, + { + "epoch": 2.9908804227392825, + "grad_norm": 34.92174040277258, + "learning_rate": 2.867742718298816e-10, + "loss": 0.6765, + "step": 35092 + }, + { + "epoch": 2.990965652433308, + "grad_norm": 70.52316248071426, + "learning_rate": 2.8148827303731144e-10, + "loss": 1.5844, + "step": 35093 + }, + { + "epoch": 2.9910508821273334, + "grad_norm": 46.61180148270367, + "learning_rate": 2.762514444892839e-10, + "loss": 1.5509, + "step": 35094 + }, + { + "epoch": 2.9911361118213584, + "grad_norm": 30.080368606849337, + "learning_rate": 2.7106378623797944e-10, + "loss": 0.8384, + "step": 35095 + }, + { + "epoch": 2.991221341515384, + "grad_norm": 51.51764800335255, + "learning_rate": 2.659252983339133e-10, + "loss": 1.4959, + "step": 35096 + }, + { + "epoch": 2.9913065712094093, + "grad_norm": 35.022914010577814, + "learning_rate": 2.6083598082760065e-10, + "loss": 0.9875, + "step": 35097 + }, + { + "epoch": 2.991391800903435, + "grad_norm": 57.21429843646934, + "learning_rate": 2.5579583376955653e-10, + "loss": 1.6696, + "step": 35098 + }, + { + "epoch": 2.9914770305974603, + "grad_norm": 42.07830337496035, + "learning_rate": 2.508048572091859e-10, + "loss": 1.2462, + "step": 35099 + }, + { + "epoch": 2.9915622602914853, + "grad_norm": 80.69415572551132, + "learning_rate": 2.458630511953386e-10, + "loss": 1.1128, + "step": 35100 + }, + { + "epoch": 2.991647489985511, + "grad_norm": 49.182322917157386, + "learning_rate": 2.409704157763093e-10, + "loss": 1.3427, + "step": 35101 + }, + { + "epoch": 2.9917327196795362, + "grad_norm": 24.71981599736862, + "learning_rate": 2.3612695100094784e-10, + "loss": 0.6844, + "step": 35102 + }, + { + "epoch": 2.9918179493735617, + "grad_norm": 57.8610772537429, + "learning_rate": 2.313326569164387e-10, + "loss": 0.9142, + "step": 35103 + }, + { + "epoch": 2.991903179067587, + "grad_norm": 72.5131958269201, + "learning_rate": 2.2658753357052144e-10, + "loss": 1.6549, + "step": 35104 + }, + { + "epoch": 2.9919884087616127, + "grad_norm": 40.8864643302271, + "learning_rate": 2.2189158100871522e-10, + "loss": 1.3783, + "step": 35105 + }, + { + "epoch": 2.992073638455638, + "grad_norm": 45.53679997022924, + "learning_rate": 2.1724479927820452e-10, + "loss": 1.5142, + "step": 35106 + }, + { + "epoch": 2.992158868149663, + "grad_norm": 70.97355412288857, + "learning_rate": 2.1264718842450849e-10, + "loss": 2.1953, + "step": 35107 + }, + { + "epoch": 2.9922440978436886, + "grad_norm": 33.49835162755679, + "learning_rate": 2.0809874849259116e-10, + "loss": 0.9474, + "step": 35108 + }, + { + "epoch": 2.992329327537714, + "grad_norm": 46.096113822054, + "learning_rate": 2.0359947952741655e-10, + "loss": 1.2084, + "step": 35109 + }, + { + "epoch": 2.9924145572317395, + "grad_norm": 35.74090181731171, + "learning_rate": 1.9914938157283847e-10, + "loss": 0.7403, + "step": 35110 + }, + { + "epoch": 2.992499786925765, + "grad_norm": 65.03847706273294, + "learning_rate": 1.9474845467326587e-10, + "loss": 1.6332, + "step": 35111 + }, + { + "epoch": 2.9925850166197905, + "grad_norm": 32.59209969990097, + "learning_rate": 1.903966988714423e-10, + "loss": 1.023, + "step": 35112 + }, + { + "epoch": 2.992670246313816, + "grad_norm": 28.973082618317576, + "learning_rate": 1.8609411421011136e-10, + "loss": 0.679, + "step": 35113 + }, + { + "epoch": 2.992755476007841, + "grad_norm": 29.629322917573162, + "learning_rate": 1.8184070073201664e-10, + "loss": 0.9778, + "step": 35114 + }, + { + "epoch": 2.9928407057018664, + "grad_norm": 38.56927659272277, + "learning_rate": 1.7763645847879153e-10, + "loss": 1.1252, + "step": 35115 + }, + { + "epoch": 2.992925935395892, + "grad_norm": 73.02644306923972, + "learning_rate": 1.7348138749151422e-10, + "loss": 1.7888, + "step": 35116 + }, + { + "epoch": 2.9930111650899174, + "grad_norm": 74.57140180379808, + "learning_rate": 1.693754878118181e-10, + "loss": 1.5467, + "step": 35117 + }, + { + "epoch": 2.993096394783943, + "grad_norm": 40.89393620710989, + "learning_rate": 1.6531875947911613e-10, + "loss": 1.1502, + "step": 35118 + }, + { + "epoch": 2.993181624477968, + "grad_norm": 75.59848135815359, + "learning_rate": 1.613112025339314e-10, + "loss": 1.5763, + "step": 35119 + }, + { + "epoch": 2.993266854171994, + "grad_norm": 34.88343107055858, + "learning_rate": 1.5735281701567685e-10, + "loss": 1.1486, + "step": 35120 + }, + { + "epoch": 2.993352083866019, + "grad_norm": 50.5244945637659, + "learning_rate": 1.5344360296265516e-10, + "loss": 0.8049, + "step": 35121 + }, + { + "epoch": 2.9934373135600443, + "grad_norm": 27.83979006882251, + "learning_rate": 1.4958356041427925e-10, + "loss": 1.0717, + "step": 35122 + }, + { + "epoch": 2.9935225432540697, + "grad_norm": 76.17576570156741, + "learning_rate": 1.457726894077416e-10, + "loss": 2.054, + "step": 35123 + }, + { + "epoch": 2.993607772948095, + "grad_norm": 43.16989477198148, + "learning_rate": 1.4201098998078976e-10, + "loss": 0.9281, + "step": 35124 + }, + { + "epoch": 2.9936930026421207, + "grad_norm": 47.66264050805299, + "learning_rate": 1.3829846217061627e-10, + "loss": 1.5052, + "step": 35125 + }, + { + "epoch": 2.9937782323361457, + "grad_norm": 40.27543687486282, + "learning_rate": 1.346351060133033e-10, + "loss": 1.7582, + "step": 35126 + }, + { + "epoch": 2.993863462030171, + "grad_norm": 45.26050426625966, + "learning_rate": 1.310209215454883e-10, + "loss": 1.0196, + "step": 35127 + }, + { + "epoch": 2.9939486917241966, + "grad_norm": 66.830529096263, + "learning_rate": 1.274559088015881e-10, + "loss": 1.5438, + "step": 35128 + }, + { + "epoch": 2.994033921418222, + "grad_norm": 21.956770194427996, + "learning_rate": 1.239400678182401e-10, + "loss": 0.8991, + "step": 35129 + }, + { + "epoch": 2.9941191511122476, + "grad_norm": 29.451650583292285, + "learning_rate": 1.2047339862875095e-10, + "loss": 1.2037, + "step": 35130 + }, + { + "epoch": 2.994204380806273, + "grad_norm": 56.16283744499078, + "learning_rate": 1.1705590126753762e-10, + "loss": 1.5823, + "step": 35131 + }, + { + "epoch": 2.9942896105002985, + "grad_norm": 56.80505090209937, + "learning_rate": 1.136875757684619e-10, + "loss": 2.2075, + "step": 35132 + }, + { + "epoch": 2.9943748401943235, + "grad_norm": 38.21883558731862, + "learning_rate": 1.1036842216427535e-10, + "loss": 1.1565, + "step": 35133 + }, + { + "epoch": 2.994460069888349, + "grad_norm": 27.3959051510979, + "learning_rate": 1.0709844048828467e-10, + "loss": 0.9629, + "step": 35134 + }, + { + "epoch": 2.9945452995823745, + "grad_norm": 29.962636817634284, + "learning_rate": 1.038776307721312e-10, + "loss": 1.0956, + "step": 35135 + }, + { + "epoch": 2.9946305292764, + "grad_norm": 36.21814734738153, + "learning_rate": 1.0070599304745632e-10, + "loss": 1.0273, + "step": 35136 + }, + { + "epoch": 2.9947157589704254, + "grad_norm": 20.569292718192084, + "learning_rate": 9.758352734534627e-11, + "loss": 0.6984, + "step": 35137 + }, + { + "epoch": 2.9948009886644504, + "grad_norm": 18.18974176427422, + "learning_rate": 9.451023369744239e-11, + "loss": 0.4725, + "step": 35138 + }, + { + "epoch": 2.9948862183584763, + "grad_norm": 35.095341276784055, + "learning_rate": 9.14861121326105e-11, + "loss": 1.1724, + "step": 35139 + }, + { + "epoch": 2.9949714480525014, + "grad_norm": 25.98536871001818, + "learning_rate": 8.851116268138172e-11, + "loss": 0.969, + "step": 35140 + }, + { + "epoch": 2.995056677746527, + "grad_norm": 21.138166766059552, + "learning_rate": 8.558538537262184e-11, + "loss": 0.7611, + "step": 35141 + }, + { + "epoch": 2.9951419074405523, + "grad_norm": 68.26831627277227, + "learning_rate": 8.27087802357518e-11, + "loss": 2.119, + "step": 35142 + }, + { + "epoch": 2.9952271371345778, + "grad_norm": 32.48204216014952, + "learning_rate": 7.988134729852714e-11, + "loss": 0.9698, + "step": 35143 + }, + { + "epoch": 2.9953123668286032, + "grad_norm": 27.661123411352683, + "learning_rate": 7.710308658925858e-11, + "loss": 0.756, + "step": 35144 + }, + { + "epoch": 2.9953975965226283, + "grad_norm": 66.3448239810435, + "learning_rate": 7.437399813459145e-11, + "loss": 1.6257, + "step": 35145 + }, + { + "epoch": 2.9954828262166537, + "grad_norm": 31.759649109850887, + "learning_rate": 7.169408196172623e-11, + "loss": 1.1156, + "step": 35146 + }, + { + "epoch": 2.995568055910679, + "grad_norm": 177.8769230722783, + "learning_rate": 6.906333809675314e-11, + "loss": 2.8163, + "step": 35147 + }, + { + "epoch": 2.9956532856047047, + "grad_norm": 37.98394415555782, + "learning_rate": 6.648176656576244e-11, + "loss": 0.7574, + "step": 35148 + }, + { + "epoch": 2.99573851529873, + "grad_norm": 84.04158274098421, + "learning_rate": 6.394936739484436e-11, + "loss": 1.8864, + "step": 35149 + }, + { + "epoch": 2.9958237449927556, + "grad_norm": 40.21870181850901, + "learning_rate": 6.146614060731359e-11, + "loss": 1.0757, + "step": 35150 + }, + { + "epoch": 2.995908974686781, + "grad_norm": 53.587182987197586, + "learning_rate": 5.903208622870527e-11, + "loss": 1.7092, + "step": 35151 + }, + { + "epoch": 2.995994204380806, + "grad_norm": 36.755620511901654, + "learning_rate": 5.6647204282889166e-11, + "loss": 0.6743, + "step": 35152 + }, + { + "epoch": 2.9960794340748316, + "grad_norm": 26.550441592242443, + "learning_rate": 5.431149479317999e-11, + "loss": 0.8167, + "step": 35153 + }, + { + "epoch": 2.996164663768857, + "grad_norm": 93.70900253783871, + "learning_rate": 5.2024957782892403e-11, + "loss": 2.3331, + "step": 35154 + }, + { + "epoch": 2.9962498934628825, + "grad_norm": 65.76901413754939, + "learning_rate": 4.978759327367577e-11, + "loss": 1.4285, + "step": 35155 + }, + { + "epoch": 2.996335123156908, + "grad_norm": 53.43061936895685, + "learning_rate": 4.759940128828966e-11, + "loss": 1.1798, + "step": 35156 + }, + { + "epoch": 2.996420352850933, + "grad_norm": 30.917470629147488, + "learning_rate": 4.5460381847828305e-11, + "loss": 0.9036, + "step": 35157 + }, + { + "epoch": 2.996505582544959, + "grad_norm": 42.86101625409972, + "learning_rate": 4.337053497338595e-11, + "loss": 1.3297, + "step": 35158 + }, + { + "epoch": 2.996590812238984, + "grad_norm": 61.43306619703081, + "learning_rate": 4.132986068550171e-11, + "loss": 1.5725, + "step": 35159 + }, + { + "epoch": 2.9966760419330094, + "grad_norm": 53.08902836330102, + "learning_rate": 3.933835900471472e-11, + "loss": 1.9552, + "step": 35160 + }, + { + "epoch": 2.996761271627035, + "grad_norm": 43.93346095169384, + "learning_rate": 3.739602994989877e-11, + "loss": 1.3743, + "step": 35161 + }, + { + "epoch": 2.9968465013210603, + "grad_norm": 27.458319233244573, + "learning_rate": 3.5502873540482763e-11, + "loss": 0.4939, + "step": 35162 + }, + { + "epoch": 2.996931731015086, + "grad_norm": 95.39912649646533, + "learning_rate": 3.365888979534049e-11, + "loss": 2.6595, + "step": 35163 + }, + { + "epoch": 2.997016960709111, + "grad_norm": 20.534802149470924, + "learning_rate": 3.186407873223551e-11, + "loss": 0.703, + "step": 35164 + }, + { + "epoch": 2.9971021904031363, + "grad_norm": 71.20702615532906, + "learning_rate": 3.011844036893141e-11, + "loss": 1.1144, + "step": 35165 + }, + { + "epoch": 2.9971874200971618, + "grad_norm": 42.03911848769669, + "learning_rate": 2.8421974723191747e-11, + "loss": 1.4425, + "step": 35166 + }, + { + "epoch": 2.9972726497911872, + "grad_norm": 28.44890474685689, + "learning_rate": 2.6774681810559645e-11, + "loss": 1.2536, + "step": 35167 + }, + { + "epoch": 2.9973578794852127, + "grad_norm": 73.76341354743171, + "learning_rate": 2.5176561647688448e-11, + "loss": 1.5455, + "step": 35168 + }, + { + "epoch": 2.997443109179238, + "grad_norm": 30.987780609267723, + "learning_rate": 2.362761425067639e-11, + "loss": 0.6835, + "step": 35169 + }, + { + "epoch": 2.9975283388732636, + "grad_norm": 33.54343227671455, + "learning_rate": 2.2127839634511483e-11, + "loss": 1.2295, + "step": 35170 + }, + { + "epoch": 2.9976135685672887, + "grad_norm": 61.02599074973226, + "learning_rate": 2.0677237813626626e-11, + "loss": 2.0397, + "step": 35171 + }, + { + "epoch": 2.997698798261314, + "grad_norm": 49.46888058489625, + "learning_rate": 1.9275808803009833e-11, + "loss": 1.3648, + "step": 35172 + }, + { + "epoch": 2.9977840279553396, + "grad_norm": 63.75830628893182, + "learning_rate": 1.7923552615983776e-11, + "loss": 1.2559, + "step": 35173 + }, + { + "epoch": 2.997869257649365, + "grad_norm": 52.002128093846196, + "learning_rate": 1.662046926531602e-11, + "loss": 1.1394, + "step": 35174 + }, + { + "epoch": 2.9979544873433905, + "grad_norm": 66.97301213260964, + "learning_rate": 1.5366558764884353e-11, + "loss": 1.7716, + "step": 35175 + }, + { + "epoch": 2.9980397170374156, + "grad_norm": 52.96311747192122, + "learning_rate": 1.416182112634612e-11, + "loss": 1.1845, + "step": 35176 + }, + { + "epoch": 2.9981249467314415, + "grad_norm": 47.261501884944565, + "learning_rate": 1.3006256361913772e-11, + "loss": 1.2412, + "step": 35177 + }, + { + "epoch": 2.9982101764254665, + "grad_norm": 34.16141674100308, + "learning_rate": 1.1899864482689538e-11, + "loss": 1.0344, + "step": 35178 + }, + { + "epoch": 2.998295406119492, + "grad_norm": 55.170200339297025, + "learning_rate": 1.084264549977565e-11, + "loss": 1.4471, + "step": 35179 + }, + { + "epoch": 2.9983806358135174, + "grad_norm": 33.91247873857371, + "learning_rate": 9.834599423164114e-12, + "loss": 0.6275, + "step": 35180 + }, + { + "epoch": 2.998465865507543, + "grad_norm": 68.44906062601183, + "learning_rate": 8.87572626284694e-12, + "loss": 1.31, + "step": 35181 + }, + { + "epoch": 2.9985510952015684, + "grad_norm": 60.36405176212703, + "learning_rate": 7.966026028816132e-12, + "loss": 1.5137, + "step": 35182 + }, + { + "epoch": 2.9986363248955934, + "grad_norm": 60.4131886987483, + "learning_rate": 7.105498729398364e-12, + "loss": 1.3141, + "step": 35183 + }, + { + "epoch": 2.998721554589619, + "grad_norm": 40.099042415441644, + "learning_rate": 6.2941443734754195e-12, + "loss": 1.0652, + "step": 35184 + }, + { + "epoch": 2.9988067842836443, + "grad_norm": 57.39755275052752, + "learning_rate": 5.5319629688188604e-12, + "loss": 1.7535, + "step": 35185 + }, + { + "epoch": 2.99889201397767, + "grad_norm": 74.89543334733843, + "learning_rate": 4.818954523200248e-12, + "loss": 1.5617, + "step": 35186 + }, + { + "epoch": 2.9989772436716953, + "grad_norm": 69.21863711116576, + "learning_rate": 4.1551190427258085e-12, + "loss": 2.003, + "step": 35187 + }, + { + "epoch": 2.9990624733657207, + "grad_norm": 60.013910688826776, + "learning_rate": 3.5404565351671027e-12, + "loss": 1.279, + "step": 35188 + }, + { + "epoch": 2.999147703059746, + "grad_norm": 52.3445651084111, + "learning_rate": 2.9749670055201354e-12, + "loss": 2.1073, + "step": 35189 + }, + { + "epoch": 2.9992329327537712, + "grad_norm": 38.83591947134333, + "learning_rate": 2.4586504604462435e-12, + "loss": 0.7686, + "step": 35190 + }, + { + "epoch": 2.9993181624477967, + "grad_norm": 84.24394258497509, + "learning_rate": 1.9915069038312087e-12, + "loss": 2.5505, + "step": 35191 + }, + { + "epoch": 2.999403392141822, + "grad_norm": 33.370742946016776, + "learning_rate": 1.573536341226145e-12, + "loss": 1.0927, + "step": 35192 + }, + { + "epoch": 2.9994886218358476, + "grad_norm": 85.02344505823878, + "learning_rate": 1.2047387759617223e-12, + "loss": 1.3491, + "step": 35193 + }, + { + "epoch": 2.999573851529873, + "grad_norm": 69.84645543482148, + "learning_rate": 8.85114211923721e-13, + "loss": 1.375, + "step": 35194 + }, + { + "epoch": 2.9996590812238986, + "grad_norm": 57.56274758144193, + "learning_rate": 6.146626529979216e-13, + "loss": 1.2851, + "step": 35195 + }, + { + "epoch": 2.999744310917924, + "grad_norm": 31.883931228201092, + "learning_rate": 3.933841008496586e-13, + "loss": 0.5153, + "step": 35196 + }, + { + "epoch": 2.999829540611949, + "grad_norm": 48.91164285386988, + "learning_rate": 2.2127855769937812e-13, + "loss": 1.3367, + "step": 35197 + }, + { + "epoch": 2.9999147703059745, + "grad_norm": 61.29211276624585, + "learning_rate": 9.834602632263767e-14, + "loss": 1.4099, + "step": 35198 + }, + { + "epoch": 3.0, + "grad_norm": 74.29104973942152, + "learning_rate": 2.4586506719437296e-14, + "loss": 1.6246, + "step": 35199 + }, + { + "epoch": 3.0, + "step": 35199, + "total_flos": 7110251765760.0, + "train_loss": 2.7040989276763305, + "train_runtime": 23717.1272, + "train_samples_per_second": 2.968, + "train_steps_per_second": 1.484 + } + ], + "logging_steps": 1, + "max_steps": 35199, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7110251765760.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}