{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1845, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016276703967446592, "grad_norm": 18.405115391571353, "learning_rate": 0.0, "loss": 4.595, "step": 1 }, { "epoch": 0.0032553407934893183, "grad_norm": 19.37369081953676, "learning_rate": 5.405405405405406e-08, "loss": 4.5815, "step": 2 }, { "epoch": 0.0048830111902339775, "grad_norm": 20.70434264473797, "learning_rate": 1.0810810810810812e-07, "loss": 4.8054, "step": 3 }, { "epoch": 0.006510681586978637, "grad_norm": 21.816331534686014, "learning_rate": 1.6216216216216218e-07, "loss": 4.6383, "step": 4 }, { "epoch": 0.008138351983723296, "grad_norm": 16.539995004423393, "learning_rate": 2.1621621621621625e-07, "loss": 4.3864, "step": 5 }, { "epoch": 0.009766022380467955, "grad_norm": 17.911277312556553, "learning_rate": 2.702702702702703e-07, "loss": 4.6036, "step": 6 }, { "epoch": 0.011393692777212614, "grad_norm": 16.861289356194984, "learning_rate": 3.2432432432432436e-07, "loss": 4.4994, "step": 7 }, { "epoch": 0.013021363173957273, "grad_norm": 19.286825653248293, "learning_rate": 3.7837837837837843e-07, "loss": 4.6015, "step": 8 }, { "epoch": 0.014649033570701932, "grad_norm": 17.848718457843354, "learning_rate": 4.324324324324325e-07, "loss": 4.4643, "step": 9 }, { "epoch": 0.01627670396744659, "grad_norm": 19.209733625974366, "learning_rate": 4.864864864864865e-07, "loss": 4.3829, "step": 10 }, { "epoch": 0.01790437436419125, "grad_norm": 17.52433235238143, "learning_rate": 5.405405405405406e-07, "loss": 4.5602, "step": 11 }, { "epoch": 0.01953204476093591, "grad_norm": 18.900155698302324, "learning_rate": 5.945945945945947e-07, "loss": 4.6887, "step": 12 }, { "epoch": 0.02115971515768057, "grad_norm": 20.336804229624335, "learning_rate": 6.486486486486487e-07, "loss": 4.6726, "step": 13 }, { "epoch": 0.022787385554425228, "grad_norm": 16.947540726203478, "learning_rate": 7.027027027027028e-07, "loss": 4.4765, "step": 14 }, { "epoch": 0.024415055951169887, "grad_norm": 16.82466865833761, "learning_rate": 7.567567567567569e-07, "loss": 4.3704, "step": 15 }, { "epoch": 0.026042726347914547, "grad_norm": 17.126557934759873, "learning_rate": 8.108108108108109e-07, "loss": 4.4054, "step": 16 }, { "epoch": 0.027670396744659206, "grad_norm": 18.005470150383825, "learning_rate": 8.64864864864865e-07, "loss": 4.5418, "step": 17 }, { "epoch": 0.029298067141403865, "grad_norm": 17.15961260095372, "learning_rate": 9.189189189189191e-07, "loss": 4.4678, "step": 18 }, { "epoch": 0.030925737538148524, "grad_norm": 16.561816291184407, "learning_rate": 9.72972972972973e-07, "loss": 4.3585, "step": 19 }, { "epoch": 0.03255340793489318, "grad_norm": 17.106504638793844, "learning_rate": 1.027027027027027e-06, "loss": 4.5827, "step": 20 }, { "epoch": 0.034181078331637846, "grad_norm": 15.444766106020726, "learning_rate": 1.0810810810810812e-06, "loss": 4.3316, "step": 21 }, { "epoch": 0.0358087487283825, "grad_norm": 13.25771983338267, "learning_rate": 1.1351351351351352e-06, "loss": 3.9733, "step": 22 }, { "epoch": 0.037436419125127164, "grad_norm": 14.808533340124978, "learning_rate": 1.1891891891891893e-06, "loss": 4.2644, "step": 23 }, { "epoch": 0.03906408952187182, "grad_norm": 28.570000781952523, "learning_rate": 1.2432432432432434e-06, "loss": 4.4768, "step": 24 }, { "epoch": 0.04069175991861648, "grad_norm": 14.765212657478061, "learning_rate": 1.2972972972972974e-06, "loss": 4.2572, "step": 25 }, { "epoch": 0.04231943031536114, "grad_norm": 16.8763045830167, "learning_rate": 1.3513513513513515e-06, "loss": 4.4737, "step": 26 }, { "epoch": 0.0439471007121058, "grad_norm": 20.57793259078222, "learning_rate": 1.4054054054054056e-06, "loss": 4.189, "step": 27 }, { "epoch": 0.045574771108850456, "grad_norm": 15.210127626787061, "learning_rate": 1.4594594594594596e-06, "loss": 4.1575, "step": 28 }, { "epoch": 0.04720244150559512, "grad_norm": 14.057162788415257, "learning_rate": 1.5135135135135137e-06, "loss": 4.2375, "step": 29 }, { "epoch": 0.048830111902339775, "grad_norm": 13.124896530169348, "learning_rate": 1.5675675675675678e-06, "loss": 3.9448, "step": 30 }, { "epoch": 0.05045778229908444, "grad_norm": 15.295089762748027, "learning_rate": 1.6216216216216219e-06, "loss": 4.241, "step": 31 }, { "epoch": 0.05208545269582909, "grad_norm": 13.521735562535323, "learning_rate": 1.675675675675676e-06, "loss": 3.997, "step": 32 }, { "epoch": 0.053713123092573756, "grad_norm": 12.743927098315782, "learning_rate": 1.72972972972973e-06, "loss": 3.9285, "step": 33 }, { "epoch": 0.05534079348931841, "grad_norm": 12.41796062516869, "learning_rate": 1.783783783783784e-06, "loss": 3.8848, "step": 34 }, { "epoch": 0.056968463886063074, "grad_norm": 11.679485638921474, "learning_rate": 1.8378378378378381e-06, "loss": 3.7983, "step": 35 }, { "epoch": 0.05859613428280773, "grad_norm": 12.619857492754166, "learning_rate": 1.8918918918918922e-06, "loss": 4.0908, "step": 36 }, { "epoch": 0.06022380467955239, "grad_norm": 11.167690750147685, "learning_rate": 1.945945945945946e-06, "loss": 3.9463, "step": 37 }, { "epoch": 0.06185147507629705, "grad_norm": 9.338946743316512, "learning_rate": 2.0000000000000003e-06, "loss": 3.516, "step": 38 }, { "epoch": 0.0634791454730417, "grad_norm": 10.079010584706438, "learning_rate": 2.054054054054054e-06, "loss": 3.7315, "step": 39 }, { "epoch": 0.06510681586978637, "grad_norm": 10.917540412998214, "learning_rate": 2.1081081081081085e-06, "loss": 3.6421, "step": 40 }, { "epoch": 0.06673448626653103, "grad_norm": 11.30559930585391, "learning_rate": 2.1621621621621623e-06, "loss": 3.8341, "step": 41 }, { "epoch": 0.06836215666327569, "grad_norm": 10.447405314734715, "learning_rate": 2.2162162162162166e-06, "loss": 3.7236, "step": 42 }, { "epoch": 0.06998982706002034, "grad_norm": 9.798150634888527, "learning_rate": 2.2702702702702705e-06, "loss": 3.7438, "step": 43 }, { "epoch": 0.071617497456765, "grad_norm": 11.073592064281238, "learning_rate": 2.3243243243243247e-06, "loss": 3.8681, "step": 44 }, { "epoch": 0.07324516785350967, "grad_norm": 10.365316469854015, "learning_rate": 2.3783783783783786e-06, "loss": 3.7848, "step": 45 }, { "epoch": 0.07487283825025433, "grad_norm": 11.314126732883098, "learning_rate": 2.432432432432433e-06, "loss": 3.832, "step": 46 }, { "epoch": 0.07650050864699898, "grad_norm": 9.730137532647516, "learning_rate": 2.4864864864864867e-06, "loss": 3.6405, "step": 47 }, { "epoch": 0.07812817904374364, "grad_norm": 10.174112317612916, "learning_rate": 2.540540540540541e-06, "loss": 3.5218, "step": 48 }, { "epoch": 0.0797558494404883, "grad_norm": 9.936652123282203, "learning_rate": 2.594594594594595e-06, "loss": 3.7059, "step": 49 }, { "epoch": 0.08138351983723296, "grad_norm": 10.445739655385365, "learning_rate": 2.648648648648649e-06, "loss": 3.6969, "step": 50 }, { "epoch": 0.08301119023397761, "grad_norm": 9.211688437646767, "learning_rate": 2.702702702702703e-06, "loss": 3.5598, "step": 51 }, { "epoch": 0.08463886063072228, "grad_norm": 9.273995737720474, "learning_rate": 2.7567567567567573e-06, "loss": 3.5955, "step": 52 }, { "epoch": 0.08626653102746694, "grad_norm": 9.846704078353907, "learning_rate": 2.810810810810811e-06, "loss": 3.6695, "step": 53 }, { "epoch": 0.0878942014242116, "grad_norm": 9.864902821303566, "learning_rate": 2.8648648648648654e-06, "loss": 3.5676, "step": 54 }, { "epoch": 0.08952187182095625, "grad_norm": 11.345180968958164, "learning_rate": 2.9189189189189193e-06, "loss": 3.6958, "step": 55 }, { "epoch": 0.09114954221770091, "grad_norm": 9.141510840966554, "learning_rate": 2.9729729729729736e-06, "loss": 3.6732, "step": 56 }, { "epoch": 0.09277721261444558, "grad_norm": 9.669960149766391, "learning_rate": 3.0270270270270274e-06, "loss": 3.7537, "step": 57 }, { "epoch": 0.09440488301119024, "grad_norm": 9.809811433577053, "learning_rate": 3.0810810810810817e-06, "loss": 3.6505, "step": 58 }, { "epoch": 0.09603255340793489, "grad_norm": 11.461257546605484, "learning_rate": 3.1351351351351356e-06, "loss": 3.612, "step": 59 }, { "epoch": 0.09766022380467955, "grad_norm": 8.954180401516146, "learning_rate": 3.1891891891891894e-06, "loss": 3.4639, "step": 60 }, { "epoch": 0.09928789420142421, "grad_norm": 8.51625956799713, "learning_rate": 3.2432432432432437e-06, "loss": 3.4033, "step": 61 }, { "epoch": 0.10091556459816887, "grad_norm": 9.767241129928802, "learning_rate": 3.2972972972972976e-06, "loss": 3.5372, "step": 62 }, { "epoch": 0.10254323499491352, "grad_norm": 8.68987756605806, "learning_rate": 3.351351351351352e-06, "loss": 3.5113, "step": 63 }, { "epoch": 0.10417090539165819, "grad_norm": 13.392291717647353, "learning_rate": 3.4054054054054057e-06, "loss": 3.5841, "step": 64 }, { "epoch": 0.10579857578840285, "grad_norm": 8.88446552706667, "learning_rate": 3.45945945945946e-06, "loss": 3.551, "step": 65 }, { "epoch": 0.10742624618514751, "grad_norm": 8.882611979704343, "learning_rate": 3.513513513513514e-06, "loss": 3.473, "step": 66 }, { "epoch": 0.10905391658189216, "grad_norm": 9.461587622637653, "learning_rate": 3.567567567567568e-06, "loss": 3.4656, "step": 67 }, { "epoch": 0.11068158697863682, "grad_norm": 8.508456031799517, "learning_rate": 3.621621621621622e-06, "loss": 3.6037, "step": 68 }, { "epoch": 0.11230925737538149, "grad_norm": 8.938103974674666, "learning_rate": 3.6756756756756763e-06, "loss": 3.6048, "step": 69 }, { "epoch": 0.11393692777212615, "grad_norm": 11.275955496865018, "learning_rate": 3.72972972972973e-06, "loss": 3.7261, "step": 70 }, { "epoch": 0.1155645981688708, "grad_norm": 10.28872752593033, "learning_rate": 3.7837837837837844e-06, "loss": 3.4519, "step": 71 }, { "epoch": 0.11719226856561546, "grad_norm": 8.502786076126576, "learning_rate": 3.837837837837838e-06, "loss": 3.5015, "step": 72 }, { "epoch": 0.11881993896236012, "grad_norm": 9.773414058716456, "learning_rate": 3.891891891891892e-06, "loss": 3.4443, "step": 73 }, { "epoch": 0.12044760935910478, "grad_norm": 8.664935026586456, "learning_rate": 3.945945945945947e-06, "loss": 3.3554, "step": 74 }, { "epoch": 0.12207527975584945, "grad_norm": 8.910200908333717, "learning_rate": 4.000000000000001e-06, "loss": 3.2855, "step": 75 }, { "epoch": 0.1237029501525941, "grad_norm": 10.451955139742086, "learning_rate": 4.0540540540540545e-06, "loss": 3.3587, "step": 76 }, { "epoch": 0.12533062054933877, "grad_norm": 9.853865837350686, "learning_rate": 4.108108108108108e-06, "loss": 3.6051, "step": 77 }, { "epoch": 0.1269582909460834, "grad_norm": 9.597819841246444, "learning_rate": 4.162162162162163e-06, "loss": 3.6297, "step": 78 }, { "epoch": 0.12858596134282807, "grad_norm": 8.707422139241817, "learning_rate": 4.216216216216217e-06, "loss": 3.3522, "step": 79 }, { "epoch": 0.13021363173957273, "grad_norm": 8.874700013561677, "learning_rate": 4.270270270270271e-06, "loss": 3.4787, "step": 80 }, { "epoch": 0.1318413021363174, "grad_norm": 9.357764867164931, "learning_rate": 4.324324324324325e-06, "loss": 3.5273, "step": 81 }, { "epoch": 0.13346897253306206, "grad_norm": 8.611930449645795, "learning_rate": 4.378378378378379e-06, "loss": 3.557, "step": 82 }, { "epoch": 0.13509664292980672, "grad_norm": 8.81154685676804, "learning_rate": 4.432432432432433e-06, "loss": 3.5443, "step": 83 }, { "epoch": 0.13672431332655138, "grad_norm": 8.684714797435023, "learning_rate": 4.486486486486487e-06, "loss": 3.6234, "step": 84 }, { "epoch": 0.13835198372329605, "grad_norm": 8.450539522870228, "learning_rate": 4.540540540540541e-06, "loss": 3.4268, "step": 85 }, { "epoch": 0.13997965412004068, "grad_norm": 203.26896956316205, "learning_rate": 4.594594594594596e-06, "loss": 3.3475, "step": 86 }, { "epoch": 0.14160732451678534, "grad_norm": 8.530249464271947, "learning_rate": 4.6486486486486495e-06, "loss": 3.5357, "step": 87 }, { "epoch": 0.14323499491353, "grad_norm": 8.813910653387477, "learning_rate": 4.702702702702703e-06, "loss": 3.4407, "step": 88 }, { "epoch": 0.14486266531027467, "grad_norm": 7.996612013191018, "learning_rate": 4.756756756756757e-06, "loss": 3.3899, "step": 89 }, { "epoch": 0.14649033570701933, "grad_norm": 8.964977950808732, "learning_rate": 4.810810810810811e-06, "loss": 3.5022, "step": 90 }, { "epoch": 0.148118006103764, "grad_norm": 9.540310705679733, "learning_rate": 4.864864864864866e-06, "loss": 3.5561, "step": 91 }, { "epoch": 0.14974567650050866, "grad_norm": 8.514977345599496, "learning_rate": 4.91891891891892e-06, "loss": 3.4527, "step": 92 }, { "epoch": 0.15137334689725332, "grad_norm": 8.553120525846303, "learning_rate": 4.9729729729729735e-06, "loss": 3.3705, "step": 93 }, { "epoch": 0.15300101729399795, "grad_norm": 7.664586273663719, "learning_rate": 5.027027027027027e-06, "loss": 3.3678, "step": 94 }, { "epoch": 0.15462868769074262, "grad_norm": 8.804839561104611, "learning_rate": 5.081081081081082e-06, "loss": 3.4374, "step": 95 }, { "epoch": 0.15625635808748728, "grad_norm": 8.421114146553995, "learning_rate": 5.135135135135135e-06, "loss": 3.3744, "step": 96 }, { "epoch": 0.15788402848423194, "grad_norm": 9.124854995402064, "learning_rate": 5.18918918918919e-06, "loss": 3.356, "step": 97 }, { "epoch": 0.1595116988809766, "grad_norm": 8.026693707144563, "learning_rate": 5.243243243243244e-06, "loss": 3.3927, "step": 98 }, { "epoch": 0.16113936927772127, "grad_norm": 8.381601376052453, "learning_rate": 5.297297297297298e-06, "loss": 3.3948, "step": 99 }, { "epoch": 0.16276703967446593, "grad_norm": 8.732085507331083, "learning_rate": 5.351351351351351e-06, "loss": 3.285, "step": 100 }, { "epoch": 0.1643947100712106, "grad_norm": 8.350837114012615, "learning_rate": 5.405405405405406e-06, "loss": 3.4003, "step": 101 }, { "epoch": 0.16602238046795523, "grad_norm": 8.789744218452267, "learning_rate": 5.45945945945946e-06, "loss": 3.4465, "step": 102 }, { "epoch": 0.1676500508646999, "grad_norm": 8.673051112708281, "learning_rate": 5.513513513513515e-06, "loss": 3.4631, "step": 103 }, { "epoch": 0.16927772126144455, "grad_norm": 8.1880888394177, "learning_rate": 5.567567567567568e-06, "loss": 3.4114, "step": 104 }, { "epoch": 0.17090539165818922, "grad_norm": 8.234025231008145, "learning_rate": 5.621621621621622e-06, "loss": 3.3295, "step": 105 }, { "epoch": 0.17253306205493388, "grad_norm": 55.037934393589886, "learning_rate": 5.675675675675676e-06, "loss": 3.4663, "step": 106 }, { "epoch": 0.17416073245167854, "grad_norm": 7.9308410763408235, "learning_rate": 5.729729729729731e-06, "loss": 3.3697, "step": 107 }, { "epoch": 0.1757884028484232, "grad_norm": 8.312377301814953, "learning_rate": 5.783783783783784e-06, "loss": 3.3949, "step": 108 }, { "epoch": 0.17741607324516787, "grad_norm": 8.31619609006047, "learning_rate": 5.837837837837839e-06, "loss": 3.5479, "step": 109 }, { "epoch": 0.1790437436419125, "grad_norm": 8.980462037223424, "learning_rate": 5.8918918918918924e-06, "loss": 3.3296, "step": 110 }, { "epoch": 0.18067141403865716, "grad_norm": 7.889600315401066, "learning_rate": 5.945945945945947e-06, "loss": 3.4096, "step": 111 }, { "epoch": 0.18229908443540183, "grad_norm": 9.518011310231163, "learning_rate": 6e-06, "loss": 3.4926, "step": 112 }, { "epoch": 0.1839267548321465, "grad_norm": 8.643724353384668, "learning_rate": 6.054054054054055e-06, "loss": 3.4599, "step": 113 }, { "epoch": 0.18555442522889115, "grad_norm": 8.861147305004378, "learning_rate": 6.108108108108109e-06, "loss": 3.5831, "step": 114 }, { "epoch": 0.1871820956256358, "grad_norm": 9.021514200657966, "learning_rate": 6.162162162162163e-06, "loss": 3.4331, "step": 115 }, { "epoch": 0.18880976602238048, "grad_norm": 8.150555240125234, "learning_rate": 6.2162162162162164e-06, "loss": 3.3702, "step": 116 }, { "epoch": 0.19043743641912514, "grad_norm": 7.859238046061924, "learning_rate": 6.270270270270271e-06, "loss": 3.2184, "step": 117 }, { "epoch": 0.19206510681586977, "grad_norm": 9.160407091108384, "learning_rate": 6.324324324324325e-06, "loss": 3.4339, "step": 118 }, { "epoch": 0.19369277721261444, "grad_norm": 7.921773671527862, "learning_rate": 6.378378378378379e-06, "loss": 3.3387, "step": 119 }, { "epoch": 0.1953204476093591, "grad_norm": 8.811306232994587, "learning_rate": 6.432432432432433e-06, "loss": 3.4358, "step": 120 }, { "epoch": 0.19694811800610376, "grad_norm": 8.908780047741356, "learning_rate": 6.486486486486487e-06, "loss": 3.2271, "step": 121 }, { "epoch": 0.19857578840284842, "grad_norm": 8.326846971692133, "learning_rate": 6.540540540540541e-06, "loss": 3.3491, "step": 122 }, { "epoch": 0.2002034587995931, "grad_norm": 8.580401220453139, "learning_rate": 6.594594594594595e-06, "loss": 3.4554, "step": 123 }, { "epoch": 0.20183112919633775, "grad_norm": 7.584582108504181, "learning_rate": 6.648648648648649e-06, "loss": 3.2494, "step": 124 }, { "epoch": 0.2034587995930824, "grad_norm": 9.684763777172732, "learning_rate": 6.702702702702704e-06, "loss": 3.447, "step": 125 }, { "epoch": 0.20508646998982705, "grad_norm": 7.828233062419421, "learning_rate": 6.7567567567567575e-06, "loss": 3.3143, "step": 126 }, { "epoch": 0.2067141403865717, "grad_norm": 8.347140945049503, "learning_rate": 6.810810810810811e-06, "loss": 3.1973, "step": 127 }, { "epoch": 0.20834181078331637, "grad_norm": 8.828098204011892, "learning_rate": 6.864864864864865e-06, "loss": 3.5023, "step": 128 }, { "epoch": 0.20996948118006104, "grad_norm": 8.419936435537798, "learning_rate": 6.91891891891892e-06, "loss": 3.3296, "step": 129 }, { "epoch": 0.2115971515768057, "grad_norm": 9.0120726378614, "learning_rate": 6.972972972972973e-06, "loss": 3.2357, "step": 130 }, { "epoch": 0.21322482197355036, "grad_norm": 9.166916272724663, "learning_rate": 7.027027027027028e-06, "loss": 3.4288, "step": 131 }, { "epoch": 0.21485249237029502, "grad_norm": 8.364596453680067, "learning_rate": 7.0810810810810815e-06, "loss": 3.3135, "step": 132 }, { "epoch": 0.21648016276703969, "grad_norm": 8.138524309004698, "learning_rate": 7.135135135135136e-06, "loss": 3.3847, "step": 133 }, { "epoch": 0.21810783316378432, "grad_norm": 8.182320033840563, "learning_rate": 7.189189189189189e-06, "loss": 3.4356, "step": 134 }, { "epoch": 0.21973550356052898, "grad_norm": 9.800050737796552, "learning_rate": 7.243243243243244e-06, "loss": 3.5419, "step": 135 }, { "epoch": 0.22136317395727365, "grad_norm": 9.06761262036164, "learning_rate": 7.297297297297298e-06, "loss": 3.454, "step": 136 }, { "epoch": 0.2229908443540183, "grad_norm": 8.537442108242093, "learning_rate": 7.3513513513513525e-06, "loss": 3.3697, "step": 137 }, { "epoch": 0.22461851475076297, "grad_norm": 9.143829817143542, "learning_rate": 7.4054054054054055e-06, "loss": 3.3589, "step": 138 }, { "epoch": 0.22624618514750763, "grad_norm": 8.579439585799376, "learning_rate": 7.45945945945946e-06, "loss": 3.3569, "step": 139 }, { "epoch": 0.2278738555442523, "grad_norm": 8.512298095627923, "learning_rate": 7.513513513513514e-06, "loss": 3.5413, "step": 140 }, { "epoch": 0.22950152594099696, "grad_norm": 8.936999348594039, "learning_rate": 7.567567567567569e-06, "loss": 3.3054, "step": 141 }, { "epoch": 0.2311291963377416, "grad_norm": 8.3514841712592, "learning_rate": 7.621621621621622e-06, "loss": 3.4834, "step": 142 }, { "epoch": 0.23275686673448626, "grad_norm": 8.638196026168202, "learning_rate": 7.675675675675676e-06, "loss": 3.2704, "step": 143 }, { "epoch": 0.23438453713123092, "grad_norm": 9.499173589593726, "learning_rate": 7.72972972972973e-06, "loss": 3.3098, "step": 144 }, { "epoch": 0.23601220752797558, "grad_norm": 8.338731209477796, "learning_rate": 7.783783783783784e-06, "loss": 3.3739, "step": 145 }, { "epoch": 0.23763987792472024, "grad_norm": 9.080367329689933, "learning_rate": 7.837837837837838e-06, "loss": 3.2462, "step": 146 }, { "epoch": 0.2392675483214649, "grad_norm": 8.888565081916285, "learning_rate": 7.891891891891894e-06, "loss": 3.4448, "step": 147 }, { "epoch": 0.24089521871820957, "grad_norm": 8.949387011375853, "learning_rate": 7.945945945945946e-06, "loss": 3.2962, "step": 148 }, { "epoch": 0.24252288911495423, "grad_norm": 8.00371179409505, "learning_rate": 8.000000000000001e-06, "loss": 3.3065, "step": 149 }, { "epoch": 0.2441505595116989, "grad_norm": 9.34623931194304, "learning_rate": 8.054054054054055e-06, "loss": 3.5516, "step": 150 }, { "epoch": 0.24577822990844353, "grad_norm": 8.29337661381796, "learning_rate": 8.108108108108109e-06, "loss": 3.4526, "step": 151 }, { "epoch": 0.2474059003051882, "grad_norm": 9.19565099223485, "learning_rate": 8.162162162162163e-06, "loss": 3.3555, "step": 152 }, { "epoch": 0.24903357070193285, "grad_norm": 8.065241700587107, "learning_rate": 8.216216216216217e-06, "loss": 3.5066, "step": 153 }, { "epoch": 0.25066124109867755, "grad_norm": 8.565646196253741, "learning_rate": 8.27027027027027e-06, "loss": 3.4477, "step": 154 }, { "epoch": 0.25228891149542215, "grad_norm": 9.818848926498077, "learning_rate": 8.324324324324326e-06, "loss": 3.592, "step": 155 }, { "epoch": 0.2539165818921668, "grad_norm": 8.707979589080521, "learning_rate": 8.378378378378378e-06, "loss": 3.4491, "step": 156 }, { "epoch": 0.2555442522889115, "grad_norm": 8.483860080588416, "learning_rate": 8.432432432432434e-06, "loss": 3.4246, "step": 157 }, { "epoch": 0.25717192268565614, "grad_norm": 8.923250907849061, "learning_rate": 8.486486486486488e-06, "loss": 3.5234, "step": 158 }, { "epoch": 0.2587995930824008, "grad_norm": 7.828643402207903, "learning_rate": 8.540540540540542e-06, "loss": 3.3417, "step": 159 }, { "epoch": 0.26042726347914547, "grad_norm": 7.876197542587053, "learning_rate": 8.594594594594595e-06, "loss": 3.3926, "step": 160 }, { "epoch": 0.26205493387589013, "grad_norm": 8.309006852378408, "learning_rate": 8.64864864864865e-06, "loss": 3.4464, "step": 161 }, { "epoch": 0.2636826042726348, "grad_norm": 7.973750147897568, "learning_rate": 8.702702702702703e-06, "loss": 3.3043, "step": 162 }, { "epoch": 0.26531027466937945, "grad_norm": 8.481146521839698, "learning_rate": 8.756756756756759e-06, "loss": 3.4933, "step": 163 }, { "epoch": 0.2669379450661241, "grad_norm": 8.882275175371621, "learning_rate": 8.810810810810811e-06, "loss": 3.3968, "step": 164 }, { "epoch": 0.2685656154628688, "grad_norm": 8.29874068166967, "learning_rate": 8.864864864864866e-06, "loss": 3.3499, "step": 165 }, { "epoch": 0.27019328585961344, "grad_norm": 8.391776615357568, "learning_rate": 8.91891891891892e-06, "loss": 3.4339, "step": 166 }, { "epoch": 0.2718209562563581, "grad_norm": 7.913451023463229, "learning_rate": 8.972972972972974e-06, "loss": 3.3316, "step": 167 }, { "epoch": 0.27344862665310277, "grad_norm": 8.118177756368118, "learning_rate": 9.027027027027028e-06, "loss": 3.2665, "step": 168 }, { "epoch": 0.27507629704984743, "grad_norm": 8.587906560239198, "learning_rate": 9.081081081081082e-06, "loss": 3.4588, "step": 169 }, { "epoch": 0.2767039674465921, "grad_norm": 8.458609310650687, "learning_rate": 9.135135135135136e-06, "loss": 3.3706, "step": 170 }, { "epoch": 0.2783316378433367, "grad_norm": 8.317559479218096, "learning_rate": 9.189189189189191e-06, "loss": 3.5419, "step": 171 }, { "epoch": 0.27995930824008136, "grad_norm": 8.30019159519956, "learning_rate": 9.243243243243243e-06, "loss": 3.3771, "step": 172 }, { "epoch": 0.281586978636826, "grad_norm": 9.433485704116189, "learning_rate": 9.297297297297299e-06, "loss": 3.3642, "step": 173 }, { "epoch": 0.2832146490335707, "grad_norm": 8.565880903677826, "learning_rate": 9.351351351351353e-06, "loss": 3.3207, "step": 174 }, { "epoch": 0.28484231943031535, "grad_norm": 9.81068319527487, "learning_rate": 9.405405405405407e-06, "loss": 3.4941, "step": 175 }, { "epoch": 0.28646998982706, "grad_norm": 8.284754348976593, "learning_rate": 9.45945945945946e-06, "loss": 3.3371, "step": 176 }, { "epoch": 0.2880976602238047, "grad_norm": 9.077009028477397, "learning_rate": 9.513513513513514e-06, "loss": 3.3968, "step": 177 }, { "epoch": 0.28972533062054934, "grad_norm": 8.46383617930216, "learning_rate": 9.567567567567568e-06, "loss": 3.5414, "step": 178 }, { "epoch": 0.291353001017294, "grad_norm": 8.581631793204373, "learning_rate": 9.621621621621622e-06, "loss": 3.3063, "step": 179 }, { "epoch": 0.29298067141403866, "grad_norm": 8.537753591745028, "learning_rate": 9.675675675675676e-06, "loss": 3.5448, "step": 180 }, { "epoch": 0.2946083418107833, "grad_norm": 10.400595859108979, "learning_rate": 9.729729729729732e-06, "loss": 3.3418, "step": 181 }, { "epoch": 0.296236012207528, "grad_norm": 8.456778654646381, "learning_rate": 9.783783783783785e-06, "loss": 3.5033, "step": 182 }, { "epoch": 0.29786368260427265, "grad_norm": 8.117421592282872, "learning_rate": 9.83783783783784e-06, "loss": 3.5385, "step": 183 }, { "epoch": 0.2994913530010173, "grad_norm": 9.519824635065966, "learning_rate": 9.891891891891893e-06, "loss": 3.4692, "step": 184 }, { "epoch": 0.301119023397762, "grad_norm": 7.664577807586338, "learning_rate": 9.945945945945947e-06, "loss": 3.2608, "step": 185 }, { "epoch": 0.30274669379450664, "grad_norm": 8.388221648796417, "learning_rate": 1e-05, "loss": 3.2046, "step": 186 }, { "epoch": 0.30437436419125125, "grad_norm": 8.151113016469125, "learning_rate": 9.999991045868909e-06, "loss": 3.2787, "step": 187 }, { "epoch": 0.3060020345879959, "grad_norm": 7.949528440485408, "learning_rate": 9.999964183507702e-06, "loss": 3.4348, "step": 188 }, { "epoch": 0.30762970498474057, "grad_norm": 11.343755599353774, "learning_rate": 9.999919413012593e-06, "loss": 3.3242, "step": 189 }, { "epoch": 0.30925737538148523, "grad_norm": 7.620561295643476, "learning_rate": 9.999856734543933e-06, "loss": 3.3766, "step": 190 }, { "epoch": 0.3108850457782299, "grad_norm": 9.464937243816276, "learning_rate": 9.999776148326216e-06, "loss": 3.356, "step": 191 }, { "epoch": 0.31251271617497456, "grad_norm": 9.095387761292685, "learning_rate": 9.999677654648072e-06, "loss": 3.4442, "step": 192 }, { "epoch": 0.3141403865717192, "grad_norm": 8.24687192798249, "learning_rate": 9.999561253862273e-06, "loss": 3.5061, "step": 193 }, { "epoch": 0.3157680569684639, "grad_norm": 9.230539450379755, "learning_rate": 9.999426946385727e-06, "loss": 3.4341, "step": 194 }, { "epoch": 0.31739572736520855, "grad_norm": 8.301483317686992, "learning_rate": 9.999274732699473e-06, "loss": 3.3576, "step": 195 }, { "epoch": 0.3190233977619532, "grad_norm": 8.292892256969285, "learning_rate": 9.99910461334869e-06, "loss": 3.3274, "step": 196 }, { "epoch": 0.32065106815869787, "grad_norm": 8.501742447033934, "learning_rate": 9.998916588942685e-06, "loss": 3.4885, "step": 197 }, { "epoch": 0.32227873855544253, "grad_norm": 7.912648776682216, "learning_rate": 9.998710660154898e-06, "loss": 3.4194, "step": 198 }, { "epoch": 0.3239064089521872, "grad_norm": 8.937834497458752, "learning_rate": 9.998486827722894e-06, "loss": 3.4982, "step": 199 }, { "epoch": 0.32553407934893186, "grad_norm": 8.249897230652076, "learning_rate": 9.998245092448362e-06, "loss": 3.3287, "step": 200 }, { "epoch": 0.3271617497456765, "grad_norm": 8.209379499788827, "learning_rate": 9.997985455197114e-06, "loss": 3.3002, "step": 201 }, { "epoch": 0.3287894201424212, "grad_norm": 8.46239731742331, "learning_rate": 9.99770791689908e-06, "loss": 3.3561, "step": 202 }, { "epoch": 0.3304170905391658, "grad_norm": 8.327742614857177, "learning_rate": 9.997412478548306e-06, "loss": 3.3251, "step": 203 }, { "epoch": 0.33204476093591045, "grad_norm": 8.318634115748203, "learning_rate": 9.99709914120295e-06, "loss": 3.2337, "step": 204 }, { "epoch": 0.3336724313326551, "grad_norm": 8.399574580262145, "learning_rate": 9.996767905985279e-06, "loss": 3.4451, "step": 205 }, { "epoch": 0.3353001017293998, "grad_norm": 7.769681071785025, "learning_rate": 9.996418774081658e-06, "loss": 3.5325, "step": 206 }, { "epoch": 0.33692777212614444, "grad_norm": 7.913171360622336, "learning_rate": 9.99605174674256e-06, "loss": 3.3217, "step": 207 }, { "epoch": 0.3385554425228891, "grad_norm": 9.016308849961778, "learning_rate": 9.995666825282547e-06, "loss": 3.4267, "step": 208 }, { "epoch": 0.34018311291963377, "grad_norm": 7.924612638264981, "learning_rate": 9.995264011080277e-06, "loss": 3.426, "step": 209 }, { "epoch": 0.34181078331637843, "grad_norm": 8.46972080566441, "learning_rate": 9.994843305578487e-06, "loss": 3.541, "step": 210 }, { "epoch": 0.3434384537131231, "grad_norm": 8.378788655267654, "learning_rate": 9.994404710283999e-06, "loss": 3.4345, "step": 211 }, { "epoch": 0.34506612410986776, "grad_norm": 7.47205949454622, "learning_rate": 9.993948226767709e-06, "loss": 3.3089, "step": 212 }, { "epoch": 0.3466937945066124, "grad_norm": 10.53995449072941, "learning_rate": 9.993473856664584e-06, "loss": 3.4057, "step": 213 }, { "epoch": 0.3483214649033571, "grad_norm": 8.60949379053401, "learning_rate": 9.99298160167365e-06, "loss": 3.3003, "step": 214 }, { "epoch": 0.34994913530010174, "grad_norm": 7.775998822375484, "learning_rate": 9.992471463557995e-06, "loss": 3.3684, "step": 215 }, { "epoch": 0.3515768056968464, "grad_norm": 8.969502753058071, "learning_rate": 9.991943444144758e-06, "loss": 3.4043, "step": 216 }, { "epoch": 0.35320447609359107, "grad_norm": 7.912383085882444, "learning_rate": 9.991397545325116e-06, "loss": 3.4841, "step": 217 }, { "epoch": 0.35483214649033573, "grad_norm": 8.298464335197542, "learning_rate": 9.990833769054294e-06, "loss": 3.402, "step": 218 }, { "epoch": 0.35645981688708034, "grad_norm": 8.45745101370768, "learning_rate": 9.99025211735154e-06, "loss": 3.4044, "step": 219 }, { "epoch": 0.358087487283825, "grad_norm": 8.460998461470611, "learning_rate": 9.989652592300129e-06, "loss": 3.3161, "step": 220 }, { "epoch": 0.35971515768056966, "grad_norm": 8.865854922078181, "learning_rate": 9.989035196047349e-06, "loss": 3.3038, "step": 221 }, { "epoch": 0.3613428280773143, "grad_norm": 8.479305556566837, "learning_rate": 9.988399930804504e-06, "loss": 3.4034, "step": 222 }, { "epoch": 0.362970498474059, "grad_norm": 8.541607138094765, "learning_rate": 9.98774679884689e-06, "loss": 3.3547, "step": 223 }, { "epoch": 0.36459816887080365, "grad_norm": 7.909790534516617, "learning_rate": 9.987075802513796e-06, "loss": 3.2805, "step": 224 }, { "epoch": 0.3662258392675483, "grad_norm": 8.019776118241158, "learning_rate": 9.986386944208505e-06, "loss": 3.4237, "step": 225 }, { "epoch": 0.367853509664293, "grad_norm": 7.939209209874796, "learning_rate": 9.985680226398261e-06, "loss": 3.3178, "step": 226 }, { "epoch": 0.36948118006103764, "grad_norm": 8.01761412843025, "learning_rate": 9.984955651614285e-06, "loss": 3.298, "step": 227 }, { "epoch": 0.3711088504577823, "grad_norm": 9.271226784687848, "learning_rate": 9.98421322245175e-06, "loss": 3.3944, "step": 228 }, { "epoch": 0.37273652085452696, "grad_norm": 7.770067314180095, "learning_rate": 9.983452941569782e-06, "loss": 3.3709, "step": 229 }, { "epoch": 0.3743641912512716, "grad_norm": 8.64930983363181, "learning_rate": 9.98267481169144e-06, "loss": 3.458, "step": 230 }, { "epoch": 0.3759918616480163, "grad_norm": 8.262948356417986, "learning_rate": 9.981878835603718e-06, "loss": 3.2756, "step": 231 }, { "epoch": 0.37761953204476095, "grad_norm": 8.473871962645362, "learning_rate": 9.981065016157522e-06, "loss": 3.3427, "step": 232 }, { "epoch": 0.3792472024415056, "grad_norm": 8.00084621081924, "learning_rate": 9.980233356267672e-06, "loss": 3.2564, "step": 233 }, { "epoch": 0.3808748728382503, "grad_norm": 7.776419257809964, "learning_rate": 9.979383858912886e-06, "loss": 3.4257, "step": 234 }, { "epoch": 0.38250254323499494, "grad_norm": 8.791083152092174, "learning_rate": 9.978516527135767e-06, "loss": 3.5701, "step": 235 }, { "epoch": 0.38413021363173955, "grad_norm": 7.7630442967259405, "learning_rate": 9.977631364042796e-06, "loss": 3.2353, "step": 236 }, { "epoch": 0.3857578840284842, "grad_norm": 8.795845178193952, "learning_rate": 9.976728372804318e-06, "loss": 3.6355, "step": 237 }, { "epoch": 0.3873855544252289, "grad_norm": 7.885520292759795, "learning_rate": 9.975807556654538e-06, "loss": 3.3972, "step": 238 }, { "epoch": 0.38901322482197354, "grad_norm": 10.600211326242205, "learning_rate": 9.974868918891496e-06, "loss": 3.422, "step": 239 }, { "epoch": 0.3906408952187182, "grad_norm": 8.127851291617453, "learning_rate": 9.973912462877067e-06, "loss": 3.3028, "step": 240 }, { "epoch": 0.39226856561546286, "grad_norm": 8.332292890115626, "learning_rate": 9.972938192036945e-06, "loss": 3.5405, "step": 241 }, { "epoch": 0.3938962360122075, "grad_norm": 7.964003949223213, "learning_rate": 9.971946109860627e-06, "loss": 3.3281, "step": 242 }, { "epoch": 0.3955239064089522, "grad_norm": 7.556407870254964, "learning_rate": 9.970936219901408e-06, "loss": 3.3609, "step": 243 }, { "epoch": 0.39715157680569685, "grad_norm": 7.289603854499082, "learning_rate": 9.969908525776364e-06, "loss": 3.284, "step": 244 }, { "epoch": 0.3987792472024415, "grad_norm": 7.072740167947576, "learning_rate": 9.968863031166338e-06, "loss": 3.2197, "step": 245 }, { "epoch": 0.4004069175991862, "grad_norm": 7.665876135770702, "learning_rate": 9.967799739815925e-06, "loss": 3.2756, "step": 246 }, { "epoch": 0.40203458799593084, "grad_norm": 8.576119824507597, "learning_rate": 9.96671865553347e-06, "loss": 3.4411, "step": 247 }, { "epoch": 0.4036622583926755, "grad_norm": 7.989154946142315, "learning_rate": 9.965619782191037e-06, "loss": 3.4244, "step": 248 }, { "epoch": 0.40528992878942016, "grad_norm": 8.216027090687579, "learning_rate": 9.964503123724411e-06, "loss": 3.3647, "step": 249 }, { "epoch": 0.4069175991861648, "grad_norm": 8.171950027569569, "learning_rate": 9.963368684133073e-06, "loss": 3.4871, "step": 250 }, { "epoch": 0.4085452695829095, "grad_norm": 9.555864935257263, "learning_rate": 9.962216467480192e-06, "loss": 3.4757, "step": 251 }, { "epoch": 0.4101729399796541, "grad_norm": 9.004818386702171, "learning_rate": 9.961046477892608e-06, "loss": 3.5011, "step": 252 }, { "epoch": 0.41180061037639876, "grad_norm": 7.760659428424265, "learning_rate": 9.959858719560817e-06, "loss": 3.5773, "step": 253 }, { "epoch": 0.4134282807731434, "grad_norm": 8.167966337267336, "learning_rate": 9.958653196738955e-06, "loss": 3.1586, "step": 254 }, { "epoch": 0.4150559511698881, "grad_norm": 8.572783139465091, "learning_rate": 9.957429913744788e-06, "loss": 3.3959, "step": 255 }, { "epoch": 0.41668362156663274, "grad_norm": 7.572103629286971, "learning_rate": 9.956188874959686e-06, "loss": 3.2682, "step": 256 }, { "epoch": 0.4183112919633774, "grad_norm": 7.899145678715709, "learning_rate": 9.954930084828627e-06, "loss": 3.3384, "step": 257 }, { "epoch": 0.41993896236012207, "grad_norm": 8.090477540329214, "learning_rate": 9.953653547860152e-06, "loss": 3.2886, "step": 258 }, { "epoch": 0.42156663275686673, "grad_norm": 7.796803636834272, "learning_rate": 9.952359268626375e-06, "loss": 3.4104, "step": 259 }, { "epoch": 0.4231943031536114, "grad_norm": 7.928120950620653, "learning_rate": 9.951047251762956e-06, "loss": 3.2862, "step": 260 }, { "epoch": 0.42482197355035606, "grad_norm": 8.66209864209529, "learning_rate": 9.94971750196908e-06, "loss": 3.5314, "step": 261 }, { "epoch": 0.4264496439471007, "grad_norm": 7.8948988984518795, "learning_rate": 9.948370024007454e-06, "loss": 3.4088, "step": 262 }, { "epoch": 0.4280773143438454, "grad_norm": 7.578199810660849, "learning_rate": 9.947004822704271e-06, "loss": 3.3845, "step": 263 }, { "epoch": 0.42970498474059005, "grad_norm": 10.452899143576063, "learning_rate": 9.94562190294921e-06, "loss": 3.4315, "step": 264 }, { "epoch": 0.4313326551373347, "grad_norm": 7.338025808542487, "learning_rate": 9.944221269695407e-06, "loss": 3.2565, "step": 265 }, { "epoch": 0.43296032553407937, "grad_norm": 7.929459170531931, "learning_rate": 9.942802927959444e-06, "loss": 3.4353, "step": 266 }, { "epoch": 0.43458799593082403, "grad_norm": 7.8936972673053765, "learning_rate": 9.941366882821329e-06, "loss": 3.3343, "step": 267 }, { "epoch": 0.43621566632756864, "grad_norm": 7.711889089464919, "learning_rate": 9.939913139424476e-06, "loss": 3.3993, "step": 268 }, { "epoch": 0.4378433367243133, "grad_norm": 7.700721117643982, "learning_rate": 9.938441702975689e-06, "loss": 3.2914, "step": 269 }, { "epoch": 0.43947100712105797, "grad_norm": 7.657815138885694, "learning_rate": 9.936952578745143e-06, "loss": 3.3961, "step": 270 }, { "epoch": 0.44109867751780263, "grad_norm": 8.453281413200996, "learning_rate": 9.935445772066362e-06, "loss": 3.5407, "step": 271 }, { "epoch": 0.4427263479145473, "grad_norm": 7.553744739738954, "learning_rate": 9.933921288336201e-06, "loss": 3.3619, "step": 272 }, { "epoch": 0.44435401831129195, "grad_norm": 8.227688981842336, "learning_rate": 9.932379133014836e-06, "loss": 3.3338, "step": 273 }, { "epoch": 0.4459816887080366, "grad_norm": 7.784617615291107, "learning_rate": 9.93081931162573e-06, "loss": 3.3914, "step": 274 }, { "epoch": 0.4476093591047813, "grad_norm": 7.961164563357216, "learning_rate": 9.92924182975562e-06, "loss": 3.3409, "step": 275 }, { "epoch": 0.44923702950152594, "grad_norm": 8.519049926856574, "learning_rate": 9.927646693054498e-06, "loss": 3.4572, "step": 276 }, { "epoch": 0.4508646998982706, "grad_norm": 7.4105941287705885, "learning_rate": 9.926033907235588e-06, "loss": 3.187, "step": 277 }, { "epoch": 0.45249237029501527, "grad_norm": 7.797484965070785, "learning_rate": 9.92440347807533e-06, "loss": 3.3909, "step": 278 }, { "epoch": 0.45412004069175993, "grad_norm": 7.748576518070326, "learning_rate": 9.922755411413353e-06, "loss": 3.4666, "step": 279 }, { "epoch": 0.4557477110885046, "grad_norm": 7.65629537945599, "learning_rate": 9.921089713152463e-06, "loss": 3.3322, "step": 280 }, { "epoch": 0.45737538148524925, "grad_norm": 7.66942073337566, "learning_rate": 9.919406389258607e-06, "loss": 3.2786, "step": 281 }, { "epoch": 0.4590030518819939, "grad_norm": 7.431689712588778, "learning_rate": 9.91770544576087e-06, "loss": 3.3479, "step": 282 }, { "epoch": 0.4606307222787386, "grad_norm": 7.591075070895288, "learning_rate": 9.915986888751439e-06, "loss": 3.3936, "step": 283 }, { "epoch": 0.4622583926754832, "grad_norm": 7.133802657237484, "learning_rate": 9.914250724385588e-06, "loss": 3.2987, "step": 284 }, { "epoch": 0.46388606307222785, "grad_norm": 7.9022034802989785, "learning_rate": 9.912496958881653e-06, "loss": 3.3943, "step": 285 }, { "epoch": 0.4655137334689725, "grad_norm": 7.210239205970947, "learning_rate": 9.910725598521014e-06, "loss": 3.2016, "step": 286 }, { "epoch": 0.4671414038657172, "grad_norm": 8.67138093301255, "learning_rate": 9.90893664964807e-06, "loss": 3.3859, "step": 287 }, { "epoch": 0.46876907426246184, "grad_norm": 7.4473071285429615, "learning_rate": 9.907130118670208e-06, "loss": 3.2379, "step": 288 }, { "epoch": 0.4703967446592065, "grad_norm": 8.40088288583514, "learning_rate": 9.9053060120578e-06, "loss": 3.3314, "step": 289 }, { "epoch": 0.47202441505595116, "grad_norm": 8.561558874797127, "learning_rate": 9.90346433634416e-06, "loss": 3.4744, "step": 290 }, { "epoch": 0.4736520854526958, "grad_norm": 7.887054685240966, "learning_rate": 9.901605098125528e-06, "loss": 3.5451, "step": 291 }, { "epoch": 0.4752797558494405, "grad_norm": 7.852539561680993, "learning_rate": 9.899728304061053e-06, "loss": 3.3216, "step": 292 }, { "epoch": 0.47690742624618515, "grad_norm": 7.823085838171288, "learning_rate": 9.897833960872758e-06, "loss": 3.295, "step": 293 }, { "epoch": 0.4785350966429298, "grad_norm": 7.27772184458157, "learning_rate": 9.895922075345522e-06, "loss": 3.3143, "step": 294 }, { "epoch": 0.4801627670396745, "grad_norm": 8.279211580101418, "learning_rate": 9.893992654327052e-06, "loss": 3.4018, "step": 295 }, { "epoch": 0.48179043743641914, "grad_norm": 7.6675367147527, "learning_rate": 9.892045704727864e-06, "loss": 3.4147, "step": 296 }, { "epoch": 0.4834181078331638, "grad_norm": 7.8728477288544, "learning_rate": 9.890081233521258e-06, "loss": 3.391, "step": 297 }, { "epoch": 0.48504577822990846, "grad_norm": 8.485759453010889, "learning_rate": 9.888099247743283e-06, "loss": 3.3697, "step": 298 }, { "epoch": 0.4866734486266531, "grad_norm": 8.626024115361409, "learning_rate": 9.886099754492727e-06, "loss": 3.3782, "step": 299 }, { "epoch": 0.4883011190233978, "grad_norm": 7.471210313359118, "learning_rate": 9.884082760931077e-06, "loss": 3.4875, "step": 300 }, { "epoch": 0.4899287894201424, "grad_norm": 10.089043749009173, "learning_rate": 9.882048274282505e-06, "loss": 3.1316, "step": 301 }, { "epoch": 0.49155645981688706, "grad_norm": 8.10225623646246, "learning_rate": 9.879996301833834e-06, "loss": 3.3415, "step": 302 }, { "epoch": 0.4931841302136317, "grad_norm": 7.597674946181814, "learning_rate": 9.877926850934517e-06, "loss": 3.5017, "step": 303 }, { "epoch": 0.4948118006103764, "grad_norm": 8.398996562707426, "learning_rate": 9.875839928996605e-06, "loss": 3.4132, "step": 304 }, { "epoch": 0.49643947100712105, "grad_norm": 7.443788752546975, "learning_rate": 9.873735543494729e-06, "loss": 3.298, "step": 305 }, { "epoch": 0.4980671414038657, "grad_norm": 8.298335535710844, "learning_rate": 9.871613701966067e-06, "loss": 3.4161, "step": 306 }, { "epoch": 0.4996948118006104, "grad_norm": 8.65122754892351, "learning_rate": 9.869474412010319e-06, "loss": 3.226, "step": 307 }, { "epoch": 0.5013224821973551, "grad_norm": 7.191061714548919, "learning_rate": 9.867317681289675e-06, "loss": 3.416, "step": 308 }, { "epoch": 0.5029501525940997, "grad_norm": 8.173367222014383, "learning_rate": 9.865143517528797e-06, "loss": 3.366, "step": 309 }, { "epoch": 0.5045778229908443, "grad_norm": 8.374872034393798, "learning_rate": 9.862951928514782e-06, "loss": 3.3247, "step": 310 }, { "epoch": 0.506205493387589, "grad_norm": 7.848826218433371, "learning_rate": 9.860742922097141e-06, "loss": 3.3769, "step": 311 }, { "epoch": 0.5078331637843336, "grad_norm": 8.121070560580142, "learning_rate": 9.85851650618777e-06, "loss": 3.4668, "step": 312 }, { "epoch": 0.5094608341810783, "grad_norm": 8.514264470643433, "learning_rate": 9.856272688760913e-06, "loss": 3.3518, "step": 313 }, { "epoch": 0.511088504577823, "grad_norm": 7.631254471551994, "learning_rate": 9.854011477853147e-06, "loss": 3.3405, "step": 314 }, { "epoch": 0.5127161749745677, "grad_norm": 7.334804568439522, "learning_rate": 9.85173288156334e-06, "loss": 3.2551, "step": 315 }, { "epoch": 0.5143438453713123, "grad_norm": 8.022519981411627, "learning_rate": 9.849436908052636e-06, "loss": 3.4469, "step": 316 }, { "epoch": 0.515971515768057, "grad_norm": 6.945554375378001, "learning_rate": 9.847123565544413e-06, "loss": 3.2992, "step": 317 }, { "epoch": 0.5175991861648016, "grad_norm": 7.617702179325285, "learning_rate": 9.844792862324258e-06, "loss": 3.4455, "step": 318 }, { "epoch": 0.5192268565615463, "grad_norm": 8.044119495055243, "learning_rate": 9.842444806739942e-06, "loss": 3.3871, "step": 319 }, { "epoch": 0.5208545269582909, "grad_norm": 7.010033399116665, "learning_rate": 9.840079407201382e-06, "loss": 3.118, "step": 320 }, { "epoch": 0.5224821973550356, "grad_norm": 7.0606828638076236, "learning_rate": 9.837696672180618e-06, "loss": 3.5321, "step": 321 }, { "epoch": 0.5241098677517803, "grad_norm": 7.416272711444871, "learning_rate": 9.83529661021178e-06, "loss": 3.2956, "step": 322 }, { "epoch": 0.525737538148525, "grad_norm": 8.335810020751083, "learning_rate": 9.832879229891054e-06, "loss": 3.2901, "step": 323 }, { "epoch": 0.5273652085452696, "grad_norm": 7.366556737827219, "learning_rate": 9.830444539876656e-06, "loss": 3.3359, "step": 324 }, { "epoch": 0.5289928789420142, "grad_norm": 8.734280465964437, "learning_rate": 9.827992548888801e-06, "loss": 3.4605, "step": 325 }, { "epoch": 0.5306205493387589, "grad_norm": 8.177089461678898, "learning_rate": 9.825523265709667e-06, "loss": 3.2513, "step": 326 }, { "epoch": 0.5322482197355035, "grad_norm": 7.868304849009075, "learning_rate": 9.82303669918337e-06, "loss": 3.3727, "step": 327 }, { "epoch": 0.5338758901322482, "grad_norm": 8.008479609318403, "learning_rate": 9.820532858215925e-06, "loss": 3.4998, "step": 328 }, { "epoch": 0.5355035605289928, "grad_norm": 7.853169736747938, "learning_rate": 9.81801175177522e-06, "loss": 3.3405, "step": 329 }, { "epoch": 0.5371312309257376, "grad_norm": 7.92424032096779, "learning_rate": 9.815473388890984e-06, "loss": 3.4522, "step": 330 }, { "epoch": 0.5387589013224822, "grad_norm": 7.419729764279102, "learning_rate": 9.81291777865475e-06, "loss": 3.5066, "step": 331 }, { "epoch": 0.5403865717192269, "grad_norm": 8.32541114526704, "learning_rate": 9.810344930219825e-06, "loss": 3.3467, "step": 332 }, { "epoch": 0.5420142421159715, "grad_norm": 7.956646630895741, "learning_rate": 9.807754852801257e-06, "loss": 3.2484, "step": 333 }, { "epoch": 0.5436419125127162, "grad_norm": 7.357937316112977, "learning_rate": 9.805147555675805e-06, "loss": 3.2233, "step": 334 }, { "epoch": 0.5452695829094608, "grad_norm": 7.468731049818511, "learning_rate": 9.802523048181898e-06, "loss": 3.3738, "step": 335 }, { "epoch": 0.5468972533062055, "grad_norm": 7.830756868871445, "learning_rate": 9.799881339719615e-06, "loss": 3.4247, "step": 336 }, { "epoch": 0.5485249237029501, "grad_norm": 7.131323570963607, "learning_rate": 9.797222439750633e-06, "loss": 3.3126, "step": 337 }, { "epoch": 0.5501525940996949, "grad_norm": 7.83203416775292, "learning_rate": 9.79454635779821e-06, "loss": 3.3487, "step": 338 }, { "epoch": 0.5517802644964395, "grad_norm": 7.219952817156139, "learning_rate": 9.79185310344714e-06, "loss": 3.4016, "step": 339 }, { "epoch": 0.5534079348931842, "grad_norm": 7.526934535019579, "learning_rate": 9.789142686343723e-06, "loss": 3.4379, "step": 340 }, { "epoch": 0.5550356052899288, "grad_norm": 8.466843087935207, "learning_rate": 9.786415116195733e-06, "loss": 3.4859, "step": 341 }, { "epoch": 0.5566632756866734, "grad_norm": 7.997590181948238, "learning_rate": 9.78367040277238e-06, "loss": 3.2355, "step": 342 }, { "epoch": 0.5582909460834181, "grad_norm": 7.584762886521113, "learning_rate": 9.780908555904269e-06, "loss": 3.4291, "step": 343 }, { "epoch": 0.5599186164801627, "grad_norm": 7.25726054903936, "learning_rate": 9.778129585483378e-06, "loss": 3.2865, "step": 344 }, { "epoch": 0.5615462868769074, "grad_norm": 6.884386220840884, "learning_rate": 9.775333501463013e-06, "loss": 3.2599, "step": 345 }, { "epoch": 0.563173957273652, "grad_norm": 7.309750523124441, "learning_rate": 9.772520313857777e-06, "loss": 3.3468, "step": 346 }, { "epoch": 0.5648016276703968, "grad_norm": 6.983489396086076, "learning_rate": 9.769690032743526e-06, "loss": 3.2463, "step": 347 }, { "epoch": 0.5664292980671414, "grad_norm": 7.180956370435456, "learning_rate": 9.766842668257348e-06, "loss": 3.382, "step": 348 }, { "epoch": 0.5680569684638861, "grad_norm": 7.246162692037319, "learning_rate": 9.76397823059751e-06, "loss": 3.2761, "step": 349 }, { "epoch": 0.5696846388606307, "grad_norm": 8.192210781412015, "learning_rate": 9.761096730023431e-06, "loss": 3.1233, "step": 350 }, { "epoch": 0.5713123092573754, "grad_norm": 7.06290209270871, "learning_rate": 9.758198176855648e-06, "loss": 3.1796, "step": 351 }, { "epoch": 0.57293997965412, "grad_norm": 7.512316502610631, "learning_rate": 9.755282581475769e-06, "loss": 3.3116, "step": 352 }, { "epoch": 0.5745676500508647, "grad_norm": 6.9746000153253975, "learning_rate": 9.752349954326443e-06, "loss": 3.4463, "step": 353 }, { "epoch": 0.5761953204476093, "grad_norm": 9.17991686735015, "learning_rate": 9.749400305911323e-06, "loss": 3.5176, "step": 354 }, { "epoch": 0.5778229908443541, "grad_norm": 8.351434099854039, "learning_rate": 9.746433646795022e-06, "loss": 3.3427, "step": 355 }, { "epoch": 0.5794506612410987, "grad_norm": 7.466410382893679, "learning_rate": 9.743449987603082e-06, "loss": 3.4139, "step": 356 }, { "epoch": 0.5810783316378433, "grad_norm": 7.619627357629753, "learning_rate": 9.740449339021937e-06, "loss": 3.2086, "step": 357 }, { "epoch": 0.582706002034588, "grad_norm": 7.6962946079888175, "learning_rate": 9.737431711798864e-06, "loss": 3.5036, "step": 358 }, { "epoch": 0.5843336724313326, "grad_norm": 7.415557670644888, "learning_rate": 9.734397116741953e-06, "loss": 3.2697, "step": 359 }, { "epoch": 0.5859613428280773, "grad_norm": 7.877032643098632, "learning_rate": 9.731345564720074e-06, "loss": 3.4693, "step": 360 }, { "epoch": 0.5875890132248219, "grad_norm": 7.39026122271418, "learning_rate": 9.728277066662821e-06, "loss": 3.3939, "step": 361 }, { "epoch": 0.5892166836215667, "grad_norm": 7.1923422252368905, "learning_rate": 9.725191633560492e-06, "loss": 3.4227, "step": 362 }, { "epoch": 0.5908443540183113, "grad_norm": 8.243548317591074, "learning_rate": 9.722089276464032e-06, "loss": 3.5447, "step": 363 }, { "epoch": 0.592472024415056, "grad_norm": 7.855166577500991, "learning_rate": 9.718970006485007e-06, "loss": 3.5117, "step": 364 }, { "epoch": 0.5940996948118006, "grad_norm": 7.247898290882595, "learning_rate": 9.715833834795559e-06, "loss": 3.4683, "step": 365 }, { "epoch": 0.5957273652085453, "grad_norm": 8.600446748205766, "learning_rate": 9.712680772628365e-06, "loss": 3.4928, "step": 366 }, { "epoch": 0.5973550356052899, "grad_norm": 8.16363040492585, "learning_rate": 9.709510831276596e-06, "loss": 3.5042, "step": 367 }, { "epoch": 0.5989827060020346, "grad_norm": 7.570562730584253, "learning_rate": 9.70632402209388e-06, "loss": 3.318, "step": 368 }, { "epoch": 0.6006103763987792, "grad_norm": 7.346930544413185, "learning_rate": 9.703120356494265e-06, "loss": 3.3489, "step": 369 }, { "epoch": 0.602238046795524, "grad_norm": 7.9689513539184675, "learning_rate": 9.69989984595216e-06, "loss": 3.4668, "step": 370 }, { "epoch": 0.6038657171922686, "grad_norm": 8.180745842490607, "learning_rate": 9.69666250200232e-06, "loss": 3.3913, "step": 371 }, { "epoch": 0.6054933875890133, "grad_norm": 7.855297100998057, "learning_rate": 9.693408336239784e-06, "loss": 3.2962, "step": 372 }, { "epoch": 0.6071210579857579, "grad_norm": 7.986243887740391, "learning_rate": 9.690137360319842e-06, "loss": 3.45, "step": 373 }, { "epoch": 0.6087487283825025, "grad_norm": 8.24353940511427, "learning_rate": 9.686849585957995e-06, "loss": 3.4266, "step": 374 }, { "epoch": 0.6103763987792472, "grad_norm": 7.57578103013256, "learning_rate": 9.683545024929905e-06, "loss": 3.3939, "step": 375 }, { "epoch": 0.6120040691759918, "grad_norm": 7.377016057837269, "learning_rate": 9.680223689071364e-06, "loss": 3.3152, "step": 376 }, { "epoch": 0.6136317395727365, "grad_norm": 7.100277190347212, "learning_rate": 9.676885590278243e-06, "loss": 3.2764, "step": 377 }, { "epoch": 0.6152594099694811, "grad_norm": 7.3246017440029645, "learning_rate": 9.673530740506447e-06, "loss": 3.4448, "step": 378 }, { "epoch": 0.6168870803662259, "grad_norm": 8.063760610849624, "learning_rate": 9.670159151771887e-06, "loss": 3.2305, "step": 379 }, { "epoch": 0.6185147507629705, "grad_norm": 8.0211730015538, "learning_rate": 9.66677083615042e-06, "loss": 3.3705, "step": 380 }, { "epoch": 0.6201424211597152, "grad_norm": 8.085159137803078, "learning_rate": 9.663365805777815e-06, "loss": 3.4601, "step": 381 }, { "epoch": 0.6217700915564598, "grad_norm": 7.499214200056498, "learning_rate": 9.659944072849708e-06, "loss": 3.3883, "step": 382 }, { "epoch": 0.6233977619532045, "grad_norm": 7.184330837912677, "learning_rate": 9.656505649621555e-06, "loss": 3.2017, "step": 383 }, { "epoch": 0.6250254323499491, "grad_norm": 8.423547785916426, "learning_rate": 9.653050548408594e-06, "loss": 3.2402, "step": 384 }, { "epoch": 0.6266531027466938, "grad_norm": 7.632881243217823, "learning_rate": 9.649578781585798e-06, "loss": 3.3529, "step": 385 }, { "epoch": 0.6282807731434384, "grad_norm": 7.2204221081558755, "learning_rate": 9.646090361587828e-06, "loss": 3.2741, "step": 386 }, { "epoch": 0.6299084435401832, "grad_norm": 7.512638770952979, "learning_rate": 9.64258530090899e-06, "loss": 3.3874, "step": 387 }, { "epoch": 0.6315361139369278, "grad_norm": 7.455006539885303, "learning_rate": 9.639063612103199e-06, "loss": 3.5159, "step": 388 }, { "epoch": 0.6331637843336725, "grad_norm": 7.609871227660504, "learning_rate": 9.635525307783914e-06, "loss": 3.2644, "step": 389 }, { "epoch": 0.6347914547304171, "grad_norm": 7.114300237079663, "learning_rate": 9.631970400624114e-06, "loss": 3.4031, "step": 390 }, { "epoch": 0.6364191251271617, "grad_norm": 6.993576149513928, "learning_rate": 9.628398903356239e-06, "loss": 3.224, "step": 391 }, { "epoch": 0.6380467955239064, "grad_norm": 7.508913490586774, "learning_rate": 9.624810828772156e-06, "loss": 3.3377, "step": 392 }, { "epoch": 0.639674465920651, "grad_norm": 7.686962303049194, "learning_rate": 9.621206189723097e-06, "loss": 3.3858, "step": 393 }, { "epoch": 0.6413021363173957, "grad_norm": 7.6029537857541865, "learning_rate": 9.617584999119624e-06, "loss": 3.4554, "step": 394 }, { "epoch": 0.6429298067141404, "grad_norm": 6.862365430562035, "learning_rate": 9.613947269931587e-06, "loss": 3.3671, "step": 395 }, { "epoch": 0.6445574771108851, "grad_norm": 7.690267351103006, "learning_rate": 9.610293015188067e-06, "loss": 3.1934, "step": 396 }, { "epoch": 0.6461851475076297, "grad_norm": 8.845518354691988, "learning_rate": 9.606622247977336e-06, "loss": 3.327, "step": 397 }, { "epoch": 0.6478128179043744, "grad_norm": 7.289156814622173, "learning_rate": 9.602934981446804e-06, "loss": 3.4498, "step": 398 }, { "epoch": 0.649440488301119, "grad_norm": 7.813695343635967, "learning_rate": 9.599231228802977e-06, "loss": 3.3282, "step": 399 }, { "epoch": 0.6510681586978637, "grad_norm": 7.935903648696806, "learning_rate": 9.595511003311411e-06, "loss": 3.4546, "step": 400 }, { "epoch": 0.6526958290946083, "grad_norm": 8.508661225317464, "learning_rate": 9.591774318296661e-06, "loss": 3.2357, "step": 401 }, { "epoch": 0.654323499491353, "grad_norm": 6.989974294123776, "learning_rate": 9.588021187142236e-06, "loss": 3.3221, "step": 402 }, { "epoch": 0.6559511698880977, "grad_norm": 9.250615075354615, "learning_rate": 9.584251623290543e-06, "loss": 3.3714, "step": 403 }, { "epoch": 0.6575788402848424, "grad_norm": 7.04154670706104, "learning_rate": 9.580465640242852e-06, "loss": 3.2862, "step": 404 }, { "epoch": 0.659206510681587, "grad_norm": 6.748218348516812, "learning_rate": 9.576663251559239e-06, "loss": 3.3596, "step": 405 }, { "epoch": 0.6608341810783316, "grad_norm": 7.9001708990698445, "learning_rate": 9.572844470858537e-06, "loss": 3.311, "step": 406 }, { "epoch": 0.6624618514750763, "grad_norm": 7.504739492727831, "learning_rate": 9.569009311818292e-06, "loss": 3.2734, "step": 407 }, { "epoch": 0.6640895218718209, "grad_norm": 7.0317882645570045, "learning_rate": 9.565157788174713e-06, "loss": 3.2319, "step": 408 }, { "epoch": 0.6657171922685656, "grad_norm": 7.7230546457524385, "learning_rate": 9.561289913722614e-06, "loss": 3.3699, "step": 409 }, { "epoch": 0.6673448626653102, "grad_norm": 8.00506572674308, "learning_rate": 9.55740570231538e-06, "loss": 3.315, "step": 410 }, { "epoch": 0.668972533062055, "grad_norm": 7.1399362686426215, "learning_rate": 9.55350516786491e-06, "loss": 3.2482, "step": 411 }, { "epoch": 0.6706002034587996, "grad_norm": 7.239203943569414, "learning_rate": 9.549588324341555e-06, "loss": 3.241, "step": 412 }, { "epoch": 0.6722278738555443, "grad_norm": 7.248659316576052, "learning_rate": 9.545655185774093e-06, "loss": 3.1992, "step": 413 }, { "epoch": 0.6738555442522889, "grad_norm": 7.661284323028834, "learning_rate": 9.541705766249654e-06, "loss": 3.2861, "step": 414 }, { "epoch": 0.6754832146490336, "grad_norm": 6.646240746027127, "learning_rate": 9.537740079913692e-06, "loss": 3.3663, "step": 415 }, { "epoch": 0.6771108850457782, "grad_norm": 7.393777035866731, "learning_rate": 9.533758140969913e-06, "loss": 3.3384, "step": 416 }, { "epoch": 0.6787385554425229, "grad_norm": 7.301397977454081, "learning_rate": 9.529759963680242e-06, "loss": 3.2429, "step": 417 }, { "epoch": 0.6803662258392675, "grad_norm": 7.679983153200053, "learning_rate": 9.525745562364756e-06, "loss": 3.4312, "step": 418 }, { "epoch": 0.6819938962360123, "grad_norm": 9.362800560254156, "learning_rate": 9.521714951401649e-06, "loss": 3.4227, "step": 419 }, { "epoch": 0.6836215666327569, "grad_norm": 7.989244328012467, "learning_rate": 9.517668145227166e-06, "loss": 3.2158, "step": 420 }, { "epoch": 0.6852492370295016, "grad_norm": 7.557194197513166, "learning_rate": 9.513605158335562e-06, "loss": 3.3916, "step": 421 }, { "epoch": 0.6868769074262462, "grad_norm": 6.972747065863008, "learning_rate": 9.509526005279044e-06, "loss": 3.1776, "step": 422 }, { "epoch": 0.6885045778229908, "grad_norm": 8.141277781191524, "learning_rate": 9.505430700667717e-06, "loss": 3.4864, "step": 423 }, { "epoch": 0.6901322482197355, "grad_norm": 7.7807593131318695, "learning_rate": 9.501319259169544e-06, "loss": 3.4446, "step": 424 }, { "epoch": 0.6917599186164801, "grad_norm": 7.164263614114379, "learning_rate": 9.497191695510274e-06, "loss": 3.1658, "step": 425 }, { "epoch": 0.6933875890132248, "grad_norm": 8.045042632950862, "learning_rate": 9.493048024473413e-06, "loss": 3.3302, "step": 426 }, { "epoch": 0.6950152594099694, "grad_norm": 7.142902941773968, "learning_rate": 9.488888260900143e-06, "loss": 3.2105, "step": 427 }, { "epoch": 0.6966429298067142, "grad_norm": 6.917305181424775, "learning_rate": 9.484712419689293e-06, "loss": 3.4375, "step": 428 }, { "epoch": 0.6982706002034588, "grad_norm": 6.956442368377462, "learning_rate": 9.480520515797276e-06, "loss": 3.2874, "step": 429 }, { "epoch": 0.6998982706002035, "grad_norm": 7.638006144380143, "learning_rate": 9.476312564238035e-06, "loss": 3.4581, "step": 430 }, { "epoch": 0.7015259409969481, "grad_norm": 7.30211701032896, "learning_rate": 9.472088580082991e-06, "loss": 3.4284, "step": 431 }, { "epoch": 0.7031536113936928, "grad_norm": 7.39020516547575, "learning_rate": 9.467848578460986e-06, "loss": 3.3145, "step": 432 }, { "epoch": 0.7047812817904374, "grad_norm": 8.890192812375014, "learning_rate": 9.46359257455823e-06, "loss": 3.5721, "step": 433 }, { "epoch": 0.7064089521871821, "grad_norm": 7.615964358490253, "learning_rate": 9.459320583618253e-06, "loss": 3.3708, "step": 434 }, { "epoch": 0.7080366225839267, "grad_norm": 6.742408295182748, "learning_rate": 9.45503262094184e-06, "loss": 3.3519, "step": 435 }, { "epoch": 0.7096642929806715, "grad_norm": 6.9244680539585595, "learning_rate": 9.450728701886985e-06, "loss": 3.4615, "step": 436 }, { "epoch": 0.7112919633774161, "grad_norm": 7.333645495362927, "learning_rate": 9.446408841868826e-06, "loss": 3.3122, "step": 437 }, { "epoch": 0.7129196337741607, "grad_norm": 6.951458414298065, "learning_rate": 9.442073056359604e-06, "loss": 3.1635, "step": 438 }, { "epoch": 0.7145473041709054, "grad_norm": 7.7134032322899895, "learning_rate": 9.437721360888594e-06, "loss": 3.4587, "step": 439 }, { "epoch": 0.71617497456765, "grad_norm": 7.687316956339726, "learning_rate": 9.433353771042059e-06, "loss": 3.4594, "step": 440 }, { "epoch": 0.7178026449643947, "grad_norm": 7.783351619856221, "learning_rate": 9.428970302463185e-06, "loss": 3.3603, "step": 441 }, { "epoch": 0.7194303153611393, "grad_norm": 7.438339813716801, "learning_rate": 9.424570970852035e-06, "loss": 3.3401, "step": 442 }, { "epoch": 0.721057985757884, "grad_norm": 6.768257259078661, "learning_rate": 9.420155791965482e-06, "loss": 3.1814, "step": 443 }, { "epoch": 0.7226856561546287, "grad_norm": 7.550139059011003, "learning_rate": 9.415724781617166e-06, "loss": 3.4597, "step": 444 }, { "epoch": 0.7243133265513734, "grad_norm": 7.278550339594999, "learning_rate": 9.411277955677425e-06, "loss": 3.2336, "step": 445 }, { "epoch": 0.725940996948118, "grad_norm": 8.345743949792949, "learning_rate": 9.406815330073244e-06, "loss": 3.0355, "step": 446 }, { "epoch": 0.7275686673448627, "grad_norm": 7.10154072618765, "learning_rate": 9.402336920788197e-06, "loss": 3.3129, "step": 447 }, { "epoch": 0.7291963377416073, "grad_norm": 7.0611644267086975, "learning_rate": 9.397842743862391e-06, "loss": 3.3575, "step": 448 }, { "epoch": 0.730824008138352, "grad_norm": 7.287445729892704, "learning_rate": 9.393332815392402e-06, "loss": 3.3075, "step": 449 }, { "epoch": 0.7324516785350966, "grad_norm": 7.8030484428459586, "learning_rate": 9.38880715153123e-06, "loss": 3.3825, "step": 450 }, { "epoch": 0.7340793489318413, "grad_norm": 7.747695388776855, "learning_rate": 9.384265768488226e-06, "loss": 3.4152, "step": 451 }, { "epoch": 0.735707019328586, "grad_norm": 7.115293269179921, "learning_rate": 9.37970868252905e-06, "loss": 3.3415, "step": 452 }, { "epoch": 0.7373346897253307, "grad_norm": 7.341676529922958, "learning_rate": 9.375135909975596e-06, "loss": 3.4284, "step": 453 }, { "epoch": 0.7389623601220753, "grad_norm": 8.215489901852376, "learning_rate": 9.37054746720595e-06, "loss": 3.4572, "step": 454 }, { "epoch": 0.7405900305188199, "grad_norm": 7.218439851196501, "learning_rate": 9.365943370654315e-06, "loss": 3.1738, "step": 455 }, { "epoch": 0.7422177009155646, "grad_norm": 7.74151842975269, "learning_rate": 9.36132363681097e-06, "loss": 3.2454, "step": 456 }, { "epoch": 0.7438453713123092, "grad_norm": 7.711923950381008, "learning_rate": 9.356688282222191e-06, "loss": 3.2098, "step": 457 }, { "epoch": 0.7454730417090539, "grad_norm": 6.786245140861336, "learning_rate": 9.352037323490208e-06, "loss": 3.1995, "step": 458 }, { "epoch": 0.7471007121057985, "grad_norm": 7.618412503942862, "learning_rate": 9.347370777273141e-06, "loss": 3.4121, "step": 459 }, { "epoch": 0.7487283825025433, "grad_norm": 7.80795033840699, "learning_rate": 9.342688660284935e-06, "loss": 3.4904, "step": 460 }, { "epoch": 0.7503560528992879, "grad_norm": 7.114047663489838, "learning_rate": 9.337990989295306e-06, "loss": 3.454, "step": 461 }, { "epoch": 0.7519837232960326, "grad_norm": 6.717522322255384, "learning_rate": 9.333277781129679e-06, "loss": 3.1772, "step": 462 }, { "epoch": 0.7536113936927772, "grad_norm": 7.266596051653623, "learning_rate": 9.328549052669127e-06, "loss": 3.4471, "step": 463 }, { "epoch": 0.7552390640895219, "grad_norm": 7.245821830638635, "learning_rate": 9.323804820850311e-06, "loss": 3.3159, "step": 464 }, { "epoch": 0.7568667344862665, "grad_norm": 6.850071814564889, "learning_rate": 9.319045102665421e-06, "loss": 3.2142, "step": 465 }, { "epoch": 0.7584944048830112, "grad_norm": 6.5772765685448755, "learning_rate": 9.314269915162115e-06, "loss": 3.4254, "step": 466 }, { "epoch": 0.7601220752797558, "grad_norm": 8.119818891835823, "learning_rate": 9.309479275443453e-06, "loss": 3.3537, "step": 467 }, { "epoch": 0.7617497456765006, "grad_norm": 7.323038475132149, "learning_rate": 9.304673200667842e-06, "loss": 3.3186, "step": 468 }, { "epoch": 0.7633774160732452, "grad_norm": 7.897487285363906, "learning_rate": 9.299851708048972e-06, "loss": 3.3536, "step": 469 }, { "epoch": 0.7650050864699899, "grad_norm": 7.749646562069135, "learning_rate": 9.295014814855755e-06, "loss": 3.3205, "step": 470 }, { "epoch": 0.7666327568667345, "grad_norm": 7.524571492085422, "learning_rate": 9.290162538412257e-06, "loss": 3.3604, "step": 471 }, { "epoch": 0.7682604272634791, "grad_norm": 7.702753958769076, "learning_rate": 9.285294896097648e-06, "loss": 3.1374, "step": 472 }, { "epoch": 0.7698880976602238, "grad_norm": 7.442472716246709, "learning_rate": 9.280411905346131e-06, "loss": 3.3248, "step": 473 }, { "epoch": 0.7715157680569684, "grad_norm": 6.766665953346373, "learning_rate": 9.275513583646885e-06, "loss": 3.2657, "step": 474 }, { "epoch": 0.7731434384537131, "grad_norm": 6.894237837913607, "learning_rate": 9.270599948543991e-06, "loss": 3.4561, "step": 475 }, { "epoch": 0.7747711088504577, "grad_norm": 6.939044813142089, "learning_rate": 9.265671017636384e-06, "loss": 3.3846, "step": 476 }, { "epoch": 0.7763987792472025, "grad_norm": 7.446412973230784, "learning_rate": 9.260726808577782e-06, "loss": 3.239, "step": 477 }, { "epoch": 0.7780264496439471, "grad_norm": 6.784972903753985, "learning_rate": 9.255767339076622e-06, "loss": 3.2571, "step": 478 }, { "epoch": 0.7796541200406918, "grad_norm": 6.986549112164136, "learning_rate": 9.250792626896e-06, "loss": 3.251, "step": 479 }, { "epoch": 0.7812817904374364, "grad_norm": 6.795902772878749, "learning_rate": 9.24580268985361e-06, "loss": 3.2745, "step": 480 }, { "epoch": 0.7829094608341811, "grad_norm": 7.21804402541467, "learning_rate": 9.240797545821666e-06, "loss": 3.3427, "step": 481 }, { "epoch": 0.7845371312309257, "grad_norm": 7.548632194136501, "learning_rate": 9.235777212726859e-06, "loss": 3.3444, "step": 482 }, { "epoch": 0.7861648016276704, "grad_norm": 7.672805149149543, "learning_rate": 9.230741708550275e-06, "loss": 3.2639, "step": 483 }, { "epoch": 0.787792472024415, "grad_norm": 6.674234416204953, "learning_rate": 9.225691051327341e-06, "loss": 3.3031, "step": 484 }, { "epoch": 0.7894201424211598, "grad_norm": 7.591220825688657, "learning_rate": 9.220625259147754e-06, "loss": 3.4916, "step": 485 }, { "epoch": 0.7910478128179044, "grad_norm": 7.881138901607985, "learning_rate": 9.215544350155423e-06, "loss": 3.3908, "step": 486 }, { "epoch": 0.792675483214649, "grad_norm": 7.136893957816425, "learning_rate": 9.210448342548396e-06, "loss": 3.4099, "step": 487 }, { "epoch": 0.7943031536113937, "grad_norm": 6.99323189642892, "learning_rate": 9.205337254578803e-06, "loss": 3.2926, "step": 488 }, { "epoch": 0.7959308240081383, "grad_norm": 7.052029247775543, "learning_rate": 9.200211104552786e-06, "loss": 3.372, "step": 489 }, { "epoch": 0.797558494404883, "grad_norm": 6.973997809853156, "learning_rate": 9.195069910830428e-06, "loss": 3.3314, "step": 490 }, { "epoch": 0.7991861648016276, "grad_norm": 7.0063625529000015, "learning_rate": 9.1899136918257e-06, "loss": 3.3922, "step": 491 }, { "epoch": 0.8008138351983723, "grad_norm": 7.603195429028454, "learning_rate": 9.18474246600639e-06, "loss": 3.2695, "step": 492 }, { "epoch": 0.802441505595117, "grad_norm": 7.327564512954984, "learning_rate": 9.179556251894025e-06, "loss": 3.3032, "step": 493 }, { "epoch": 0.8040691759918617, "grad_norm": 7.342480416108183, "learning_rate": 9.174355068063827e-06, "loss": 3.4952, "step": 494 }, { "epoch": 0.8056968463886063, "grad_norm": 7.479952238773796, "learning_rate": 9.169138933144626e-06, "loss": 3.2611, "step": 495 }, { "epoch": 0.807324516785351, "grad_norm": 7.145363230767157, "learning_rate": 9.163907865818806e-06, "loss": 3.2688, "step": 496 }, { "epoch": 0.8089521871820956, "grad_norm": 7.177388242027909, "learning_rate": 9.15866188482223e-06, "loss": 3.3932, "step": 497 }, { "epoch": 0.8105798575788403, "grad_norm": 7.4502092647992795, "learning_rate": 9.15340100894418e-06, "loss": 3.3557, "step": 498 }, { "epoch": 0.8122075279755849, "grad_norm": 6.74017864741359, "learning_rate": 9.148125257027286e-06, "loss": 3.3704, "step": 499 }, { "epoch": 0.8138351983723296, "grad_norm": 6.875534087006057, "learning_rate": 9.142834647967455e-06, "loss": 3.3972, "step": 500 }, { "epoch": 0.8154628687690743, "grad_norm": 7.16118904372872, "learning_rate": 9.137529200713811e-06, "loss": 3.2626, "step": 501 }, { "epoch": 0.817090539165819, "grad_norm": 7.26623184023163, "learning_rate": 9.132208934268622e-06, "loss": 3.2718, "step": 502 }, { "epoch": 0.8187182095625636, "grad_norm": 7.740399999499181, "learning_rate": 9.126873867687234e-06, "loss": 3.3096, "step": 503 }, { "epoch": 0.8203458799593082, "grad_norm": 7.251434955796277, "learning_rate": 9.121524020078003e-06, "loss": 3.3364, "step": 504 }, { "epoch": 0.8219735503560529, "grad_norm": 7.330538002292107, "learning_rate": 9.116159410602218e-06, "loss": 3.3065, "step": 505 }, { "epoch": 0.8236012207527975, "grad_norm": 8.169987497982, "learning_rate": 9.110780058474052e-06, "loss": 3.2754, "step": 506 }, { "epoch": 0.8252288911495422, "grad_norm": 6.659092766249537, "learning_rate": 9.105385982960468e-06, "loss": 3.2955, "step": 507 }, { "epoch": 0.8268565615462868, "grad_norm": 7.0217787062193135, "learning_rate": 9.099977203381176e-06, "loss": 3.2372, "step": 508 }, { "epoch": 0.8284842319430316, "grad_norm": 7.207356165887921, "learning_rate": 9.094553739108543e-06, "loss": 3.311, "step": 509 }, { "epoch": 0.8301119023397762, "grad_norm": 7.342761592691962, "learning_rate": 9.08911560956753e-06, "loss": 3.4706, "step": 510 }, { "epoch": 0.8317395727365209, "grad_norm": 6.969985900796636, "learning_rate": 9.08366283423563e-06, "loss": 3.1098, "step": 511 }, { "epoch": 0.8333672431332655, "grad_norm": 7.143010570490118, "learning_rate": 9.078195432642788e-06, "loss": 3.2715, "step": 512 }, { "epoch": 0.8349949135300102, "grad_norm": 7.4308247770173965, "learning_rate": 9.072713424371336e-06, "loss": 3.3815, "step": 513 }, { "epoch": 0.8366225839267548, "grad_norm": 6.739364518464837, "learning_rate": 9.067216829055922e-06, "loss": 3.153, "step": 514 }, { "epoch": 0.8382502543234995, "grad_norm": 7.369511880718046, "learning_rate": 9.06170566638344e-06, "loss": 3.2464, "step": 515 }, { "epoch": 0.8398779247202441, "grad_norm": 8.321854180241262, "learning_rate": 9.056179956092961e-06, "loss": 3.3327, "step": 516 }, { "epoch": 0.8415055951169889, "grad_norm": 7.019592424030351, "learning_rate": 9.05063971797566e-06, "loss": 3.2724, "step": 517 }, { "epoch": 0.8431332655137335, "grad_norm": 6.816928294667129, "learning_rate": 9.045084971874738e-06, "loss": 3.1388, "step": 518 }, { "epoch": 0.8447609359104782, "grad_norm": 6.7595134156551575, "learning_rate": 9.03951573768537e-06, "loss": 3.4121, "step": 519 }, { "epoch": 0.8463886063072228, "grad_norm": 7.375484622838033, "learning_rate": 9.033932035354616e-06, "loss": 3.3493, "step": 520 }, { "epoch": 0.8480162767039674, "grad_norm": 6.9081950829521865, "learning_rate": 9.028333884881357e-06, "loss": 3.4189, "step": 521 }, { "epoch": 0.8496439471007121, "grad_norm": 7.855936484603847, "learning_rate": 9.022721306316223e-06, "loss": 3.351, "step": 522 }, { "epoch": 0.8512716174974567, "grad_norm": 8.445126331856418, "learning_rate": 9.017094319761516e-06, "loss": 3.5481, "step": 523 }, { "epoch": 0.8528992878942014, "grad_norm": 6.846097933240796, "learning_rate": 9.011452945371154e-06, "loss": 3.1693, "step": 524 }, { "epoch": 0.854526958290946, "grad_norm": 6.757828571713359, "learning_rate": 9.00579720335057e-06, "loss": 3.3543, "step": 525 }, { "epoch": 0.8561546286876908, "grad_norm": 7.5759613041258875, "learning_rate": 9.000127113956673e-06, "loss": 3.2174, "step": 526 }, { "epoch": 0.8577822990844354, "grad_norm": 7.608237954852521, "learning_rate": 8.994442697497749e-06, "loss": 3.2325, "step": 527 }, { "epoch": 0.8594099694811801, "grad_norm": 6.959183691875813, "learning_rate": 8.988743974333405e-06, "loss": 3.3466, "step": 528 }, { "epoch": 0.8610376398779247, "grad_norm": 7.394233625535276, "learning_rate": 8.983030964874484e-06, "loss": 3.3621, "step": 529 }, { "epoch": 0.8626653102746694, "grad_norm": 7.8873117919618245, "learning_rate": 8.977303689583e-06, "loss": 3.3213, "step": 530 }, { "epoch": 0.864292980671414, "grad_norm": 7.393036226915921, "learning_rate": 8.971562168972065e-06, "loss": 3.3013, "step": 531 }, { "epoch": 0.8659206510681587, "grad_norm": 7.1214070196074735, "learning_rate": 8.965806423605808e-06, "loss": 3.3246, "step": 532 }, { "epoch": 0.8675483214649033, "grad_norm": 7.127760791067227, "learning_rate": 8.96003647409931e-06, "loss": 3.2993, "step": 533 }, { "epoch": 0.8691759918616481, "grad_norm": 6.97090964272378, "learning_rate": 8.954252341118523e-06, "loss": 3.1042, "step": 534 }, { "epoch": 0.8708036622583927, "grad_norm": 6.632314295146176, "learning_rate": 8.948454045380203e-06, "loss": 3.4169, "step": 535 }, { "epoch": 0.8724313326551373, "grad_norm": 7.197007816675904, "learning_rate": 8.94264160765183e-06, "loss": 3.3779, "step": 536 }, { "epoch": 0.874059003051882, "grad_norm": 6.854776993339331, "learning_rate": 8.936815048751533e-06, "loss": 3.1356, "step": 537 }, { "epoch": 0.8756866734486266, "grad_norm": 7.23116069486393, "learning_rate": 8.930974389548023e-06, "loss": 3.2694, "step": 538 }, { "epoch": 0.8773143438453713, "grad_norm": 6.973670444955071, "learning_rate": 8.925119650960514e-06, "loss": 3.3936, "step": 539 }, { "epoch": 0.8789420142421159, "grad_norm": 6.760973966475414, "learning_rate": 8.919250853958639e-06, "loss": 3.2666, "step": 540 }, { "epoch": 0.8805696846388607, "grad_norm": 7.239614739987012, "learning_rate": 8.913368019562391e-06, "loss": 3.3368, "step": 541 }, { "epoch": 0.8821973550356053, "grad_norm": 7.587052549074661, "learning_rate": 8.907471168842042e-06, "loss": 3.3764, "step": 542 }, { "epoch": 0.88382502543235, "grad_norm": 6.808531362168606, "learning_rate": 8.901560322918057e-06, "loss": 3.3139, "step": 543 }, { "epoch": 0.8854526958290946, "grad_norm": 7.72615166904643, "learning_rate": 8.895635502961033e-06, "loss": 3.3175, "step": 544 }, { "epoch": 0.8870803662258393, "grad_norm": 7.138221169682668, "learning_rate": 8.889696730191618e-06, "loss": 3.3353, "step": 545 }, { "epoch": 0.8887080366225839, "grad_norm": 7.336845578934267, "learning_rate": 8.883744025880429e-06, "loss": 3.3636, "step": 546 }, { "epoch": 0.8903357070193286, "grad_norm": 6.781296836835196, "learning_rate": 8.877777411347985e-06, "loss": 3.3728, "step": 547 }, { "epoch": 0.8919633774160732, "grad_norm": 6.997800095784034, "learning_rate": 8.871796907964626e-06, "loss": 3.2454, "step": 548 }, { "epoch": 0.893591047812818, "grad_norm": 7.347590007451053, "learning_rate": 8.865802537150436e-06, "loss": 3.2602, "step": 549 }, { "epoch": 0.8952187182095626, "grad_norm": 6.766329487975185, "learning_rate": 8.859794320375169e-06, "loss": 3.3329, "step": 550 }, { "epoch": 0.8968463886063073, "grad_norm": 6.555304400709218, "learning_rate": 8.853772279158166e-06, "loss": 3.2368, "step": 551 }, { "epoch": 0.8984740590030519, "grad_norm": 7.537574256208805, "learning_rate": 8.847736435068289e-06, "loss": 3.4044, "step": 552 }, { "epoch": 0.9001017293997965, "grad_norm": 6.790960769138448, "learning_rate": 8.841686809723833e-06, "loss": 3.2657, "step": 553 }, { "epoch": 0.9017293997965412, "grad_norm": 7.52795102603558, "learning_rate": 8.835623424792453e-06, "loss": 3.4236, "step": 554 }, { "epoch": 0.9033570701932858, "grad_norm": 7.660485984016413, "learning_rate": 8.829546301991086e-06, "loss": 3.4375, "step": 555 }, { "epoch": 0.9049847405900305, "grad_norm": 6.751336051297821, "learning_rate": 8.823455463085873e-06, "loss": 3.2916, "step": 556 }, { "epoch": 0.9066124109867751, "grad_norm": 6.688589434778656, "learning_rate": 8.817350929892086e-06, "loss": 3.5333, "step": 557 }, { "epoch": 0.9082400813835199, "grad_norm": 7.060388391561153, "learning_rate": 8.811232724274035e-06, "loss": 3.285, "step": 558 }, { "epoch": 0.9098677517802645, "grad_norm": 7.780579070867769, "learning_rate": 8.805100868145011e-06, "loss": 3.4321, "step": 559 }, { "epoch": 0.9114954221770092, "grad_norm": 7.09463663322348, "learning_rate": 8.798955383467189e-06, "loss": 3.3415, "step": 560 }, { "epoch": 0.9131230925737538, "grad_norm": 7.2965807039798, "learning_rate": 8.79279629225156e-06, "loss": 3.3738, "step": 561 }, { "epoch": 0.9147507629704985, "grad_norm": 6.902351064736495, "learning_rate": 8.786623616557848e-06, "loss": 3.1983, "step": 562 }, { "epoch": 0.9163784333672431, "grad_norm": 7.069301429075778, "learning_rate": 8.780437378494432e-06, "loss": 3.3246, "step": 563 }, { "epoch": 0.9180061037639878, "grad_norm": 7.488962120848072, "learning_rate": 8.774237600218266e-06, "loss": 3.4033, "step": 564 }, { "epoch": 0.9196337741607324, "grad_norm": 7.402164684540642, "learning_rate": 8.768024303934802e-06, "loss": 3.3579, "step": 565 }, { "epoch": 0.9212614445574772, "grad_norm": 7.761888619934395, "learning_rate": 8.761797511897907e-06, "loss": 3.337, "step": 566 }, { "epoch": 0.9228891149542218, "grad_norm": 6.8185441070629915, "learning_rate": 8.755557246409788e-06, "loss": 3.1057, "step": 567 }, { "epoch": 0.9245167853509664, "grad_norm": 6.625867413771893, "learning_rate": 8.749303529820903e-06, "loss": 3.2069, "step": 568 }, { "epoch": 0.9261444557477111, "grad_norm": 7.197672598572348, "learning_rate": 8.743036384529893e-06, "loss": 3.3654, "step": 569 }, { "epoch": 0.9277721261444557, "grad_norm": 6.804668989224706, "learning_rate": 8.736755832983497e-06, "loss": 3.206, "step": 570 }, { "epoch": 0.9293997965412004, "grad_norm": 7.560538666343285, "learning_rate": 8.730461897676463e-06, "loss": 3.2353, "step": 571 }, { "epoch": 0.931027466937945, "grad_norm": 6.465303853614732, "learning_rate": 8.724154601151484e-06, "loss": 3.1166, "step": 572 }, { "epoch": 0.9326551373346897, "grad_norm": 7.3463271412624636, "learning_rate": 8.7178339659991e-06, "loss": 3.4365, "step": 573 }, { "epoch": 0.9342828077314344, "grad_norm": 6.880551013824854, "learning_rate": 8.711500014857635e-06, "loss": 3.2501, "step": 574 }, { "epoch": 0.9359104781281791, "grad_norm": 7.002362881517669, "learning_rate": 8.705152770413094e-06, "loss": 3.4504, "step": 575 }, { "epoch": 0.9375381485249237, "grad_norm": 7.1569940942359205, "learning_rate": 8.698792255399104e-06, "loss": 3.3469, "step": 576 }, { "epoch": 0.9391658189216684, "grad_norm": 8.26661262340222, "learning_rate": 8.69241849259682e-06, "loss": 3.4496, "step": 577 }, { "epoch": 0.940793489318413, "grad_norm": 6.831881904006422, "learning_rate": 8.686031504834843e-06, "loss": 3.3166, "step": 578 }, { "epoch": 0.9424211597151577, "grad_norm": 7.208298872608186, "learning_rate": 8.679631314989143e-06, "loss": 3.3705, "step": 579 }, { "epoch": 0.9440488301119023, "grad_norm": 7.014868276152848, "learning_rate": 8.673217945982979e-06, "loss": 3.2168, "step": 580 }, { "epoch": 0.945676500508647, "grad_norm": 7.278922035372799, "learning_rate": 8.666791420786805e-06, "loss": 3.4682, "step": 581 }, { "epoch": 0.9473041709053917, "grad_norm": 7.429270172859412, "learning_rate": 8.660351762418203e-06, "loss": 3.4067, "step": 582 }, { "epoch": 0.9489318413021364, "grad_norm": 7.270844575138131, "learning_rate": 8.653898993941791e-06, "loss": 3.2512, "step": 583 }, { "epoch": 0.950559511698881, "grad_norm": 6.360758549654599, "learning_rate": 8.647433138469145e-06, "loss": 3.1201, "step": 584 }, { "epoch": 0.9521871820956256, "grad_norm": 6.939150123503346, "learning_rate": 8.640954219158708e-06, "loss": 3.1885, "step": 585 }, { "epoch": 0.9538148524923703, "grad_norm": 6.545556114327753, "learning_rate": 8.634462259215719e-06, "loss": 3.3672, "step": 586 }, { "epoch": 0.9554425228891149, "grad_norm": 6.819403246888161, "learning_rate": 8.627957281892123e-06, "loss": 3.3973, "step": 587 }, { "epoch": 0.9570701932858596, "grad_norm": 8.186077981639842, "learning_rate": 8.621439310486486e-06, "loss": 3.4302, "step": 588 }, { "epoch": 0.9586978636826042, "grad_norm": 7.058148308619134, "learning_rate": 8.61490836834392e-06, "loss": 3.3399, "step": 589 }, { "epoch": 0.960325534079349, "grad_norm": 6.941885203767873, "learning_rate": 8.608364478855984e-06, "loss": 3.5035, "step": 590 }, { "epoch": 0.9619532044760936, "grad_norm": 6.8999946709949915, "learning_rate": 8.60180766546062e-06, "loss": 3.3421, "step": 591 }, { "epoch": 0.9635808748728383, "grad_norm": 6.63371752705906, "learning_rate": 8.595237951642055e-06, "loss": 3.4373, "step": 592 }, { "epoch": 0.9652085452695829, "grad_norm": 7.532777946345135, "learning_rate": 8.588655360930717e-06, "loss": 3.2341, "step": 593 }, { "epoch": 0.9668362156663276, "grad_norm": 7.480100853356731, "learning_rate": 8.58205991690316e-06, "loss": 3.3717, "step": 594 }, { "epoch": 0.9684638860630722, "grad_norm": 6.901955745736511, "learning_rate": 8.575451643181974e-06, "loss": 3.095, "step": 595 }, { "epoch": 0.9700915564598169, "grad_norm": 6.447072952136236, "learning_rate": 8.568830563435695e-06, "loss": 3.3207, "step": 596 }, { "epoch": 0.9717192268565615, "grad_norm": 6.387575902994535, "learning_rate": 8.562196701378734e-06, "loss": 3.3337, "step": 597 }, { "epoch": 0.9733468972533063, "grad_norm": 7.951326285110298, "learning_rate": 8.555550080771273e-06, "loss": 3.4992, "step": 598 }, { "epoch": 0.9749745676500509, "grad_norm": 7.678935652285871, "learning_rate": 8.548890725419204e-06, "loss": 3.4163, "step": 599 }, { "epoch": 0.9766022380467956, "grad_norm": 7.558529149864469, "learning_rate": 8.542218659174018e-06, "loss": 3.3028, "step": 600 }, { "epoch": 0.9782299084435402, "grad_norm": 7.263340186953341, "learning_rate": 8.535533905932739e-06, "loss": 3.2442, "step": 601 }, { "epoch": 0.9798575788402848, "grad_norm": 6.703077769935083, "learning_rate": 8.528836489637828e-06, "loss": 3.2969, "step": 602 }, { "epoch": 0.9814852492370295, "grad_norm": 7.352835338881992, "learning_rate": 8.522126434277108e-06, "loss": 3.3243, "step": 603 }, { "epoch": 0.9831129196337741, "grad_norm": 7.793179440914934, "learning_rate": 8.51540376388366e-06, "loss": 3.2648, "step": 604 }, { "epoch": 0.9847405900305188, "grad_norm": 6.725787371563211, "learning_rate": 8.508668502535753e-06, "loss": 3.3041, "step": 605 }, { "epoch": 0.9863682604272634, "grad_norm": 7.342083666840881, "learning_rate": 8.501920674356755e-06, "loss": 3.2593, "step": 606 }, { "epoch": 0.9879959308240082, "grad_norm": 7.23402846228289, "learning_rate": 8.49516030351504e-06, "loss": 3.4616, "step": 607 }, { "epoch": 0.9896236012207528, "grad_norm": 6.672947541594658, "learning_rate": 8.488387414223905e-06, "loss": 3.3967, "step": 608 }, { "epoch": 0.9912512716174975, "grad_norm": 7.6046868559874845, "learning_rate": 8.481602030741486e-06, "loss": 3.3771, "step": 609 }, { "epoch": 0.9928789420142421, "grad_norm": 7.696407501208929, "learning_rate": 8.474804177370671e-06, "loss": 3.26, "step": 610 }, { "epoch": 0.9945066124109868, "grad_norm": 7.521784774265105, "learning_rate": 8.467993878459005e-06, "loss": 3.3236, "step": 611 }, { "epoch": 0.9961342828077314, "grad_norm": 6.9661191575053145, "learning_rate": 8.461171158398612e-06, "loss": 3.3803, "step": 612 }, { "epoch": 0.9977619532044761, "grad_norm": 7.173553486327662, "learning_rate": 8.454336041626106e-06, "loss": 3.3492, "step": 613 }, { "epoch": 0.9993896236012207, "grad_norm": 6.714575704010093, "learning_rate": 8.447488552622498e-06, "loss": 3.2672, "step": 614 }, { "epoch": 1.0, "grad_norm": 6.714575704010093, "learning_rate": 8.440628715913114e-06, "loss": 3.3002, "step": 615 }, { "epoch": 1.0016276703967446, "grad_norm": 13.968612562187639, "learning_rate": 8.433756556067506e-06, "loss": 2.707, "step": 616 }, { "epoch": 1.0032553407934892, "grad_norm": 6.801610186498195, "learning_rate": 8.426872097699361e-06, "loss": 2.7477, "step": 617 }, { "epoch": 1.004883011190234, "grad_norm": 6.7309282371729795, "learning_rate": 8.419975365466415e-06, "loss": 2.7005, "step": 618 }, { "epoch": 1.0065106815869787, "grad_norm": 5.930595715273167, "learning_rate": 8.413066384070367e-06, "loss": 2.4391, "step": 619 }, { "epoch": 1.0081383519837233, "grad_norm": 6.082206177931838, "learning_rate": 8.406145178256788e-06, "loss": 2.597, "step": 620 }, { "epoch": 1.0097660223804679, "grad_norm": 6.4575245540346025, "learning_rate": 8.39921177281503e-06, "loss": 2.7455, "step": 621 }, { "epoch": 1.0113936927772127, "grad_norm": 6.257995650523895, "learning_rate": 8.392266192578143e-06, "loss": 2.7412, "step": 622 }, { "epoch": 1.0130213631739573, "grad_norm": 6.655006831761868, "learning_rate": 8.385308462422778e-06, "loss": 2.6515, "step": 623 }, { "epoch": 1.014649033570702, "grad_norm": 6.544761273486829, "learning_rate": 8.37833860726911e-06, "loss": 2.553, "step": 624 }, { "epoch": 1.0162767039674465, "grad_norm": 6.7268276065574835, "learning_rate": 8.371356652080737e-06, "loss": 2.6219, "step": 625 }, { "epoch": 1.0179043743641913, "grad_norm": 6.165625154768398, "learning_rate": 8.364362621864595e-06, "loss": 2.5771, "step": 626 }, { "epoch": 1.019532044760936, "grad_norm": 6.907837569193185, "learning_rate": 8.35735654167087e-06, "loss": 2.597, "step": 627 }, { "epoch": 1.0211597151576806, "grad_norm": 7.196233797543074, "learning_rate": 8.350338436592905e-06, "loss": 2.5535, "step": 628 }, { "epoch": 1.0227873855544252, "grad_norm": 7.825855803283873, "learning_rate": 8.343308331767115e-06, "loss": 2.6991, "step": 629 }, { "epoch": 1.02441505595117, "grad_norm": 6.9705457915123565, "learning_rate": 8.33626625237289e-06, "loss": 2.4291, "step": 630 }, { "epoch": 1.0260427263479146, "grad_norm": 7.471572891018775, "learning_rate": 8.329212223632511e-06, "loss": 2.4832, "step": 631 }, { "epoch": 1.0276703967446592, "grad_norm": 7.770695465182524, "learning_rate": 8.322146270811058e-06, "loss": 2.7108, "step": 632 }, { "epoch": 1.0292980671414038, "grad_norm": 7.46009003360848, "learning_rate": 8.315068419216321e-06, "loss": 2.5639, "step": 633 }, { "epoch": 1.0309257375381484, "grad_norm": 7.2296046038740425, "learning_rate": 8.3079786941987e-06, "loss": 2.4266, "step": 634 }, { "epoch": 1.0325534079348933, "grad_norm": 6.813724926258742, "learning_rate": 8.300877121151127e-06, "loss": 2.6098, "step": 635 }, { "epoch": 1.0341810783316379, "grad_norm": 7.336628197071965, "learning_rate": 8.29376372550897e-06, "loss": 2.4673, "step": 636 }, { "epoch": 1.0358087487283825, "grad_norm": 7.027986062265194, "learning_rate": 8.286638532749938e-06, "loss": 2.3455, "step": 637 }, { "epoch": 1.037436419125127, "grad_norm": 7.6281530171494705, "learning_rate": 8.279501568393996e-06, "loss": 2.7997, "step": 638 }, { "epoch": 1.039064089521872, "grad_norm": 7.036046167154228, "learning_rate": 8.272352858003268e-06, "loss": 2.4166, "step": 639 }, { "epoch": 1.0406917599186165, "grad_norm": 6.868401170838677, "learning_rate": 8.265192427181954e-06, "loss": 2.4826, "step": 640 }, { "epoch": 1.0423194303153611, "grad_norm": 7.559740249123819, "learning_rate": 8.258020301576224e-06, "loss": 2.5246, "step": 641 }, { "epoch": 1.0439471007121057, "grad_norm": 6.944861306129637, "learning_rate": 8.250836506874142e-06, "loss": 2.5636, "step": 642 }, { "epoch": 1.0455747711088506, "grad_norm": 7.6088912068649766, "learning_rate": 8.243641068805563e-06, "loss": 2.5436, "step": 643 }, { "epoch": 1.0472024415055952, "grad_norm": 6.993220773732502, "learning_rate": 8.236434013142046e-06, "loss": 2.5324, "step": 644 }, { "epoch": 1.0488301119023398, "grad_norm": 6.634836101109396, "learning_rate": 8.22921536569676e-06, "loss": 2.421, "step": 645 }, { "epoch": 1.0504577822990844, "grad_norm": 7.648487405561615, "learning_rate": 8.221985152324385e-06, "loss": 2.8394, "step": 646 }, { "epoch": 1.0520854526958292, "grad_norm": 7.597173958007929, "learning_rate": 8.214743398921041e-06, "loss": 2.6832, "step": 647 }, { "epoch": 1.0537131230925738, "grad_norm": 6.75713174404292, "learning_rate": 8.207490131424167e-06, "loss": 2.403, "step": 648 }, { "epoch": 1.0553407934893184, "grad_norm": 6.923653155181039, "learning_rate": 8.200225375812449e-06, "loss": 2.63, "step": 649 }, { "epoch": 1.056968463886063, "grad_norm": 7.432461146906554, "learning_rate": 8.192949158105713e-06, "loss": 2.5339, "step": 650 }, { "epoch": 1.0585961342828076, "grad_norm": 7.011854620053064, "learning_rate": 8.185661504364845e-06, "loss": 2.4127, "step": 651 }, { "epoch": 1.0602238046795525, "grad_norm": 6.164876981834595, "learning_rate": 8.178362440691685e-06, "loss": 2.4618, "step": 652 }, { "epoch": 1.061851475076297, "grad_norm": 7.594054865713797, "learning_rate": 8.171051993228945e-06, "loss": 2.5819, "step": 653 }, { "epoch": 1.0634791454730417, "grad_norm": 7.403433768232072, "learning_rate": 8.163730188160105e-06, "loss": 2.3477, "step": 654 }, { "epoch": 1.0651068158697863, "grad_norm": 6.942506121058486, "learning_rate": 8.156397051709327e-06, "loss": 2.401, "step": 655 }, { "epoch": 1.0667344862665311, "grad_norm": 7.172026464091698, "learning_rate": 8.149052610141357e-06, "loss": 2.4723, "step": 656 }, { "epoch": 1.0683621566632757, "grad_norm": 7.086749694112898, "learning_rate": 8.141696889761432e-06, "loss": 2.5542, "step": 657 }, { "epoch": 1.0699898270600203, "grad_norm": 7.499668915797014, "learning_rate": 8.134329916915185e-06, "loss": 2.4527, "step": 658 }, { "epoch": 1.071617497456765, "grad_norm": 6.83112528053073, "learning_rate": 8.126951717988551e-06, "loss": 2.5054, "step": 659 }, { "epoch": 1.0732451678535098, "grad_norm": 7.41580262780746, "learning_rate": 8.119562319407678e-06, "loss": 2.736, "step": 660 }, { "epoch": 1.0748728382502544, "grad_norm": 7.271614792324178, "learning_rate": 8.112161747638823e-06, "loss": 2.5916, "step": 661 }, { "epoch": 1.076500508646999, "grad_norm": 6.6766944828422075, "learning_rate": 8.104750029188258e-06, "loss": 2.4936, "step": 662 }, { "epoch": 1.0781281790437436, "grad_norm": 7.216535668631431, "learning_rate": 8.097327190602186e-06, "loss": 2.5166, "step": 663 }, { "epoch": 1.0797558494404882, "grad_norm": 6.629951610278823, "learning_rate": 8.089893258466633e-06, "loss": 2.4736, "step": 664 }, { "epoch": 1.081383519837233, "grad_norm": 6.554525069682699, "learning_rate": 8.08244825940736e-06, "loss": 2.5225, "step": 665 }, { "epoch": 1.0830111902339776, "grad_norm": 6.620445486692053, "learning_rate": 8.07499222008977e-06, "loss": 2.5592, "step": 666 }, { "epoch": 1.0846388606307222, "grad_norm": 6.665936812947577, "learning_rate": 8.067525167218797e-06, "loss": 2.4676, "step": 667 }, { "epoch": 1.0862665310274668, "grad_norm": 7.225340840854492, "learning_rate": 8.060047127538836e-06, "loss": 2.5794, "step": 668 }, { "epoch": 1.0878942014242117, "grad_norm": 6.970758063604883, "learning_rate": 8.052558127833623e-06, "loss": 2.5718, "step": 669 }, { "epoch": 1.0895218718209563, "grad_norm": 7.68949725890778, "learning_rate": 8.045058194926153e-06, "loss": 2.4976, "step": 670 }, { "epoch": 1.0911495422177009, "grad_norm": 7.1780786952678275, "learning_rate": 8.037547355678578e-06, "loss": 2.5813, "step": 671 }, { "epoch": 1.0927772126144455, "grad_norm": 7.199904059990945, "learning_rate": 8.030025636992113e-06, "loss": 2.407, "step": 672 }, { "epoch": 1.0944048830111903, "grad_norm": 7.392383901513262, "learning_rate": 8.022493065806944e-06, "loss": 2.5839, "step": 673 }, { "epoch": 1.096032553407935, "grad_norm": 7.877695135032782, "learning_rate": 8.014949669102117e-06, "loss": 2.5174, "step": 674 }, { "epoch": 1.0976602238046795, "grad_norm": 7.48914897342574, "learning_rate": 8.007395473895463e-06, "loss": 2.495, "step": 675 }, { "epoch": 1.0992878942014241, "grad_norm": 7.953907204901502, "learning_rate": 7.999830507243478e-06, "loss": 2.6745, "step": 676 }, { "epoch": 1.100915564598169, "grad_norm": 7.653782252428148, "learning_rate": 7.992254796241249e-06, "loss": 2.5984, "step": 677 }, { "epoch": 1.1025432349949136, "grad_norm": 7.3852292982551875, "learning_rate": 7.984668368022335e-06, "loss": 2.5901, "step": 678 }, { "epoch": 1.1041709053916582, "grad_norm": 6.949067379094522, "learning_rate": 7.977071249758689e-06, "loss": 2.5376, "step": 679 }, { "epoch": 1.1057985757884028, "grad_norm": 7.2429438510403905, "learning_rate": 7.969463468660546e-06, "loss": 2.5578, "step": 680 }, { "epoch": 1.1074262461851476, "grad_norm": 7.624764555048211, "learning_rate": 7.961845051976334e-06, "loss": 2.5391, "step": 681 }, { "epoch": 1.1090539165818922, "grad_norm": 7.236260406412389, "learning_rate": 7.954216026992571e-06, "loss": 2.5848, "step": 682 }, { "epoch": 1.1106815869786368, "grad_norm": 7.903640251994442, "learning_rate": 7.946576421033778e-06, "loss": 2.5331, "step": 683 }, { "epoch": 1.1123092573753814, "grad_norm": 7.757537205530013, "learning_rate": 7.938926261462366e-06, "loss": 2.6797, "step": 684 }, { "epoch": 1.113936927772126, "grad_norm": 6.858086452942727, "learning_rate": 7.931265575678549e-06, "loss": 2.34, "step": 685 }, { "epoch": 1.1155645981688709, "grad_norm": 6.919974425451723, "learning_rate": 7.923594391120237e-06, "loss": 2.4913, "step": 686 }, { "epoch": 1.1171922685656155, "grad_norm": 7.217811913766826, "learning_rate": 7.91591273526295e-06, "loss": 2.4911, "step": 687 }, { "epoch": 1.11881993896236, "grad_norm": 6.937514789196187, "learning_rate": 7.908220635619708e-06, "loss": 2.5978, "step": 688 }, { "epoch": 1.1204476093591047, "grad_norm": 7.10788694779231, "learning_rate": 7.90051811974094e-06, "loss": 2.5662, "step": 689 }, { "epoch": 1.1220752797558495, "grad_norm": 7.49282736459384, "learning_rate": 7.89280521521438e-06, "loss": 2.3106, "step": 690 }, { "epoch": 1.1237029501525941, "grad_norm": 6.904410061250997, "learning_rate": 7.885081949664971e-06, "loss": 2.3549, "step": 691 }, { "epoch": 1.1253306205493387, "grad_norm": 6.829183926439996, "learning_rate": 7.877348350754766e-06, "loss": 2.6757, "step": 692 }, { "epoch": 1.1269582909460834, "grad_norm": 7.377425894307193, "learning_rate": 7.86960444618283e-06, "loss": 2.6601, "step": 693 }, { "epoch": 1.1285859613428282, "grad_norm": 8.9425488996857, "learning_rate": 7.861850263685134e-06, "loss": 2.494, "step": 694 }, { "epoch": 1.1302136317395728, "grad_norm": 7.409604043565789, "learning_rate": 7.854085831034467e-06, "loss": 2.6937, "step": 695 }, { "epoch": 1.1318413021363174, "grad_norm": 7.957040552668971, "learning_rate": 7.846311176040331e-06, "loss": 2.6773, "step": 696 }, { "epoch": 1.133468972533062, "grad_norm": 7.059843380210266, "learning_rate": 7.838526326548832e-06, "loss": 2.4561, "step": 697 }, { "epoch": 1.1350966429298066, "grad_norm": 6.9176902913039235, "learning_rate": 7.830731310442599e-06, "loss": 2.4567, "step": 698 }, { "epoch": 1.1367243133265514, "grad_norm": 7.480516255238179, "learning_rate": 7.822926155640671e-06, "loss": 2.5617, "step": 699 }, { "epoch": 1.138351983723296, "grad_norm": 7.123789710345735, "learning_rate": 7.815110890098396e-06, "loss": 2.5323, "step": 700 }, { "epoch": 1.1399796541200407, "grad_norm": 6.643997999143781, "learning_rate": 7.807285541807342e-06, "loss": 2.5638, "step": 701 }, { "epoch": 1.1416073245167853, "grad_norm": 7.546254344441774, "learning_rate": 7.799450138795187e-06, "loss": 2.6203, "step": 702 }, { "epoch": 1.14323499491353, "grad_norm": 7.124007542325887, "learning_rate": 7.791604709125617e-06, "loss": 2.5043, "step": 703 }, { "epoch": 1.1448626653102747, "grad_norm": 7.451041007233805, "learning_rate": 7.78374928089824e-06, "loss": 2.6204, "step": 704 }, { "epoch": 1.1464903357070193, "grad_norm": 7.287240819354314, "learning_rate": 7.775883882248467e-06, "loss": 2.456, "step": 705 }, { "epoch": 1.148118006103764, "grad_norm": 7.440747614006502, "learning_rate": 7.768008541347423e-06, "loss": 2.5019, "step": 706 }, { "epoch": 1.1497456765005087, "grad_norm": 7.445406162146577, "learning_rate": 7.760123286401841e-06, "loss": 2.6455, "step": 707 }, { "epoch": 1.1513733468972533, "grad_norm": 7.24149897399673, "learning_rate": 7.752228145653964e-06, "loss": 2.5871, "step": 708 }, { "epoch": 1.153001017293998, "grad_norm": 6.805641510921647, "learning_rate": 7.74432314738144e-06, "loss": 2.4701, "step": 709 }, { "epoch": 1.1546286876907426, "grad_norm": 7.248743715619819, "learning_rate": 7.73640831989723e-06, "loss": 2.5032, "step": 710 }, { "epoch": 1.1562563580874872, "grad_norm": 6.763647047480556, "learning_rate": 7.728483691549491e-06, "loss": 2.5221, "step": 711 }, { "epoch": 1.157884028484232, "grad_norm": 7.083108119430588, "learning_rate": 7.720549290721492e-06, "loss": 2.5299, "step": 712 }, { "epoch": 1.1595116988809766, "grad_norm": 7.259351799929205, "learning_rate": 7.712605145831492e-06, "loss": 2.3997, "step": 713 }, { "epoch": 1.1611393692777212, "grad_norm": 7.299386237678383, "learning_rate": 7.704651285332662e-06, "loss": 2.5511, "step": 714 }, { "epoch": 1.162767039674466, "grad_norm": 7.573316231651308, "learning_rate": 7.696687737712964e-06, "loss": 2.5536, "step": 715 }, { "epoch": 1.1643947100712106, "grad_norm": 7.498762250925887, "learning_rate": 7.688714531495061e-06, "loss": 2.4826, "step": 716 }, { "epoch": 1.1660223804679553, "grad_norm": 7.555091217217231, "learning_rate": 7.680731695236204e-06, "loss": 2.498, "step": 717 }, { "epoch": 1.1676500508646999, "grad_norm": 6.815321713965633, "learning_rate": 7.672739257528135e-06, "loss": 2.3851, "step": 718 }, { "epoch": 1.1692777212614445, "grad_norm": 7.184503064825123, "learning_rate": 7.664737246996992e-06, "loss": 2.4826, "step": 719 }, { "epoch": 1.1709053916581893, "grad_norm": 7.753460147007892, "learning_rate": 7.656725692303195e-06, "loss": 2.6094, "step": 720 }, { "epoch": 1.172533062054934, "grad_norm": 7.45644115157806, "learning_rate": 7.648704622141347e-06, "loss": 2.4967, "step": 721 }, { "epoch": 1.1741607324516785, "grad_norm": 7.536640420641747, "learning_rate": 7.640674065240136e-06, "loss": 2.4012, "step": 722 }, { "epoch": 1.1757884028484231, "grad_norm": 7.123001658904488, "learning_rate": 7.632634050362223e-06, "loss": 2.5653, "step": 723 }, { "epoch": 1.177416073245168, "grad_norm": 7.533624711314105, "learning_rate": 7.624584606304148e-06, "loss": 2.5477, "step": 724 }, { "epoch": 1.1790437436419126, "grad_norm": 7.266533458729527, "learning_rate": 7.616525761896221e-06, "loss": 2.7305, "step": 725 }, { "epoch": 1.1806714140386572, "grad_norm": 7.811005923231022, "learning_rate": 7.608457546002423e-06, "loss": 2.5369, "step": 726 }, { "epoch": 1.1822990844354018, "grad_norm": 7.614872130523433, "learning_rate": 7.600379987520299e-06, "loss": 2.5798, "step": 727 }, { "epoch": 1.1839267548321466, "grad_norm": 6.482128841046095, "learning_rate": 7.5922931153808555e-06, "loss": 2.6534, "step": 728 }, { "epoch": 1.1855544252288912, "grad_norm": 7.337166959652978, "learning_rate": 7.584196958548458e-06, "loss": 2.4104, "step": 729 }, { "epoch": 1.1871820956256358, "grad_norm": 7.137456776447317, "learning_rate": 7.576091546020725e-06, "loss": 2.5885, "step": 730 }, { "epoch": 1.1888097660223804, "grad_norm": 7.940132483237246, "learning_rate": 7.567976906828431e-06, "loss": 2.7226, "step": 731 }, { "epoch": 1.190437436419125, "grad_norm": 6.765388059571409, "learning_rate": 7.559853070035389e-06, "loss": 2.4598, "step": 732 }, { "epoch": 1.1920651068158699, "grad_norm": 7.647641218391358, "learning_rate": 7.551720064738361e-06, "loss": 2.5532, "step": 733 }, { "epoch": 1.1936927772126145, "grad_norm": 7.173621505452903, "learning_rate": 7.543577920066945e-06, "loss": 2.6766, "step": 734 }, { "epoch": 1.195320447609359, "grad_norm": 6.790293702052522, "learning_rate": 7.535426665183472e-06, "loss": 2.4609, "step": 735 }, { "epoch": 1.1969481180061037, "grad_norm": 6.704530589131449, "learning_rate": 7.527266329282905e-06, "loss": 2.5961, "step": 736 }, { "epoch": 1.1985757884028485, "grad_norm": 6.688920637944841, "learning_rate": 7.519096941592732e-06, "loss": 2.4038, "step": 737 }, { "epoch": 1.2002034587995931, "grad_norm": 7.057680637100975, "learning_rate": 7.510918531372857e-06, "loss": 2.6145, "step": 738 }, { "epoch": 1.2018311291963377, "grad_norm": 6.677200832939882, "learning_rate": 7.502731127915507e-06, "loss": 2.3983, "step": 739 }, { "epoch": 1.2034587995930823, "grad_norm": 7.154999524096353, "learning_rate": 7.494534760545114e-06, "loss": 2.7236, "step": 740 }, { "epoch": 1.2050864699898272, "grad_norm": 7.4890440215259995, "learning_rate": 7.486329458618215e-06, "loss": 2.6054, "step": 741 }, { "epoch": 1.2067141403865718, "grad_norm": 7.547950025457361, "learning_rate": 7.478115251523351e-06, "loss": 2.4583, "step": 742 }, { "epoch": 1.2083418107833164, "grad_norm": 7.833110753609217, "learning_rate": 7.46989216868096e-06, "loss": 2.5143, "step": 743 }, { "epoch": 1.209969481180061, "grad_norm": 7.294092519319992, "learning_rate": 7.461660239543261e-06, "loss": 2.4117, "step": 744 }, { "epoch": 1.2115971515768056, "grad_norm": 7.2867474523535805, "learning_rate": 7.453419493594168e-06, "loss": 2.6245, "step": 745 }, { "epoch": 1.2132248219735504, "grad_norm": 6.862829485489356, "learning_rate": 7.445169960349167e-06, "loss": 2.7448, "step": 746 }, { "epoch": 1.214852492370295, "grad_norm": 7.023004712591372, "learning_rate": 7.436911669355218e-06, "loss": 2.5551, "step": 747 }, { "epoch": 1.2164801627670396, "grad_norm": 7.615337200984255, "learning_rate": 7.4286446501906505e-06, "loss": 2.6445, "step": 748 }, { "epoch": 1.2181078331637842, "grad_norm": 8.3792452116444, "learning_rate": 7.420368932465053e-06, "loss": 2.607, "step": 749 }, { "epoch": 1.219735503560529, "grad_norm": 6.878587177093769, "learning_rate": 7.412084545819169e-06, "loss": 2.3723, "step": 750 }, { "epoch": 1.2213631739572737, "grad_norm": 7.144988182416455, "learning_rate": 7.403791519924794e-06, "loss": 2.5215, "step": 751 }, { "epoch": 1.2229908443540183, "grad_norm": 7.220178146911765, "learning_rate": 7.3954898844846645e-06, "loss": 2.6902, "step": 752 }, { "epoch": 1.2246185147507629, "grad_norm": 6.70414227788658, "learning_rate": 7.387179669232351e-06, "loss": 2.4662, "step": 753 }, { "epoch": 1.2262461851475077, "grad_norm": 7.333803544320044, "learning_rate": 7.378860903932159e-06, "loss": 2.7693, "step": 754 }, { "epoch": 1.2278738555442523, "grad_norm": 7.033710775276569, "learning_rate": 7.370533618379013e-06, "loss": 2.4084, "step": 755 }, { "epoch": 1.229501525940997, "grad_norm": 7.2251696601473165, "learning_rate": 7.362197842398355e-06, "loss": 2.437, "step": 756 }, { "epoch": 1.2311291963377415, "grad_norm": 7.19416063737049, "learning_rate": 7.3538536058460395e-06, "loss": 2.535, "step": 757 }, { "epoch": 1.2327568667344861, "grad_norm": 7.115267536518697, "learning_rate": 7.34550093860822e-06, "loss": 2.4636, "step": 758 }, { "epoch": 1.234384537131231, "grad_norm": 6.769385066249573, "learning_rate": 7.337139870601247e-06, "loss": 2.5318, "step": 759 }, { "epoch": 1.2360122075279756, "grad_norm": 7.792365076673628, "learning_rate": 7.3287704317715615e-06, "loss": 2.6191, "step": 760 }, { "epoch": 1.2376398779247202, "grad_norm": 7.51269255659091, "learning_rate": 7.3203926520955846e-06, "loss": 2.8248, "step": 761 }, { "epoch": 1.239267548321465, "grad_norm": 7.534896047155522, "learning_rate": 7.31200656157961e-06, "loss": 2.5861, "step": 762 }, { "epoch": 1.2408952187182096, "grad_norm": 7.240396496645509, "learning_rate": 7.303612190259699e-06, "loss": 2.5651, "step": 763 }, { "epoch": 1.2425228891149542, "grad_norm": 7.178228736470261, "learning_rate": 7.295209568201574e-06, "loss": 2.7323, "step": 764 }, { "epoch": 1.2441505595116988, "grad_norm": 7.032158821337262, "learning_rate": 7.286798725500506e-06, "loss": 2.5941, "step": 765 }, { "epoch": 1.2457782299084434, "grad_norm": 7.434150343823504, "learning_rate": 7.278379692281209e-06, "loss": 2.5669, "step": 766 }, { "epoch": 1.2474059003051883, "grad_norm": 7.5656937949096745, "learning_rate": 7.269952498697734e-06, "loss": 2.6277, "step": 767 }, { "epoch": 1.2490335707019329, "grad_norm": 7.572514249257789, "learning_rate": 7.261517174933362e-06, "loss": 2.5932, "step": 768 }, { "epoch": 1.2506612410986775, "grad_norm": 6.827562707785206, "learning_rate": 7.253073751200489e-06, "loss": 2.63, "step": 769 }, { "epoch": 1.252288911495422, "grad_norm": 6.888627608137843, "learning_rate": 7.2446222577405234e-06, "loss": 2.5692, "step": 770 }, { "epoch": 1.2539165818921667, "grad_norm": 6.965505899272686, "learning_rate": 7.2361627248237795e-06, "loss": 2.4969, "step": 771 }, { "epoch": 1.2555442522889115, "grad_norm": 6.931436656774723, "learning_rate": 7.2276951827493615e-06, "loss": 2.4406, "step": 772 }, { "epoch": 1.2571719226856561, "grad_norm": 7.249135772869975, "learning_rate": 7.219219661845065e-06, "loss": 2.6073, "step": 773 }, { "epoch": 1.2587995930824007, "grad_norm": 7.924077031430345, "learning_rate": 7.210736192467256e-06, "loss": 2.5806, "step": 774 }, { "epoch": 1.2604272634791456, "grad_norm": 8.03703364114828, "learning_rate": 7.202244805000778e-06, "loss": 2.5707, "step": 775 }, { "epoch": 1.2620549338758902, "grad_norm": 6.963654485128617, "learning_rate": 7.193745529858827e-06, "loss": 2.5134, "step": 776 }, { "epoch": 1.2636826042726348, "grad_norm": 7.515454641802577, "learning_rate": 7.1852383974828525e-06, "loss": 2.6336, "step": 777 }, { "epoch": 1.2653102746693794, "grad_norm": 7.039394529499536, "learning_rate": 7.176723438342445e-06, "loss": 2.4741, "step": 778 }, { "epoch": 1.266937945066124, "grad_norm": 7.058048958588894, "learning_rate": 7.168200682935233e-06, "loss": 2.4473, "step": 779 }, { "epoch": 1.2685656154628688, "grad_norm": 7.075445591468153, "learning_rate": 7.159670161786759e-06, "loss": 2.7061, "step": 780 }, { "epoch": 1.2701932858596134, "grad_norm": 6.83749371446083, "learning_rate": 7.151131905450386e-06, "loss": 2.4073, "step": 781 }, { "epoch": 1.271820956256358, "grad_norm": 6.9388386491789715, "learning_rate": 7.1425859445071845e-06, "loss": 2.4515, "step": 782 }, { "epoch": 1.2734486266531029, "grad_norm": 7.020657691876368, "learning_rate": 7.134032309565811e-06, "loss": 2.4398, "step": 783 }, { "epoch": 1.2750762970498475, "grad_norm": 7.308520683430806, "learning_rate": 7.125471031262416e-06, "loss": 2.3408, "step": 784 }, { "epoch": 1.276703967446592, "grad_norm": 7.073833738810363, "learning_rate": 7.116902140260525e-06, "loss": 2.4523, "step": 785 }, { "epoch": 1.2783316378433367, "grad_norm": 7.41013374674775, "learning_rate": 7.10832566725092e-06, "loss": 2.5121, "step": 786 }, { "epoch": 1.2799593082400813, "grad_norm": 7.386044498520568, "learning_rate": 7.099741642951554e-06, "loss": 2.5194, "step": 787 }, { "epoch": 1.2815869786368261, "grad_norm": 7.208755367945585, "learning_rate": 7.091150098107415e-06, "loss": 2.3952, "step": 788 }, { "epoch": 1.2832146490335707, "grad_norm": 7.355181722146758, "learning_rate": 7.08255106349043e-06, "loss": 2.6309, "step": 789 }, { "epoch": 1.2848423194303153, "grad_norm": 7.729833268895966, "learning_rate": 7.0739445698993545e-06, "loss": 2.5422, "step": 790 }, { "epoch": 1.28646998982706, "grad_norm": 7.731401752034183, "learning_rate": 7.0653306481596565e-06, "loss": 2.5886, "step": 791 }, { "epoch": 1.2880976602238046, "grad_norm": 7.041333386894302, "learning_rate": 7.056709329123408e-06, "loss": 2.4104, "step": 792 }, { "epoch": 1.2897253306205494, "grad_norm": 7.580200682270676, "learning_rate": 7.048080643669178e-06, "loss": 2.6251, "step": 793 }, { "epoch": 1.291353001017294, "grad_norm": 7.202471011687516, "learning_rate": 7.039444622701922e-06, "loss": 2.4831, "step": 794 }, { "epoch": 1.2929806714140386, "grad_norm": 6.944793147074507, "learning_rate": 7.03080129715286e-06, "loss": 2.4836, "step": 795 }, { "epoch": 1.2946083418107834, "grad_norm": 6.915632301250002, "learning_rate": 7.022150697979385e-06, "loss": 2.6293, "step": 796 }, { "epoch": 1.296236012207528, "grad_norm": 7.779935371935402, "learning_rate": 7.013492856164935e-06, "loss": 2.6048, "step": 797 }, { "epoch": 1.2978636826042727, "grad_norm": 6.966615404926816, "learning_rate": 7.00482780271889e-06, "loss": 2.7309, "step": 798 }, { "epoch": 1.2994913530010173, "grad_norm": 7.903104781764957, "learning_rate": 6.996155568676459e-06, "loss": 2.797, "step": 799 }, { "epoch": 1.3011190233977619, "grad_norm": 7.276786974559671, "learning_rate": 6.987476185098571e-06, "loss": 2.6353, "step": 800 }, { "epoch": 1.3027466937945067, "grad_norm": 7.950633359217921, "learning_rate": 6.978789683071761e-06, "loss": 2.6225, "step": 801 }, { "epoch": 1.3043743641912513, "grad_norm": 6.884606391110065, "learning_rate": 6.97009609370806e-06, "loss": 2.4905, "step": 802 }, { "epoch": 1.306002034587996, "grad_norm": 7.328356189633276, "learning_rate": 6.961395448144885e-06, "loss": 2.4979, "step": 803 }, { "epoch": 1.3076297049847405, "grad_norm": 7.776749536912063, "learning_rate": 6.952687777544922e-06, "loss": 2.319, "step": 804 }, { "epoch": 1.3092573753814851, "grad_norm": 6.6974773049265055, "learning_rate": 6.943973113096023e-06, "loss": 2.5547, "step": 805 }, { "epoch": 1.31088504577823, "grad_norm": 6.945238151329532, "learning_rate": 6.9352514860110876e-06, "loss": 2.4324, "step": 806 }, { "epoch": 1.3125127161749746, "grad_norm": 6.9479650516648155, "learning_rate": 6.9265229275279475e-06, "loss": 2.4567, "step": 807 }, { "epoch": 1.3141403865717192, "grad_norm": 7.19133234659444, "learning_rate": 6.917787468909272e-06, "loss": 2.5935, "step": 808 }, { "epoch": 1.315768056968464, "grad_norm": 7.9858108191995365, "learning_rate": 6.9090451414424345e-06, "loss": 2.4609, "step": 809 }, { "epoch": 1.3173957273652086, "grad_norm": 7.199264609210759, "learning_rate": 6.900295976439413e-06, "loss": 2.5033, "step": 810 }, { "epoch": 1.3190233977619532, "grad_norm": 7.036098839669952, "learning_rate": 6.8915400052366756e-06, "loss": 2.5234, "step": 811 }, { "epoch": 1.3206510681586978, "grad_norm": 7.85596353691799, "learning_rate": 6.882777259195072e-06, "loss": 2.7254, "step": 812 }, { "epoch": 1.3222787385554424, "grad_norm": 7.546420813239242, "learning_rate": 6.874007769699708e-06, "loss": 2.6387, "step": 813 }, { "epoch": 1.3239064089521873, "grad_norm": 7.762511933706328, "learning_rate": 6.865231568159847e-06, "loss": 2.6211, "step": 814 }, { "epoch": 1.3255340793489319, "grad_norm": 7.786465300842358, "learning_rate": 6.856448686008797e-06, "loss": 2.5119, "step": 815 }, { "epoch": 1.3271617497456765, "grad_norm": 7.889771924871699, "learning_rate": 6.847659154703785e-06, "loss": 2.5961, "step": 816 }, { "epoch": 1.3287894201424213, "grad_norm": 7.17114726043021, "learning_rate": 6.83886300572586e-06, "loss": 2.5214, "step": 817 }, { "epoch": 1.3304170905391657, "grad_norm": 7.854857862864335, "learning_rate": 6.830060270579768e-06, "loss": 2.5648, "step": 818 }, { "epoch": 1.3320447609359105, "grad_norm": 7.088068919610799, "learning_rate": 6.82125098079385e-06, "loss": 2.5854, "step": 819 }, { "epoch": 1.3336724313326551, "grad_norm": 7.155836461488828, "learning_rate": 6.812435167919917e-06, "loss": 2.5852, "step": 820 }, { "epoch": 1.3353001017293997, "grad_norm": 8.0387008137957, "learning_rate": 6.803612863533149e-06, "loss": 2.6502, "step": 821 }, { "epoch": 1.3369277721261446, "grad_norm": 7.562239433971548, "learning_rate": 6.7947840992319724e-06, "loss": 2.523, "step": 822 }, { "epoch": 1.3385554425228892, "grad_norm": 7.888946695621224, "learning_rate": 6.785948906637951e-06, "loss": 2.5923, "step": 823 }, { "epoch": 1.3401831129196338, "grad_norm": 6.721486308055436, "learning_rate": 6.777107317395679e-06, "loss": 2.4354, "step": 824 }, { "epoch": 1.3418107833163784, "grad_norm": 6.49005775144583, "learning_rate": 6.768259363172651e-06, "loss": 2.4603, "step": 825 }, { "epoch": 1.343438453713123, "grad_norm": 6.972087957225619, "learning_rate": 6.759405075659165e-06, "loss": 2.4758, "step": 826 }, { "epoch": 1.3450661241098678, "grad_norm": 7.197026644383624, "learning_rate": 6.750544486568205e-06, "loss": 2.531, "step": 827 }, { "epoch": 1.3466937945066124, "grad_norm": 7.240752185927417, "learning_rate": 6.741677627635317e-06, "loss": 2.3636, "step": 828 }, { "epoch": 1.348321464903357, "grad_norm": 6.7557035976940165, "learning_rate": 6.73280453061851e-06, "loss": 2.3336, "step": 829 }, { "epoch": 1.3499491353001019, "grad_norm": 7.02988590877508, "learning_rate": 6.723925227298133e-06, "loss": 2.6618, "step": 830 }, { "epoch": 1.3515768056968465, "grad_norm": 7.932025741430496, "learning_rate": 6.715039749476764e-06, "loss": 2.615, "step": 831 }, { "epoch": 1.353204476093591, "grad_norm": 7.1417608148555995, "learning_rate": 6.706148128979096e-06, "loss": 2.5734, "step": 832 }, { "epoch": 1.3548321464903357, "grad_norm": 7.6358132424046135, "learning_rate": 6.697250397651826e-06, "loss": 2.649, "step": 833 }, { "epoch": 1.3564598168870803, "grad_norm": 6.873827949594999, "learning_rate": 6.688346587363533e-06, "loss": 2.5492, "step": 834 }, { "epoch": 1.3580874872838251, "grad_norm": 7.211800809469366, "learning_rate": 6.679436730004569e-06, "loss": 2.4029, "step": 835 }, { "epoch": 1.3597151576805697, "grad_norm": 6.889401083941532, "learning_rate": 6.6705208574869504e-06, "loss": 2.3809, "step": 836 }, { "epoch": 1.3613428280773143, "grad_norm": 6.4071400030940096, "learning_rate": 6.66159900174423e-06, "loss": 2.4793, "step": 837 }, { "epoch": 1.362970498474059, "grad_norm": 7.500623616611551, "learning_rate": 6.652671194731396e-06, "loss": 2.5111, "step": 838 }, { "epoch": 1.3645981688708035, "grad_norm": 6.812565122124493, "learning_rate": 6.64373746842475e-06, "loss": 2.2929, "step": 839 }, { "epoch": 1.3662258392675484, "grad_norm": 6.851264822613306, "learning_rate": 6.634797854821795e-06, "loss": 2.4211, "step": 840 }, { "epoch": 1.367853509664293, "grad_norm": 6.784251469674895, "learning_rate": 6.62585238594112e-06, "loss": 2.4488, "step": 841 }, { "epoch": 1.3694811800610376, "grad_norm": 7.066070306169518, "learning_rate": 6.616901093822283e-06, "loss": 2.3863, "step": 842 }, { "epoch": 1.3711088504577824, "grad_norm": 6.971676235210089, "learning_rate": 6.607944010525704e-06, "loss": 2.6289, "step": 843 }, { "epoch": 1.372736520854527, "grad_norm": 7.734955046537858, "learning_rate": 6.59898116813254e-06, "loss": 2.6626, "step": 844 }, { "epoch": 1.3743641912512716, "grad_norm": 7.375411197293905, "learning_rate": 6.590012598744581e-06, "loss": 2.4483, "step": 845 }, { "epoch": 1.3759918616480162, "grad_norm": 7.208775142336999, "learning_rate": 6.58103833448412e-06, "loss": 2.456, "step": 846 }, { "epoch": 1.3776195320447608, "grad_norm": 7.655508604127466, "learning_rate": 6.572058407493857e-06, "loss": 2.6661, "step": 847 }, { "epoch": 1.3792472024415057, "grad_norm": 7.742145047448687, "learning_rate": 6.563072849936767e-06, "loss": 2.804, "step": 848 }, { "epoch": 1.3808748728382503, "grad_norm": 8.142099103862094, "learning_rate": 6.5540816939959964e-06, "loss": 2.6889, "step": 849 }, { "epoch": 1.3825025432349949, "grad_norm": 7.7329792908866395, "learning_rate": 6.545084971874738e-06, "loss": 2.6719, "step": 850 }, { "epoch": 1.3841302136317395, "grad_norm": 8.092412116739508, "learning_rate": 6.536082715796125e-06, "loss": 2.479, "step": 851 }, { "epoch": 1.385757884028484, "grad_norm": 7.1478010236292615, "learning_rate": 6.52707495800311e-06, "loss": 2.5873, "step": 852 }, { "epoch": 1.387385554425229, "grad_norm": 7.559638717561875, "learning_rate": 6.518061730758348e-06, "loss": 2.4231, "step": 853 }, { "epoch": 1.3890132248219735, "grad_norm": 6.692373122683836, "learning_rate": 6.509043066344092e-06, "loss": 2.4456, "step": 854 }, { "epoch": 1.3906408952187181, "grad_norm": 7.256645862779619, "learning_rate": 6.500018997062058e-06, "loss": 2.7827, "step": 855 }, { "epoch": 1.392268565615463, "grad_norm": 7.490946372877344, "learning_rate": 6.490989555233328e-06, "loss": 2.6086, "step": 856 }, { "epoch": 1.3938962360122076, "grad_norm": 7.446703498207399, "learning_rate": 6.4819547731982244e-06, "loss": 2.5918, "step": 857 }, { "epoch": 1.3955239064089522, "grad_norm": 7.154886939998566, "learning_rate": 6.472914683316195e-06, "loss": 2.5787, "step": 858 }, { "epoch": 1.3971515768056968, "grad_norm": 7.056290677397076, "learning_rate": 6.463869317965701e-06, "loss": 2.4598, "step": 859 }, { "epoch": 1.3987792472024414, "grad_norm": 6.761039396062092, "learning_rate": 6.454818709544097e-06, "loss": 2.6009, "step": 860 }, { "epoch": 1.4004069175991862, "grad_norm": 6.853838858216, "learning_rate": 6.445762890467517e-06, "loss": 2.5463, "step": 861 }, { "epoch": 1.4020345879959308, "grad_norm": 6.564036278244871, "learning_rate": 6.436701893170757e-06, "loss": 2.3901, "step": 862 }, { "epoch": 1.4036622583926754, "grad_norm": 6.990214337230211, "learning_rate": 6.427635750107159e-06, "loss": 2.4874, "step": 863 }, { "epoch": 1.4052899287894203, "grad_norm": 7.115262383175639, "learning_rate": 6.418564493748501e-06, "loss": 2.4767, "step": 864 }, { "epoch": 1.4069175991861649, "grad_norm": 7.1356201786729105, "learning_rate": 6.409488156584862e-06, "loss": 2.4997, "step": 865 }, { "epoch": 1.4085452695829095, "grad_norm": 7.136308528795081, "learning_rate": 6.4004067711245366e-06, "loss": 2.4283, "step": 866 }, { "epoch": 1.410172939979654, "grad_norm": 7.0771996140737325, "learning_rate": 6.391320369893883e-06, "loss": 2.6559, "step": 867 }, { "epoch": 1.4118006103763987, "grad_norm": 7.166569658255905, "learning_rate": 6.382228985437238e-06, "loss": 2.5376, "step": 868 }, { "epoch": 1.4134282807731435, "grad_norm": 6.685389360399984, "learning_rate": 6.373132650316778e-06, "loss": 2.5669, "step": 869 }, { "epoch": 1.4150559511698881, "grad_norm": 7.343161640171893, "learning_rate": 6.3640313971124155e-06, "loss": 2.4888, "step": 870 }, { "epoch": 1.4166836215666327, "grad_norm": 6.850746464055362, "learning_rate": 6.354925258421676e-06, "loss": 2.3809, "step": 871 }, { "epoch": 1.4183112919633774, "grad_norm": 7.316343097505193, "learning_rate": 6.345814266859581e-06, "loss": 2.4487, "step": 872 }, { "epoch": 1.419938962360122, "grad_norm": 7.448487397910067, "learning_rate": 6.336698455058538e-06, "loss": 2.5714, "step": 873 }, { "epoch": 1.4215666327568668, "grad_norm": 6.642816886593007, "learning_rate": 6.327577855668216e-06, "loss": 2.4084, "step": 874 }, { "epoch": 1.4231943031536114, "grad_norm": 7.894363468284929, "learning_rate": 6.318452501355433e-06, "loss": 2.4848, "step": 875 }, { "epoch": 1.424821973550356, "grad_norm": 7.57203921636277, "learning_rate": 6.309322424804034e-06, "loss": 2.6822, "step": 876 }, { "epoch": 1.4264496439471008, "grad_norm": 7.317296110993347, "learning_rate": 6.3001876587147825e-06, "loss": 2.5862, "step": 877 }, { "epoch": 1.4280773143438454, "grad_norm": 7.908830325546391, "learning_rate": 6.291048235805234e-06, "loss": 2.4093, "step": 878 }, { "epoch": 1.42970498474059, "grad_norm": 7.41966036239266, "learning_rate": 6.281904188809627e-06, "loss": 2.5733, "step": 879 }, { "epoch": 1.4313326551373347, "grad_norm": 7.027346818622856, "learning_rate": 6.272755550478756e-06, "loss": 2.5675, "step": 880 }, { "epoch": 1.4329603255340793, "grad_norm": 6.990443521155838, "learning_rate": 6.263602353579868e-06, "loss": 2.4456, "step": 881 }, { "epoch": 1.434587995930824, "grad_norm": 7.317625210378577, "learning_rate": 6.254444630896529e-06, "loss": 2.6863, "step": 882 }, { "epoch": 1.4362156663275687, "grad_norm": 7.020074360138977, "learning_rate": 6.245282415228521e-06, "loss": 2.3219, "step": 883 }, { "epoch": 1.4378433367243133, "grad_norm": 7.7106482924150574, "learning_rate": 6.236115739391717e-06, "loss": 2.5555, "step": 884 }, { "epoch": 1.439471007121058, "grad_norm": 7.143830648355234, "learning_rate": 6.226944636217962e-06, "loss": 2.5899, "step": 885 }, { "epoch": 1.4410986775178025, "grad_norm": 7.645725849560782, "learning_rate": 6.2177691385549595e-06, "loss": 2.5641, "step": 886 }, { "epoch": 1.4427263479145473, "grad_norm": 7.196833977170635, "learning_rate": 6.208589279266157e-06, "loss": 2.5344, "step": 887 }, { "epoch": 1.444354018311292, "grad_norm": 6.927750066597852, "learning_rate": 6.199405091230615e-06, "loss": 2.596, "step": 888 }, { "epoch": 1.4459816887080366, "grad_norm": 7.755193204655843, "learning_rate": 6.190216607342906e-06, "loss": 2.531, "step": 889 }, { "epoch": 1.4476093591047814, "grad_norm": 7.589899510592822, "learning_rate": 6.181023860512985e-06, "loss": 2.296, "step": 890 }, { "epoch": 1.449237029501526, "grad_norm": 7.65371380032414, "learning_rate": 6.171826883666075e-06, "loss": 2.7145, "step": 891 }, { "epoch": 1.4508646998982706, "grad_norm": 8.16096328036141, "learning_rate": 6.1626257097425515e-06, "loss": 2.5293, "step": 892 }, { "epoch": 1.4524923702950152, "grad_norm": 7.860021196702176, "learning_rate": 6.153420371697821e-06, "loss": 2.4924, "step": 893 }, { "epoch": 1.4541200406917598, "grad_norm": 6.730141270604894, "learning_rate": 6.144210902502207e-06, "loss": 2.6584, "step": 894 }, { "epoch": 1.4557477110885046, "grad_norm": 7.646438295057861, "learning_rate": 6.134997335140824e-06, "loss": 2.6841, "step": 895 }, { "epoch": 1.4573753814852493, "grad_norm": 7.3475095477042975, "learning_rate": 6.125779702613471e-06, "loss": 2.3801, "step": 896 }, { "epoch": 1.4590030518819939, "grad_norm": 7.3757408138070355, "learning_rate": 6.116558037934501e-06, "loss": 2.6001, "step": 897 }, { "epoch": 1.4606307222787387, "grad_norm": 7.949395496616596, "learning_rate": 6.107332374132715e-06, "loss": 2.5929, "step": 898 }, { "epoch": 1.462258392675483, "grad_norm": 6.898179478882045, "learning_rate": 6.0981027442512316e-06, "loss": 2.4309, "step": 899 }, { "epoch": 1.463886063072228, "grad_norm": 6.499658382086391, "learning_rate": 6.0888691813473785e-06, "loss": 2.4852, "step": 900 }, { "epoch": 1.4655137334689725, "grad_norm": 7.481967355500946, "learning_rate": 6.079631718492569e-06, "loss": 2.5438, "step": 901 }, { "epoch": 1.4671414038657171, "grad_norm": 7.019182389285353, "learning_rate": 6.070390388772184e-06, "loss": 2.5147, "step": 902 }, { "epoch": 1.468769074262462, "grad_norm": 7.217858170803212, "learning_rate": 6.061145225285454e-06, "loss": 2.5747, "step": 903 }, { "epoch": 1.4703967446592066, "grad_norm": 7.194739066728751, "learning_rate": 6.051896261145341e-06, "loss": 2.4514, "step": 904 }, { "epoch": 1.4720244150559512, "grad_norm": 7.589463929467697, "learning_rate": 6.042643529478424e-06, "loss": 2.529, "step": 905 }, { "epoch": 1.4736520854526958, "grad_norm": 7.668688488296065, "learning_rate": 6.033387063424765e-06, "loss": 2.5081, "step": 906 }, { "epoch": 1.4752797558494404, "grad_norm": 7.293862215657799, "learning_rate": 6.024126896137814e-06, "loss": 2.507, "step": 907 }, { "epoch": 1.4769074262461852, "grad_norm": 7.428762165444603, "learning_rate": 6.01486306078427e-06, "loss": 2.5566, "step": 908 }, { "epoch": 1.4785350966429298, "grad_norm": 8.01644904480054, "learning_rate": 6.00559559054397e-06, "loss": 2.6876, "step": 909 }, { "epoch": 1.4801627670396744, "grad_norm": 7.3550637243759445, "learning_rate": 5.996324518609773e-06, "loss": 2.6271, "step": 910 }, { "epoch": 1.4817904374364192, "grad_norm": 7.520396465759319, "learning_rate": 5.987049878187437e-06, "loss": 2.6183, "step": 911 }, { "epoch": 1.4834181078331639, "grad_norm": 7.196312959853555, "learning_rate": 5.977771702495497e-06, "loss": 2.48, "step": 912 }, { "epoch": 1.4850457782299085, "grad_norm": 6.907073801554598, "learning_rate": 5.968490024765158e-06, "loss": 2.5442, "step": 913 }, { "epoch": 1.486673448626653, "grad_norm": 7.495793381655675, "learning_rate": 5.95920487824016e-06, "loss": 2.6515, "step": 914 }, { "epoch": 1.4883011190233977, "grad_norm": 6.949782749728095, "learning_rate": 5.949916296176674e-06, "loss": 2.4914, "step": 915 }, { "epoch": 1.4899287894201425, "grad_norm": 6.974810351288801, "learning_rate": 5.94062431184317e-06, "loss": 2.6851, "step": 916 }, { "epoch": 1.4915564598168871, "grad_norm": 7.449845147433385, "learning_rate": 5.9313289585203074e-06, "loss": 2.6633, "step": 917 }, { "epoch": 1.4931841302136317, "grad_norm": 7.286082142948522, "learning_rate": 5.922030269500809e-06, "loss": 2.6014, "step": 918 }, { "epoch": 1.4948118006103763, "grad_norm": 7.978819746203161, "learning_rate": 5.912728278089352e-06, "loss": 2.6989, "step": 919 }, { "epoch": 1.496439471007121, "grad_norm": 7.893521170197285, "learning_rate": 5.903423017602432e-06, "loss": 2.493, "step": 920 }, { "epoch": 1.4980671414038658, "grad_norm": 7.7983573385364044, "learning_rate": 5.894114521368259e-06, "loss": 2.6423, "step": 921 }, { "epoch": 1.4996948118006104, "grad_norm": 7.188609855009061, "learning_rate": 5.8848028227266325e-06, "loss": 2.6809, "step": 922 }, { "epoch": 1.501322482197355, "grad_norm": 6.650471919906854, "learning_rate": 5.87548795502882e-06, "loss": 2.344, "step": 923 }, { "epoch": 1.5029501525940998, "grad_norm": 7.150725506246275, "learning_rate": 5.8661699516374395e-06, "loss": 2.6716, "step": 924 }, { "epoch": 1.5045778229908442, "grad_norm": 7.371375525207782, "learning_rate": 5.85684884592634e-06, "loss": 2.5156, "step": 925 }, { "epoch": 1.506205493387589, "grad_norm": 7.784278483730839, "learning_rate": 5.8475246712804845e-06, "loss": 2.7374, "step": 926 }, { "epoch": 1.5078331637843336, "grad_norm": 6.596053925644081, "learning_rate": 5.8381974610958226e-06, "loss": 2.2945, "step": 927 }, { "epoch": 1.5094608341810782, "grad_norm": 7.356278497582057, "learning_rate": 5.828867248779186e-06, "loss": 2.4299, "step": 928 }, { "epoch": 1.511088504577823, "grad_norm": 7.375028160208118, "learning_rate": 5.819534067748143e-06, "loss": 2.5849, "step": 929 }, { "epoch": 1.5127161749745677, "grad_norm": 7.227202333752401, "learning_rate": 5.810197951430912e-06, "loss": 2.2366, "step": 930 }, { "epoch": 1.5143438453713123, "grad_norm": 6.148502615726455, "learning_rate": 5.800858933266214e-06, "loss": 2.6037, "step": 931 }, { "epoch": 1.515971515768057, "grad_norm": 7.175877401834235, "learning_rate": 5.791517046703163e-06, "loss": 2.4112, "step": 932 }, { "epoch": 1.5175991861648015, "grad_norm": 6.855731172338194, "learning_rate": 5.782172325201155e-06, "loss": 2.624, "step": 933 }, { "epoch": 1.5192268565615463, "grad_norm": 7.758469442612416, "learning_rate": 5.772824802229733e-06, "loss": 2.493, "step": 934 }, { "epoch": 1.520854526958291, "grad_norm": 7.342628560696581, "learning_rate": 5.763474511268473e-06, "loss": 2.7452, "step": 935 }, { "epoch": 1.5224821973550355, "grad_norm": 7.693819959531978, "learning_rate": 5.7541214858068705e-06, "loss": 2.4929, "step": 936 }, { "epoch": 1.5241098677517804, "grad_norm": 8.273753770641765, "learning_rate": 5.74476575934421e-06, "loss": 2.6586, "step": 937 }, { "epoch": 1.525737538148525, "grad_norm": 7.324058106769533, "learning_rate": 5.735407365389453e-06, "loss": 2.5767, "step": 938 }, { "epoch": 1.5273652085452696, "grad_norm": 7.9236615465131, "learning_rate": 5.726046337461112e-06, "loss": 2.5724, "step": 939 }, { "epoch": 1.5289928789420142, "grad_norm": 6.998830307067352, "learning_rate": 5.716682709087139e-06, "loss": 2.5056, "step": 940 }, { "epoch": 1.5306205493387588, "grad_norm": 7.641704860522804, "learning_rate": 5.707316513804793e-06, "loss": 2.6522, "step": 941 }, { "epoch": 1.5322482197355036, "grad_norm": 7.641861552091686, "learning_rate": 5.697947785160532e-06, "loss": 2.5801, "step": 942 }, { "epoch": 1.5338758901322482, "grad_norm": 7.813868525665441, "learning_rate": 5.688576556709886e-06, "loss": 2.6437, "step": 943 }, { "epoch": 1.5355035605289928, "grad_norm": 6.833498366184688, "learning_rate": 5.679202862017338e-06, "loss": 2.5823, "step": 944 }, { "epoch": 1.5371312309257377, "grad_norm": 6.876929467732722, "learning_rate": 5.669826734656203e-06, "loss": 2.3855, "step": 945 }, { "epoch": 1.538758901322482, "grad_norm": 7.3685284359593375, "learning_rate": 5.660448208208513e-06, "loss": 2.4017, "step": 946 }, { "epoch": 1.5403865717192269, "grad_norm": 6.676024685139891, "learning_rate": 5.651067316264888e-06, "loss": 2.539, "step": 947 }, { "epoch": 1.5420142421159715, "grad_norm": 7.114674645765368, "learning_rate": 5.641684092424421e-06, "loss": 2.5444, "step": 948 }, { "epoch": 1.543641912512716, "grad_norm": 7.84844029586021, "learning_rate": 5.632298570294563e-06, "loss": 2.731, "step": 949 }, { "epoch": 1.545269582909461, "grad_norm": 7.78876037123777, "learning_rate": 5.6229107834909875e-06, "loss": 2.6118, "step": 950 }, { "epoch": 1.5468972533062055, "grad_norm": 7.198673787360983, "learning_rate": 5.613520765637489e-06, "loss": 2.5225, "step": 951 }, { "epoch": 1.5485249237029501, "grad_norm": 7.517373316334473, "learning_rate": 5.604128550365845e-06, "loss": 2.5465, "step": 952 }, { "epoch": 1.550152594099695, "grad_norm": 7.149795276593462, "learning_rate": 5.594734171315705e-06, "loss": 2.5681, "step": 953 }, { "epoch": 1.5517802644964394, "grad_norm": 7.731751104486799, "learning_rate": 5.585337662134471e-06, "loss": 2.5822, "step": 954 }, { "epoch": 1.5534079348931842, "grad_norm": 7.354441658128124, "learning_rate": 5.575939056477173e-06, "loss": 2.64, "step": 955 }, { "epoch": 1.5550356052899288, "grad_norm": 6.877006410280508, "learning_rate": 5.566538388006351e-06, "loss": 2.5185, "step": 956 }, { "epoch": 1.5566632756866734, "grad_norm": 7.137354528314152, "learning_rate": 5.557135690391928e-06, "loss": 2.5064, "step": 957 }, { "epoch": 1.5582909460834182, "grad_norm": 7.421543689334166, "learning_rate": 5.547730997311105e-06, "loss": 2.5851, "step": 958 }, { "epoch": 1.5599186164801626, "grad_norm": 7.185231705567218, "learning_rate": 5.538324342448221e-06, "loss": 2.4413, "step": 959 }, { "epoch": 1.5615462868769074, "grad_norm": 10.887726232092824, "learning_rate": 5.528915759494641e-06, "loss": 2.5671, "step": 960 }, { "epoch": 1.563173957273652, "grad_norm": 7.077040556073354, "learning_rate": 5.519505282148644e-06, "loss": 2.4581, "step": 961 }, { "epoch": 1.5648016276703967, "grad_norm": 7.196071021548636, "learning_rate": 5.510092944115286e-06, "loss": 2.4458, "step": 962 }, { "epoch": 1.5664292980671415, "grad_norm": 6.91786141012999, "learning_rate": 5.500678779106292e-06, "loss": 2.4181, "step": 963 }, { "epoch": 1.568056968463886, "grad_norm": 7.0513515776447955, "learning_rate": 5.49126282083993e-06, "loss": 2.4293, "step": 964 }, { "epoch": 1.5696846388606307, "grad_norm": 7.128953859778242, "learning_rate": 5.481845103040887e-06, "loss": 2.4621, "step": 965 }, { "epoch": 1.5713123092573755, "grad_norm": 8.095938630442914, "learning_rate": 5.472425659440157e-06, "loss": 2.5879, "step": 966 }, { "epoch": 1.57293997965412, "grad_norm": 7.22102940587674, "learning_rate": 5.463004523774913e-06, "loss": 2.407, "step": 967 }, { "epoch": 1.5745676500508647, "grad_norm": 7.231699988709768, "learning_rate": 5.453581729788388e-06, "loss": 2.5328, "step": 968 }, { "epoch": 1.5761953204476093, "grad_norm": 7.45139297139098, "learning_rate": 5.4441573112297545e-06, "loss": 2.5171, "step": 969 }, { "epoch": 1.577822990844354, "grad_norm": 7.6473986157633265, "learning_rate": 5.434731301854006e-06, "loss": 2.4911, "step": 970 }, { "epoch": 1.5794506612410988, "grad_norm": 7.458793111189566, "learning_rate": 5.425303735421828e-06, "loss": 2.6287, "step": 971 }, { "epoch": 1.5810783316378432, "grad_norm": 8.028713745468513, "learning_rate": 5.415874645699493e-06, "loss": 2.5735, "step": 972 }, { "epoch": 1.582706002034588, "grad_norm": 6.674792035256759, "learning_rate": 5.406444066458718e-06, "loss": 2.2867, "step": 973 }, { "epoch": 1.5843336724313326, "grad_norm": 7.093417637081279, "learning_rate": 5.397012031476561e-06, "loss": 2.6579, "step": 974 }, { "epoch": 1.5859613428280772, "grad_norm": 7.552038987700316, "learning_rate": 5.387578574535296e-06, "loss": 2.7058, "step": 975 }, { "epoch": 1.587589013224822, "grad_norm": 7.507278100695671, "learning_rate": 5.378143729422285e-06, "loss": 2.5016, "step": 976 }, { "epoch": 1.5892166836215667, "grad_norm": 7.374176748452649, "learning_rate": 5.368707529929863e-06, "loss": 2.6018, "step": 977 }, { "epoch": 1.5908443540183113, "grad_norm": 7.3926100644120645, "learning_rate": 5.359270009855217e-06, "loss": 2.5668, "step": 978 }, { "epoch": 1.592472024415056, "grad_norm": 7.491343029213741, "learning_rate": 5.349831203000267e-06, "loss": 2.4161, "step": 979 }, { "epoch": 1.5940996948118005, "grad_norm": 6.987201208813213, "learning_rate": 5.340391143171535e-06, "loss": 2.4957, "step": 980 }, { "epoch": 1.5957273652085453, "grad_norm": 7.365175848952149, "learning_rate": 5.330949864180034e-06, "loss": 2.5208, "step": 981 }, { "epoch": 1.59735503560529, "grad_norm": 7.127294501427154, "learning_rate": 5.321507399841148e-06, "loss": 2.4671, "step": 982 }, { "epoch": 1.5989827060020345, "grad_norm": 7.4316809783192666, "learning_rate": 5.312063783974498e-06, "loss": 2.5966, "step": 983 }, { "epoch": 1.6006103763987793, "grad_norm": 7.66455724038307, "learning_rate": 5.302619050403837e-06, "loss": 2.5283, "step": 984 }, { "epoch": 1.602238046795524, "grad_norm": 6.675341208168659, "learning_rate": 5.293173232956917e-06, "loss": 2.4908, "step": 985 }, { "epoch": 1.6038657171922686, "grad_norm": 6.889025532052792, "learning_rate": 5.2837263654653715e-06, "loss": 2.4582, "step": 986 }, { "epoch": 1.6054933875890134, "grad_norm": 7.1823495855814805, "learning_rate": 5.274278481764598e-06, "loss": 2.4823, "step": 987 }, { "epoch": 1.6071210579857578, "grad_norm": 7.52947103767488, "learning_rate": 5.264829615693631e-06, "loss": 2.5309, "step": 988 }, { "epoch": 1.6087487283825026, "grad_norm": 7.033201390653195, "learning_rate": 5.255379801095027e-06, "loss": 2.5029, "step": 989 }, { "epoch": 1.6103763987792472, "grad_norm": 7.571844050340152, "learning_rate": 5.245929071814735e-06, "loss": 2.4224, "step": 990 }, { "epoch": 1.6120040691759918, "grad_norm": 7.719506622977143, "learning_rate": 5.236477461701985e-06, "loss": 2.537, "step": 991 }, { "epoch": 1.6136317395727366, "grad_norm": 7.157719113763191, "learning_rate": 5.227025004609156e-06, "loss": 2.6934, "step": 992 }, { "epoch": 1.615259409969481, "grad_norm": 7.155667808881783, "learning_rate": 5.217571734391668e-06, "loss": 2.4204, "step": 993 }, { "epoch": 1.6168870803662259, "grad_norm": 7.144553699316705, "learning_rate": 5.208117684907846e-06, "loss": 2.5955, "step": 994 }, { "epoch": 1.6185147507629705, "grad_norm": 7.136958920735455, "learning_rate": 5.198662890018811e-06, "loss": 2.586, "step": 995 }, { "epoch": 1.620142421159715, "grad_norm": 6.651633070813827, "learning_rate": 5.189207383588353e-06, "loss": 2.4901, "step": 996 }, { "epoch": 1.62177009155646, "grad_norm": 8.040412670635826, "learning_rate": 5.179751199482807e-06, "loss": 2.5633, "step": 997 }, { "epoch": 1.6233977619532045, "grad_norm": 7.836427156702813, "learning_rate": 5.1702943715709395e-06, "loss": 2.5366, "step": 998 }, { "epoch": 1.6250254323499491, "grad_norm": 8.001585804680515, "learning_rate": 5.160836933723821e-06, "loss": 2.484, "step": 999 }, { "epoch": 1.626653102746694, "grad_norm": 8.095653407925694, "learning_rate": 5.151378919814708e-06, "loss": 2.6694, "step": 1000 }, { "epoch": 1.6282807731434383, "grad_norm": 7.140418373231746, "learning_rate": 5.141920363718916e-06, "loss": 2.4354, "step": 1001 }, { "epoch": 1.6299084435401832, "grad_norm": 7.246433618890291, "learning_rate": 5.132461299313709e-06, "loss": 2.4798, "step": 1002 }, { "epoch": 1.6315361139369278, "grad_norm": 6.996662844936418, "learning_rate": 5.1230017604781666e-06, "loss": 2.4413, "step": 1003 }, { "epoch": 1.6331637843336724, "grad_norm": 6.607652154252145, "learning_rate": 5.113541781093068e-06, "loss": 2.4727, "step": 1004 }, { "epoch": 1.6347914547304172, "grad_norm": 6.673508482283488, "learning_rate": 5.104081395040773e-06, "loss": 2.5388, "step": 1005 }, { "epoch": 1.6364191251271616, "grad_norm": 7.243519389171994, "learning_rate": 5.094620636205096e-06, "loss": 2.5741, "step": 1006 }, { "epoch": 1.6380467955239064, "grad_norm": 7.0022363352250006, "learning_rate": 5.085159538471186e-06, "loss": 2.6998, "step": 1007 }, { "epoch": 1.639674465920651, "grad_norm": 8.203935790495846, "learning_rate": 5.075698135725408e-06, "loss": 2.7088, "step": 1008 }, { "epoch": 1.6413021363173956, "grad_norm": 7.2066440861653085, "learning_rate": 5.0662364618552186e-06, "loss": 2.5254, "step": 1009 }, { "epoch": 1.6429298067141405, "grad_norm": 7.347262528712721, "learning_rate": 5.056774550749043e-06, "loss": 2.6765, "step": 1010 }, { "epoch": 1.644557477110885, "grad_norm": 7.64763026262225, "learning_rate": 5.047312436296159e-06, "loss": 2.6078, "step": 1011 }, { "epoch": 1.6461851475076297, "grad_norm": 7.005937467718846, "learning_rate": 5.037850152386574e-06, "loss": 2.5208, "step": 1012 }, { "epoch": 1.6478128179043745, "grad_norm": 6.776422626919974, "learning_rate": 5.028387732910897e-06, "loss": 2.5829, "step": 1013 }, { "epoch": 1.649440488301119, "grad_norm": 7.618670766574482, "learning_rate": 5.018925211760227e-06, "loss": 2.4828, "step": 1014 }, { "epoch": 1.6510681586978637, "grad_norm": 7.093203390139925, "learning_rate": 5.009462622826027e-06, "loss": 2.5771, "step": 1015 }, { "epoch": 1.6526958290946083, "grad_norm": 7.382086277685373, "learning_rate": 5e-06, "loss": 2.6665, "step": 1016 }, { "epoch": 1.654323499491353, "grad_norm": 6.812240548341276, "learning_rate": 4.990537377173975e-06, "loss": 2.66, "step": 1017 }, { "epoch": 1.6559511698880978, "grad_norm": 7.2061095865491644, "learning_rate": 4.981074788239773e-06, "loss": 2.5818, "step": 1018 }, { "epoch": 1.6575788402848424, "grad_norm": 7.302258449917542, "learning_rate": 4.971612267089105e-06, "loss": 2.6427, "step": 1019 }, { "epoch": 1.659206510681587, "grad_norm": 6.83855220491758, "learning_rate": 4.9621498476134284e-06, "loss": 2.4811, "step": 1020 }, { "epoch": 1.6608341810783316, "grad_norm": 7.288495518038251, "learning_rate": 4.952687563703841e-06, "loss": 2.7484, "step": 1021 }, { "epoch": 1.6624618514750762, "grad_norm": 7.039423007586131, "learning_rate": 4.943225449250959e-06, "loss": 2.4206, "step": 1022 }, { "epoch": 1.664089521871821, "grad_norm": 7.002259620776586, "learning_rate": 4.933763538144784e-06, "loss": 2.5709, "step": 1023 }, { "epoch": 1.6657171922685656, "grad_norm": 7.644127408244774, "learning_rate": 4.924301864274592e-06, "loss": 2.4964, "step": 1024 }, { "epoch": 1.6673448626653102, "grad_norm": 7.306220699963003, "learning_rate": 4.914840461528815e-06, "loss": 2.6019, "step": 1025 }, { "epoch": 1.668972533062055, "grad_norm": 7.395394982951798, "learning_rate": 4.905379363794907e-06, "loss": 2.5039, "step": 1026 }, { "epoch": 1.6706002034587994, "grad_norm": 7.086958544901244, "learning_rate": 4.895918604959227e-06, "loss": 2.4569, "step": 1027 }, { "epoch": 1.6722278738555443, "grad_norm": 7.189652484218376, "learning_rate": 4.886458218906934e-06, "loss": 2.4913, "step": 1028 }, { "epoch": 1.6738555442522889, "grad_norm": 7.016905651005405, "learning_rate": 4.876998239521836e-06, "loss": 2.5256, "step": 1029 }, { "epoch": 1.6754832146490335, "grad_norm": 7.120975867035879, "learning_rate": 4.867538700686292e-06, "loss": 2.5182, "step": 1030 }, { "epoch": 1.6771108850457783, "grad_norm": 7.026654674087041, "learning_rate": 4.858079636281086e-06, "loss": 2.7166, "step": 1031 }, { "epoch": 1.678738555442523, "grad_norm": 6.962421823693428, "learning_rate": 4.848621080185294e-06, "loss": 2.5253, "step": 1032 }, { "epoch": 1.6803662258392675, "grad_norm": 6.7886107843879335, "learning_rate": 4.83916306627618e-06, "loss": 2.5359, "step": 1033 }, { "epoch": 1.6819938962360124, "grad_norm": 7.614220341425296, "learning_rate": 4.829705628429061e-06, "loss": 2.4976, "step": 1034 }, { "epoch": 1.6836215666327567, "grad_norm": 7.480022392610622, "learning_rate": 4.820248800517196e-06, "loss": 2.4889, "step": 1035 }, { "epoch": 1.6852492370295016, "grad_norm": 6.905508623087176, "learning_rate": 4.81079261641165e-06, "loss": 2.4426, "step": 1036 }, { "epoch": 1.6868769074262462, "grad_norm": 7.4599356130159125, "learning_rate": 4.80133710998119e-06, "loss": 2.4602, "step": 1037 }, { "epoch": 1.6885045778229908, "grad_norm": 7.1469839062333245, "learning_rate": 4.791882315092156e-06, "loss": 2.6325, "step": 1038 }, { "epoch": 1.6901322482197356, "grad_norm": 6.814630576881873, "learning_rate": 4.782428265608333e-06, "loss": 2.4387, "step": 1039 }, { "epoch": 1.69175991861648, "grad_norm": 7.751885946893623, "learning_rate": 4.7729749953908455e-06, "loss": 2.6227, "step": 1040 }, { "epoch": 1.6933875890132248, "grad_norm": 7.150381593449218, "learning_rate": 4.763522538298018e-06, "loss": 2.4556, "step": 1041 }, { "epoch": 1.6950152594099694, "grad_norm": 7.430824372769004, "learning_rate": 4.754070928185266e-06, "loss": 2.4709, "step": 1042 }, { "epoch": 1.696642929806714, "grad_norm": 7.230327730351756, "learning_rate": 4.744620198904974e-06, "loss": 2.5879, "step": 1043 }, { "epoch": 1.6982706002034589, "grad_norm": 7.820633018027017, "learning_rate": 4.735170384306371e-06, "loss": 2.7627, "step": 1044 }, { "epoch": 1.6998982706002035, "grad_norm": 7.25619538803272, "learning_rate": 4.725721518235404e-06, "loss": 2.6317, "step": 1045 }, { "epoch": 1.701525940996948, "grad_norm": 6.7069555267499705, "learning_rate": 4.71627363453463e-06, "loss": 2.3263, "step": 1046 }, { "epoch": 1.703153611393693, "grad_norm": 7.279873863244663, "learning_rate": 4.706826767043086e-06, "loss": 2.6514, "step": 1047 }, { "epoch": 1.7047812817904373, "grad_norm": 7.037469909408122, "learning_rate": 4.697380949596163e-06, "loss": 2.5286, "step": 1048 }, { "epoch": 1.7064089521871821, "grad_norm": 6.995416864393667, "learning_rate": 4.687936216025503e-06, "loss": 2.4772, "step": 1049 }, { "epoch": 1.7080366225839267, "grad_norm": 6.644354188838425, "learning_rate": 4.678492600158855e-06, "loss": 2.4749, "step": 1050 }, { "epoch": 1.7096642929806714, "grad_norm": 7.438997093072106, "learning_rate": 4.669050135819966e-06, "loss": 2.6723, "step": 1051 }, { "epoch": 1.7112919633774162, "grad_norm": 7.268887011125248, "learning_rate": 4.659608856828467e-06, "loss": 2.3428, "step": 1052 }, { "epoch": 1.7129196337741606, "grad_norm": 6.961883223211438, "learning_rate": 4.650168796999736e-06, "loss": 2.3491, "step": 1053 }, { "epoch": 1.7145473041709054, "grad_norm": 7.171201017675992, "learning_rate": 4.640729990144784e-06, "loss": 2.3892, "step": 1054 }, { "epoch": 1.71617497456765, "grad_norm": 6.813999651711974, "learning_rate": 4.631292470070139e-06, "loss": 2.44, "step": 1055 }, { "epoch": 1.7178026449643946, "grad_norm": 7.097405800083662, "learning_rate": 4.6218562705777185e-06, "loss": 2.4579, "step": 1056 }, { "epoch": 1.7194303153611394, "grad_norm": 7.496957653488845, "learning_rate": 4.6124214254647045e-06, "loss": 2.5201, "step": 1057 }, { "epoch": 1.721057985757884, "grad_norm": 6.825618477683297, "learning_rate": 4.6029879685234395e-06, "loss": 2.4174, "step": 1058 }, { "epoch": 1.7226856561546287, "grad_norm": 7.482194106723668, "learning_rate": 4.593555933541284e-06, "loss": 2.7331, "step": 1059 }, { "epoch": 1.7243133265513735, "grad_norm": 7.848667593871324, "learning_rate": 4.584125354300508e-06, "loss": 2.7415, "step": 1060 }, { "epoch": 1.7259409969481179, "grad_norm": 7.529112747319603, "learning_rate": 4.574696264578173e-06, "loss": 2.5681, "step": 1061 }, { "epoch": 1.7275686673448627, "grad_norm": 7.806506919368194, "learning_rate": 4.565268698145997e-06, "loss": 2.5584, "step": 1062 }, { "epoch": 1.7291963377416073, "grad_norm": 7.910386843338659, "learning_rate": 4.555842688770246e-06, "loss": 2.7977, "step": 1063 }, { "epoch": 1.730824008138352, "grad_norm": 7.638140898235063, "learning_rate": 4.546418270211614e-06, "loss": 2.6554, "step": 1064 }, { "epoch": 1.7324516785350967, "grad_norm": 7.2654573148630055, "learning_rate": 4.53699547622509e-06, "loss": 2.4862, "step": 1065 }, { "epoch": 1.7340793489318413, "grad_norm": 10.827986671962512, "learning_rate": 4.527574340559844e-06, "loss": 2.736, "step": 1066 }, { "epoch": 1.735707019328586, "grad_norm": 8.096168532701657, "learning_rate": 4.518154896959114e-06, "loss": 2.647, "step": 1067 }, { "epoch": 1.7373346897253308, "grad_norm": 7.693954134878495, "learning_rate": 4.508737179160072e-06, "loss": 2.6038, "step": 1068 }, { "epoch": 1.7389623601220752, "grad_norm": 7.032328750706983, "learning_rate": 4.4993212208937084e-06, "loss": 2.5442, "step": 1069 }, { "epoch": 1.74059003051882, "grad_norm": 6.95569056250147, "learning_rate": 4.489907055884716e-06, "loss": 2.6493, "step": 1070 }, { "epoch": 1.7422177009155646, "grad_norm": 6.967405867428712, "learning_rate": 4.480494717851359e-06, "loss": 2.501, "step": 1071 }, { "epoch": 1.7438453713123092, "grad_norm": 7.808501419179553, "learning_rate": 4.47108424050536e-06, "loss": 2.4363, "step": 1072 }, { "epoch": 1.745473041709054, "grad_norm": 6.928981315709678, "learning_rate": 4.461675657551782e-06, "loss": 2.721, "step": 1073 }, { "epoch": 1.7471007121057984, "grad_norm": 7.208341469080126, "learning_rate": 4.452269002688897e-06, "loss": 2.5799, "step": 1074 }, { "epoch": 1.7487283825025433, "grad_norm": 6.945217000144789, "learning_rate": 4.442864309608072e-06, "loss": 2.4425, "step": 1075 }, { "epoch": 1.7503560528992879, "grad_norm": 8.34551026828082, "learning_rate": 4.4334616119936516e-06, "loss": 2.6507, "step": 1076 }, { "epoch": 1.7519837232960325, "grad_norm": 7.430160574062386, "learning_rate": 4.424060943522829e-06, "loss": 2.5787, "step": 1077 }, { "epoch": 1.7536113936927773, "grad_norm": 6.894410399766849, "learning_rate": 4.41466233786553e-06, "loss": 2.3783, "step": 1078 }, { "epoch": 1.755239064089522, "grad_norm": 7.075846509519076, "learning_rate": 4.405265828684297e-06, "loss": 2.4959, "step": 1079 }, { "epoch": 1.7568667344862665, "grad_norm": 7.341849619660998, "learning_rate": 4.395871449634157e-06, "loss": 2.6428, "step": 1080 }, { "epoch": 1.7584944048830113, "grad_norm": 7.190628155303531, "learning_rate": 4.386479234362512e-06, "loss": 2.4878, "step": 1081 }, { "epoch": 1.7601220752797557, "grad_norm": 7.360691069781115, "learning_rate": 4.377089216509013e-06, "loss": 2.4402, "step": 1082 }, { "epoch": 1.7617497456765006, "grad_norm": 6.807206936682764, "learning_rate": 4.367701429705439e-06, "loss": 2.5597, "step": 1083 }, { "epoch": 1.7633774160732452, "grad_norm": 7.0530057243032065, "learning_rate": 4.35831590757558e-06, "loss": 2.5173, "step": 1084 }, { "epoch": 1.7650050864699898, "grad_norm": 7.31675516087201, "learning_rate": 4.3489326837351145e-06, "loss": 2.615, "step": 1085 }, { "epoch": 1.7666327568667346, "grad_norm": 7.6248980903207375, "learning_rate": 4.33955179179149e-06, "loss": 2.5752, "step": 1086 }, { "epoch": 1.768260427263479, "grad_norm": 7.7482320561240945, "learning_rate": 4.3301732653437975e-06, "loss": 2.443, "step": 1087 }, { "epoch": 1.7698880976602238, "grad_norm": 7.304452940220405, "learning_rate": 4.3207971379826636e-06, "loss": 2.6258, "step": 1088 }, { "epoch": 1.7715157680569684, "grad_norm": 7.397280698980048, "learning_rate": 4.311423443290115e-06, "loss": 2.3996, "step": 1089 }, { "epoch": 1.773143438453713, "grad_norm": 7.6850404118922455, "learning_rate": 4.302052214839468e-06, "loss": 2.6133, "step": 1090 }, { "epoch": 1.7747711088504579, "grad_norm": 7.000111641516366, "learning_rate": 4.292683486195208e-06, "loss": 2.5009, "step": 1091 }, { "epoch": 1.7763987792472025, "grad_norm": 7.917045085494247, "learning_rate": 4.2833172909128635e-06, "loss": 2.6779, "step": 1092 }, { "epoch": 1.778026449643947, "grad_norm": 7.28605852242624, "learning_rate": 4.273953662538888e-06, "loss": 2.6072, "step": 1093 }, { "epoch": 1.779654120040692, "grad_norm": 7.628060275285932, "learning_rate": 4.264592634610549e-06, "loss": 2.4954, "step": 1094 }, { "epoch": 1.7812817904374363, "grad_norm": 7.35694412582059, "learning_rate": 4.2552342406557924e-06, "loss": 2.4745, "step": 1095 }, { "epoch": 1.7829094608341811, "grad_norm": 7.547708508004973, "learning_rate": 4.245878514193131e-06, "loss": 2.6251, "step": 1096 }, { "epoch": 1.7845371312309257, "grad_norm": 7.096776022578963, "learning_rate": 4.236525488731528e-06, "loss": 2.5121, "step": 1097 }, { "epoch": 1.7861648016276703, "grad_norm": 7.672206191844872, "learning_rate": 4.22717519777027e-06, "loss": 2.4583, "step": 1098 }, { "epoch": 1.7877924720244152, "grad_norm": 7.280882495478644, "learning_rate": 4.217827674798845e-06, "loss": 2.6088, "step": 1099 }, { "epoch": 1.7894201424211598, "grad_norm": 6.874056084031787, "learning_rate": 4.208482953296838e-06, "loss": 2.4219, "step": 1100 }, { "epoch": 1.7910478128179044, "grad_norm": 7.431546460362658, "learning_rate": 4.1991410667337896e-06, "loss": 2.5235, "step": 1101 }, { "epoch": 1.792675483214649, "grad_norm": 7.233958479795417, "learning_rate": 4.189802048569089e-06, "loss": 2.3957, "step": 1102 }, { "epoch": 1.7943031536113936, "grad_norm": 7.530582653332875, "learning_rate": 4.180465932251859e-06, "loss": 2.6518, "step": 1103 }, { "epoch": 1.7959308240081384, "grad_norm": 6.4542854537445375, "learning_rate": 4.171132751220818e-06, "loss": 2.4958, "step": 1104 }, { "epoch": 1.797558494404883, "grad_norm": 6.887957441840262, "learning_rate": 4.161802538904177e-06, "loss": 2.5058, "step": 1105 }, { "epoch": 1.7991861648016276, "grad_norm": 6.69564775487908, "learning_rate": 4.152475328719517e-06, "loss": 2.5479, "step": 1106 }, { "epoch": 1.8008138351983725, "grad_norm": 7.72823793957526, "learning_rate": 4.143151154073662e-06, "loss": 2.7064, "step": 1107 }, { "epoch": 1.8024415055951168, "grad_norm": 7.298414439774556, "learning_rate": 4.133830048362561e-06, "loss": 2.5946, "step": 1108 }, { "epoch": 1.8040691759918617, "grad_norm": 7.078921723223037, "learning_rate": 4.124512044971182e-06, "loss": 2.5802, "step": 1109 }, { "epoch": 1.8056968463886063, "grad_norm": 7.086279144567053, "learning_rate": 4.115197177273369e-06, "loss": 2.4828, "step": 1110 }, { "epoch": 1.8073245167853509, "grad_norm": 7.319485129314234, "learning_rate": 4.105885478631741e-06, "loss": 2.4557, "step": 1111 }, { "epoch": 1.8089521871820957, "grad_norm": 7.030235833494835, "learning_rate": 4.09657698239757e-06, "loss": 2.3216, "step": 1112 }, { "epoch": 1.8105798575788403, "grad_norm": 7.35419310473764, "learning_rate": 4.08727172191065e-06, "loss": 2.3807, "step": 1113 }, { "epoch": 1.812207527975585, "grad_norm": 7.385039338941366, "learning_rate": 4.07796973049919e-06, "loss": 2.6247, "step": 1114 }, { "epoch": 1.8138351983723298, "grad_norm": 8.301883899365215, "learning_rate": 4.068671041479694e-06, "loss": 2.5742, "step": 1115 }, { "epoch": 1.8154628687690741, "grad_norm": 7.634367995178018, "learning_rate": 4.059375688156833e-06, "loss": 2.6052, "step": 1116 }, { "epoch": 1.817090539165819, "grad_norm": 8.030607737386342, "learning_rate": 4.050083703823327e-06, "loss": 2.5119, "step": 1117 }, { "epoch": 1.8187182095625636, "grad_norm": 7.455045822050495, "learning_rate": 4.040795121759841e-06, "loss": 2.4438, "step": 1118 }, { "epoch": 1.8203458799593082, "grad_norm": 7.014408649898465, "learning_rate": 4.031509975234844e-06, "loss": 2.3613, "step": 1119 }, { "epoch": 1.821973550356053, "grad_norm": 6.754041021459838, "learning_rate": 4.022228297504503e-06, "loss": 2.4299, "step": 1120 }, { "epoch": 1.8236012207527974, "grad_norm": 7.494592402276598, "learning_rate": 4.012950121812566e-06, "loss": 2.4878, "step": 1121 }, { "epoch": 1.8252288911495422, "grad_norm": 7.777800608434133, "learning_rate": 4.003675481390229e-06, "loss": 2.4941, "step": 1122 }, { "epoch": 1.8268565615462868, "grad_norm": 6.679188595042328, "learning_rate": 3.994404409456031e-06, "loss": 2.4638, "step": 1123 }, { "epoch": 1.8284842319430314, "grad_norm": 7.1371904671053965, "learning_rate": 3.985136939215731e-06, "loss": 2.4732, "step": 1124 }, { "epoch": 1.8301119023397763, "grad_norm": 7.469516873454636, "learning_rate": 3.975873103862188e-06, "loss": 2.5102, "step": 1125 }, { "epoch": 1.8317395727365209, "grad_norm": 6.9423286528774595, "learning_rate": 3.966612936575235e-06, "loss": 2.408, "step": 1126 }, { "epoch": 1.8333672431332655, "grad_norm": 7.5739444539849385, "learning_rate": 3.957356470521578e-06, "loss": 2.5493, "step": 1127 }, { "epoch": 1.8349949135300103, "grad_norm": 8.086808216675971, "learning_rate": 3.94810373885466e-06, "loss": 2.5591, "step": 1128 }, { "epoch": 1.8366225839267547, "grad_norm": 7.511027582267737, "learning_rate": 3.938854774714546e-06, "loss": 2.5338, "step": 1129 }, { "epoch": 1.8382502543234995, "grad_norm": 7.3756972575333615, "learning_rate": 3.929609611227817e-06, "loss": 2.4029, "step": 1130 }, { "epoch": 1.8398779247202441, "grad_norm": 7.776857467463983, "learning_rate": 3.920368281507431e-06, "loss": 2.4889, "step": 1131 }, { "epoch": 1.8415055951169887, "grad_norm": 7.019809965427335, "learning_rate": 3.911130818652621e-06, "loss": 2.5604, "step": 1132 }, { "epoch": 1.8431332655137336, "grad_norm": 7.336616297376101, "learning_rate": 3.901897255748769e-06, "loss": 2.5695, "step": 1133 }, { "epoch": 1.8447609359104782, "grad_norm": 6.671808167656518, "learning_rate": 3.892667625867287e-06, "loss": 2.3952, "step": 1134 }, { "epoch": 1.8463886063072228, "grad_norm": 6.933099028207708, "learning_rate": 3.883441962065499e-06, "loss": 2.3834, "step": 1135 }, { "epoch": 1.8480162767039674, "grad_norm": 7.6095614643928355, "learning_rate": 3.87422029738653e-06, "loss": 2.4217, "step": 1136 }, { "epoch": 1.849643947100712, "grad_norm": 7.017607600030662, "learning_rate": 3.8650026648591775e-06, "loss": 2.5224, "step": 1137 }, { "epoch": 1.8512716174974568, "grad_norm": 7.438487653763438, "learning_rate": 3.855789097497795e-06, "loss": 2.6119, "step": 1138 }, { "epoch": 1.8528992878942014, "grad_norm": 7.88359310115741, "learning_rate": 3.84657962830218e-06, "loss": 2.4771, "step": 1139 }, { "epoch": 1.854526958290946, "grad_norm": 7.102316294614701, "learning_rate": 3.837374290257449e-06, "loss": 2.6667, "step": 1140 }, { "epoch": 1.8561546286876909, "grad_norm": 7.20598656690994, "learning_rate": 3.828173116333925e-06, "loss": 2.5204, "step": 1141 }, { "epoch": 1.8577822990844353, "grad_norm": 6.808859415570651, "learning_rate": 3.818976139487017e-06, "loss": 2.5494, "step": 1142 }, { "epoch": 1.85940996948118, "grad_norm": 6.954091831978313, "learning_rate": 3.809783392657096e-06, "loss": 2.4619, "step": 1143 }, { "epoch": 1.8610376398779247, "grad_norm": 7.240402607730024, "learning_rate": 3.8005949087693857e-06, "loss": 2.7412, "step": 1144 }, { "epoch": 1.8626653102746693, "grad_norm": 7.669125629402927, "learning_rate": 3.791410720733844e-06, "loss": 2.6676, "step": 1145 }, { "epoch": 1.8642929806714141, "grad_norm": 8.61956847563084, "learning_rate": 3.782230861445041e-06, "loss": 2.5392, "step": 1146 }, { "epoch": 1.8659206510681587, "grad_norm": 6.638467816572396, "learning_rate": 3.7730553637820387e-06, "loss": 2.5635, "step": 1147 }, { "epoch": 1.8675483214649033, "grad_norm": 6.729535050977518, "learning_rate": 3.763884260608284e-06, "loss": 2.4673, "step": 1148 }, { "epoch": 1.8691759918616482, "grad_norm": 7.110358148598629, "learning_rate": 3.7547175847714806e-06, "loss": 2.3305, "step": 1149 }, { "epoch": 1.8708036622583926, "grad_norm": 7.031304344383967, "learning_rate": 3.7455553691034714e-06, "loss": 2.3148, "step": 1150 }, { "epoch": 1.8724313326551374, "grad_norm": 6.923837335846561, "learning_rate": 3.7363976464201348e-06, "loss": 2.5253, "step": 1151 }, { "epoch": 1.874059003051882, "grad_norm": 7.257641835035393, "learning_rate": 3.7272444495212457e-06, "loss": 2.5176, "step": 1152 }, { "epoch": 1.8756866734486266, "grad_norm": 7.028142316738349, "learning_rate": 3.7180958111903742e-06, "loss": 2.5398, "step": 1153 }, { "epoch": 1.8773143438453714, "grad_norm": 7.557020765433243, "learning_rate": 3.708951764194767e-06, "loss": 2.5904, "step": 1154 }, { "epoch": 1.8789420142421158, "grad_norm": 7.362499579569919, "learning_rate": 3.699812341285219e-06, "loss": 2.5079, "step": 1155 }, { "epoch": 1.8805696846388607, "grad_norm": 7.229440336564223, "learning_rate": 3.6906775751959667e-06, "loss": 2.5341, "step": 1156 }, { "epoch": 1.8821973550356053, "grad_norm": 6.838640438977149, "learning_rate": 3.6815474986445683e-06, "loss": 2.4961, "step": 1157 }, { "epoch": 1.8838250254323499, "grad_norm": 7.78549101619117, "learning_rate": 3.6724221443317854e-06, "loss": 2.4433, "step": 1158 }, { "epoch": 1.8854526958290947, "grad_norm": 6.687967087153988, "learning_rate": 3.6633015449414625e-06, "loss": 2.458, "step": 1159 }, { "epoch": 1.8870803662258393, "grad_norm": 7.352228492155016, "learning_rate": 3.65418573314042e-06, "loss": 2.5354, "step": 1160 }, { "epoch": 1.888708036622584, "grad_norm": 7.4790430237837695, "learning_rate": 3.645074741578326e-06, "loss": 2.5828, "step": 1161 }, { "epoch": 1.8903357070193287, "grad_norm": 6.818880816460278, "learning_rate": 3.6359686028875853e-06, "loss": 2.4197, "step": 1162 }, { "epoch": 1.8919633774160731, "grad_norm": 7.090144242277327, "learning_rate": 3.626867349683223e-06, "loss": 2.3943, "step": 1163 }, { "epoch": 1.893591047812818, "grad_norm": 6.933053555513212, "learning_rate": 3.6177710145627636e-06, "loss": 2.5385, "step": 1164 }, { "epoch": 1.8952187182095626, "grad_norm": 6.906958646309803, "learning_rate": 3.6086796301061174e-06, "loss": 2.6182, "step": 1165 }, { "epoch": 1.8968463886063072, "grad_norm": 7.866690230549328, "learning_rate": 3.5995932288754655e-06, "loss": 2.5961, "step": 1166 }, { "epoch": 1.898474059003052, "grad_norm": 7.67101250981826, "learning_rate": 3.5905118434151394e-06, "loss": 2.5945, "step": 1167 }, { "epoch": 1.9001017293997964, "grad_norm": 7.724485660189292, "learning_rate": 3.5814355062515014e-06, "loss": 2.4064, "step": 1168 }, { "epoch": 1.9017293997965412, "grad_norm": 6.61195205395405, "learning_rate": 3.5723642498928414e-06, "loss": 2.4456, "step": 1169 }, { "epoch": 1.9033570701932858, "grad_norm": 6.737695078485875, "learning_rate": 3.5632981068292444e-06, "loss": 2.4149, "step": 1170 }, { "epoch": 1.9049847405900304, "grad_norm": 7.1393208623348885, "learning_rate": 3.5542371095324835e-06, "loss": 2.5032, "step": 1171 }, { "epoch": 1.9066124109867753, "grad_norm": 7.292865292919829, "learning_rate": 3.545181290455904e-06, "loss": 2.3732, "step": 1172 }, { "epoch": 1.9082400813835199, "grad_norm": 6.624306520383729, "learning_rate": 3.5361306820342998e-06, "loss": 2.6206, "step": 1173 }, { "epoch": 1.9098677517802645, "grad_norm": 7.427354933240494, "learning_rate": 3.5270853166838052e-06, "loss": 2.4087, "step": 1174 }, { "epoch": 1.9114954221770093, "grad_norm": 7.091604083240525, "learning_rate": 3.518045226801777e-06, "loss": 2.452, "step": 1175 }, { "epoch": 1.9131230925737537, "grad_norm": 7.359733760010201, "learning_rate": 3.509010444766674e-06, "loss": 2.5297, "step": 1176 }, { "epoch": 1.9147507629704985, "grad_norm": 7.414803732496394, "learning_rate": 3.499981002937943e-06, "loss": 2.5021, "step": 1177 }, { "epoch": 1.9163784333672431, "grad_norm": 6.833984831474675, "learning_rate": 3.490956933655909e-06, "loss": 2.5491, "step": 1178 }, { "epoch": 1.9180061037639877, "grad_norm": 7.3957529815484255, "learning_rate": 3.4819382692416524e-06, "loss": 2.4431, "step": 1179 }, { "epoch": 1.9196337741607326, "grad_norm": 7.146140760879185, "learning_rate": 3.4729250419968908e-06, "loss": 2.5086, "step": 1180 }, { "epoch": 1.9212614445574772, "grad_norm": 6.956526776300934, "learning_rate": 3.4639172842038766e-06, "loss": 2.512, "step": 1181 }, { "epoch": 1.9228891149542218, "grad_norm": 7.169815706389186, "learning_rate": 3.4549150281252635e-06, "loss": 2.4618, "step": 1182 }, { "epoch": 1.9245167853509664, "grad_norm": 7.233489998578641, "learning_rate": 3.445918306004005e-06, "loss": 2.6955, "step": 1183 }, { "epoch": 1.926144455747711, "grad_norm": 7.752938750515219, "learning_rate": 3.436927150063234e-06, "loss": 2.5353, "step": 1184 }, { "epoch": 1.9277721261444558, "grad_norm": 7.03735352446459, "learning_rate": 3.4279415925061445e-06, "loss": 2.464, "step": 1185 }, { "epoch": 1.9293997965412004, "grad_norm": 7.456074745889446, "learning_rate": 3.4189616655158803e-06, "loss": 2.5277, "step": 1186 }, { "epoch": 1.931027466937945, "grad_norm": 7.56990908325132, "learning_rate": 3.4099874012554206e-06, "loss": 2.8099, "step": 1187 }, { "epoch": 1.9326551373346899, "grad_norm": 7.454371202916822, "learning_rate": 3.401018831867461e-06, "loss": 2.5145, "step": 1188 }, { "epoch": 1.9342828077314342, "grad_norm": 6.879004413776412, "learning_rate": 3.392055989474298e-06, "loss": 2.4856, "step": 1189 }, { "epoch": 1.935910478128179, "grad_norm": 7.506221245660615, "learning_rate": 3.3830989061777184e-06, "loss": 2.7647, "step": 1190 }, { "epoch": 1.9375381485249237, "grad_norm": 7.772577155870881, "learning_rate": 3.3741476140588825e-06, "loss": 2.3708, "step": 1191 }, { "epoch": 1.9391658189216683, "grad_norm": 7.061216255560879, "learning_rate": 3.365202145178205e-06, "loss": 2.3856, "step": 1192 }, { "epoch": 1.9407934893184131, "grad_norm": 7.674063006964169, "learning_rate": 3.356262531575251e-06, "loss": 2.7204, "step": 1193 }, { "epoch": 1.9424211597151577, "grad_norm": 7.996325442276908, "learning_rate": 3.3473288052686055e-06, "loss": 2.5349, "step": 1194 }, { "epoch": 1.9440488301119023, "grad_norm": 7.292813597716096, "learning_rate": 3.3384009982557706e-06, "loss": 2.6002, "step": 1195 }, { "epoch": 1.9456765005086472, "grad_norm": 7.430730962492683, "learning_rate": 3.3294791425130512e-06, "loss": 2.4547, "step": 1196 }, { "epoch": 1.9473041709053915, "grad_norm": 7.230039284237225, "learning_rate": 3.3205632699954328e-06, "loss": 2.4808, "step": 1197 }, { "epoch": 1.9489318413021364, "grad_norm": 6.882126651042331, "learning_rate": 3.3116534126364686e-06, "loss": 2.3357, "step": 1198 }, { "epoch": 1.950559511698881, "grad_norm": 7.119688958351221, "learning_rate": 3.3027496023481753e-06, "loss": 2.5859, "step": 1199 }, { "epoch": 1.9521871820956256, "grad_norm": 6.742085951986546, "learning_rate": 3.2938518710209055e-06, "loss": 2.4515, "step": 1200 }, { "epoch": 1.9538148524923704, "grad_norm": 6.832658667753226, "learning_rate": 3.284960250523237e-06, "loss": 2.36, "step": 1201 }, { "epoch": 1.9554425228891148, "grad_norm": 7.014474842177999, "learning_rate": 3.2760747727018695e-06, "loss": 2.6721, "step": 1202 }, { "epoch": 1.9570701932858596, "grad_norm": 7.866745266181306, "learning_rate": 3.2671954693814924e-06, "loss": 2.6558, "step": 1203 }, { "epoch": 1.9586978636826042, "grad_norm": 7.706388371746324, "learning_rate": 3.258322372364684e-06, "loss": 2.6244, "step": 1204 }, { "epoch": 1.9603255340793488, "grad_norm": 7.858810571610296, "learning_rate": 3.249455513431797e-06, "loss": 2.6646, "step": 1205 }, { "epoch": 1.9619532044760937, "grad_norm": 7.3024646345232735, "learning_rate": 3.240594924340835e-06, "loss": 2.4877, "step": 1206 }, { "epoch": 1.9635808748728383, "grad_norm": 7.009092958749246, "learning_rate": 3.2317406368273495e-06, "loss": 2.6706, "step": 1207 }, { "epoch": 1.9652085452695829, "grad_norm": 7.074782613079555, "learning_rate": 3.222892682604323e-06, "loss": 2.4355, "step": 1208 }, { "epoch": 1.9668362156663277, "grad_norm": 6.926004847556727, "learning_rate": 3.2140510933620505e-06, "loss": 2.4074, "step": 1209 }, { "epoch": 1.968463886063072, "grad_norm": 6.840944812065555, "learning_rate": 3.2052159007680297e-06, "loss": 2.6159, "step": 1210 }, { "epoch": 1.970091556459817, "grad_norm": 7.341598483125056, "learning_rate": 3.196387136466853e-06, "loss": 2.6198, "step": 1211 }, { "epoch": 1.9717192268565615, "grad_norm": 7.135635458422531, "learning_rate": 3.1875648320800845e-06, "loss": 2.5498, "step": 1212 }, { "epoch": 1.9733468972533061, "grad_norm": 7.488286184161243, "learning_rate": 3.178749019206151e-06, "loss": 2.7068, "step": 1213 }, { "epoch": 1.974974567650051, "grad_norm": 7.420759778619371, "learning_rate": 3.169939729420233e-06, "loss": 2.5311, "step": 1214 }, { "epoch": 1.9766022380467956, "grad_norm": 7.16228794004945, "learning_rate": 3.1611369942741416e-06, "loss": 2.5291, "step": 1215 }, { "epoch": 1.9782299084435402, "grad_norm": 7.042688649277333, "learning_rate": 3.1523408452962156e-06, "loss": 2.5038, "step": 1216 }, { "epoch": 1.9798575788402848, "grad_norm": 7.482659378852682, "learning_rate": 3.1435513139912045e-06, "loss": 2.5249, "step": 1217 }, { "epoch": 1.9814852492370294, "grad_norm": 6.203944024359897, "learning_rate": 3.1347684318401537e-06, "loss": 2.4686, "step": 1218 }, { "epoch": 1.9831129196337742, "grad_norm": 7.177835940644875, "learning_rate": 3.125992230300294e-06, "loss": 2.4797, "step": 1219 }, { "epoch": 1.9847405900305188, "grad_norm": 7.2172869550680785, "learning_rate": 3.11722274080493e-06, "loss": 2.5442, "step": 1220 }, { "epoch": 1.9863682604272634, "grad_norm": 6.539031657851906, "learning_rate": 3.1084599947633252e-06, "loss": 2.5734, "step": 1221 }, { "epoch": 1.9879959308240083, "grad_norm": 7.320692771948454, "learning_rate": 3.0997040235605876e-06, "loss": 2.6055, "step": 1222 }, { "epoch": 1.9896236012207527, "grad_norm": 7.374201256253725, "learning_rate": 3.0909548585575676e-06, "loss": 2.5434, "step": 1223 }, { "epoch": 1.9912512716174975, "grad_norm": 7.112407035819097, "learning_rate": 3.0822125310907297e-06, "loss": 2.4051, "step": 1224 }, { "epoch": 1.992878942014242, "grad_norm": 7.290529670146898, "learning_rate": 3.073477072472054e-06, "loss": 2.4709, "step": 1225 }, { "epoch": 1.9945066124109867, "grad_norm": 7.046974448618534, "learning_rate": 3.0647485139889145e-06, "loss": 2.5516, "step": 1226 }, { "epoch": 1.9961342828077315, "grad_norm": 7.127470490540275, "learning_rate": 3.0560268869039785e-06, "loss": 2.454, "step": 1227 }, { "epoch": 1.9977619532044761, "grad_norm": 7.051977556707317, "learning_rate": 3.0473122224550787e-06, "loss": 2.4094, "step": 1228 }, { "epoch": 1.9993896236012207, "grad_norm": 7.289831052916815, "learning_rate": 3.038604551855116e-06, "loss": 2.6682, "step": 1229 }, { "epoch": 2.0, "grad_norm": 7.289831052916815, "learning_rate": 3.0299039062919417e-06, "loss": 2.5917, "step": 1230 }, { "epoch": 2.001627670396745, "grad_norm": 12.740314211978893, "learning_rate": 3.0212103169282415e-06, "loss": 2.1005, "step": 1231 }, { "epoch": 2.003255340793489, "grad_norm": 7.2375604714827935, "learning_rate": 3.0125238149014304e-06, "loss": 1.9726, "step": 1232 }, { "epoch": 2.004883011190234, "grad_norm": 6.113824074521847, "learning_rate": 3.0038444313235427e-06, "loss": 1.9266, "step": 1233 }, { "epoch": 2.0065106815869784, "grad_norm": 6.716624375183312, "learning_rate": 2.9951721972811133e-06, "loss": 1.6599, "step": 1234 }, { "epoch": 2.0081383519837233, "grad_norm": 6.066243392654594, "learning_rate": 2.9865071438350664e-06, "loss": 1.7514, "step": 1235 }, { "epoch": 2.009766022380468, "grad_norm": 6.4194246881199, "learning_rate": 2.9778493020206155e-06, "loss": 1.7852, "step": 1236 }, { "epoch": 2.0113936927772125, "grad_norm": 6.876857402986618, "learning_rate": 2.969198702847141e-06, "loss": 1.851, "step": 1237 }, { "epoch": 2.0130213631739573, "grad_norm": 6.575015579331752, "learning_rate": 2.96055537729808e-06, "loss": 1.7615, "step": 1238 }, { "epoch": 2.014649033570702, "grad_norm": 6.517074615784237, "learning_rate": 2.9519193563308235e-06, "loss": 1.6958, "step": 1239 }, { "epoch": 2.0162767039674465, "grad_norm": 6.624311126698779, "learning_rate": 2.9432906708765953e-06, "loss": 1.6284, "step": 1240 }, { "epoch": 2.0179043743641913, "grad_norm": 6.3477965795291915, "learning_rate": 2.9346693518403456e-06, "loss": 1.9102, "step": 1241 }, { "epoch": 2.0195320447609357, "grad_norm": 7.098883542508465, "learning_rate": 2.926055430100647e-06, "loss": 1.7548, "step": 1242 }, { "epoch": 2.0211597151576806, "grad_norm": 7.549661393553244, "learning_rate": 2.9174489365095715e-06, "loss": 1.7711, "step": 1243 }, { "epoch": 2.0227873855544254, "grad_norm": 7.738447135371062, "learning_rate": 2.908849901892587e-06, "loss": 1.832, "step": 1244 }, { "epoch": 2.0244150559511698, "grad_norm": 8.033253925475156, "learning_rate": 2.9002583570484478e-06, "loss": 1.7485, "step": 1245 }, { "epoch": 2.0260427263479146, "grad_norm": 8.049065286988277, "learning_rate": 2.89167433274908e-06, "loss": 1.5794, "step": 1246 }, { "epoch": 2.027670396744659, "grad_norm": 8.553543765211888, "learning_rate": 2.8830978597394775e-06, "loss": 1.7167, "step": 1247 }, { "epoch": 2.029298067141404, "grad_norm": 7.538534472474699, "learning_rate": 2.8745289687375842e-06, "loss": 1.6376, "step": 1248 }, { "epoch": 2.0309257375381486, "grad_norm": 8.257737252113275, "learning_rate": 2.8659676904341904e-06, "loss": 1.8067, "step": 1249 }, { "epoch": 2.032553407934893, "grad_norm": 8.108188383540144, "learning_rate": 2.8574140554928175e-06, "loss": 1.808, "step": 1250 }, { "epoch": 2.034181078331638, "grad_norm": 8.213928642111098, "learning_rate": 2.848868094549615e-06, "loss": 1.4933, "step": 1251 }, { "epoch": 2.0358087487283827, "grad_norm": 7.517704131196852, "learning_rate": 2.8403298382132437e-06, "loss": 1.7864, "step": 1252 }, { "epoch": 2.037436419125127, "grad_norm": 8.326418654888368, "learning_rate": 2.8317993170647685e-06, "loss": 1.6303, "step": 1253 }, { "epoch": 2.039064089521872, "grad_norm": 7.624338718412112, "learning_rate": 2.8232765616575565e-06, "loss": 1.8612, "step": 1254 }, { "epoch": 2.0406917599186163, "grad_norm": 7.296620639398164, "learning_rate": 2.8147616025171504e-06, "loss": 1.7229, "step": 1255 }, { "epoch": 2.042319430315361, "grad_norm": 6.999585801272633, "learning_rate": 2.806254470141174e-06, "loss": 1.8974, "step": 1256 }, { "epoch": 2.043947100712106, "grad_norm": 7.693525566777956, "learning_rate": 2.7977551949992228e-06, "loss": 1.7688, "step": 1257 }, { "epoch": 2.0455747711088503, "grad_norm": 7.714589220600351, "learning_rate": 2.7892638075327463e-06, "loss": 1.7221, "step": 1258 }, { "epoch": 2.047202441505595, "grad_norm": 7.239404787776932, "learning_rate": 2.780780338154937e-06, "loss": 1.662, "step": 1259 }, { "epoch": 2.04883011190234, "grad_norm": 7.229373031007829, "learning_rate": 2.7723048172506393e-06, "loss": 1.9157, "step": 1260 }, { "epoch": 2.0504577822990844, "grad_norm": 6.966811845783632, "learning_rate": 2.763837275176224e-06, "loss": 1.6619, "step": 1261 }, { "epoch": 2.052085452695829, "grad_norm": 6.9171395085845315, "learning_rate": 2.7553777422594774e-06, "loss": 1.7963, "step": 1262 }, { "epoch": 2.0537131230925736, "grad_norm": 6.641686352610721, "learning_rate": 2.7469262487995125e-06, "loss": 1.8189, "step": 1263 }, { "epoch": 2.0553407934893184, "grad_norm": 6.962134475169491, "learning_rate": 2.7384828250666394e-06, "loss": 1.6443, "step": 1264 }, { "epoch": 2.0569684638860632, "grad_norm": 6.94612811816876, "learning_rate": 2.7300475013022666e-06, "loss": 1.6524, "step": 1265 }, { "epoch": 2.0585961342828076, "grad_norm": 7.023402098184262, "learning_rate": 2.721620307718793e-06, "loss": 1.7474, "step": 1266 }, { "epoch": 2.0602238046795525, "grad_norm": 6.2753618240209965, "learning_rate": 2.713201274499496e-06, "loss": 1.8322, "step": 1267 }, { "epoch": 2.061851475076297, "grad_norm": 7.082677735285766, "learning_rate": 2.7047904317984273e-06, "loss": 1.9216, "step": 1268 }, { "epoch": 2.0634791454730417, "grad_norm": 11.677773993745605, "learning_rate": 2.696387809740303e-06, "loss": 1.8326, "step": 1269 }, { "epoch": 2.0651068158697865, "grad_norm": 6.666902886732488, "learning_rate": 2.6879934384203922e-06, "loss": 1.9822, "step": 1270 }, { "epoch": 2.066734486266531, "grad_norm": 8.01345752362986, "learning_rate": 2.6796073479044175e-06, "loss": 1.7245, "step": 1271 }, { "epoch": 2.0683621566632757, "grad_norm": 7.65127139581927, "learning_rate": 2.6712295682284406e-06, "loss": 1.6439, "step": 1272 }, { "epoch": 2.0699898270600205, "grad_norm": 7.144887822288721, "learning_rate": 2.6628601293987544e-06, "loss": 1.6294, "step": 1273 }, { "epoch": 2.071617497456765, "grad_norm": 6.5513247220885855, "learning_rate": 2.6544990613917803e-06, "loss": 1.7963, "step": 1274 }, { "epoch": 2.0732451678535098, "grad_norm": 7.31641705012418, "learning_rate": 2.646146394153963e-06, "loss": 1.688, "step": 1275 }, { "epoch": 2.074872838250254, "grad_norm": 6.737356607546922, "learning_rate": 2.6378021576016467e-06, "loss": 1.6348, "step": 1276 }, { "epoch": 2.076500508646999, "grad_norm": 6.5343107075808335, "learning_rate": 2.6294663816209877e-06, "loss": 1.6499, "step": 1277 }, { "epoch": 2.078128179043744, "grad_norm": 6.686209409256845, "learning_rate": 2.621139096067841e-06, "loss": 1.6407, "step": 1278 }, { "epoch": 2.079755849440488, "grad_norm": 7.353998103893926, "learning_rate": 2.6128203307676508e-06, "loss": 1.7412, "step": 1279 }, { "epoch": 2.081383519837233, "grad_norm": 6.860358880403155, "learning_rate": 2.6045101155153363e-06, "loss": 1.6563, "step": 1280 }, { "epoch": 2.0830111902339774, "grad_norm": 6.560578488506188, "learning_rate": 2.5962084800752064e-06, "loss": 1.8528, "step": 1281 }, { "epoch": 2.0846388606307222, "grad_norm": 9.006524984708753, "learning_rate": 2.5879154541808337e-06, "loss": 1.8248, "step": 1282 }, { "epoch": 2.086266531027467, "grad_norm": 7.235827271704288, "learning_rate": 2.579631067534949e-06, "loss": 1.7593, "step": 1283 }, { "epoch": 2.0878942014242114, "grad_norm": 7.1256327454132755, "learning_rate": 2.5713553498093508e-06, "loss": 1.7667, "step": 1284 }, { "epoch": 2.0895218718209563, "grad_norm": 7.29225286389193, "learning_rate": 2.563088330644783e-06, "loss": 1.8007, "step": 1285 }, { "epoch": 2.091149542217701, "grad_norm": 8.180384729853984, "learning_rate": 2.554830039650834e-06, "loss": 1.8225, "step": 1286 }, { "epoch": 2.0927772126144455, "grad_norm": 7.376559255902152, "learning_rate": 2.546580506405833e-06, "loss": 1.6453, "step": 1287 }, { "epoch": 2.0944048830111903, "grad_norm": 7.271496912108958, "learning_rate": 2.5383397604567394e-06, "loss": 1.5958, "step": 1288 }, { "epoch": 2.0960325534079347, "grad_norm": 6.463754284333228, "learning_rate": 2.530107831319042e-06, "loss": 1.6498, "step": 1289 }, { "epoch": 2.0976602238046795, "grad_norm": 6.9695174575430725, "learning_rate": 2.5218847484766497e-06, "loss": 1.7108, "step": 1290 }, { "epoch": 2.0992878942014244, "grad_norm": 7.134259588700701, "learning_rate": 2.5136705413817873e-06, "loss": 1.6831, "step": 1291 }, { "epoch": 2.1009155645981687, "grad_norm": 7.627602604368529, "learning_rate": 2.5054652394548895e-06, "loss": 1.628, "step": 1292 }, { "epoch": 2.1025432349949136, "grad_norm": 7.161484933968531, "learning_rate": 2.497268872084495e-06, "loss": 1.7886, "step": 1293 }, { "epoch": 2.1041709053916584, "grad_norm": 6.907941863753152, "learning_rate": 2.4890814686271446e-06, "loss": 1.8216, "step": 1294 }, { "epoch": 2.105798575788403, "grad_norm": 7.1013696208061265, "learning_rate": 2.4809030584072692e-06, "loss": 1.8158, "step": 1295 }, { "epoch": 2.1074262461851476, "grad_norm": 7.622999739519649, "learning_rate": 2.4727336707170973e-06, "loss": 1.6785, "step": 1296 }, { "epoch": 2.109053916581892, "grad_norm": 7.750123004178778, "learning_rate": 2.4645733348165306e-06, "loss": 1.6777, "step": 1297 }, { "epoch": 2.110681586978637, "grad_norm": 7.046128693780249, "learning_rate": 2.4564220799330564e-06, "loss": 1.6882, "step": 1298 }, { "epoch": 2.1123092573753817, "grad_norm": 6.8396887647324025, "learning_rate": 2.4482799352616397e-06, "loss": 1.5355, "step": 1299 }, { "epoch": 2.113936927772126, "grad_norm": 7.516374965569692, "learning_rate": 2.4401469299646134e-06, "loss": 1.8311, "step": 1300 }, { "epoch": 2.115564598168871, "grad_norm": 7.382148863602184, "learning_rate": 2.43202309317157e-06, "loss": 1.6234, "step": 1301 }, { "epoch": 2.1171922685656153, "grad_norm": 7.615670617469679, "learning_rate": 2.4239084539792745e-06, "loss": 1.7208, "step": 1302 }, { "epoch": 2.11881993896236, "grad_norm": 6.930084788281244, "learning_rate": 2.415803041451545e-06, "loss": 1.7194, "step": 1303 }, { "epoch": 2.120447609359105, "grad_norm": 7.850797236768215, "learning_rate": 2.4077068846191453e-06, "loss": 1.8183, "step": 1304 }, { "epoch": 2.1220752797558493, "grad_norm": 6.635632212758718, "learning_rate": 2.399620012479702e-06, "loss": 1.6103, "step": 1305 }, { "epoch": 2.123702950152594, "grad_norm": 7.24381482944125, "learning_rate": 2.391542453997578e-06, "loss": 1.7124, "step": 1306 }, { "epoch": 2.125330620549339, "grad_norm": 7.270719058105004, "learning_rate": 2.3834742381037802e-06, "loss": 1.8106, "step": 1307 }, { "epoch": 2.1269582909460834, "grad_norm": 7.7815088336420395, "learning_rate": 2.375415393695854e-06, "loss": 1.6395, "step": 1308 }, { "epoch": 2.128585961342828, "grad_norm": 6.781521084399599, "learning_rate": 2.3673659496377786e-06, "loss": 1.7652, "step": 1309 }, { "epoch": 2.1302136317395726, "grad_norm": 7.2858383550153425, "learning_rate": 2.359325934759866e-06, "loss": 1.8989, "step": 1310 }, { "epoch": 2.1318413021363174, "grad_norm": 7.608528370395869, "learning_rate": 2.3512953778586537e-06, "loss": 1.7603, "step": 1311 }, { "epoch": 2.1334689725330622, "grad_norm": 7.030614423256876, "learning_rate": 2.3432743076968067e-06, "loss": 1.7593, "step": 1312 }, { "epoch": 2.1350966429298066, "grad_norm": 7.32926648438766, "learning_rate": 2.3352627530030076e-06, "loss": 1.596, "step": 1313 }, { "epoch": 2.1367243133265514, "grad_norm": 6.749057298062823, "learning_rate": 2.3272607424718675e-06, "loss": 1.7381, "step": 1314 }, { "epoch": 2.138351983723296, "grad_norm": 7.343933323298815, "learning_rate": 2.3192683047638e-06, "loss": 1.7134, "step": 1315 }, { "epoch": 2.1399796541200407, "grad_norm": 7.7730843279598, "learning_rate": 2.3112854685049397e-06, "loss": 1.6658, "step": 1316 }, { "epoch": 2.1416073245167855, "grad_norm": 7.111426453104293, "learning_rate": 2.303312262287037e-06, "loss": 1.8195, "step": 1317 }, { "epoch": 2.14323499491353, "grad_norm": 7.015622088251597, "learning_rate": 2.29534871466734e-06, "loss": 1.6973, "step": 1318 }, { "epoch": 2.1448626653102747, "grad_norm": 7.43725773100821, "learning_rate": 2.287394854168509e-06, "loss": 1.527, "step": 1319 }, { "epoch": 2.1464903357070195, "grad_norm": 6.532540741426812, "learning_rate": 2.2794507092785105e-06, "loss": 1.5892, "step": 1320 }, { "epoch": 2.148118006103764, "grad_norm": 6.8547354803986424, "learning_rate": 2.271516308450511e-06, "loss": 1.8163, "step": 1321 }, { "epoch": 2.1497456765005087, "grad_norm": 7.604334274478793, "learning_rate": 2.2635916801027706e-06, "loss": 1.7189, "step": 1322 }, { "epoch": 2.151373346897253, "grad_norm": 6.906394885996099, "learning_rate": 2.2556768526185595e-06, "loss": 1.6608, "step": 1323 }, { "epoch": 2.153001017293998, "grad_norm": 7.230337308064934, "learning_rate": 2.2477718543460376e-06, "loss": 1.7729, "step": 1324 }, { "epoch": 2.154628687690743, "grad_norm": 7.2028003983161035, "learning_rate": 2.2398767135981603e-06, "loss": 1.841, "step": 1325 }, { "epoch": 2.156256358087487, "grad_norm": 7.026758180329883, "learning_rate": 2.2319914586525776e-06, "loss": 1.7041, "step": 1326 }, { "epoch": 2.157884028484232, "grad_norm": 7.624852316875544, "learning_rate": 2.224116117751533e-06, "loss": 1.7774, "step": 1327 }, { "epoch": 2.1595116988809764, "grad_norm": 6.994854643053645, "learning_rate": 2.2162507191017603e-06, "loss": 1.7602, "step": 1328 }, { "epoch": 2.161139369277721, "grad_norm": 6.616432318121656, "learning_rate": 2.208395290874383e-06, "loss": 1.7616, "step": 1329 }, { "epoch": 2.162767039674466, "grad_norm": 7.699674060675883, "learning_rate": 2.2005498612048154e-06, "loss": 1.7179, "step": 1330 }, { "epoch": 2.1643947100712104, "grad_norm": 7.23935384849859, "learning_rate": 2.1927144581926597e-06, "loss": 1.8081, "step": 1331 }, { "epoch": 2.1660223804679553, "grad_norm": 7.83528871640409, "learning_rate": 2.184889109901606e-06, "loss": 1.7392, "step": 1332 }, { "epoch": 2.1676500508647, "grad_norm": 7.8210128456675205, "learning_rate": 2.1770738443593316e-06, "loss": 1.8181, "step": 1333 }, { "epoch": 2.1692777212614445, "grad_norm": 6.42468567163581, "learning_rate": 2.1692686895574006e-06, "loss": 1.6831, "step": 1334 }, { "epoch": 2.1709053916581893, "grad_norm": 7.597299604674575, "learning_rate": 2.1614736734511692e-06, "loss": 1.6312, "step": 1335 }, { "epoch": 2.1725330620549337, "grad_norm": 6.8284147703292675, "learning_rate": 2.1536888239596714e-06, "loss": 1.6864, "step": 1336 }, { "epoch": 2.1741607324516785, "grad_norm": 6.544667120989876, "learning_rate": 2.145914168965532e-06, "loss": 1.6776, "step": 1337 }, { "epoch": 2.1757884028484233, "grad_norm": 6.060916980582725, "learning_rate": 2.1381497363148675e-06, "loss": 1.6551, "step": 1338 }, { "epoch": 2.1774160732451677, "grad_norm": 6.407863801792211, "learning_rate": 2.1303955538171727e-06, "loss": 1.6673, "step": 1339 }, { "epoch": 2.1790437436419126, "grad_norm": 8.29848082721057, "learning_rate": 2.1226516492452337e-06, "loss": 1.8066, "step": 1340 }, { "epoch": 2.180671414038657, "grad_norm": 7.78250691587662, "learning_rate": 2.114918050335029e-06, "loss": 1.6382, "step": 1341 }, { "epoch": 2.1822990844354018, "grad_norm": 7.129755010285555, "learning_rate": 2.1071947847856223e-06, "loss": 1.6603, "step": 1342 }, { "epoch": 2.1839267548321466, "grad_norm": 6.922145288081775, "learning_rate": 2.0994818802590607e-06, "loss": 1.7476, "step": 1343 }, { "epoch": 2.185554425228891, "grad_norm": 7.482438014558934, "learning_rate": 2.091779364380293e-06, "loss": 1.6199, "step": 1344 }, { "epoch": 2.187182095625636, "grad_norm": 7.0437971478343195, "learning_rate": 2.084087264737052e-06, "loss": 1.7155, "step": 1345 }, { "epoch": 2.1888097660223806, "grad_norm": 6.56803567582986, "learning_rate": 2.0764056088797646e-06, "loss": 1.6916, "step": 1346 }, { "epoch": 2.190437436419125, "grad_norm": 7.475283395694533, "learning_rate": 2.0687344243214534e-06, "loss": 1.7632, "step": 1347 }, { "epoch": 2.19206510681587, "grad_norm": 6.353460707392077, "learning_rate": 2.061073738537635e-06, "loss": 1.6164, "step": 1348 }, { "epoch": 2.1936927772126142, "grad_norm": 7.521931891956867, "learning_rate": 2.0534235789662226e-06, "loss": 1.7451, "step": 1349 }, { "epoch": 2.195320447609359, "grad_norm": 7.072548477324475, "learning_rate": 2.045783973007429e-06, "loss": 1.5771, "step": 1350 }, { "epoch": 2.196948118006104, "grad_norm": 7.399397482793551, "learning_rate": 2.0381549480236685e-06, "loss": 1.8488, "step": 1351 }, { "epoch": 2.1985757884028483, "grad_norm": 7.471386014977747, "learning_rate": 2.030536531339456e-06, "loss": 1.7264, "step": 1352 }, { "epoch": 2.200203458799593, "grad_norm": 7.293409374908535, "learning_rate": 2.0229287502413125e-06, "loss": 1.6618, "step": 1353 }, { "epoch": 2.201831129196338, "grad_norm": 6.9610268915272515, "learning_rate": 2.0153316319776663e-06, "loss": 1.6848, "step": 1354 }, { "epoch": 2.2034587995930823, "grad_norm": 7.252218100080784, "learning_rate": 2.0077452037587514e-06, "loss": 1.6641, "step": 1355 }, { "epoch": 2.205086469989827, "grad_norm": 7.265382572972745, "learning_rate": 2.000169492756523e-06, "loss": 1.7751, "step": 1356 }, { "epoch": 2.2067141403865715, "grad_norm": 7.117870383636954, "learning_rate": 1.9926045261045403e-06, "loss": 1.9279, "step": 1357 }, { "epoch": 2.2083418107833164, "grad_norm": 7.476458771735503, "learning_rate": 1.985050330897883e-06, "loss": 1.9333, "step": 1358 }, { "epoch": 2.209969481180061, "grad_norm": 7.4186015933438645, "learning_rate": 1.9775069341930592e-06, "loss": 1.6805, "step": 1359 }, { "epoch": 2.2115971515768056, "grad_norm": 6.708170478781185, "learning_rate": 1.969974363007888e-06, "loss": 1.598, "step": 1360 }, { "epoch": 2.2132248219735504, "grad_norm": 7.063003202088113, "learning_rate": 1.9624526443214228e-06, "loss": 1.7485, "step": 1361 }, { "epoch": 2.2148524923702952, "grad_norm": 7.465151963402605, "learning_rate": 1.9549418050738478e-06, "loss": 1.5167, "step": 1362 }, { "epoch": 2.2164801627670396, "grad_norm": 6.787680931752794, "learning_rate": 1.947441872166379e-06, "loss": 1.6516, "step": 1363 }, { "epoch": 2.2181078331637845, "grad_norm": 7.0184430601712, "learning_rate": 1.9399528724611643e-06, "loss": 1.8895, "step": 1364 }, { "epoch": 2.219735503560529, "grad_norm": 7.533892814086459, "learning_rate": 1.932474832781203e-06, "loss": 1.6266, "step": 1365 }, { "epoch": 2.2213631739572737, "grad_norm": 7.089706193077295, "learning_rate": 1.9250077799102323e-06, "loss": 1.6901, "step": 1366 }, { "epoch": 2.2229908443540185, "grad_norm": 7.103975432188483, "learning_rate": 1.91755174059264e-06, "loss": 1.7378, "step": 1367 }, { "epoch": 2.224618514750763, "grad_norm": 7.46103860721861, "learning_rate": 1.9101067415333685e-06, "loss": 1.8149, "step": 1368 }, { "epoch": 2.2262461851475077, "grad_norm": 6.812449921002505, "learning_rate": 1.9026728093978157e-06, "loss": 1.7445, "step": 1369 }, { "epoch": 2.227873855544252, "grad_norm": 8.005440379978886, "learning_rate": 1.8952499708117433e-06, "loss": 1.7355, "step": 1370 }, { "epoch": 2.229501525940997, "grad_norm": 7.257929760939761, "learning_rate": 1.8878382523611789e-06, "loss": 1.5765, "step": 1371 }, { "epoch": 2.2311291963377418, "grad_norm": 7.477594197923311, "learning_rate": 1.8804376805923224e-06, "loss": 1.6229, "step": 1372 }, { "epoch": 2.232756866734486, "grad_norm": 6.865864895530359, "learning_rate": 1.8730482820114493e-06, "loss": 1.5621, "step": 1373 }, { "epoch": 2.234384537131231, "grad_norm": 6.54803747979964, "learning_rate": 1.8656700830848174e-06, "loss": 2.0155, "step": 1374 }, { "epoch": 2.236012207527976, "grad_norm": 7.6952182349409, "learning_rate": 1.8583031102385708e-06, "loss": 1.829, "step": 1375 }, { "epoch": 2.23763987792472, "grad_norm": 7.243558105135446, "learning_rate": 1.8509473898586432e-06, "loss": 1.6955, "step": 1376 }, { "epoch": 2.239267548321465, "grad_norm": 7.111477910941522, "learning_rate": 1.8436029482906747e-06, "loss": 1.6422, "step": 1377 }, { "epoch": 2.2408952187182094, "grad_norm": 6.778957444029774, "learning_rate": 1.8362698118398969e-06, "loss": 1.7879, "step": 1378 }, { "epoch": 2.2425228891149542, "grad_norm": 7.474432029911821, "learning_rate": 1.8289480067710558e-06, "loss": 1.8182, "step": 1379 }, { "epoch": 2.244150559511699, "grad_norm": 7.820650477373333, "learning_rate": 1.8216375593083152e-06, "loss": 1.9066, "step": 1380 }, { "epoch": 2.2457782299084434, "grad_norm": 8.175711950143878, "learning_rate": 1.814338495635158e-06, "loss": 1.9527, "step": 1381 }, { "epoch": 2.2474059003051883, "grad_norm": 7.175547994588235, "learning_rate": 1.8070508418942878e-06, "loss": 1.6934, "step": 1382 }, { "epoch": 2.2490335707019327, "grad_norm": 7.408622173477658, "learning_rate": 1.7997746241875525e-06, "loss": 1.46, "step": 1383 }, { "epoch": 2.2506612410986775, "grad_norm": 7.6874077413634385, "learning_rate": 1.7925098685758346e-06, "loss": 1.8495, "step": 1384 }, { "epoch": 2.2522889114954223, "grad_norm": 7.612475464481103, "learning_rate": 1.7852566010789597e-06, "loss": 1.5952, "step": 1385 }, { "epoch": 2.2539165818921667, "grad_norm": 6.7751113202217335, "learning_rate": 1.7780148476756148e-06, "loss": 1.6401, "step": 1386 }, { "epoch": 2.2555442522889115, "grad_norm": 7.662051931588742, "learning_rate": 1.770784634303243e-06, "loss": 1.5734, "step": 1387 }, { "epoch": 2.2571719226856564, "grad_norm": 6.893743332057969, "learning_rate": 1.7635659868579552e-06, "loss": 1.751, "step": 1388 }, { "epoch": 2.2587995930824007, "grad_norm": 7.140993269214544, "learning_rate": 1.756358931194438e-06, "loss": 1.7411, "step": 1389 }, { "epoch": 2.2604272634791456, "grad_norm": 6.788033516977937, "learning_rate": 1.7491634931258589e-06, "loss": 1.5684, "step": 1390 }, { "epoch": 2.26205493387589, "grad_norm": 6.91546498529779, "learning_rate": 1.741979698423777e-06, "loss": 1.4809, "step": 1391 }, { "epoch": 2.263682604272635, "grad_norm": 6.62472805589311, "learning_rate": 1.734807572818048e-06, "loss": 1.7106, "step": 1392 }, { "epoch": 2.2653102746693796, "grad_norm": 6.958779884067102, "learning_rate": 1.7276471419967327e-06, "loss": 1.6384, "step": 1393 }, { "epoch": 2.266937945066124, "grad_norm": 6.364196622545943, "learning_rate": 1.7204984316060063e-06, "loss": 1.7775, "step": 1394 }, { "epoch": 2.268565615462869, "grad_norm": 7.305339207572143, "learning_rate": 1.7133614672500643e-06, "loss": 1.957, "step": 1395 }, { "epoch": 2.270193285859613, "grad_norm": 7.235305713922802, "learning_rate": 1.7062362744910321e-06, "loss": 1.6239, "step": 1396 }, { "epoch": 2.271820956256358, "grad_norm": 7.0272096354007285, "learning_rate": 1.6991228788488729e-06, "loss": 1.8439, "step": 1397 }, { "epoch": 2.273448626653103, "grad_norm": 7.395454051600444, "learning_rate": 1.6920213058013024e-06, "loss": 1.6481, "step": 1398 }, { "epoch": 2.2750762970498473, "grad_norm": 7.367873057531506, "learning_rate": 1.6849315807836814e-06, "loss": 1.6265, "step": 1399 }, { "epoch": 2.276703967446592, "grad_norm": 7.94426744796582, "learning_rate": 1.6778537291889407e-06, "loss": 1.706, "step": 1400 }, { "epoch": 2.278331637843337, "grad_norm": 7.350072795639363, "learning_rate": 1.670787776367489e-06, "loss": 1.582, "step": 1401 }, { "epoch": 2.2799593082400813, "grad_norm": 6.602062663747484, "learning_rate": 1.6637337476271127e-06, "loss": 1.668, "step": 1402 }, { "epoch": 2.281586978636826, "grad_norm": 8.049728410425384, "learning_rate": 1.6566916682328864e-06, "loss": 1.8305, "step": 1403 }, { "epoch": 2.2832146490335705, "grad_norm": 7.990316150440582, "learning_rate": 1.6496615634070955e-06, "loss": 1.7444, "step": 1404 }, { "epoch": 2.2848423194303153, "grad_norm": 6.962601561032506, "learning_rate": 1.642643458329133e-06, "loss": 1.7752, "step": 1405 }, { "epoch": 2.28646998982706, "grad_norm": 7.671236029602846, "learning_rate": 1.6356373781354058e-06, "loss": 1.5469, "step": 1406 }, { "epoch": 2.2880976602238046, "grad_norm": 7.065106734186884, "learning_rate": 1.6286433479192637e-06, "loss": 1.6466, "step": 1407 }, { "epoch": 2.2897253306205494, "grad_norm": 7.502489672864704, "learning_rate": 1.6216613927308905e-06, "loss": 1.8877, "step": 1408 }, { "epoch": 2.2913530010172938, "grad_norm": 7.642472197156576, "learning_rate": 1.6146915375772225e-06, "loss": 1.5826, "step": 1409 }, { "epoch": 2.2929806714140386, "grad_norm": 7.775553502895717, "learning_rate": 1.6077338074218597e-06, "loss": 1.5302, "step": 1410 }, { "epoch": 2.2946083418107834, "grad_norm": 7.036497747677149, "learning_rate": 1.6007882271849718e-06, "loss": 1.5603, "step": 1411 }, { "epoch": 2.296236012207528, "grad_norm": 6.818552815367851, "learning_rate": 1.5938548217432136e-06, "loss": 1.5771, "step": 1412 }, { "epoch": 2.2978636826042727, "grad_norm": 6.990772455304575, "learning_rate": 1.586933615929634e-06, "loss": 1.8898, "step": 1413 }, { "epoch": 2.2994913530010175, "grad_norm": 7.249810334276076, "learning_rate": 1.5800246345335868e-06, "loss": 1.5696, "step": 1414 }, { "epoch": 2.301119023397762, "grad_norm": 7.184130564768896, "learning_rate": 1.5731279023006412e-06, "loss": 1.7271, "step": 1415 }, { "epoch": 2.3027466937945067, "grad_norm": 6.959975736289694, "learning_rate": 1.566243443932496e-06, "loss": 1.7761, "step": 1416 }, { "epoch": 2.304374364191251, "grad_norm": 6.819194821144061, "learning_rate": 1.5593712840868868e-06, "loss": 1.7574, "step": 1417 }, { "epoch": 2.306002034587996, "grad_norm": 6.906469096097711, "learning_rate": 1.5525114473775015e-06, "loss": 1.6206, "step": 1418 }, { "epoch": 2.3076297049847407, "grad_norm": 6.785899365586747, "learning_rate": 1.5456639583738958e-06, "loss": 1.7133, "step": 1419 }, { "epoch": 2.309257375381485, "grad_norm": 7.415344014066142, "learning_rate": 1.5388288416013897e-06, "loss": 1.9219, "step": 1420 }, { "epoch": 2.31088504577823, "grad_norm": 7.676271086269514, "learning_rate": 1.532006121540996e-06, "loss": 1.7157, "step": 1421 }, { "epoch": 2.3125127161749743, "grad_norm": 7.927643352012801, "learning_rate": 1.5251958226293306e-06, "loss": 1.7577, "step": 1422 }, { "epoch": 2.314140386571719, "grad_norm": 7.35382979078468, "learning_rate": 1.518397969258516e-06, "loss": 1.6958, "step": 1423 }, { "epoch": 2.315768056968464, "grad_norm": 6.5274104716161085, "learning_rate": 1.5116125857760966e-06, "loss": 1.5558, "step": 1424 }, { "epoch": 2.3173957273652084, "grad_norm": 6.77668910575944, "learning_rate": 1.5048396964849621e-06, "loss": 1.6951, "step": 1425 }, { "epoch": 2.319023397761953, "grad_norm": 8.137523710320561, "learning_rate": 1.4980793256432474e-06, "loss": 1.9456, "step": 1426 }, { "epoch": 2.320651068158698, "grad_norm": 7.6358829224605, "learning_rate": 1.4913314974642474e-06, "loss": 1.6676, "step": 1427 }, { "epoch": 2.3222787385554424, "grad_norm": 6.989923435727838, "learning_rate": 1.4845962361163413e-06, "loss": 1.594, "step": 1428 }, { "epoch": 2.3239064089521873, "grad_norm": 6.9548659575064375, "learning_rate": 1.4778735657228933e-06, "loss": 1.65, "step": 1429 }, { "epoch": 2.325534079348932, "grad_norm": 6.964427875462924, "learning_rate": 1.4711635103621718e-06, "loss": 1.869, "step": 1430 }, { "epoch": 2.3271617497456765, "grad_norm": 7.645859722561478, "learning_rate": 1.4644660940672628e-06, "loss": 1.8669, "step": 1431 }, { "epoch": 2.3287894201424213, "grad_norm": 7.127169675209611, "learning_rate": 1.4577813408259839e-06, "loss": 1.4841, "step": 1432 }, { "epoch": 2.3304170905391657, "grad_norm": 7.03972765901708, "learning_rate": 1.4511092745807981e-06, "loss": 1.6751, "step": 1433 }, { "epoch": 2.3320447609359105, "grad_norm": 6.082355722603657, "learning_rate": 1.4444499192287275e-06, "loss": 1.698, "step": 1434 }, { "epoch": 2.333672431332655, "grad_norm": 6.815101092551239, "learning_rate": 1.4378032986212687e-06, "loss": 1.6506, "step": 1435 }, { "epoch": 2.3353001017293997, "grad_norm": 6.520603942469742, "learning_rate": 1.4311694365643048e-06, "loss": 1.702, "step": 1436 }, { "epoch": 2.3369277721261446, "grad_norm": 7.838626951707438, "learning_rate": 1.4245483568180286e-06, "loss": 1.5731, "step": 1437 }, { "epoch": 2.338555442522889, "grad_norm": 7.096595115894495, "learning_rate": 1.4179400830968415e-06, "loss": 1.633, "step": 1438 }, { "epoch": 2.3401831129196338, "grad_norm": 7.154583600001119, "learning_rate": 1.4113446390692837e-06, "loss": 1.7374, "step": 1439 }, { "epoch": 2.3418107833163786, "grad_norm": 8.040300806547448, "learning_rate": 1.4047620483579477e-06, "loss": 1.6967, "step": 1440 }, { "epoch": 2.343438453713123, "grad_norm": 7.129113034626538, "learning_rate": 1.3981923345393816e-06, "loss": 1.5686, "step": 1441 }, { "epoch": 2.345066124109868, "grad_norm": 7.147170578167556, "learning_rate": 1.3916355211440163e-06, "loss": 1.8744, "step": 1442 }, { "epoch": 2.3466937945066126, "grad_norm": 7.514846506841612, "learning_rate": 1.3850916316560813e-06, "loss": 1.5814, "step": 1443 }, { "epoch": 2.348321464903357, "grad_norm": 6.785499234691258, "learning_rate": 1.378560689513515e-06, "loss": 1.7792, "step": 1444 }, { "epoch": 2.349949135300102, "grad_norm": 7.676729380696212, "learning_rate": 1.3720427181078778e-06, "loss": 1.6876, "step": 1445 }, { "epoch": 2.3515768056968462, "grad_norm": 7.123396941348094, "learning_rate": 1.3655377407842813e-06, "loss": 1.6503, "step": 1446 }, { "epoch": 2.353204476093591, "grad_norm": 7.318486879143924, "learning_rate": 1.3590457808412933e-06, "loss": 1.581, "step": 1447 }, { "epoch": 2.354832146490336, "grad_norm": 6.603987598199794, "learning_rate": 1.3525668615308562e-06, "loss": 1.7162, "step": 1448 }, { "epoch": 2.3564598168870803, "grad_norm": 6.491405622540728, "learning_rate": 1.3461010060582091e-06, "loss": 1.5536, "step": 1449 }, { "epoch": 2.358087487283825, "grad_norm": 7.157972369315001, "learning_rate": 1.3396482375817977e-06, "loss": 1.7826, "step": 1450 }, { "epoch": 2.3597151576805695, "grad_norm": 6.844938321280039, "learning_rate": 1.3332085792131966e-06, "loss": 1.5519, "step": 1451 }, { "epoch": 2.3613428280773143, "grad_norm": 7.065371900304191, "learning_rate": 1.3267820540170229e-06, "loss": 1.881, "step": 1452 }, { "epoch": 2.362970498474059, "grad_norm": 7.3598979568808325, "learning_rate": 1.3203686850108576e-06, "loss": 1.6099, "step": 1453 }, { "epoch": 2.3645981688708035, "grad_norm": 6.5398572848162955, "learning_rate": 1.3139684951651587e-06, "loss": 1.804, "step": 1454 }, { "epoch": 2.3662258392675484, "grad_norm": 7.572784899443195, "learning_rate": 1.3075815074031817e-06, "loss": 1.7125, "step": 1455 }, { "epoch": 2.367853509664293, "grad_norm": 7.322364310452466, "learning_rate": 1.3012077446008969e-06, "loss": 1.7814, "step": 1456 }, { "epoch": 2.3694811800610376, "grad_norm": 7.576513532540607, "learning_rate": 1.2948472295869057e-06, "loss": 1.8243, "step": 1457 }, { "epoch": 2.3711088504577824, "grad_norm": 7.184645830615872, "learning_rate": 1.2884999851423675e-06, "loss": 1.5311, "step": 1458 }, { "epoch": 2.372736520854527, "grad_norm": 6.830315964006821, "learning_rate": 1.2821660340009006e-06, "loss": 1.7219, "step": 1459 }, { "epoch": 2.3743641912512716, "grad_norm": 7.508719833824683, "learning_rate": 1.2758453988485164e-06, "loss": 1.8679, "step": 1460 }, { "epoch": 2.3759918616480165, "grad_norm": 7.58582517679591, "learning_rate": 1.2695381023235387e-06, "loss": 1.5257, "step": 1461 }, { "epoch": 2.377619532044761, "grad_norm": 6.581289595578873, "learning_rate": 1.2632441670165058e-06, "loss": 1.8151, "step": 1462 }, { "epoch": 2.3792472024415057, "grad_norm": 7.413460709418316, "learning_rate": 1.2569636154701076e-06, "loss": 1.6726, "step": 1463 }, { "epoch": 2.38087487283825, "grad_norm": 6.92379556783169, "learning_rate": 1.2506964701790986e-06, "loss": 1.5976, "step": 1464 }, { "epoch": 2.382502543234995, "grad_norm": 6.536662872187333, "learning_rate": 1.2444427535902154e-06, "loss": 1.7674, "step": 1465 }, { "epoch": 2.3841302136317397, "grad_norm": 7.602074294767915, "learning_rate": 1.2382024881020937e-06, "loss": 1.65, "step": 1466 }, { "epoch": 2.385757884028484, "grad_norm": 7.077389064705663, "learning_rate": 1.231975696065199e-06, "loss": 1.6647, "step": 1467 }, { "epoch": 2.387385554425229, "grad_norm": 7.148475744935718, "learning_rate": 1.2257623997817348e-06, "loss": 1.6988, "step": 1468 }, { "epoch": 2.3890132248219738, "grad_norm": 7.721506153776071, "learning_rate": 1.2195626215055694e-06, "loss": 1.8699, "step": 1469 }, { "epoch": 2.390640895218718, "grad_norm": 7.789303415343659, "learning_rate": 1.213376383442153e-06, "loss": 1.6453, "step": 1470 }, { "epoch": 2.392268565615463, "grad_norm": 6.872462209610133, "learning_rate": 1.2072037077484416e-06, "loss": 1.768, "step": 1471 }, { "epoch": 2.3938962360122074, "grad_norm": 7.6556749619580415, "learning_rate": 1.2010446165328126e-06, "loss": 2.0127, "step": 1472 }, { "epoch": 2.395523906408952, "grad_norm": 7.4640642976581715, "learning_rate": 1.1948991318549907e-06, "loss": 1.6424, "step": 1473 }, { "epoch": 2.397151576805697, "grad_norm": 6.849589791026262, "learning_rate": 1.188767275725966e-06, "loss": 1.5628, "step": 1474 }, { "epoch": 2.3987792472024414, "grad_norm": 6.995126298433606, "learning_rate": 1.1826490701079163e-06, "loss": 1.6336, "step": 1475 }, { "epoch": 2.4004069175991862, "grad_norm": 7.080711911300695, "learning_rate": 1.1765445369141276e-06, "loss": 1.5942, "step": 1476 }, { "epoch": 2.4020345879959306, "grad_norm": 7.082421672133423, "learning_rate": 1.1704536980089155e-06, "loss": 1.669, "step": 1477 }, { "epoch": 2.4036622583926754, "grad_norm": 6.907287956648545, "learning_rate": 1.164376575207547e-06, "loss": 1.7575, "step": 1478 }, { "epoch": 2.4052899287894203, "grad_norm": 7.289959987054364, "learning_rate": 1.1583131902761685e-06, "loss": 1.8861, "step": 1479 }, { "epoch": 2.4069175991861647, "grad_norm": 7.890250472317154, "learning_rate": 1.152263564931712e-06, "loss": 1.6069, "step": 1480 }, { "epoch": 2.4085452695829095, "grad_norm": 6.496891732833277, "learning_rate": 1.1462277208418338e-06, "loss": 1.727, "step": 1481 }, { "epoch": 2.4101729399796543, "grad_norm": 7.482571488961443, "learning_rate": 1.140205679624834e-06, "loss": 1.725, "step": 1482 }, { "epoch": 2.4118006103763987, "grad_norm": 7.282419925454468, "learning_rate": 1.1341974628495662e-06, "loss": 1.8917, "step": 1483 }, { "epoch": 2.4134282807731435, "grad_norm": 8.065318136865656, "learning_rate": 1.128203092035375e-06, "loss": 1.8651, "step": 1484 }, { "epoch": 2.415055951169888, "grad_norm": 7.5100072556561495, "learning_rate": 1.1222225886520161e-06, "loss": 1.8184, "step": 1485 }, { "epoch": 2.4166836215666327, "grad_norm": 7.188476650065806, "learning_rate": 1.1162559741195733e-06, "loss": 1.7449, "step": 1486 }, { "epoch": 2.4183112919633776, "grad_norm": 6.870366619366637, "learning_rate": 1.1103032698083831e-06, "loss": 1.6743, "step": 1487 }, { "epoch": 2.419938962360122, "grad_norm": 6.820480553882988, "learning_rate": 1.1043644970389671e-06, "loss": 1.6807, "step": 1488 }, { "epoch": 2.421566632756867, "grad_norm": 6.632083590750506, "learning_rate": 1.0984396770819438e-06, "loss": 1.747, "step": 1489 }, { "epoch": 2.423194303153611, "grad_norm": 8.184312812200279, "learning_rate": 1.0925288311579591e-06, "loss": 1.8067, "step": 1490 }, { "epoch": 2.424821973550356, "grad_norm": 6.846495864069235, "learning_rate": 1.0866319804376086e-06, "loss": 1.8098, "step": 1491 }, { "epoch": 2.426449643947101, "grad_norm": 7.6175731354702245, "learning_rate": 1.0807491460413622e-06, "loss": 1.8619, "step": 1492 }, { "epoch": 2.428077314343845, "grad_norm": 7.682518410186781, "learning_rate": 1.0748803490394877e-06, "loss": 1.6006, "step": 1493 }, { "epoch": 2.42970498474059, "grad_norm": 7.269707135727439, "learning_rate": 1.0690256104519764e-06, "loss": 1.628, "step": 1494 }, { "epoch": 2.431332655137335, "grad_norm": 7.405834157714768, "learning_rate": 1.0631849512484671e-06, "loss": 1.8056, "step": 1495 }, { "epoch": 2.4329603255340793, "grad_norm": 7.341477933043256, "learning_rate": 1.057358392348171e-06, "loss": 1.6952, "step": 1496 }, { "epoch": 2.434587995930824, "grad_norm": 7.8992626897504445, "learning_rate": 1.0515459546197976e-06, "loss": 1.7614, "step": 1497 }, { "epoch": 2.4362156663275685, "grad_norm": 7.249284051202803, "learning_rate": 1.0457476588814774e-06, "loss": 1.6746, "step": 1498 }, { "epoch": 2.4378433367243133, "grad_norm": 6.890372832309815, "learning_rate": 1.03996352590069e-06, "loss": 1.7948, "step": 1499 }, { "epoch": 2.439471007121058, "grad_norm": 7.018319369406671, "learning_rate": 1.0341935763941934e-06, "loss": 1.8089, "step": 1500 }, { "epoch": 2.4410986775178025, "grad_norm": 7.0638860503111776, "learning_rate": 1.028437831027937e-06, "loss": 1.7318, "step": 1501 }, { "epoch": 2.4427263479145473, "grad_norm": 6.865400572540434, "learning_rate": 1.0226963104170002e-06, "loss": 1.6481, "step": 1502 }, { "epoch": 2.4443540183112917, "grad_norm": 7.455264323973343, "learning_rate": 1.0169690351255173e-06, "loss": 1.8237, "step": 1503 }, { "epoch": 2.4459816887080366, "grad_norm": 7.673735977529153, "learning_rate": 1.0112560256665971e-06, "loss": 1.6075, "step": 1504 }, { "epoch": 2.4476093591047814, "grad_norm": 7.065592197527784, "learning_rate": 1.0055573025022508e-06, "loss": 1.7341, "step": 1505 }, { "epoch": 2.4492370295015258, "grad_norm": 7.226767050835037, "learning_rate": 9.998728860433277e-07, "loss": 1.596, "step": 1506 }, { "epoch": 2.4508646998982706, "grad_norm": 7.415377576394878, "learning_rate": 9.942027966494317e-07, "loss": 1.6031, "step": 1507 }, { "epoch": 2.4524923702950154, "grad_norm": 7.260073045877326, "learning_rate": 9.885470546288478e-07, "loss": 1.8212, "step": 1508 }, { "epoch": 2.45412004069176, "grad_norm": 7.758448765604084, "learning_rate": 9.829056802384834e-07, "loss": 1.6775, "step": 1509 }, { "epoch": 2.4557477110885046, "grad_norm": 7.446361857136384, "learning_rate": 9.772786936837786e-07, "loss": 1.8334, "step": 1510 }, { "epoch": 2.4573753814852495, "grad_norm": 6.972663984058032, "learning_rate": 9.71666115118644e-07, "loss": 1.6503, "step": 1511 }, { "epoch": 2.459003051881994, "grad_norm": 6.849676453596015, "learning_rate": 9.660679646453853e-07, "loss": 1.7153, "step": 1512 }, { "epoch": 2.4606307222787387, "grad_norm": 6.800710025745236, "learning_rate": 9.60484262314631e-07, "loss": 1.8433, "step": 1513 }, { "epoch": 2.462258392675483, "grad_norm": 7.586932572082763, "learning_rate": 9.549150281252633e-07, "loss": 1.6499, "step": 1514 }, { "epoch": 2.463886063072228, "grad_norm": 6.734544190364704, "learning_rate": 9.493602820243424e-07, "loss": 1.5658, "step": 1515 }, { "epoch": 2.4655137334689723, "grad_norm": 7.4833440675714, "learning_rate": 9.438200439070388e-07, "loss": 1.839, "step": 1516 }, { "epoch": 2.467141403865717, "grad_norm": 7.3253234237853775, "learning_rate": 9.382943336165601e-07, "loss": 1.8948, "step": 1517 }, { "epoch": 2.468769074262462, "grad_norm": 7.115842831211694, "learning_rate": 9.327831709440793e-07, "loss": 1.6132, "step": 1518 }, { "epoch": 2.4703967446592063, "grad_norm": 7.62465052403968, "learning_rate": 9.272865756286658e-07, "loss": 1.724, "step": 1519 }, { "epoch": 2.472024415055951, "grad_norm": 7.757490752388255, "learning_rate": 9.218045673572124e-07, "loss": 1.5522, "step": 1520 }, { "epoch": 2.473652085452696, "grad_norm": 6.6358371774894405, "learning_rate": 9.163371657643716e-07, "loss": 1.5557, "step": 1521 }, { "epoch": 2.4752797558494404, "grad_norm": 7.020580189485553, "learning_rate": 9.108843904324716e-07, "loss": 1.696, "step": 1522 }, { "epoch": 2.476907426246185, "grad_norm": 7.694973079643266, "learning_rate": 9.054462608914577e-07, "loss": 1.8084, "step": 1523 }, { "epoch": 2.47853509664293, "grad_norm": 7.666729151026, "learning_rate": 9.000227966188235e-07, "loss": 1.7158, "step": 1524 }, { "epoch": 2.4801627670396744, "grad_norm": 7.579579012882057, "learning_rate": 8.946140170395328e-07, "loss": 1.8192, "step": 1525 }, { "epoch": 2.4817904374364192, "grad_norm": 7.579966083884539, "learning_rate": 8.892199415259501e-07, "loss": 1.839, "step": 1526 }, { "epoch": 2.4834181078331636, "grad_norm": 7.37893674767589, "learning_rate": 8.838405893977825e-07, "loss": 1.4991, "step": 1527 }, { "epoch": 2.4850457782299085, "grad_norm": 6.894274009803495, "learning_rate": 8.78475979922e-07, "loss": 1.7778, "step": 1528 }, { "epoch": 2.4866734486266533, "grad_norm": 6.562889348634068, "learning_rate": 8.731261323127659e-07, "loss": 1.738, "step": 1529 }, { "epoch": 2.4883011190233977, "grad_norm": 7.323208177568219, "learning_rate": 8.677910657313782e-07, "loss": 1.5986, "step": 1530 }, { "epoch": 2.4899287894201425, "grad_norm": 7.442688436305294, "learning_rate": 8.624707992861897e-07, "loss": 1.6288, "step": 1531 }, { "epoch": 2.491556459816887, "grad_norm": 7.22984167296001, "learning_rate": 8.571653520325462e-07, "loss": 1.694, "step": 1532 }, { "epoch": 2.4931841302136317, "grad_norm": 6.680909637786937, "learning_rate": 8.518747429727159e-07, "loss": 1.7166, "step": 1533 }, { "epoch": 2.4948118006103766, "grad_norm": 6.697894383331654, "learning_rate": 8.465989910558209e-07, "loss": 1.5341, "step": 1534 }, { "epoch": 2.496439471007121, "grad_norm": 7.296207169433651, "learning_rate": 8.413381151777711e-07, "loss": 1.8205, "step": 1535 }, { "epoch": 2.4980671414038658, "grad_norm": 6.831974364374975, "learning_rate": 8.360921341811956e-07, "loss": 1.7388, "step": 1536 }, { "epoch": 2.4996948118006106, "grad_norm": 7.505359411518895, "learning_rate": 8.308610668553751e-07, "loss": 1.7826, "step": 1537 }, { "epoch": 2.501322482197355, "grad_norm": 7.171651933285067, "learning_rate": 8.256449319361748e-07, "loss": 1.5793, "step": 1538 }, { "epoch": 2.5029501525941, "grad_norm": 6.804590460256403, "learning_rate": 8.204437481059763e-07, "loss": 1.5402, "step": 1539 }, { "epoch": 2.504577822990844, "grad_norm": 7.251431182509196, "learning_rate": 8.152575339936131e-07, "loss": 1.7071, "step": 1540 }, { "epoch": 2.506205493387589, "grad_norm": 6.995416327865824, "learning_rate": 8.100863081742999e-07, "loss": 1.6525, "step": 1541 }, { "epoch": 2.5078331637843334, "grad_norm": 7.978074421733629, "learning_rate": 8.049300891695744e-07, "loss": 1.6247, "step": 1542 }, { "epoch": 2.5094608341810782, "grad_norm": 7.739288831232543, "learning_rate": 7.997888954472172e-07, "loss": 1.9851, "step": 1543 }, { "epoch": 2.511088504577823, "grad_norm": 7.620099424258863, "learning_rate": 7.946627454211969e-07, "loss": 1.6248, "step": 1544 }, { "epoch": 2.5127161749745675, "grad_norm": 6.9187576259796275, "learning_rate": 7.895516574516038e-07, "loss": 1.7223, "step": 1545 }, { "epoch": 2.5143438453713123, "grad_norm": 7.65816749743554, "learning_rate": 7.844556498445788e-07, "loss": 1.7331, "step": 1546 }, { "epoch": 2.515971515768057, "grad_norm": 7.633016002353161, "learning_rate": 7.793747408522462e-07, "loss": 1.9585, "step": 1547 }, { "epoch": 2.5175991861648015, "grad_norm": 7.767128082464342, "learning_rate": 7.743089486726602e-07, "loss": 1.7257, "step": 1548 }, { "epoch": 2.5192268565615463, "grad_norm": 6.75666176668869, "learning_rate": 7.692582914497265e-07, "loss": 1.6267, "step": 1549 }, { "epoch": 2.520854526958291, "grad_norm": 7.082311913544406, "learning_rate": 7.642227872731417e-07, "loss": 1.6962, "step": 1550 }, { "epoch": 2.5224821973550355, "grad_norm": 7.626264185515792, "learning_rate": 7.592024541783344e-07, "loss": 1.8188, "step": 1551 }, { "epoch": 2.5241098677517804, "grad_norm": 7.823956578810606, "learning_rate": 7.541973101463912e-07, "loss": 1.8846, "step": 1552 }, { "epoch": 2.525737538148525, "grad_norm": 7.613331255842684, "learning_rate": 7.492073731039995e-07, "loss": 1.7498, "step": 1553 }, { "epoch": 2.5273652085452696, "grad_norm": 7.058767225997198, "learning_rate": 7.442326609233786e-07, "loss": 1.5139, "step": 1554 }, { "epoch": 2.528992878942014, "grad_norm": 6.680119812164517, "learning_rate": 7.392731914222189e-07, "loss": 1.633, "step": 1555 }, { "epoch": 2.530620549338759, "grad_norm": 7.232821580849197, "learning_rate": 7.343289823636168e-07, "loss": 1.8474, "step": 1556 }, { "epoch": 2.5322482197355036, "grad_norm": 7.562652073774, "learning_rate": 7.294000514560101e-07, "loss": 1.6366, "step": 1557 }, { "epoch": 2.533875890132248, "grad_norm": 7.055589186165839, "learning_rate": 7.244864163531163e-07, "loss": 1.7171, "step": 1558 }, { "epoch": 2.535503560528993, "grad_norm": 7.56036505310709, "learning_rate": 7.195880946538675e-07, "loss": 1.7889, "step": 1559 }, { "epoch": 2.5371312309257377, "grad_norm": 7.097141548863587, "learning_rate": 7.147051039023528e-07, "loss": 1.5172, "step": 1560 }, { "epoch": 2.538758901322482, "grad_norm": 7.067882208188577, "learning_rate": 7.098374615877452e-07, "loss": 1.7612, "step": 1561 }, { "epoch": 2.540386571719227, "grad_norm": 7.813494834728604, "learning_rate": 7.049851851442468e-07, "loss": 1.7679, "step": 1562 }, { "epoch": 2.5420142421159717, "grad_norm": 7.260000405827042, "learning_rate": 7.001482919510288e-07, "loss": 1.7434, "step": 1563 }, { "epoch": 2.543641912512716, "grad_norm": 6.750602510860825, "learning_rate": 6.953267993321588e-07, "loss": 1.5959, "step": 1564 }, { "epoch": 2.545269582909461, "grad_norm": 6.812938387464732, "learning_rate": 6.90520724556547e-07, "loss": 1.6403, "step": 1565 }, { "epoch": 2.5468972533062058, "grad_norm": 7.008423539996639, "learning_rate": 6.857300848378857e-07, "loss": 1.699, "step": 1566 }, { "epoch": 2.54852492370295, "grad_norm": 7.319881142122011, "learning_rate": 6.809548973345803e-07, "loss": 1.8099, "step": 1567 }, { "epoch": 2.550152594099695, "grad_norm": 8.072829862238732, "learning_rate": 6.761951791496902e-07, "loss": 1.9401, "step": 1568 }, { "epoch": 2.5517802644964394, "grad_norm": 7.557784358635401, "learning_rate": 6.71450947330875e-07, "loss": 1.5444, "step": 1569 }, { "epoch": 2.553407934893184, "grad_norm": 7.630823815384925, "learning_rate": 6.667222188703227e-07, "loss": 1.708, "step": 1570 }, { "epoch": 2.5550356052899286, "grad_norm": 7.365308194146718, "learning_rate": 6.62009010704695e-07, "loss": 1.8465, "step": 1571 }, { "epoch": 2.5566632756866734, "grad_norm": 8.779064250856312, "learning_rate": 6.573113397150654e-07, "loss": 1.5974, "step": 1572 }, { "epoch": 2.5582909460834182, "grad_norm": 7.430997998046969, "learning_rate": 6.526292227268593e-07, "loss": 1.6949, "step": 1573 }, { "epoch": 2.5599186164801626, "grad_norm": 6.52148070748597, "learning_rate": 6.479626765097919e-07, "loss": 1.8222, "step": 1574 }, { "epoch": 2.5615462868769074, "grad_norm": 6.421471296388945, "learning_rate": 6.433117177778103e-07, "loss": 1.563, "step": 1575 }, { "epoch": 2.5631739572736523, "grad_norm": 7.402714003596154, "learning_rate": 6.386763631890313e-07, "loss": 1.8242, "step": 1576 }, { "epoch": 2.5648016276703967, "grad_norm": 7.8676841243254785, "learning_rate": 6.340566293456845e-07, "loss": 1.6803, "step": 1577 }, { "epoch": 2.5664292980671415, "grad_norm": 7.030615994965437, "learning_rate": 6.294525327940515e-07, "loss": 1.8234, "step": 1578 }, { "epoch": 2.5680569684638863, "grad_norm": 7.484938505597712, "learning_rate": 6.248640900244046e-07, "loss": 1.6206, "step": 1579 }, { "epoch": 2.5696846388606307, "grad_norm": 7.038666246105466, "learning_rate": 6.202913174709507e-07, "loss": 1.7307, "step": 1580 }, { "epoch": 2.5713123092573755, "grad_norm": 7.287471900302259, "learning_rate": 6.157342315117754e-07, "loss": 1.8034, "step": 1581 }, { "epoch": 2.57293997965412, "grad_norm": 6.636987832528873, "learning_rate": 6.111928484687723e-07, "loss": 1.6667, "step": 1582 }, { "epoch": 2.5745676500508647, "grad_norm": 7.23480180905048, "learning_rate": 6.066671846075984e-07, "loss": 1.5454, "step": 1583 }, { "epoch": 2.576195320447609, "grad_norm": 7.137310585366126, "learning_rate": 6.02157256137611e-07, "loss": 1.6524, "step": 1584 }, { "epoch": 2.577822990844354, "grad_norm": 7.503874064411725, "learning_rate": 5.976630792118033e-07, "loss": 1.6498, "step": 1585 }, { "epoch": 2.579450661241099, "grad_norm": 7.26844034484499, "learning_rate": 5.931846699267558e-07, "loss": 1.5624, "step": 1586 }, { "epoch": 2.581078331637843, "grad_norm": 7.2015823995380295, "learning_rate": 5.887220443225749e-07, "loss": 1.5779, "step": 1587 }, { "epoch": 2.582706002034588, "grad_norm": 7.015035518069043, "learning_rate": 5.842752183828354e-07, "loss": 1.8287, "step": 1588 }, { "epoch": 2.584333672431333, "grad_norm": 7.127765243470796, "learning_rate": 5.79844208034519e-07, "loss": 1.6216, "step": 1589 }, { "epoch": 2.585961342828077, "grad_norm": 7.807030456444256, "learning_rate": 5.754290291479675e-07, "loss": 1.6316, "step": 1590 }, { "epoch": 2.587589013224822, "grad_norm": 6.103443152712225, "learning_rate": 5.710296975368163e-07, "loss": 1.6631, "step": 1591 }, { "epoch": 2.589216683621567, "grad_norm": 6.846243073506196, "learning_rate": 5.666462289579422e-07, "loss": 1.7845, "step": 1592 }, { "epoch": 2.5908443540183113, "grad_norm": 7.5485158415075935, "learning_rate": 5.62278639111406e-07, "loss": 1.9239, "step": 1593 }, { "epoch": 2.592472024415056, "grad_norm": 6.81254900064419, "learning_rate": 5.579269436403967e-07, "loss": 1.8123, "step": 1594 }, { "epoch": 2.5940996948118005, "grad_norm": 6.706042300301408, "learning_rate": 5.535911581311748e-07, "loss": 1.7929, "step": 1595 }, { "epoch": 2.5957273652085453, "grad_norm": 8.326546900983498, "learning_rate": 5.492712981130171e-07, "loss": 1.7079, "step": 1596 }, { "epoch": 2.5973550356052897, "grad_norm": 6.981604959789821, "learning_rate": 5.449673790581611e-07, "loss": 1.7799, "step": 1597 }, { "epoch": 2.5989827060020345, "grad_norm": 7.547915417628757, "learning_rate": 5.406794163817481e-07, "loss": 1.8474, "step": 1598 }, { "epoch": 2.6006103763987793, "grad_norm": 8.170199167928063, "learning_rate": 5.364074254417712e-07, "loss": 1.7735, "step": 1599 }, { "epoch": 2.6022380467955237, "grad_norm": 7.425508121702081, "learning_rate": 5.321514215390161e-07, "loss": 1.6441, "step": 1600 }, { "epoch": 2.6038657171922686, "grad_norm": 6.918592665631552, "learning_rate": 5.279114199170094e-07, "loss": 1.7829, "step": 1601 }, { "epoch": 2.6054933875890134, "grad_norm": 7.854964663662975, "learning_rate": 5.236874357619653e-07, "loss": 1.7292, "step": 1602 }, { "epoch": 2.6071210579857578, "grad_norm": 8.357438276068812, "learning_rate": 5.194794842027251e-07, "loss": 1.8227, "step": 1603 }, { "epoch": 2.6087487283825026, "grad_norm": 7.097760293194216, "learning_rate": 5.152875803107082e-07, "loss": 1.6596, "step": 1604 }, { "epoch": 2.6103763987792474, "grad_norm": 7.6796099475024615, "learning_rate": 5.111117390998599e-07, "loss": 1.586, "step": 1605 }, { "epoch": 2.612004069175992, "grad_norm": 7.359501970835604, "learning_rate": 5.0695197552659e-07, "loss": 1.7606, "step": 1606 }, { "epoch": 2.6136317395727366, "grad_norm": 6.927117422555836, "learning_rate": 5.028083044897247e-07, "loss": 1.7446, "step": 1607 }, { "epoch": 2.615259409969481, "grad_norm": 7.616847768742601, "learning_rate": 4.986807408304567e-07, "loss": 1.6161, "step": 1608 }, { "epoch": 2.616887080366226, "grad_norm": 7.517645899635431, "learning_rate": 4.945692993322837e-07, "loss": 1.7615, "step": 1609 }, { "epoch": 2.6185147507629702, "grad_norm": 7.635774689789629, "learning_rate": 4.904739947209575e-07, "loss": 1.7092, "step": 1610 }, { "epoch": 2.620142421159715, "grad_norm": 7.124064129863508, "learning_rate": 4.863948416644382e-07, "loss": 1.6643, "step": 1611 }, { "epoch": 2.62177009155646, "grad_norm": 6.575016506088135, "learning_rate": 4.82331854772834e-07, "loss": 1.8018, "step": 1612 }, { "epoch": 2.6233977619532043, "grad_norm": 6.9944022289512215, "learning_rate": 4.782850485983515e-07, "loss": 1.5509, "step": 1613 }, { "epoch": 2.625025432349949, "grad_norm": 6.597764377005532, "learning_rate": 4.7425443763524427e-07, "loss": 1.789, "step": 1614 }, { "epoch": 2.626653102746694, "grad_norm": 7.415812342849091, "learning_rate": 4.70240036319759e-07, "loss": 1.7731, "step": 1615 }, { "epoch": 2.6282807731434383, "grad_norm": 7.324062291007644, "learning_rate": 4.6624185903008713e-07, "loss": 1.8525, "step": 1616 }, { "epoch": 2.629908443540183, "grad_norm": 7.160485546293309, "learning_rate": 4.6225992008630926e-07, "loss": 1.9384, "step": 1617 }, { "epoch": 2.631536113936928, "grad_norm": 7.984441766171447, "learning_rate": 4.5829423375034653e-07, "loss": 1.7808, "step": 1618 }, { "epoch": 2.6331637843336724, "grad_norm": 7.725302406893302, "learning_rate": 4.543448142259099e-07, "loss": 1.86, "step": 1619 }, { "epoch": 2.634791454730417, "grad_norm": 6.826296926338474, "learning_rate": 4.5041167565844647e-07, "loss": 1.7678, "step": 1620 }, { "epoch": 2.6364191251271616, "grad_norm": 7.604344496124257, "learning_rate": 4.4649483213509257e-07, "loss": 1.6905, "step": 1621 }, { "epoch": 2.6380467955239064, "grad_norm": 7.799195969458421, "learning_rate": 4.425942976846187e-07, "loss": 1.5331, "step": 1622 }, { "epoch": 2.639674465920651, "grad_norm": 7.024807340783003, "learning_rate": 4.3871008627738686e-07, "loss": 1.7638, "step": 1623 }, { "epoch": 2.6413021363173956, "grad_norm": 7.27719632613657, "learning_rate": 4.348422118252893e-07, "loss": 1.6589, "step": 1624 }, { "epoch": 2.6429298067141405, "grad_norm": 7.559453045452107, "learning_rate": 4.3099068818170776e-07, "loss": 1.7231, "step": 1625 }, { "epoch": 2.644557477110885, "grad_norm": 7.222542851681027, "learning_rate": 4.271555291414636e-07, "loss": 1.8322, "step": 1626 }, { "epoch": 2.6461851475076297, "grad_norm": 7.442613132404635, "learning_rate": 4.23336748440763e-07, "loss": 1.6341, "step": 1627 }, { "epoch": 2.6478128179043745, "grad_norm": 7.161552338894156, "learning_rate": 4.1953435975714886e-07, "loss": 1.645, "step": 1628 }, { "epoch": 2.649440488301119, "grad_norm": 7.09360580135842, "learning_rate": 4.157483767094583e-07, "loss": 1.8311, "step": 1629 }, { "epoch": 2.6510681586978637, "grad_norm": 7.343072426935293, "learning_rate": 4.1197881285776675e-07, "loss": 1.6622, "step": 1630 }, { "epoch": 2.6526958290946085, "grad_norm": 6.621915860500845, "learning_rate": 4.082256817033392e-07, "loss": 1.647, "step": 1631 }, { "epoch": 2.654323499491353, "grad_norm": 7.004707608664503, "learning_rate": 4.0448899668858956e-07, "loss": 1.6613, "step": 1632 }, { "epoch": 2.6559511698880978, "grad_norm": 7.299166237833414, "learning_rate": 4.007687711970243e-07, "loss": 1.5917, "step": 1633 }, { "epoch": 2.6575788402848426, "grad_norm": 6.84108132532542, "learning_rate": 3.9706501855319767e-07, "loss": 1.5733, "step": 1634 }, { "epoch": 2.659206510681587, "grad_norm": 6.659600731937168, "learning_rate": 3.9337775202266506e-07, "loss": 1.6991, "step": 1635 }, { "epoch": 2.6608341810783314, "grad_norm": 7.869950062285272, "learning_rate": 3.8970698481193225e-07, "loss": 1.8791, "step": 1636 }, { "epoch": 2.662461851475076, "grad_norm": 7.756303189213125, "learning_rate": 3.8605273006841305e-07, "loss": 1.8155, "step": 1637 }, { "epoch": 2.664089521871821, "grad_norm": 7.0636219995120415, "learning_rate": 3.8241500088037674e-07, "loss": 1.7864, "step": 1638 }, { "epoch": 2.6657171922685654, "grad_norm": 7.581899920193987, "learning_rate": 3.7879381027690563e-07, "loss": 1.7228, "step": 1639 }, { "epoch": 2.6673448626653102, "grad_norm": 7.427377160240841, "learning_rate": 3.7518917122784606e-07, "loss": 1.7065, "step": 1640 }, { "epoch": 2.668972533062055, "grad_norm": 7.688776194101659, "learning_rate": 3.716010966437611e-07, "loss": 1.6418, "step": 1641 }, { "epoch": 2.6706002034587994, "grad_norm": 7.874287588646193, "learning_rate": 3.680295993758881e-07, "loss": 1.5722, "step": 1642 }, { "epoch": 2.6722278738555443, "grad_norm": 7.455331319894167, "learning_rate": 3.6447469221608854e-07, "loss": 1.6936, "step": 1643 }, { "epoch": 2.673855544252289, "grad_norm": 7.4125298822034775, "learning_rate": 3.609363878968036e-07, "loss": 1.6837, "step": 1644 }, { "epoch": 2.6754832146490335, "grad_norm": 7.036680247183093, "learning_rate": 3.5741469909101043e-07, "loss": 1.7371, "step": 1645 }, { "epoch": 2.6771108850457783, "grad_norm": 7.599712258137732, "learning_rate": 3.539096384121743e-07, "loss": 1.5586, "step": 1646 }, { "epoch": 2.678738555442523, "grad_norm": 6.825755839943773, "learning_rate": 3.504212184142031e-07, "loss": 1.7309, "step": 1647 }, { "epoch": 2.6803662258392675, "grad_norm": 6.9656830040645366, "learning_rate": 3.469494515914079e-07, "loss": 1.7731, "step": 1648 }, { "epoch": 2.6819938962360124, "grad_norm": 7.930253765725239, "learning_rate": 3.4349435037844714e-07, "loss": 1.9061, "step": 1649 }, { "epoch": 2.6836215666327567, "grad_norm": 7.86863622135382, "learning_rate": 3.40055927150294e-07, "loss": 1.7549, "step": 1650 }, { "epoch": 2.6852492370295016, "grad_norm": 8.341822902249383, "learning_rate": 3.366341942221868e-07, "loss": 1.7859, "step": 1651 }, { "epoch": 2.686876907426246, "grad_norm": 7.645043796837441, "learning_rate": 3.3322916384958157e-07, "loss": 1.5496, "step": 1652 }, { "epoch": 2.688504577822991, "grad_norm": 7.450259926633082, "learning_rate": 3.298408482281135e-07, "loss": 1.7155, "step": 1653 }, { "epoch": 2.6901322482197356, "grad_norm": 8.487869615981362, "learning_rate": 3.264692594935531e-07, "loss": 1.8783, "step": 1654 }, { "epoch": 2.69175991861648, "grad_norm": 7.982805509884393, "learning_rate": 3.231144097217598e-07, "loss": 1.6199, "step": 1655 }, { "epoch": 2.693387589013225, "grad_norm": 7.506678978797558, "learning_rate": 3.1977631092863613e-07, "loss": 1.8597, "step": 1656 }, { "epoch": 2.6950152594099697, "grad_norm": 7.6273216699613355, "learning_rate": 3.1645497507009515e-07, "loss": 1.7405, "step": 1657 }, { "epoch": 2.696642929806714, "grad_norm": 7.622277237272326, "learning_rate": 3.1315041404200663e-07, "loss": 1.7904, "step": 1658 }, { "epoch": 2.698270600203459, "grad_norm": 8.374989903792239, "learning_rate": 3.098626396801591e-07, "loss": 1.7213, "step": 1659 }, { "epoch": 2.6998982706002037, "grad_norm": 7.053306573684582, "learning_rate": 3.065916637602173e-07, "loss": 1.6361, "step": 1660 }, { "epoch": 2.701525940996948, "grad_norm": 6.957051684352628, "learning_rate": 3.033374979976811e-07, "loss": 1.5628, "step": 1661 }, { "epoch": 2.703153611393693, "grad_norm": 7.246538959184659, "learning_rate": 3.0010015404784097e-07, "loss": 1.8403, "step": 1662 }, { "epoch": 2.7047812817904373, "grad_norm": 7.121559445794055, "learning_rate": 2.968796435057375e-07, "loss": 1.8624, "step": 1663 }, { "epoch": 2.706408952187182, "grad_norm": 7.31235224388711, "learning_rate": 2.936759779061199e-07, "loss": 1.6025, "step": 1664 }, { "epoch": 2.7080366225839265, "grad_norm": 7.429814035040142, "learning_rate": 2.9048916872340574e-07, "loss": 1.7302, "step": 1665 }, { "epoch": 2.7096642929806714, "grad_norm": 7.621037142520996, "learning_rate": 2.873192273716369e-07, "loss": 1.7687, "step": 1666 }, { "epoch": 2.711291963377416, "grad_norm": 7.440637150440862, "learning_rate": 2.8416616520444193e-07, "loss": 1.6816, "step": 1667 }, { "epoch": 2.7129196337741606, "grad_norm": 7.8174870718356395, "learning_rate": 2.810299935149935e-07, "loss": 1.6289, "step": 1668 }, { "epoch": 2.7145473041709054, "grad_norm": 7.517625225929033, "learning_rate": 2.779107235359696e-07, "loss": 1.8268, "step": 1669 }, { "epoch": 2.7161749745676502, "grad_norm": 7.104550376140775, "learning_rate": 2.748083664395096e-07, "loss": 1.7942, "step": 1670 }, { "epoch": 2.7178026449643946, "grad_norm": 7.572559964032903, "learning_rate": 2.7172293333717846e-07, "loss": 1.7847, "step": 1671 }, { "epoch": 2.7194303153611394, "grad_norm": 7.685951276967782, "learning_rate": 2.68654435279927e-07, "loss": 1.7739, "step": 1672 }, { "epoch": 2.7210579857578843, "grad_norm": 7.193331912085589, "learning_rate": 2.656028832580476e-07, "loss": 1.4873, "step": 1673 }, { "epoch": 2.7226856561546287, "grad_norm": 7.289661272268068, "learning_rate": 2.6256828820113765e-07, "loss": 1.7409, "step": 1674 }, { "epoch": 2.7243133265513735, "grad_norm": 6.59125297155556, "learning_rate": 2.5955066097806405e-07, "loss": 1.7963, "step": 1675 }, { "epoch": 2.725940996948118, "grad_norm": 8.114924750248786, "learning_rate": 2.5655001239691836e-07, "loss": 1.9146, "step": 1676 }, { "epoch": 2.7275686673448627, "grad_norm": 7.8558642614696135, "learning_rate": 2.5356635320497924e-07, "loss": 1.826, "step": 1677 }, { "epoch": 2.729196337741607, "grad_norm": 6.956872567750479, "learning_rate": 2.5059969408867846e-07, "loss": 1.8927, "step": 1678 }, { "epoch": 2.730824008138352, "grad_norm": 7.838498445521436, "learning_rate": 2.476500456735581e-07, "loss": 1.856, "step": 1679 }, { "epoch": 2.7324516785350967, "grad_norm": 7.804996650128892, "learning_rate": 2.447174185242324e-07, "loss": 1.7069, "step": 1680 }, { "epoch": 2.734079348931841, "grad_norm": 7.6900784547442536, "learning_rate": 2.4180182314435305e-07, "loss": 1.7755, "step": 1681 }, { "epoch": 2.735707019328586, "grad_norm": 7.458226889734503, "learning_rate": 2.3890326997656975e-07, "loss": 1.8125, "step": 1682 }, { "epoch": 2.737334689725331, "grad_norm": 7.032294956135086, "learning_rate": 2.3602176940249188e-07, "loss": 1.8056, "step": 1683 }, { "epoch": 2.738962360122075, "grad_norm": 7.184786082135983, "learning_rate": 2.3315733174265333e-07, "loss": 1.6231, "step": 1684 }, { "epoch": 2.74059003051882, "grad_norm": 6.92058934648293, "learning_rate": 2.3030996725647402e-07, "loss": 1.6904, "step": 1685 }, { "epoch": 2.742217700915565, "grad_norm": 7.364445776968929, "learning_rate": 2.274796861422246e-07, "loss": 1.7088, "step": 1686 }, { "epoch": 2.743845371312309, "grad_norm": 7.021160091308363, "learning_rate": 2.246664985369873e-07, "loss": 1.8805, "step": 1687 }, { "epoch": 2.745473041709054, "grad_norm": 7.552146146780486, "learning_rate": 2.2187041451662285e-07, "loss": 1.5908, "step": 1688 }, { "epoch": 2.7471007121057984, "grad_norm": 6.812970341286714, "learning_rate": 2.1909144409573146e-07, "loss": 1.7068, "step": 1689 }, { "epoch": 2.7487283825025433, "grad_norm": 6.944821861418251, "learning_rate": 2.1632959722762192e-07, "loss": 1.7053, "step": 1690 }, { "epoch": 2.7503560528992876, "grad_norm": 7.426597228768934, "learning_rate": 2.1358488380426757e-07, "loss": 1.7807, "step": 1691 }, { "epoch": 2.7519837232960325, "grad_norm": 7.1890195517903654, "learning_rate": 2.108573136562775e-07, "loss": 1.6217, "step": 1692 }, { "epoch": 2.7536113936927773, "grad_norm": 8.338445942416023, "learning_rate": 2.0814689655286157e-07, "loss": 1.8602, "step": 1693 }, { "epoch": 2.7552390640895217, "grad_norm": 7.295574593961834, "learning_rate": 2.054536422017922e-07, "loss": 1.7785, "step": 1694 }, { "epoch": 2.7568667344862665, "grad_norm": 8.1134319931583, "learning_rate": 2.0277756024936812e-07, "loss": 1.749, "step": 1695 }, { "epoch": 2.7584944048830113, "grad_norm": 8.545466009439517, "learning_rate": 2.0011866028038617e-07, "loss": 1.7229, "step": 1696 }, { "epoch": 2.7601220752797557, "grad_norm": 7.669280281235903, "learning_rate": 1.9747695181810245e-07, "loss": 1.7861, "step": 1697 }, { "epoch": 2.7617497456765006, "grad_norm": 6.371104295868133, "learning_rate": 1.948524443241967e-07, "loss": 1.627, "step": 1698 }, { "epoch": 2.7633774160732454, "grad_norm": 6.950460399735837, "learning_rate": 1.9224514719874465e-07, "loss": 1.7293, "step": 1699 }, { "epoch": 2.7650050864699898, "grad_norm": 6.88784599090233, "learning_rate": 1.896550697801769e-07, "loss": 1.86, "step": 1700 }, { "epoch": 2.7666327568667346, "grad_norm": 7.509724356937051, "learning_rate": 1.8708222134525168e-07, "loss": 1.5876, "step": 1701 }, { "epoch": 2.768260427263479, "grad_norm": 7.5722283866785665, "learning_rate": 1.8452661110901715e-07, "loss": 1.9304, "step": 1702 }, { "epoch": 2.769888097660224, "grad_norm": 8.031180744773073, "learning_rate": 1.819882482247809e-07, "loss": 1.8069, "step": 1703 }, { "epoch": 2.771515768056968, "grad_norm": 7.6830068488657774, "learning_rate": 1.7946714178407655e-07, "loss": 1.861, "step": 1704 }, { "epoch": 2.773143438453713, "grad_norm": 8.86551445758907, "learning_rate": 1.769633008166316e-07, "loss": 1.6577, "step": 1705 }, { "epoch": 2.774771108850458, "grad_norm": 7.7555196314070995, "learning_rate": 1.7447673429033361e-07, "loss": 1.6252, "step": 1706 }, { "epoch": 2.7763987792472022, "grad_norm": 7.131009372943469, "learning_rate": 1.7200745111120021e-07, "loss": 1.7032, "step": 1707 }, { "epoch": 2.778026449643947, "grad_norm": 6.795694883842075, "learning_rate": 1.695554601233451e-07, "loss": 1.5576, "step": 1708 }, { "epoch": 2.779654120040692, "grad_norm": 7.3879804009904015, "learning_rate": 1.6712077010894778e-07, "loss": 1.6659, "step": 1709 }, { "epoch": 2.7812817904374363, "grad_norm": 7.500742823601542, "learning_rate": 1.647033897882211e-07, "loss": 1.3706, "step": 1710 }, { "epoch": 2.782909460834181, "grad_norm": 7.90250992175192, "learning_rate": 1.6230332781938253e-07, "loss": 1.8559, "step": 1711 }, { "epoch": 2.784537131230926, "grad_norm": 7.729786469082654, "learning_rate": 1.5992059279861916e-07, "loss": 1.5542, "step": 1712 }, { "epoch": 2.7861648016276703, "grad_norm": 7.596575455223464, "learning_rate": 1.575551932600594e-07, "loss": 1.6358, "step": 1713 }, { "epoch": 2.787792472024415, "grad_norm": 7.193369365507281, "learning_rate": 1.5520713767574247e-07, "loss": 1.6665, "step": 1714 }, { "epoch": 2.78942014242116, "grad_norm": 6.72117417346949, "learning_rate": 1.528764344555883e-07, "loss": 1.6321, "step": 1715 }, { "epoch": 2.7910478128179044, "grad_norm": 7.123520200221215, "learning_rate": 1.5056309194736385e-07, "loss": 1.6463, "step": 1716 }, { "epoch": 2.7926754832146488, "grad_norm": 7.395425673672002, "learning_rate": 1.4826711843665964e-07, "loss": 1.5161, "step": 1717 }, { "epoch": 2.7943031536113936, "grad_norm": 6.315743823860524, "learning_rate": 1.4598852214685488e-07, "loss": 1.7171, "step": 1718 }, { "epoch": 2.7959308240081384, "grad_norm": 7.521336552922427, "learning_rate": 1.4372731123908745e-07, "loss": 1.9258, "step": 1719 }, { "epoch": 2.797558494404883, "grad_norm": 7.939007169135731, "learning_rate": 1.414834938122306e-07, "loss": 1.8052, "step": 1720 }, { "epoch": 2.7991861648016276, "grad_norm": 7.160410010954086, "learning_rate": 1.3925707790285848e-07, "loss": 1.7763, "step": 1721 }, { "epoch": 2.8008138351983725, "grad_norm": 7.4463683638444635, "learning_rate": 1.3704807148521903e-07, "loss": 1.8486, "step": 1722 }, { "epoch": 2.802441505595117, "grad_norm": 7.548000349664076, "learning_rate": 1.3485648247120454e-07, "loss": 1.6703, "step": 1723 }, { "epoch": 2.8040691759918617, "grad_norm": 6.908028919114392, "learning_rate": 1.3268231871032655e-07, "loss": 1.7992, "step": 1724 }, { "epoch": 2.8056968463886065, "grad_norm": 7.646997752849372, "learning_rate": 1.3052558798968274e-07, "loss": 1.6226, "step": 1725 }, { "epoch": 2.807324516785351, "grad_norm": 7.96456545387754, "learning_rate": 1.2838629803393343e-07, "loss": 1.7209, "step": 1726 }, { "epoch": 2.8089521871820957, "grad_norm": 7.619624263967442, "learning_rate": 1.2626445650527176e-07, "loss": 1.6697, "step": 1727 }, { "epoch": 2.8105798575788405, "grad_norm": 7.348492409182239, "learning_rate": 1.2416007100339577e-07, "loss": 1.686, "step": 1728 }, { "epoch": 2.812207527975585, "grad_norm": 7.233128989349756, "learning_rate": 1.2207314906548528e-07, "loss": 1.6949, "step": 1729 }, { "epoch": 2.8138351983723298, "grad_norm": 6.745402453793974, "learning_rate": 1.2000369816616675e-07, "loss": 1.7207, "step": 1730 }, { "epoch": 2.815462868769074, "grad_norm": 7.2419088181251, "learning_rate": 1.1795172571749503e-07, "loss": 1.6804, "step": 1731 }, { "epoch": 2.817090539165819, "grad_norm": 7.036974003428646, "learning_rate": 1.1591723906892339e-07, "loss": 1.5793, "step": 1732 }, { "epoch": 2.8187182095625634, "grad_norm": 7.092358743411497, "learning_rate": 1.1390024550727352e-07, "loss": 1.6004, "step": 1733 }, { "epoch": 2.820345879959308, "grad_norm": 7.598520721236, "learning_rate": 1.119007522567167e-07, "loss": 1.8007, "step": 1734 }, { "epoch": 2.821973550356053, "grad_norm": 7.685373423100191, "learning_rate": 1.0991876647874322e-07, "loss": 1.9403, "step": 1735 }, { "epoch": 2.8236012207527974, "grad_norm": 6.882269821770329, "learning_rate": 1.0795429527213685e-07, "loss": 1.7046, "step": 1736 }, { "epoch": 2.8252288911495422, "grad_norm": 7.56034258050407, "learning_rate": 1.060073456729499e-07, "loss": 1.8743, "step": 1737 }, { "epoch": 2.826856561546287, "grad_norm": 7.435394046599421, "learning_rate": 1.0407792465447986e-07, "loss": 1.6968, "step": 1738 }, { "epoch": 2.8284842319430314, "grad_norm": 7.907306645002319, "learning_rate": 1.0216603912724221e-07, "loss": 1.8087, "step": 1739 }, { "epoch": 2.8301119023397763, "grad_norm": 7.852769202093541, "learning_rate": 1.0027169593894659e-07, "loss": 1.7497, "step": 1740 }, { "epoch": 2.831739572736521, "grad_norm": 7.329112192547927, "learning_rate": 9.839490187447176e-08, "loss": 1.5876, "step": 1741 }, { "epoch": 2.8333672431332655, "grad_norm": 7.31143927274419, "learning_rate": 9.653566365584178e-08, "loss": 1.8906, "step": 1742 }, { "epoch": 2.8349949135300103, "grad_norm": 8.117389891268763, "learning_rate": 9.469398794220153e-08, "loss": 1.5736, "step": 1743 }, { "epoch": 2.8366225839267547, "grad_norm": 7.382098089102542, "learning_rate": 9.286988132979292e-08, "loss": 1.6185, "step": 1744 }, { "epoch": 2.8382502543234995, "grad_norm": 7.250669303678091, "learning_rate": 9.106335035193315e-08, "loss": 1.64, "step": 1745 }, { "epoch": 2.839877924720244, "grad_norm": 7.036791531504068, "learning_rate": 8.927440147898703e-08, "loss": 1.7107, "step": 1746 }, { "epoch": 2.8415055951169887, "grad_norm": 7.1583647894455895, "learning_rate": 8.750304111834807e-08, "loss": 1.7913, "step": 1747 }, { "epoch": 2.8431332655137336, "grad_norm": 6.917446697985718, "learning_rate": 8.574927561441348e-08, "loss": 1.7859, "step": 1748 }, { "epoch": 2.844760935910478, "grad_norm": 7.54676947662624, "learning_rate": 8.401311124856148e-08, "loss": 1.9327, "step": 1749 }, { "epoch": 2.846388606307223, "grad_norm": 7.752589159816136, "learning_rate": 8.229455423913013e-08, "loss": 1.4709, "step": 1750 }, { "epoch": 2.8480162767039676, "grad_norm": 6.537145369888248, "learning_rate": 8.059361074139293e-08, "loss": 1.9411, "step": 1751 }, { "epoch": 2.849643947100712, "grad_norm": 7.698812838268404, "learning_rate": 7.891028684753777e-08, "loss": 1.6528, "step": 1752 }, { "epoch": 2.851271617497457, "grad_norm": 6.823585270958799, "learning_rate": 7.724458858664686e-08, "loss": 1.7114, "step": 1753 }, { "epoch": 2.8528992878942017, "grad_norm": 7.491041074361417, "learning_rate": 7.559652192467127e-08, "loss": 1.8955, "step": 1754 }, { "epoch": 2.854526958290946, "grad_norm": 7.856504595923284, "learning_rate": 7.396609276441313e-08, "loss": 1.6376, "step": 1755 }, { "epoch": 2.856154628687691, "grad_norm": 6.764386211519891, "learning_rate": 7.235330694550402e-08, "loss": 1.4413, "step": 1756 }, { "epoch": 2.8577822990844353, "grad_norm": 6.608995161340471, "learning_rate": 7.075817024438215e-08, "loss": 1.6177, "step": 1757 }, { "epoch": 2.85940996948118, "grad_norm": 6.934154462026746, "learning_rate": 6.918068837427127e-08, "loss": 1.8384, "step": 1758 }, { "epoch": 2.8610376398779245, "grad_norm": 7.798234720142274, "learning_rate": 6.762086698516413e-08, "loss": 1.5738, "step": 1759 }, { "epoch": 2.8626653102746693, "grad_norm": 6.8264736517788345, "learning_rate": 6.607871166379897e-08, "loss": 1.5655, "step": 1760 }, { "epoch": 2.864292980671414, "grad_norm": 6.798764492694531, "learning_rate": 6.45542279336403e-08, "loss": 1.6655, "step": 1761 }, { "epoch": 2.8659206510681585, "grad_norm": 6.774360447945654, "learning_rate": 6.304742125485874e-08, "loss": 1.9295, "step": 1762 }, { "epoch": 2.8675483214649033, "grad_norm": 6.79655254647495, "learning_rate": 6.15582970243117e-08, "loss": 1.5801, "step": 1763 }, { "epoch": 2.869175991861648, "grad_norm": 6.998500773769877, "learning_rate": 6.008686057552449e-08, "loss": 1.8327, "step": 1764 }, { "epoch": 2.8708036622583926, "grad_norm": 7.68757003075291, "learning_rate": 5.8633117178671926e-08, "loss": 1.7333, "step": 1765 }, { "epoch": 2.8724313326551374, "grad_norm": 6.6749619647182765, "learning_rate": 5.7197072040557356e-08, "loss": 1.6759, "step": 1766 }, { "epoch": 2.874059003051882, "grad_norm": 7.5665143773962225, "learning_rate": 5.5778730304594794e-08, "loss": 1.8851, "step": 1767 }, { "epoch": 2.8756866734486266, "grad_norm": 7.367234656358884, "learning_rate": 5.437809705079233e-08, "loss": 1.7742, "step": 1768 }, { "epoch": 2.8773143438453714, "grad_norm": 7.3833899602216455, "learning_rate": 5.299517729573045e-08, "loss": 1.7322, "step": 1769 }, { "epoch": 2.878942014242116, "grad_norm": 7.637981550450564, "learning_rate": 5.162997599254704e-08, "loss": 1.6174, "step": 1770 }, { "epoch": 2.8805696846388607, "grad_norm": 6.995142992297914, "learning_rate": 5.028249803091967e-08, "loss": 1.726, "step": 1771 }, { "epoch": 2.882197355035605, "grad_norm": 7.7141028686978315, "learning_rate": 4.8952748237045546e-08, "loss": 1.7212, "step": 1772 }, { "epoch": 2.88382502543235, "grad_norm": 7.39924173252206, "learning_rate": 4.764073137362546e-08, "loss": 1.5845, "step": 1773 }, { "epoch": 2.8854526958290947, "grad_norm": 7.610284536213376, "learning_rate": 4.6346452139849344e-08, "loss": 1.8073, "step": 1774 }, { "epoch": 2.887080366225839, "grad_norm": 7.717852783423118, "learning_rate": 4.50699151713746e-08, "loss": 1.944, "step": 1775 }, { "epoch": 2.888708036622584, "grad_norm": 8.457942090068974, "learning_rate": 4.381112504031337e-08, "loss": 1.568, "step": 1776 }, { "epoch": 2.8903357070193287, "grad_norm": 6.499071485448515, "learning_rate": 4.257008625521364e-08, "loss": 1.6869, "step": 1777 }, { "epoch": 2.891963377416073, "grad_norm": 7.456001825037739, "learning_rate": 4.1346803261046455e-08, "loss": 1.8462, "step": 1778 }, { "epoch": 2.893591047812818, "grad_norm": 7.114455875463442, "learning_rate": 4.0141280439184305e-08, "loss": 1.5645, "step": 1779 }, { "epoch": 2.895218718209563, "grad_norm": 7.317280799723345, "learning_rate": 3.8953522107392785e-08, "loss": 1.7504, "step": 1780 }, { "epoch": 2.896846388606307, "grad_norm": 7.537553298433132, "learning_rate": 3.7783532519808376e-08, "loss": 1.5617, "step": 1781 }, { "epoch": 2.898474059003052, "grad_norm": 7.072795980206849, "learning_rate": 3.663131586692792e-08, "loss": 1.6737, "step": 1782 }, { "epoch": 2.9001017293997964, "grad_norm": 6.628716490374918, "learning_rate": 3.5496876275590286e-08, "loss": 1.7422, "step": 1783 }, { "epoch": 2.901729399796541, "grad_norm": 7.576107459787328, "learning_rate": 3.4380217808964164e-08, "loss": 1.6793, "step": 1784 }, { "epoch": 2.9033570701932856, "grad_norm": 7.474728903220345, "learning_rate": 3.3281344466531416e-08, "loss": 1.6385, "step": 1785 }, { "epoch": 2.9049847405900304, "grad_norm": 7.216014590472781, "learning_rate": 3.220026018407541e-08, "loss": 1.6032, "step": 1786 }, { "epoch": 2.9066124109867753, "grad_norm": 7.492625115183176, "learning_rate": 3.1136968833663795e-08, "loss": 1.7619, "step": 1787 }, { "epoch": 2.9082400813835196, "grad_norm": 7.8564117135747, "learning_rate": 3.0091474223636895e-08, "loss": 1.7077, "step": 1788 }, { "epoch": 2.9098677517802645, "grad_norm": 6.444307256335772, "learning_rate": 2.9063780098592677e-08, "loss": 1.7231, "step": 1789 }, { "epoch": 2.9114954221770093, "grad_norm": 7.0310907004356435, "learning_rate": 2.805389013937454e-08, "loss": 1.7683, "step": 1790 }, { "epoch": 2.9131230925737537, "grad_norm": 7.4479962932371615, "learning_rate": 2.706180796305691e-08, "loss": 1.7859, "step": 1791 }, { "epoch": 2.9147507629704985, "grad_norm": 6.430023515229046, "learning_rate": 2.6087537122934103e-08, "loss": 1.6504, "step": 1792 }, { "epoch": 2.9163784333672433, "grad_norm": 7.351782117831229, "learning_rate": 2.513108110850482e-08, "loss": 1.7877, "step": 1793 }, { "epoch": 2.9180061037639877, "grad_norm": 7.744505910109731, "learning_rate": 2.4192443345462667e-08, "loss": 1.7819, "step": 1794 }, { "epoch": 2.9196337741607326, "grad_norm": 7.158547119094752, "learning_rate": 2.3271627195681768e-08, "loss": 1.7009, "step": 1795 }, { "epoch": 2.9212614445574774, "grad_norm": 6.641861324722885, "learning_rate": 2.236863595720562e-08, "loss": 1.8191, "step": 1796 }, { "epoch": 2.9228891149542218, "grad_norm": 7.175157235369945, "learning_rate": 2.1483472864234356e-08, "loss": 1.6935, "step": 1797 }, { "epoch": 2.924516785350966, "grad_norm": 6.557550917756789, "learning_rate": 2.0616141087114737e-08, "loss": 1.5936, "step": 1798 }, { "epoch": 2.926144455747711, "grad_norm": 7.398710804170939, "learning_rate": 1.9766643732328506e-08, "loss": 1.7705, "step": 1799 }, { "epoch": 2.927772126144456, "grad_norm": 6.939294528592234, "learning_rate": 1.8934983842479048e-08, "loss": 1.6862, "step": 1800 }, { "epoch": 2.9293997965412, "grad_norm": 6.933431888290875, "learning_rate": 1.8121164396283643e-08, "loss": 1.6217, "step": 1801 }, { "epoch": 2.931027466937945, "grad_norm": 7.117428158561281, "learning_rate": 1.732518830856067e-08, "loss": 1.6247, "step": 1802 }, { "epoch": 2.93265513733469, "grad_norm": 7.009512912808115, "learning_rate": 1.6547058430219086e-08, "loss": 1.6328, "step": 1803 }, { "epoch": 2.9342828077314342, "grad_norm": 6.916963590364916, "learning_rate": 1.5786777548250644e-08, "loss": 1.6411, "step": 1804 }, { "epoch": 2.935910478128179, "grad_norm": 6.996968172578978, "learning_rate": 1.5044348385716557e-08, "loss": 1.7196, "step": 1805 }, { "epoch": 2.937538148524924, "grad_norm": 7.049980244562297, "learning_rate": 1.431977360173975e-08, "loss": 1.8037, "step": 1806 }, { "epoch": 2.9391658189216683, "grad_norm": 7.132487030859086, "learning_rate": 1.3613055791496521e-08, "loss": 1.7999, "step": 1807 }, { "epoch": 2.940793489318413, "grad_norm": 7.54920959611513, "learning_rate": 1.2924197486203215e-08, "loss": 1.763, "step": 1808 }, { "epoch": 2.942421159715158, "grad_norm": 6.337839592489822, "learning_rate": 1.2253201153111239e-08, "loss": 1.6241, "step": 1809 }, { "epoch": 2.9440488301119023, "grad_norm": 7.649626734747042, "learning_rate": 1.16000691954965e-08, "loss": 1.7768, "step": 1810 }, { "epoch": 2.945676500508647, "grad_norm": 7.621456571606609, "learning_rate": 1.0964803952650539e-08, "loss": 1.8278, "step": 1811 }, { "epoch": 2.9473041709053915, "grad_norm": 7.321459365591134, "learning_rate": 1.0347407699872192e-08, "loss": 1.5903, "step": 1812 }, { "epoch": 2.9489318413021364, "grad_norm": 6.3545441345087195, "learning_rate": 9.747882648460938e-09, "loss": 1.6249, "step": 1813 }, { "epoch": 2.9505595116988808, "grad_norm": 6.915537771879328, "learning_rate": 9.1662309457069e-09, "loss": 1.6863, "step": 1814 }, { "epoch": 2.9521871820956256, "grad_norm": 6.75528549336841, "learning_rate": 8.602454674884187e-09, "loss": 1.8274, "step": 1815 }, { "epoch": 2.9538148524923704, "grad_norm": 6.943559824997305, "learning_rate": 8.056555855243675e-09, "loss": 1.7224, "step": 1816 }, { "epoch": 2.955442522889115, "grad_norm": 7.479076774649672, "learning_rate": 7.528536442005241e-09, "loss": 1.8175, "step": 1817 }, { "epoch": 2.9570701932858596, "grad_norm": 8.37222776026831, "learning_rate": 7.01839832635054e-09, "loss": 1.7029, "step": 1818 }, { "epoch": 2.9586978636826045, "grad_norm": 6.841526028957009, "learning_rate": 6.526143335416901e-09, "loss": 1.6968, "step": 1819 }, { "epoch": 2.960325534079349, "grad_norm": 6.686548421964373, "learning_rate": 6.051773232291225e-09, "loss": 1.4711, "step": 1820 }, { "epoch": 2.9619532044760937, "grad_norm": 7.026215022509465, "learning_rate": 5.5952897160016505e-09, "loss": 1.7165, "step": 1821 }, { "epoch": 2.9635808748728385, "grad_norm": 7.224746998566885, "learning_rate": 5.15669442151423e-09, "loss": 1.7729, "step": 1822 }, { "epoch": 2.965208545269583, "grad_norm": 7.515163020026272, "learning_rate": 4.735988919724599e-09, "loss": 1.7141, "step": 1823 }, { "epoch": 2.9668362156663277, "grad_norm": 7.977451787875248, "learning_rate": 4.333174717453537e-09, "loss": 1.762, "step": 1824 }, { "epoch": 2.968463886063072, "grad_norm": 7.915228591964517, "learning_rate": 3.948253257440859e-09, "loss": 1.6091, "step": 1825 }, { "epoch": 2.970091556459817, "grad_norm": 6.88913088721091, "learning_rate": 3.5812259183426457e-09, "loss": 1.934, "step": 1826 }, { "epoch": 2.9717192268565613, "grad_norm": 6.9631375085281615, "learning_rate": 3.2320940147229085e-09, "loss": 1.683, "step": 1827 }, { "epoch": 2.973346897253306, "grad_norm": 7.463970114583696, "learning_rate": 2.9008587970502655e-09, "loss": 1.6304, "step": 1828 }, { "epoch": 2.974974567650051, "grad_norm": 6.777975380643299, "learning_rate": 2.587521451694608e-09, "loss": 1.5139, "step": 1829 }, { "epoch": 2.9766022380467954, "grad_norm": 6.880392571775515, "learning_rate": 2.2920831009209944e-09, "loss": 1.7888, "step": 1830 }, { "epoch": 2.97822990844354, "grad_norm": 7.463931151153864, "learning_rate": 2.0145448028874305e-09, "loss": 1.5364, "step": 1831 }, { "epoch": 2.979857578840285, "grad_norm": 7.1451070199451205, "learning_rate": 1.7549075516393176e-09, "loss": 1.8009, "step": 1832 }, { "epoch": 2.9814852492370294, "grad_norm": 7.0141369681613375, "learning_rate": 1.5131722771066782e-09, "loss": 1.7311, "step": 1833 }, { "epoch": 2.9831129196337742, "grad_norm": 7.422930952278004, "learning_rate": 1.2893398451024886e-09, "loss": 1.6188, "step": 1834 }, { "epoch": 2.984740590030519, "grad_norm": 6.865434720176492, "learning_rate": 1.0834110573154644e-09, "loss": 1.5349, "step": 1835 }, { "epoch": 2.9863682604272634, "grad_norm": 6.570452290295371, "learning_rate": 8.953866513111698e-10, "loss": 1.7369, "step": 1836 }, { "epoch": 2.9879959308240083, "grad_norm": 6.97038415629167, "learning_rate": 7.252673005281319e-10, "loss": 1.6354, "step": 1837 }, { "epoch": 2.9896236012207527, "grad_norm": 7.561620380644066, "learning_rate": 5.730536142745103e-10, "loss": 1.7334, "step": 1838 }, { "epoch": 2.9912512716174975, "grad_norm": 7.5261644539319335, "learning_rate": 4.387461377269864e-10, "loss": 1.8933, "step": 1839 }, { "epoch": 2.992878942014242, "grad_norm": 7.6818735302591445, "learning_rate": 3.2234535192798843e-10, "loss": 1.7758, "step": 1840 }, { "epoch": 2.9945066124109867, "grad_norm": 8.091287544360902, "learning_rate": 2.2385167378513594e-10, "loss": 1.8932, "step": 1841 }, { "epoch": 2.9961342828077315, "grad_norm": 6.597733699600663, "learning_rate": 1.432654560679092e-10, "loss": 1.7026, "step": 1842 }, { "epoch": 2.997761953204476, "grad_norm": 7.437783299011484, "learning_rate": 8.058698740820437e-11, "loss": 2.0183, "step": 1843 }, { "epoch": 2.9993896236012207, "grad_norm": 7.75310375845717, "learning_rate": 3.581649229922324e-11, "loss": 1.722, "step": 1844 }, { "epoch": 3.0, "grad_norm": 13.012032475978849, "learning_rate": 8.954131092142604e-12, "loss": 2.0281, "step": 1845 }, { "epoch": 3.0, "step": 1845, "total_flos": 7453590939648.0, "train_loss": 2.564446141532443, "train_runtime": 7879.5559, "train_samples_per_second": 14.968, "train_steps_per_second": 0.234 } ], "logging_steps": 1, "max_steps": 1845, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7453590939648.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }