{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1431, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006988120195667365, "grad_norm": 15.825864096286791, "learning_rate": 0.0, "loss": 26.435, "num_tokens": 7466.0, "step": 1 }, { "epoch": 0.001397624039133473, "grad_norm": 12.251496631056742, "learning_rate": 4.651162790697674e-06, "loss": 26.4932, "num_tokens": 14867.0, "step": 2 }, { "epoch": 0.0020964360587002098, "grad_norm": 13.519476973182835, "learning_rate": 9.302325581395349e-06, "loss": 26.4287, "num_tokens": 22758.0, "step": 3 }, { "epoch": 0.002795248078266946, "grad_norm": 15.38208620339498, "learning_rate": 1.3953488372093024e-05, "loss": 26.07, "num_tokens": 30251.0, "step": 4 }, { "epoch": 0.0034940600978336828, "grad_norm": 14.123335556294114, "learning_rate": 1.8604651162790697e-05, "loss": 25.6902, "num_tokens": 37442.0, "step": 5 }, { "epoch": 0.0041928721174004195, "grad_norm": 38.41325227449582, "learning_rate": 2.3255813953488374e-05, "loss": 25.6086, "num_tokens": 44450.0, "step": 6 }, { "epoch": 0.004891684136967156, "grad_norm": 49.20044694933379, "learning_rate": 2.7906976744186048e-05, "loss": 25.6802, "num_tokens": 51596.0, "step": 7 }, { "epoch": 0.005590496156533892, "grad_norm": 24.837518059934986, "learning_rate": 3.2558139534883724e-05, "loss": 25.0283, "num_tokens": 58501.0, "step": 8 }, { "epoch": 0.006289308176100629, "grad_norm": 35.02610727478617, "learning_rate": 3.7209302325581394e-05, "loss": 24.5402, "num_tokens": 65085.0, "step": 9 }, { "epoch": 0.0069881201956673656, "grad_norm": 18.49743608785304, "learning_rate": 4.186046511627907e-05, "loss": 24.1431, "num_tokens": 72520.0, "step": 10 }, { "epoch": 0.007686932215234102, "grad_norm": 16.142798556485957, "learning_rate": 4.651162790697675e-05, "loss": 23.9589, "num_tokens": 79490.0, "step": 11 }, { "epoch": 0.008385744234800839, "grad_norm": 15.957529997417875, "learning_rate": 5.1162790697674425e-05, "loss": 23.7223, "num_tokens": 86902.0, "step": 12 }, { "epoch": 0.009084556254367574, "grad_norm": 10.570962604630072, "learning_rate": 5.5813953488372095e-05, "loss": 22.9895, "num_tokens": 92738.0, "step": 13 }, { "epoch": 0.009783368273934312, "grad_norm": 18.77079227590805, "learning_rate": 6.0465116279069765e-05, "loss": 23.1725, "num_tokens": 100060.0, "step": 14 }, { "epoch": 0.010482180293501049, "grad_norm": 7.3705046557873635, "learning_rate": 6.511627906976745e-05, "loss": 22.0704, "num_tokens": 107649.0, "step": 15 }, { "epoch": 0.011180992313067784, "grad_norm": 9.756769805700404, "learning_rate": 6.976744186046513e-05, "loss": 21.7304, "num_tokens": 114327.0, "step": 16 }, { "epoch": 0.011879804332634521, "grad_norm": 7.1793383883530995, "learning_rate": 7.441860465116279e-05, "loss": 21.4763, "num_tokens": 121566.0, "step": 17 }, { "epoch": 0.012578616352201259, "grad_norm": 7.137243029031286, "learning_rate": 7.906976744186047e-05, "loss": 21.3598, "num_tokens": 128982.0, "step": 18 }, { "epoch": 0.013277428371767994, "grad_norm": 13.638520130002313, "learning_rate": 8.372093023255814e-05, "loss": 21.1434, "num_tokens": 136536.0, "step": 19 }, { "epoch": 0.013976240391334731, "grad_norm": 11.69983873670843, "learning_rate": 8.837209302325582e-05, "loss": 21.0494, "num_tokens": 144292.0, "step": 20 }, { "epoch": 0.014675052410901468, "grad_norm": 5.355565758628784, "learning_rate": 9.30232558139535e-05, "loss": 20.9966, "num_tokens": 151133.0, "step": 21 }, { "epoch": 0.015373864430468204, "grad_norm": 4.717512683910897, "learning_rate": 9.767441860465116e-05, "loss": 20.7461, "num_tokens": 158252.0, "step": 22 }, { "epoch": 0.01607267645003494, "grad_norm": 7.555779518574644, "learning_rate": 0.00010232558139534885, "loss": 20.3074, "num_tokens": 165457.0, "step": 23 }, { "epoch": 0.016771488469601678, "grad_norm": 6.72771421928895, "learning_rate": 0.00010697674418604651, "loss": 20.2158, "num_tokens": 172785.0, "step": 24 }, { "epoch": 0.017470300489168415, "grad_norm": 9.911658022140173, "learning_rate": 0.00011162790697674419, "loss": 20.2821, "num_tokens": 179494.0, "step": 25 }, { "epoch": 0.01816911250873515, "grad_norm": 4.297543791874311, "learning_rate": 0.00011627906976744187, "loss": 20.2236, "num_tokens": 187501.0, "step": 26 }, { "epoch": 0.018867924528301886, "grad_norm": 3.398148288869523, "learning_rate": 0.00012093023255813953, "loss": 20.1017, "num_tokens": 194486.0, "step": 27 }, { "epoch": 0.019566736547868623, "grad_norm": 8.051576045312112, "learning_rate": 0.0001255813953488372, "loss": 19.7799, "num_tokens": 201489.0, "step": 28 }, { "epoch": 0.02026554856743536, "grad_norm": 15.488905000197105, "learning_rate": 0.0001302325581395349, "loss": 19.9241, "num_tokens": 209354.0, "step": 29 }, { "epoch": 0.020964360587002098, "grad_norm": 3.283628115165652, "learning_rate": 0.00013488372093023256, "loss": 19.404, "num_tokens": 216345.0, "step": 30 }, { "epoch": 0.02166317260656883, "grad_norm": 3.9857224790977117, "learning_rate": 0.00013953488372093025, "loss": 19.6057, "num_tokens": 223037.0, "step": 31 }, { "epoch": 0.02236198462613557, "grad_norm": 3.22352219751297, "learning_rate": 0.00014418604651162791, "loss": 19.5688, "num_tokens": 230416.0, "step": 32 }, { "epoch": 0.023060796645702306, "grad_norm": 3.8491447527833986, "learning_rate": 0.00014883720930232558, "loss": 19.2043, "num_tokens": 237115.0, "step": 33 }, { "epoch": 0.023759608665269043, "grad_norm": 3.7874852298026425, "learning_rate": 0.00015348837209302327, "loss": 19.084, "num_tokens": 244240.0, "step": 34 }, { "epoch": 0.02445842068483578, "grad_norm": 3.8289674897719554, "learning_rate": 0.00015813953488372093, "loss": 19.075, "num_tokens": 251723.0, "step": 35 }, { "epoch": 0.025157232704402517, "grad_norm": 3.9440239720411228, "learning_rate": 0.00016279069767441862, "loss": 18.5376, "num_tokens": 258362.0, "step": 36 }, { "epoch": 0.02585604472396925, "grad_norm": 3.0319951387170065, "learning_rate": 0.00016744186046511629, "loss": 18.7768, "num_tokens": 265593.0, "step": 37 }, { "epoch": 0.026554856743535988, "grad_norm": 3.3898521484712276, "learning_rate": 0.00017209302325581395, "loss": 18.1432, "num_tokens": 272624.0, "step": 38 }, { "epoch": 0.027253668763102725, "grad_norm": 2.9301870676453463, "learning_rate": 0.00017674418604651164, "loss": 18.588, "num_tokens": 279181.0, "step": 39 }, { "epoch": 0.027952480782669462, "grad_norm": 2.654404660666782, "learning_rate": 0.0001813953488372093, "loss": 18.2087, "num_tokens": 285979.0, "step": 40 }, { "epoch": 0.0286512928022362, "grad_norm": 3.7416042387150528, "learning_rate": 0.000186046511627907, "loss": 18.2133, "num_tokens": 293070.0, "step": 41 }, { "epoch": 0.029350104821802937, "grad_norm": 3.105130611534875, "learning_rate": 0.00019069767441860466, "loss": 17.687, "num_tokens": 299558.0, "step": 42 }, { "epoch": 0.03004891684136967, "grad_norm": 3.3080203495958673, "learning_rate": 0.00019534883720930232, "loss": 17.8142, "num_tokens": 306733.0, "step": 43 }, { "epoch": 0.030747728860936407, "grad_norm": 4.550644498461186, "learning_rate": 0.0002, "loss": 17.5783, "num_tokens": 313625.0, "step": 44 }, { "epoch": 0.031446540880503145, "grad_norm": 2.89083091176252, "learning_rate": 0.00019999974385219888, "loss": 17.6028, "num_tokens": 320835.0, "step": 45 }, { "epoch": 0.03214535290006988, "grad_norm": 2.688470184996243, "learning_rate": 0.00019999897541010772, "loss": 17.0988, "num_tokens": 328348.0, "step": 46 }, { "epoch": 0.03284416491963662, "grad_norm": 4.12814028592051, "learning_rate": 0.00019999769467766323, "loss": 16.9933, "num_tokens": 335913.0, "step": 47 }, { "epoch": 0.033542976939203356, "grad_norm": 2.6768197525570696, "learning_rate": 0.00019999590166142655, "loss": 16.9533, "num_tokens": 343205.0, "step": 48 }, { "epoch": 0.03424178895877009, "grad_norm": 4.933854683750753, "learning_rate": 0.0001999935963705832, "loss": 16.78, "num_tokens": 350261.0, "step": 49 }, { "epoch": 0.03494060097833683, "grad_norm": 2.4948269682154813, "learning_rate": 0.0001999907788169431, "loss": 16.986, "num_tokens": 357367.0, "step": 50 }, { "epoch": 0.03563941299790356, "grad_norm": 2.5971622220228503, "learning_rate": 0.00019998744901494049, "loss": 16.3867, "num_tokens": 364705.0, "step": 51 }, { "epoch": 0.0363382250174703, "grad_norm": 3.5285288483689126, "learning_rate": 0.00019998360698163375, "loss": 16.6009, "num_tokens": 371422.0, "step": 52 }, { "epoch": 0.037037037037037035, "grad_norm": 2.3072905364363407, "learning_rate": 0.00019997925273670543, "loss": 16.3405, "num_tokens": 378970.0, "step": 53 }, { "epoch": 0.03773584905660377, "grad_norm": 2.111981005938095, "learning_rate": 0.0001999743863024622, "loss": 16.1949, "num_tokens": 386189.0, "step": 54 }, { "epoch": 0.03843466107617051, "grad_norm": 2.296216898523924, "learning_rate": 0.00019996900770383454, "loss": 16.3281, "num_tokens": 393815.0, "step": 55 }, { "epoch": 0.039133473095737246, "grad_norm": 2.666184126601885, "learning_rate": 0.0001999631169683768, "loss": 16.1931, "num_tokens": 399923.0, "step": 56 }, { "epoch": 0.039832285115303984, "grad_norm": 3.7207866770964886, "learning_rate": 0.0001999567141262669, "loss": 16.1543, "num_tokens": 406458.0, "step": 57 }, { "epoch": 0.04053109713487072, "grad_norm": 1.8938232560374766, "learning_rate": 0.0001999497992103064, "loss": 16.0368, "num_tokens": 414710.0, "step": 58 }, { "epoch": 0.04122990915443746, "grad_norm": 2.412458505414354, "learning_rate": 0.00019994237225592012, "loss": 15.8979, "num_tokens": 422447.0, "step": 59 }, { "epoch": 0.041928721174004195, "grad_norm": 2.5317332125646894, "learning_rate": 0.00019993443330115592, "loss": 15.5823, "num_tokens": 428907.0, "step": 60 }, { "epoch": 0.04262753319357093, "grad_norm": 2.8439927802342164, "learning_rate": 0.0001999259823866848, "loss": 15.7681, "num_tokens": 436247.0, "step": 61 }, { "epoch": 0.04332634521313766, "grad_norm": 2.3124546860656836, "learning_rate": 0.0001999170195558004, "loss": 15.496, "num_tokens": 443409.0, "step": 62 }, { "epoch": 0.0440251572327044, "grad_norm": 2.1789745411385413, "learning_rate": 0.0001999075448544189, "loss": 15.4492, "num_tokens": 450956.0, "step": 63 }, { "epoch": 0.04472396925227114, "grad_norm": 2.62233287160653, "learning_rate": 0.00019989755833107876, "loss": 15.2355, "num_tokens": 458417.0, "step": 64 }, { "epoch": 0.045422781271837874, "grad_norm": 1.8545665130668028, "learning_rate": 0.00019988706003694055, "loss": 15.3334, "num_tokens": 465196.0, "step": 65 }, { "epoch": 0.04612159329140461, "grad_norm": 3.6525347342712147, "learning_rate": 0.00019987605002578653, "loss": 15.1933, "num_tokens": 472489.0, "step": 66 }, { "epoch": 0.04682040531097135, "grad_norm": 2.7277334237717783, "learning_rate": 0.0001998645283540205, "loss": 15.1653, "num_tokens": 479315.0, "step": 67 }, { "epoch": 0.047519217330538085, "grad_norm": 1.97496245060142, "learning_rate": 0.00019985249508066755, "loss": 15.3583, "num_tokens": 486906.0, "step": 68 }, { "epoch": 0.04821802935010482, "grad_norm": 2.365948985865034, "learning_rate": 0.0001998399502673735, "loss": 14.9784, "num_tokens": 493938.0, "step": 69 }, { "epoch": 0.04891684136967156, "grad_norm": 2.0207852222510008, "learning_rate": 0.00019982689397840496, "loss": 15.0744, "num_tokens": 500590.0, "step": 70 }, { "epoch": 0.0496156533892383, "grad_norm": 2.137629225341362, "learning_rate": 0.00019981332628064865, "loss": 15.0679, "num_tokens": 508345.0, "step": 71 }, { "epoch": 0.050314465408805034, "grad_norm": 2.709600688855553, "learning_rate": 0.0001997992472436114, "loss": 14.9663, "num_tokens": 515272.0, "step": 72 }, { "epoch": 0.05101327742837177, "grad_norm": 2.769437204379797, "learning_rate": 0.0001997846569394194, "loss": 15.0369, "num_tokens": 522104.0, "step": 73 }, { "epoch": 0.0517120894479385, "grad_norm": 2.1465516152240407, "learning_rate": 0.00019976955544281815, "loss": 14.9015, "num_tokens": 529458.0, "step": 74 }, { "epoch": 0.05241090146750524, "grad_norm": 2.554970599066564, "learning_rate": 0.000199753942831172, "loss": 14.563, "num_tokens": 537278.0, "step": 75 }, { "epoch": 0.053109713487071976, "grad_norm": 2.388126495406421, "learning_rate": 0.0001997378191844636, "loss": 14.813, "num_tokens": 544549.0, "step": 76 }, { "epoch": 0.05380852550663871, "grad_norm": 2.1919533051739863, "learning_rate": 0.00019972118458529375, "loss": 14.4483, "num_tokens": 551068.0, "step": 77 }, { "epoch": 0.05450733752620545, "grad_norm": 1.6927300542355745, "learning_rate": 0.00019970403911888078, "loss": 14.7465, "num_tokens": 558188.0, "step": 78 }, { "epoch": 0.05520614954577219, "grad_norm": 2.2064497543576587, "learning_rate": 0.0001996863828730601, "loss": 14.6674, "num_tokens": 565805.0, "step": 79 }, { "epoch": 0.055904961565338925, "grad_norm": 2.4114054632830295, "learning_rate": 0.00019966821593828392, "loss": 14.5352, "num_tokens": 573381.0, "step": 80 }, { "epoch": 0.05660377358490566, "grad_norm": 2.4134183889931102, "learning_rate": 0.0001996495384076206, "loss": 14.3595, "num_tokens": 581350.0, "step": 81 }, { "epoch": 0.0573025856044724, "grad_norm": 1.8864006743237123, "learning_rate": 0.0001996303503767544, "loss": 14.3631, "num_tokens": 588473.0, "step": 82 }, { "epoch": 0.058001397624039136, "grad_norm": 3.13170686212352, "learning_rate": 0.00019961065194398466, "loss": 14.2639, "num_tokens": 595623.0, "step": 83 }, { "epoch": 0.05870020964360587, "grad_norm": 1.8895852504927524, "learning_rate": 0.00019959044321022563, "loss": 14.5274, "num_tokens": 602999.0, "step": 84 }, { "epoch": 0.0593990216631726, "grad_norm": 2.0439231597559364, "learning_rate": 0.00019956972427900578, "loss": 14.1757, "num_tokens": 610543.0, "step": 85 }, { "epoch": 0.06009783368273934, "grad_norm": 2.2725286826693627, "learning_rate": 0.00019954849525646726, "loss": 14.0905, "num_tokens": 617203.0, "step": 86 }, { "epoch": 0.06079664570230608, "grad_norm": 1.7055612047248596, "learning_rate": 0.0001995267562513654, "loss": 14.5731, "num_tokens": 623602.0, "step": 87 }, { "epoch": 0.061495457721872815, "grad_norm": 2.0031518563876785, "learning_rate": 0.00019950450737506824, "loss": 14.2865, "num_tokens": 631160.0, "step": 88 }, { "epoch": 0.06219426974143955, "grad_norm": 1.815051060076169, "learning_rate": 0.00019948174874155573, "loss": 13.8439, "num_tokens": 638355.0, "step": 89 }, { "epoch": 0.06289308176100629, "grad_norm": 1.8441847733219316, "learning_rate": 0.00019945848046741934, "loss": 14.061, "num_tokens": 645362.0, "step": 90 }, { "epoch": 0.06359189378057302, "grad_norm": 2.155812571108576, "learning_rate": 0.00019943470267186144, "loss": 13.9919, "num_tokens": 652539.0, "step": 91 }, { "epoch": 0.06429070580013976, "grad_norm": 1.817937039545139, "learning_rate": 0.00019941041547669465, "loss": 14.1473, "num_tokens": 659738.0, "step": 92 }, { "epoch": 0.0649895178197065, "grad_norm": 1.6248726317728508, "learning_rate": 0.0001993856190063412, "loss": 13.9793, "num_tokens": 667862.0, "step": 93 }, { "epoch": 0.06568832983927324, "grad_norm": 2.094683855819787, "learning_rate": 0.00019936031338783225, "loss": 14.0755, "num_tokens": 675183.0, "step": 94 }, { "epoch": 0.06638714185883997, "grad_norm": 2.423753515861936, "learning_rate": 0.00019933449875080746, "loss": 13.7537, "num_tokens": 682199.0, "step": 95 }, { "epoch": 0.06708595387840671, "grad_norm": 1.8615093715014854, "learning_rate": 0.00019930817522751401, "loss": 13.7573, "num_tokens": 689412.0, "step": 96 }, { "epoch": 0.06778476589797344, "grad_norm": 1.922603628179396, "learning_rate": 0.0001992813429528062, "loss": 13.7809, "num_tokens": 696608.0, "step": 97 }, { "epoch": 0.06848357791754019, "grad_norm": 1.885694054680989, "learning_rate": 0.0001992540020641446, "loss": 13.6316, "num_tokens": 703838.0, "step": 98 }, { "epoch": 0.06918238993710692, "grad_norm": 1.830733189483595, "learning_rate": 0.0001992261527015953, "loss": 13.8526, "num_tokens": 711432.0, "step": 99 }, { "epoch": 0.06988120195667366, "grad_norm": 2.0529905069418053, "learning_rate": 0.00019919779500782948, "loss": 13.8049, "num_tokens": 717755.0, "step": 100 }, { "epoch": 0.07058001397624039, "grad_norm": 1.9178698722664131, "learning_rate": 0.0001991689291281223, "loss": 13.9275, "num_tokens": 725381.0, "step": 101 }, { "epoch": 0.07127882599580712, "grad_norm": 1.7826032618004608, "learning_rate": 0.00019913955521035234, "loss": 13.4739, "num_tokens": 732317.0, "step": 102 }, { "epoch": 0.07197763801537387, "grad_norm": 1.7430765768053889, "learning_rate": 0.00019910967340500094, "loss": 13.7472, "num_tokens": 739043.0, "step": 103 }, { "epoch": 0.0726764500349406, "grad_norm": 1.9327097576731953, "learning_rate": 0.00019907928386515126, "loss": 13.7117, "num_tokens": 745729.0, "step": 104 }, { "epoch": 0.07337526205450734, "grad_norm": 1.6523098222253636, "learning_rate": 0.00019904838674648763, "loss": 13.5757, "num_tokens": 753195.0, "step": 105 }, { "epoch": 0.07407407407407407, "grad_norm": 1.6969490994020622, "learning_rate": 0.00019901698220729458, "loss": 13.748, "num_tokens": 759938.0, "step": 106 }, { "epoch": 0.07477288609364081, "grad_norm": 1.7677649372168398, "learning_rate": 0.00019898507040845616, "loss": 13.5139, "num_tokens": 767625.0, "step": 107 }, { "epoch": 0.07547169811320754, "grad_norm": 1.9518708854729268, "learning_rate": 0.00019895265151345518, "loss": 13.4042, "num_tokens": 775131.0, "step": 108 }, { "epoch": 0.07617051013277429, "grad_norm": 1.5910502551681978, "learning_rate": 0.00019891972568837214, "loss": 13.5223, "num_tokens": 782395.0, "step": 109 }, { "epoch": 0.07686932215234102, "grad_norm": 1.7329435914061764, "learning_rate": 0.00019888629310188465, "loss": 13.513, "num_tokens": 789064.0, "step": 110 }, { "epoch": 0.07756813417190776, "grad_norm": 1.483874271007651, "learning_rate": 0.00019885235392526636, "loss": 13.4427, "num_tokens": 796698.0, "step": 111 }, { "epoch": 0.07826694619147449, "grad_norm": 1.6804353651923707, "learning_rate": 0.00019881790833238617, "loss": 13.4666, "num_tokens": 803919.0, "step": 112 }, { "epoch": 0.07896575821104122, "grad_norm": 1.9157840259811565, "learning_rate": 0.00019878295649970734, "loss": 13.3714, "num_tokens": 810971.0, "step": 113 }, { "epoch": 0.07966457023060797, "grad_norm": 1.7551309996052762, "learning_rate": 0.0001987474986062866, "loss": 13.2645, "num_tokens": 817887.0, "step": 114 }, { "epoch": 0.0803633822501747, "grad_norm": 1.5349632976896173, "learning_rate": 0.00019871153483377315, "loss": 13.3252, "num_tokens": 824738.0, "step": 115 }, { "epoch": 0.08106219426974144, "grad_norm": 1.924756300417848, "learning_rate": 0.0001986750653664078, "loss": 13.4228, "num_tokens": 832079.0, "step": 116 }, { "epoch": 0.08176100628930817, "grad_norm": 1.5896909041784393, "learning_rate": 0.0001986380903910221, "loss": 13.27, "num_tokens": 838908.0, "step": 117 }, { "epoch": 0.08245981830887492, "grad_norm": 1.575684845736584, "learning_rate": 0.00019860061009703713, "loss": 13.4192, "num_tokens": 845348.0, "step": 118 }, { "epoch": 0.08315863032844165, "grad_norm": 1.8295088795123042, "learning_rate": 0.00019856262467646282, "loss": 13.4711, "num_tokens": 852162.0, "step": 119 }, { "epoch": 0.08385744234800839, "grad_norm": 1.7581038567698788, "learning_rate": 0.00019852413432389684, "loss": 13.3141, "num_tokens": 860170.0, "step": 120 }, { "epoch": 0.08455625436757512, "grad_norm": 1.526290883383786, "learning_rate": 0.00019848513923652358, "loss": 13.2942, "num_tokens": 867476.0, "step": 121 }, { "epoch": 0.08525506638714186, "grad_norm": 1.878529626375439, "learning_rate": 0.00019844563961411309, "loss": 13.2712, "num_tokens": 874866.0, "step": 122 }, { "epoch": 0.0859538784067086, "grad_norm": 1.489846227446125, "learning_rate": 0.00019840563565902026, "loss": 13.2977, "num_tokens": 881774.0, "step": 123 }, { "epoch": 0.08665269042627533, "grad_norm": 1.6815017809617654, "learning_rate": 0.00019836512757618355, "loss": 13.2761, "num_tokens": 888149.0, "step": 124 }, { "epoch": 0.08735150244584207, "grad_norm": 1.6091641966868473, "learning_rate": 0.00019832411557312414, "loss": 13.4763, "num_tokens": 894693.0, "step": 125 }, { "epoch": 0.0880503144654088, "grad_norm": 1.5095813931005195, "learning_rate": 0.00019828259985994463, "loss": 13.5071, "num_tokens": 901024.0, "step": 126 }, { "epoch": 0.08874912648497554, "grad_norm": 1.6693788073219258, "learning_rate": 0.00019824058064932831, "loss": 13.2807, "num_tokens": 908206.0, "step": 127 }, { "epoch": 0.08944793850454227, "grad_norm": 1.9687315987537344, "learning_rate": 0.00019819805815653768, "loss": 13.363, "num_tokens": 914376.0, "step": 128 }, { "epoch": 0.09014675052410902, "grad_norm": 1.7385897372739076, "learning_rate": 0.00019815503259941358, "loss": 13.2634, "num_tokens": 921721.0, "step": 129 }, { "epoch": 0.09084556254367575, "grad_norm": 1.428721148603436, "learning_rate": 0.0001981115041983741, "loss": 13.2458, "num_tokens": 928937.0, "step": 130 }, { "epoch": 0.09154437456324249, "grad_norm": 1.662634637805847, "learning_rate": 0.0001980674731764133, "loss": 13.2113, "num_tokens": 936881.0, "step": 131 }, { "epoch": 0.09224318658280922, "grad_norm": 1.6355733105429533, "learning_rate": 0.00019802293975910016, "loss": 13.3508, "num_tokens": 943685.0, "step": 132 }, { "epoch": 0.09294199860237597, "grad_norm": 1.6463942231883306, "learning_rate": 0.00019797790417457742, "loss": 13.0835, "num_tokens": 950976.0, "step": 133 }, { "epoch": 0.0936408106219427, "grad_norm": 1.4062728688315147, "learning_rate": 0.0001979323666535604, "loss": 13.2962, "num_tokens": 957928.0, "step": 134 }, { "epoch": 0.09433962264150944, "grad_norm": 1.5672116073719475, "learning_rate": 0.00019788632742933585, "loss": 13.105, "num_tokens": 964412.0, "step": 135 }, { "epoch": 0.09503843466107617, "grad_norm": 1.8304590876950395, "learning_rate": 0.00019783978673776063, "loss": 13.1619, "num_tokens": 971468.0, "step": 136 }, { "epoch": 0.0957372466806429, "grad_norm": 1.5169514478830495, "learning_rate": 0.00019779274481726073, "loss": 13.223, "num_tokens": 978459.0, "step": 137 }, { "epoch": 0.09643605870020965, "grad_norm": 1.320041796641737, "learning_rate": 0.00019774520190882978, "loss": 13.1441, "num_tokens": 985920.0, "step": 138 }, { "epoch": 0.09713487071977638, "grad_norm": 1.579778797268716, "learning_rate": 0.00019769715825602803, "loss": 13.0779, "num_tokens": 992764.0, "step": 139 }, { "epoch": 0.09783368273934312, "grad_norm": 1.6097268033651382, "learning_rate": 0.00019764861410498098, "loss": 13.2544, "num_tokens": 999854.0, "step": 140 }, { "epoch": 0.09853249475890985, "grad_norm": 1.4573003230376171, "learning_rate": 0.00019759956970437825, "loss": 13.0092, "num_tokens": 1006667.0, "step": 141 }, { "epoch": 0.0992313067784766, "grad_norm": 1.5921140115600168, "learning_rate": 0.00019755002530547208, "loss": 13.2447, "num_tokens": 1014089.0, "step": 142 }, { "epoch": 0.09993011879804332, "grad_norm": 1.3438414888588825, "learning_rate": 0.00019749998116207621, "loss": 13.2542, "num_tokens": 1020912.0, "step": 143 }, { "epoch": 0.10062893081761007, "grad_norm": 1.7285759166278971, "learning_rate": 0.00019744943753056472, "loss": 12.9093, "num_tokens": 1028495.0, "step": 144 }, { "epoch": 0.1013277428371768, "grad_norm": 1.3903171195019095, "learning_rate": 0.0001973983946698703, "loss": 13.0009, "num_tokens": 1036332.0, "step": 145 }, { "epoch": 0.10202655485674354, "grad_norm": 1.7455056295429205, "learning_rate": 0.0001973468528414833, "loss": 13.1304, "num_tokens": 1043895.0, "step": 146 }, { "epoch": 0.10272536687631027, "grad_norm": 1.4497939444434873, "learning_rate": 0.0001972948123094503, "loss": 13.0796, "num_tokens": 1051123.0, "step": 147 }, { "epoch": 0.103424178895877, "grad_norm": 1.383580156751911, "learning_rate": 0.00019724227334037256, "loss": 13.1124, "num_tokens": 1058636.0, "step": 148 }, { "epoch": 0.10412299091544375, "grad_norm": 1.4838421489362628, "learning_rate": 0.00019718923620340496, "loss": 13.142, "num_tokens": 1065532.0, "step": 149 }, { "epoch": 0.10482180293501048, "grad_norm": 1.4242535011413662, "learning_rate": 0.00019713570117025443, "loss": 12.8958, "num_tokens": 1073092.0, "step": 150 }, { "epoch": 0.10552061495457722, "grad_norm": 1.40181080040232, "learning_rate": 0.0001970816685151786, "loss": 12.9283, "num_tokens": 1081083.0, "step": 151 }, { "epoch": 0.10621942697414395, "grad_norm": 1.5313341039141877, "learning_rate": 0.00019702713851498435, "loss": 12.9873, "num_tokens": 1088405.0, "step": 152 }, { "epoch": 0.1069182389937107, "grad_norm": 1.3601093201693377, "learning_rate": 0.00019697211144902648, "loss": 12.9683, "num_tokens": 1095299.0, "step": 153 }, { "epoch": 0.10761705101327743, "grad_norm": 1.6794919624091056, "learning_rate": 0.00019691658759920624, "loss": 12.9984, "num_tokens": 1102272.0, "step": 154 }, { "epoch": 0.10831586303284417, "grad_norm": 1.2881559193849654, "learning_rate": 0.00019686056724996988, "loss": 13.0438, "num_tokens": 1108878.0, "step": 155 }, { "epoch": 0.1090146750524109, "grad_norm": 1.5180984823138908, "learning_rate": 0.00019680405068830717, "loss": 12.9799, "num_tokens": 1116345.0, "step": 156 }, { "epoch": 0.10971348707197764, "grad_norm": 1.4196966779586535, "learning_rate": 0.00019674703820374994, "loss": 13.065, "num_tokens": 1123338.0, "step": 157 }, { "epoch": 0.11041229909154437, "grad_norm": 1.3929707667138889, "learning_rate": 0.0001966895300883707, "loss": 12.8968, "num_tokens": 1130116.0, "step": 158 }, { "epoch": 0.1111111111111111, "grad_norm": 1.4996662049248695, "learning_rate": 0.00019663152663678099, "loss": 12.7327, "num_tokens": 1137314.0, "step": 159 }, { "epoch": 0.11180992313067785, "grad_norm": 1.369079829503796, "learning_rate": 0.0001965730281461299, "loss": 13.0669, "num_tokens": 1143586.0, "step": 160 }, { "epoch": 0.11250873515024458, "grad_norm": 1.4354897097544304, "learning_rate": 0.00019651403491610268, "loss": 12.9036, "num_tokens": 1150678.0, "step": 161 }, { "epoch": 0.11320754716981132, "grad_norm": 1.352851751508517, "learning_rate": 0.000196454547248919, "loss": 12.9177, "num_tokens": 1158316.0, "step": 162 }, { "epoch": 0.11390635918937805, "grad_norm": 1.3710181240525572, "learning_rate": 0.00019639456544933155, "loss": 13.0527, "num_tokens": 1165236.0, "step": 163 }, { "epoch": 0.1146051712089448, "grad_norm": 1.3068223937565793, "learning_rate": 0.0001963340898246245, "loss": 13.0438, "num_tokens": 1172589.0, "step": 164 }, { "epoch": 0.11530398322851153, "grad_norm": 1.346519471881747, "learning_rate": 0.00019627312068461184, "loss": 13.0086, "num_tokens": 1179343.0, "step": 165 }, { "epoch": 0.11600279524807827, "grad_norm": 1.446789885424941, "learning_rate": 0.00019621165834163572, "loss": 12.9444, "num_tokens": 1185779.0, "step": 166 }, { "epoch": 0.116701607267645, "grad_norm": 1.444884622592261, "learning_rate": 0.00019614970311056503, "loss": 13.2269, "num_tokens": 1192364.0, "step": 167 }, { "epoch": 0.11740041928721175, "grad_norm": 1.2891615557860914, "learning_rate": 0.00019608725530879375, "loss": 12.9378, "num_tokens": 1199385.0, "step": 168 }, { "epoch": 0.11809923130677848, "grad_norm": 1.3524191128592733, "learning_rate": 0.00019602431525623918, "loss": 13.1315, "num_tokens": 1206524.0, "step": 169 }, { "epoch": 0.1187980433263452, "grad_norm": 1.466681450368447, "learning_rate": 0.00019596088327534047, "loss": 12.8783, "num_tokens": 1213487.0, "step": 170 }, { "epoch": 0.11949685534591195, "grad_norm": 1.4000232584973735, "learning_rate": 0.0001958969596910568, "loss": 12.887, "num_tokens": 1220301.0, "step": 171 }, { "epoch": 0.12019566736547868, "grad_norm": 1.250384646500827, "learning_rate": 0.000195832544830866, "loss": 12.8478, "num_tokens": 1227778.0, "step": 172 }, { "epoch": 0.12089447938504543, "grad_norm": 1.5137312829748157, "learning_rate": 0.00019576763902476242, "loss": 12.9285, "num_tokens": 1234261.0, "step": 173 }, { "epoch": 0.12159329140461216, "grad_norm": 1.2791777538329325, "learning_rate": 0.0001957022426052558, "loss": 12.8265, "num_tokens": 1241757.0, "step": 174 }, { "epoch": 0.1222921034241789, "grad_norm": 1.5901299698985887, "learning_rate": 0.00019563635590736901, "loss": 12.8283, "num_tokens": 1248424.0, "step": 175 }, { "epoch": 0.12299091544374563, "grad_norm": 1.4082846461212457, "learning_rate": 0.00019556997926863673, "loss": 13.054, "num_tokens": 1255116.0, "step": 176 }, { "epoch": 0.12368972746331237, "grad_norm": 1.336316502475352, "learning_rate": 0.0001955031130291036, "loss": 12.8992, "num_tokens": 1262373.0, "step": 177 }, { "epoch": 0.1243885394828791, "grad_norm": 1.2337463600865273, "learning_rate": 0.0001954357575313224, "loss": 13.0176, "num_tokens": 1268591.0, "step": 178 }, { "epoch": 0.12508735150244585, "grad_norm": 1.3019629873776766, "learning_rate": 0.0001953679131203524, "loss": 12.8964, "num_tokens": 1276309.0, "step": 179 }, { "epoch": 0.12578616352201258, "grad_norm": 1.359626952499068, "learning_rate": 0.00019529958014375746, "loss": 12.8146, "num_tokens": 1283604.0, "step": 180 }, { "epoch": 0.1264849755415793, "grad_norm": 1.1515862878249248, "learning_rate": 0.0001952307589516045, "loss": 12.9346, "num_tokens": 1290423.0, "step": 181 }, { "epoch": 0.12718378756114604, "grad_norm": 1.3387432887733188, "learning_rate": 0.00019516144989646143, "loss": 13.0495, "num_tokens": 1297162.0, "step": 182 }, { "epoch": 0.1278825995807128, "grad_norm": 1.2279239336305443, "learning_rate": 0.00019509165333339551, "loss": 12.798, "num_tokens": 1304042.0, "step": 183 }, { "epoch": 0.12858141160027953, "grad_norm": 1.43782424281205, "learning_rate": 0.0001950213696199714, "loss": 12.7759, "num_tokens": 1311266.0, "step": 184 }, { "epoch": 0.12928022361984626, "grad_norm": 1.3241777082441377, "learning_rate": 0.00019495059911624958, "loss": 13.0, "num_tokens": 1317490.0, "step": 185 }, { "epoch": 0.129979035639413, "grad_norm": 1.4217170996471962, "learning_rate": 0.00019487934218478413, "loss": 12.9661, "num_tokens": 1324708.0, "step": 186 }, { "epoch": 0.13067784765897975, "grad_norm": 1.34114667099107, "learning_rate": 0.0001948075991906212, "loss": 12.7199, "num_tokens": 1331506.0, "step": 187 }, { "epoch": 0.13137665967854648, "grad_norm": 1.2485002323335002, "learning_rate": 0.00019473537050129704, "loss": 12.8164, "num_tokens": 1338737.0, "step": 188 }, { "epoch": 0.1320754716981132, "grad_norm": 1.2386002612681404, "learning_rate": 0.00019466265648683602, "loss": 12.8269, "num_tokens": 1346238.0, "step": 189 }, { "epoch": 0.13277428371767994, "grad_norm": 1.193140463955583, "learning_rate": 0.0001945894575197488, "loss": 12.8081, "num_tokens": 1353786.0, "step": 190 }, { "epoch": 0.1334730957372467, "grad_norm": 1.5872251747340467, "learning_rate": 0.00019451577397503053, "loss": 12.9692, "num_tokens": 1360969.0, "step": 191 }, { "epoch": 0.13417190775681342, "grad_norm": 1.2664833823468389, "learning_rate": 0.00019444160623015874, "loss": 13.0089, "num_tokens": 1368167.0, "step": 192 }, { "epoch": 0.13487071977638015, "grad_norm": 1.3919751858860443, "learning_rate": 0.00019436695466509152, "loss": 12.7512, "num_tokens": 1375092.0, "step": 193 }, { "epoch": 0.13556953179594688, "grad_norm": 1.7287267721473023, "learning_rate": 0.00019429181966226558, "loss": 12.583, "num_tokens": 1383015.0, "step": 194 }, { "epoch": 0.13626834381551362, "grad_norm": 1.3019053109225178, "learning_rate": 0.00019421620160659417, "loss": 12.6838, "num_tokens": 1389785.0, "step": 195 }, { "epoch": 0.13696715583508037, "grad_norm": 1.427516186724372, "learning_rate": 0.00019414010088546535, "loss": 12.6612, "num_tokens": 1397770.0, "step": 196 }, { "epoch": 0.1376659678546471, "grad_norm": 1.5627247304701708, "learning_rate": 0.00019406351788873972, "loss": 12.6157, "num_tokens": 1404674.0, "step": 197 }, { "epoch": 0.13836477987421383, "grad_norm": 1.3129375645013237, "learning_rate": 0.00019398645300874865, "loss": 12.9586, "num_tokens": 1411618.0, "step": 198 }, { "epoch": 0.13906359189378056, "grad_norm": 1.2811801120707738, "learning_rate": 0.00019390890664029204, "loss": 12.6556, "num_tokens": 1418834.0, "step": 199 }, { "epoch": 0.13976240391334732, "grad_norm": 1.2764420734679425, "learning_rate": 0.0001938308791806366, "loss": 12.8285, "num_tokens": 1425877.0, "step": 200 }, { "epoch": 0.14046121593291405, "grad_norm": 1.4621638790890457, "learning_rate": 0.0001937523710295136, "loss": 12.7881, "num_tokens": 1432515.0, "step": 201 }, { "epoch": 0.14116002795248078, "grad_norm": 1.2904825789726404, "learning_rate": 0.00019367338258911675, "loss": 12.7984, "num_tokens": 1439548.0, "step": 202 }, { "epoch": 0.1418588399720475, "grad_norm": 1.186391641270174, "learning_rate": 0.0001935939142641004, "loss": 12.634, "num_tokens": 1446322.0, "step": 203 }, { "epoch": 0.14255765199161424, "grad_norm": 1.1674776839727241, "learning_rate": 0.0001935139664615773, "loss": 12.8383, "num_tokens": 1453298.0, "step": 204 }, { "epoch": 0.143256464011181, "grad_norm": 1.2683482772147976, "learning_rate": 0.00019343353959111652, "loss": 12.6733, "num_tokens": 1460188.0, "step": 205 }, { "epoch": 0.14395527603074773, "grad_norm": 1.216633183257395, "learning_rate": 0.00019335263406474137, "loss": 12.7315, "num_tokens": 1467199.0, "step": 206 }, { "epoch": 0.14465408805031446, "grad_norm": 1.2669421899729538, "learning_rate": 0.00019327125029692735, "loss": 12.6752, "num_tokens": 1474116.0, "step": 207 }, { "epoch": 0.1453529000698812, "grad_norm": 1.1816422309516645, "learning_rate": 0.00019318938870459984, "loss": 12.7585, "num_tokens": 1480988.0, "step": 208 }, { "epoch": 0.14605171208944795, "grad_norm": 1.593879641391394, "learning_rate": 0.00019310704970713224, "loss": 12.5382, "num_tokens": 1487900.0, "step": 209 }, { "epoch": 0.14675052410901468, "grad_norm": 1.4309781033240252, "learning_rate": 0.0001930242337263436, "loss": 12.672, "num_tokens": 1495543.0, "step": 210 }, { "epoch": 0.1474493361285814, "grad_norm": 1.3370449709559875, "learning_rate": 0.00019294094118649653, "loss": 12.542, "num_tokens": 1502498.0, "step": 211 }, { "epoch": 0.14814814814814814, "grad_norm": 1.7429475926881917, "learning_rate": 0.00019285717251429506, "loss": 12.5638, "num_tokens": 1509580.0, "step": 212 }, { "epoch": 0.1488469601677149, "grad_norm": 1.3587122905893039, "learning_rate": 0.00019277292813888244, "loss": 12.689, "num_tokens": 1516376.0, "step": 213 }, { "epoch": 0.14954577218728163, "grad_norm": 1.4852579200171538, "learning_rate": 0.00019268820849183883, "loss": 12.7086, "num_tokens": 1523015.0, "step": 214 }, { "epoch": 0.15024458420684836, "grad_norm": 1.3259646669042442, "learning_rate": 0.00019260301400717938, "loss": 12.8351, "num_tokens": 1530696.0, "step": 215 }, { "epoch": 0.1509433962264151, "grad_norm": 1.432491095454994, "learning_rate": 0.00019251734512135157, "loss": 13.0261, "num_tokens": 1537893.0, "step": 216 }, { "epoch": 0.15164220824598182, "grad_norm": 1.5012452677994659, "learning_rate": 0.00019243120227323333, "loss": 12.5507, "num_tokens": 1545460.0, "step": 217 }, { "epoch": 0.15234102026554858, "grad_norm": 1.449104510035988, "learning_rate": 0.00019234458590413077, "loss": 12.6152, "num_tokens": 1552764.0, "step": 218 }, { "epoch": 0.1530398322851153, "grad_norm": 1.2430393467361687, "learning_rate": 0.0001922574964577757, "loss": 12.6997, "num_tokens": 1559826.0, "step": 219 }, { "epoch": 0.15373864430468204, "grad_norm": 1.1751985575150206, "learning_rate": 0.0001921699343803235, "loss": 12.7505, "num_tokens": 1567575.0, "step": 220 }, { "epoch": 0.15443745632424877, "grad_norm": 1.224128766914656, "learning_rate": 0.00019208190012035087, "loss": 12.6943, "num_tokens": 1574362.0, "step": 221 }, { "epoch": 0.15513626834381553, "grad_norm": 1.4730426557595246, "learning_rate": 0.00019199339412885347, "loss": 12.4975, "num_tokens": 1581335.0, "step": 222 }, { "epoch": 0.15583508036338226, "grad_norm": 1.245219689341052, "learning_rate": 0.00019190441685924353, "loss": 12.52, "num_tokens": 1588536.0, "step": 223 }, { "epoch": 0.15653389238294899, "grad_norm": 1.3566970317678715, "learning_rate": 0.00019181496876734776, "loss": 12.67, "num_tokens": 1595480.0, "step": 224 }, { "epoch": 0.15723270440251572, "grad_norm": 1.2125862352464616, "learning_rate": 0.0001917250503114048, "loss": 12.6823, "num_tokens": 1602858.0, "step": 225 }, { "epoch": 0.15793151642208245, "grad_norm": 1.2658499521661504, "learning_rate": 0.0001916346619520629, "loss": 12.4635, "num_tokens": 1610558.0, "step": 226 }, { "epoch": 0.1586303284416492, "grad_norm": 1.1986536717425549, "learning_rate": 0.00019154380415237768, "loss": 12.7901, "num_tokens": 1617490.0, "step": 227 }, { "epoch": 0.15932914046121593, "grad_norm": 1.2965597058989395, "learning_rate": 0.00019145247737780961, "loss": 12.6139, "num_tokens": 1624406.0, "step": 228 }, { "epoch": 0.16002795248078266, "grad_norm": 1.0789453042961938, "learning_rate": 0.00019136068209622183, "loss": 12.6052, "num_tokens": 1631539.0, "step": 229 }, { "epoch": 0.1607267645003494, "grad_norm": 1.2665628322889513, "learning_rate": 0.00019126841877787745, "loss": 12.5022, "num_tokens": 1638417.0, "step": 230 }, { "epoch": 0.16142557651991615, "grad_norm": 1.1381170673306098, "learning_rate": 0.00019117568789543742, "loss": 12.5689, "num_tokens": 1645769.0, "step": 231 }, { "epoch": 0.16212438853948288, "grad_norm": 1.2090318652793792, "learning_rate": 0.00019108248992395795, "loss": 12.6484, "num_tokens": 1653953.0, "step": 232 }, { "epoch": 0.1628232005590496, "grad_norm": 1.0651593003631354, "learning_rate": 0.0001909888253408882, "loss": 12.3967, "num_tokens": 1661632.0, "step": 233 }, { "epoch": 0.16352201257861634, "grad_norm": 1.289629674228255, "learning_rate": 0.00019089469462606765, "loss": 12.463, "num_tokens": 1668594.0, "step": 234 }, { "epoch": 0.1642208245981831, "grad_norm": 1.2696209740727902, "learning_rate": 0.00019080009826172387, "loss": 12.4893, "num_tokens": 1675137.0, "step": 235 }, { "epoch": 0.16491963661774983, "grad_norm": 1.295734136130956, "learning_rate": 0.00019070503673246982, "loss": 12.4947, "num_tokens": 1682132.0, "step": 236 }, { "epoch": 0.16561844863731656, "grad_norm": 1.186085096495133, "learning_rate": 0.0001906095105253016, "loss": 12.803, "num_tokens": 1688115.0, "step": 237 }, { "epoch": 0.1663172606568833, "grad_norm": 1.3469233034400552, "learning_rate": 0.00019051352012959568, "loss": 12.5784, "num_tokens": 1694968.0, "step": 238 }, { "epoch": 0.16701607267645002, "grad_norm": 1.1030332713184419, "learning_rate": 0.0001904170660371067, "loss": 12.7819, "num_tokens": 1702568.0, "step": 239 }, { "epoch": 0.16771488469601678, "grad_norm": 1.5256563801599314, "learning_rate": 0.00019032014874196474, "loss": 12.7243, "num_tokens": 1709560.0, "step": 240 }, { "epoch": 0.1684136967155835, "grad_norm": 1.0954893842833933, "learning_rate": 0.0001902227687406728, "loss": 12.5503, "num_tokens": 1717750.0, "step": 241 }, { "epoch": 0.16911250873515024, "grad_norm": 1.2813575980659235, "learning_rate": 0.0001901249265321044, "loss": 12.5196, "num_tokens": 1724736.0, "step": 242 }, { "epoch": 0.16981132075471697, "grad_norm": 1.2440300161189415, "learning_rate": 0.00019002662261750078, "loss": 12.5384, "num_tokens": 1732240.0, "step": 243 }, { "epoch": 0.17051013277428373, "grad_norm": 1.2129795823903677, "learning_rate": 0.00018992785750046863, "loss": 12.3593, "num_tokens": 1739543.0, "step": 244 }, { "epoch": 0.17120894479385046, "grad_norm": 1.1970012785599058, "learning_rate": 0.00018982863168697734, "loss": 12.587, "num_tokens": 1746459.0, "step": 245 }, { "epoch": 0.1719077568134172, "grad_norm": 1.3170795769984949, "learning_rate": 0.00018972894568535634, "loss": 12.705, "num_tokens": 1753478.0, "step": 246 }, { "epoch": 0.17260656883298392, "grad_norm": 1.1706949815019843, "learning_rate": 0.00018962880000629258, "loss": 12.3755, "num_tokens": 1760374.0, "step": 247 }, { "epoch": 0.17330538085255065, "grad_norm": 1.3094831534974631, "learning_rate": 0.0001895281951628281, "loss": 12.7245, "num_tokens": 1767809.0, "step": 248 }, { "epoch": 0.1740041928721174, "grad_norm": 1.044605208635119, "learning_rate": 0.000189427131670357, "loss": 12.6817, "num_tokens": 1775284.0, "step": 249 }, { "epoch": 0.17470300489168414, "grad_norm": 1.1554679184494987, "learning_rate": 0.00018932561004662312, "loss": 12.3913, "num_tokens": 1782896.0, "step": 250 }, { "epoch": 0.17540181691125087, "grad_norm": 1.3348111022458111, "learning_rate": 0.00018922363081171723, "loss": 12.5296, "num_tokens": 1790348.0, "step": 251 }, { "epoch": 0.1761006289308176, "grad_norm": 1.1120352895593362, "learning_rate": 0.0001891211944880746, "loss": 12.8437, "num_tokens": 1796663.0, "step": 252 }, { "epoch": 0.17679944095038436, "grad_norm": 1.5043333334071107, "learning_rate": 0.00018901830160047184, "loss": 12.4049, "num_tokens": 1804575.0, "step": 253 }, { "epoch": 0.1774982529699511, "grad_norm": 1.1726144920737067, "learning_rate": 0.0001889149526760248, "loss": 12.5944, "num_tokens": 1810818.0, "step": 254 }, { "epoch": 0.17819706498951782, "grad_norm": 1.2408621835003835, "learning_rate": 0.0001888111482441855, "loss": 12.4801, "num_tokens": 1817813.0, "step": 255 }, { "epoch": 0.17889587700908455, "grad_norm": 1.379904171536855, "learning_rate": 0.00018870688883673936, "loss": 12.5618, "num_tokens": 1824365.0, "step": 256 }, { "epoch": 0.1795946890286513, "grad_norm": 1.1840265644520456, "learning_rate": 0.00018860217498780285, "loss": 12.5814, "num_tokens": 1831336.0, "step": 257 }, { "epoch": 0.18029350104821804, "grad_norm": 1.3292220144802318, "learning_rate": 0.00018849700723382035, "loss": 12.385, "num_tokens": 1838657.0, "step": 258 }, { "epoch": 0.18099231306778477, "grad_norm": 1.211001471258756, "learning_rate": 0.0001883913861135617, "loss": 12.4776, "num_tokens": 1845965.0, "step": 259 }, { "epoch": 0.1816911250873515, "grad_norm": 1.2646806883821105, "learning_rate": 0.00018828531216811913, "loss": 12.7187, "num_tokens": 1852638.0, "step": 260 }, { "epoch": 0.18238993710691823, "grad_norm": 1.3716555848937888, "learning_rate": 0.00018817878594090494, "loss": 12.598, "num_tokens": 1859877.0, "step": 261 }, { "epoch": 0.18308874912648498, "grad_norm": 1.1204826654463, "learning_rate": 0.00018807180797764822, "loss": 12.5591, "num_tokens": 1866923.0, "step": 262 }, { "epoch": 0.18378756114605171, "grad_norm": 1.3449240572237333, "learning_rate": 0.00018796437882639242, "loss": 12.3795, "num_tokens": 1873292.0, "step": 263 }, { "epoch": 0.18448637316561844, "grad_norm": 1.1392064399539101, "learning_rate": 0.00018785649903749234, "loss": 12.43, "num_tokens": 1879744.0, "step": 264 }, { "epoch": 0.18518518518518517, "grad_norm": 1.390099384211355, "learning_rate": 0.00018774816916361137, "loss": 12.5114, "num_tokens": 1886064.0, "step": 265 }, { "epoch": 0.18588399720475193, "grad_norm": 1.1814107601446537, "learning_rate": 0.00018763938975971872, "loss": 12.3272, "num_tokens": 1893813.0, "step": 266 }, { "epoch": 0.18658280922431866, "grad_norm": 1.2818010177186572, "learning_rate": 0.0001875301613830865, "loss": 12.8937, "num_tokens": 1901211.0, "step": 267 }, { "epoch": 0.1872816212438854, "grad_norm": 1.400621329298869, "learning_rate": 0.00018742048459328682, "loss": 12.51, "num_tokens": 1907987.0, "step": 268 }, { "epoch": 0.18798043326345212, "grad_norm": 1.2330141005937425, "learning_rate": 0.00018731035995218914, "loss": 12.7508, "num_tokens": 1915853.0, "step": 269 }, { "epoch": 0.18867924528301888, "grad_norm": 1.37904712394343, "learning_rate": 0.00018719978802395705, "loss": 12.2983, "num_tokens": 1923310.0, "step": 270 }, { "epoch": 0.1893780573025856, "grad_norm": 1.2467103995837752, "learning_rate": 0.0001870887693750458, "loss": 12.3118, "num_tokens": 1930169.0, "step": 271 }, { "epoch": 0.19007686932215234, "grad_norm": 1.1402621781657731, "learning_rate": 0.00018697730457419893, "loss": 12.452, "num_tokens": 1937617.0, "step": 272 }, { "epoch": 0.19077568134171907, "grad_norm": 1.215321051813323, "learning_rate": 0.00018686539419244578, "loss": 12.4004, "num_tokens": 1944358.0, "step": 273 }, { "epoch": 0.1914744933612858, "grad_norm": 1.3080313421863534, "learning_rate": 0.0001867530388030983, "loss": 12.4736, "num_tokens": 1951629.0, "step": 274 }, { "epoch": 0.19217330538085256, "grad_norm": 1.1294773048320363, "learning_rate": 0.00018664023898174817, "loss": 12.4207, "num_tokens": 1958779.0, "step": 275 }, { "epoch": 0.1928721174004193, "grad_norm": 1.2033795977734214, "learning_rate": 0.00018652699530626398, "loss": 12.3521, "num_tokens": 1966253.0, "step": 276 }, { "epoch": 0.19357092941998602, "grad_norm": 1.152683402138601, "learning_rate": 0.00018641330835678804, "loss": 12.5124, "num_tokens": 1973038.0, "step": 277 }, { "epoch": 0.19426974143955275, "grad_norm": 1.008307851183863, "learning_rate": 0.00018629917871573366, "loss": 12.621, "num_tokens": 1980735.0, "step": 278 }, { "epoch": 0.1949685534591195, "grad_norm": 1.2051981110164842, "learning_rate": 0.0001861846069677819, "loss": 12.4481, "num_tokens": 1988250.0, "step": 279 }, { "epoch": 0.19566736547868624, "grad_norm": 1.0838581398000466, "learning_rate": 0.00018606959369987883, "loss": 12.4857, "num_tokens": 1995184.0, "step": 280 }, { "epoch": 0.19636617749825297, "grad_norm": 1.0917822922153184, "learning_rate": 0.00018595413950123235, "loss": 12.3827, "num_tokens": 2001901.0, "step": 281 }, { "epoch": 0.1970649895178197, "grad_norm": 1.1018335599813311, "learning_rate": 0.00018583824496330923, "loss": 12.685, "num_tokens": 2009242.0, "step": 282 }, { "epoch": 0.19776380153738643, "grad_norm": 1.2127896142017403, "learning_rate": 0.00018572191067983216, "loss": 12.5809, "num_tokens": 2016167.0, "step": 283 }, { "epoch": 0.1984626135569532, "grad_norm": 1.348517937802105, "learning_rate": 0.00018560513724677643, "loss": 12.4575, "num_tokens": 2023059.0, "step": 284 }, { "epoch": 0.19916142557651992, "grad_norm": 1.1090341293586254, "learning_rate": 0.00018548792526236732, "loss": 12.3595, "num_tokens": 2030297.0, "step": 285 }, { "epoch": 0.19986023759608665, "grad_norm": 1.342568850797832, "learning_rate": 0.00018537027532707662, "loss": 12.5375, "num_tokens": 2036674.0, "step": 286 }, { "epoch": 0.20055904961565338, "grad_norm": 1.1060237909798654, "learning_rate": 0.00018525218804361977, "loss": 12.2509, "num_tokens": 2043766.0, "step": 287 }, { "epoch": 0.20125786163522014, "grad_norm": 1.2327170579459596, "learning_rate": 0.00018513366401695276, "loss": 12.5329, "num_tokens": 2051302.0, "step": 288 }, { "epoch": 0.20195667365478687, "grad_norm": 1.1508656944870255, "learning_rate": 0.00018501470385426892, "loss": 12.3698, "num_tokens": 2058562.0, "step": 289 }, { "epoch": 0.2026554856743536, "grad_norm": 1.1595543958178998, "learning_rate": 0.00018489530816499596, "loss": 12.4003, "num_tokens": 2065605.0, "step": 290 }, { "epoch": 0.20335429769392033, "grad_norm": 1.3192276014407542, "learning_rate": 0.00018477547756079276, "loss": 12.4305, "num_tokens": 2072590.0, "step": 291 }, { "epoch": 0.20405310971348709, "grad_norm": 1.1135384173094371, "learning_rate": 0.0001846552126555462, "loss": 12.5111, "num_tokens": 2080039.0, "step": 292 }, { "epoch": 0.20475192173305382, "grad_norm": 1.3261734695119327, "learning_rate": 0.00018453451406536816, "loss": 12.6033, "num_tokens": 2086481.0, "step": 293 }, { "epoch": 0.20545073375262055, "grad_norm": 1.1201785620737152, "learning_rate": 0.00018441338240859215, "loss": 12.5272, "num_tokens": 2093192.0, "step": 294 }, { "epoch": 0.20614954577218728, "grad_norm": 1.1183523903788608, "learning_rate": 0.00018429181830577034, "loss": 12.3886, "num_tokens": 2100572.0, "step": 295 }, { "epoch": 0.206848357791754, "grad_norm": 1.2121568257432795, "learning_rate": 0.00018416982237967028, "loss": 12.5291, "num_tokens": 2107911.0, "step": 296 }, { "epoch": 0.20754716981132076, "grad_norm": 1.162263554260533, "learning_rate": 0.00018404739525527174, "loss": 12.334, "num_tokens": 2115264.0, "step": 297 }, { "epoch": 0.2082459818308875, "grad_norm": 1.1525516113964571, "learning_rate": 0.0001839245375597635, "loss": 12.2785, "num_tokens": 2122114.0, "step": 298 }, { "epoch": 0.20894479385045422, "grad_norm": 1.186494728566964, "learning_rate": 0.0001838012499225401, "loss": 12.3004, "num_tokens": 2129186.0, "step": 299 }, { "epoch": 0.20964360587002095, "grad_norm": 1.2912767009253392, "learning_rate": 0.00018367753297519873, "loss": 12.5505, "num_tokens": 2136056.0, "step": 300 }, { "epoch": 0.2103424178895877, "grad_norm": 1.1093100716237951, "learning_rate": 0.00018355338735153587, "loss": 12.3422, "num_tokens": 2143135.0, "step": 301 }, { "epoch": 0.21104122990915444, "grad_norm": 1.1832901006966179, "learning_rate": 0.00018342881368754404, "loss": 12.4585, "num_tokens": 2149855.0, "step": 302 }, { "epoch": 0.21174004192872117, "grad_norm": 1.304549193251159, "learning_rate": 0.00018330381262140864, "loss": 12.6785, "num_tokens": 2156629.0, "step": 303 }, { "epoch": 0.2124388539482879, "grad_norm": 1.02534115696337, "learning_rate": 0.00018317838479350472, "loss": 12.5892, "num_tokens": 2163993.0, "step": 304 }, { "epoch": 0.21313766596785463, "grad_norm": 1.2301685746042796, "learning_rate": 0.0001830525308463934, "loss": 12.4886, "num_tokens": 2170517.0, "step": 305 }, { "epoch": 0.2138364779874214, "grad_norm": 1.075671835098473, "learning_rate": 0.00018292625142481906, "loss": 12.4849, "num_tokens": 2177805.0, "step": 306 }, { "epoch": 0.21453529000698812, "grad_norm": 1.1292457741896016, "learning_rate": 0.00018279954717570553, "loss": 12.467, "num_tokens": 2184824.0, "step": 307 }, { "epoch": 0.21523410202655485, "grad_norm": 1.052027926129132, "learning_rate": 0.00018267241874815314, "loss": 12.1912, "num_tokens": 2192640.0, "step": 308 }, { "epoch": 0.21593291404612158, "grad_norm": 1.1526308912958079, "learning_rate": 0.00018254486679343516, "loss": 12.4349, "num_tokens": 2199963.0, "step": 309 }, { "epoch": 0.21663172606568834, "grad_norm": 1.0944974834134447, "learning_rate": 0.00018241689196499475, "loss": 12.425, "num_tokens": 2207388.0, "step": 310 }, { "epoch": 0.21733053808525507, "grad_norm": 1.2455487452556113, "learning_rate": 0.00018228849491844129, "loss": 12.2816, "num_tokens": 2214115.0, "step": 311 }, { "epoch": 0.2180293501048218, "grad_norm": 1.1660092991408635, "learning_rate": 0.00018215967631154717, "loss": 12.3521, "num_tokens": 2221801.0, "step": 312 }, { "epoch": 0.21872816212438853, "grad_norm": 1.2793656576637555, "learning_rate": 0.00018203043680424448, "loss": 12.4023, "num_tokens": 2229449.0, "step": 313 }, { "epoch": 0.2194269741439553, "grad_norm": 1.2171298749865551, "learning_rate": 0.00018190077705862155, "loss": 12.563, "num_tokens": 2236249.0, "step": 314 }, { "epoch": 0.22012578616352202, "grad_norm": 1.3096258411240842, "learning_rate": 0.00018177069773891953, "loss": 12.3427, "num_tokens": 2243354.0, "step": 315 }, { "epoch": 0.22082459818308875, "grad_norm": 1.2231138275461113, "learning_rate": 0.00018164019951152902, "loss": 12.4167, "num_tokens": 2249837.0, "step": 316 }, { "epoch": 0.22152341020265548, "grad_norm": 1.6783299447406501, "learning_rate": 0.00018150928304498675, "loss": 12.3678, "num_tokens": 2256520.0, "step": 317 }, { "epoch": 0.2222222222222222, "grad_norm": 1.2789357635089038, "learning_rate": 0.00018137794900997201, "loss": 12.5104, "num_tokens": 2263145.0, "step": 318 }, { "epoch": 0.22292103424178897, "grad_norm": 1.3687005934905156, "learning_rate": 0.0001812461980793033, "loss": 12.4424, "num_tokens": 2269862.0, "step": 319 }, { "epoch": 0.2236198462613557, "grad_norm": 1.0318168591263561, "learning_rate": 0.0001811140309279348, "loss": 12.5401, "num_tokens": 2276820.0, "step": 320 }, { "epoch": 0.22431865828092243, "grad_norm": 1.3080904924929653, "learning_rate": 0.00018098144823295304, "loss": 12.2369, "num_tokens": 2284748.0, "step": 321 }, { "epoch": 0.22501747030048916, "grad_norm": 1.1758761094017982, "learning_rate": 0.00018084845067357336, "loss": 12.3439, "num_tokens": 2292140.0, "step": 322 }, { "epoch": 0.22571628232005592, "grad_norm": 1.0519584181435369, "learning_rate": 0.00018071503893113638, "loss": 12.5563, "num_tokens": 2300108.0, "step": 323 }, { "epoch": 0.22641509433962265, "grad_norm": 1.3244204464017082, "learning_rate": 0.00018058121368910458, "loss": 12.1751, "num_tokens": 2307811.0, "step": 324 }, { "epoch": 0.22711390635918938, "grad_norm": 1.3441901462036563, "learning_rate": 0.00018044697563305876, "loss": 12.2075, "num_tokens": 2315254.0, "step": 325 }, { "epoch": 0.2278127183787561, "grad_norm": 1.1698154655629684, "learning_rate": 0.00018031232545069468, "loss": 12.3967, "num_tokens": 2323116.0, "step": 326 }, { "epoch": 0.22851153039832284, "grad_norm": 1.2327000749220458, "learning_rate": 0.00018017726383181925, "loss": 12.4241, "num_tokens": 2330812.0, "step": 327 }, { "epoch": 0.2292103424178896, "grad_norm": 1.1494561411366055, "learning_rate": 0.0001800417914683471, "loss": 12.3747, "num_tokens": 2338451.0, "step": 328 }, { "epoch": 0.22990915443745633, "grad_norm": 1.035155095769357, "learning_rate": 0.0001799059090542974, "loss": 12.4331, "num_tokens": 2346026.0, "step": 329 }, { "epoch": 0.23060796645702306, "grad_norm": 1.036281954431926, "learning_rate": 0.00017976961728578963, "loss": 12.2998, "num_tokens": 2353605.0, "step": 330 }, { "epoch": 0.23130677847658979, "grad_norm": 0.9491052072879148, "learning_rate": 0.00017963291686104053, "loss": 12.2493, "num_tokens": 2360509.0, "step": 331 }, { "epoch": 0.23200559049615654, "grad_norm": 1.0796880075257131, "learning_rate": 0.00017949580848036046, "loss": 12.4595, "num_tokens": 2367035.0, "step": 332 }, { "epoch": 0.23270440251572327, "grad_norm": 1.084018095842082, "learning_rate": 0.00017935829284614952, "loss": 12.3212, "num_tokens": 2373702.0, "step": 333 }, { "epoch": 0.23340321453529, "grad_norm": 1.1049395371201602, "learning_rate": 0.00017922037066289432, "loss": 12.4234, "num_tokens": 2380174.0, "step": 334 }, { "epoch": 0.23410202655485673, "grad_norm": 1.0508677886350823, "learning_rate": 0.0001790820426371641, "loss": 12.3334, "num_tokens": 2387052.0, "step": 335 }, { "epoch": 0.2348008385744235, "grad_norm": 1.0785045718409343, "learning_rate": 0.00017894330947760726, "loss": 12.1805, "num_tokens": 2393866.0, "step": 336 }, { "epoch": 0.23549965059399022, "grad_norm": 0.9921298050576933, "learning_rate": 0.0001788041718949477, "loss": 12.2666, "num_tokens": 2401040.0, "step": 337 }, { "epoch": 0.23619846261355695, "grad_norm": 1.0660998539990059, "learning_rate": 0.00017866463060198115, "loss": 12.4653, "num_tokens": 2408627.0, "step": 338 }, { "epoch": 0.23689727463312368, "grad_norm": 0.995380832073994, "learning_rate": 0.00017852468631357146, "loss": 12.3646, "num_tokens": 2415390.0, "step": 339 }, { "epoch": 0.2375960866526904, "grad_norm": 1.075956806450121, "learning_rate": 0.00017838433974664712, "loss": 12.221, "num_tokens": 2422275.0, "step": 340 }, { "epoch": 0.23829489867225717, "grad_norm": 1.0139961269153555, "learning_rate": 0.00017824359162019738, "loss": 12.2838, "num_tokens": 2429408.0, "step": 341 }, { "epoch": 0.2389937106918239, "grad_norm": 1.1457365908463537, "learning_rate": 0.00017810244265526875, "loss": 12.3507, "num_tokens": 2436362.0, "step": 342 }, { "epoch": 0.23969252271139063, "grad_norm": 1.1147345258046557, "learning_rate": 0.00017796089357496108, "loss": 12.4508, "num_tokens": 2442668.0, "step": 343 }, { "epoch": 0.24039133473095736, "grad_norm": 1.0930232074967394, "learning_rate": 0.0001778189451044242, "loss": 12.4069, "num_tokens": 2449013.0, "step": 344 }, { "epoch": 0.24109014675052412, "grad_norm": 1.0438980257850956, "learning_rate": 0.00017767659797085375, "loss": 12.3327, "num_tokens": 2455131.0, "step": 345 }, { "epoch": 0.24178895877009085, "grad_norm": 1.073359782419257, "learning_rate": 0.000177533852903488, "loss": 12.4483, "num_tokens": 2462060.0, "step": 346 }, { "epoch": 0.24248777078965758, "grad_norm": 0.9628449472719576, "learning_rate": 0.0001773907106336035, "loss": 12.4128, "num_tokens": 2469590.0, "step": 347 }, { "epoch": 0.2431865828092243, "grad_norm": 1.0691074828768792, "learning_rate": 0.0001772471718945119, "loss": 12.1702, "num_tokens": 2476488.0, "step": 348 }, { "epoch": 0.24388539482879107, "grad_norm": 0.9952237415704821, "learning_rate": 0.0001771032374215558, "loss": 12.2478, "num_tokens": 2484657.0, "step": 349 }, { "epoch": 0.2445842068483578, "grad_norm": 1.0036029456137867, "learning_rate": 0.00017695890795210517, "loss": 12.494, "num_tokens": 2491489.0, "step": 350 }, { "epoch": 0.24528301886792453, "grad_norm": 1.1069581308411622, "learning_rate": 0.00017681418422555356, "loss": 12.4616, "num_tokens": 2498631.0, "step": 351 }, { "epoch": 0.24598183088749126, "grad_norm": 0.9149619351800009, "learning_rate": 0.00017666906698331428, "loss": 12.0364, "num_tokens": 2506017.0, "step": 352 }, { "epoch": 0.246680642907058, "grad_norm": 1.108978780022514, "learning_rate": 0.00017652355696881652, "loss": 12.3213, "num_tokens": 2513168.0, "step": 353 }, { "epoch": 0.24737945492662475, "grad_norm": 1.0744303402500144, "learning_rate": 0.0001763776549275017, "loss": 12.2812, "num_tokens": 2520301.0, "step": 354 }, { "epoch": 0.24807826694619148, "grad_norm": 1.1831634927540147, "learning_rate": 0.00017623136160681963, "loss": 12.369, "num_tokens": 2527172.0, "step": 355 }, { "epoch": 0.2487770789657582, "grad_norm": 1.052250021391116, "learning_rate": 0.00017608467775622445, "loss": 12.425, "num_tokens": 2534008.0, "step": 356 }, { "epoch": 0.24947589098532494, "grad_norm": 1.0564729282720158, "learning_rate": 0.00017593760412717117, "loss": 12.3877, "num_tokens": 2541533.0, "step": 357 }, { "epoch": 0.2501747030048917, "grad_norm": 1.1910914093601987, "learning_rate": 0.0001757901414731115, "loss": 12.4473, "num_tokens": 2548063.0, "step": 358 }, { "epoch": 0.2508735150244584, "grad_norm": 1.0694263327013265, "learning_rate": 0.00017564229054949006, "loss": 12.2255, "num_tokens": 2555500.0, "step": 359 }, { "epoch": 0.25157232704402516, "grad_norm": 1.2542503960072244, "learning_rate": 0.0001754940521137407, "loss": 12.2771, "num_tokens": 2562431.0, "step": 360 }, { "epoch": 0.2522711390635919, "grad_norm": 1.1158939880015024, "learning_rate": 0.0001753454269252824, "loss": 12.4299, "num_tokens": 2569063.0, "step": 361 }, { "epoch": 0.2529699510831586, "grad_norm": 1.0409199378943457, "learning_rate": 0.00017519641574551546, "loss": 12.4908, "num_tokens": 2576257.0, "step": 362 }, { "epoch": 0.25366876310272535, "grad_norm": 1.2189765723091746, "learning_rate": 0.0001750470193378176, "loss": 12.4082, "num_tokens": 2582076.0, "step": 363 }, { "epoch": 0.2543675751222921, "grad_norm": 1.0384789257844729, "learning_rate": 0.00017489723846754002, "loss": 12.5168, "num_tokens": 2588634.0, "step": 364 }, { "epoch": 0.25506638714185886, "grad_norm": 1.0719298770534975, "learning_rate": 0.0001747470739020036, "loss": 12.1278, "num_tokens": 2595988.0, "step": 365 }, { "epoch": 0.2557651991614256, "grad_norm": 1.1294326548073568, "learning_rate": 0.00017459652641049474, "loss": 12.3269, "num_tokens": 2602882.0, "step": 366 }, { "epoch": 0.2564640111809923, "grad_norm": 1.173317995313158, "learning_rate": 0.0001744455967642616, "loss": 12.2611, "num_tokens": 2610428.0, "step": 367 }, { "epoch": 0.25716282320055905, "grad_norm": 1.0410607823903442, "learning_rate": 0.00017429428573651024, "loss": 12.0204, "num_tokens": 2617338.0, "step": 368 }, { "epoch": 0.2578616352201258, "grad_norm": 1.1884465012383119, "learning_rate": 0.00017414259410240026, "loss": 12.226, "num_tokens": 2624172.0, "step": 369 }, { "epoch": 0.2585604472396925, "grad_norm": 1.0099285124263113, "learning_rate": 0.0001739905226390413, "loss": 12.2136, "num_tokens": 2631185.0, "step": 370 }, { "epoch": 0.25925925925925924, "grad_norm": 1.121309544418593, "learning_rate": 0.0001738380721254888, "loss": 12.349, "num_tokens": 2638925.0, "step": 371 }, { "epoch": 0.259958071278826, "grad_norm": 1.0686531979747818, "learning_rate": 0.00017368524334273998, "loss": 12.2824, "num_tokens": 2645812.0, "step": 372 }, { "epoch": 0.2606568832983927, "grad_norm": 1.0070897139519819, "learning_rate": 0.00017353203707373, "loss": 12.4482, "num_tokens": 2652935.0, "step": 373 }, { "epoch": 0.2613556953179595, "grad_norm": 1.1557578122673882, "learning_rate": 0.00017337845410332782, "loss": 12.289, "num_tokens": 2659882.0, "step": 374 }, { "epoch": 0.2620545073375262, "grad_norm": 1.0827441091133332, "learning_rate": 0.0001732244952183323, "loss": 12.2165, "num_tokens": 2667013.0, "step": 375 }, { "epoch": 0.26275331935709295, "grad_norm": 0.9155364485477532, "learning_rate": 0.000173070161207468, "loss": 12.2606, "num_tokens": 2675131.0, "step": 376 }, { "epoch": 0.2634521313766597, "grad_norm": 1.0579891464839146, "learning_rate": 0.00017291545286138126, "loss": 12.3224, "num_tokens": 2681743.0, "step": 377 }, { "epoch": 0.2641509433962264, "grad_norm": 0.9495150325149874, "learning_rate": 0.00017276037097263612, "loss": 12.2971, "num_tokens": 2688355.0, "step": 378 }, { "epoch": 0.26484975541579314, "grad_norm": 1.0942465116250388, "learning_rate": 0.00017260491633571033, "loss": 12.2816, "num_tokens": 2695315.0, "step": 379 }, { "epoch": 0.2655485674353599, "grad_norm": 0.9236055818886025, "learning_rate": 0.0001724490897469911, "loss": 12.154, "num_tokens": 2703258.0, "step": 380 }, { "epoch": 0.2662473794549266, "grad_norm": 1.0922583573424403, "learning_rate": 0.00017229289200477123, "loss": 12.1475, "num_tokens": 2710326.0, "step": 381 }, { "epoch": 0.2669461914744934, "grad_norm": 0.9557123291968079, "learning_rate": 0.00017213632390924486, "loss": 12.2181, "num_tokens": 2716825.0, "step": 382 }, { "epoch": 0.2676450034940601, "grad_norm": 1.0051833679431164, "learning_rate": 0.00017197938626250348, "loss": 12.265, "num_tokens": 2723868.0, "step": 383 }, { "epoch": 0.26834381551362685, "grad_norm": 0.941677244029484, "learning_rate": 0.00017182207986853176, "loss": 12.3948, "num_tokens": 2730711.0, "step": 384 }, { "epoch": 0.2690426275331936, "grad_norm": 1.0600159793133541, "learning_rate": 0.00017166440553320337, "loss": 12.1216, "num_tokens": 2737540.0, "step": 385 }, { "epoch": 0.2697414395527603, "grad_norm": 1.0303448078848036, "learning_rate": 0.0001715063640642771, "loss": 12.2437, "num_tokens": 2744595.0, "step": 386 }, { "epoch": 0.27044025157232704, "grad_norm": 1.0844807651487938, "learning_rate": 0.00017134795627139236, "loss": 12.324, "num_tokens": 2751663.0, "step": 387 }, { "epoch": 0.27113906359189377, "grad_norm": 0.9321137382225717, "learning_rate": 0.00017118918296606537, "loss": 12.2311, "num_tokens": 2759081.0, "step": 388 }, { "epoch": 0.2718378756114605, "grad_norm": 0.99797444758753, "learning_rate": 0.00017103004496168473, "loss": 12.2611, "num_tokens": 2766249.0, "step": 389 }, { "epoch": 0.27253668763102723, "grad_norm": 1.0201352413756293, "learning_rate": 0.0001708705430735075, "loss": 12.3475, "num_tokens": 2773320.0, "step": 390 }, { "epoch": 0.273235499650594, "grad_norm": 0.9230415417655204, "learning_rate": 0.00017071067811865476, "loss": 12.2765, "num_tokens": 2780968.0, "step": 391 }, { "epoch": 0.27393431167016075, "grad_norm": 1.058464339332179, "learning_rate": 0.0001705504509161077, "loss": 12.2317, "num_tokens": 2788319.0, "step": 392 }, { "epoch": 0.2746331236897275, "grad_norm": 0.935810740189098, "learning_rate": 0.00017038986228670323, "loss": 12.3109, "num_tokens": 2796120.0, "step": 393 }, { "epoch": 0.2753319357092942, "grad_norm": 1.0048031215221804, "learning_rate": 0.00017022891305312987, "loss": 12.1342, "num_tokens": 2803198.0, "step": 394 }, { "epoch": 0.27603074772886094, "grad_norm": 1.0859371888536073, "learning_rate": 0.00017006760403992337, "loss": 12.1707, "num_tokens": 2809978.0, "step": 395 }, { "epoch": 0.27672955974842767, "grad_norm": 1.057763218829223, "learning_rate": 0.00016990593607346276, "loss": 12.1999, "num_tokens": 2816932.0, "step": 396 }, { "epoch": 0.2774283717679944, "grad_norm": 0.9696692663519124, "learning_rate": 0.00016974390998196595, "loss": 12.1105, "num_tokens": 2824857.0, "step": 397 }, { "epoch": 0.2781271837875611, "grad_norm": 1.0912831454596232, "learning_rate": 0.00016958152659548548, "loss": 12.161, "num_tokens": 2832029.0, "step": 398 }, { "epoch": 0.27882599580712786, "grad_norm": 1.0272883066524952, "learning_rate": 0.00016941878674590425, "loss": 12.1335, "num_tokens": 2839252.0, "step": 399 }, { "epoch": 0.27952480782669464, "grad_norm": 0.9719847308428423, "learning_rate": 0.00016925569126693136, "loss": 12.1187, "num_tokens": 2847177.0, "step": 400 }, { "epoch": 0.2802236198462614, "grad_norm": 0.9720490101732504, "learning_rate": 0.0001690922409940978, "loss": 12.2379, "num_tokens": 2854561.0, "step": 401 }, { "epoch": 0.2809224318658281, "grad_norm": 1.0410666937197113, "learning_rate": 0.00016892843676475212, "loss": 12.0956, "num_tokens": 2861654.0, "step": 402 }, { "epoch": 0.28162124388539483, "grad_norm": 1.013809534538909, "learning_rate": 0.00016876427941805622, "loss": 12.1831, "num_tokens": 2868359.0, "step": 403 }, { "epoch": 0.28232005590496156, "grad_norm": 0.8592336007443362, "learning_rate": 0.00016859976979498092, "loss": 12.1739, "num_tokens": 2875713.0, "step": 404 }, { "epoch": 0.2830188679245283, "grad_norm": 0.9975671907769668, "learning_rate": 0.00016843490873830178, "loss": 12.1526, "num_tokens": 2882164.0, "step": 405 }, { "epoch": 0.283717679944095, "grad_norm": 1.046642053431646, "learning_rate": 0.00016826969709259477, "loss": 12.4811, "num_tokens": 2888466.0, "step": 406 }, { "epoch": 0.28441649196366176, "grad_norm": 0.9951638046201554, "learning_rate": 0.0001681041357042319, "loss": 12.1391, "num_tokens": 2896054.0, "step": 407 }, { "epoch": 0.2851153039832285, "grad_norm": 1.0120154348914012, "learning_rate": 0.0001679382254213768, "loss": 12.2303, "num_tokens": 2903381.0, "step": 408 }, { "epoch": 0.28581411600279527, "grad_norm": 0.971103244517684, "learning_rate": 0.00016777196709398065, "loss": 12.3491, "num_tokens": 2910764.0, "step": 409 }, { "epoch": 0.286512928022362, "grad_norm": 1.008656433224794, "learning_rate": 0.00016760536157377754, "loss": 12.1785, "num_tokens": 2917852.0, "step": 410 }, { "epoch": 0.28721174004192873, "grad_norm": 1.0230579216157252, "learning_rate": 0.00016743840971428017, "loss": 12.0658, "num_tokens": 2925426.0, "step": 411 }, { "epoch": 0.28791055206149546, "grad_norm": 1.020310009862771, "learning_rate": 0.00016727111237077559, "loss": 12.223, "num_tokens": 2932534.0, "step": 412 }, { "epoch": 0.2886093640810622, "grad_norm": 0.9125469963918674, "learning_rate": 0.00016710347040032076, "loss": 12.2343, "num_tokens": 2939544.0, "step": 413 }, { "epoch": 0.2893081761006289, "grad_norm": 1.152646459857617, "learning_rate": 0.0001669354846617381, "loss": 12.1026, "num_tokens": 2946773.0, "step": 414 }, { "epoch": 0.29000698812019565, "grad_norm": 0.9487006632604599, "learning_rate": 0.00016676715601561117, "loss": 12.2828, "num_tokens": 2954251.0, "step": 415 }, { "epoch": 0.2907058001397624, "grad_norm": 1.0339989063419985, "learning_rate": 0.00016659848532428023, "loss": 12.4354, "num_tokens": 2961656.0, "step": 416 }, { "epoch": 0.2914046121593291, "grad_norm": 0.9810692884344294, "learning_rate": 0.00016642947345183774, "loss": 12.2144, "num_tokens": 2969059.0, "step": 417 }, { "epoch": 0.2921034241788959, "grad_norm": 1.041382797288484, "learning_rate": 0.0001662601212641242, "loss": 12.1039, "num_tokens": 2977234.0, "step": 418 }, { "epoch": 0.29280223619846263, "grad_norm": 0.9394420025529445, "learning_rate": 0.00016609042962872333, "loss": 12.1398, "num_tokens": 2984854.0, "step": 419 }, { "epoch": 0.29350104821802936, "grad_norm": 1.0586381513052323, "learning_rate": 0.00016592039941495804, "loss": 11.9381, "num_tokens": 2991537.0, "step": 420 }, { "epoch": 0.2941998602375961, "grad_norm": 0.9975034902351064, "learning_rate": 0.00016575003149388548, "loss": 12.1831, "num_tokens": 2998088.0, "step": 421 }, { "epoch": 0.2948986722571628, "grad_norm": 1.0160628218220324, "learning_rate": 0.00016557932673829311, "loss": 12.2999, "num_tokens": 3004876.0, "step": 422 }, { "epoch": 0.29559748427672955, "grad_norm": 1.093919382310838, "learning_rate": 0.0001654082860226939, "loss": 12.1683, "num_tokens": 3011520.0, "step": 423 }, { "epoch": 0.2962962962962963, "grad_norm": 0.9568683146100065, "learning_rate": 0.00016523691022332185, "loss": 12.2034, "num_tokens": 3018294.0, "step": 424 }, { "epoch": 0.296995108315863, "grad_norm": 1.133923086552664, "learning_rate": 0.00016506520021812766, "loss": 11.8777, "num_tokens": 3026301.0, "step": 425 }, { "epoch": 0.2976939203354298, "grad_norm": 0.9273115968347424, "learning_rate": 0.00016489315688677416, "loss": 12.1574, "num_tokens": 3034166.0, "step": 426 }, { "epoch": 0.2983927323549965, "grad_norm": 0.8779205899305784, "learning_rate": 0.00016472078111063175, "loss": 12.0526, "num_tokens": 3041930.0, "step": 427 }, { "epoch": 0.29909154437456326, "grad_norm": 1.0184105137011932, "learning_rate": 0.00016454807377277398, "loss": 12.2997, "num_tokens": 3048767.0, "step": 428 }, { "epoch": 0.29979035639413, "grad_norm": 0.961478703601268, "learning_rate": 0.00016437503575797297, "loss": 12.0761, "num_tokens": 3055753.0, "step": 429 }, { "epoch": 0.3004891684136967, "grad_norm": 0.9946551332938962, "learning_rate": 0.00016420166795269475, "loss": 12.3698, "num_tokens": 3063120.0, "step": 430 }, { "epoch": 0.30118798043326345, "grad_norm": 0.9437621585892487, "learning_rate": 0.00016402797124509508, "loss": 12.235, "num_tokens": 3070742.0, "step": 431 }, { "epoch": 0.3018867924528302, "grad_norm": 1.0369518319538322, "learning_rate": 0.00016385394652501445, "loss": 12.1428, "num_tokens": 3078056.0, "step": 432 }, { "epoch": 0.3025856044723969, "grad_norm": 0.9296598663042578, "learning_rate": 0.00016367959468397393, "loss": 12.1666, "num_tokens": 3085214.0, "step": 433 }, { "epoch": 0.30328441649196364, "grad_norm": 0.9998913789930738, "learning_rate": 0.00016350491661517032, "loss": 12.199, "num_tokens": 3092548.0, "step": 434 }, { "epoch": 0.3039832285115304, "grad_norm": 0.9317649079361281, "learning_rate": 0.00016332991321347167, "loss": 12.0497, "num_tokens": 3100393.0, "step": 435 }, { "epoch": 0.30468204053109715, "grad_norm": 1.0873578371369284, "learning_rate": 0.0001631545853754127, "loss": 12.396, "num_tokens": 3106325.0, "step": 436 }, { "epoch": 0.3053808525506639, "grad_norm": 1.111865839622246, "learning_rate": 0.0001629789339991902, "loss": 12.2868, "num_tokens": 3113193.0, "step": 437 }, { "epoch": 0.3060796645702306, "grad_norm": 1.0625922180498937, "learning_rate": 0.0001628029599846585, "loss": 12.2205, "num_tokens": 3120358.0, "step": 438 }, { "epoch": 0.30677847658979734, "grad_norm": 1.0810749864557256, "learning_rate": 0.00016262666423332473, "loss": 12.118, "num_tokens": 3127109.0, "step": 439 }, { "epoch": 0.3074772886093641, "grad_norm": 1.0039089118126248, "learning_rate": 0.00016245004764834422, "loss": 11.9709, "num_tokens": 3134426.0, "step": 440 }, { "epoch": 0.3081761006289308, "grad_norm": 0.9760746122171329, "learning_rate": 0.000162273111134516, "loss": 12.2341, "num_tokens": 3141188.0, "step": 441 }, { "epoch": 0.30887491264849753, "grad_norm": 1.0221181358646754, "learning_rate": 0.00016209585559827806, "loss": 12.1916, "num_tokens": 3149034.0, "step": 442 }, { "epoch": 0.30957372466806427, "grad_norm": 0.9632877916386517, "learning_rate": 0.0001619182819477027, "loss": 12.3057, "num_tokens": 3156062.0, "step": 443 }, { "epoch": 0.31027253668763105, "grad_norm": 1.0519770843397107, "learning_rate": 0.0001617403910924919, "loss": 12.0121, "num_tokens": 3164141.0, "step": 444 }, { "epoch": 0.3109713487071978, "grad_norm": 0.9570699017638297, "learning_rate": 0.00016156218394397273, "loss": 12.1094, "num_tokens": 3172103.0, "step": 445 }, { "epoch": 0.3116701607267645, "grad_norm": 1.078199682948591, "learning_rate": 0.0001613836614150926, "loss": 12.2273, "num_tokens": 3178950.0, "step": 446 }, { "epoch": 0.31236897274633124, "grad_norm": 0.9297993503765057, "learning_rate": 0.00016120482442041447, "loss": 12.0441, "num_tokens": 3186094.0, "step": 447 }, { "epoch": 0.31306778476589797, "grad_norm": 1.052582853013476, "learning_rate": 0.0001610256738761125, "loss": 12.276, "num_tokens": 3193281.0, "step": 448 }, { "epoch": 0.3137665967854647, "grad_norm": 0.9525142248097075, "learning_rate": 0.000160846210699967, "loss": 12.089, "num_tokens": 3200265.0, "step": 449 }, { "epoch": 0.31446540880503143, "grad_norm": 0.9973983515962607, "learning_rate": 0.0001606664358113599, "loss": 12.0084, "num_tokens": 3207548.0, "step": 450 }, { "epoch": 0.31516422082459816, "grad_norm": 0.9695049891418782, "learning_rate": 0.00016048635013127016, "loss": 12.2589, "num_tokens": 3214372.0, "step": 451 }, { "epoch": 0.3158630328441649, "grad_norm": 1.1601028592283664, "learning_rate": 0.00016030595458226872, "loss": 11.9678, "num_tokens": 3221203.0, "step": 452 }, { "epoch": 0.3165618448637317, "grad_norm": 0.9617505431282883, "learning_rate": 0.00016012525008851403, "loss": 12.0657, "num_tokens": 3229249.0, "step": 453 }, { "epoch": 0.3172606568832984, "grad_norm": 1.1766775349257634, "learning_rate": 0.0001599442375757473, "loss": 12.1183, "num_tokens": 3236472.0, "step": 454 }, { "epoch": 0.31795946890286514, "grad_norm": 0.9825393901516871, "learning_rate": 0.00015976291797128767, "loss": 12.1128, "num_tokens": 3243897.0, "step": 455 }, { "epoch": 0.31865828092243187, "grad_norm": 1.115098077594373, "learning_rate": 0.00015958129220402744, "loss": 11.989, "num_tokens": 3250533.0, "step": 456 }, { "epoch": 0.3193570929419986, "grad_norm": 1.112270034985869, "learning_rate": 0.00015939936120442752, "loss": 12.2418, "num_tokens": 3257466.0, "step": 457 }, { "epoch": 0.32005590496156533, "grad_norm": 1.0490822017700396, "learning_rate": 0.00015921712590451236, "loss": 12.0436, "num_tokens": 3263944.0, "step": 458 }, { "epoch": 0.32075471698113206, "grad_norm": 1.19266057918563, "learning_rate": 0.00015903458723786544, "loss": 12.3036, "num_tokens": 3270543.0, "step": 459 }, { "epoch": 0.3214535290006988, "grad_norm": 0.9754576421413539, "learning_rate": 0.00015885174613962426, "loss": 12.2576, "num_tokens": 3277894.0, "step": 460 }, { "epoch": 0.3221523410202656, "grad_norm": 1.2240651695002909, "learning_rate": 0.00015866860354647576, "loss": 12.1746, "num_tokens": 3284793.0, "step": 461 }, { "epoch": 0.3228511530398323, "grad_norm": 0.9220005514021452, "learning_rate": 0.00015848516039665138, "loss": 12.1768, "num_tokens": 3292856.0, "step": 462 }, { "epoch": 0.32354996505939904, "grad_norm": 1.1405752061575267, "learning_rate": 0.0001583014176299223, "loss": 11.9588, "num_tokens": 3300134.0, "step": 463 }, { "epoch": 0.32424877707896577, "grad_norm": 0.9785862767305678, "learning_rate": 0.00015811737618759468, "loss": 12.0556, "num_tokens": 3307091.0, "step": 464 }, { "epoch": 0.3249475890985325, "grad_norm": 1.1239069485026847, "learning_rate": 0.00015793303701250468, "loss": 11.9657, "num_tokens": 3314380.0, "step": 465 }, { "epoch": 0.3256464011180992, "grad_norm": 0.9230619844697087, "learning_rate": 0.00015774840104901378, "loss": 12.4159, "num_tokens": 3322081.0, "step": 466 }, { "epoch": 0.32634521313766596, "grad_norm": 1.045958434033744, "learning_rate": 0.000157563469243004, "loss": 12.2218, "num_tokens": 3329105.0, "step": 467 }, { "epoch": 0.3270440251572327, "grad_norm": 1.0386841546884902, "learning_rate": 0.00015737824254187275, "loss": 12.1626, "num_tokens": 3336405.0, "step": 468 }, { "epoch": 0.3277428371767994, "grad_norm": 1.0497667874202588, "learning_rate": 0.00015719272189452824, "loss": 12.0446, "num_tokens": 3343087.0, "step": 469 }, { "epoch": 0.3284416491963662, "grad_norm": 0.9471293053870683, "learning_rate": 0.00015700690825138473, "loss": 12.2638, "num_tokens": 3350235.0, "step": 470 }, { "epoch": 0.32914046121593293, "grad_norm": 0.9131515390790194, "learning_rate": 0.00015682080256435724, "loss": 12.1216, "num_tokens": 3357485.0, "step": 471 }, { "epoch": 0.32983927323549966, "grad_norm": 0.9820518461252037, "learning_rate": 0.00015663440578685703, "loss": 12.2044, "num_tokens": 3364351.0, "step": 472 }, { "epoch": 0.3305380852550664, "grad_norm": 0.8992691022961555, "learning_rate": 0.00015644771887378663, "loss": 12.1364, "num_tokens": 3372249.0, "step": 473 }, { "epoch": 0.3312368972746331, "grad_norm": 0.9980561034084768, "learning_rate": 0.00015626074278153485, "loss": 12.1517, "num_tokens": 3379136.0, "step": 474 }, { "epoch": 0.33193570929419985, "grad_norm": 0.9243681679240103, "learning_rate": 0.000156073478467972, "loss": 12.0981, "num_tokens": 3386556.0, "step": 475 }, { "epoch": 0.3326345213137666, "grad_norm": 0.9256289781367883, "learning_rate": 0.0001558859268924449, "loss": 11.8951, "num_tokens": 3394436.0, "step": 476 }, { "epoch": 0.3333333333333333, "grad_norm": 0.9900816750625112, "learning_rate": 0.0001556980890157721, "loss": 12.029, "num_tokens": 3401842.0, "step": 477 }, { "epoch": 0.33403214535290005, "grad_norm": 0.9703349089213797, "learning_rate": 0.00015550996580023868, "loss": 11.9128, "num_tokens": 3409180.0, "step": 478 }, { "epoch": 0.33473095737246683, "grad_norm": 0.9630292571815687, "learning_rate": 0.00015532155820959165, "loss": 12.2213, "num_tokens": 3416900.0, "step": 479 }, { "epoch": 0.33542976939203356, "grad_norm": 0.9051616995348135, "learning_rate": 0.00015513286720903485, "loss": 12.2121, "num_tokens": 3424074.0, "step": 480 }, { "epoch": 0.3361285814116003, "grad_norm": 0.9690008678265436, "learning_rate": 0.00015494389376522388, "loss": 12.2515, "num_tokens": 3431040.0, "step": 481 }, { "epoch": 0.336827393431167, "grad_norm": 1.0166496850552176, "learning_rate": 0.0001547546388462615, "loss": 12.1705, "num_tokens": 3437949.0, "step": 482 }, { "epoch": 0.33752620545073375, "grad_norm": 0.9396624167132782, "learning_rate": 0.00015456510342169225, "loss": 11.8757, "num_tokens": 3444767.0, "step": 483 }, { "epoch": 0.3382250174703005, "grad_norm": 0.9809819891433745, "learning_rate": 0.00015437528846249784, "loss": 12.1112, "num_tokens": 3452455.0, "step": 484 }, { "epoch": 0.3389238294898672, "grad_norm": 0.9286838895806877, "learning_rate": 0.00015418519494109185, "loss": 12.0485, "num_tokens": 3459075.0, "step": 485 }, { "epoch": 0.33962264150943394, "grad_norm": 0.9139552415362666, "learning_rate": 0.00015399482383131517, "loss": 12.0818, "num_tokens": 3466194.0, "step": 486 }, { "epoch": 0.3403214535290007, "grad_norm": 0.9304856467336878, "learning_rate": 0.0001538041761084305, "loss": 12.1795, "num_tokens": 3472861.0, "step": 487 }, { "epoch": 0.34102026554856746, "grad_norm": 0.9208992780595067, "learning_rate": 0.00015361325274911779, "loss": 12.0489, "num_tokens": 3480127.0, "step": 488 }, { "epoch": 0.3417190775681342, "grad_norm": 0.8997956372118784, "learning_rate": 0.00015342205473146904, "loss": 12.1201, "num_tokens": 3487155.0, "step": 489 }, { "epoch": 0.3424178895877009, "grad_norm": 0.9777088108576337, "learning_rate": 0.00015323058303498324, "loss": 12.0205, "num_tokens": 3494773.0, "step": 490 }, { "epoch": 0.34311670160726765, "grad_norm": 0.879745446242411, "learning_rate": 0.00015303883864056154, "loss": 11.9763, "num_tokens": 3502169.0, "step": 491 }, { "epoch": 0.3438155136268344, "grad_norm": 0.9329909041798102, "learning_rate": 0.00015284682253050198, "loss": 12.3791, "num_tokens": 3509575.0, "step": 492 }, { "epoch": 0.3445143256464011, "grad_norm": 0.9682786444676359, "learning_rate": 0.00015265453568849463, "loss": 12.4002, "num_tokens": 3516257.0, "step": 493 }, { "epoch": 0.34521313766596784, "grad_norm": 0.8882073075846929, "learning_rate": 0.0001524619790996166, "loss": 12.2699, "num_tokens": 3523550.0, "step": 494 }, { "epoch": 0.34591194968553457, "grad_norm": 0.9501809761434933, "learning_rate": 0.00015226915375032675, "loss": 12.0807, "num_tokens": 3530386.0, "step": 495 }, { "epoch": 0.3466107617051013, "grad_norm": 0.8935380130451016, "learning_rate": 0.00015207606062846092, "loss": 12.0567, "num_tokens": 3537432.0, "step": 496 }, { "epoch": 0.3473095737246681, "grad_norm": 0.9513080986369654, "learning_rate": 0.00015188270072322664, "loss": 12.1846, "num_tokens": 3544343.0, "step": 497 }, { "epoch": 0.3480083857442348, "grad_norm": 0.9311057870656899, "learning_rate": 0.00015168907502519823, "loss": 12.0444, "num_tokens": 3551437.0, "step": 498 }, { "epoch": 0.34870719776380155, "grad_norm": 1.0017415906580418, "learning_rate": 0.00015149518452631163, "loss": 12.1657, "num_tokens": 3558381.0, "step": 499 }, { "epoch": 0.3494060097833683, "grad_norm": 1.0960810177794507, "learning_rate": 0.00015130103021985928, "loss": 12.0332, "num_tokens": 3565537.0, "step": 500 }, { "epoch": 0.350104821802935, "grad_norm": 0.9446673820098566, "learning_rate": 0.00015110661310048523, "loss": 12.3164, "num_tokens": 3572664.0, "step": 501 }, { "epoch": 0.35080363382250174, "grad_norm": 1.00955257241026, "learning_rate": 0.00015091193416417981, "loss": 11.9365, "num_tokens": 3579863.0, "step": 502 }, { "epoch": 0.35150244584206847, "grad_norm": 0.9056375701048143, "learning_rate": 0.00015071699440827462, "loss": 12.1319, "num_tokens": 3587973.0, "step": 503 }, { "epoch": 0.3522012578616352, "grad_norm": 0.9234279886277251, "learning_rate": 0.00015052179483143752, "loss": 12.0822, "num_tokens": 3594430.0, "step": 504 }, { "epoch": 0.352900069881202, "grad_norm": 0.9306165895829293, "learning_rate": 0.00015032633643366727, "loss": 11.9546, "num_tokens": 3600877.0, "step": 505 }, { "epoch": 0.3535988819007687, "grad_norm": 0.916209027167804, "learning_rate": 0.0001501306202162887, "loss": 12.0369, "num_tokens": 3607903.0, "step": 506 }, { "epoch": 0.35429769392033544, "grad_norm": 0.8278388004090699, "learning_rate": 0.0001499346471819474, "loss": 12.0787, "num_tokens": 3614967.0, "step": 507 }, { "epoch": 0.3549965059399022, "grad_norm": 0.8626281789322203, "learning_rate": 0.00014973841833460457, "loss": 12.158, "num_tokens": 3622644.0, "step": 508 }, { "epoch": 0.3556953179594689, "grad_norm": 1.0169394719056057, "learning_rate": 0.00014954193467953196, "loss": 12.0281, "num_tokens": 3629496.0, "step": 509 }, { "epoch": 0.35639412997903563, "grad_norm": 1.0391735155244062, "learning_rate": 0.0001493451972233067, "loss": 12.1707, "num_tokens": 3636251.0, "step": 510 }, { "epoch": 0.35709294199860236, "grad_norm": 0.8580636790874151, "learning_rate": 0.0001491482069738062, "loss": 12.0772, "num_tokens": 3643640.0, "step": 511 }, { "epoch": 0.3577917540181691, "grad_norm": 0.8826702226607871, "learning_rate": 0.00014895096494020274, "loss": 12.0327, "num_tokens": 3651194.0, "step": 512 }, { "epoch": 0.3584905660377358, "grad_norm": 0.8613455261520183, "learning_rate": 0.00014875347213295863, "loss": 11.9211, "num_tokens": 3658441.0, "step": 513 }, { "epoch": 0.3591893780573026, "grad_norm": 0.9064281924248683, "learning_rate": 0.00014855572956382082, "loss": 12.0293, "num_tokens": 3665170.0, "step": 514 }, { "epoch": 0.35988819007686934, "grad_norm": 0.8367714960887371, "learning_rate": 0.0001483577382458158, "loss": 12.0457, "num_tokens": 3672474.0, "step": 515 }, { "epoch": 0.36058700209643607, "grad_norm": 0.9241907055749421, "learning_rate": 0.00014815949919324444, "loss": 11.9374, "num_tokens": 3679386.0, "step": 516 }, { "epoch": 0.3612858141160028, "grad_norm": 0.824152296178085, "learning_rate": 0.00014796101342167664, "loss": 12.1566, "num_tokens": 3687308.0, "step": 517 }, { "epoch": 0.36198462613556953, "grad_norm": 0.8944396761994626, "learning_rate": 0.00014776228194794623, "loss": 11.9896, "num_tokens": 3694073.0, "step": 518 }, { "epoch": 0.36268343815513626, "grad_norm": 0.858271930480037, "learning_rate": 0.00014756330579014591, "loss": 12.1352, "num_tokens": 3701387.0, "step": 519 }, { "epoch": 0.363382250174703, "grad_norm": 1.154248735078944, "learning_rate": 0.0001473640859676217, "loss": 11.858, "num_tokens": 3708600.0, "step": 520 }, { "epoch": 0.3640810621942697, "grad_norm": 0.9591101732020725, "learning_rate": 0.00014716462350096803, "loss": 12.0315, "num_tokens": 3716979.0, "step": 521 }, { "epoch": 0.36477987421383645, "grad_norm": 1.17298705448662, "learning_rate": 0.0001469649194120224, "loss": 12.0713, "num_tokens": 3723012.0, "step": 522 }, { "epoch": 0.36547868623340324, "grad_norm": 0.9650929504820438, "learning_rate": 0.00014676497472385994, "loss": 11.8546, "num_tokens": 3730363.0, "step": 523 }, { "epoch": 0.36617749825296997, "grad_norm": 1.0598732657131429, "learning_rate": 0.0001465647904607886, "loss": 12.0821, "num_tokens": 3737054.0, "step": 524 }, { "epoch": 0.3668763102725367, "grad_norm": 1.055588049526762, "learning_rate": 0.00014636436764834353, "loss": 11.9764, "num_tokens": 3743973.0, "step": 525 }, { "epoch": 0.36757512229210343, "grad_norm": 0.9634047864594488, "learning_rate": 0.000146163707313282, "loss": 12.2426, "num_tokens": 3750646.0, "step": 526 }, { "epoch": 0.36827393431167016, "grad_norm": 0.9379570298735684, "learning_rate": 0.00014596281048357806, "loss": 12.0747, "num_tokens": 3758294.0, "step": 527 }, { "epoch": 0.3689727463312369, "grad_norm": 0.8475926897769003, "learning_rate": 0.0001457616781884173, "loss": 11.9549, "num_tokens": 3765428.0, "step": 528 }, { "epoch": 0.3696715583508036, "grad_norm": 0.9785101548217052, "learning_rate": 0.00014556031145819168, "loss": 12.0683, "num_tokens": 3772483.0, "step": 529 }, { "epoch": 0.37037037037037035, "grad_norm": 0.8657231796104006, "learning_rate": 0.0001453587113244941, "loss": 12.0748, "num_tokens": 3779599.0, "step": 530 }, { "epoch": 0.3710691823899371, "grad_norm": 0.9948561473315793, "learning_rate": 0.00014515687882011313, "loss": 12.0452, "num_tokens": 3786663.0, "step": 531 }, { "epoch": 0.37176799440950387, "grad_norm": 0.8961368986877242, "learning_rate": 0.00014495481497902788, "loss": 12.0502, "num_tokens": 3793917.0, "step": 532 }, { "epoch": 0.3724668064290706, "grad_norm": 1.0039214702487007, "learning_rate": 0.00014475252083640246, "loss": 12.1103, "num_tokens": 3800383.0, "step": 533 }, { "epoch": 0.3731656184486373, "grad_norm": 1.104043842808782, "learning_rate": 0.00014454999742858092, "loss": 12.0706, "num_tokens": 3807421.0, "step": 534 }, { "epoch": 0.37386443046820406, "grad_norm": 0.9706944725637485, "learning_rate": 0.0001443472457930817, "loss": 12.1253, "num_tokens": 3814470.0, "step": 535 }, { "epoch": 0.3745632424877708, "grad_norm": 1.0708871567151401, "learning_rate": 0.0001441442669685926, "loss": 12.175, "num_tokens": 3821117.0, "step": 536 }, { "epoch": 0.3752620545073375, "grad_norm": 0.9202934515725604, "learning_rate": 0.00014394106199496517, "loss": 12.2001, "num_tokens": 3828050.0, "step": 537 }, { "epoch": 0.37596086652690425, "grad_norm": 0.887859566885173, "learning_rate": 0.00014373763191320954, "loss": 12.1075, "num_tokens": 3835858.0, "step": 538 }, { "epoch": 0.376659678546471, "grad_norm": 0.9187757022639819, "learning_rate": 0.00014353397776548912, "loss": 12.0908, "num_tokens": 3843141.0, "step": 539 }, { "epoch": 0.37735849056603776, "grad_norm": 0.9155332566153289, "learning_rate": 0.00014333010059511505, "loss": 12.1828, "num_tokens": 3850497.0, "step": 540 }, { "epoch": 0.3780573025856045, "grad_norm": 0.8284408148826566, "learning_rate": 0.0001431260014465412, "loss": 11.6563, "num_tokens": 3857621.0, "step": 541 }, { "epoch": 0.3787561146051712, "grad_norm": 0.9716124370861428, "learning_rate": 0.00014292168136535854, "loss": 12.0896, "num_tokens": 3864403.0, "step": 542 }, { "epoch": 0.37945492662473795, "grad_norm": 0.8208484050542351, "learning_rate": 0.00014271714139828983, "loss": 11.8759, "num_tokens": 3871744.0, "step": 543 }, { "epoch": 0.3801537386443047, "grad_norm": 0.9605228173140264, "learning_rate": 0.0001425123825931843, "loss": 11.994, "num_tokens": 3879289.0, "step": 544 }, { "epoch": 0.3808525506638714, "grad_norm": 0.9031441753054636, "learning_rate": 0.00014230740599901231, "loss": 11.8901, "num_tokens": 3886810.0, "step": 545 }, { "epoch": 0.38155136268343814, "grad_norm": 0.907168225729437, "learning_rate": 0.00014210221266585998, "loss": 12.1076, "num_tokens": 3894520.0, "step": 546 }, { "epoch": 0.3822501747030049, "grad_norm": 0.9280949877436617, "learning_rate": 0.0001418968036449237, "loss": 12.0276, "num_tokens": 3901356.0, "step": 547 }, { "epoch": 0.3829489867225716, "grad_norm": 0.8801303118077374, "learning_rate": 0.0001416911799885049, "loss": 12.0104, "num_tokens": 3908231.0, "step": 548 }, { "epoch": 0.3836477987421384, "grad_norm": 0.8825862875085414, "learning_rate": 0.00014148534275000444, "loss": 12.2931, "num_tokens": 3915548.0, "step": 549 }, { "epoch": 0.3843466107617051, "grad_norm": 0.9636172523939012, "learning_rate": 0.0001412792929839175, "loss": 11.9835, "num_tokens": 3922294.0, "step": 550 }, { "epoch": 0.38504542278127185, "grad_norm": 0.8577046886760447, "learning_rate": 0.00014107303174582794, "loss": 11.95, "num_tokens": 3929145.0, "step": 551 }, { "epoch": 0.3857442348008386, "grad_norm": 0.9191735227445484, "learning_rate": 0.00014086656009240306, "loss": 12.0281, "num_tokens": 3936061.0, "step": 552 }, { "epoch": 0.3864430468204053, "grad_norm": 0.9127134286950958, "learning_rate": 0.00014065987908138804, "loss": 12.1853, "num_tokens": 3942508.0, "step": 553 }, { "epoch": 0.38714185883997204, "grad_norm": 0.9287174901506331, "learning_rate": 0.00014045298977160057, "loss": 12.189, "num_tokens": 3949004.0, "step": 554 }, { "epoch": 0.38784067085953877, "grad_norm": 0.8083833012878675, "learning_rate": 0.00014024589322292555, "loss": 11.9526, "num_tokens": 3956901.0, "step": 555 }, { "epoch": 0.3885394828791055, "grad_norm": 0.937342232458656, "learning_rate": 0.00014003859049630942, "loss": 12.1259, "num_tokens": 3964133.0, "step": 556 }, { "epoch": 0.38923829489867223, "grad_norm": 0.8744910335409934, "learning_rate": 0.000139831082653755, "loss": 12.2569, "num_tokens": 3971294.0, "step": 557 }, { "epoch": 0.389937106918239, "grad_norm": 0.9325435804798502, "learning_rate": 0.00013962337075831583, "loss": 12.0983, "num_tokens": 3977426.0, "step": 558 }, { "epoch": 0.39063591893780575, "grad_norm": 0.922661936419083, "learning_rate": 0.00013941545587409075, "loss": 11.9229, "num_tokens": 3984492.0, "step": 559 }, { "epoch": 0.3913347309573725, "grad_norm": 0.9600683664822316, "learning_rate": 0.00013920733906621862, "loss": 12.1813, "num_tokens": 3991120.0, "step": 560 }, { "epoch": 0.3920335429769392, "grad_norm": 0.9723044506449994, "learning_rate": 0.00013899902140087272, "loss": 12.1032, "num_tokens": 3997971.0, "step": 561 }, { "epoch": 0.39273235499650594, "grad_norm": 0.8414594620114867, "learning_rate": 0.00013879050394525523, "loss": 11.9443, "num_tokens": 4005376.0, "step": 562 }, { "epoch": 0.39343116701607267, "grad_norm": 1.0202475102587107, "learning_rate": 0.00013858178776759197, "loss": 12.1481, "num_tokens": 4012471.0, "step": 563 }, { "epoch": 0.3941299790356394, "grad_norm": 0.8839072059833434, "learning_rate": 0.00013837287393712666, "loss": 12.0321, "num_tokens": 4019054.0, "step": 564 }, { "epoch": 0.39482879105520613, "grad_norm": 0.9555850931145216, "learning_rate": 0.00013816376352411574, "loss": 12.1077, "num_tokens": 4025454.0, "step": 565 }, { "epoch": 0.39552760307477286, "grad_norm": 0.8679296924666821, "learning_rate": 0.00013795445759982262, "loss": 11.7888, "num_tokens": 4032992.0, "step": 566 }, { "epoch": 0.39622641509433965, "grad_norm": 0.8722743434457823, "learning_rate": 0.00013774495723651236, "loss": 12.024, "num_tokens": 4040064.0, "step": 567 }, { "epoch": 0.3969252271139064, "grad_norm": 0.9600855639345066, "learning_rate": 0.0001375352635074461, "loss": 11.8821, "num_tokens": 4046967.0, "step": 568 }, { "epoch": 0.3976240391334731, "grad_norm": 0.7877602768482204, "learning_rate": 0.0001373253774868756, "loss": 12.0148, "num_tokens": 4055301.0, "step": 569 }, { "epoch": 0.39832285115303984, "grad_norm": 0.9074208297899601, "learning_rate": 0.00013711530025003766, "loss": 12.0443, "num_tokens": 4062431.0, "step": 570 }, { "epoch": 0.39902166317260657, "grad_norm": 0.8287729757656834, "learning_rate": 0.00013690503287314883, "loss": 11.8087, "num_tokens": 4070230.0, "step": 571 }, { "epoch": 0.3997204751921733, "grad_norm": 0.9316554946864943, "learning_rate": 0.00013669457643339955, "loss": 11.8952, "num_tokens": 4077148.0, "step": 572 }, { "epoch": 0.40041928721174, "grad_norm": 0.8817289397600743, "learning_rate": 0.00013648393200894893, "loss": 11.8121, "num_tokens": 4084168.0, "step": 573 }, { "epoch": 0.40111809923130676, "grad_norm": 0.8430911631310684, "learning_rate": 0.00013627310067891913, "loss": 12.0071, "num_tokens": 4092313.0, "step": 574 }, { "epoch": 0.4018169112508735, "grad_norm": 0.8381632302819609, "learning_rate": 0.00013606208352338973, "loss": 11.8803, "num_tokens": 4099578.0, "step": 575 }, { "epoch": 0.4025157232704403, "grad_norm": 0.8404927564721982, "learning_rate": 0.00013585088162339231, "loss": 12.0778, "num_tokens": 4106534.0, "step": 576 }, { "epoch": 0.403214535290007, "grad_norm": 0.9183215666016679, "learning_rate": 0.00013563949606090503, "loss": 11.9381, "num_tokens": 4113134.0, "step": 577 }, { "epoch": 0.40391334730957373, "grad_norm": 0.8298708376682066, "learning_rate": 0.00013542792791884674, "loss": 11.9096, "num_tokens": 4120863.0, "step": 578 }, { "epoch": 0.40461215932914046, "grad_norm": 0.8725360483372107, "learning_rate": 0.00013521617828107175, "loss": 11.8436, "num_tokens": 4127780.0, "step": 579 }, { "epoch": 0.4053109713487072, "grad_norm": 0.8961375996287267, "learning_rate": 0.00013500424823236412, "loss": 11.95, "num_tokens": 4134829.0, "step": 580 }, { "epoch": 0.4060097833682739, "grad_norm": 0.7861779536888713, "learning_rate": 0.0001347921388584322, "loss": 11.7022, "num_tokens": 4142834.0, "step": 581 }, { "epoch": 0.40670859538784065, "grad_norm": 0.8496401776165243, "learning_rate": 0.000134579851245903, "loss": 12.1181, "num_tokens": 4149774.0, "step": 582 }, { "epoch": 0.4074074074074074, "grad_norm": 0.8468938520035636, "learning_rate": 0.00013436738648231656, "loss": 12.0059, "num_tokens": 4156412.0, "step": 583 }, { "epoch": 0.40810621942697417, "grad_norm": 0.8790337397038394, "learning_rate": 0.00013415474565612058, "loss": 12.1166, "num_tokens": 4163132.0, "step": 584 }, { "epoch": 0.4088050314465409, "grad_norm": 0.8589311219069107, "learning_rate": 0.00013394192985666465, "loss": 12.0116, "num_tokens": 4170145.0, "step": 585 }, { "epoch": 0.40950384346610763, "grad_norm": 0.8696135886818056, "learning_rate": 0.0001337289401741947, "loss": 11.9189, "num_tokens": 4177826.0, "step": 586 }, { "epoch": 0.41020265548567436, "grad_norm": 1.2530218930273918, "learning_rate": 0.0001335157776998476, "loss": 11.873, "num_tokens": 4184556.0, "step": 587 }, { "epoch": 0.4109014675052411, "grad_norm": 1.017543600554075, "learning_rate": 0.00013330244352564527, "loss": 11.9424, "num_tokens": 4191042.0, "step": 588 }, { "epoch": 0.4116002795248078, "grad_norm": 0.9275163950537736, "learning_rate": 0.0001330889387444893, "loss": 12.1217, "num_tokens": 4197862.0, "step": 589 }, { "epoch": 0.41229909154437455, "grad_norm": 0.8853093500307286, "learning_rate": 0.00013287526445015531, "loss": 12.0612, "num_tokens": 4205404.0, "step": 590 }, { "epoch": 0.4129979035639413, "grad_norm": 0.8464143780556126, "learning_rate": 0.0001326614217372873, "loss": 11.897, "num_tokens": 4212599.0, "step": 591 }, { "epoch": 0.413696715583508, "grad_norm": 0.9073287957293683, "learning_rate": 0.0001324474117013921, "loss": 12.1639, "num_tokens": 4219704.0, "step": 592 }, { "epoch": 0.4143955276030748, "grad_norm": 0.8985675842119918, "learning_rate": 0.00013223323543883373, "loss": 12.0619, "num_tokens": 4226784.0, "step": 593 }, { "epoch": 0.41509433962264153, "grad_norm": 0.9107156893964011, "learning_rate": 0.0001320188940468277, "loss": 11.903, "num_tokens": 4233717.0, "step": 594 }, { "epoch": 0.41579315164220826, "grad_norm": 0.8989603756831062, "learning_rate": 0.0001318043886234356, "loss": 12.0936, "num_tokens": 4240647.0, "step": 595 }, { "epoch": 0.416491963661775, "grad_norm": 0.9806315936138839, "learning_rate": 0.00013158972026755926, "loss": 11.9643, "num_tokens": 4247672.0, "step": 596 }, { "epoch": 0.4171907756813417, "grad_norm": 0.8241384074500768, "learning_rate": 0.0001313748900789352, "loss": 11.8706, "num_tokens": 4255152.0, "step": 597 }, { "epoch": 0.41788958770090845, "grad_norm": 0.9271536643066998, "learning_rate": 0.0001311598991581291, "loss": 12.1374, "num_tokens": 4261453.0, "step": 598 }, { "epoch": 0.4185883997204752, "grad_norm": 0.9002546899881367, "learning_rate": 0.00013094474860652987, "loss": 11.9134, "num_tokens": 4268508.0, "step": 599 }, { "epoch": 0.4192872117400419, "grad_norm": 0.8757592331202514, "learning_rate": 0.00013072943952634447, "loss": 11.9621, "num_tokens": 4275543.0, "step": 600 }, { "epoch": 0.41998602375960864, "grad_norm": 0.9204020510434368, "learning_rate": 0.00013051397302059171, "loss": 12.1479, "num_tokens": 4282191.0, "step": 601 }, { "epoch": 0.4206848357791754, "grad_norm": 0.8328460955320042, "learning_rate": 0.00013029835019309714, "loss": 12.0701, "num_tokens": 4289593.0, "step": 602 }, { "epoch": 0.42138364779874216, "grad_norm": 0.7838151120132228, "learning_rate": 0.000130082572148487, "loss": 11.8089, "num_tokens": 4297297.0, "step": 603 }, { "epoch": 0.4220824598183089, "grad_norm": 0.8090146795838783, "learning_rate": 0.00012986663999218261, "loss": 12.0356, "num_tokens": 4304161.0, "step": 604 }, { "epoch": 0.4227812718378756, "grad_norm": 0.8425172850446164, "learning_rate": 0.00012965055483039507, "loss": 11.7753, "num_tokens": 4311640.0, "step": 605 }, { "epoch": 0.42348008385744235, "grad_norm": 0.8445465278085014, "learning_rate": 0.00012943431777011902, "loss": 12.0445, "num_tokens": 4318619.0, "step": 606 }, { "epoch": 0.4241788958770091, "grad_norm": 0.8513080972741544, "learning_rate": 0.00012921792991912753, "loss": 12.0995, "num_tokens": 4325488.0, "step": 607 }, { "epoch": 0.4248777078965758, "grad_norm": 0.7722421017802668, "learning_rate": 0.00012900139238596598, "loss": 11.7651, "num_tokens": 4332936.0, "step": 608 }, { "epoch": 0.42557651991614254, "grad_norm": 0.8918135734711797, "learning_rate": 0.00012878470627994664, "loss": 11.8054, "num_tokens": 4339915.0, "step": 609 }, { "epoch": 0.42627533193570927, "grad_norm": 0.8189265441430075, "learning_rate": 0.0001285678727111429, "loss": 11.7899, "num_tokens": 4347052.0, "step": 610 }, { "epoch": 0.42697414395527605, "grad_norm": 0.8964986737425554, "learning_rate": 0.00012835089279038362, "loss": 11.9812, "num_tokens": 4353752.0, "step": 611 }, { "epoch": 0.4276729559748428, "grad_norm": 0.8413717581563538, "learning_rate": 0.00012813376762924733, "loss": 12.1173, "num_tokens": 4361038.0, "step": 612 }, { "epoch": 0.4283717679944095, "grad_norm": 0.850087071440023, "learning_rate": 0.0001279164983400568, "loss": 11.9678, "num_tokens": 4368560.0, "step": 613 }, { "epoch": 0.42907058001397624, "grad_norm": 0.8010995265019617, "learning_rate": 0.00012769908603587292, "loss": 11.7605, "num_tokens": 4376222.0, "step": 614 }, { "epoch": 0.429769392033543, "grad_norm": 0.8025102750844321, "learning_rate": 0.0001274815318304894, "loss": 11.8735, "num_tokens": 4383884.0, "step": 615 }, { "epoch": 0.4304682040531097, "grad_norm": 0.8707077238242065, "learning_rate": 0.0001272638368384269, "loss": 11.9285, "num_tokens": 4390978.0, "step": 616 }, { "epoch": 0.43116701607267643, "grad_norm": 0.8116822458689641, "learning_rate": 0.00012704600217492725, "loss": 12.1714, "num_tokens": 4398693.0, "step": 617 }, { "epoch": 0.43186582809224316, "grad_norm": 0.826675965931149, "learning_rate": 0.0001268280289559479, "loss": 11.9573, "num_tokens": 4405911.0, "step": 618 }, { "epoch": 0.43256464011180995, "grad_norm": 0.8534971115758093, "learning_rate": 0.00012660991829815602, "loss": 11.9621, "num_tokens": 4413468.0, "step": 619 }, { "epoch": 0.4332634521313767, "grad_norm": 0.8760979261507388, "learning_rate": 0.00012639167131892293, "loss": 12.0476, "num_tokens": 4420628.0, "step": 620 }, { "epoch": 0.4339622641509434, "grad_norm": 0.8714727411601618, "learning_rate": 0.0001261732891363183, "loss": 11.9621, "num_tokens": 4427900.0, "step": 621 }, { "epoch": 0.43466107617051014, "grad_norm": 0.8922177474156737, "learning_rate": 0.0001259547728691045, "loss": 12.0367, "num_tokens": 4434789.0, "step": 622 }, { "epoch": 0.43535988819007687, "grad_norm": 0.841294675830371, "learning_rate": 0.00012573612363673067, "loss": 12.1268, "num_tokens": 4442355.0, "step": 623 }, { "epoch": 0.4360587002096436, "grad_norm": 0.8662418572358458, "learning_rate": 0.00012551734255932727, "loss": 11.8095, "num_tokens": 4449354.0, "step": 624 }, { "epoch": 0.43675751222921033, "grad_norm": 0.8966435415783298, "learning_rate": 0.0001252984307577001, "loss": 11.7621, "num_tokens": 4456996.0, "step": 625 }, { "epoch": 0.43745632424877706, "grad_norm": 0.9140099642044371, "learning_rate": 0.00012507938935332478, "loss": 11.8998, "num_tokens": 4464108.0, "step": 626 }, { "epoch": 0.4381551362683438, "grad_norm": 0.9252375343477144, "learning_rate": 0.00012486021946834068, "loss": 11.7647, "num_tokens": 4471782.0, "step": 627 }, { "epoch": 0.4388539482879106, "grad_norm": 0.9539679302319235, "learning_rate": 0.00012464092222554552, "loss": 11.768, "num_tokens": 4478870.0, "step": 628 }, { "epoch": 0.4395527603074773, "grad_norm": 0.8265019024489157, "learning_rate": 0.00012442149874838948, "loss": 11.7075, "num_tokens": 4486006.0, "step": 629 }, { "epoch": 0.44025157232704404, "grad_norm": 0.8259779874857927, "learning_rate": 0.00012420195016096933, "loss": 12.1217, "num_tokens": 4493433.0, "step": 630 }, { "epoch": 0.44095038434661077, "grad_norm": 0.8504634346376905, "learning_rate": 0.00012398227758802285, "loss": 11.6897, "num_tokens": 4501615.0, "step": 631 }, { "epoch": 0.4416491963661775, "grad_norm": 0.8422557275254934, "learning_rate": 0.00012376248215492297, "loss": 12.0223, "num_tokens": 4508453.0, "step": 632 }, { "epoch": 0.44234800838574423, "grad_norm": 0.8739580752551247, "learning_rate": 0.000123542564987672, "loss": 11.9015, "num_tokens": 4515645.0, "step": 633 }, { "epoch": 0.44304682040531096, "grad_norm": 0.7878006817363354, "learning_rate": 0.00012332252721289594, "loss": 11.6232, "num_tokens": 4522914.0, "step": 634 }, { "epoch": 0.4437456324248777, "grad_norm": 0.8701782429674091, "learning_rate": 0.00012310236995783866, "loss": 12.065, "num_tokens": 4530012.0, "step": 635 }, { "epoch": 0.4444444444444444, "grad_norm": 0.8206317742230089, "learning_rate": 0.00012288209435035605, "loss": 11.8848, "num_tokens": 4536666.0, "step": 636 }, { "epoch": 0.4451432564640112, "grad_norm": 0.8478667503316676, "learning_rate": 0.00012266170151891036, "loss": 11.8555, "num_tokens": 4543673.0, "step": 637 }, { "epoch": 0.44584206848357794, "grad_norm": 0.8711894725771603, "learning_rate": 0.00012244119259256442, "loss": 11.9741, "num_tokens": 4550285.0, "step": 638 }, { "epoch": 0.44654088050314467, "grad_norm": 0.8073562820853625, "learning_rate": 0.00012222056870097572, "loss": 12.1476, "num_tokens": 4557630.0, "step": 639 }, { "epoch": 0.4472396925227114, "grad_norm": 0.8550949203116837, "learning_rate": 0.00012199983097439079, "loss": 12.0266, "num_tokens": 4564552.0, "step": 640 }, { "epoch": 0.4479385045422781, "grad_norm": 0.8361717391483878, "learning_rate": 0.00012177898054363923, "loss": 11.8465, "num_tokens": 4572109.0, "step": 641 }, { "epoch": 0.44863731656184486, "grad_norm": 0.8342965373541649, "learning_rate": 0.00012155801854012816, "loss": 11.8991, "num_tokens": 4579365.0, "step": 642 }, { "epoch": 0.4493361285814116, "grad_norm": 0.9064601938720505, "learning_rate": 0.00012133694609583615, "loss": 11.8593, "num_tokens": 4585759.0, "step": 643 }, { "epoch": 0.4500349406009783, "grad_norm": 0.9010741359394927, "learning_rate": 0.00012111576434330766, "loss": 11.9357, "num_tokens": 4592247.0, "step": 644 }, { "epoch": 0.45073375262054505, "grad_norm": 0.8334005590912353, "learning_rate": 0.00012089447441564705, "loss": 11.8785, "num_tokens": 4599348.0, "step": 645 }, { "epoch": 0.45143256464011183, "grad_norm": 0.9178185738773574, "learning_rate": 0.00012067307744651288, "loss": 11.7183, "num_tokens": 4606162.0, "step": 646 }, { "epoch": 0.45213137665967856, "grad_norm": 0.875013015269982, "learning_rate": 0.00012045157457011211, "loss": 11.6998, "num_tokens": 4612972.0, "step": 647 }, { "epoch": 0.4528301886792453, "grad_norm": 0.9094140975159261, "learning_rate": 0.00012022996692119424, "loss": 11.9093, "num_tokens": 4620976.0, "step": 648 }, { "epoch": 0.453529000698812, "grad_norm": 0.8153851869591483, "learning_rate": 0.00012000825563504547, "loss": 12.0035, "num_tokens": 4629303.0, "step": 649 }, { "epoch": 0.45422781271837875, "grad_norm": 0.8950570859113985, "learning_rate": 0.000119786441847483, "loss": 11.8635, "num_tokens": 4636638.0, "step": 650 }, { "epoch": 0.4549266247379455, "grad_norm": 0.8741693623971509, "learning_rate": 0.00011956452669484908, "loss": 11.824, "num_tokens": 4643800.0, "step": 651 }, { "epoch": 0.4556254367575122, "grad_norm": 0.8940551480830519, "learning_rate": 0.0001193425113140053, "loss": 11.938, "num_tokens": 4650901.0, "step": 652 }, { "epoch": 0.45632424877707894, "grad_norm": 0.9707279366744572, "learning_rate": 0.00011912039684232674, "loss": 12.0174, "num_tokens": 4657604.0, "step": 653 }, { "epoch": 0.4570230607966457, "grad_norm": 0.9648273198741646, "learning_rate": 0.000118898184417696, "loss": 11.8252, "num_tokens": 4664504.0, "step": 654 }, { "epoch": 0.45772187281621246, "grad_norm": 0.9945189728309309, "learning_rate": 0.00011867587517849757, "loss": 11.9346, "num_tokens": 4670477.0, "step": 655 }, { "epoch": 0.4584206848357792, "grad_norm": 0.7900857843512296, "learning_rate": 0.0001184534702636119, "loss": 11.8475, "num_tokens": 4677858.0, "step": 656 }, { "epoch": 0.4591194968553459, "grad_norm": 1.0503756020070543, "learning_rate": 0.00011823097081240964, "loss": 11.862, "num_tokens": 4684626.0, "step": 657 }, { "epoch": 0.45981830887491265, "grad_norm": 0.8546502745711176, "learning_rate": 0.00011800837796474561, "loss": 12.1335, "num_tokens": 4691582.0, "step": 658 }, { "epoch": 0.4605171208944794, "grad_norm": 0.8622254441465562, "learning_rate": 0.00011778569286095329, "loss": 11.9545, "num_tokens": 4699398.0, "step": 659 }, { "epoch": 0.4612159329140461, "grad_norm": 0.7788567788431954, "learning_rate": 0.00011756291664183859, "loss": 11.9524, "num_tokens": 4707448.0, "step": 660 }, { "epoch": 0.46191474493361284, "grad_norm": 0.908871171310194, "learning_rate": 0.00011734005044867426, "loss": 11.8801, "num_tokens": 4714120.0, "step": 661 }, { "epoch": 0.46261355695317957, "grad_norm": 0.9693707218421472, "learning_rate": 0.00011711709542319411, "loss": 11.9347, "num_tokens": 4721137.0, "step": 662 }, { "epoch": 0.46331236897274636, "grad_norm": 0.9709352688677537, "learning_rate": 0.00011689405270758684, "loss": 12.1173, "num_tokens": 4727881.0, "step": 663 }, { "epoch": 0.4640111809923131, "grad_norm": 0.8758468731721972, "learning_rate": 0.00011667092344449053, "loss": 11.944, "num_tokens": 4735809.0, "step": 664 }, { "epoch": 0.4647099930118798, "grad_norm": 0.9469256976252346, "learning_rate": 0.00011644770877698654, "loss": 12.1369, "num_tokens": 4742094.0, "step": 665 }, { "epoch": 0.46540880503144655, "grad_norm": 0.8237527876349462, "learning_rate": 0.00011622440984859384, "loss": 11.8306, "num_tokens": 4749810.0, "step": 666 }, { "epoch": 0.4661076170510133, "grad_norm": 1.0185008554050337, "learning_rate": 0.00011600102780326296, "loss": 11.9714, "num_tokens": 4756916.0, "step": 667 }, { "epoch": 0.46680642907058, "grad_norm": 0.9390523791728513, "learning_rate": 0.00011577756378537033, "loss": 11.9079, "num_tokens": 4764365.0, "step": 668 }, { "epoch": 0.46750524109014674, "grad_norm": 0.8903990493176246, "learning_rate": 0.00011555401893971229, "loss": 11.77, "num_tokens": 4771335.0, "step": 669 }, { "epoch": 0.46820405310971347, "grad_norm": 0.8495503404738405, "learning_rate": 0.00011533039441149926, "loss": 11.8505, "num_tokens": 4778789.0, "step": 670 }, { "epoch": 0.4689028651292802, "grad_norm": 0.8080825519779163, "learning_rate": 0.00011510669134634984, "loss": 11.858, "num_tokens": 4786533.0, "step": 671 }, { "epoch": 0.469601677148847, "grad_norm": 0.8940185008450258, "learning_rate": 0.000114882910890285, "loss": 12.0071, "num_tokens": 4793395.0, "step": 672 }, { "epoch": 0.4703004891684137, "grad_norm": 0.8102336082772988, "learning_rate": 0.00011465905418972216, "loss": 11.85, "num_tokens": 4800501.0, "step": 673 }, { "epoch": 0.47099930118798045, "grad_norm": 0.8684446349073909, "learning_rate": 0.00011443512239146941, "loss": 11.7498, "num_tokens": 4807730.0, "step": 674 }, { "epoch": 0.4716981132075472, "grad_norm": 0.8972831808941225, "learning_rate": 0.00011421111664271946, "loss": 12.1878, "num_tokens": 4814631.0, "step": 675 }, { "epoch": 0.4723969252271139, "grad_norm": 0.7811357044524173, "learning_rate": 0.00011398703809104391, "loss": 11.977, "num_tokens": 4821859.0, "step": 676 }, { "epoch": 0.47309573724668064, "grad_norm": 0.8878196813815282, "learning_rate": 0.00011376288788438734, "loss": 12.0799, "num_tokens": 4829001.0, "step": 677 }, { "epoch": 0.47379454926624737, "grad_norm": 0.7728940561788115, "learning_rate": 0.00011353866717106137, "loss": 12.0937, "num_tokens": 4836233.0, "step": 678 }, { "epoch": 0.4744933612858141, "grad_norm": 0.7652381421322222, "learning_rate": 0.0001133143770997389, "loss": 11.8881, "num_tokens": 4844216.0, "step": 679 }, { "epoch": 0.4751921733053808, "grad_norm": 0.805128881427533, "learning_rate": 0.00011309001881944809, "loss": 11.9522, "num_tokens": 4851705.0, "step": 680 }, { "epoch": 0.4758909853249476, "grad_norm": 0.8597585731076033, "learning_rate": 0.00011286559347956651, "loss": 12.1198, "num_tokens": 4858279.0, "step": 681 }, { "epoch": 0.47658979734451434, "grad_norm": 0.8453118490353572, "learning_rate": 0.00011264110222981535, "loss": 11.9634, "num_tokens": 4866344.0, "step": 682 }, { "epoch": 0.4772886093640811, "grad_norm": 0.8093626293120404, "learning_rate": 0.00011241654622025334, "loss": 11.8598, "num_tokens": 4873494.0, "step": 683 }, { "epoch": 0.4779874213836478, "grad_norm": 0.8105493286965195, "learning_rate": 0.00011219192660127116, "loss": 11.7991, "num_tokens": 4880904.0, "step": 684 }, { "epoch": 0.47868623340321453, "grad_norm": 0.7479575202425699, "learning_rate": 0.00011196724452358516, "loss": 11.7939, "num_tokens": 4888663.0, "step": 685 }, { "epoch": 0.47938504542278126, "grad_norm": 0.9078873016468286, "learning_rate": 0.00011174250113823173, "loss": 11.8651, "num_tokens": 4895524.0, "step": 686 }, { "epoch": 0.480083857442348, "grad_norm": 0.82828599173561, "learning_rate": 0.00011151769759656136, "loss": 11.8728, "num_tokens": 4902875.0, "step": 687 }, { "epoch": 0.4807826694619147, "grad_norm": 0.8498893366628105, "learning_rate": 0.00011129283505023274, "loss": 11.988, "num_tokens": 4910339.0, "step": 688 }, { "epoch": 0.48148148148148145, "grad_norm": 0.8691199007501399, "learning_rate": 0.00011106791465120678, "loss": 11.8249, "num_tokens": 4917859.0, "step": 689 }, { "epoch": 0.48218029350104824, "grad_norm": 0.8363420468675724, "learning_rate": 0.00011084293755174083, "loss": 11.7725, "num_tokens": 4924845.0, "step": 690 }, { "epoch": 0.48287910552061497, "grad_norm": 0.8342914301300645, "learning_rate": 0.0001106179049043826, "loss": 11.9105, "num_tokens": 4932581.0, "step": 691 }, { "epoch": 0.4835779175401817, "grad_norm": 0.9041859543752735, "learning_rate": 0.00011039281786196454, "loss": 11.8527, "num_tokens": 4938840.0, "step": 692 }, { "epoch": 0.48427672955974843, "grad_norm": 0.8332056105830793, "learning_rate": 0.00011016767757759758, "loss": 11.8375, "num_tokens": 4946122.0, "step": 693 }, { "epoch": 0.48497554157931516, "grad_norm": 0.7999356426489574, "learning_rate": 0.00010994248520466555, "loss": 11.8864, "num_tokens": 4953325.0, "step": 694 }, { "epoch": 0.4856743535988819, "grad_norm": 0.8840646203709945, "learning_rate": 0.00010971724189681907, "loss": 11.7926, "num_tokens": 4960210.0, "step": 695 }, { "epoch": 0.4863731656184486, "grad_norm": 0.7161655467219089, "learning_rate": 0.00010949194880796966, "loss": 11.6796, "num_tokens": 4968396.0, "step": 696 }, { "epoch": 0.48707197763801535, "grad_norm": 0.8391872409460347, "learning_rate": 0.000109266607092284, "loss": 11.8614, "num_tokens": 4975812.0, "step": 697 }, { "epoch": 0.48777078965758214, "grad_norm": 0.8136075034785465, "learning_rate": 0.00010904121790417767, "loss": 11.7733, "num_tokens": 4983615.0, "step": 698 }, { "epoch": 0.48846960167714887, "grad_norm": 0.8639838769972311, "learning_rate": 0.00010881578239830965, "loss": 11.7194, "num_tokens": 4990664.0, "step": 699 }, { "epoch": 0.4891684136967156, "grad_norm": 0.771704690891524, "learning_rate": 0.0001085903017295761, "loss": 11.8352, "num_tokens": 4997991.0, "step": 700 }, { "epoch": 0.48986722571628233, "grad_norm": 0.8082632295628299, "learning_rate": 0.00010836477705310457, "loss": 11.7211, "num_tokens": 5005535.0, "step": 701 }, { "epoch": 0.49056603773584906, "grad_norm": 0.8221099777163831, "learning_rate": 0.00010813920952424805, "loss": 11.7424, "num_tokens": 5012725.0, "step": 702 }, { "epoch": 0.4912648497554158, "grad_norm": 0.8526099066187208, "learning_rate": 0.00010791360029857908, "loss": 11.7562, "num_tokens": 5020226.0, "step": 703 }, { "epoch": 0.4919636617749825, "grad_norm": 0.8830746317686873, "learning_rate": 0.00010768795053188378, "loss": 11.8946, "num_tokens": 5027841.0, "step": 704 }, { "epoch": 0.49266247379454925, "grad_norm": 0.8467987158546797, "learning_rate": 0.00010746226138015605, "loss": 11.8602, "num_tokens": 5034672.0, "step": 705 }, { "epoch": 0.493361285814116, "grad_norm": 0.9110055267989619, "learning_rate": 0.00010723653399959141, "loss": 11.8089, "num_tokens": 5041763.0, "step": 706 }, { "epoch": 0.49406009783368277, "grad_norm": 0.8590754661724935, "learning_rate": 0.00010701076954658133, "loss": 12.0917, "num_tokens": 5048922.0, "step": 707 }, { "epoch": 0.4947589098532495, "grad_norm": 0.8276297434047943, "learning_rate": 0.00010678496917770719, "loss": 11.9569, "num_tokens": 5056008.0, "step": 708 }, { "epoch": 0.4954577218728162, "grad_norm": 0.8873635245443052, "learning_rate": 0.00010655913404973432, "loss": 11.9105, "num_tokens": 5062729.0, "step": 709 }, { "epoch": 0.49615653389238296, "grad_norm": 0.7860622687601649, "learning_rate": 0.0001063332653196062, "loss": 11.9105, "num_tokens": 5069602.0, "step": 710 }, { "epoch": 0.4968553459119497, "grad_norm": 0.8585551672598832, "learning_rate": 0.00010610736414443836, "loss": 11.8514, "num_tokens": 5076146.0, "step": 711 }, { "epoch": 0.4975541579315164, "grad_norm": 0.813654694030058, "learning_rate": 0.00010588143168151257, "loss": 11.7629, "num_tokens": 5083612.0, "step": 712 }, { "epoch": 0.49825296995108315, "grad_norm": 0.88870371822207, "learning_rate": 0.00010565546908827093, "loss": 11.7497, "num_tokens": 5090353.0, "step": 713 }, { "epoch": 0.4989517819706499, "grad_norm": 0.8315503003113751, "learning_rate": 0.00010542947752230987, "loss": 11.7071, "num_tokens": 5098640.0, "step": 714 }, { "epoch": 0.4996505939902166, "grad_norm": 0.7808060509465453, "learning_rate": 0.00010520345814137422, "loss": 11.7736, "num_tokens": 5106336.0, "step": 715 }, { "epoch": 0.5003494060097834, "grad_norm": 0.9832986052690458, "learning_rate": 0.0001049774121033514, "loss": 11.8544, "num_tokens": 5113949.0, "step": 716 }, { "epoch": 0.5010482180293501, "grad_norm": 0.8593993063532692, "learning_rate": 0.00010475134056626521, "loss": 11.5894, "num_tokens": 5120936.0, "step": 717 }, { "epoch": 0.5017470300489169, "grad_norm": 0.8555509359111242, "learning_rate": 0.00010452524468827028, "loss": 11.8802, "num_tokens": 5127936.0, "step": 718 }, { "epoch": 0.5024458420684835, "grad_norm": 0.8768545835966081, "learning_rate": 0.00010429912562764582, "loss": 11.7651, "num_tokens": 5135181.0, "step": 719 }, { "epoch": 0.5031446540880503, "grad_norm": 0.8954332003506361, "learning_rate": 0.00010407298454278983, "loss": 11.9635, "num_tokens": 5141581.0, "step": 720 }, { "epoch": 0.5038434661076171, "grad_norm": 0.7787241308163475, "learning_rate": 0.00010384682259221314, "loss": 11.8157, "num_tokens": 5149210.0, "step": 721 }, { "epoch": 0.5045422781271838, "grad_norm": 0.8043762411091894, "learning_rate": 0.00010362064093453347, "loss": 11.9794, "num_tokens": 5156153.0, "step": 722 }, { "epoch": 0.5052410901467506, "grad_norm": 0.8494247945373716, "learning_rate": 0.00010339444072846955, "loss": 11.8751, "num_tokens": 5163697.0, "step": 723 }, { "epoch": 0.5059399021663172, "grad_norm": 0.8004411120888781, "learning_rate": 0.00010316822313283503, "loss": 11.8148, "num_tokens": 5170939.0, "step": 724 }, { "epoch": 0.506638714185884, "grad_norm": 0.8098718525402316, "learning_rate": 0.00010294198930653273, "loss": 11.7984, "num_tokens": 5178058.0, "step": 725 }, { "epoch": 0.5073375262054507, "grad_norm": 0.8570224646790227, "learning_rate": 0.00010271574040854863, "loss": 11.7122, "num_tokens": 5185093.0, "step": 726 }, { "epoch": 0.5080363382250175, "grad_norm": 0.9625030998434088, "learning_rate": 0.00010248947759794583, "loss": 11.9835, "num_tokens": 5191726.0, "step": 727 }, { "epoch": 0.5087351502445842, "grad_norm": 0.8635080331426315, "learning_rate": 0.00010226320203385878, "loss": 11.7241, "num_tokens": 5198609.0, "step": 728 }, { "epoch": 0.5094339622641509, "grad_norm": 0.8677572413713257, "learning_rate": 0.00010203691487548721, "loss": 11.8883, "num_tokens": 5206060.0, "step": 729 }, { "epoch": 0.5101327742837177, "grad_norm": 0.8979446096984454, "learning_rate": 0.00010181061728209034, "loss": 11.9268, "num_tokens": 5213009.0, "step": 730 }, { "epoch": 0.5108315863032844, "grad_norm": 0.8252839906702719, "learning_rate": 0.00010158431041298076, "loss": 12.0079, "num_tokens": 5220145.0, "step": 731 }, { "epoch": 0.5115303983228512, "grad_norm": 0.8819812351285876, "learning_rate": 0.00010135799542751861, "loss": 11.8362, "num_tokens": 5226851.0, "step": 732 }, { "epoch": 0.5122292103424179, "grad_norm": 1.1383179013782923, "learning_rate": 0.0001011316734851056, "loss": 11.7761, "num_tokens": 5234276.0, "step": 733 }, { "epoch": 0.5129280223619846, "grad_norm": 0.8044321074171096, "learning_rate": 0.00010090534574517907, "loss": 11.7191, "num_tokens": 5241284.0, "step": 734 }, { "epoch": 0.5136268343815513, "grad_norm": 1.1007084709280315, "learning_rate": 0.00010067901336720611, "loss": 11.6657, "num_tokens": 5248568.0, "step": 735 }, { "epoch": 0.5143256464011181, "grad_norm": 0.8870149031390947, "learning_rate": 0.00010045267751067757, "loss": 12.0822, "num_tokens": 5255337.0, "step": 736 }, { "epoch": 0.5150244584206848, "grad_norm": 0.9108240607842385, "learning_rate": 0.00010022633933510201, "loss": 11.6611, "num_tokens": 5262391.0, "step": 737 }, { "epoch": 0.5157232704402516, "grad_norm": 0.8605558610574175, "learning_rate": 0.0001, "loss": 11.8638, "num_tokens": 5270611.0, "step": 738 }, { "epoch": 0.5164220824598184, "grad_norm": 0.8728553331392952, "learning_rate": 9.977366066489801e-05, "loss": 11.74, "num_tokens": 5278249.0, "step": 739 }, { "epoch": 0.517120894479385, "grad_norm": 0.9400700814466663, "learning_rate": 9.954732248932244e-05, "loss": 11.8641, "num_tokens": 5285271.0, "step": 740 }, { "epoch": 0.5178197064989518, "grad_norm": 0.8495250755418514, "learning_rate": 9.932098663279392e-05, "loss": 11.7304, "num_tokens": 5292168.0, "step": 741 }, { "epoch": 0.5185185185185185, "grad_norm": 0.9155703182347676, "learning_rate": 9.909465425482093e-05, "loss": 11.9257, "num_tokens": 5298892.0, "step": 742 }, { "epoch": 0.5192173305380853, "grad_norm": 0.9016515273132614, "learning_rate": 9.886832651489444e-05, "loss": 11.8944, "num_tokens": 5305743.0, "step": 743 }, { "epoch": 0.519916142557652, "grad_norm": 0.8118283003676506, "learning_rate": 9.864200457248144e-05, "loss": 11.6752, "num_tokens": 5313196.0, "step": 744 }, { "epoch": 0.5206149545772187, "grad_norm": 0.8119495179861029, "learning_rate": 9.841568958701924e-05, "loss": 11.713, "num_tokens": 5320688.0, "step": 745 }, { "epoch": 0.5213137665967854, "grad_norm": 0.8445755422496691, "learning_rate": 9.81893827179097e-05, "loss": 11.712, "num_tokens": 5328457.0, "step": 746 }, { "epoch": 0.5220125786163522, "grad_norm": 0.8525238796762149, "learning_rate": 9.796308512451284e-05, "loss": 11.8845, "num_tokens": 5335909.0, "step": 747 }, { "epoch": 0.522711390635919, "grad_norm": 0.9259968525833697, "learning_rate": 9.773679796614124e-05, "loss": 11.6382, "num_tokens": 5343377.0, "step": 748 }, { "epoch": 0.5234102026554857, "grad_norm": 0.7746588754673115, "learning_rate": 9.751052240205421e-05, "loss": 11.8223, "num_tokens": 5350725.0, "step": 749 }, { "epoch": 0.5241090146750524, "grad_norm": 0.9090044410487871, "learning_rate": 9.728425959145139e-05, "loss": 11.736, "num_tokens": 5358235.0, "step": 750 }, { "epoch": 0.5248078266946191, "grad_norm": 0.8119032545667709, "learning_rate": 9.705801069346729e-05, "loss": 11.8128, "num_tokens": 5365590.0, "step": 751 }, { "epoch": 0.5255066387141859, "grad_norm": 0.8593779120663517, "learning_rate": 9.683177686716501e-05, "loss": 11.7261, "num_tokens": 5371915.0, "step": 752 }, { "epoch": 0.5262054507337526, "grad_norm": 0.9901347880257034, "learning_rate": 9.660555927153047e-05, "loss": 11.9423, "num_tokens": 5378290.0, "step": 753 }, { "epoch": 0.5269042627533194, "grad_norm": 0.7442310095588829, "learning_rate": 9.637935906546655e-05, "loss": 11.7566, "num_tokens": 5385294.0, "step": 754 }, { "epoch": 0.527603074772886, "grad_norm": 0.8575812640939505, "learning_rate": 9.615317740778689e-05, "loss": 11.9029, "num_tokens": 5392707.0, "step": 755 }, { "epoch": 0.5283018867924528, "grad_norm": 0.8047709007984075, "learning_rate": 9.592701545721021e-05, "loss": 11.8251, "num_tokens": 5400588.0, "step": 756 }, { "epoch": 0.5290006988120196, "grad_norm": 0.7860377665780968, "learning_rate": 9.570087437235423e-05, "loss": 11.905, "num_tokens": 5407337.0, "step": 757 }, { "epoch": 0.5296995108315863, "grad_norm": 0.8194379087299098, "learning_rate": 9.547475531172973e-05, "loss": 11.8038, "num_tokens": 5414577.0, "step": 758 }, { "epoch": 0.5303983228511531, "grad_norm": 0.8070308802592453, "learning_rate": 9.524865943373481e-05, "loss": 11.8402, "num_tokens": 5421211.0, "step": 759 }, { "epoch": 0.5310971348707197, "grad_norm": 0.8803852392568843, "learning_rate": 9.502258789664865e-05, "loss": 11.944, "num_tokens": 5427910.0, "step": 760 }, { "epoch": 0.5317959468902865, "grad_norm": 0.8457383467882692, "learning_rate": 9.479654185862579e-05, "loss": 11.9176, "num_tokens": 5434742.0, "step": 761 }, { "epoch": 0.5324947589098532, "grad_norm": 0.7743210614507713, "learning_rate": 9.457052247769017e-05, "loss": 11.847, "num_tokens": 5441794.0, "step": 762 }, { "epoch": 0.53319357092942, "grad_norm": 0.7600921058483522, "learning_rate": 9.434453091172908e-05, "loss": 11.7134, "num_tokens": 5449915.0, "step": 763 }, { "epoch": 0.5338923829489868, "grad_norm": 0.8410587669442078, "learning_rate": 9.411856831848745e-05, "loss": 11.6731, "num_tokens": 5456843.0, "step": 764 }, { "epoch": 0.5345911949685535, "grad_norm": 0.8081870610413466, "learning_rate": 9.38926358555617e-05, "loss": 11.8152, "num_tokens": 5464023.0, "step": 765 }, { "epoch": 0.5352900069881202, "grad_norm": 0.8361358425721516, "learning_rate": 9.366673468039383e-05, "loss": 11.9436, "num_tokens": 5471406.0, "step": 766 }, { "epoch": 0.5359888190076869, "grad_norm": 0.8862719304918909, "learning_rate": 9.34408659502657e-05, "loss": 11.8146, "num_tokens": 5478339.0, "step": 767 }, { "epoch": 0.5366876310272537, "grad_norm": 0.8320140788538509, "learning_rate": 9.321503082229282e-05, "loss": 11.79, "num_tokens": 5485853.0, "step": 768 }, { "epoch": 0.5373864430468204, "grad_norm": 0.8427164891185774, "learning_rate": 9.298923045341869e-05, "loss": 11.6928, "num_tokens": 5493547.0, "step": 769 }, { "epoch": 0.5380852550663872, "grad_norm": 0.9065833012454514, "learning_rate": 9.276346600040862e-05, "loss": 11.4978, "num_tokens": 5500515.0, "step": 770 }, { "epoch": 0.5387840670859538, "grad_norm": 0.8367627544247601, "learning_rate": 9.253773861984397e-05, "loss": 11.6405, "num_tokens": 5507599.0, "step": 771 }, { "epoch": 0.5394828791055206, "grad_norm": 0.8541655457970996, "learning_rate": 9.231204946811624e-05, "loss": 11.7868, "num_tokens": 5514593.0, "step": 772 }, { "epoch": 0.5401816911250874, "grad_norm": 0.8505734247130968, "learning_rate": 9.208639970142093e-05, "loss": 11.8589, "num_tokens": 5521763.0, "step": 773 }, { "epoch": 0.5408805031446541, "grad_norm": 0.7014079921724546, "learning_rate": 9.186079047575197e-05, "loss": 11.7853, "num_tokens": 5529580.0, "step": 774 }, { "epoch": 0.5415793151642209, "grad_norm": 0.775435021642862, "learning_rate": 9.163522294689546e-05, "loss": 11.7545, "num_tokens": 5536873.0, "step": 775 }, { "epoch": 0.5422781271837875, "grad_norm": 0.8986198411125287, "learning_rate": 9.140969827042391e-05, "loss": 11.8047, "num_tokens": 5543550.0, "step": 776 }, { "epoch": 0.5429769392033543, "grad_norm": 0.7809603754618126, "learning_rate": 9.118421760169038e-05, "loss": 11.9793, "num_tokens": 5550884.0, "step": 777 }, { "epoch": 0.543675751222921, "grad_norm": 0.8057085091276677, "learning_rate": 9.095878209582237e-05, "loss": 11.5977, "num_tokens": 5557807.0, "step": 778 }, { "epoch": 0.5443745632424878, "grad_norm": 0.891977933987541, "learning_rate": 9.073339290771603e-05, "loss": 11.8543, "num_tokens": 5564576.0, "step": 779 }, { "epoch": 0.5450733752620545, "grad_norm": 0.7388141711422563, "learning_rate": 9.050805119203035e-05, "loss": 11.7053, "num_tokens": 5572504.0, "step": 780 }, { "epoch": 0.5457721872816212, "grad_norm": 0.7625862495200397, "learning_rate": 9.028275810318095e-05, "loss": 11.6129, "num_tokens": 5579720.0, "step": 781 }, { "epoch": 0.546470999301188, "grad_norm": 0.7581306071682549, "learning_rate": 9.005751479533449e-05, "loss": 11.8261, "num_tokens": 5586866.0, "step": 782 }, { "epoch": 0.5471698113207547, "grad_norm": 0.8062850504793252, "learning_rate": 8.983232242240247e-05, "loss": 11.7799, "num_tokens": 5593273.0, "step": 783 }, { "epoch": 0.5478686233403215, "grad_norm": 0.7696652830589925, "learning_rate": 8.96071821380355e-05, "loss": 11.9284, "num_tokens": 5600825.0, "step": 784 }, { "epoch": 0.5485674353598882, "grad_norm": 0.7465186876499212, "learning_rate": 8.938209509561741e-05, "loss": 11.8043, "num_tokens": 5608068.0, "step": 785 }, { "epoch": 0.549266247379455, "grad_norm": 0.7475919153703743, "learning_rate": 8.91570624482592e-05, "loss": 11.7928, "num_tokens": 5615637.0, "step": 786 }, { "epoch": 0.5499650593990216, "grad_norm": 0.8374151644755823, "learning_rate": 8.893208534879324e-05, "loss": 11.6898, "num_tokens": 5622822.0, "step": 787 }, { "epoch": 0.5506638714185884, "grad_norm": 0.747989105129382, "learning_rate": 8.87071649497673e-05, "loss": 11.7666, "num_tokens": 5629785.0, "step": 788 }, { "epoch": 0.5513626834381551, "grad_norm": 0.7714546710718322, "learning_rate": 8.848230240343865e-05, "loss": 11.7869, "num_tokens": 5636551.0, "step": 789 }, { "epoch": 0.5520614954577219, "grad_norm": 0.8093924548009458, "learning_rate": 8.82574988617683e-05, "loss": 11.7715, "num_tokens": 5643210.0, "step": 790 }, { "epoch": 0.5527603074772887, "grad_norm": 0.7791258807969518, "learning_rate": 8.803275547641488e-05, "loss": 11.6265, "num_tokens": 5650927.0, "step": 791 }, { "epoch": 0.5534591194968553, "grad_norm": 0.7501244610969549, "learning_rate": 8.780807339872886e-05, "loss": 11.8536, "num_tokens": 5658354.0, "step": 792 }, { "epoch": 0.5541579315164221, "grad_norm": 0.9213911806759102, "learning_rate": 8.758345377974667e-05, "loss": 11.9677, "num_tokens": 5664499.0, "step": 793 }, { "epoch": 0.5548567435359888, "grad_norm": 0.8448158401403486, "learning_rate": 8.735889777018465e-05, "loss": 11.8338, "num_tokens": 5670823.0, "step": 794 }, { "epoch": 0.5555555555555556, "grad_norm": 0.7961882205740662, "learning_rate": 8.71344065204335e-05, "loss": 11.6791, "num_tokens": 5678464.0, "step": 795 }, { "epoch": 0.5562543675751223, "grad_norm": 0.9026550542048194, "learning_rate": 8.690998118055193e-05, "loss": 11.701, "num_tokens": 5685313.0, "step": 796 }, { "epoch": 0.556953179594689, "grad_norm": 0.763089852499311, "learning_rate": 8.66856229002611e-05, "loss": 11.819, "num_tokens": 5692972.0, "step": 797 }, { "epoch": 0.5576519916142557, "grad_norm": 0.8827640381152261, "learning_rate": 8.646133282893864e-05, "loss": 11.6831, "num_tokens": 5699327.0, "step": 798 }, { "epoch": 0.5583508036338225, "grad_norm": 0.7797458428691016, "learning_rate": 8.623711211561267e-05, "loss": 11.6918, "num_tokens": 5706702.0, "step": 799 }, { "epoch": 0.5590496156533893, "grad_norm": 0.7838131784869478, "learning_rate": 8.601296190895611e-05, "loss": 11.8356, "num_tokens": 5713918.0, "step": 800 }, { "epoch": 0.559748427672956, "grad_norm": 0.8784533058303018, "learning_rate": 8.578888335728057e-05, "loss": 11.6701, "num_tokens": 5721220.0, "step": 801 }, { "epoch": 0.5604472396925227, "grad_norm": 0.8123001209869102, "learning_rate": 8.55648776085306e-05, "loss": 11.7312, "num_tokens": 5728492.0, "step": 802 }, { "epoch": 0.5611460517120894, "grad_norm": 0.7858814515665766, "learning_rate": 8.534094581027785e-05, "loss": 11.876, "num_tokens": 5735983.0, "step": 803 }, { "epoch": 0.5618448637316562, "grad_norm": 0.8844901910925126, "learning_rate": 8.511708910971505e-05, "loss": 11.7999, "num_tokens": 5742678.0, "step": 804 }, { "epoch": 0.5625436757512229, "grad_norm": 0.8078853147041326, "learning_rate": 8.489330865365018e-05, "loss": 11.7124, "num_tokens": 5750553.0, "step": 805 }, { "epoch": 0.5632424877707897, "grad_norm": 0.7784603290334375, "learning_rate": 8.466960558850077e-05, "loss": 11.4818, "num_tokens": 5758022.0, "step": 806 }, { "epoch": 0.5639412997903563, "grad_norm": 0.7639221830150267, "learning_rate": 8.444598106028773e-05, "loss": 11.6915, "num_tokens": 5765255.0, "step": 807 }, { "epoch": 0.5646401118099231, "grad_norm": 0.802882077997663, "learning_rate": 8.422243621462969e-05, "loss": 11.7076, "num_tokens": 5772746.0, "step": 808 }, { "epoch": 0.5653389238294899, "grad_norm": 0.7669134365996498, "learning_rate": 8.399897219673709e-05, "loss": 11.6297, "num_tokens": 5779819.0, "step": 809 }, { "epoch": 0.5660377358490566, "grad_norm": 0.7376107274278996, "learning_rate": 8.37755901514062e-05, "loss": 11.7143, "num_tokens": 5787293.0, "step": 810 }, { "epoch": 0.5667365478686234, "grad_norm": 0.7229725613909326, "learning_rate": 8.355229122301348e-05, "loss": 11.6863, "num_tokens": 5795358.0, "step": 811 }, { "epoch": 0.56743535988819, "grad_norm": 0.8065237716277036, "learning_rate": 8.332907655550948e-05, "loss": 11.8595, "num_tokens": 5802192.0, "step": 812 }, { "epoch": 0.5681341719077568, "grad_norm": 0.8397707150038022, "learning_rate": 8.310594729241317e-05, "loss": 11.8714, "num_tokens": 5809107.0, "step": 813 }, { "epoch": 0.5688329839273235, "grad_norm": 0.7728890632051899, "learning_rate": 8.288290457680591e-05, "loss": 11.6326, "num_tokens": 5815896.0, "step": 814 }, { "epoch": 0.5695317959468903, "grad_norm": 0.7647611926282754, "learning_rate": 8.265994955132572e-05, "loss": 11.8586, "num_tokens": 5823124.0, "step": 815 }, { "epoch": 0.570230607966457, "grad_norm": 0.835572575499216, "learning_rate": 8.243708335816145e-05, "loss": 11.9913, "num_tokens": 5829951.0, "step": 816 }, { "epoch": 0.5709294199860238, "grad_norm": 0.7771857677434221, "learning_rate": 8.221430713904672e-05, "loss": 11.5943, "num_tokens": 5836842.0, "step": 817 }, { "epoch": 0.5716282320055905, "grad_norm": 0.7915716967198777, "learning_rate": 8.19916220352544e-05, "loss": 11.7728, "num_tokens": 5844342.0, "step": 818 }, { "epoch": 0.5723270440251572, "grad_norm": 0.7337383433889063, "learning_rate": 8.176902918759041e-05, "loss": 11.7272, "num_tokens": 5851608.0, "step": 819 }, { "epoch": 0.573025856044724, "grad_norm": 0.7784845393576943, "learning_rate": 8.15465297363881e-05, "loss": 11.5759, "num_tokens": 5858938.0, "step": 820 }, { "epoch": 0.5737246680642907, "grad_norm": 0.7834054172956932, "learning_rate": 8.132412482150245e-05, "loss": 11.7405, "num_tokens": 5865983.0, "step": 821 }, { "epoch": 0.5744234800838575, "grad_norm": 0.8342582004871633, "learning_rate": 8.110181558230404e-05, "loss": 11.8399, "num_tokens": 5872600.0, "step": 822 }, { "epoch": 0.5751222921034241, "grad_norm": 0.8664104035122296, "learning_rate": 8.087960315767328e-05, "loss": 11.7264, "num_tokens": 5878398.0, "step": 823 }, { "epoch": 0.5758211041229909, "grad_norm": 0.728258050995661, "learning_rate": 8.06574886859947e-05, "loss": 11.8376, "num_tokens": 5886022.0, "step": 824 }, { "epoch": 0.5765199161425576, "grad_norm": 0.7762790042757487, "learning_rate": 8.043547330515092e-05, "loss": 11.7216, "num_tokens": 5893141.0, "step": 825 }, { "epoch": 0.5772187281621244, "grad_norm": 0.76741040938386, "learning_rate": 8.021355815251703e-05, "loss": 11.7179, "num_tokens": 5900162.0, "step": 826 }, { "epoch": 0.5779175401816912, "grad_norm": 0.8431161282769128, "learning_rate": 7.999174436495456e-05, "loss": 11.5942, "num_tokens": 5907169.0, "step": 827 }, { "epoch": 0.5786163522012578, "grad_norm": 0.761803797183389, "learning_rate": 7.97700330788058e-05, "loss": 11.866, "num_tokens": 5914949.0, "step": 828 }, { "epoch": 0.5793151642208246, "grad_norm": 0.8127619905274301, "learning_rate": 7.954842542988792e-05, "loss": 11.8023, "num_tokens": 5921813.0, "step": 829 }, { "epoch": 0.5800139762403913, "grad_norm": 0.7588539365182271, "learning_rate": 7.932692255348711e-05, "loss": 11.9015, "num_tokens": 5929467.0, "step": 830 }, { "epoch": 0.5807127882599581, "grad_norm": 0.722417526935907, "learning_rate": 7.910552558435297e-05, "loss": 11.5916, "num_tokens": 5936756.0, "step": 831 }, { "epoch": 0.5814116002795248, "grad_norm": 0.734304834951851, "learning_rate": 7.888423565669236e-05, "loss": 11.7022, "num_tokens": 5944825.0, "step": 832 }, { "epoch": 0.5821104122990916, "grad_norm": 0.7651356943028277, "learning_rate": 7.866305390416385e-05, "loss": 11.7116, "num_tokens": 5951987.0, "step": 833 }, { "epoch": 0.5828092243186582, "grad_norm": 0.7263482640928406, "learning_rate": 7.844198145987187e-05, "loss": 11.6426, "num_tokens": 5959236.0, "step": 834 }, { "epoch": 0.583508036338225, "grad_norm": 0.8136003869167677, "learning_rate": 7.82210194563608e-05, "loss": 12.0265, "num_tokens": 5966125.0, "step": 835 }, { "epoch": 0.5842068483577918, "grad_norm": 0.7931185544520268, "learning_rate": 7.800016902560924e-05, "loss": 11.7356, "num_tokens": 5974067.0, "step": 836 }, { "epoch": 0.5849056603773585, "grad_norm": 0.77986440427414, "learning_rate": 7.77794312990243e-05, "loss": 11.7585, "num_tokens": 5980876.0, "step": 837 }, { "epoch": 0.5856044723969253, "grad_norm": 0.7930623347892715, "learning_rate": 7.755880740743559e-05, "loss": 11.9373, "num_tokens": 5988494.0, "step": 838 }, { "epoch": 0.5863032844164919, "grad_norm": 0.817088280185177, "learning_rate": 7.733829848108965e-05, "loss": 11.6533, "num_tokens": 5995555.0, "step": 839 }, { "epoch": 0.5870020964360587, "grad_norm": 0.816574051747384, "learning_rate": 7.7117905649644e-05, "loss": 11.5985, "num_tokens": 6002655.0, "step": 840 }, { "epoch": 0.5877009084556254, "grad_norm": 0.7920959598315253, "learning_rate": 7.689763004216135e-05, "loss": 11.8124, "num_tokens": 6009185.0, "step": 841 }, { "epoch": 0.5883997204751922, "grad_norm": 0.8852589257704687, "learning_rate": 7.667747278710406e-05, "loss": 11.54, "num_tokens": 6016823.0, "step": 842 }, { "epoch": 0.589098532494759, "grad_norm": 0.7548742035632593, "learning_rate": 7.6457435012328e-05, "loss": 11.7246, "num_tokens": 6024771.0, "step": 843 }, { "epoch": 0.5897973445143256, "grad_norm": 0.854098594965677, "learning_rate": 7.623751784507706e-05, "loss": 11.664, "num_tokens": 6031975.0, "step": 844 }, { "epoch": 0.5904961565338924, "grad_norm": 0.7853779219200732, "learning_rate": 7.601772241197719e-05, "loss": 11.9128, "num_tokens": 6039124.0, "step": 845 }, { "epoch": 0.5911949685534591, "grad_norm": 0.8383469114270452, "learning_rate": 7.579804983903067e-05, "loss": 11.8526, "num_tokens": 6045752.0, "step": 846 }, { "epoch": 0.5918937805730259, "grad_norm": 0.8878587452859725, "learning_rate": 7.557850125161053e-05, "loss": 11.856, "num_tokens": 6052231.0, "step": 847 }, { "epoch": 0.5925925925925926, "grad_norm": 0.8333451117092037, "learning_rate": 7.535907777445449e-05, "loss": 11.6259, "num_tokens": 6059886.0, "step": 848 }, { "epoch": 0.5932914046121593, "grad_norm": 0.8078970484894613, "learning_rate": 7.513978053165934e-05, "loss": 11.8283, "num_tokens": 6066548.0, "step": 849 }, { "epoch": 0.593990216631726, "grad_norm": 0.7637621169363601, "learning_rate": 7.492061064667526e-05, "loss": 11.7571, "num_tokens": 6074097.0, "step": 850 }, { "epoch": 0.5946890286512928, "grad_norm": 0.8517447452369387, "learning_rate": 7.470156924229988e-05, "loss": 11.7946, "num_tokens": 6080717.0, "step": 851 }, { "epoch": 0.5953878406708596, "grad_norm": 0.7944555032647729, "learning_rate": 7.448265744067275e-05, "loss": 11.6181, "num_tokens": 6088678.0, "step": 852 }, { "epoch": 0.5960866526904263, "grad_norm": 0.8290060549583308, "learning_rate": 7.426387636326936e-05, "loss": 11.7951, "num_tokens": 6095534.0, "step": 853 }, { "epoch": 0.596785464709993, "grad_norm": 0.7769775625071952, "learning_rate": 7.404522713089554e-05, "loss": 11.8202, "num_tokens": 6102965.0, "step": 854 }, { "epoch": 0.5974842767295597, "grad_norm": 0.788894868014834, "learning_rate": 7.382671086368172e-05, "loss": 11.5052, "num_tokens": 6109857.0, "step": 855 }, { "epoch": 0.5981830887491265, "grad_norm": 0.9269678831820426, "learning_rate": 7.360832868107708e-05, "loss": 12.0685, "num_tokens": 6116670.0, "step": 856 }, { "epoch": 0.5988819007686932, "grad_norm": 0.7500023253076064, "learning_rate": 7.3390081701844e-05, "loss": 11.6106, "num_tokens": 6123896.0, "step": 857 }, { "epoch": 0.59958071278826, "grad_norm": 0.8334471335860142, "learning_rate": 7.317197104405213e-05, "loss": 11.8254, "num_tokens": 6130750.0, "step": 858 }, { "epoch": 0.6002795248078266, "grad_norm": 0.9036108784233868, "learning_rate": 7.295399782507275e-05, "loss": 11.6364, "num_tokens": 6137850.0, "step": 859 }, { "epoch": 0.6009783368273934, "grad_norm": 0.773889565114804, "learning_rate": 7.273616316157312e-05, "loss": 11.6782, "num_tokens": 6144967.0, "step": 860 }, { "epoch": 0.6016771488469602, "grad_norm": 0.991907637656795, "learning_rate": 7.251846816951063e-05, "loss": 11.6562, "num_tokens": 6151858.0, "step": 861 }, { "epoch": 0.6023759608665269, "grad_norm": 0.8650022175317262, "learning_rate": 7.23009139641271e-05, "loss": 11.7501, "num_tokens": 6158407.0, "step": 862 }, { "epoch": 0.6030747728860937, "grad_norm": 0.7830337647367694, "learning_rate": 7.208350165994325e-05, "loss": 11.9671, "num_tokens": 6165454.0, "step": 863 }, { "epoch": 0.6037735849056604, "grad_norm": 0.9634536848543019, "learning_rate": 7.186623237075265e-05, "loss": 11.718, "num_tokens": 6172278.0, "step": 864 }, { "epoch": 0.6044723969252271, "grad_norm": 0.8009388532348747, "learning_rate": 7.16491072096164e-05, "loss": 11.797, "num_tokens": 6179334.0, "step": 865 }, { "epoch": 0.6051712089447938, "grad_norm": 0.7794753187578644, "learning_rate": 7.143212728885714e-05, "loss": 11.7032, "num_tokens": 6186949.0, "step": 866 }, { "epoch": 0.6058700209643606, "grad_norm": 0.8209269866004077, "learning_rate": 7.121529372005335e-05, "loss": 11.7299, "num_tokens": 6194084.0, "step": 867 }, { "epoch": 0.6065688329839273, "grad_norm": 0.788054239481001, "learning_rate": 7.099860761403403e-05, "loss": 11.8418, "num_tokens": 6201312.0, "step": 868 }, { "epoch": 0.6072676450034941, "grad_norm": 0.7412690116578288, "learning_rate": 7.078207008087248e-05, "loss": 11.7934, "num_tokens": 6208547.0, "step": 869 }, { "epoch": 0.6079664570230608, "grad_norm": 0.7451063882822027, "learning_rate": 7.056568222988099e-05, "loss": 11.8384, "num_tokens": 6215891.0, "step": 870 }, { "epoch": 0.6086652690426275, "grad_norm": 0.7488238631148831, "learning_rate": 7.034944516960498e-05, "loss": 11.7576, "num_tokens": 6223503.0, "step": 871 }, { "epoch": 0.6093640810621943, "grad_norm": 0.7457484829284764, "learning_rate": 7.013336000781738e-05, "loss": 11.8307, "num_tokens": 6230239.0, "step": 872 }, { "epoch": 0.610062893081761, "grad_norm": 0.7550948672495834, "learning_rate": 6.991742785151305e-05, "loss": 11.6902, "num_tokens": 6237658.0, "step": 873 }, { "epoch": 0.6107617051013278, "grad_norm": 0.7831287226929119, "learning_rate": 6.970164980690285e-05, "loss": 11.8591, "num_tokens": 6244558.0, "step": 874 }, { "epoch": 0.6114605171208944, "grad_norm": 0.7615226027016072, "learning_rate": 6.94860269794083e-05, "loss": 11.8389, "num_tokens": 6251785.0, "step": 875 }, { "epoch": 0.6121593291404612, "grad_norm": 0.7467213227226704, "learning_rate": 6.927056047365557e-05, "loss": 11.6384, "num_tokens": 6258748.0, "step": 876 }, { "epoch": 0.6128581411600279, "grad_norm": 0.7521897728411195, "learning_rate": 6.905525139347011e-05, "loss": 11.7597, "num_tokens": 6266062.0, "step": 877 }, { "epoch": 0.6135569531795947, "grad_norm": 0.7647668153792994, "learning_rate": 6.884010084187093e-05, "loss": 11.7313, "num_tokens": 6272639.0, "step": 878 }, { "epoch": 0.6142557651991615, "grad_norm": 0.7902525252819772, "learning_rate": 6.86251099210648e-05, "loss": 11.8622, "num_tokens": 6279204.0, "step": 879 }, { "epoch": 0.6149545772187281, "grad_norm": 0.7303818732013376, "learning_rate": 6.841027973244076e-05, "loss": 11.7259, "num_tokens": 6286372.0, "step": 880 }, { "epoch": 0.6156533892382949, "grad_norm": 0.794393939694085, "learning_rate": 6.819561137656443e-05, "loss": 11.6617, "num_tokens": 6292809.0, "step": 881 }, { "epoch": 0.6163522012578616, "grad_norm": 0.7485246654196547, "learning_rate": 6.798110595317229e-05, "loss": 11.6752, "num_tokens": 6300059.0, "step": 882 }, { "epoch": 0.6170510132774284, "grad_norm": 0.7892650884136205, "learning_rate": 6.776676456116629e-05, "loss": 11.8064, "num_tokens": 6306992.0, "step": 883 }, { "epoch": 0.6177498252969951, "grad_norm": 0.7816686405399219, "learning_rate": 6.755258829860791e-05, "loss": 11.6161, "num_tokens": 6313854.0, "step": 884 }, { "epoch": 0.6184486373165619, "grad_norm": 0.7673507811300475, "learning_rate": 6.733857826271271e-05, "loss": 11.6007, "num_tokens": 6320866.0, "step": 885 }, { "epoch": 0.6191474493361285, "grad_norm": 0.7482768854684733, "learning_rate": 6.712473554984472e-05, "loss": 11.6434, "num_tokens": 6328614.0, "step": 886 }, { "epoch": 0.6198462613556953, "grad_norm": 0.7084508806822576, "learning_rate": 6.69110612555107e-05, "loss": 11.7712, "num_tokens": 6336010.0, "step": 887 }, { "epoch": 0.6205450733752621, "grad_norm": 0.8161348307383346, "learning_rate": 6.669755647435474e-05, "loss": 11.6412, "num_tokens": 6342374.0, "step": 888 }, { "epoch": 0.6212438853948288, "grad_norm": 0.7548031047135115, "learning_rate": 6.648422230015242e-05, "loss": 11.5709, "num_tokens": 6349581.0, "step": 889 }, { "epoch": 0.6219426974143956, "grad_norm": 0.7717987232600103, "learning_rate": 6.627105982580528e-05, "loss": 11.784, "num_tokens": 6356441.0, "step": 890 }, { "epoch": 0.6226415094339622, "grad_norm": 0.7798435061265528, "learning_rate": 6.605807014333538e-05, "loss": 11.646, "num_tokens": 6363148.0, "step": 891 }, { "epoch": 0.623340321453529, "grad_norm": 0.7800404194831362, "learning_rate": 6.584525434387944e-05, "loss": 11.6328, "num_tokens": 6370176.0, "step": 892 }, { "epoch": 0.6240391334730957, "grad_norm": 0.7682712236013818, "learning_rate": 6.563261351768345e-05, "loss": 11.8973, "num_tokens": 6377384.0, "step": 893 }, { "epoch": 0.6247379454926625, "grad_norm": 0.7258219997442539, "learning_rate": 6.542014875409703e-05, "loss": 11.6427, "num_tokens": 6384876.0, "step": 894 }, { "epoch": 0.6254367575122292, "grad_norm": 0.7952066180536073, "learning_rate": 6.52078611415678e-05, "loss": 11.6802, "num_tokens": 6391664.0, "step": 895 }, { "epoch": 0.6261355695317959, "grad_norm": 0.8232647183093804, "learning_rate": 6.49957517676359e-05, "loss": 11.6117, "num_tokens": 6398231.0, "step": 896 }, { "epoch": 0.6268343815513627, "grad_norm": 0.7882657323119439, "learning_rate": 6.47838217189283e-05, "loss": 11.6319, "num_tokens": 6405010.0, "step": 897 }, { "epoch": 0.6275331935709294, "grad_norm": 0.7265020915206697, "learning_rate": 6.457207208115328e-05, "loss": 11.7671, "num_tokens": 6412293.0, "step": 898 }, { "epoch": 0.6282320055904962, "grad_norm": 0.7966290295146391, "learning_rate": 6.436050393909499e-05, "loss": 11.8313, "num_tokens": 6419190.0, "step": 899 }, { "epoch": 0.6289308176100629, "grad_norm": 0.7395527198077914, "learning_rate": 6.414911837660768e-05, "loss": 11.4508, "num_tokens": 6426719.0, "step": 900 }, { "epoch": 0.6296296296296297, "grad_norm": 0.7299541853834819, "learning_rate": 6.393791647661032e-05, "loss": 11.8029, "num_tokens": 6434060.0, "step": 901 }, { "epoch": 0.6303284416491963, "grad_norm": 0.7463396559075024, "learning_rate": 6.372689932108091e-05, "loss": 11.7718, "num_tokens": 6441144.0, "step": 902 }, { "epoch": 0.6310272536687631, "grad_norm": 0.7959819255730147, "learning_rate": 6.351606799105107e-05, "loss": 11.545, "num_tokens": 6448000.0, "step": 903 }, { "epoch": 0.6317260656883298, "grad_norm": 0.6790515513892001, "learning_rate": 6.330542356660046e-05, "loss": 11.7301, "num_tokens": 6455854.0, "step": 904 }, { "epoch": 0.6324248777078966, "grad_norm": 0.7426253486979305, "learning_rate": 6.309496712685122e-05, "loss": 11.6712, "num_tokens": 6462687.0, "step": 905 }, { "epoch": 0.6331236897274634, "grad_norm": 0.7600216560906856, "learning_rate": 6.288469974996234e-05, "loss": 11.7371, "num_tokens": 6469302.0, "step": 906 }, { "epoch": 0.63382250174703, "grad_norm": 0.8364647552182698, "learning_rate": 6.267462251312445e-05, "loss": 11.7523, "num_tokens": 6475494.0, "step": 907 }, { "epoch": 0.6345213137665968, "grad_norm": 0.7196566382149244, "learning_rate": 6.24647364925539e-05, "loss": 11.6858, "num_tokens": 6483132.0, "step": 908 }, { "epoch": 0.6352201257861635, "grad_norm": 0.7234513446495888, "learning_rate": 6.225504276348766e-05, "loss": 11.5524, "num_tokens": 6490796.0, "step": 909 }, { "epoch": 0.6359189378057303, "grad_norm": 0.7221735536071997, "learning_rate": 6.204554240017742e-05, "loss": 11.6109, "num_tokens": 6498706.0, "step": 910 }, { "epoch": 0.636617749825297, "grad_norm": 0.7589093046517474, "learning_rate": 6.183623647588427e-05, "loss": 11.8576, "num_tokens": 6505674.0, "step": 911 }, { "epoch": 0.6373165618448637, "grad_norm": 0.8427489792553641, "learning_rate": 6.162712606287335e-05, "loss": 11.6848, "num_tokens": 6512122.0, "step": 912 }, { "epoch": 0.6380153738644304, "grad_norm": 0.7509384033664809, "learning_rate": 6.141821223240804e-05, "loss": 11.6497, "num_tokens": 6519491.0, "step": 913 }, { "epoch": 0.6387141858839972, "grad_norm": 0.7727801776974009, "learning_rate": 6.120949605474478e-05, "loss": 11.7904, "num_tokens": 6526251.0, "step": 914 }, { "epoch": 0.639412997903564, "grad_norm": 0.7698221947163191, "learning_rate": 6.100097859912732e-05, "loss": 11.7356, "num_tokens": 6532851.0, "step": 915 }, { "epoch": 0.6401118099231307, "grad_norm": 0.8484210212133271, "learning_rate": 6.0792660933781375e-05, "loss": 11.6482, "num_tokens": 6539983.0, "step": 916 }, { "epoch": 0.6408106219426974, "grad_norm": 0.8146973477377301, "learning_rate": 6.058454412590928e-05, "loss": 11.682, "num_tokens": 6547267.0, "step": 917 }, { "epoch": 0.6415094339622641, "grad_norm": 0.8066477438364413, "learning_rate": 6.037662924168419e-05, "loss": 11.5587, "num_tokens": 6554111.0, "step": 918 }, { "epoch": 0.6422082459818309, "grad_norm": 0.7961568117356921, "learning_rate": 6.016891734624501e-05, "loss": 11.7357, "num_tokens": 6561069.0, "step": 919 }, { "epoch": 0.6429070580013976, "grad_norm": 0.8791796417982806, "learning_rate": 5.9961409503690605e-05, "loss": 11.884, "num_tokens": 6568238.0, "step": 920 }, { "epoch": 0.6436058700209644, "grad_norm": 0.712988301703953, "learning_rate": 5.975410677707447e-05, "loss": 11.7497, "num_tokens": 6575880.0, "step": 921 }, { "epoch": 0.6443046820405312, "grad_norm": 0.7647107034158884, "learning_rate": 5.954701022839944e-05, "loss": 11.6668, "num_tokens": 6582676.0, "step": 922 }, { "epoch": 0.6450034940600978, "grad_norm": 0.7279408863948003, "learning_rate": 5.9340120918611994e-05, "loss": 11.8926, "num_tokens": 6590130.0, "step": 923 }, { "epoch": 0.6457023060796646, "grad_norm": 0.7858715219906657, "learning_rate": 5.913343990759695e-05, "loss": 11.8346, "num_tokens": 6596815.0, "step": 924 }, { "epoch": 0.6464011180992313, "grad_norm": 0.7662237988937426, "learning_rate": 5.8926968254172076e-05, "loss": 11.3992, "num_tokens": 6604041.0, "step": 925 }, { "epoch": 0.6470999301187981, "grad_norm": 0.7807542174388001, "learning_rate": 5.872070701608251e-05, "loss": 11.7433, "num_tokens": 6611449.0, "step": 926 }, { "epoch": 0.6477987421383647, "grad_norm": 0.7770790305426569, "learning_rate": 5.851465724999559e-05, "loss": 11.8598, "num_tokens": 6618417.0, "step": 927 }, { "epoch": 0.6484975541579315, "grad_norm": 0.7276280775957402, "learning_rate": 5.830882001149517e-05, "loss": 11.6542, "num_tokens": 6626409.0, "step": 928 }, { "epoch": 0.6491963661774982, "grad_norm": 0.7659912850579355, "learning_rate": 5.8103196355076305e-05, "loss": 11.7479, "num_tokens": 6633421.0, "step": 929 }, { "epoch": 0.649895178197065, "grad_norm": 0.7488754867958327, "learning_rate": 5.789778733414004e-05, "loss": 11.681, "num_tokens": 6641387.0, "step": 930 }, { "epoch": 0.6505939902166318, "grad_norm": 0.7426656034656497, "learning_rate": 5.769259400098769e-05, "loss": 11.5628, "num_tokens": 6648880.0, "step": 931 }, { "epoch": 0.6512928022361985, "grad_norm": 0.7485244300975621, "learning_rate": 5.748761740681573e-05, "loss": 11.7034, "num_tokens": 6656518.0, "step": 932 }, { "epoch": 0.6519916142557652, "grad_norm": 0.7280973172346309, "learning_rate": 5.728285860171021e-05, "loss": 11.5834, "num_tokens": 6663873.0, "step": 933 }, { "epoch": 0.6526904262753319, "grad_norm": 0.7690745999643336, "learning_rate": 5.7078318634641456e-05, "loss": 11.7479, "num_tokens": 6671021.0, "step": 934 }, { "epoch": 0.6533892382948987, "grad_norm": 0.8091833500142877, "learning_rate": 5.687399855345879e-05, "loss": 11.7293, "num_tokens": 6677620.0, "step": 935 }, { "epoch": 0.6540880503144654, "grad_norm": 0.7583842125434499, "learning_rate": 5.666989940488496e-05, "loss": 11.6877, "num_tokens": 6685101.0, "step": 936 }, { "epoch": 0.6547868623340322, "grad_norm": 0.7756271660445293, "learning_rate": 5.646602223451094e-05, "loss": 11.5556, "num_tokens": 6692207.0, "step": 937 }, { "epoch": 0.6554856743535988, "grad_norm": 0.732585726312475, "learning_rate": 5.6262368086790504e-05, "loss": 11.6001, "num_tokens": 6699759.0, "step": 938 }, { "epoch": 0.6561844863731656, "grad_norm": 0.7822132210828396, "learning_rate": 5.605893800503484e-05, "loss": 11.5981, "num_tokens": 6706906.0, "step": 939 }, { "epoch": 0.6568832983927324, "grad_norm": 0.80455226020786, "learning_rate": 5.585573303140741e-05, "loss": 11.7701, "num_tokens": 6713394.0, "step": 940 }, { "epoch": 0.6575821104122991, "grad_norm": 0.7515507860338341, "learning_rate": 5.565275420691831e-05, "loss": 11.6546, "num_tokens": 6720211.0, "step": 941 }, { "epoch": 0.6582809224318659, "grad_norm": 0.7279100113231547, "learning_rate": 5.5450002571419104e-05, "loss": 11.8563, "num_tokens": 6727718.0, "step": 942 }, { "epoch": 0.6589797344514325, "grad_norm": 0.740532850900047, "learning_rate": 5.524747916359756e-05, "loss": 11.5629, "num_tokens": 6735027.0, "step": 943 }, { "epoch": 0.6596785464709993, "grad_norm": 0.7190050209844847, "learning_rate": 5.504518502097212e-05, "loss": 11.7309, "num_tokens": 6742667.0, "step": 944 }, { "epoch": 0.660377358490566, "grad_norm": 0.7281631801675564, "learning_rate": 5.484312117988687e-05, "loss": 11.6905, "num_tokens": 6750129.0, "step": 945 }, { "epoch": 0.6610761705101328, "grad_norm": 0.7642082512718011, "learning_rate": 5.464128867550593e-05, "loss": 11.5831, "num_tokens": 6756898.0, "step": 946 }, { "epoch": 0.6617749825296995, "grad_norm": 0.7353524837824559, "learning_rate": 5.4439688541808345e-05, "loss": 11.8927, "num_tokens": 6763921.0, "step": 947 }, { "epoch": 0.6624737945492662, "grad_norm": 0.7252766686868163, "learning_rate": 5.423832181158274e-05, "loss": 11.665, "num_tokens": 6771091.0, "step": 948 }, { "epoch": 0.663172606568833, "grad_norm": 0.7113069997901473, "learning_rate": 5.4037189516422e-05, "loss": 11.5554, "num_tokens": 6778488.0, "step": 949 }, { "epoch": 0.6638714185883997, "grad_norm": 0.7014962329353019, "learning_rate": 5.383629268671804e-05, "loss": 11.5834, "num_tokens": 6785464.0, "step": 950 }, { "epoch": 0.6645702306079665, "grad_norm": 0.7875873029591225, "learning_rate": 5.3635632351656495e-05, "loss": 11.8594, "num_tokens": 6792243.0, "step": 951 }, { "epoch": 0.6652690426275332, "grad_norm": 0.7213066317257657, "learning_rate": 5.3435209539211394e-05, "loss": 11.6927, "num_tokens": 6799493.0, "step": 952 }, { "epoch": 0.6659678546471, "grad_norm": 0.7035828384585064, "learning_rate": 5.323502527614007e-05, "loss": 11.6259, "num_tokens": 6806467.0, "step": 953 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7809434468803235, "learning_rate": 5.303508058797766e-05, "loss": 11.6873, "num_tokens": 6813306.0, "step": 954 }, { "epoch": 0.6673654786862334, "grad_norm": 0.7283465666823379, "learning_rate": 5.2835376499031955e-05, "loss": 11.4497, "num_tokens": 6820518.0, "step": 955 }, { "epoch": 0.6680642907058001, "grad_norm": 0.7617743650609611, "learning_rate": 5.263591403237831e-05, "loss": 11.668, "num_tokens": 6827100.0, "step": 956 }, { "epoch": 0.6687631027253669, "grad_norm": 0.8177076699367816, "learning_rate": 5.243669420985413e-05, "loss": 11.6849, "num_tokens": 6833940.0, "step": 957 }, { "epoch": 0.6694619147449337, "grad_norm": 0.7408475483142893, "learning_rate": 5.22377180520538e-05, "loss": 11.8671, "num_tokens": 6841696.0, "step": 958 }, { "epoch": 0.6701607267645003, "grad_norm": 0.7169014961837786, "learning_rate": 5.2038986578323437e-05, "loss": 11.8537, "num_tokens": 6849308.0, "step": 959 }, { "epoch": 0.6708595387840671, "grad_norm": 0.7580468191959683, "learning_rate": 5.1840500806755575e-05, "loss": 11.5533, "num_tokens": 6856816.0, "step": 960 }, { "epoch": 0.6715583508036338, "grad_norm": 0.748707497098157, "learning_rate": 5.164226175418421e-05, "loss": 11.7469, "num_tokens": 6863209.0, "step": 961 }, { "epoch": 0.6722571628232006, "grad_norm": 0.7525456270124457, "learning_rate": 5.1444270436179185e-05, "loss": 11.4797, "num_tokens": 6869856.0, "step": 962 }, { "epoch": 0.6729559748427673, "grad_norm": 0.7211635471907168, "learning_rate": 5.12465278670414e-05, "loss": 11.6617, "num_tokens": 6877774.0, "step": 963 }, { "epoch": 0.673654786862334, "grad_norm": 0.7553976822609867, "learning_rate": 5.10490350597973e-05, "loss": 11.5848, "num_tokens": 6884547.0, "step": 964 }, { "epoch": 0.6743535988819007, "grad_norm": 0.7022807434186031, "learning_rate": 5.085179302619383e-05, "loss": 11.7242, "num_tokens": 6892114.0, "step": 965 }, { "epoch": 0.6750524109014675, "grad_norm": 0.780693519873508, "learning_rate": 5.06548027766933e-05, "loss": 11.8297, "num_tokens": 6899318.0, "step": 966 }, { "epoch": 0.6757512229210343, "grad_norm": 0.7267581172978977, "learning_rate": 5.045806532046806e-05, "loss": 11.6526, "num_tokens": 6906566.0, "step": 967 }, { "epoch": 0.676450034940601, "grad_norm": 0.7846570944458914, "learning_rate": 5.0261581665395475e-05, "loss": 11.6798, "num_tokens": 6913114.0, "step": 968 }, { "epoch": 0.6771488469601677, "grad_norm": 0.7112948402372895, "learning_rate": 5.006535281805265e-05, "loss": 11.5148, "num_tokens": 6920642.0, "step": 969 }, { "epoch": 0.6778476589797344, "grad_norm": 0.7583944622835155, "learning_rate": 4.9869379783711315e-05, "loss": 11.5235, "num_tokens": 6927787.0, "step": 970 }, { "epoch": 0.6785464709993012, "grad_norm": 0.7573666973845233, "learning_rate": 4.967366356633275e-05, "loss": 11.5615, "num_tokens": 6935406.0, "step": 971 }, { "epoch": 0.6792452830188679, "grad_norm": 0.7227775502190015, "learning_rate": 4.947820516856253e-05, "loss": 11.6955, "num_tokens": 6942968.0, "step": 972 }, { "epoch": 0.6799440950384347, "grad_norm": 0.7814966089422306, "learning_rate": 4.9283005591725375e-05, "loss": 11.752, "num_tokens": 6949948.0, "step": 973 }, { "epoch": 0.6806429070580013, "grad_norm": 0.7175546627631187, "learning_rate": 4.908806583582021e-05, "loss": 11.7433, "num_tokens": 6957454.0, "step": 974 }, { "epoch": 0.6813417190775681, "grad_norm": 0.7517273070787456, "learning_rate": 4.8893386899514746e-05, "loss": 11.6715, "num_tokens": 6964979.0, "step": 975 }, { "epoch": 0.6820405310971349, "grad_norm": 0.669368555621721, "learning_rate": 4.869896978014071e-05, "loss": 11.6769, "num_tokens": 6972329.0, "step": 976 }, { "epoch": 0.6827393431167016, "grad_norm": 0.7504037459255867, "learning_rate": 4.85048154736884e-05, "loss": 11.7083, "num_tokens": 6978773.0, "step": 977 }, { "epoch": 0.6834381551362684, "grad_norm": 0.7077905666186198, "learning_rate": 4.831092497480179e-05, "loss": 11.6217, "num_tokens": 6986336.0, "step": 978 }, { "epoch": 0.684136967155835, "grad_norm": 0.8637933521061518, "learning_rate": 4.81172992767734e-05, "loss": 11.6379, "num_tokens": 6992944.0, "step": 979 }, { "epoch": 0.6848357791754018, "grad_norm": 0.7721626875146355, "learning_rate": 4.792393937153914e-05, "loss": 11.7463, "num_tokens": 6999683.0, "step": 980 }, { "epoch": 0.6855345911949685, "grad_norm": 0.7117222748710471, "learning_rate": 4.773084624967327e-05, "loss": 11.5521, "num_tokens": 7006810.0, "step": 981 }, { "epoch": 0.6862334032145353, "grad_norm": 0.7539774224870581, "learning_rate": 4.753802090038344e-05, "loss": 11.8959, "num_tokens": 7013925.0, "step": 982 }, { "epoch": 0.686932215234102, "grad_norm": 0.8327203871710336, "learning_rate": 4.734546431150536e-05, "loss": 11.8562, "num_tokens": 7020571.0, "step": 983 }, { "epoch": 0.6876310272536688, "grad_norm": 0.7708657591442682, "learning_rate": 4.715317746949804e-05, "loss": 11.8254, "num_tokens": 7027401.0, "step": 984 }, { "epoch": 0.6883298392732355, "grad_norm": 0.7258834917462351, "learning_rate": 4.6961161359438486e-05, "loss": 11.4523, "num_tokens": 7034682.0, "step": 985 }, { "epoch": 0.6890286512928022, "grad_norm": 0.7330377602083751, "learning_rate": 4.676941696501673e-05, "loss": 11.5904, "num_tokens": 7042175.0, "step": 986 }, { "epoch": 0.689727463312369, "grad_norm": 0.8028105455984438, "learning_rate": 4.657794526853096e-05, "loss": 11.6372, "num_tokens": 7048976.0, "step": 987 }, { "epoch": 0.6904262753319357, "grad_norm": 0.7215274112971415, "learning_rate": 4.6386747250882224e-05, "loss": 11.6798, "num_tokens": 7056210.0, "step": 988 }, { "epoch": 0.6911250873515025, "grad_norm": 0.7769719500087231, "learning_rate": 4.6195823891569545e-05, "loss": 11.8147, "num_tokens": 7063291.0, "step": 989 }, { "epoch": 0.6918238993710691, "grad_norm": 0.7633412054706588, "learning_rate": 4.60051761686849e-05, "loss": 11.7149, "num_tokens": 7070074.0, "step": 990 }, { "epoch": 0.6925227113906359, "grad_norm": 0.7299361913347738, "learning_rate": 4.581480505890816e-05, "loss": 11.5324, "num_tokens": 7077328.0, "step": 991 }, { "epoch": 0.6932215234102026, "grad_norm": 0.7373024767213333, "learning_rate": 4.5624711537502206e-05, "loss": 11.7304, "num_tokens": 7084251.0, "step": 992 }, { "epoch": 0.6939203354297694, "grad_norm": 0.7830459462760436, "learning_rate": 4.543489657830777e-05, "loss": 11.6188, "num_tokens": 7091370.0, "step": 993 }, { "epoch": 0.6946191474493362, "grad_norm": 0.8182215566071669, "learning_rate": 4.52453611537385e-05, "loss": 11.6924, "num_tokens": 7097992.0, "step": 994 }, { "epoch": 0.6953179594689028, "grad_norm": 0.7946888243423976, "learning_rate": 4.505610623477611e-05, "loss": 11.6498, "num_tokens": 7104219.0, "step": 995 }, { "epoch": 0.6960167714884696, "grad_norm": 0.7549468174659378, "learning_rate": 4.486713279096515e-05, "loss": 11.5893, "num_tokens": 7110986.0, "step": 996 }, { "epoch": 0.6967155835080363, "grad_norm": 0.6921174078679769, "learning_rate": 4.4678441790408335e-05, "loss": 11.7298, "num_tokens": 7118553.0, "step": 997 }, { "epoch": 0.6974143955276031, "grad_norm": 0.7613107064310906, "learning_rate": 4.449003419976133e-05, "loss": 11.4912, "num_tokens": 7125671.0, "step": 998 }, { "epoch": 0.6981132075471698, "grad_norm": 0.7419182676475446, "learning_rate": 4.430191098422795e-05, "loss": 11.4444, "num_tokens": 7132247.0, "step": 999 }, { "epoch": 0.6988120195667366, "grad_norm": 0.8116675732612285, "learning_rate": 4.411407310755513e-05, "loss": 11.7335, "num_tokens": 7138298.0, "step": 1000 }, { "epoch": 0.6995108315863033, "grad_norm": 0.7569228200974064, "learning_rate": 4.392652153202802e-05, "loss": 11.7441, "num_tokens": 7144756.0, "step": 1001 }, { "epoch": 0.70020964360587, "grad_norm": 0.7200908962994922, "learning_rate": 4.373925721846519e-05, "loss": 11.6201, "num_tokens": 7152146.0, "step": 1002 }, { "epoch": 0.7009084556254368, "grad_norm": 0.6993029376311879, "learning_rate": 4.355228112621341e-05, "loss": 11.6431, "num_tokens": 7159343.0, "step": 1003 }, { "epoch": 0.7016072676450035, "grad_norm": 0.7461449573644776, "learning_rate": 4.336559421314298e-05, "loss": 11.6424, "num_tokens": 7165846.0, "step": 1004 }, { "epoch": 0.7023060796645703, "grad_norm": 0.7485912923255708, "learning_rate": 4.317919743564278e-05, "loss": 11.7013, "num_tokens": 7173032.0, "step": 1005 }, { "epoch": 0.7030048916841369, "grad_norm": 0.7444500724716369, "learning_rate": 4.29930917486153e-05, "loss": 11.8321, "num_tokens": 7179676.0, "step": 1006 }, { "epoch": 0.7037037037037037, "grad_norm": 0.7172721105055976, "learning_rate": 4.2807278105471735e-05, "loss": 11.551, "num_tokens": 7187102.0, "step": 1007 }, { "epoch": 0.7044025157232704, "grad_norm": 0.7075647979765408, "learning_rate": 4.2621757458127285e-05, "loss": 11.6526, "num_tokens": 7194233.0, "step": 1008 }, { "epoch": 0.7051013277428372, "grad_norm": 0.7356820423823094, "learning_rate": 4.243653075699604e-05, "loss": 11.6294, "num_tokens": 7200796.0, "step": 1009 }, { "epoch": 0.705800139762404, "grad_norm": 0.7823468359047476, "learning_rate": 4.2251598950986226e-05, "loss": 11.678, "num_tokens": 7207142.0, "step": 1010 }, { "epoch": 0.7064989517819706, "grad_norm": 0.764640535949318, "learning_rate": 4.2066962987495376e-05, "loss": 11.7764, "num_tokens": 7213920.0, "step": 1011 }, { "epoch": 0.7071977638015374, "grad_norm": 0.7037309726706639, "learning_rate": 4.188262381240534e-05, "loss": 11.8543, "num_tokens": 7221271.0, "step": 1012 }, { "epoch": 0.7078965758211041, "grad_norm": 0.6676818416659133, "learning_rate": 4.169858237007772e-05, "loss": 11.6132, "num_tokens": 7229149.0, "step": 1013 }, { "epoch": 0.7085953878406709, "grad_norm": 0.7253896868171327, "learning_rate": 4.151483960334862e-05, "loss": 11.5597, "num_tokens": 7236889.0, "step": 1014 }, { "epoch": 0.7092941998602376, "grad_norm": 0.7392536296757929, "learning_rate": 4.133139645352425e-05, "loss": 11.7499, "num_tokens": 7243535.0, "step": 1015 }, { "epoch": 0.7099930118798043, "grad_norm": 0.7859312545956098, "learning_rate": 4.114825386037576e-05, "loss": 11.8108, "num_tokens": 7249939.0, "step": 1016 }, { "epoch": 0.710691823899371, "grad_norm": 0.6840777175495678, "learning_rate": 4.0965412762134556e-05, "loss": 11.6202, "num_tokens": 7257993.0, "step": 1017 }, { "epoch": 0.7113906359189378, "grad_norm": 0.7796568358822769, "learning_rate": 4.078287409548763e-05, "loss": 11.7883, "num_tokens": 7264252.0, "step": 1018 }, { "epoch": 0.7120894479385046, "grad_norm": 0.7228198060469493, "learning_rate": 4.060063879557249e-05, "loss": 11.4887, "num_tokens": 7271420.0, "step": 1019 }, { "epoch": 0.7127882599580713, "grad_norm": 0.730805955091614, "learning_rate": 4.0418707795972574e-05, "loss": 11.8562, "num_tokens": 7278894.0, "step": 1020 }, { "epoch": 0.713487071977638, "grad_norm": 0.7206633340955824, "learning_rate": 4.023708202871239e-05, "loss": 11.494, "num_tokens": 7286940.0, "step": 1021 }, { "epoch": 0.7141858839972047, "grad_norm": 0.7074645446742183, "learning_rate": 4.005576242425272e-05, "loss": 11.2935, "num_tokens": 7294745.0, "step": 1022 }, { "epoch": 0.7148846960167715, "grad_norm": 0.7320420207557593, "learning_rate": 3.9874749911485995e-05, "loss": 11.6363, "num_tokens": 7301959.0, "step": 1023 }, { "epoch": 0.7155835080363382, "grad_norm": 0.771063008702391, "learning_rate": 3.969404541773132e-05, "loss": 11.5651, "num_tokens": 7308592.0, "step": 1024 }, { "epoch": 0.716282320055905, "grad_norm": 0.7590205282163587, "learning_rate": 3.951364986872984e-05, "loss": 11.589, "num_tokens": 7315715.0, "step": 1025 }, { "epoch": 0.7169811320754716, "grad_norm": 0.7763995366489977, "learning_rate": 3.933356418864008e-05, "loss": 11.6557, "num_tokens": 7322229.0, "step": 1026 }, { "epoch": 0.7176799440950384, "grad_norm": 0.7390841816730954, "learning_rate": 3.9153789300033e-05, "loss": 11.7646, "num_tokens": 7329104.0, "step": 1027 }, { "epoch": 0.7183787561146052, "grad_norm": 0.6928548796948936, "learning_rate": 3.8974326123887515e-05, "loss": 11.4572, "num_tokens": 7336700.0, "step": 1028 }, { "epoch": 0.7190775681341719, "grad_norm": 0.7395323821851384, "learning_rate": 3.879517557958554e-05, "loss": 11.8653, "num_tokens": 7343801.0, "step": 1029 }, { "epoch": 0.7197763801537387, "grad_norm": 0.7538585339891928, "learning_rate": 3.861633858490745e-05, "loss": 11.7817, "num_tokens": 7350746.0, "step": 1030 }, { "epoch": 0.7204751921733054, "grad_norm": 0.6744251530070715, "learning_rate": 3.8437816056027296e-05, "loss": 11.4897, "num_tokens": 7358560.0, "step": 1031 }, { "epoch": 0.7211740041928721, "grad_norm": 0.7512447279022534, "learning_rate": 3.82596089075081e-05, "loss": 11.9019, "num_tokens": 7365191.0, "step": 1032 }, { "epoch": 0.7218728162124388, "grad_norm": 0.7267257258570513, "learning_rate": 3.808171805229733e-05, "loss": 11.687, "num_tokens": 7372368.0, "step": 1033 }, { "epoch": 0.7225716282320056, "grad_norm": 0.7326891371575754, "learning_rate": 3.790414440172197e-05, "loss": 11.6076, "num_tokens": 7379881.0, "step": 1034 }, { "epoch": 0.7232704402515723, "grad_norm": 0.7370905158754345, "learning_rate": 3.7726888865484e-05, "loss": 11.4954, "num_tokens": 7386744.0, "step": 1035 }, { "epoch": 0.7239692522711391, "grad_norm": 0.6431024730369418, "learning_rate": 3.754995235165579e-05, "loss": 11.4018, "num_tokens": 7394571.0, "step": 1036 }, { "epoch": 0.7246680642907058, "grad_norm": 0.6783758768249005, "learning_rate": 3.73733357666753e-05, "loss": 11.5327, "num_tokens": 7402101.0, "step": 1037 }, { "epoch": 0.7253668763102725, "grad_norm": 0.7079863175153168, "learning_rate": 3.719704001534149e-05, "loss": 11.6665, "num_tokens": 7409120.0, "step": 1038 }, { "epoch": 0.7260656883298393, "grad_norm": 0.7085916478726362, "learning_rate": 3.702106600080979e-05, "loss": 11.7719, "num_tokens": 7416318.0, "step": 1039 }, { "epoch": 0.726764500349406, "grad_norm": 0.7136983516629083, "learning_rate": 3.6845414624587326e-05, "loss": 11.6003, "num_tokens": 7423580.0, "step": 1040 }, { "epoch": 0.7274633123689728, "grad_norm": 0.6870755578878269, "learning_rate": 3.667008678652837e-05, "loss": 11.5949, "num_tokens": 7431059.0, "step": 1041 }, { "epoch": 0.7281621243885394, "grad_norm": 0.6859117308717101, "learning_rate": 3.6495083384829723e-05, "loss": 11.5083, "num_tokens": 7438722.0, "step": 1042 }, { "epoch": 0.7288609364081062, "grad_norm": 0.7632867590575223, "learning_rate": 3.6320405316026074e-05, "loss": 11.9206, "num_tokens": 7445317.0, "step": 1043 }, { "epoch": 0.7295597484276729, "grad_norm": 0.750633699481274, "learning_rate": 3.6146053474985564e-05, "loss": 11.8181, "num_tokens": 7452194.0, "step": 1044 }, { "epoch": 0.7302585604472397, "grad_norm": 0.6731058653737066, "learning_rate": 3.597202875490494e-05, "loss": 11.5177, "num_tokens": 7460052.0, "step": 1045 }, { "epoch": 0.7309573724668065, "grad_norm": 0.7296995993926372, "learning_rate": 3.579833204730525e-05, "loss": 11.66, "num_tokens": 7467357.0, "step": 1046 }, { "epoch": 0.7316561844863732, "grad_norm": 0.7563581339981545, "learning_rate": 3.562496424202707e-05, "loss": 11.6336, "num_tokens": 7474027.0, "step": 1047 }, { "epoch": 0.7323549965059399, "grad_norm": 0.7152448422103289, "learning_rate": 3.5451926227225997e-05, "loss": 11.5833, "num_tokens": 7481232.0, "step": 1048 }, { "epoch": 0.7330538085255066, "grad_norm": 0.7233352244440772, "learning_rate": 3.5279218889368225e-05, "loss": 11.6259, "num_tokens": 7487946.0, "step": 1049 }, { "epoch": 0.7337526205450734, "grad_norm": 0.7201818429672866, "learning_rate": 3.5106843113225854e-05, "loss": 11.6353, "num_tokens": 7494713.0, "step": 1050 }, { "epoch": 0.7344514325646401, "grad_norm": 0.6925680193359807, "learning_rate": 3.493479978187236e-05, "loss": 11.6771, "num_tokens": 7502002.0, "step": 1051 }, { "epoch": 0.7351502445842069, "grad_norm": 0.7114375668835355, "learning_rate": 3.4763089776678203e-05, "loss": 11.6443, "num_tokens": 7509025.0, "step": 1052 }, { "epoch": 0.7358490566037735, "grad_norm": 0.7602494623410806, "learning_rate": 3.459171397730614e-05, "loss": 11.6227, "num_tokens": 7515456.0, "step": 1053 }, { "epoch": 0.7365478686233403, "grad_norm": 0.7069387502451085, "learning_rate": 3.44206732617069e-05, "loss": 11.6711, "num_tokens": 7522493.0, "step": 1054 }, { "epoch": 0.7372466806429071, "grad_norm": 0.7502241449288016, "learning_rate": 3.424996850611455e-05, "loss": 11.4776, "num_tokens": 7529387.0, "step": 1055 }, { "epoch": 0.7379454926624738, "grad_norm": 0.6835406817242182, "learning_rate": 3.4079600585041996e-05, "loss": 11.1981, "num_tokens": 7537726.0, "step": 1056 }, { "epoch": 0.7386443046820406, "grad_norm": 0.6792540808215116, "learning_rate": 3.3909570371276654e-05, "loss": 11.7134, "num_tokens": 7545113.0, "step": 1057 }, { "epoch": 0.7393431167016072, "grad_norm": 0.773711812243068, "learning_rate": 3.3739878735875796e-05, "loss": 11.7631, "num_tokens": 7552056.0, "step": 1058 }, { "epoch": 0.740041928721174, "grad_norm": 0.6829312703518382, "learning_rate": 3.357052654816225e-05, "loss": 11.5149, "num_tokens": 7559863.0, "step": 1059 }, { "epoch": 0.7407407407407407, "grad_norm": 0.703661171294156, "learning_rate": 3.3401514675719816e-05, "loss": 11.7388, "num_tokens": 7567323.0, "step": 1060 }, { "epoch": 0.7414395527603075, "grad_norm": 0.7508429715088617, "learning_rate": 3.323284398438886e-05, "loss": 11.5236, "num_tokens": 7573959.0, "step": 1061 }, { "epoch": 0.7421383647798742, "grad_norm": 0.7168308854044466, "learning_rate": 3.306451533826194e-05, "loss": 11.8019, "num_tokens": 7580996.0, "step": 1062 }, { "epoch": 0.742837176799441, "grad_norm": 0.734464873386508, "learning_rate": 3.289652959967925e-05, "loss": 11.8128, "num_tokens": 7587525.0, "step": 1063 }, { "epoch": 0.7435359888190077, "grad_norm": 0.6971831147564971, "learning_rate": 3.272888762922442e-05, "loss": 11.7126, "num_tokens": 7595094.0, "step": 1064 }, { "epoch": 0.7442348008385744, "grad_norm": 0.7113897493331813, "learning_rate": 3.2561590285719856e-05, "loss": 11.5176, "num_tokens": 7602618.0, "step": 1065 }, { "epoch": 0.7449336128581412, "grad_norm": 0.6655971222584358, "learning_rate": 3.2394638426222467e-05, "loss": 11.4715, "num_tokens": 7610143.0, "step": 1066 }, { "epoch": 0.7456324248777079, "grad_norm": 0.6764218802393406, "learning_rate": 3.222803290601934e-05, "loss": 11.5741, "num_tokens": 7617786.0, "step": 1067 }, { "epoch": 0.7463312368972747, "grad_norm": 0.7070245163442258, "learning_rate": 3.20617745786232e-05, "loss": 11.4435, "num_tokens": 7624874.0, "step": 1068 }, { "epoch": 0.7470300489168413, "grad_norm": 0.7230608548627172, "learning_rate": 3.189586429576812e-05, "loss": 11.622, "num_tokens": 7631644.0, "step": 1069 }, { "epoch": 0.7477288609364081, "grad_norm": 0.833660300984025, "learning_rate": 3.173030290740524e-05, "loss": 11.7174, "num_tokens": 7637610.0, "step": 1070 }, { "epoch": 0.7484276729559748, "grad_norm": 0.6958777713086013, "learning_rate": 3.1565091261698245e-05, "loss": 11.4735, "num_tokens": 7644746.0, "step": 1071 }, { "epoch": 0.7491264849755416, "grad_norm": 0.7304731992070302, "learning_rate": 3.140023020501912e-05, "loss": 11.4508, "num_tokens": 7651980.0, "step": 1072 }, { "epoch": 0.7498252969951084, "grad_norm": 0.7441043526708394, "learning_rate": 3.1235720581943827e-05, "loss": 11.6389, "num_tokens": 7658633.0, "step": 1073 }, { "epoch": 0.750524109014675, "grad_norm": 0.6853622284268183, "learning_rate": 3.107156323524788e-05, "loss": 11.3994, "num_tokens": 7666388.0, "step": 1074 }, { "epoch": 0.7512229210342418, "grad_norm": 0.773787030607795, "learning_rate": 3.0907759005902224e-05, "loss": 11.6966, "num_tokens": 7673401.0, "step": 1075 }, { "epoch": 0.7519217330538085, "grad_norm": 0.7387364189450014, "learning_rate": 3.074430873306865e-05, "loss": 11.6986, "num_tokens": 7680465.0, "step": 1076 }, { "epoch": 0.7526205450733753, "grad_norm": 0.7153460725520698, "learning_rate": 3.058121325409579e-05, "loss": 11.6219, "num_tokens": 7687681.0, "step": 1077 }, { "epoch": 0.753319357092942, "grad_norm": 0.7069130545875086, "learning_rate": 3.041847340451456e-05, "loss": 11.6594, "num_tokens": 7694753.0, "step": 1078 }, { "epoch": 0.7540181691125087, "grad_norm": 0.7643919239299855, "learning_rate": 3.0256090018034046e-05, "loss": 11.6039, "num_tokens": 7700964.0, "step": 1079 }, { "epoch": 0.7547169811320755, "grad_norm": 0.7487623121730519, "learning_rate": 3.009406392653723e-05, "loss": 11.5454, "num_tokens": 7707592.0, "step": 1080 }, { "epoch": 0.7554157931516422, "grad_norm": 0.6759735347117747, "learning_rate": 2.993239596007669e-05, "loss": 11.486, "num_tokens": 7715404.0, "step": 1081 }, { "epoch": 0.756114605171209, "grad_norm": 0.688840054711059, "learning_rate": 2.9771086946870175e-05, "loss": 11.4369, "num_tokens": 7722495.0, "step": 1082 }, { "epoch": 0.7568134171907757, "grad_norm": 0.8140590429556787, "learning_rate": 2.9610137713296783e-05, "loss": 11.8346, "num_tokens": 7728379.0, "step": 1083 }, { "epoch": 0.7575122292103424, "grad_norm": 0.7365687944687868, "learning_rate": 2.9449549083892292e-05, "loss": 11.751, "num_tokens": 7735201.0, "step": 1084 }, { "epoch": 0.7582110412299091, "grad_norm": 0.8133640666754306, "learning_rate": 2.9289321881345254e-05, "loss": 11.928, "num_tokens": 7741618.0, "step": 1085 }, { "epoch": 0.7589098532494759, "grad_norm": 0.7657442876889771, "learning_rate": 2.9129456926492548e-05, "loss": 11.4964, "num_tokens": 7748854.0, "step": 1086 }, { "epoch": 0.7596086652690426, "grad_norm": 0.6907962618994248, "learning_rate": 2.8969955038315277e-05, "loss": 11.4488, "num_tokens": 7756291.0, "step": 1087 }, { "epoch": 0.7603074772886094, "grad_norm": 0.7152174671204918, "learning_rate": 2.8810817033934656e-05, "loss": 11.4593, "num_tokens": 7763508.0, "step": 1088 }, { "epoch": 0.7610062893081762, "grad_norm": 0.6975589242543092, "learning_rate": 2.8652043728607625e-05, "loss": 11.6335, "num_tokens": 7771093.0, "step": 1089 }, { "epoch": 0.7617051013277428, "grad_norm": 0.7668716962872764, "learning_rate": 2.8493635935722928e-05, "loss": 11.5716, "num_tokens": 7777723.0, "step": 1090 }, { "epoch": 0.7624039133473096, "grad_norm": 0.7100709643860577, "learning_rate": 2.8335594466796656e-05, "loss": 11.5777, "num_tokens": 7784753.0, "step": 1091 }, { "epoch": 0.7631027253668763, "grad_norm": 0.7708747787804404, "learning_rate": 2.8177920131468273e-05, "loss": 11.8262, "num_tokens": 7791905.0, "step": 1092 }, { "epoch": 0.7638015373864431, "grad_norm": 0.7096698780955707, "learning_rate": 2.8020613737496547e-05, "loss": 11.6567, "num_tokens": 7799579.0, "step": 1093 }, { "epoch": 0.7645003494060097, "grad_norm": 0.7255469439275377, "learning_rate": 2.7863676090755176e-05, "loss": 11.6891, "num_tokens": 7806386.0, "step": 1094 }, { "epoch": 0.7651991614255765, "grad_norm": 0.6472085969452891, "learning_rate": 2.770710799522879e-05, "loss": 11.4972, "num_tokens": 7814116.0, "step": 1095 }, { "epoch": 0.7658979734451432, "grad_norm": 0.7602480524843874, "learning_rate": 2.7550910253008933e-05, "loss": 11.5528, "num_tokens": 7820763.0, "step": 1096 }, { "epoch": 0.76659678546471, "grad_norm": 0.7067899715900233, "learning_rate": 2.739508366428969e-05, "loss": 11.5087, "num_tokens": 7828321.0, "step": 1097 }, { "epoch": 0.7672955974842768, "grad_norm": 0.7107532648880457, "learning_rate": 2.723962902736389e-05, "loss": 11.5416, "num_tokens": 7835798.0, "step": 1098 }, { "epoch": 0.7679944095038435, "grad_norm": 0.7031234959761055, "learning_rate": 2.7084547138618778e-05, "loss": 11.614, "num_tokens": 7842754.0, "step": 1099 }, { "epoch": 0.7686932215234102, "grad_norm": 0.7256785231783027, "learning_rate": 2.6929838792532037e-05, "loss": 11.4756, "num_tokens": 7849613.0, "step": 1100 }, { "epoch": 0.7693920335429769, "grad_norm": 0.711318035418091, "learning_rate": 2.6775504781667725e-05, "loss": 11.6399, "num_tokens": 7856848.0, "step": 1101 }, { "epoch": 0.7700908455625437, "grad_norm": 0.7037219078316764, "learning_rate": 2.6621545896672174e-05, "loss": 11.6242, "num_tokens": 7863992.0, "step": 1102 }, { "epoch": 0.7707896575821104, "grad_norm": 0.6745280912886404, "learning_rate": 2.6467962926270017e-05, "loss": 11.5943, "num_tokens": 7872390.0, "step": 1103 }, { "epoch": 0.7714884696016772, "grad_norm": 0.7213064153858593, "learning_rate": 2.6314756657260054e-05, "loss": 11.5445, "num_tokens": 7879165.0, "step": 1104 }, { "epoch": 0.7721872816212438, "grad_norm": 0.7348100469235388, "learning_rate": 2.6161927874511216e-05, "loss": 11.4994, "num_tokens": 7886459.0, "step": 1105 }, { "epoch": 0.7728860936408106, "grad_norm": 0.6935894440933058, "learning_rate": 2.6009477360958712e-05, "loss": 11.6117, "num_tokens": 7893611.0, "step": 1106 }, { "epoch": 0.7735849056603774, "grad_norm": 0.6940669280906595, "learning_rate": 2.585740589759976e-05, "loss": 11.4804, "num_tokens": 7900850.0, "step": 1107 }, { "epoch": 0.7742837176799441, "grad_norm": 0.751295238412207, "learning_rate": 2.5705714263489776e-05, "loss": 11.5352, "num_tokens": 7907796.0, "step": 1108 }, { "epoch": 0.7749825296995109, "grad_norm": 0.7391047262828542, "learning_rate": 2.555440323573839e-05, "loss": 11.708, "num_tokens": 7915024.0, "step": 1109 }, { "epoch": 0.7756813417190775, "grad_norm": 0.8103062577331461, "learning_rate": 2.540347358950529e-05, "loss": 11.4636, "num_tokens": 7921430.0, "step": 1110 }, { "epoch": 0.7763801537386443, "grad_norm": 0.7421607619269197, "learning_rate": 2.5252926097996445e-05, "loss": 11.7304, "num_tokens": 7928272.0, "step": 1111 }, { "epoch": 0.777078965758211, "grad_norm": 0.7719958048167995, "learning_rate": 2.5102761532460008e-05, "loss": 11.6926, "num_tokens": 7934944.0, "step": 1112 }, { "epoch": 0.7777777777777778, "grad_norm": 0.7401622976674673, "learning_rate": 2.4952980662182425e-05, "loss": 11.4233, "num_tokens": 7941339.0, "step": 1113 }, { "epoch": 0.7784765897973445, "grad_norm": 0.7248195083161828, "learning_rate": 2.4803584254484568e-05, "loss": 11.6312, "num_tokens": 7948332.0, "step": 1114 }, { "epoch": 0.7791754018169113, "grad_norm": 0.732259203116434, "learning_rate": 2.4654573074717602e-05, "loss": 11.412, "num_tokens": 7955689.0, "step": 1115 }, { "epoch": 0.779874213836478, "grad_norm": 0.7059818698633936, "learning_rate": 2.4505947886259318e-05, "loss": 11.742, "num_tokens": 7962958.0, "step": 1116 }, { "epoch": 0.7805730258560447, "grad_norm": 0.6751946136877488, "learning_rate": 2.435770945050997e-05, "loss": 11.3857, "num_tokens": 7970832.0, "step": 1117 }, { "epoch": 0.7812718378756115, "grad_norm": 0.7509321728923618, "learning_rate": 2.420985852688854e-05, "loss": 11.7448, "num_tokens": 7977881.0, "step": 1118 }, { "epoch": 0.7819706498951782, "grad_norm": 0.6561745904495563, "learning_rate": 2.4062395872828846e-05, "loss": 11.531, "num_tokens": 7985659.0, "step": 1119 }, { "epoch": 0.782669461914745, "grad_norm": 0.6810374482460928, "learning_rate": 2.3915322243775562e-05, "loss": 11.511, "num_tokens": 7993219.0, "step": 1120 }, { "epoch": 0.7833682739343116, "grad_norm": 0.6896502817674913, "learning_rate": 2.3768638393180407e-05, "loss": 11.5836, "num_tokens": 8000242.0, "step": 1121 }, { "epoch": 0.7840670859538784, "grad_norm": 0.7145964062311239, "learning_rate": 2.362234507249832e-05, "loss": 11.6156, "num_tokens": 8007589.0, "step": 1122 }, { "epoch": 0.7847658979734451, "grad_norm": 0.6651730755964388, "learning_rate": 2.3476443031183503e-05, "loss": 11.6697, "num_tokens": 8015113.0, "step": 1123 }, { "epoch": 0.7854647099930119, "grad_norm": 0.730162607136164, "learning_rate": 2.3330933016685754e-05, "loss": 11.6406, "num_tokens": 8021703.0, "step": 1124 }, { "epoch": 0.7861635220125787, "grad_norm": 0.706585916019649, "learning_rate": 2.318581577444646e-05, "loss": 11.5674, "num_tokens": 8028563.0, "step": 1125 }, { "epoch": 0.7868623340321453, "grad_norm": 0.7261019540757417, "learning_rate": 2.304109204789484e-05, "loss": 11.6464, "num_tokens": 8035481.0, "step": 1126 }, { "epoch": 0.7875611460517121, "grad_norm": 0.7230591047259078, "learning_rate": 2.289676257844423e-05, "loss": 11.7388, "num_tokens": 8042374.0, "step": 1127 }, { "epoch": 0.7882599580712788, "grad_norm": 0.6710460854679413, "learning_rate": 2.275282810548811e-05, "loss": 11.5265, "num_tokens": 8050149.0, "step": 1128 }, { "epoch": 0.7889587700908456, "grad_norm": 0.7303492099047564, "learning_rate": 2.2609289366396502e-05, "loss": 11.7182, "num_tokens": 8056810.0, "step": 1129 }, { "epoch": 0.7896575821104123, "grad_norm": 0.7108633933438501, "learning_rate": 2.2466147096512035e-05, "loss": 11.8337, "num_tokens": 8063712.0, "step": 1130 }, { "epoch": 0.790356394129979, "grad_norm": 0.7013257263082335, "learning_rate": 2.2323402029146244e-05, "loss": 11.5153, "num_tokens": 8070844.0, "step": 1131 }, { "epoch": 0.7910552061495457, "grad_norm": 0.6692578889529726, "learning_rate": 2.2181054895575847e-05, "loss": 11.4817, "num_tokens": 8078320.0, "step": 1132 }, { "epoch": 0.7917540181691125, "grad_norm": 0.6722679420030302, "learning_rate": 2.2039106425038924e-05, "loss": 11.7263, "num_tokens": 8085826.0, "step": 1133 }, { "epoch": 0.7924528301886793, "grad_norm": 0.7022394162218761, "learning_rate": 2.189755734473129e-05, "loss": 11.6687, "num_tokens": 8092777.0, "step": 1134 }, { "epoch": 0.793151642208246, "grad_norm": 0.8000656873886911, "learning_rate": 2.175640837980265e-05, "loss": 11.8119, "num_tokens": 8099130.0, "step": 1135 }, { "epoch": 0.7938504542278128, "grad_norm": 0.781499895574501, "learning_rate": 2.161566025335289e-05, "loss": 11.6872, "num_tokens": 8105524.0, "step": 1136 }, { "epoch": 0.7945492662473794, "grad_norm": 0.7074827094878331, "learning_rate": 2.1475313686428544e-05, "loss": 11.4998, "num_tokens": 8112500.0, "step": 1137 }, { "epoch": 0.7952480782669462, "grad_norm": 0.7135705955177107, "learning_rate": 2.133536939801888e-05, "loss": 11.6275, "num_tokens": 8119456.0, "step": 1138 }, { "epoch": 0.7959468902865129, "grad_norm": 0.7271071660736487, "learning_rate": 2.1195828105052283e-05, "loss": 11.5599, "num_tokens": 8126211.0, "step": 1139 }, { "epoch": 0.7966457023060797, "grad_norm": 0.757471440304504, "learning_rate": 2.105669052239274e-05, "loss": 11.605, "num_tokens": 8132659.0, "step": 1140 }, { "epoch": 0.7973445143256463, "grad_norm": 0.731869331936842, "learning_rate": 2.091795736283593e-05, "loss": 11.7303, "num_tokens": 8139414.0, "step": 1141 }, { "epoch": 0.7980433263452131, "grad_norm": 0.7140975321778861, "learning_rate": 2.0779629337105722e-05, "loss": 11.413, "num_tokens": 8146787.0, "step": 1142 }, { "epoch": 0.7987421383647799, "grad_norm": 0.7644539350392501, "learning_rate": 2.064170715385052e-05, "loss": 11.5113, "num_tokens": 8153506.0, "step": 1143 }, { "epoch": 0.7994409503843466, "grad_norm": 0.65418077450009, "learning_rate": 2.050419151963957e-05, "loss": 11.7101, "num_tokens": 8161093.0, "step": 1144 }, { "epoch": 0.8001397624039134, "grad_norm": 0.7413639644128542, "learning_rate": 2.0367083138959476e-05, "loss": 11.6071, "num_tokens": 8167975.0, "step": 1145 }, { "epoch": 0.80083857442348, "grad_norm": 0.7635154540705584, "learning_rate": 2.0230382714210384e-05, "loss": 11.6118, "num_tokens": 8174799.0, "step": 1146 }, { "epoch": 0.8015373864430468, "grad_norm": 0.716235149002391, "learning_rate": 2.0094090945702616e-05, "loss": 11.6827, "num_tokens": 8181864.0, "step": 1147 }, { "epoch": 0.8022361984626135, "grad_norm": 0.6800488359307103, "learning_rate": 1.9958208531652877e-05, "loss": 11.6604, "num_tokens": 8188903.0, "step": 1148 }, { "epoch": 0.8029350104821803, "grad_norm": 0.6393895545972774, "learning_rate": 1.9822736168180778e-05, "loss": 11.641, "num_tokens": 8196709.0, "step": 1149 }, { "epoch": 0.803633822501747, "grad_norm": 0.6983448761591593, "learning_rate": 1.9687674549305335e-05, "loss": 11.6795, "num_tokens": 8204133.0, "step": 1150 }, { "epoch": 0.8043326345213138, "grad_norm": 0.659945277737391, "learning_rate": 1.9553024366941242e-05, "loss": 11.5617, "num_tokens": 8211596.0, "step": 1151 }, { "epoch": 0.8050314465408805, "grad_norm": 0.7114940504611794, "learning_rate": 1.9418786310895464e-05, "loss": 11.677, "num_tokens": 8218648.0, "step": 1152 }, { "epoch": 0.8057302585604472, "grad_norm": 0.6498997833303469, "learning_rate": 1.9284961068863673e-05, "loss": 11.5807, "num_tokens": 8226878.0, "step": 1153 }, { "epoch": 0.806429070580014, "grad_norm": 0.6851026198320295, "learning_rate": 1.9151549326426656e-05, "loss": 11.6532, "num_tokens": 8234332.0, "step": 1154 }, { "epoch": 0.8071278825995807, "grad_norm": 0.7304658584255903, "learning_rate": 1.9018551767046966e-05, "loss": 11.4973, "num_tokens": 8241077.0, "step": 1155 }, { "epoch": 0.8078266946191475, "grad_norm": 0.7013604887509809, "learning_rate": 1.8885969072065225e-05, "loss": 11.421, "num_tokens": 8248516.0, "step": 1156 }, { "epoch": 0.8085255066387141, "grad_norm": 0.7053816647394219, "learning_rate": 1.8753801920696712e-05, "loss": 11.6213, "num_tokens": 8255076.0, "step": 1157 }, { "epoch": 0.8092243186582809, "grad_norm": 0.6535806078494023, "learning_rate": 1.8622050990027995e-05, "loss": 11.5572, "num_tokens": 8262629.0, "step": 1158 }, { "epoch": 0.8099231306778477, "grad_norm": 0.7139779119311859, "learning_rate": 1.8490716955013232e-05, "loss": 11.5717, "num_tokens": 8269455.0, "step": 1159 }, { "epoch": 0.8106219426974144, "grad_norm": 0.7532169883528747, "learning_rate": 1.8359800488470978e-05, "loss": 11.5791, "num_tokens": 8276214.0, "step": 1160 }, { "epoch": 0.8113207547169812, "grad_norm": 0.6616779523982886, "learning_rate": 1.8229302261080495e-05, "loss": 11.6381, "num_tokens": 8283578.0, "step": 1161 }, { "epoch": 0.8120195667365478, "grad_norm": 0.7377501630255389, "learning_rate": 1.809922294137847e-05, "loss": 11.6844, "num_tokens": 8290424.0, "step": 1162 }, { "epoch": 0.8127183787561146, "grad_norm": 0.6456460863732775, "learning_rate": 1.7969563195755535e-05, "loss": 11.4604, "num_tokens": 8298504.0, "step": 1163 }, { "epoch": 0.8134171907756813, "grad_norm": 0.7061897389733708, "learning_rate": 1.784032368845283e-05, "loss": 11.7079, "num_tokens": 8305815.0, "step": 1164 }, { "epoch": 0.8141160027952481, "grad_norm": 0.7107549537689926, "learning_rate": 1.7711505081558734e-05, "loss": 11.5782, "num_tokens": 8312752.0, "step": 1165 }, { "epoch": 0.8148148148148148, "grad_norm": 0.6668197053834488, "learning_rate": 1.758310803500527e-05, "loss": 11.6164, "num_tokens": 8320180.0, "step": 1166 }, { "epoch": 0.8155136268343816, "grad_norm": 0.7170983306456978, "learning_rate": 1.7455133206564832e-05, "loss": 11.4534, "num_tokens": 8327115.0, "step": 1167 }, { "epoch": 0.8162124388539483, "grad_norm": 0.6641025870433105, "learning_rate": 1.73275812518469e-05, "loss": 11.4965, "num_tokens": 8334667.0, "step": 1168 }, { "epoch": 0.816911250873515, "grad_norm": 0.7537451851693961, "learning_rate": 1.7200452824294498e-05, "loss": 11.6092, "num_tokens": 8340918.0, "step": 1169 }, { "epoch": 0.8176100628930818, "grad_norm": 0.721848127760393, "learning_rate": 1.707374857518094e-05, "loss": 11.6172, "num_tokens": 8347718.0, "step": 1170 }, { "epoch": 0.8183088749126485, "grad_norm": 0.692152492264296, "learning_rate": 1.6947469153606577e-05, "loss": 11.7099, "num_tokens": 8355388.0, "step": 1171 }, { "epoch": 0.8190076869322153, "grad_norm": 0.7464531132554777, "learning_rate": 1.6821615206495312e-05, "loss": 11.7093, "num_tokens": 8362353.0, "step": 1172 }, { "epoch": 0.8197064989517819, "grad_norm": 0.7371225093942753, "learning_rate": 1.6696187378591376e-05, "loss": 11.5761, "num_tokens": 8369668.0, "step": 1173 }, { "epoch": 0.8204053109713487, "grad_norm": 0.7366402368479578, "learning_rate": 1.657118631245601e-05, "loss": 11.7975, "num_tokens": 8376453.0, "step": 1174 }, { "epoch": 0.8211041229909154, "grad_norm": 0.6621056492574404, "learning_rate": 1.6446612648464164e-05, "loss": 11.537, "num_tokens": 8384073.0, "step": 1175 }, { "epoch": 0.8218029350104822, "grad_norm": 0.6531556068495009, "learning_rate": 1.632246702480128e-05, "loss": 11.3117, "num_tokens": 8391615.0, "step": 1176 }, { "epoch": 0.822501747030049, "grad_norm": 0.6820117221205548, "learning_rate": 1.619875007745989e-05, "loss": 11.5475, "num_tokens": 8399117.0, "step": 1177 }, { "epoch": 0.8232005590496156, "grad_norm": 0.7645685404468363, "learning_rate": 1.607546244023651e-05, "loss": 11.6094, "num_tokens": 8405508.0, "step": 1178 }, { "epoch": 0.8238993710691824, "grad_norm": 0.7145369409191769, "learning_rate": 1.5952604744728272e-05, "loss": 11.7182, "num_tokens": 8412474.0, "step": 1179 }, { "epoch": 0.8245981830887491, "grad_norm": 0.7674901271848729, "learning_rate": 1.5830177620329712e-05, "loss": 11.7028, "num_tokens": 8419536.0, "step": 1180 }, { "epoch": 0.8252969951083159, "grad_norm": 0.708484709151861, "learning_rate": 1.570818169422966e-05, "loss": 11.6668, "num_tokens": 8426032.0, "step": 1181 }, { "epoch": 0.8259958071278826, "grad_norm": 0.7730341374909616, "learning_rate": 1.558661759140786e-05, "loss": 11.6669, "num_tokens": 8432351.0, "step": 1182 }, { "epoch": 0.8266946191474493, "grad_norm": 0.6863726385167277, "learning_rate": 1.5465485934631853e-05, "loss": 11.4615, "num_tokens": 8439781.0, "step": 1183 }, { "epoch": 0.827393431167016, "grad_norm": 0.6979815579089426, "learning_rate": 1.5344787344453805e-05, "loss": 11.6105, "num_tokens": 8446877.0, "step": 1184 }, { "epoch": 0.8280922431865828, "grad_norm": 0.7178919451347634, "learning_rate": 1.5224522439207246e-05, "loss": 11.4421, "num_tokens": 8453354.0, "step": 1185 }, { "epoch": 0.8287910552061496, "grad_norm": 0.7224288461147278, "learning_rate": 1.5104691835004048e-05, "loss": 11.6013, "num_tokens": 8460350.0, "step": 1186 }, { "epoch": 0.8294898672257163, "grad_norm": 0.6610925527680873, "learning_rate": 1.498529614573111e-05, "loss": 11.4952, "num_tokens": 8467882.0, "step": 1187 }, { "epoch": 0.8301886792452831, "grad_norm": 0.7644397218225072, "learning_rate": 1.4866335983047264e-05, "loss": 11.5023, "num_tokens": 8474603.0, "step": 1188 }, { "epoch": 0.8308874912648497, "grad_norm": 0.710571517689817, "learning_rate": 1.4747811956380242e-05, "loss": 11.6805, "num_tokens": 8481543.0, "step": 1189 }, { "epoch": 0.8315863032844165, "grad_norm": 0.709592960585445, "learning_rate": 1.4629724672923384e-05, "loss": 11.5167, "num_tokens": 8488957.0, "step": 1190 }, { "epoch": 0.8322851153039832, "grad_norm": 0.66919107865293, "learning_rate": 1.4512074737632686e-05, "loss": 11.4268, "num_tokens": 8496305.0, "step": 1191 }, { "epoch": 0.83298392732355, "grad_norm": 0.6974868700924324, "learning_rate": 1.439486275322357e-05, "loss": 11.5845, "num_tokens": 8503384.0, "step": 1192 }, { "epoch": 0.8336827393431167, "grad_norm": 0.7031493852558214, "learning_rate": 1.4278089320167876e-05, "loss": 11.4762, "num_tokens": 8510477.0, "step": 1193 }, { "epoch": 0.8343815513626834, "grad_norm": 0.7221258353637122, "learning_rate": 1.4161755036690771e-05, "loss": 11.4896, "num_tokens": 8517882.0, "step": 1194 }, { "epoch": 0.8350803633822502, "grad_norm": 0.7166924081429917, "learning_rate": 1.4045860498767671e-05, "loss": 11.7326, "num_tokens": 8524376.0, "step": 1195 }, { "epoch": 0.8357791754018169, "grad_norm": 0.7081783346953887, "learning_rate": 1.3930406300121179e-05, "loss": 11.6734, "num_tokens": 8531630.0, "step": 1196 }, { "epoch": 0.8364779874213837, "grad_norm": 0.7202219011638747, "learning_rate": 1.3815393032218115e-05, "loss": 11.4651, "num_tokens": 8538262.0, "step": 1197 }, { "epoch": 0.8371767994409504, "grad_norm": 0.7072816743004284, "learning_rate": 1.3700821284266351e-05, "loss": 11.4478, "num_tokens": 8545378.0, "step": 1198 }, { "epoch": 0.8378756114605171, "grad_norm": 0.6675056875060252, "learning_rate": 1.3586691643211957e-05, "loss": 11.6135, "num_tokens": 8553221.0, "step": 1199 }, { "epoch": 0.8385744234800838, "grad_norm": 0.7859448407763275, "learning_rate": 1.3473004693736036e-05, "loss": 11.647, "num_tokens": 8559107.0, "step": 1200 }, { "epoch": 0.8392732354996506, "grad_norm": 0.7094999294935215, "learning_rate": 1.3359761018251826e-05, "loss": 11.4291, "num_tokens": 8565926.0, "step": 1201 }, { "epoch": 0.8399720475192173, "grad_norm": 0.692951631444227, "learning_rate": 1.324696119690173e-05, "loss": 11.7024, "num_tokens": 8573174.0, "step": 1202 }, { "epoch": 0.8406708595387841, "grad_norm": 0.7146683634720493, "learning_rate": 1.3134605807554246e-05, "loss": 11.161, "num_tokens": 8579902.0, "step": 1203 }, { "epoch": 0.8413696715583509, "grad_norm": 0.7185790382607279, "learning_rate": 1.302269542580109e-05, "loss": 11.7996, "num_tokens": 8586798.0, "step": 1204 }, { "epoch": 0.8420684835779175, "grad_norm": 0.667557438188177, "learning_rate": 1.291123062495424e-05, "loss": 11.3108, "num_tokens": 8593890.0, "step": 1205 }, { "epoch": 0.8427672955974843, "grad_norm": 0.7789703868272325, "learning_rate": 1.2800211976042941e-05, "loss": 11.7473, "num_tokens": 8600460.0, "step": 1206 }, { "epoch": 0.843466107617051, "grad_norm": 0.6996322937515739, "learning_rate": 1.268964004781089e-05, "loss": 11.4618, "num_tokens": 8607572.0, "step": 1207 }, { "epoch": 0.8441649196366178, "grad_norm": 0.6947717087779574, "learning_rate": 1.2579515406713193e-05, "loss": 11.6363, "num_tokens": 8614609.0, "step": 1208 }, { "epoch": 0.8448637316561844, "grad_norm": 0.690200182048955, "learning_rate": 1.246983861691352e-05, "loss": 11.5823, "num_tokens": 8621756.0, "step": 1209 }, { "epoch": 0.8455625436757512, "grad_norm": 0.7267694145321842, "learning_rate": 1.236061024028129e-05, "loss": 11.4435, "num_tokens": 8628716.0, "step": 1210 }, { "epoch": 0.8462613556953179, "grad_norm": 0.6483287697138146, "learning_rate": 1.2251830836388622e-05, "loss": 11.5128, "num_tokens": 8636472.0, "step": 1211 }, { "epoch": 0.8469601677148847, "grad_norm": 0.6866343015663074, "learning_rate": 1.214350096250767e-05, "loss": 11.7093, "num_tokens": 8643444.0, "step": 1212 }, { "epoch": 0.8476589797344515, "grad_norm": 0.6681580155044914, "learning_rate": 1.2035621173607581e-05, "loss": 11.5512, "num_tokens": 8650747.0, "step": 1213 }, { "epoch": 0.8483577917540182, "grad_norm": 0.7119552317792336, "learning_rate": 1.192819202235178e-05, "loss": 11.503, "num_tokens": 8657676.0, "step": 1214 }, { "epoch": 0.8490566037735849, "grad_norm": 0.6475705814393042, "learning_rate": 1.1821214059095088e-05, "loss": 11.4912, "num_tokens": 8665448.0, "step": 1215 }, { "epoch": 0.8497554157931516, "grad_norm": 0.6749727921972105, "learning_rate": 1.1714687831880865e-05, "loss": 11.6501, "num_tokens": 8672624.0, "step": 1216 }, { "epoch": 0.8504542278127184, "grad_norm": 0.6984478602069423, "learning_rate": 1.1608613886438346e-05, "loss": 11.437, "num_tokens": 8679416.0, "step": 1217 }, { "epoch": 0.8511530398322851, "grad_norm": 0.689356359799714, "learning_rate": 1.1502992766179666e-05, "loss": 11.6217, "num_tokens": 8686327.0, "step": 1218 }, { "epoch": 0.8518518518518519, "grad_norm": 0.7150930925806451, "learning_rate": 1.139782501219715e-05, "loss": 11.5312, "num_tokens": 8693164.0, "step": 1219 }, { "epoch": 0.8525506638714185, "grad_norm": 0.7085400286578597, "learning_rate": 1.1293111163260639e-05, "loss": 11.4836, "num_tokens": 8700212.0, "step": 1220 }, { "epoch": 0.8532494758909853, "grad_norm": 0.6496563699098009, "learning_rate": 1.118885175581451e-05, "loss": 11.5562, "num_tokens": 8707503.0, "step": 1221 }, { "epoch": 0.8539482879105521, "grad_norm": 0.6396886618398422, "learning_rate": 1.1085047323975173e-05, "loss": 11.4745, "num_tokens": 8715062.0, "step": 1222 }, { "epoch": 0.8546470999301188, "grad_norm": 0.6921296350375813, "learning_rate": 1.0981698399528151e-05, "loss": 11.6231, "num_tokens": 8722494.0, "step": 1223 }, { "epoch": 0.8553459119496856, "grad_norm": 0.6599166302398856, "learning_rate": 1.0878805511925438e-05, "loss": 11.537, "num_tokens": 8730292.0, "step": 1224 }, { "epoch": 0.8560447239692522, "grad_norm": 0.6992578050598631, "learning_rate": 1.0776369188282775e-05, "loss": 11.6358, "num_tokens": 8737792.0, "step": 1225 }, { "epoch": 0.856743535988819, "grad_norm": 0.662340520996625, "learning_rate": 1.0674389953376928e-05, "loss": 11.4804, "num_tokens": 8745484.0, "step": 1226 }, { "epoch": 0.8574423480083857, "grad_norm": 0.6668124181859917, "learning_rate": 1.0572868329643027e-05, "loss": 11.4285, "num_tokens": 8752746.0, "step": 1227 }, { "epoch": 0.8581411600279525, "grad_norm": 0.747299965413557, "learning_rate": 1.0471804837171916e-05, "loss": 11.7335, "num_tokens": 8759307.0, "step": 1228 }, { "epoch": 0.8588399720475192, "grad_norm": 0.664637762098333, "learning_rate": 1.0371199993707392e-05, "loss": 11.3528, "num_tokens": 8766894.0, "step": 1229 }, { "epoch": 0.859538784067086, "grad_norm": 0.7080136942960862, "learning_rate": 1.027105431464368e-05, "loss": 11.5728, "num_tokens": 8773818.0, "step": 1230 }, { "epoch": 0.8602375960866527, "grad_norm": 0.6584697619389641, "learning_rate": 1.0171368313022677e-05, "loss": 11.4522, "num_tokens": 8781325.0, "step": 1231 }, { "epoch": 0.8609364081062194, "grad_norm": 0.7066646119627334, "learning_rate": 1.0072142499531344e-05, "loss": 11.6604, "num_tokens": 8788576.0, "step": 1232 }, { "epoch": 0.8616352201257862, "grad_norm": 0.70483414399539, "learning_rate": 9.973377382499227e-06, "loss": 11.7258, "num_tokens": 8794997.0, "step": 1233 }, { "epoch": 0.8623340321453529, "grad_norm": 0.6487083119100663, "learning_rate": 9.875073467895634e-06, "loss": 11.4893, "num_tokens": 8802443.0, "step": 1234 }, { "epoch": 0.8630328441649197, "grad_norm": 0.6216707927603725, "learning_rate": 9.777231259327212e-06, "loss": 11.4552, "num_tokens": 8810645.0, "step": 1235 }, { "epoch": 0.8637316561844863, "grad_norm": 0.6838310230373638, "learning_rate": 9.679851258035277e-06, "loss": 11.4112, "num_tokens": 8818550.0, "step": 1236 }, { "epoch": 0.8644304682040531, "grad_norm": 0.6665502799252908, "learning_rate": 9.582933962893293e-06, "loss": 11.4809, "num_tokens": 8825989.0, "step": 1237 }, { "epoch": 0.8651292802236199, "grad_norm": 0.6544525789430773, "learning_rate": 9.48647987040433e-06, "loss": 11.4967, "num_tokens": 8833884.0, "step": 1238 }, { "epoch": 0.8658280922431866, "grad_norm": 0.7756422804378578, "learning_rate": 9.390489474698439e-06, "loss": 11.5953, "num_tokens": 8840327.0, "step": 1239 }, { "epoch": 0.8665269042627534, "grad_norm": 0.6885918162181854, "learning_rate": 9.294963267530176e-06, "loss": 11.6883, "num_tokens": 8847713.0, "step": 1240 }, { "epoch": 0.86722571628232, "grad_norm": 0.6614558583260526, "learning_rate": 9.19990173827615e-06, "loss": 11.3745, "num_tokens": 8855278.0, "step": 1241 }, { "epoch": 0.8679245283018868, "grad_norm": 0.7179694406243424, "learning_rate": 9.105305373932338e-06, "loss": 11.4962, "num_tokens": 8862156.0, "step": 1242 }, { "epoch": 0.8686233403214535, "grad_norm": 0.6628392505284637, "learning_rate": 9.01117465911181e-06, "loss": 11.53, "num_tokens": 8869458.0, "step": 1243 }, { "epoch": 0.8693221523410203, "grad_norm": 0.7319153659663948, "learning_rate": 8.917510076042057e-06, "loss": 11.659, "num_tokens": 8875858.0, "step": 1244 }, { "epoch": 0.870020964360587, "grad_norm": 0.7259443353311615, "learning_rate": 8.824312104562615e-06, "loss": 11.4954, "num_tokens": 8882524.0, "step": 1245 }, { "epoch": 0.8707197763801537, "grad_norm": 0.6508593743959824, "learning_rate": 8.731581222122587e-06, "loss": 11.6273, "num_tokens": 8890194.0, "step": 1246 }, { "epoch": 0.8714185883997205, "grad_norm": 0.6796451861284053, "learning_rate": 8.639317903778189e-06, "loss": 11.5158, "num_tokens": 8897718.0, "step": 1247 }, { "epoch": 0.8721174004192872, "grad_norm": 0.6850555867708901, "learning_rate": 8.547522622190385e-06, "loss": 11.3543, "num_tokens": 8904932.0, "step": 1248 }, { "epoch": 0.872816212438854, "grad_norm": 0.663900610727234, "learning_rate": 8.45619584762235e-06, "loss": 11.6675, "num_tokens": 8912001.0, "step": 1249 }, { "epoch": 0.8735150244584207, "grad_norm": 0.6824952234223884, "learning_rate": 8.365338047937121e-06, "loss": 11.3852, "num_tokens": 8919388.0, "step": 1250 }, { "epoch": 0.8742138364779874, "grad_norm": 0.7182084938264671, "learning_rate": 8.274949688595224e-06, "loss": 11.7609, "num_tokens": 8926115.0, "step": 1251 }, { "epoch": 0.8749126484975541, "grad_norm": 0.7430847007855079, "learning_rate": 8.185031232652251e-06, "loss": 11.633, "num_tokens": 8932498.0, "step": 1252 }, { "epoch": 0.8756114605171209, "grad_norm": 0.7147212955436707, "learning_rate": 8.095583140756468e-06, "loss": 11.6627, "num_tokens": 8939059.0, "step": 1253 }, { "epoch": 0.8763102725366876, "grad_norm": 0.684812308930803, "learning_rate": 8.006605871146577e-06, "loss": 11.5533, "num_tokens": 8946032.0, "step": 1254 }, { "epoch": 0.8770090845562544, "grad_norm": 0.6692098195446166, "learning_rate": 7.918099879649144e-06, "loss": 11.6309, "num_tokens": 8953182.0, "step": 1255 }, { "epoch": 0.8777078965758212, "grad_norm": 0.7329944739560608, "learning_rate": 7.830065619676518e-06, "loss": 11.4119, "num_tokens": 8959948.0, "step": 1256 }, { "epoch": 0.8784067085953878, "grad_norm": 0.6706051183750147, "learning_rate": 7.742503542224334e-06, "loss": 11.5542, "num_tokens": 8967644.0, "step": 1257 }, { "epoch": 0.8791055206149546, "grad_norm": 0.6496787024300155, "learning_rate": 7.65541409586924e-06, "loss": 11.5373, "num_tokens": 8975384.0, "step": 1258 }, { "epoch": 0.8798043326345213, "grad_norm": 0.8124874424417432, "learning_rate": 7.568797726766686e-06, "loss": 11.4112, "num_tokens": 8981552.0, "step": 1259 }, { "epoch": 0.8805031446540881, "grad_norm": 0.6599550037061661, "learning_rate": 7.482654878648465e-06, "loss": 11.6658, "num_tokens": 8989062.0, "step": 1260 }, { "epoch": 0.8812019566736548, "grad_norm": 0.6657122628630363, "learning_rate": 7.396985992820648e-06, "loss": 11.5582, "num_tokens": 8996294.0, "step": 1261 }, { "epoch": 0.8819007686932215, "grad_norm": 0.6858647720677542, "learning_rate": 7.311791508161159e-06, "loss": 11.646, "num_tokens": 9003435.0, "step": 1262 }, { "epoch": 0.8825995807127882, "grad_norm": 0.6364378593415492, "learning_rate": 7.227071861117562e-06, "loss": 11.7278, "num_tokens": 9011341.0, "step": 1263 }, { "epoch": 0.883298392732355, "grad_norm": 0.710629114263608, "learning_rate": 7.14282748570495e-06, "loss": 11.5413, "num_tokens": 9017863.0, "step": 1264 }, { "epoch": 0.8839972047519218, "grad_norm": 0.7150617897334426, "learning_rate": 7.059058813503483e-06, "loss": 11.6807, "num_tokens": 9024474.0, "step": 1265 }, { "epoch": 0.8846960167714885, "grad_norm": 0.6571356924743433, "learning_rate": 6.975766273656425e-06, "loss": 11.5295, "num_tokens": 9032345.0, "step": 1266 }, { "epoch": 0.8853948287910552, "grad_norm": 0.7275281273595156, "learning_rate": 6.892950292867784e-06, "loss": 11.6873, "num_tokens": 9038850.0, "step": 1267 }, { "epoch": 0.8860936408106219, "grad_norm": 0.6878221992679487, "learning_rate": 6.810611295400171e-06, "loss": 11.6192, "num_tokens": 9045840.0, "step": 1268 }, { "epoch": 0.8867924528301887, "grad_norm": 0.6586995132708238, "learning_rate": 6.728749703072679e-06, "loss": 11.5176, "num_tokens": 9053258.0, "step": 1269 }, { "epoch": 0.8874912648497554, "grad_norm": 0.7177747106842158, "learning_rate": 6.647365935258642e-06, "loss": 11.4239, "num_tokens": 9060449.0, "step": 1270 }, { "epoch": 0.8881900768693222, "grad_norm": 0.7151360360117833, "learning_rate": 6.56646040888349e-06, "loss": 11.6079, "num_tokens": 9067366.0, "step": 1271 }, { "epoch": 0.8888888888888888, "grad_norm": 0.6699876527810528, "learning_rate": 6.48603353842272e-06, "loss": 11.5647, "num_tokens": 9074722.0, "step": 1272 }, { "epoch": 0.8895877009084556, "grad_norm": 0.6934463748378176, "learning_rate": 6.406085735899625e-06, "loss": 11.5854, "num_tokens": 9082068.0, "step": 1273 }, { "epoch": 0.8902865129280224, "grad_norm": 0.6559017435267815, "learning_rate": 6.326617410883295e-06, "loss": 11.7271, "num_tokens": 9089269.0, "step": 1274 }, { "epoch": 0.8909853249475891, "grad_norm": 0.7167601840979251, "learning_rate": 6.247628970486463e-06, "loss": 11.3844, "num_tokens": 9096281.0, "step": 1275 }, { "epoch": 0.8916841369671559, "grad_norm": 0.6498437618428198, "learning_rate": 6.169120819363405e-06, "loss": 11.6058, "num_tokens": 9103954.0, "step": 1276 }, { "epoch": 0.8923829489867225, "grad_norm": 0.7045432270958085, "learning_rate": 6.091093359707977e-06, "loss": 11.5676, "num_tokens": 9110403.0, "step": 1277 }, { "epoch": 0.8930817610062893, "grad_norm": 0.691785790482857, "learning_rate": 6.013546991251373e-06, "loss": 11.3183, "num_tokens": 9117361.0, "step": 1278 }, { "epoch": 0.893780573025856, "grad_norm": 0.731188326419675, "learning_rate": 5.936482111260278e-06, "loss": 11.6684, "num_tokens": 9123889.0, "step": 1279 }, { "epoch": 0.8944793850454228, "grad_norm": 0.7114338834078112, "learning_rate": 5.859899114534661e-06, "loss": 11.354, "num_tokens": 9130750.0, "step": 1280 }, { "epoch": 0.8951781970649895, "grad_norm": 0.6830212183885706, "learning_rate": 5.783798393405826e-06, "loss": 11.732, "num_tokens": 9137972.0, "step": 1281 }, { "epoch": 0.8958770090845563, "grad_norm": 0.687450384519889, "learning_rate": 5.708180337734448e-06, "loss": 11.6706, "num_tokens": 9144975.0, "step": 1282 }, { "epoch": 0.896575821104123, "grad_norm": 0.6852418292497463, "learning_rate": 5.633045334908493e-06, "loss": 11.7086, "num_tokens": 9152166.0, "step": 1283 }, { "epoch": 0.8972746331236897, "grad_norm": 0.7322479114218022, "learning_rate": 5.5583937698412856e-06, "loss": 11.5725, "num_tokens": 9158905.0, "step": 1284 }, { "epoch": 0.8979734451432565, "grad_norm": 0.7282825482109372, "learning_rate": 5.4842260249694964e-06, "loss": 11.8095, "num_tokens": 9165569.0, "step": 1285 }, { "epoch": 0.8986722571628232, "grad_norm": 0.7137772828071814, "learning_rate": 5.410542480251202e-06, "loss": 11.6792, "num_tokens": 9172201.0, "step": 1286 }, { "epoch": 0.89937106918239, "grad_norm": 0.7233254306106957, "learning_rate": 5.337343513164006e-06, "loss": 11.4407, "num_tokens": 9179084.0, "step": 1287 }, { "epoch": 0.9000698812019566, "grad_norm": 0.674280395459097, "learning_rate": 5.264629498702967e-06, "loss": 11.5151, "num_tokens": 9186520.0, "step": 1288 }, { "epoch": 0.9007686932215234, "grad_norm": 0.6428729546255919, "learning_rate": 5.192400809378783e-06, "loss": 11.5095, "num_tokens": 9193927.0, "step": 1289 }, { "epoch": 0.9014675052410901, "grad_norm": 0.6726081450868195, "learning_rate": 5.120657815215879e-06, "loss": 11.5003, "num_tokens": 9201756.0, "step": 1290 }, { "epoch": 0.9021663172606569, "grad_norm": 0.6433988501725089, "learning_rate": 5.0494008837504214e-06, "loss": 11.4588, "num_tokens": 9209398.0, "step": 1291 }, { "epoch": 0.9028651292802237, "grad_norm": 0.7420404359119663, "learning_rate": 4.978630380028582e-06, "loss": 11.516, "num_tokens": 9215890.0, "step": 1292 }, { "epoch": 0.9035639412997903, "grad_norm": 0.7831591074910151, "learning_rate": 4.908346666604502e-06, "loss": 11.7173, "num_tokens": 9222461.0, "step": 1293 }, { "epoch": 0.9042627533193571, "grad_norm": 0.7071237983835428, "learning_rate": 4.8385501035385746e-06, "loss": 11.7076, "num_tokens": 9229085.0, "step": 1294 }, { "epoch": 0.9049615653389238, "grad_norm": 0.6427994454056646, "learning_rate": 4.769241048395512e-06, "loss": 11.416, "num_tokens": 9236669.0, "step": 1295 }, { "epoch": 0.9056603773584906, "grad_norm": 0.6884218952694207, "learning_rate": 4.700419856242555e-06, "loss": 11.4504, "num_tokens": 9243791.0, "step": 1296 }, { "epoch": 0.9063591893780573, "grad_norm": 0.7146280973640139, "learning_rate": 4.632086879647635e-06, "loss": 11.6455, "num_tokens": 9250468.0, "step": 1297 }, { "epoch": 0.907058001397624, "grad_norm": 0.6551040722182023, "learning_rate": 4.564242468677615e-06, "loss": 11.4883, "num_tokens": 9257850.0, "step": 1298 }, { "epoch": 0.9077568134171907, "grad_norm": 0.6502907151046965, "learning_rate": 4.496886970896396e-06, "loss": 11.5783, "num_tokens": 9265236.0, "step": 1299 }, { "epoch": 0.9084556254367575, "grad_norm": 0.7049840144986045, "learning_rate": 4.430020731363271e-06, "loss": 11.5172, "num_tokens": 9272210.0, "step": 1300 }, { "epoch": 0.9091544374563243, "grad_norm": 0.6780643846526679, "learning_rate": 4.3636440926310144e-06, "loss": 11.8368, "num_tokens": 9279337.0, "step": 1301 }, { "epoch": 0.909853249475891, "grad_norm": 0.7179333140546513, "learning_rate": 4.2977573947442175e-06, "loss": 11.5682, "num_tokens": 9286092.0, "step": 1302 }, { "epoch": 0.9105520614954578, "grad_norm": 0.7014760945022329, "learning_rate": 4.232360975237571e-06, "loss": 11.5352, "num_tokens": 9292888.0, "step": 1303 }, { "epoch": 0.9112508735150244, "grad_norm": 0.6757947711055604, "learning_rate": 4.167455169134027e-06, "loss": 11.6058, "num_tokens": 9299710.0, "step": 1304 }, { "epoch": 0.9119496855345912, "grad_norm": 0.7019157456554327, "learning_rate": 4.103040308943195e-06, "loss": 11.6906, "num_tokens": 9306349.0, "step": 1305 }, { "epoch": 0.9126484975541579, "grad_norm": 0.6500720977440768, "learning_rate": 4.039116724659564e-06, "loss": 11.6691, "num_tokens": 9313892.0, "step": 1306 }, { "epoch": 0.9133473095737247, "grad_norm": 0.6984094359716958, "learning_rate": 3.975684743760832e-06, "loss": 11.6063, "num_tokens": 9320784.0, "step": 1307 }, { "epoch": 0.9140461215932913, "grad_norm": 0.626177459402382, "learning_rate": 3.91274469120626e-06, "loss": 11.3268, "num_tokens": 9328759.0, "step": 1308 }, { "epoch": 0.9147449336128581, "grad_norm": 0.6866335858432335, "learning_rate": 3.850296889434968e-06, "loss": 11.9733, "num_tokens": 9335584.0, "step": 1309 }, { "epoch": 0.9154437456324249, "grad_norm": 0.61356922536621, "learning_rate": 3.788341658364314e-06, "loss": 11.5132, "num_tokens": 9343277.0, "step": 1310 }, { "epoch": 0.9161425576519916, "grad_norm": 0.6604269460426199, "learning_rate": 3.726879315388199e-06, "loss": 11.714, "num_tokens": 9350951.0, "step": 1311 }, { "epoch": 0.9168413696715584, "grad_norm": 0.7222165402296626, "learning_rate": 3.665910175375498e-06, "loss": 11.5123, "num_tokens": 9357659.0, "step": 1312 }, { "epoch": 0.9175401816911251, "grad_norm": 0.6523514767911169, "learning_rate": 3.6054345506684627e-06, "loss": 11.5572, "num_tokens": 9365388.0, "step": 1313 }, { "epoch": 0.9182389937106918, "grad_norm": 0.6545144502149857, "learning_rate": 3.5454527510810352e-06, "loss": 11.4776, "num_tokens": 9372586.0, "step": 1314 }, { "epoch": 0.9189378057302585, "grad_norm": 0.6412598756459515, "learning_rate": 3.485965083897347e-06, "loss": 11.4683, "num_tokens": 9380241.0, "step": 1315 }, { "epoch": 0.9196366177498253, "grad_norm": 0.7111845448756704, "learning_rate": 3.426971853870109e-06, "loss": 11.7242, "num_tokens": 9387292.0, "step": 1316 }, { "epoch": 0.9203354297693921, "grad_norm": 0.7099462928666012, "learning_rate": 3.3684733632190157e-06, "loss": 11.4538, "num_tokens": 9394171.0, "step": 1317 }, { "epoch": 0.9210342417889588, "grad_norm": 0.6526692218044201, "learning_rate": 3.310469911629288e-06, "loss": 11.5081, "num_tokens": 9401400.0, "step": 1318 }, { "epoch": 0.9217330538085255, "grad_norm": 0.7069103876866893, "learning_rate": 3.252961796250054e-06, "loss": 11.4752, "num_tokens": 9408486.0, "step": 1319 }, { "epoch": 0.9224318658280922, "grad_norm": 0.6889792496785059, "learning_rate": 3.1959493116928476e-06, "loss": 11.7564, "num_tokens": 9415285.0, "step": 1320 }, { "epoch": 0.923130677847659, "grad_norm": 0.7002775125728018, "learning_rate": 3.1394327500301357e-06, "loss": 11.6422, "num_tokens": 9422270.0, "step": 1321 }, { "epoch": 0.9238294898672257, "grad_norm": 0.6264289816602666, "learning_rate": 3.0834124007937614e-06, "loss": 11.6305, "num_tokens": 9430223.0, "step": 1322 }, { "epoch": 0.9245283018867925, "grad_norm": 0.6590788666191844, "learning_rate": 3.0278885509735234e-06, "loss": 11.5625, "num_tokens": 9437454.0, "step": 1323 }, { "epoch": 0.9252271139063591, "grad_norm": 0.6614466147979401, "learning_rate": 2.9728614850156653e-06, "loss": 11.3846, "num_tokens": 9444734.0, "step": 1324 }, { "epoch": 0.9259259259259259, "grad_norm": 0.7243775214182312, "learning_rate": 2.9183314848214127e-06, "loss": 11.6559, "num_tokens": 9451233.0, "step": 1325 }, { "epoch": 0.9266247379454927, "grad_norm": 0.6762857814271098, "learning_rate": 2.864298829745571e-06, "loss": 11.3847, "num_tokens": 9458292.0, "step": 1326 }, { "epoch": 0.9273235499650594, "grad_norm": 0.6402745881914114, "learning_rate": 2.8107637965950506e-06, "loss": 11.4931, "num_tokens": 9465760.0, "step": 1327 }, { "epoch": 0.9280223619846262, "grad_norm": 0.7081679785796786, "learning_rate": 2.7577266596274576e-06, "loss": 11.5325, "num_tokens": 9473090.0, "step": 1328 }, { "epoch": 0.9287211740041929, "grad_norm": 0.6762077375589634, "learning_rate": 2.7051876905497375e-06, "loss": 11.6349, "num_tokens": 9480091.0, "step": 1329 }, { "epoch": 0.9294199860237596, "grad_norm": 0.6469009183554926, "learning_rate": 2.6531471585167e-06, "loss": 11.6433, "num_tokens": 9487848.0, "step": 1330 }, { "epoch": 0.9301187980433263, "grad_norm": 0.657883502355354, "learning_rate": 2.6016053301297196e-06, "loss": 11.4191, "num_tokens": 9495032.0, "step": 1331 }, { "epoch": 0.9308176100628931, "grad_norm": 0.6166537222912691, "learning_rate": 2.5505624694353024e-06, "loss": 11.4717, "num_tokens": 9502796.0, "step": 1332 }, { "epoch": 0.9315164220824598, "grad_norm": 0.7211875437218437, "learning_rate": 2.5000188379237786e-06, "loss": 11.5148, "num_tokens": 9509344.0, "step": 1333 }, { "epoch": 0.9322152341020266, "grad_norm": 0.6677353668427028, "learning_rate": 2.4499746945279566e-06, "loss": 11.5097, "num_tokens": 9516077.0, "step": 1334 }, { "epoch": 0.9329140461215933, "grad_norm": 0.6567523061369096, "learning_rate": 2.4004302956217804e-06, "loss": 11.5162, "num_tokens": 9523359.0, "step": 1335 }, { "epoch": 0.93361285814116, "grad_norm": 0.708453970623699, "learning_rate": 2.3513858950190204e-06, "loss": 11.5558, "num_tokens": 9530186.0, "step": 1336 }, { "epoch": 0.9343116701607268, "grad_norm": 0.6988749660792845, "learning_rate": 2.302841743971995e-06, "loss": 11.3291, "num_tokens": 9537136.0, "step": 1337 }, { "epoch": 0.9350104821802935, "grad_norm": 0.6776722128963621, "learning_rate": 2.2547980911702404e-06, "loss": 11.6395, "num_tokens": 9544223.0, "step": 1338 }, { "epoch": 0.9357092941998603, "grad_norm": 0.7115389242403719, "learning_rate": 2.2072551827392983e-06, "loss": 11.6698, "num_tokens": 9550920.0, "step": 1339 }, { "epoch": 0.9364081062194269, "grad_norm": 0.6466374296903232, "learning_rate": 2.1602132622393746e-06, "loss": 11.532, "num_tokens": 9558586.0, "step": 1340 }, { "epoch": 0.9371069182389937, "grad_norm": 0.6524509216771044, "learning_rate": 2.1136725706641712e-06, "loss": 11.7108, "num_tokens": 9566214.0, "step": 1341 }, { "epoch": 0.9378057302585604, "grad_norm": 0.6840710486627725, "learning_rate": 2.0676333464396126e-06, "loss": 11.5571, "num_tokens": 9573631.0, "step": 1342 }, { "epoch": 0.9385045422781272, "grad_norm": 0.6908673649722832, "learning_rate": 2.0220958254225984e-06, "loss": 11.6546, "num_tokens": 9580814.0, "step": 1343 }, { "epoch": 0.939203354297694, "grad_norm": 0.7288555180137898, "learning_rate": 1.977060240899864e-06, "loss": 11.5905, "num_tokens": 9587261.0, "step": 1344 }, { "epoch": 0.9399021663172606, "grad_norm": 0.7220132561388931, "learning_rate": 1.932526823586722e-06, "loss": 11.7036, "num_tokens": 9593973.0, "step": 1345 }, { "epoch": 0.9406009783368274, "grad_norm": 0.6198864368807762, "learning_rate": 1.8884958016259113e-06, "loss": 11.5988, "num_tokens": 9601821.0, "step": 1346 }, { "epoch": 0.9412997903563941, "grad_norm": 0.6436118875785196, "learning_rate": 1.844967400586428e-06, "loss": 11.5774, "num_tokens": 9609343.0, "step": 1347 }, { "epoch": 0.9419986023759609, "grad_norm": 0.6336553658975224, "learning_rate": 1.8019418434623404e-06, "loss": 11.5016, "num_tokens": 9616602.0, "step": 1348 }, { "epoch": 0.9426974143955276, "grad_norm": 0.7075508611037244, "learning_rate": 1.7594193506716983e-06, "loss": 11.5928, "num_tokens": 9623622.0, "step": 1349 }, { "epoch": 0.9433962264150944, "grad_norm": 0.6735353557128455, "learning_rate": 1.7174001400553586e-06, "loss": 11.6306, "num_tokens": 9631105.0, "step": 1350 }, { "epoch": 0.944095038434661, "grad_norm": 0.6532951400716054, "learning_rate": 1.6758844268758843e-06, "loss": 11.7102, "num_tokens": 9638663.0, "step": 1351 }, { "epoch": 0.9447938504542278, "grad_norm": 0.6033206298760382, "learning_rate": 1.634872423816458e-06, "loss": 11.3447, "num_tokens": 9646957.0, "step": 1352 }, { "epoch": 0.9454926624737946, "grad_norm": 0.6789421018738463, "learning_rate": 1.5943643409797594e-06, "loss": 11.4869, "num_tokens": 9653670.0, "step": 1353 }, { "epoch": 0.9461914744933613, "grad_norm": 0.6488503290113109, "learning_rate": 1.5543603858869215e-06, "loss": 11.5039, "num_tokens": 9660888.0, "step": 1354 }, { "epoch": 0.9468902865129281, "grad_norm": 0.6679560200765636, "learning_rate": 1.5148607634764446e-06, "loss": 11.5494, "num_tokens": 9668331.0, "step": 1355 }, { "epoch": 0.9475890985324947, "grad_norm": 0.6993067195845916, "learning_rate": 1.475865676103161e-06, "loss": 11.5813, "num_tokens": 9675259.0, "step": 1356 }, { "epoch": 0.9482879105520615, "grad_norm": 0.6595111575445176, "learning_rate": 1.4373753235371823e-06, "loss": 11.2185, "num_tokens": 9682674.0, "step": 1357 }, { "epoch": 0.9489867225716282, "grad_norm": 0.7788936697274639, "learning_rate": 1.3993899029628997e-06, "loss": 11.6757, "num_tokens": 9689182.0, "step": 1358 }, { "epoch": 0.949685534591195, "grad_norm": 0.6643129196285505, "learning_rate": 1.3619096089779293e-06, "loss": 11.7565, "num_tokens": 9696437.0, "step": 1359 }, { "epoch": 0.9503843466107617, "grad_norm": 0.7004411450914478, "learning_rate": 1.3249346335922007e-06, "loss": 11.7835, "num_tokens": 9703396.0, "step": 1360 }, { "epoch": 0.9510831586303284, "grad_norm": 0.670408766200783, "learning_rate": 1.2884651662268709e-06, "loss": 11.6038, "num_tokens": 9711059.0, "step": 1361 }, { "epoch": 0.9517819706498952, "grad_norm": 0.6844685491462128, "learning_rate": 1.2525013937134122e-06, "loss": 11.5823, "num_tokens": 9718400.0, "step": 1362 }, { "epoch": 0.9524807826694619, "grad_norm": 0.6595213178977267, "learning_rate": 1.2170435002926694e-06, "loss": 11.4552, "num_tokens": 9725645.0, "step": 1363 }, { "epoch": 0.9531795946890287, "grad_norm": 0.6789665079944774, "learning_rate": 1.1820916676138382e-06, "loss": 11.4484, "num_tokens": 9733005.0, "step": 1364 }, { "epoch": 0.9538784067085954, "grad_norm": 0.6607462877401795, "learning_rate": 1.147646074733655e-06, "loss": 11.5541, "num_tokens": 9740218.0, "step": 1365 }, { "epoch": 0.9545772187281621, "grad_norm": 0.7142120338643752, "learning_rate": 1.1137068981153632e-06, "loss": 11.8216, "num_tokens": 9746821.0, "step": 1366 }, { "epoch": 0.9552760307477288, "grad_norm": 0.7459534452479601, "learning_rate": 1.0802743116278714e-06, "loss": 11.5283, "num_tokens": 9753374.0, "step": 1367 }, { "epoch": 0.9559748427672956, "grad_norm": 0.7015457824024218, "learning_rate": 1.0473484865448525e-06, "loss": 11.6701, "num_tokens": 9760505.0, "step": 1368 }, { "epoch": 0.9566736547868623, "grad_norm": 0.6560790041556968, "learning_rate": 1.014929591543845e-06, "loss": 11.6314, "num_tokens": 9767972.0, "step": 1369 }, { "epoch": 0.9573724668064291, "grad_norm": 0.7033378425039997, "learning_rate": 9.830177927054428e-07, "loss": 11.5456, "num_tokens": 9774744.0, "step": 1370 }, { "epoch": 0.9580712788259959, "grad_norm": 0.6798168243800103, "learning_rate": 9.516132535123846e-07, "loss": 11.2908, "num_tokens": 9782050.0, "step": 1371 }, { "epoch": 0.9587700908455625, "grad_norm": 0.6486824565620933, "learning_rate": 9.207161348487315e-07, "loss": 11.5991, "num_tokens": 9789435.0, "step": 1372 }, { "epoch": 0.9594689028651293, "grad_norm": 0.6277062623866081, "learning_rate": 8.903265949990691e-07, "loss": 11.4873, "num_tokens": 9797498.0, "step": 1373 }, { "epoch": 0.960167714884696, "grad_norm": 0.6779506091362029, "learning_rate": 8.604447896476852e-07, "loss": 11.4729, "num_tokens": 9805014.0, "step": 1374 }, { "epoch": 0.9608665269042628, "grad_norm": 0.6345571679344191, "learning_rate": 8.310708718777371e-07, "loss": 11.314, "num_tokens": 9812890.0, "step": 1375 }, { "epoch": 0.9615653389238294, "grad_norm": 0.6829253388826183, "learning_rate": 8.022049921705299e-07, "loss": 11.4405, "num_tokens": 9819758.0, "step": 1376 }, { "epoch": 0.9622641509433962, "grad_norm": 0.659631396208915, "learning_rate": 7.73847298404684e-07, "loss": 11.4128, "num_tokens": 9826786.0, "step": 1377 }, { "epoch": 0.9629629629629629, "grad_norm": 0.6407730482855021, "learning_rate": 7.459979358554248e-07, "loss": 11.565, "num_tokens": 9834179.0, "step": 1378 }, { "epoch": 0.9636617749825297, "grad_norm": 0.6521536841219591, "learning_rate": 7.186570471937937e-07, "loss": 11.7037, "num_tokens": 9841706.0, "step": 1379 }, { "epoch": 0.9643605870020965, "grad_norm": 0.6407279577165172, "learning_rate": 6.918247724859939e-07, "loss": 11.3862, "num_tokens": 9849180.0, "step": 1380 }, { "epoch": 0.9650593990216632, "grad_norm": 0.6564743299030316, "learning_rate": 6.655012491925683e-07, "loss": 11.6232, "num_tokens": 9856546.0, "step": 1381 }, { "epoch": 0.9657582110412299, "grad_norm": 0.8390466688972114, "learning_rate": 6.396866121677559e-07, "loss": 11.7295, "num_tokens": 9863096.0, "step": 1382 }, { "epoch": 0.9664570230607966, "grad_norm": 0.6392241129240702, "learning_rate": 6.143809936588363e-07, "loss": 11.4262, "num_tokens": 9870255.0, "step": 1383 }, { "epoch": 0.9671558350803634, "grad_norm": 0.609684179142815, "learning_rate": 5.895845233053643e-07, "loss": 11.5644, "num_tokens": 9878236.0, "step": 1384 }, { "epoch": 0.9678546470999301, "grad_norm": 0.6981982820151982, "learning_rate": 5.652973281385588e-07, "loss": 11.6708, "num_tokens": 9885146.0, "step": 1385 }, { "epoch": 0.9685534591194969, "grad_norm": 0.6785618783351172, "learning_rate": 5.415195325806699e-07, "loss": 11.5522, "num_tokens": 9892008.0, "step": 1386 }, { "epoch": 0.9692522711390635, "grad_norm": 0.6523225408549266, "learning_rate": 5.182512584443022e-07, "loss": 11.6692, "num_tokens": 9899023.0, "step": 1387 }, { "epoch": 0.9699510831586303, "grad_norm": 0.6459246784791243, "learning_rate": 4.954926249317815e-07, "loss": 11.363, "num_tokens": 9906287.0, "step": 1388 }, { "epoch": 0.9706498951781971, "grad_norm": 0.6632139942263436, "learning_rate": 4.732437486345886e-07, "loss": 11.2905, "num_tokens": 9913471.0, "step": 1389 }, { "epoch": 0.9713487071977638, "grad_norm": 0.6526848721218659, "learning_rate": 4.515047435327491e-07, "loss": 11.6156, "num_tokens": 9920776.0, "step": 1390 }, { "epoch": 0.9720475192173306, "grad_norm": 0.6772714564607265, "learning_rate": 4.3027572099422207e-07, "loss": 11.5893, "num_tokens": 9927914.0, "step": 1391 }, { "epoch": 0.9727463312368972, "grad_norm": 0.7303242171313036, "learning_rate": 4.0955678977436797e-07, "loss": 11.6023, "num_tokens": 9934401.0, "step": 1392 }, { "epoch": 0.973445143256464, "grad_norm": 0.6934874178347061, "learning_rate": 3.893480560153484e-07, "loss": 11.4115, "num_tokens": 9941705.0, "step": 1393 }, { "epoch": 0.9741439552760307, "grad_norm": 0.6461084160572809, "learning_rate": 3.6964962324561593e-07, "loss": 11.5293, "num_tokens": 9949413.0, "step": 1394 }, { "epoch": 0.9748427672955975, "grad_norm": 0.6736414342784565, "learning_rate": 3.504615923793919e-07, "loss": 11.3817, "num_tokens": 9956435.0, "step": 1395 }, { "epoch": 0.9755415793151643, "grad_norm": 0.701011829428076, "learning_rate": 3.317840617160894e-07, "loss": 11.598, "num_tokens": 9963267.0, "step": 1396 }, { "epoch": 0.976240391334731, "grad_norm": 0.681775890484585, "learning_rate": 3.136171269399024e-07, "loss": 11.2813, "num_tokens": 9970795.0, "step": 1397 }, { "epoch": 0.9769392033542977, "grad_norm": 0.7029694063122361, "learning_rate": 2.959608811192283e-07, "loss": 11.5288, "num_tokens": 9977599.0, "step": 1398 }, { "epoch": 0.9776380153738644, "grad_norm": 0.6397069793309575, "learning_rate": 2.7881541470623494e-07, "loss": 11.5876, "num_tokens": 9985126.0, "step": 1399 }, { "epoch": 0.9783368273934312, "grad_norm": 0.6884389472291744, "learning_rate": 2.6218081553638364e-07, "loss": 11.3874, "num_tokens": 9991636.0, "step": 1400 }, { "epoch": 0.9790356394129979, "grad_norm": 0.652796733566214, "learning_rate": 2.4605716882801776e-07, "loss": 11.5593, "num_tokens": 9999333.0, "step": 1401 }, { "epoch": 0.9797344514325647, "grad_norm": 0.7054950985878522, "learning_rate": 2.3044455718185253e-07, "loss": 11.577, "num_tokens": 10006127.0, "step": 1402 }, { "epoch": 0.9804332634521313, "grad_norm": 0.7053254709063593, "learning_rate": 2.153430605806195e-07, "loss": 11.5192, "num_tokens": 10012832.0, "step": 1403 }, { "epoch": 0.9811320754716981, "grad_norm": 0.6350916351726831, "learning_rate": 2.0075275638862246e-07, "loss": 11.2378, "num_tokens": 10020535.0, "step": 1404 }, { "epoch": 0.9818308874912649, "grad_norm": 0.6690972398564283, "learning_rate": 1.8667371935133792e-07, "loss": 11.7258, "num_tokens": 10027624.0, "step": 1405 }, { "epoch": 0.9825296995108316, "grad_norm": 0.6592830981787241, "learning_rate": 1.7310602159505974e-07, "loss": 11.553, "num_tokens": 10034683.0, "step": 1406 }, { "epoch": 0.9832285115303984, "grad_norm": 0.6688640725405685, "learning_rate": 1.6004973262651047e-07, "loss": 11.6536, "num_tokens": 10042053.0, "step": 1407 }, { "epoch": 0.983927323549965, "grad_norm": 0.6527104327096959, "learning_rate": 1.4750491933247512e-07, "loss": 11.7204, "num_tokens": 10049229.0, "step": 1408 }, { "epoch": 0.9846261355695318, "grad_norm": 0.7319590410040872, "learning_rate": 1.3547164597949026e-07, "loss": 11.8563, "num_tokens": 10055821.0, "step": 1409 }, { "epoch": 0.9853249475890985, "grad_norm": 0.5893377672732698, "learning_rate": 1.2394997421347753e-07, "loss": 11.5842, "num_tokens": 10064206.0, "step": 1410 }, { "epoch": 0.9860237596086653, "grad_norm": 0.7685477405141135, "learning_rate": 1.1293996305946631e-07, "loss": 11.5083, "num_tokens": 10070918.0, "step": 1411 }, { "epoch": 0.986722571628232, "grad_norm": 0.6450315989039693, "learning_rate": 1.0244166892124928e-07, "loss": 11.5581, "num_tokens": 10078257.0, "step": 1412 }, { "epoch": 0.9874213836477987, "grad_norm": 0.6761611605290332, "learning_rate": 9.245514558112733e-08, "loss": 11.5999, "num_tokens": 10085142.0, "step": 1413 }, { "epoch": 0.9881201956673655, "grad_norm": 0.7301424243829707, "learning_rate": 8.298044419962069e-08, "loss": 11.5774, "num_tokens": 10091532.0, "step": 1414 }, { "epoch": 0.9888190076869322, "grad_norm": 0.6810160455763407, "learning_rate": 7.401761331521372e-08, "loss": 11.5121, "num_tokens": 10098630.0, "step": 1415 }, { "epoch": 0.989517819706499, "grad_norm": 0.6946045633198202, "learning_rate": 6.556669884408839e-08, "loss": 11.5327, "num_tokens": 10105381.0, "step": 1416 }, { "epoch": 0.9902166317260657, "grad_norm": 0.6767554553393855, "learning_rate": 5.7627744079902235e-08, "loss": 11.5641, "num_tokens": 10112328.0, "step": 1417 }, { "epoch": 0.9909154437456325, "grad_norm": 0.7223008729413601, "learning_rate": 5.0200789693588544e-08, "loss": 11.6189, "num_tokens": 10118964.0, "step": 1418 }, { "epoch": 0.9916142557651991, "grad_norm": 0.6661158640448771, "learning_rate": 4.32858737330899e-08, "loss": 11.39, "num_tokens": 10125874.0, "step": 1419 }, { "epoch": 0.9923130677847659, "grad_norm": 0.7615114163388279, "learning_rate": 3.6883031623224926e-08, "loss": 11.8602, "num_tokens": 10132274.0, "step": 1420 }, { "epoch": 0.9930118798043326, "grad_norm": 0.7254778236725256, "learning_rate": 3.099229616547739e-08, "loss": 11.6168, "num_tokens": 10138845.0, "step": 1421 }, { "epoch": 0.9937106918238994, "grad_norm": 0.6847796254122286, "learning_rate": 2.5613697537818504e-08, "loss": 11.6849, "num_tokens": 10145801.0, "step": 1422 }, { "epoch": 0.9944095038434662, "grad_norm": 0.6968642739944435, "learning_rate": 2.074726329457377e-08, "loss": 11.5605, "num_tokens": 10152749.0, "step": 1423 }, { "epoch": 0.9951083158630328, "grad_norm": 0.6498028004501105, "learning_rate": 1.6393018366278602e-08, "loss": 11.5241, "num_tokens": 10160312.0, "step": 1424 }, { "epoch": 0.9958071278825996, "grad_norm": 0.691794824675225, "learning_rate": 1.2550985059522902e-08, "loss": 11.5186, "num_tokens": 10167099.0, "step": 1425 }, { "epoch": 0.9965059399021663, "grad_norm": 0.6808541350424336, "learning_rate": 9.221183056895566e-09, "loss": 11.468, "num_tokens": 10174489.0, "step": 1426 }, { "epoch": 0.9972047519217331, "grad_norm": 0.6518646820012849, "learning_rate": 6.4036294168068335e-09, "loss": 11.6372, "num_tokens": 10182102.0, "step": 1427 }, { "epoch": 0.9979035639412998, "grad_norm": 0.6820092282899476, "learning_rate": 4.0983385734660875e-09, "loss": 11.4125, "num_tokens": 10188883.0, "step": 1428 }, { "epoch": 0.9986023759608665, "grad_norm": 0.6891039074476841, "learning_rate": 2.305322336781934e-09, "loss": 11.593, "num_tokens": 10195672.0, "step": 1429 }, { "epoch": 0.9993011879804332, "grad_norm": 0.6506047130474067, "learning_rate": 1.0245898922844888e-09, "loss": 11.7408, "num_tokens": 10203012.0, "step": 1430 }, { "epoch": 1.0, "grad_norm": 0.6706512392208532, "learning_rate": 2.561478011253726e-10, "loss": 11.5274, "num_tokens": 10210317.0, "step": 1431 }, { "epoch": 1.0, "step": 1431, "total_flos": 638292458766336.0, "train_loss": 12.375564571863118, "train_runtime": 11888.1182, "train_samples_per_second": 7.706, "train_steps_per_second": 0.12 } ], "logging_steps": 1.0, "max_steps": 1431, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 638292458766336.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }