{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1286, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015552099533437014, "grad_norm": 2.850832163686265, "learning_rate": 0.0, "loss": 0.864, "num_tokens": 98304.0, "step": 1 }, { "epoch": 0.003110419906687403, "grad_norm": 2.621327414478985, "learning_rate": 1.0256410256410257e-06, "loss": 0.8205, "num_tokens": 196608.0, "step": 2 }, { "epoch": 0.004665629860031105, "grad_norm": 2.7922266839076704, "learning_rate": 2.0512820512820513e-06, "loss": 0.8452, "num_tokens": 294912.0, "step": 3 }, { "epoch": 0.006220839813374806, "grad_norm": 2.679559226785009, "learning_rate": 3.0769230769230774e-06, "loss": 0.8349, "num_tokens": 393216.0, "step": 4 }, { "epoch": 0.007776049766718507, "grad_norm": 2.460740190055204, "learning_rate": 4.102564102564103e-06, "loss": 0.8078, "num_tokens": 491520.0, "step": 5 }, { "epoch": 0.00933125972006221, "grad_norm": 2.2188142112474214, "learning_rate": 5.128205128205128e-06, "loss": 0.7565, "num_tokens": 589824.0, "step": 6 }, { "epoch": 0.01088646967340591, "grad_norm": 1.9397844978440306, "learning_rate": 6.153846153846155e-06, "loss": 0.6494, "num_tokens": 688128.0, "step": 7 }, { "epoch": 0.012441679626749611, "grad_norm": 2.388897331006519, "learning_rate": 7.17948717948718e-06, "loss": 0.6162, "num_tokens": 786432.0, "step": 8 }, { "epoch": 0.013996889580093312, "grad_norm": 1.8521202177878355, "learning_rate": 8.205128205128205e-06, "loss": 0.5603, "num_tokens": 884736.0, "step": 9 }, { "epoch": 0.015552099533437015, "grad_norm": 1.5922916497254584, "learning_rate": 9.230769230769232e-06, "loss": 0.4987, "num_tokens": 983040.0, "step": 10 }, { "epoch": 0.017107309486780714, "grad_norm": 1.175368918787764, "learning_rate": 1.0256410256410256e-05, "loss": 0.4578, "num_tokens": 1081344.0, "step": 11 }, { "epoch": 0.01866251944012442, "grad_norm": 0.9003615317542575, "learning_rate": 1.1282051282051283e-05, "loss": 0.4003, "num_tokens": 1179648.0, "step": 12 }, { "epoch": 0.02021772939346812, "grad_norm": 0.8939831531786895, "learning_rate": 1.230769230769231e-05, "loss": 0.3844, "num_tokens": 1277952.0, "step": 13 }, { "epoch": 0.02177293934681182, "grad_norm": 0.8630847695300882, "learning_rate": 1.3333333333333333e-05, "loss": 0.3811, "num_tokens": 1376256.0, "step": 14 }, { "epoch": 0.02332814930015552, "grad_norm": 0.624666853918488, "learning_rate": 1.435897435897436e-05, "loss": 0.3601, "num_tokens": 1474560.0, "step": 15 }, { "epoch": 0.024883359253499222, "grad_norm": 0.6167023091301801, "learning_rate": 1.5384615384615387e-05, "loss": 0.3387, "num_tokens": 1572864.0, "step": 16 }, { "epoch": 0.026438569206842923, "grad_norm": 0.5559104458420491, "learning_rate": 1.641025641025641e-05, "loss": 0.3231, "num_tokens": 1671168.0, "step": 17 }, { "epoch": 0.027993779160186624, "grad_norm": 0.5021888670944655, "learning_rate": 1.7435897435897438e-05, "loss": 0.3128, "num_tokens": 1769472.0, "step": 18 }, { "epoch": 0.029548989113530325, "grad_norm": 0.718029843237883, "learning_rate": 1.8461538461538465e-05, "loss": 0.3163, "num_tokens": 1867776.0, "step": 19 }, { "epoch": 0.03110419906687403, "grad_norm": 0.49288713599926304, "learning_rate": 1.9487179487179488e-05, "loss": 0.3098, "num_tokens": 1966080.0, "step": 20 }, { "epoch": 0.03265940902021773, "grad_norm": 0.4452166440260948, "learning_rate": 2.0512820512820512e-05, "loss": 0.3026, "num_tokens": 2064384.0, "step": 21 }, { "epoch": 0.03421461897356143, "grad_norm": 0.4626132540505872, "learning_rate": 2.153846153846154e-05, "loss": 0.306, "num_tokens": 2162688.0, "step": 22 }, { "epoch": 0.03576982892690513, "grad_norm": 0.41515781074927094, "learning_rate": 2.2564102564102566e-05, "loss": 0.2834, "num_tokens": 2260992.0, "step": 23 }, { "epoch": 0.03732503888024884, "grad_norm": 0.4262368748037849, "learning_rate": 2.3589743589743593e-05, "loss": 0.2997, "num_tokens": 2359296.0, "step": 24 }, { "epoch": 0.038880248833592534, "grad_norm": 0.4081101208670365, "learning_rate": 2.461538461538462e-05, "loss": 0.2795, "num_tokens": 2457600.0, "step": 25 }, { "epoch": 0.04043545878693624, "grad_norm": 0.431624940467908, "learning_rate": 2.5641025641025646e-05, "loss": 0.305, "num_tokens": 2555904.0, "step": 26 }, { "epoch": 0.041990668740279936, "grad_norm": 0.41284532737395896, "learning_rate": 2.6666666666666667e-05, "loss": 0.2739, "num_tokens": 2654208.0, "step": 27 }, { "epoch": 0.04354587869362364, "grad_norm": 0.47870377778440637, "learning_rate": 2.7692307692307694e-05, "loss": 0.2824, "num_tokens": 2752512.0, "step": 28 }, { "epoch": 0.04510108864696734, "grad_norm": 0.41398942383194176, "learning_rate": 2.871794871794872e-05, "loss": 0.281, "num_tokens": 2850816.0, "step": 29 }, { "epoch": 0.04665629860031104, "grad_norm": 0.36447128808149937, "learning_rate": 2.9743589743589747e-05, "loss": 0.25, "num_tokens": 2949120.0, "step": 30 }, { "epoch": 0.04821150855365474, "grad_norm": 0.4110503981668635, "learning_rate": 3.0769230769230774e-05, "loss": 0.2712, "num_tokens": 3047424.0, "step": 31 }, { "epoch": 0.049766718506998445, "grad_norm": 0.40755772093711445, "learning_rate": 3.1794871794871795e-05, "loss": 0.2695, "num_tokens": 3145728.0, "step": 32 }, { "epoch": 0.05132192846034215, "grad_norm": 0.37313909958643876, "learning_rate": 3.282051282051282e-05, "loss": 0.2582, "num_tokens": 3244032.0, "step": 33 }, { "epoch": 0.05287713841368585, "grad_norm": 0.39062798103594293, "learning_rate": 3.384615384615385e-05, "loss": 0.2634, "num_tokens": 3342336.0, "step": 34 }, { "epoch": 0.05443234836702955, "grad_norm": 0.3677317032681235, "learning_rate": 3.4871794871794875e-05, "loss": 0.2448, "num_tokens": 3440640.0, "step": 35 }, { "epoch": 0.05598755832037325, "grad_norm": 0.4055742321262628, "learning_rate": 3.58974358974359e-05, "loss": 0.2556, "num_tokens": 3538944.0, "step": 36 }, { "epoch": 0.05754276827371695, "grad_norm": 0.37002642800457675, "learning_rate": 3.692307692307693e-05, "loss": 0.2675, "num_tokens": 3637248.0, "step": 37 }, { "epoch": 0.05909797822706065, "grad_norm": 0.36841419474296555, "learning_rate": 3.794871794871795e-05, "loss": 0.2476, "num_tokens": 3735552.0, "step": 38 }, { "epoch": 0.060653188180404355, "grad_norm": 0.3723174037856664, "learning_rate": 3.8974358974358976e-05, "loss": 0.2555, "num_tokens": 3833856.0, "step": 39 }, { "epoch": 0.06220839813374806, "grad_norm": 0.4309025451145102, "learning_rate": 4e-05, "loss": 0.2586, "num_tokens": 3932160.0, "step": 40 }, { "epoch": 0.06376360808709176, "grad_norm": 0.36381017252460623, "learning_rate": 3.9999942877248535e-05, "loss": 0.2636, "num_tokens": 4030464.0, "step": 41 }, { "epoch": 0.06531881804043546, "grad_norm": 0.37480514493747463, "learning_rate": 3.999977150935671e-05, "loss": 0.2467, "num_tokens": 4128768.0, "step": 42 }, { "epoch": 0.06687402799377916, "grad_norm": 0.3551496604804387, "learning_rate": 3.9999485897412184e-05, "loss": 0.2418, "num_tokens": 4227072.0, "step": 43 }, { "epoch": 0.06842923794712286, "grad_norm": 0.37221738777932345, "learning_rate": 3.999908604322772e-05, "loss": 0.2422, "num_tokens": 4325376.0, "step": 44 }, { "epoch": 0.06998444790046657, "grad_norm": 0.38373813414538355, "learning_rate": 3.99985719493412e-05, "loss": 0.2501, "num_tokens": 4423680.0, "step": 45 }, { "epoch": 0.07153965785381027, "grad_norm": 0.35109111876762833, "learning_rate": 3.999794361901555e-05, "loss": 0.2403, "num_tokens": 4521984.0, "step": 46 }, { "epoch": 0.07309486780715396, "grad_norm": 0.3572544976048282, "learning_rate": 3.999720105623877e-05, "loss": 0.2428, "num_tokens": 4620288.0, "step": 47 }, { "epoch": 0.07465007776049767, "grad_norm": 0.3660820292272284, "learning_rate": 3.999634426572388e-05, "loss": 0.234, "num_tokens": 4718592.0, "step": 48 }, { "epoch": 0.07620528771384137, "grad_norm": 0.3570790057540019, "learning_rate": 3.999537325290891e-05, "loss": 0.2563, "num_tokens": 4816896.0, "step": 49 }, { "epoch": 0.07776049766718507, "grad_norm": 0.366563047499154, "learning_rate": 3.999428802395686e-05, "loss": 0.2325, "num_tokens": 4907869.0, "step": 50 }, { "epoch": 0.07931570762052877, "grad_norm": 0.36364325129358965, "learning_rate": 3.9993088585755635e-05, "loss": 0.2566, "num_tokens": 5006173.0, "step": 51 }, { "epoch": 0.08087091757387248, "grad_norm": 0.37314116726411756, "learning_rate": 3.999177494591804e-05, "loss": 0.2558, "num_tokens": 5104477.0, "step": 52 }, { "epoch": 0.08242612752721618, "grad_norm": 0.3380090014018147, "learning_rate": 3.9990347112781716e-05, "loss": 0.2395, "num_tokens": 5202781.0, "step": 53 }, { "epoch": 0.08398133748055987, "grad_norm": 0.3747599286645192, "learning_rate": 3.998880509540908e-05, "loss": 0.2556, "num_tokens": 5301085.0, "step": 54 }, { "epoch": 0.08553654743390357, "grad_norm": 0.362755667091162, "learning_rate": 3.9987148903587264e-05, "loss": 0.2473, "num_tokens": 5399389.0, "step": 55 }, { "epoch": 0.08709175738724728, "grad_norm": 0.32546269884947954, "learning_rate": 3.998537854782808e-05, "loss": 0.2321, "num_tokens": 5497693.0, "step": 56 }, { "epoch": 0.08864696734059098, "grad_norm": 0.3371495022769505, "learning_rate": 3.998349403936793e-05, "loss": 0.2489, "num_tokens": 5595997.0, "step": 57 }, { "epoch": 0.09020217729393468, "grad_norm": 0.3971039796049158, "learning_rate": 3.998149539016772e-05, "loss": 0.2433, "num_tokens": 5694301.0, "step": 58 }, { "epoch": 0.09175738724727839, "grad_norm": 0.34705279729420085, "learning_rate": 3.9979382612912833e-05, "loss": 0.2348, "num_tokens": 5792605.0, "step": 59 }, { "epoch": 0.09331259720062209, "grad_norm": 0.3456916344515809, "learning_rate": 3.997715572101302e-05, "loss": 0.238, "num_tokens": 5890909.0, "step": 60 }, { "epoch": 0.09486780715396578, "grad_norm": 0.34488856595089473, "learning_rate": 3.9974814728602285e-05, "loss": 0.2407, "num_tokens": 5989213.0, "step": 61 }, { "epoch": 0.09642301710730948, "grad_norm": 0.3577308090390812, "learning_rate": 3.997235965053885e-05, "loss": 0.239, "num_tokens": 6087517.0, "step": 62 }, { "epoch": 0.09797822706065319, "grad_norm": 0.3144068441900845, "learning_rate": 3.996979050240503e-05, "loss": 0.2216, "num_tokens": 6185821.0, "step": 63 }, { "epoch": 0.09953343701399689, "grad_norm": 0.3432883152179806, "learning_rate": 3.996710730050712e-05, "loss": 0.2313, "num_tokens": 6284125.0, "step": 64 }, { "epoch": 0.10108864696734059, "grad_norm": 0.32537794062151115, "learning_rate": 3.996431006187536e-05, "loss": 0.216, "num_tokens": 6382429.0, "step": 65 }, { "epoch": 0.1026438569206843, "grad_norm": 0.32473278561347874, "learning_rate": 3.9961398804263714e-05, "loss": 0.2377, "num_tokens": 6480733.0, "step": 66 }, { "epoch": 0.104199066874028, "grad_norm": 0.3173406003242754, "learning_rate": 3.995837354614987e-05, "loss": 0.2136, "num_tokens": 6579037.0, "step": 67 }, { "epoch": 0.1057542768273717, "grad_norm": 0.3270747425584779, "learning_rate": 3.995523430673505e-05, "loss": 0.2307, "num_tokens": 6677341.0, "step": 68 }, { "epoch": 0.10730948678071539, "grad_norm": 0.36111149543972787, "learning_rate": 3.9951981105943934e-05, "loss": 0.2326, "num_tokens": 6775645.0, "step": 69 }, { "epoch": 0.1088646967340591, "grad_norm": 0.3311812440414598, "learning_rate": 3.9948613964424485e-05, "loss": 0.225, "num_tokens": 6873949.0, "step": 70 }, { "epoch": 0.1104199066874028, "grad_norm": 0.3382336119955324, "learning_rate": 3.994513290354786e-05, "loss": 0.2506, "num_tokens": 6972253.0, "step": 71 }, { "epoch": 0.1119751166407465, "grad_norm": 0.33051563571470793, "learning_rate": 3.994153794540826e-05, "loss": 0.2386, "num_tokens": 7070557.0, "step": 72 }, { "epoch": 0.11353032659409021, "grad_norm": 0.3212727995693181, "learning_rate": 3.993782911282278e-05, "loss": 0.2191, "num_tokens": 7168861.0, "step": 73 }, { "epoch": 0.1150855365474339, "grad_norm": 0.31364999627764234, "learning_rate": 3.993400642933128e-05, "loss": 0.2229, "num_tokens": 7267165.0, "step": 74 }, { "epoch": 0.1166407465007776, "grad_norm": 0.31566058285979465, "learning_rate": 3.993006991919622e-05, "loss": 0.2235, "num_tokens": 7365469.0, "step": 75 }, { "epoch": 0.1181959564541213, "grad_norm": 0.3167941259557774, "learning_rate": 3.992601960740254e-05, "loss": 0.2246, "num_tokens": 7463773.0, "step": 76 }, { "epoch": 0.11975116640746501, "grad_norm": 0.3148600595643472, "learning_rate": 3.9921855519657435e-05, "loss": 0.2159, "num_tokens": 7562077.0, "step": 77 }, { "epoch": 0.12130637636080871, "grad_norm": 0.32690800532566694, "learning_rate": 3.991757768239027e-05, "loss": 0.2223, "num_tokens": 7660381.0, "step": 78 }, { "epoch": 0.12286158631415241, "grad_norm": 0.3185350536944732, "learning_rate": 3.9913186122752344e-05, "loss": 0.2201, "num_tokens": 7758685.0, "step": 79 }, { "epoch": 0.12441679626749612, "grad_norm": 0.3446957177765422, "learning_rate": 3.990868086861679e-05, "loss": 0.2292, "num_tokens": 7856989.0, "step": 80 }, { "epoch": 0.12597200622083982, "grad_norm": 0.3260898255198152, "learning_rate": 3.990406194857831e-05, "loss": 0.2306, "num_tokens": 7955293.0, "step": 81 }, { "epoch": 0.12752721617418353, "grad_norm": 0.30829678619163003, "learning_rate": 3.989932939195307e-05, "loss": 0.2213, "num_tokens": 8053597.0, "step": 82 }, { "epoch": 0.1290824261275272, "grad_norm": 0.48482173133274953, "learning_rate": 3.989448322877848e-05, "loss": 0.222, "num_tokens": 8151901.0, "step": 83 }, { "epoch": 0.13063763608087092, "grad_norm": 0.32641835889507975, "learning_rate": 3.988952348981299e-05, "loss": 0.226, "num_tokens": 8250205.0, "step": 84 }, { "epoch": 0.1321928460342146, "grad_norm": 0.2993853241879743, "learning_rate": 3.9884450206535944e-05, "loss": 0.2185, "num_tokens": 8348509.0, "step": 85 }, { "epoch": 0.13374805598755832, "grad_norm": 0.3233091352633471, "learning_rate": 3.9879263411147314e-05, "loss": 0.2226, "num_tokens": 8446813.0, "step": 86 }, { "epoch": 0.13530326594090203, "grad_norm": 0.32643486163425744, "learning_rate": 3.987396313656756e-05, "loss": 0.234, "num_tokens": 8545117.0, "step": 87 }, { "epoch": 0.1368584758942457, "grad_norm": 0.3193143217420879, "learning_rate": 3.986854941643737e-05, "loss": 0.2437, "num_tokens": 8643421.0, "step": 88 }, { "epoch": 0.13841368584758942, "grad_norm": 0.31421128146642074, "learning_rate": 3.9863022285117485e-05, "loss": 0.2266, "num_tokens": 8741725.0, "step": 89 }, { "epoch": 0.13996889580093314, "grad_norm": 0.32304454430355284, "learning_rate": 3.985738177768845e-05, "loss": 0.2222, "num_tokens": 8840029.0, "step": 90 }, { "epoch": 0.14152410575427682, "grad_norm": 0.3215049649231576, "learning_rate": 3.98516279299504e-05, "loss": 0.2174, "num_tokens": 8938333.0, "step": 91 }, { "epoch": 0.14307931570762053, "grad_norm": 0.3367513895048256, "learning_rate": 3.9845760778422866e-05, "loss": 0.2193, "num_tokens": 9036637.0, "step": 92 }, { "epoch": 0.14463452566096424, "grad_norm": 0.31325249468145605, "learning_rate": 3.983978036034449e-05, "loss": 0.216, "num_tokens": 9134941.0, "step": 93 }, { "epoch": 0.14618973561430793, "grad_norm": 0.3285912671237864, "learning_rate": 3.983368671367281e-05, "loss": 0.2276, "num_tokens": 9233245.0, "step": 94 }, { "epoch": 0.14774494556765164, "grad_norm": 0.3421696179003487, "learning_rate": 3.9827479877084043e-05, "loss": 0.2275, "num_tokens": 9331549.0, "step": 95 }, { "epoch": 0.14930015552099535, "grad_norm": 0.3084022350822793, "learning_rate": 3.9821159889972814e-05, "loss": 0.2153, "num_tokens": 9429853.0, "step": 96 }, { "epoch": 0.15085536547433903, "grad_norm": 0.29262350685558713, "learning_rate": 3.9814726792451894e-05, "loss": 0.2073, "num_tokens": 9528157.0, "step": 97 }, { "epoch": 0.15241057542768274, "grad_norm": 0.32985465867232566, "learning_rate": 3.980818062535199e-05, "loss": 0.2134, "num_tokens": 9626461.0, "step": 98 }, { "epoch": 0.15396578538102643, "grad_norm": 0.32455177999462903, "learning_rate": 3.980152143022143e-05, "loss": 0.2321, "num_tokens": 9724765.0, "step": 99 }, { "epoch": 0.15552099533437014, "grad_norm": 0.32415315842948483, "learning_rate": 3.9794749249325945e-05, "loss": 0.2111, "num_tokens": 9811820.0, "step": 100 }, { "epoch": 0.15707620528771385, "grad_norm": 0.32747006866158823, "learning_rate": 3.978786412564839e-05, "loss": 0.2245, "num_tokens": 9910124.0, "step": 101 }, { "epoch": 0.15863141524105753, "grad_norm": 0.3097311884813666, "learning_rate": 3.978086610288844e-05, "loss": 0.2207, "num_tokens": 10008428.0, "step": 102 }, { "epoch": 0.16018662519440124, "grad_norm": 0.33495682676905253, "learning_rate": 3.977375522546236e-05, "loss": 0.215, "num_tokens": 10106732.0, "step": 103 }, { "epoch": 0.16174183514774496, "grad_norm": 0.3084392041679088, "learning_rate": 3.97665315385027e-05, "loss": 0.2128, "num_tokens": 10205036.0, "step": 104 }, { "epoch": 0.16329704510108864, "grad_norm": 0.3073018362247758, "learning_rate": 3.975919508785798e-05, "loss": 0.2203, "num_tokens": 10303340.0, "step": 105 }, { "epoch": 0.16485225505443235, "grad_norm": 0.3083905217865771, "learning_rate": 3.975174592009247e-05, "loss": 0.2218, "num_tokens": 10401644.0, "step": 106 }, { "epoch": 0.16640746500777606, "grad_norm": 0.3238926674958416, "learning_rate": 3.974418408248583e-05, "loss": 0.2311, "num_tokens": 10499948.0, "step": 107 }, { "epoch": 0.16796267496111975, "grad_norm": 0.3008986484033563, "learning_rate": 3.973650962303283e-05, "loss": 0.2114, "num_tokens": 10598252.0, "step": 108 }, { "epoch": 0.16951788491446346, "grad_norm": 0.3157463422588962, "learning_rate": 3.9728722590443044e-05, "loss": 0.2167, "num_tokens": 10696556.0, "step": 109 }, { "epoch": 0.17107309486780714, "grad_norm": 0.308735276686564, "learning_rate": 3.972082303414057e-05, "loss": 0.2219, "num_tokens": 10794860.0, "step": 110 }, { "epoch": 0.17262830482115085, "grad_norm": 0.31009722418406466, "learning_rate": 3.971281100426366e-05, "loss": 0.2264, "num_tokens": 10893164.0, "step": 111 }, { "epoch": 0.17418351477449456, "grad_norm": 0.33859003660128173, "learning_rate": 3.970468655166445e-05, "loss": 0.2231, "num_tokens": 10991468.0, "step": 112 }, { "epoch": 0.17573872472783825, "grad_norm": 0.3063209244224891, "learning_rate": 3.969644972790862e-05, "loss": 0.2283, "num_tokens": 11089772.0, "step": 113 }, { "epoch": 0.17729393468118196, "grad_norm": 0.30630591237924, "learning_rate": 3.968810058527506e-05, "loss": 0.2216, "num_tokens": 11188076.0, "step": 114 }, { "epoch": 0.17884914463452567, "grad_norm": 0.30756864886596896, "learning_rate": 3.967963917675555e-05, "loss": 0.2149, "num_tokens": 11286380.0, "step": 115 }, { "epoch": 0.18040435458786935, "grad_norm": 0.31340223912481163, "learning_rate": 3.967106555605441e-05, "loss": 0.2234, "num_tokens": 11384684.0, "step": 116 }, { "epoch": 0.18195956454121306, "grad_norm": 0.287717656833506, "learning_rate": 3.9662379777588176e-05, "loss": 0.2166, "num_tokens": 11482988.0, "step": 117 }, { "epoch": 0.18351477449455678, "grad_norm": 0.3025552284426974, "learning_rate": 3.9653581896485256e-05, "loss": 0.2225, "num_tokens": 11581292.0, "step": 118 }, { "epoch": 0.18506998444790046, "grad_norm": 0.2920662843460402, "learning_rate": 3.964467196858554e-05, "loss": 0.2183, "num_tokens": 11679596.0, "step": 119 }, { "epoch": 0.18662519440124417, "grad_norm": 0.29477149292004745, "learning_rate": 3.96356500504401e-05, "loss": 0.22, "num_tokens": 11777900.0, "step": 120 }, { "epoch": 0.18818040435458788, "grad_norm": 0.29768236045238605, "learning_rate": 3.962651619931081e-05, "loss": 0.2183, "num_tokens": 11876204.0, "step": 121 }, { "epoch": 0.18973561430793157, "grad_norm": 0.3026064609026335, "learning_rate": 3.961727047316997e-05, "loss": 0.2287, "num_tokens": 11974508.0, "step": 122 }, { "epoch": 0.19129082426127528, "grad_norm": 0.28932117404344854, "learning_rate": 3.960791293069993e-05, "loss": 0.211, "num_tokens": 12072812.0, "step": 123 }, { "epoch": 0.19284603421461896, "grad_norm": 0.33819817436644517, "learning_rate": 3.959844363129278e-05, "loss": 0.2157, "num_tokens": 12171116.0, "step": 124 }, { "epoch": 0.19440124416796267, "grad_norm": 0.2963134344855962, "learning_rate": 3.958886263504988e-05, "loss": 0.2097, "num_tokens": 12269420.0, "step": 125 }, { "epoch": 0.19595645412130638, "grad_norm": 0.30355521973807087, "learning_rate": 3.957917000278156e-05, "loss": 0.2184, "num_tokens": 12367724.0, "step": 126 }, { "epoch": 0.19751166407465007, "grad_norm": 0.3563725799359531, "learning_rate": 3.956936579600669e-05, "loss": 0.2186, "num_tokens": 12466028.0, "step": 127 }, { "epoch": 0.19906687402799378, "grad_norm": 0.3067679959501372, "learning_rate": 3.95594500769523e-05, "loss": 0.2218, "num_tokens": 12564332.0, "step": 128 }, { "epoch": 0.2006220839813375, "grad_norm": 0.2964317372149136, "learning_rate": 3.954942290855317e-05, "loss": 0.2191, "num_tokens": 12662636.0, "step": 129 }, { "epoch": 0.20217729393468117, "grad_norm": 0.34612660940352297, "learning_rate": 3.9539284354451486e-05, "loss": 0.2102, "num_tokens": 12760940.0, "step": 130 }, { "epoch": 0.20373250388802489, "grad_norm": 0.28695860601659834, "learning_rate": 3.952903447899635e-05, "loss": 0.2155, "num_tokens": 12859244.0, "step": 131 }, { "epoch": 0.2052877138413686, "grad_norm": 0.3636078980071715, "learning_rate": 3.9518673347243445e-05, "loss": 0.2231, "num_tokens": 12957548.0, "step": 132 }, { "epoch": 0.20684292379471228, "grad_norm": 0.38537301898501136, "learning_rate": 3.950820102495459e-05, "loss": 0.2285, "num_tokens": 13055852.0, "step": 133 }, { "epoch": 0.208398133748056, "grad_norm": 0.29151511185976403, "learning_rate": 3.949761757859733e-05, "loss": 0.2048, "num_tokens": 13154156.0, "step": 134 }, { "epoch": 0.2099533437013997, "grad_norm": 0.2962986601252305, "learning_rate": 3.9486923075344495e-05, "loss": 0.2158, "num_tokens": 13252460.0, "step": 135 }, { "epoch": 0.2115085536547434, "grad_norm": 0.3404987518473795, "learning_rate": 3.9476117583073804e-05, "loss": 0.2264, "num_tokens": 13350764.0, "step": 136 }, { "epoch": 0.2130637636080871, "grad_norm": 0.3073558716108716, "learning_rate": 3.946520117036743e-05, "loss": 0.2247, "num_tokens": 13449068.0, "step": 137 }, { "epoch": 0.21461897356143078, "grad_norm": 0.32283296771265385, "learning_rate": 3.945417390651153e-05, "loss": 0.2215, "num_tokens": 13547372.0, "step": 138 }, { "epoch": 0.2161741835147745, "grad_norm": 0.3220333585099124, "learning_rate": 3.9443035861495856e-05, "loss": 0.2251, "num_tokens": 13645676.0, "step": 139 }, { "epoch": 0.2177293934681182, "grad_norm": 0.30853637191405303, "learning_rate": 3.9431787106013256e-05, "loss": 0.1986, "num_tokens": 13743980.0, "step": 140 }, { "epoch": 0.2192846034214619, "grad_norm": 0.3023491360466548, "learning_rate": 3.942042771145928e-05, "loss": 0.2131, "num_tokens": 13842284.0, "step": 141 }, { "epoch": 0.2208398133748056, "grad_norm": 0.2886166720191575, "learning_rate": 3.9408957749931696e-05, "loss": 0.2263, "num_tokens": 13940588.0, "step": 142 }, { "epoch": 0.2223950233281493, "grad_norm": 0.28434719836795047, "learning_rate": 3.9397377294230013e-05, "loss": 0.216, "num_tokens": 14038892.0, "step": 143 }, { "epoch": 0.223950233281493, "grad_norm": 0.29772193923540274, "learning_rate": 3.938568641785509e-05, "loss": 0.2206, "num_tokens": 14137196.0, "step": 144 }, { "epoch": 0.2255054432348367, "grad_norm": 0.2873548673047821, "learning_rate": 3.9373885195008585e-05, "loss": 0.2087, "num_tokens": 14235500.0, "step": 145 }, { "epoch": 0.22706065318818042, "grad_norm": 0.29531137238306604, "learning_rate": 3.936197370059253e-05, "loss": 0.2264, "num_tokens": 14333804.0, "step": 146 }, { "epoch": 0.2286158631415241, "grad_norm": 0.5624672744334344, "learning_rate": 3.934995201020885e-05, "loss": 0.2222, "num_tokens": 14432108.0, "step": 147 }, { "epoch": 0.2301710730948678, "grad_norm": 0.2851543921420554, "learning_rate": 3.933782020015889e-05, "loss": 0.2236, "num_tokens": 14530412.0, "step": 148 }, { "epoch": 0.2317262830482115, "grad_norm": 0.3262454082172415, "learning_rate": 3.932557834744291e-05, "loss": 0.2239, "num_tokens": 14628716.0, "step": 149 }, { "epoch": 0.2332814930015552, "grad_norm": 0.30564222230669735, "learning_rate": 3.931322652975961e-05, "loss": 0.2145, "num_tokens": 14723318.0, "step": 150 }, { "epoch": 0.23483670295489892, "grad_norm": 0.2801340333811278, "learning_rate": 3.930076482550564e-05, "loss": 0.2115, "num_tokens": 14821622.0, "step": 151 }, { "epoch": 0.2363919129082426, "grad_norm": 0.2764146043902825, "learning_rate": 3.9288193313775083e-05, "loss": 0.2062, "num_tokens": 14919926.0, "step": 152 }, { "epoch": 0.2379471228615863, "grad_norm": 0.3168280112002025, "learning_rate": 3.927551207435898e-05, "loss": 0.2136, "num_tokens": 15018230.0, "step": 153 }, { "epoch": 0.23950233281493002, "grad_norm": 0.38038728243326975, "learning_rate": 3.9262721187744816e-05, "loss": 0.2099, "num_tokens": 15116534.0, "step": 154 }, { "epoch": 0.2410575427682737, "grad_norm": 0.2890831430355307, "learning_rate": 3.924982073511598e-05, "loss": 0.217, "num_tokens": 15214838.0, "step": 155 }, { "epoch": 0.24261275272161742, "grad_norm": 0.27837915612473546, "learning_rate": 3.923681079835131e-05, "loss": 0.2063, "num_tokens": 15313142.0, "step": 156 }, { "epoch": 0.24416796267496113, "grad_norm": 0.28667150271920794, "learning_rate": 3.92236914600245e-05, "loss": 0.2043, "num_tokens": 15411446.0, "step": 157 }, { "epoch": 0.24572317262830481, "grad_norm": 0.3016175820010192, "learning_rate": 3.921046280340363e-05, "loss": 0.2085, "num_tokens": 15509750.0, "step": 158 }, { "epoch": 0.24727838258164853, "grad_norm": 0.2981247044522806, "learning_rate": 3.919712491245062e-05, "loss": 0.2163, "num_tokens": 15608054.0, "step": 159 }, { "epoch": 0.24883359253499224, "grad_norm": 0.28252953543839854, "learning_rate": 3.918367787182069e-05, "loss": 0.2029, "num_tokens": 15706358.0, "step": 160 }, { "epoch": 0.25038880248833595, "grad_norm": 0.2964632318009124, "learning_rate": 3.9170121766861845e-05, "loss": 0.2235, "num_tokens": 15804662.0, "step": 161 }, { "epoch": 0.25194401244167963, "grad_norm": 0.2820596717298698, "learning_rate": 3.9156456683614294e-05, "loss": 0.2026, "num_tokens": 15902966.0, "step": 162 }, { "epoch": 0.2534992223950233, "grad_norm": 0.2938819718288651, "learning_rate": 3.914268270880997e-05, "loss": 0.219, "num_tokens": 16001270.0, "step": 163 }, { "epoch": 0.25505443234836706, "grad_norm": 0.2941067698689447, "learning_rate": 3.912879992987188e-05, "loss": 0.2159, "num_tokens": 16099574.0, "step": 164 }, { "epoch": 0.25660964230171074, "grad_norm": 0.3001131103962647, "learning_rate": 3.911480843491366e-05, "loss": 0.2188, "num_tokens": 16197878.0, "step": 165 }, { "epoch": 0.2581648522550544, "grad_norm": 0.2803888215510493, "learning_rate": 3.910070831273894e-05, "loss": 0.2087, "num_tokens": 16296182.0, "step": 166 }, { "epoch": 0.25972006220839816, "grad_norm": 0.2777760756740868, "learning_rate": 3.9086499652840795e-05, "loss": 0.2092, "num_tokens": 16394486.0, "step": 167 }, { "epoch": 0.26127527216174184, "grad_norm": 0.2854051354981221, "learning_rate": 3.90721825454012e-05, "loss": 0.2135, "num_tokens": 16492790.0, "step": 168 }, { "epoch": 0.26283048211508553, "grad_norm": 0.2920760117109576, "learning_rate": 3.905775708129045e-05, "loss": 0.2167, "num_tokens": 16591094.0, "step": 169 }, { "epoch": 0.2643856920684292, "grad_norm": 0.2775150241221793, "learning_rate": 3.904322335206655e-05, "loss": 0.2098, "num_tokens": 16689398.0, "step": 170 }, { "epoch": 0.26594090202177295, "grad_norm": 0.29486000649797484, "learning_rate": 3.9028581449974695e-05, "loss": 0.2172, "num_tokens": 16787702.0, "step": 171 }, { "epoch": 0.26749611197511663, "grad_norm": 0.36197398698127725, "learning_rate": 3.901383146794663e-05, "loss": 0.2086, "num_tokens": 16886006.0, "step": 172 }, { "epoch": 0.2690513219284603, "grad_norm": 0.28853907323709294, "learning_rate": 3.899897349960008e-05, "loss": 0.2017, "num_tokens": 16984310.0, "step": 173 }, { "epoch": 0.27060653188180406, "grad_norm": 0.30814665258267326, "learning_rate": 3.898400763923816e-05, "loss": 0.2182, "num_tokens": 17082614.0, "step": 174 }, { "epoch": 0.27216174183514774, "grad_norm": 0.296161229613388, "learning_rate": 3.896893398184877e-05, "loss": 0.2117, "num_tokens": 17180918.0, "step": 175 }, { "epoch": 0.2737169517884914, "grad_norm": 0.2901058957171869, "learning_rate": 3.8953752623103994e-05, "loss": 0.2101, "num_tokens": 17279222.0, "step": 176 }, { "epoch": 0.27527216174183516, "grad_norm": 0.44430533897311775, "learning_rate": 3.8938463659359515e-05, "loss": 0.2126, "num_tokens": 17377526.0, "step": 177 }, { "epoch": 0.27682737169517885, "grad_norm": 0.29075366903050304, "learning_rate": 3.8923067187653946e-05, "loss": 0.216, "num_tokens": 17475830.0, "step": 178 }, { "epoch": 0.27838258164852253, "grad_norm": 0.2835160137125022, "learning_rate": 3.8907563305708264e-05, "loss": 0.2056, "num_tokens": 17574134.0, "step": 179 }, { "epoch": 0.27993779160186627, "grad_norm": 0.27852394910472533, "learning_rate": 3.88919521119252e-05, "loss": 0.2021, "num_tokens": 17672438.0, "step": 180 }, { "epoch": 0.28149300155520995, "grad_norm": 0.2759720973877624, "learning_rate": 3.8876233705388556e-05, "loss": 0.2055, "num_tokens": 17770742.0, "step": 181 }, { "epoch": 0.28304821150855364, "grad_norm": 0.2760146430803605, "learning_rate": 3.8860408185862624e-05, "loss": 0.2028, "num_tokens": 17869046.0, "step": 182 }, { "epoch": 0.2846034214618974, "grad_norm": 0.3082594153426294, "learning_rate": 3.8844475653791544e-05, "loss": 0.2241, "num_tokens": 17967350.0, "step": 183 }, { "epoch": 0.28615863141524106, "grad_norm": 0.28913876402156236, "learning_rate": 3.882843621029866e-05, "loss": 0.1961, "num_tokens": 18065654.0, "step": 184 }, { "epoch": 0.28771384136858474, "grad_norm": 0.3064712502716389, "learning_rate": 3.8812289957185866e-05, "loss": 0.2306, "num_tokens": 18163958.0, "step": 185 }, { "epoch": 0.2892690513219285, "grad_norm": 0.3127767642278516, "learning_rate": 3.8796036996932996e-05, "loss": 0.2211, "num_tokens": 18262262.0, "step": 186 }, { "epoch": 0.29082426127527217, "grad_norm": 0.28455204825070884, "learning_rate": 3.8779677432697126e-05, "loss": 0.2031, "num_tokens": 18360566.0, "step": 187 }, { "epoch": 0.29237947122861585, "grad_norm": 0.3008765344520019, "learning_rate": 3.8763211368311974e-05, "loss": 0.2039, "num_tokens": 18458870.0, "step": 188 }, { "epoch": 0.2939346811819596, "grad_norm": 0.2950630512216382, "learning_rate": 3.874663890828719e-05, "loss": 0.1992, "num_tokens": 18557174.0, "step": 189 }, { "epoch": 0.2954898911353033, "grad_norm": 0.27431658375946144, "learning_rate": 3.8729960157807714e-05, "loss": 0.2015, "num_tokens": 18655478.0, "step": 190 }, { "epoch": 0.29704510108864696, "grad_norm": 0.2856968012390358, "learning_rate": 3.871317522273312e-05, "loss": 0.2087, "num_tokens": 18753782.0, "step": 191 }, { "epoch": 0.2986003110419907, "grad_norm": 0.29498051801351943, "learning_rate": 3.869628420959693e-05, "loss": 0.2173, "num_tokens": 18852086.0, "step": 192 }, { "epoch": 0.3001555209953344, "grad_norm": 0.289691406112623, "learning_rate": 3.8679287225605937e-05, "loss": 0.2124, "num_tokens": 18950390.0, "step": 193 }, { "epoch": 0.30171073094867806, "grad_norm": 1.024546613907252, "learning_rate": 3.866218437863952e-05, "loss": 0.2117, "num_tokens": 19048694.0, "step": 194 }, { "epoch": 0.30326594090202175, "grad_norm": 1.4265780989567831, "learning_rate": 3.8644975777248984e-05, "loss": 0.2189, "num_tokens": 19146998.0, "step": 195 }, { "epoch": 0.3048211508553655, "grad_norm": 0.2683972230206361, "learning_rate": 3.8627661530656854e-05, "loss": 0.2106, "num_tokens": 19245302.0, "step": 196 }, { "epoch": 0.30637636080870917, "grad_norm": 0.2846922689678994, "learning_rate": 3.8610241748756164e-05, "loss": 0.2155, "num_tokens": 19343606.0, "step": 197 }, { "epoch": 0.30793157076205285, "grad_norm": 0.2886820697144978, "learning_rate": 3.859271654210979e-05, "loss": 0.215, "num_tokens": 19441910.0, "step": 198 }, { "epoch": 0.3094867807153966, "grad_norm": 0.29194333848128134, "learning_rate": 3.857508602194973e-05, "loss": 0.2195, "num_tokens": 19540214.0, "step": 199 }, { "epoch": 0.3110419906687403, "grad_norm": 0.2882449080731781, "learning_rate": 3.855735030017643e-05, "loss": 0.2095, "num_tokens": 19638007.0, "step": 200 }, { "epoch": 0.31259720062208396, "grad_norm": 0.30387147479766496, "learning_rate": 3.853950948935801e-05, "loss": 0.2056, "num_tokens": 19736311.0, "step": 201 }, { "epoch": 0.3141524105754277, "grad_norm": 0.2994077185233693, "learning_rate": 3.8521563702729595e-05, "loss": 0.212, "num_tokens": 19834615.0, "step": 202 }, { "epoch": 0.3157076205287714, "grad_norm": 0.2851302646940208, "learning_rate": 3.850351305419261e-05, "loss": 0.1953, "num_tokens": 19932919.0, "step": 203 }, { "epoch": 0.31726283048211507, "grad_norm": 0.3033298001742302, "learning_rate": 3.848535765831402e-05, "loss": 0.2257, "num_tokens": 20031223.0, "step": 204 }, { "epoch": 0.3188180404354588, "grad_norm": 0.325001207080733, "learning_rate": 3.846709763032563e-05, "loss": 0.2135, "num_tokens": 20129527.0, "step": 205 }, { "epoch": 0.3203732503888025, "grad_norm": 0.28627855008714914, "learning_rate": 3.8448733086123315e-05, "loss": 0.2067, "num_tokens": 20227831.0, "step": 206 }, { "epoch": 0.32192846034214617, "grad_norm": 0.2689245489420853, "learning_rate": 3.843026414226634e-05, "loss": 0.2116, "num_tokens": 20326135.0, "step": 207 }, { "epoch": 0.3234836702954899, "grad_norm": 0.28764083060898005, "learning_rate": 3.841169091597659e-05, "loss": 0.2065, "num_tokens": 20424439.0, "step": 208 }, { "epoch": 0.3250388802488336, "grad_norm": 0.31021562999923524, "learning_rate": 3.839301352513779e-05, "loss": 0.2115, "num_tokens": 20522743.0, "step": 209 }, { "epoch": 0.3265940902021773, "grad_norm": 0.27963790995216875, "learning_rate": 3.837423208829486e-05, "loss": 0.2012, "num_tokens": 20621047.0, "step": 210 }, { "epoch": 0.328149300155521, "grad_norm": 0.26353742416800763, "learning_rate": 3.8355346724653045e-05, "loss": 0.1926, "num_tokens": 20719351.0, "step": 211 }, { "epoch": 0.3297045101088647, "grad_norm": 0.2811831888105367, "learning_rate": 3.833635755407723e-05, "loss": 0.2027, "num_tokens": 20817655.0, "step": 212 }, { "epoch": 0.3312597200622084, "grad_norm": 0.29788084840623447, "learning_rate": 3.831726469709115e-05, "loss": 0.2112, "num_tokens": 20915959.0, "step": 213 }, { "epoch": 0.3328149300155521, "grad_norm": 0.28913207499343113, "learning_rate": 3.829806827487664e-05, "loss": 0.2111, "num_tokens": 21014263.0, "step": 214 }, { "epoch": 0.3343701399688958, "grad_norm": 0.2907185532538335, "learning_rate": 3.827876840927287e-05, "loss": 0.2164, "num_tokens": 21112567.0, "step": 215 }, { "epoch": 0.3359253499222395, "grad_norm": 0.5297602668324771, "learning_rate": 3.8259365222775546e-05, "loss": 0.2094, "num_tokens": 21210871.0, "step": 216 }, { "epoch": 0.33748055987558323, "grad_norm": 0.30601393539356647, "learning_rate": 3.823985883853617e-05, "loss": 0.2103, "num_tokens": 21309175.0, "step": 217 }, { "epoch": 0.3390357698289269, "grad_norm": 0.2852703341084803, "learning_rate": 3.8220249380361207e-05, "loss": 0.2092, "num_tokens": 21407479.0, "step": 218 }, { "epoch": 0.3405909797822706, "grad_norm": 0.2780581203206016, "learning_rate": 3.820053697271135e-05, "loss": 0.2005, "num_tokens": 21505783.0, "step": 219 }, { "epoch": 0.3421461897356143, "grad_norm": 0.3423740079487521, "learning_rate": 3.818072174070072e-05, "loss": 0.2209, "num_tokens": 21604087.0, "step": 220 }, { "epoch": 0.343701399688958, "grad_norm": 0.2696534971124917, "learning_rate": 3.816080381009604e-05, "loss": 0.2021, "num_tokens": 21702391.0, "step": 221 }, { "epoch": 0.3452566096423017, "grad_norm": 0.28368553193912727, "learning_rate": 3.814078330731585e-05, "loss": 0.215, "num_tokens": 21800695.0, "step": 222 }, { "epoch": 0.3468118195956454, "grad_norm": 0.2668339767994437, "learning_rate": 3.812066035942975e-05, "loss": 0.2076, "num_tokens": 21898999.0, "step": 223 }, { "epoch": 0.3483670295489891, "grad_norm": 0.26750915853223634, "learning_rate": 3.8100435094157525e-05, "loss": 0.2102, "num_tokens": 21997303.0, "step": 224 }, { "epoch": 0.3499222395023328, "grad_norm": 0.3274523971862046, "learning_rate": 3.8080107639868374e-05, "loss": 0.2201, "num_tokens": 22095607.0, "step": 225 }, { "epoch": 0.3514774494556765, "grad_norm": 0.28505688109249316, "learning_rate": 3.80596781255801e-05, "loss": 0.2196, "num_tokens": 22193911.0, "step": 226 }, { "epoch": 0.35303265940902023, "grad_norm": 0.2771918120313445, "learning_rate": 3.8039146680958245e-05, "loss": 0.2009, "num_tokens": 22292215.0, "step": 227 }, { "epoch": 0.3545878693623639, "grad_norm": 0.276282343767416, "learning_rate": 3.8018513436315335e-05, "loss": 0.1997, "num_tokens": 22390519.0, "step": 228 }, { "epoch": 0.3561430793157076, "grad_norm": 0.27341875889459266, "learning_rate": 3.799777852261001e-05, "loss": 0.1947, "num_tokens": 22488823.0, "step": 229 }, { "epoch": 0.35769828926905134, "grad_norm": 0.3113095693229513, "learning_rate": 3.797694207144617e-05, "loss": 0.2132, "num_tokens": 22587127.0, "step": 230 }, { "epoch": 0.359253499222395, "grad_norm": 0.2741158849682268, "learning_rate": 3.795600421507222e-05, "loss": 0.2005, "num_tokens": 22685431.0, "step": 231 }, { "epoch": 0.3608087091757387, "grad_norm": 0.32328177026955135, "learning_rate": 3.793496508638015e-05, "loss": 0.2103, "num_tokens": 22783735.0, "step": 232 }, { "epoch": 0.36236391912908245, "grad_norm": 0.2841787845819011, "learning_rate": 3.791382481890472e-05, "loss": 0.2005, "num_tokens": 22882039.0, "step": 233 }, { "epoch": 0.36391912908242613, "grad_norm": 0.28597267756994643, "learning_rate": 3.7892583546822635e-05, "loss": 0.2011, "num_tokens": 22980343.0, "step": 234 }, { "epoch": 0.3654743390357698, "grad_norm": 0.2702219938397021, "learning_rate": 3.7871241404951656e-05, "loss": 0.2097, "num_tokens": 23078647.0, "step": 235 }, { "epoch": 0.36702954898911355, "grad_norm": 0.2755983313372928, "learning_rate": 3.7849798528749766e-05, "loss": 0.203, "num_tokens": 23176951.0, "step": 236 }, { "epoch": 0.36858475894245724, "grad_norm": 0.28617433778188606, "learning_rate": 3.7828255054314316e-05, "loss": 0.2041, "num_tokens": 23275255.0, "step": 237 }, { "epoch": 0.3701399688958009, "grad_norm": 0.2786087302232774, "learning_rate": 3.7806611118381137e-05, "loss": 0.2018, "num_tokens": 23373559.0, "step": 238 }, { "epoch": 0.37169517884914466, "grad_norm": 0.2624664804194973, "learning_rate": 3.778486685832369e-05, "loss": 0.196, "num_tokens": 23471863.0, "step": 239 }, { "epoch": 0.37325038880248834, "grad_norm": 0.36815792406393694, "learning_rate": 3.776302241215221e-05, "loss": 0.2188, "num_tokens": 23570167.0, "step": 240 }, { "epoch": 0.374805598755832, "grad_norm": 0.30213759519827343, "learning_rate": 3.7741077918512764e-05, "loss": 0.2121, "num_tokens": 23668471.0, "step": 241 }, { "epoch": 0.37636080870917576, "grad_norm": 0.27762296181175466, "learning_rate": 3.771903351668647e-05, "loss": 0.2017, "num_tokens": 23766775.0, "step": 242 }, { "epoch": 0.37791601866251945, "grad_norm": 0.2857401698618315, "learning_rate": 3.769688934658854e-05, "loss": 0.2105, "num_tokens": 23865079.0, "step": 243 }, { "epoch": 0.37947122861586313, "grad_norm": 0.2705373842528118, "learning_rate": 3.7674645548767395e-05, "loss": 0.2059, "num_tokens": 23963383.0, "step": 244 }, { "epoch": 0.3810264385692068, "grad_norm": 0.2786642602097985, "learning_rate": 3.765230226440381e-05, "loss": 0.2083, "num_tokens": 24061687.0, "step": 245 }, { "epoch": 0.38258164852255055, "grad_norm": 0.25251905490790805, "learning_rate": 3.762985963531001e-05, "loss": 0.1999, "num_tokens": 24159991.0, "step": 246 }, { "epoch": 0.38413685847589424, "grad_norm": 0.2538297061717743, "learning_rate": 3.7607317803928716e-05, "loss": 0.1973, "num_tokens": 24258295.0, "step": 247 }, { "epoch": 0.3856920684292379, "grad_norm": 0.26941442956714595, "learning_rate": 3.758467691333233e-05, "loss": 0.2076, "num_tokens": 24356599.0, "step": 248 }, { "epoch": 0.38724727838258166, "grad_norm": 0.27621026429130807, "learning_rate": 3.756193710722194e-05, "loss": 0.221, "num_tokens": 24454903.0, "step": 249 }, { "epoch": 0.38880248833592534, "grad_norm": 0.3011076170588903, "learning_rate": 3.753909852992649e-05, "loss": 0.1957, "num_tokens": 24552845.0, "step": 250 }, { "epoch": 0.39035769828926903, "grad_norm": 0.281218642822812, "learning_rate": 3.751616132640178e-05, "loss": 0.2186, "num_tokens": 24651149.0, "step": 251 }, { "epoch": 0.39191290824261277, "grad_norm": 0.3244909260228358, "learning_rate": 3.7493125642229614e-05, "loss": 0.2083, "num_tokens": 24749453.0, "step": 252 }, { "epoch": 0.39346811819595645, "grad_norm": 0.26797894896478014, "learning_rate": 3.746999162361685e-05, "loss": 0.1944, "num_tokens": 24847757.0, "step": 253 }, { "epoch": 0.39502332814930013, "grad_norm": 0.267406533787372, "learning_rate": 3.744675941739446e-05, "loss": 0.2021, "num_tokens": 24946061.0, "step": 254 }, { "epoch": 0.3965785381026439, "grad_norm": 0.2976262852478504, "learning_rate": 3.742342917101661e-05, "loss": 0.2078, "num_tokens": 25044365.0, "step": 255 }, { "epoch": 0.39813374805598756, "grad_norm": 0.30055595026480825, "learning_rate": 3.7400001032559733e-05, "loss": 0.2072, "num_tokens": 25142669.0, "step": 256 }, { "epoch": 0.39968895800933124, "grad_norm": 0.2773691864466274, "learning_rate": 3.737647515072159e-05, "loss": 0.2094, "num_tokens": 25240973.0, "step": 257 }, { "epoch": 0.401244167962675, "grad_norm": 0.28631211334087525, "learning_rate": 3.7352851674820285e-05, "loss": 0.2157, "num_tokens": 25339277.0, "step": 258 }, { "epoch": 0.40279937791601866, "grad_norm": 0.2599922537637334, "learning_rate": 3.7329130754793374e-05, "loss": 0.2083, "num_tokens": 25437581.0, "step": 259 }, { "epoch": 0.40435458786936235, "grad_norm": 0.2857206899116342, "learning_rate": 3.730531254119688e-05, "loss": 0.2148, "num_tokens": 25535885.0, "step": 260 }, { "epoch": 0.4059097978227061, "grad_norm": 0.2547013158464654, "learning_rate": 3.7281397185204354e-05, "loss": 0.2035, "num_tokens": 25634189.0, "step": 261 }, { "epoch": 0.40746500777604977, "grad_norm": 0.27073012213248016, "learning_rate": 3.725738483860589e-05, "loss": 0.2143, "num_tokens": 25732493.0, "step": 262 }, { "epoch": 0.40902021772939345, "grad_norm": 0.2789556096307983, "learning_rate": 3.723327565380718e-05, "loss": 0.205, "num_tokens": 25830797.0, "step": 263 }, { "epoch": 0.4105754276827372, "grad_norm": 0.2814040292929754, "learning_rate": 3.7209069783828587e-05, "loss": 0.2114, "num_tokens": 25929101.0, "step": 264 }, { "epoch": 0.4121306376360809, "grad_norm": 0.26747762300983874, "learning_rate": 3.718476738230407e-05, "loss": 0.2104, "num_tokens": 26027405.0, "step": 265 }, { "epoch": 0.41368584758942456, "grad_norm": 0.2543952312099394, "learning_rate": 3.7160368603480316e-05, "loss": 0.2023, "num_tokens": 26125709.0, "step": 266 }, { "epoch": 0.4152410575427683, "grad_norm": 0.25533428320807894, "learning_rate": 3.713587360221569e-05, "loss": 0.2025, "num_tokens": 26224013.0, "step": 267 }, { "epoch": 0.416796267496112, "grad_norm": 0.2687221161631139, "learning_rate": 3.7111282533979296e-05, "loss": 0.2001, "num_tokens": 26322317.0, "step": 268 }, { "epoch": 0.41835147744945567, "grad_norm": 0.2634336465483639, "learning_rate": 3.708659555484995e-05, "loss": 0.2039, "num_tokens": 26420621.0, "step": 269 }, { "epoch": 0.4199066874027994, "grad_norm": 0.2769585667547769, "learning_rate": 3.706181282151526e-05, "loss": 0.2032, "num_tokens": 26518925.0, "step": 270 }, { "epoch": 0.4214618973561431, "grad_norm": 0.27640297955538207, "learning_rate": 3.7036934491270514e-05, "loss": 0.2158, "num_tokens": 26617229.0, "step": 271 }, { "epoch": 0.4230171073094868, "grad_norm": 0.25545454717814303, "learning_rate": 3.7011960722017804e-05, "loss": 0.2108, "num_tokens": 26715533.0, "step": 272 }, { "epoch": 0.42457231726283046, "grad_norm": 0.2699996506762095, "learning_rate": 3.6986891672264956e-05, "loss": 0.2162, "num_tokens": 26813837.0, "step": 273 }, { "epoch": 0.4261275272161742, "grad_norm": 0.2714442417848743, "learning_rate": 3.696172750112453e-05, "loss": 0.207, "num_tokens": 26912141.0, "step": 274 }, { "epoch": 0.4276827371695179, "grad_norm": 0.2433032240232944, "learning_rate": 3.693646836831282e-05, "loss": 0.1894, "num_tokens": 27010445.0, "step": 275 }, { "epoch": 0.42923794712286156, "grad_norm": 0.2570880134626158, "learning_rate": 3.691111443414886e-05, "loss": 0.1945, "num_tokens": 27108749.0, "step": 276 }, { "epoch": 0.4307931570762053, "grad_norm": 0.2576742023907654, "learning_rate": 3.6885665859553364e-05, "loss": 0.2029, "num_tokens": 27207053.0, "step": 277 }, { "epoch": 0.432348367029549, "grad_norm": 0.25974575330556987, "learning_rate": 3.686012280604772e-05, "loss": 0.192, "num_tokens": 27305357.0, "step": 278 }, { "epoch": 0.43390357698289267, "grad_norm": 0.26588859307180346, "learning_rate": 3.6834485435753e-05, "loss": 0.2062, "num_tokens": 27403661.0, "step": 279 }, { "epoch": 0.4354587869362364, "grad_norm": 0.2719436000023082, "learning_rate": 3.680875391138888e-05, "loss": 0.206, "num_tokens": 27501965.0, "step": 280 }, { "epoch": 0.4370139968895801, "grad_norm": 0.2728800878836229, "learning_rate": 3.678292839627263e-05, "loss": 0.2058, "num_tokens": 27600269.0, "step": 281 }, { "epoch": 0.4385692068429238, "grad_norm": 0.26807304882877836, "learning_rate": 3.675700905431809e-05, "loss": 0.2067, "num_tokens": 27698573.0, "step": 282 }, { "epoch": 0.4401244167962675, "grad_norm": 0.26210118482020434, "learning_rate": 3.6730996050034596e-05, "loss": 0.2006, "num_tokens": 27796877.0, "step": 283 }, { "epoch": 0.4416796267496112, "grad_norm": 0.26364648802101737, "learning_rate": 3.670488954852597e-05, "loss": 0.1986, "num_tokens": 27895181.0, "step": 284 }, { "epoch": 0.4432348367029549, "grad_norm": 1.088265110040825, "learning_rate": 3.6678689715489464e-05, "loss": 0.1914, "num_tokens": 27993485.0, "step": 285 }, { "epoch": 0.4447900466562986, "grad_norm": 0.2977492286682399, "learning_rate": 3.6652396717214696e-05, "loss": 0.2065, "num_tokens": 28091789.0, "step": 286 }, { "epoch": 0.4463452566096423, "grad_norm": 0.2489422759117313, "learning_rate": 3.662601072058259e-05, "loss": 0.1839, "num_tokens": 28190093.0, "step": 287 }, { "epoch": 0.447900466562986, "grad_norm": 0.2798925865960865, "learning_rate": 3.6599531893064335e-05, "loss": 0.1962, "num_tokens": 28288397.0, "step": 288 }, { "epoch": 0.4494556765163297, "grad_norm": 0.295760638629796, "learning_rate": 3.657296040272034e-05, "loss": 0.2, "num_tokens": 28386701.0, "step": 289 }, { "epoch": 0.4510108864696734, "grad_norm": 0.2848015061759715, "learning_rate": 3.6546296418199094e-05, "loss": 0.2012, "num_tokens": 28485005.0, "step": 290 }, { "epoch": 0.4525660964230171, "grad_norm": 0.28826676249382704, "learning_rate": 3.651954010873619e-05, "loss": 0.2134, "num_tokens": 28583309.0, "step": 291 }, { "epoch": 0.45412130637636083, "grad_norm": 0.2805293408599211, "learning_rate": 3.6492691644153173e-05, "loss": 0.2019, "num_tokens": 28681613.0, "step": 292 }, { "epoch": 0.4556765163297045, "grad_norm": 0.26213182999337253, "learning_rate": 3.646575119485652e-05, "loss": 0.1986, "num_tokens": 28779917.0, "step": 293 }, { "epoch": 0.4572317262830482, "grad_norm": 0.2863390990213621, "learning_rate": 3.64387189318365e-05, "loss": 0.2124, "num_tokens": 28878221.0, "step": 294 }, { "epoch": 0.45878693623639194, "grad_norm": 0.3313388217819153, "learning_rate": 3.641159502666615e-05, "loss": 0.2057, "num_tokens": 28976525.0, "step": 295 }, { "epoch": 0.4603421461897356, "grad_norm": 0.3266326248066806, "learning_rate": 3.638437965150015e-05, "loss": 0.1956, "num_tokens": 29074829.0, "step": 296 }, { "epoch": 0.4618973561430793, "grad_norm": 0.25665711208232417, "learning_rate": 3.635707297907373e-05, "loss": 0.2012, "num_tokens": 29173133.0, "step": 297 }, { "epoch": 0.463452566096423, "grad_norm": 0.3201724530028497, "learning_rate": 3.632967518270159e-05, "loss": 0.1969, "num_tokens": 29271437.0, "step": 298 }, { "epoch": 0.46500777604976673, "grad_norm": 0.2557161672280044, "learning_rate": 3.630218643627678e-05, "loss": 0.1959, "num_tokens": 29369741.0, "step": 299 }, { "epoch": 0.4665629860031104, "grad_norm": 0.30875435886821534, "learning_rate": 3.6274606914269626e-05, "loss": 0.18, "num_tokens": 29463426.0, "step": 300 }, { "epoch": 0.4681181959564541, "grad_norm": 0.2677510897946269, "learning_rate": 3.624693679172658e-05, "loss": 0.2025, "num_tokens": 29561730.0, "step": 301 }, { "epoch": 0.46967340590979784, "grad_norm": 0.27890390333276255, "learning_rate": 3.6219176244269145e-05, "loss": 0.2135, "num_tokens": 29660034.0, "step": 302 }, { "epoch": 0.4712286158631415, "grad_norm": 0.27025439678651686, "learning_rate": 3.6191325448092764e-05, "loss": 0.1865, "num_tokens": 29758338.0, "step": 303 }, { "epoch": 0.4727838258164852, "grad_norm": 0.2751990234515936, "learning_rate": 3.616338457996566e-05, "loss": 0.1995, "num_tokens": 29856642.0, "step": 304 }, { "epoch": 0.47433903576982894, "grad_norm": 0.28066699886648344, "learning_rate": 3.6135353817227746e-05, "loss": 0.205, "num_tokens": 29954946.0, "step": 305 }, { "epoch": 0.4758942457231726, "grad_norm": 0.37169392876320145, "learning_rate": 3.610723333778952e-05, "loss": 0.1969, "num_tokens": 30053250.0, "step": 306 }, { "epoch": 0.4774494556765163, "grad_norm": 0.2601665998393591, "learning_rate": 3.6079023320130865e-05, "loss": 0.1939, "num_tokens": 30151554.0, "step": 307 }, { "epoch": 0.47900466562986005, "grad_norm": 0.26234747495719646, "learning_rate": 3.60507239433e-05, "loss": 0.2009, "num_tokens": 30249858.0, "step": 308 }, { "epoch": 0.48055987558320373, "grad_norm": 0.5234348086589087, "learning_rate": 3.6022335386912275e-05, "loss": 0.2029, "num_tokens": 30348162.0, "step": 309 }, { "epoch": 0.4821150855365474, "grad_norm": 0.3332464508375318, "learning_rate": 3.5993857831149084e-05, "loss": 0.2032, "num_tokens": 30446466.0, "step": 310 }, { "epoch": 0.48367029548989116, "grad_norm": 0.2585185076626204, "learning_rate": 3.5965291456756684e-05, "loss": 0.2124, "num_tokens": 30544770.0, "step": 311 }, { "epoch": 0.48522550544323484, "grad_norm": 0.26504550168852614, "learning_rate": 3.5936636445045065e-05, "loss": 0.2001, "num_tokens": 30643074.0, "step": 312 }, { "epoch": 0.4867807153965785, "grad_norm": 0.2506410212468444, "learning_rate": 3.590789297788679e-05, "loss": 0.2068, "num_tokens": 30741378.0, "step": 313 }, { "epoch": 0.48833592534992226, "grad_norm": 0.2524559553530682, "learning_rate": 3.587906123771586e-05, "loss": 0.1912, "num_tokens": 30839682.0, "step": 314 }, { "epoch": 0.48989113530326595, "grad_norm": 0.26419820191628945, "learning_rate": 3.585014140752653e-05, "loss": 0.1965, "num_tokens": 30937986.0, "step": 315 }, { "epoch": 0.49144634525660963, "grad_norm": 0.25725110744716406, "learning_rate": 3.5821133670872156e-05, "loss": 0.2124, "num_tokens": 31036290.0, "step": 316 }, { "epoch": 0.49300155520995337, "grad_norm": 0.3611934552133302, "learning_rate": 3.579203821186406e-05, "loss": 0.1977, "num_tokens": 31134594.0, "step": 317 }, { "epoch": 0.49455676516329705, "grad_norm": 0.25329001802607654, "learning_rate": 3.5762855215170295e-05, "loss": 0.1899, "num_tokens": 31232898.0, "step": 318 }, { "epoch": 0.49611197511664074, "grad_norm": 0.2524276308216609, "learning_rate": 3.5733584866014545e-05, "loss": 0.2009, "num_tokens": 31331202.0, "step": 319 }, { "epoch": 0.4976671850699845, "grad_norm": 0.2500908010278474, "learning_rate": 3.5704227350174916e-05, "loss": 0.1925, "num_tokens": 31429506.0, "step": 320 }, { "epoch": 0.49922239502332816, "grad_norm": 0.27096857764918125, "learning_rate": 3.5674782853982734e-05, "loss": 0.1964, "num_tokens": 31527810.0, "step": 321 }, { "epoch": 0.5007776049766719, "grad_norm": 0.31532221227677754, "learning_rate": 3.564525156432141e-05, "loss": 0.2051, "num_tokens": 31626114.0, "step": 322 }, { "epoch": 0.5023328149300156, "grad_norm": 0.27053664594533516, "learning_rate": 3.561563366862523e-05, "loss": 0.1951, "num_tokens": 31724418.0, "step": 323 }, { "epoch": 0.5038880248833593, "grad_norm": 0.2486535496711848, "learning_rate": 3.558592935487816e-05, "loss": 0.1946, "num_tokens": 31822722.0, "step": 324 }, { "epoch": 0.505443234836703, "grad_norm": 0.2504868905080709, "learning_rate": 3.5556138811612646e-05, "loss": 0.1974, "num_tokens": 31921026.0, "step": 325 }, { "epoch": 0.5069984447900466, "grad_norm": 0.2519336145350125, "learning_rate": 3.552626222790846e-05, "loss": 0.2071, "num_tokens": 32019330.0, "step": 326 }, { "epoch": 0.5085536547433903, "grad_norm": 0.2469067491600789, "learning_rate": 3.549629979339144e-05, "loss": 0.2066, "num_tokens": 32117634.0, "step": 327 }, { "epoch": 0.5101088646967341, "grad_norm": 0.260497609159978, "learning_rate": 3.5466251698232334e-05, "loss": 0.1999, "num_tokens": 32215938.0, "step": 328 }, { "epoch": 0.5116640746500778, "grad_norm": 0.25134040713583267, "learning_rate": 3.5436118133145584e-05, "loss": 0.1962, "num_tokens": 32314242.0, "step": 329 }, { "epoch": 0.5132192846034215, "grad_norm": 0.26028392034576, "learning_rate": 3.540589928938808e-05, "loss": 0.1985, "num_tokens": 32412546.0, "step": 330 }, { "epoch": 0.5147744945567652, "grad_norm": 0.2482167571698514, "learning_rate": 3.537559535875799e-05, "loss": 0.1932, "num_tokens": 32510850.0, "step": 331 }, { "epoch": 0.5163297045101088, "grad_norm": 0.254501969238784, "learning_rate": 3.534520653359353e-05, "loss": 0.2087, "num_tokens": 32609154.0, "step": 332 }, { "epoch": 0.5178849144634525, "grad_norm": 0.23009166219343338, "learning_rate": 3.531473300677174e-05, "loss": 0.1837, "num_tokens": 32707458.0, "step": 333 }, { "epoch": 0.5194401244167963, "grad_norm": 0.2880057582682568, "learning_rate": 3.5284174971707245e-05, "loss": 0.1961, "num_tokens": 32805762.0, "step": 334 }, { "epoch": 0.52099533437014, "grad_norm": 0.26058785333022877, "learning_rate": 3.525353262235105e-05, "loss": 0.1981, "num_tokens": 32904066.0, "step": 335 }, { "epoch": 0.5225505443234837, "grad_norm": 0.2555315616081863, "learning_rate": 3.5222806153189304e-05, "loss": 0.2004, "num_tokens": 33002370.0, "step": 336 }, { "epoch": 0.5241057542768274, "grad_norm": 0.290855300670303, "learning_rate": 3.519199575924205e-05, "loss": 0.1961, "num_tokens": 33100674.0, "step": 337 }, { "epoch": 0.5256609642301711, "grad_norm": 0.2562813149975831, "learning_rate": 3.516110163606202e-05, "loss": 0.1966, "num_tokens": 33198978.0, "step": 338 }, { "epoch": 0.5272161741835147, "grad_norm": 0.26441638310803967, "learning_rate": 3.513012397973335e-05, "loss": 0.194, "num_tokens": 33297282.0, "step": 339 }, { "epoch": 0.5287713841368584, "grad_norm": 0.30456450499638277, "learning_rate": 3.5099062986870376e-05, "loss": 0.2075, "num_tokens": 33395586.0, "step": 340 }, { "epoch": 0.5303265940902022, "grad_norm": 0.26847630422957713, "learning_rate": 3.506791885461636e-05, "loss": 0.1986, "num_tokens": 33493890.0, "step": 341 }, { "epoch": 0.5318818040435459, "grad_norm": 0.26280081840325037, "learning_rate": 3.5036691780642246e-05, "loss": 0.1956, "num_tokens": 33592194.0, "step": 342 }, { "epoch": 0.5334370139968896, "grad_norm": 0.2714921046970229, "learning_rate": 3.500538196314541e-05, "loss": 0.2164, "num_tokens": 33690498.0, "step": 343 }, { "epoch": 0.5349922239502333, "grad_norm": 0.2892659300376526, "learning_rate": 3.497398960084841e-05, "loss": 0.1916, "num_tokens": 33788802.0, "step": 344 }, { "epoch": 0.536547433903577, "grad_norm": 0.26170169238025515, "learning_rate": 3.494251489299769e-05, "loss": 0.2185, "num_tokens": 33887106.0, "step": 345 }, { "epoch": 0.5381026438569206, "grad_norm": 0.2433459396386798, "learning_rate": 3.491095803936235e-05, "loss": 0.1937, "num_tokens": 33985410.0, "step": 346 }, { "epoch": 0.5396578538102644, "grad_norm": 0.25412757288991705, "learning_rate": 3.4879319240232884e-05, "loss": 0.1977, "num_tokens": 34083714.0, "step": 347 }, { "epoch": 0.5412130637636081, "grad_norm": 0.25732974226299693, "learning_rate": 3.484759869641987e-05, "loss": 0.2042, "num_tokens": 34182018.0, "step": 348 }, { "epoch": 0.5427682737169518, "grad_norm": 0.244733785211672, "learning_rate": 3.481579660925271e-05, "loss": 0.1959, "num_tokens": 34280322.0, "step": 349 }, { "epoch": 0.5443234836702955, "grad_norm": 0.2788002459946353, "learning_rate": 3.4783913180578385e-05, "loss": 0.1948, "num_tokens": 34369271.0, "step": 350 }, { "epoch": 0.5458786936236392, "grad_norm": 0.257948916483429, "learning_rate": 3.4751948612760134e-05, "loss": 0.201, "num_tokens": 34467575.0, "step": 351 }, { "epoch": 0.5474339035769828, "grad_norm": 0.2639149380756715, "learning_rate": 3.471990310867619e-05, "loss": 0.198, "num_tokens": 34565879.0, "step": 352 }, { "epoch": 0.5489891135303266, "grad_norm": 0.261944283968436, "learning_rate": 3.468777687171849e-05, "loss": 0.1944, "num_tokens": 34664183.0, "step": 353 }, { "epoch": 0.5505443234836703, "grad_norm": 0.2567680528335663, "learning_rate": 3.4655570105791344e-05, "loss": 0.2022, "num_tokens": 34762487.0, "step": 354 }, { "epoch": 0.552099533437014, "grad_norm": 0.24282538120617025, "learning_rate": 3.4623283015310234e-05, "loss": 0.1908, "num_tokens": 34860791.0, "step": 355 }, { "epoch": 0.5536547433903577, "grad_norm": 0.24016395755469333, "learning_rate": 3.4590915805200425e-05, "loss": 0.1894, "num_tokens": 34959095.0, "step": 356 }, { "epoch": 0.5552099533437014, "grad_norm": 0.2633869534817302, "learning_rate": 3.45584686808957e-05, "loss": 0.2035, "num_tokens": 35057399.0, "step": 357 }, { "epoch": 0.5567651632970451, "grad_norm": 0.2390366250153578, "learning_rate": 3.452594184833705e-05, "loss": 0.1784, "num_tokens": 35155703.0, "step": 358 }, { "epoch": 0.5583203732503889, "grad_norm": 0.24725543559423963, "learning_rate": 3.449333551397141e-05, "loss": 0.2012, "num_tokens": 35254007.0, "step": 359 }, { "epoch": 0.5598755832037325, "grad_norm": 0.2805806458833397, "learning_rate": 3.4460649884750275e-05, "loss": 0.2022, "num_tokens": 35352311.0, "step": 360 }, { "epoch": 0.5614307931570762, "grad_norm": 0.28634876129734627, "learning_rate": 3.442788516812842e-05, "loss": 0.1975, "num_tokens": 35450615.0, "step": 361 }, { "epoch": 0.5629860031104199, "grad_norm": 0.24789326329017625, "learning_rate": 3.43950415720626e-05, "loss": 0.201, "num_tokens": 35548919.0, "step": 362 }, { "epoch": 0.5645412130637636, "grad_norm": 0.2391499734798821, "learning_rate": 3.436211930501022e-05, "loss": 0.1845, "num_tokens": 35647223.0, "step": 363 }, { "epoch": 0.5660964230171073, "grad_norm": 0.26374575599879957, "learning_rate": 3.4329118575927996e-05, "loss": 0.2124, "num_tokens": 35745527.0, "step": 364 }, { "epoch": 0.567651632970451, "grad_norm": 0.2621333048992788, "learning_rate": 3.4296039594270637e-05, "loss": 0.2032, "num_tokens": 35843831.0, "step": 365 }, { "epoch": 0.5692068429237948, "grad_norm": 0.24023559507855347, "learning_rate": 3.4262882569989535e-05, "loss": 0.1891, "num_tokens": 35942135.0, "step": 366 }, { "epoch": 0.5707620528771384, "grad_norm": 0.2616175418204755, "learning_rate": 3.4229647713531404e-05, "loss": 0.2043, "num_tokens": 36040439.0, "step": 367 }, { "epoch": 0.5723172628304821, "grad_norm": 0.2836204765284182, "learning_rate": 3.419633523583697e-05, "loss": 0.1978, "num_tokens": 36138743.0, "step": 368 }, { "epoch": 0.5738724727838258, "grad_norm": 0.26691588158409146, "learning_rate": 3.41629453483396e-05, "loss": 0.1994, "num_tokens": 36237047.0, "step": 369 }, { "epoch": 0.5754276827371695, "grad_norm": 0.24473513595665233, "learning_rate": 3.4129478262963966e-05, "loss": 0.2036, "num_tokens": 36335351.0, "step": 370 }, { "epoch": 0.5769828926905132, "grad_norm": 0.268368973868535, "learning_rate": 3.409593419212478e-05, "loss": 0.2163, "num_tokens": 36433655.0, "step": 371 }, { "epoch": 0.578538102643857, "grad_norm": 0.2858964086679865, "learning_rate": 3.40623133487253e-05, "loss": 0.1891, "num_tokens": 36531959.0, "step": 372 }, { "epoch": 0.5800933125972006, "grad_norm": 0.2471231062107934, "learning_rate": 3.4028615946156106e-05, "loss": 0.1892, "num_tokens": 36630263.0, "step": 373 }, { "epoch": 0.5816485225505443, "grad_norm": 0.261408538572406, "learning_rate": 3.399484219829367e-05, "loss": 0.2126, "num_tokens": 36728567.0, "step": 374 }, { "epoch": 0.583203732503888, "grad_norm": 0.28657984975946205, "learning_rate": 3.396099231949903e-05, "loss": 0.2063, "num_tokens": 36826871.0, "step": 375 }, { "epoch": 0.5847589424572317, "grad_norm": 0.23881176566903456, "learning_rate": 3.3927066524616466e-05, "loss": 0.189, "num_tokens": 36925175.0, "step": 376 }, { "epoch": 0.5863141524105754, "grad_norm": 0.25451678591220284, "learning_rate": 3.389306502897204e-05, "loss": 0.1982, "num_tokens": 37023479.0, "step": 377 }, { "epoch": 0.5878693623639192, "grad_norm": 0.26047136966689616, "learning_rate": 3.38589880483723e-05, "loss": 0.1954, "num_tokens": 37121783.0, "step": 378 }, { "epoch": 0.5894245723172629, "grad_norm": 0.24792492307318398, "learning_rate": 3.38248357991029e-05, "loss": 0.1942, "num_tokens": 37220087.0, "step": 379 }, { "epoch": 0.5909797822706065, "grad_norm": 0.26495630603189063, "learning_rate": 3.379060849792724e-05, "loss": 0.1991, "num_tokens": 37318391.0, "step": 380 }, { "epoch": 0.5925349922239502, "grad_norm": 0.25327543801764146, "learning_rate": 3.375630636208505e-05, "loss": 0.2105, "num_tokens": 37416695.0, "step": 381 }, { "epoch": 0.5940902021772939, "grad_norm": 0.24791035602947195, "learning_rate": 3.372192960929102e-05, "loss": 0.1933, "num_tokens": 37514999.0, "step": 382 }, { "epoch": 0.5956454121306376, "grad_norm": 0.29970476819605013, "learning_rate": 3.3687478457733466e-05, "loss": 0.2006, "num_tokens": 37613303.0, "step": 383 }, { "epoch": 0.5972006220839814, "grad_norm": 0.2702234554734602, "learning_rate": 3.36529531260729e-05, "loss": 0.2066, "num_tokens": 37711607.0, "step": 384 }, { "epoch": 0.5987558320373251, "grad_norm": 0.2496747440074347, "learning_rate": 3.3618353833440645e-05, "loss": 0.1884, "num_tokens": 37809911.0, "step": 385 }, { "epoch": 0.6003110419906688, "grad_norm": 0.26465373090557887, "learning_rate": 3.358368079943745e-05, "loss": 0.196, "num_tokens": 37908215.0, "step": 386 }, { "epoch": 0.6018662519440124, "grad_norm": 0.254171312053526, "learning_rate": 3.354893424413211e-05, "loss": 0.1889, "num_tokens": 38006519.0, "step": 387 }, { "epoch": 0.6034214618973561, "grad_norm": 0.2510387132666909, "learning_rate": 3.3514114388060044e-05, "loss": 0.1833, "num_tokens": 38104823.0, "step": 388 }, { "epoch": 0.6049766718506998, "grad_norm": 0.2519148485737051, "learning_rate": 3.3479221452221926e-05, "loss": 0.1999, "num_tokens": 38203127.0, "step": 389 }, { "epoch": 0.6065318818040435, "grad_norm": 0.26344198090661797, "learning_rate": 3.344425565808226e-05, "loss": 0.1909, "num_tokens": 38301431.0, "step": 390 }, { "epoch": 0.6080870917573873, "grad_norm": 0.23886640804102746, "learning_rate": 3.340921722756796e-05, "loss": 0.1736, "num_tokens": 38399735.0, "step": 391 }, { "epoch": 0.609642301710731, "grad_norm": 0.24771961559314445, "learning_rate": 3.3374106383067e-05, "loss": 0.1787, "num_tokens": 38498039.0, "step": 392 }, { "epoch": 0.6111975116640747, "grad_norm": 0.27257779461052445, "learning_rate": 3.333892334742691e-05, "loss": 0.1948, "num_tokens": 38596343.0, "step": 393 }, { "epoch": 0.6127527216174183, "grad_norm": 0.26098945367685944, "learning_rate": 3.3303668343953466e-05, "loss": 0.192, "num_tokens": 38694647.0, "step": 394 }, { "epoch": 0.614307931570762, "grad_norm": 0.2563696715515177, "learning_rate": 3.32683415964092e-05, "loss": 0.1854, "num_tokens": 38792951.0, "step": 395 }, { "epoch": 0.6158631415241057, "grad_norm": 0.2563553823948422, "learning_rate": 3.323294332901201e-05, "loss": 0.2117, "num_tokens": 38891255.0, "step": 396 }, { "epoch": 0.6174183514774495, "grad_norm": 0.2692403041487945, "learning_rate": 3.31974737664337e-05, "loss": 0.1892, "num_tokens": 38989559.0, "step": 397 }, { "epoch": 0.6189735614307932, "grad_norm": 0.25094753910920775, "learning_rate": 3.3161933133798614e-05, "loss": 0.1964, "num_tokens": 39087863.0, "step": 398 }, { "epoch": 0.6205287713841369, "grad_norm": 0.29690770550073226, "learning_rate": 3.312632165668217e-05, "loss": 0.2063, "num_tokens": 39186167.0, "step": 399 }, { "epoch": 0.6220839813374806, "grad_norm": 0.2714346620354062, "learning_rate": 3.309063956110944e-05, "loss": 0.1986, "num_tokens": 39283735.0, "step": 400 }, { "epoch": 0.6236391912908242, "grad_norm": 0.24034700211042803, "learning_rate": 3.305488707355368e-05, "loss": 0.1838, "num_tokens": 39382039.0, "step": 401 }, { "epoch": 0.6251944012441679, "grad_norm": 0.23417615288421406, "learning_rate": 3.3019064420934946e-05, "loss": 0.1792, "num_tokens": 39480343.0, "step": 402 }, { "epoch": 0.6267496111975117, "grad_norm": 0.33664616783063706, "learning_rate": 3.298317183061863e-05, "loss": 0.1972, "num_tokens": 39578647.0, "step": 403 }, { "epoch": 0.6283048211508554, "grad_norm": 0.2562754521166614, "learning_rate": 3.2947209530414e-05, "loss": 0.204, "num_tokens": 39676951.0, "step": 404 }, { "epoch": 0.6298600311041991, "grad_norm": 0.29685973941425536, "learning_rate": 3.29111777485728e-05, "loss": 0.1851, "num_tokens": 39775255.0, "step": 405 }, { "epoch": 0.6314152410575428, "grad_norm": 0.23498446969362843, "learning_rate": 3.287507671378775e-05, "loss": 0.19, "num_tokens": 39873559.0, "step": 406 }, { "epoch": 0.6329704510108864, "grad_norm": 0.2627411277813095, "learning_rate": 3.283890665519111e-05, "loss": 0.1896, "num_tokens": 39971863.0, "step": 407 }, { "epoch": 0.6345256609642301, "grad_norm": 0.24208785274909855, "learning_rate": 3.280266780235325e-05, "loss": 0.1844, "num_tokens": 40070167.0, "step": 408 }, { "epoch": 0.6360808709175739, "grad_norm": 0.24124713846337847, "learning_rate": 3.276636038528117e-05, "loss": 0.1882, "num_tokens": 40168471.0, "step": 409 }, { "epoch": 0.6376360808709176, "grad_norm": 0.25756619713092677, "learning_rate": 3.2729984634417035e-05, "loss": 0.1889, "num_tokens": 40266775.0, "step": 410 }, { "epoch": 0.6391912908242613, "grad_norm": 0.26324306070812886, "learning_rate": 3.269354078063674e-05, "loss": 0.2133, "num_tokens": 40365079.0, "step": 411 }, { "epoch": 0.640746500777605, "grad_norm": 0.2672939357328508, "learning_rate": 3.2657029055248424e-05, "loss": 0.2029, "num_tokens": 40463383.0, "step": 412 }, { "epoch": 0.6423017107309487, "grad_norm": 0.24380230610576556, "learning_rate": 3.262044968999099e-05, "loss": 0.1824, "num_tokens": 40561687.0, "step": 413 }, { "epoch": 0.6438569206842923, "grad_norm": 0.23276154104942368, "learning_rate": 3.2583802917032655e-05, "loss": 0.187, "num_tokens": 40659991.0, "step": 414 }, { "epoch": 0.645412130637636, "grad_norm": 0.23500953006795935, "learning_rate": 3.254708896896948e-05, "loss": 0.1903, "num_tokens": 40758295.0, "step": 415 }, { "epoch": 0.6469673405909798, "grad_norm": 0.24805697981527272, "learning_rate": 3.2510308078823885e-05, "loss": 0.2036, "num_tokens": 40856599.0, "step": 416 }, { "epoch": 0.6485225505443235, "grad_norm": 0.25268587168674045, "learning_rate": 3.247346048004316e-05, "loss": 0.1948, "num_tokens": 40954903.0, "step": 417 }, { "epoch": 0.6500777604976672, "grad_norm": 0.2558734657172133, "learning_rate": 3.243654640649799e-05, "loss": 0.2004, "num_tokens": 41053207.0, "step": 418 }, { "epoch": 0.6516329704510109, "grad_norm": 0.2888317580475259, "learning_rate": 3.239956609248099e-05, "loss": 0.1843, "num_tokens": 41151511.0, "step": 419 }, { "epoch": 0.6531881804043546, "grad_norm": 0.23971671183757823, "learning_rate": 3.236251977270518e-05, "loss": 0.1853, "num_tokens": 41249815.0, "step": 420 }, { "epoch": 0.6547433903576982, "grad_norm": 0.38496527079591764, "learning_rate": 3.232540768230255e-05, "loss": 0.2033, "num_tokens": 41348119.0, "step": 421 }, { "epoch": 0.656298600311042, "grad_norm": 0.25056947727009576, "learning_rate": 3.2288230056822496e-05, "loss": 0.1913, "num_tokens": 41446423.0, "step": 422 }, { "epoch": 0.6578538102643857, "grad_norm": 0.24583783366730325, "learning_rate": 3.2250987132230386e-05, "loss": 0.1939, "num_tokens": 41544727.0, "step": 423 }, { "epoch": 0.6594090202177294, "grad_norm": 0.24631615184181077, "learning_rate": 3.221367914490603e-05, "loss": 0.1929, "num_tokens": 41643031.0, "step": 424 }, { "epoch": 0.6609642301710731, "grad_norm": 0.2368352433104199, "learning_rate": 3.217630633164219e-05, "loss": 0.1863, "num_tokens": 41741335.0, "step": 425 }, { "epoch": 0.6625194401244168, "grad_norm": 0.25647397671318445, "learning_rate": 3.213886892964309e-05, "loss": 0.196, "num_tokens": 41839639.0, "step": 426 }, { "epoch": 0.6640746500777605, "grad_norm": 0.24932036077328457, "learning_rate": 3.2101367176522886e-05, "loss": 0.1868, "num_tokens": 41937943.0, "step": 427 }, { "epoch": 0.6656298600311042, "grad_norm": 0.2551823348570766, "learning_rate": 3.206380131030416e-05, "loss": 0.1946, "num_tokens": 42036247.0, "step": 428 }, { "epoch": 0.6671850699844479, "grad_norm": 0.25071748648700043, "learning_rate": 3.202617156941644e-05, "loss": 0.1942, "num_tokens": 42134551.0, "step": 429 }, { "epoch": 0.6687402799377916, "grad_norm": 0.26781667827247524, "learning_rate": 3.1988478192694636e-05, "loss": 0.1859, "num_tokens": 42232855.0, "step": 430 }, { "epoch": 0.6702954898911353, "grad_norm": 0.2396761537868174, "learning_rate": 3.195072141937758e-05, "loss": 0.1771, "num_tokens": 42331159.0, "step": 431 }, { "epoch": 0.671850699844479, "grad_norm": 0.24338504488836876, "learning_rate": 3.191290148910647e-05, "loss": 0.1861, "num_tokens": 42429463.0, "step": 432 }, { "epoch": 0.6734059097978227, "grad_norm": 0.24145186260190182, "learning_rate": 3.1875018641923355e-05, "loss": 0.1937, "num_tokens": 42527767.0, "step": 433 }, { "epoch": 0.6749611197511665, "grad_norm": 0.2383109827355437, "learning_rate": 3.183707311826962e-05, "loss": 0.1867, "num_tokens": 42626071.0, "step": 434 }, { "epoch": 0.6765163297045101, "grad_norm": 0.22133039719760006, "learning_rate": 3.179906515898446e-05, "loss": 0.1694, "num_tokens": 42724375.0, "step": 435 }, { "epoch": 0.6780715396578538, "grad_norm": 0.24213689973994762, "learning_rate": 3.1760995005303344e-05, "loss": 0.1884, "num_tokens": 42822679.0, "step": 436 }, { "epoch": 0.6796267496111975, "grad_norm": 0.2599241050590087, "learning_rate": 3.172286289885647e-05, "loss": 0.1931, "num_tokens": 42920983.0, "step": 437 }, { "epoch": 0.6811819595645412, "grad_norm": 0.23467341769809025, "learning_rate": 3.1684669081667285e-05, "loss": 0.1842, "num_tokens": 43019287.0, "step": 438 }, { "epoch": 0.6827371695178849, "grad_norm": 0.27332812726049455, "learning_rate": 3.164641379615088e-05, "loss": 0.1932, "num_tokens": 43117591.0, "step": 439 }, { "epoch": 0.6842923794712286, "grad_norm": 0.27235476055605734, "learning_rate": 3.160809728511251e-05, "loss": 0.2023, "num_tokens": 43215895.0, "step": 440 }, { "epoch": 0.6858475894245724, "grad_norm": 0.24464064373264757, "learning_rate": 3.1569719791746e-05, "loss": 0.1822, "num_tokens": 43314199.0, "step": 441 }, { "epoch": 0.687402799377916, "grad_norm": 0.24624971620139574, "learning_rate": 3.153128155963224e-05, "loss": 0.1976, "num_tokens": 43412503.0, "step": 442 }, { "epoch": 0.6889580093312597, "grad_norm": 0.2564286869094276, "learning_rate": 3.149278283273763e-05, "loss": 0.1914, "num_tokens": 43510807.0, "step": 443 }, { "epoch": 0.6905132192846034, "grad_norm": 0.2545554679716715, "learning_rate": 3.1454223855412526e-05, "loss": 0.2014, "num_tokens": 43609111.0, "step": 444 }, { "epoch": 0.6920684292379471, "grad_norm": 0.24780033233386622, "learning_rate": 3.1415604872389694e-05, "loss": 0.1965, "num_tokens": 43707415.0, "step": 445 }, { "epoch": 0.6936236391912908, "grad_norm": 0.26016725860631945, "learning_rate": 3.1376926128782746e-05, "loss": 0.1869, "num_tokens": 43805719.0, "step": 446 }, { "epoch": 0.6951788491446346, "grad_norm": 0.27084951285919684, "learning_rate": 3.133818787008461e-05, "loss": 0.1908, "num_tokens": 43904023.0, "step": 447 }, { "epoch": 0.6967340590979783, "grad_norm": 0.2535779808652896, "learning_rate": 3.1299390342165945e-05, "loss": 0.1908, "num_tokens": 44002327.0, "step": 448 }, { "epoch": 0.6982892690513219, "grad_norm": 0.2653525054494949, "learning_rate": 3.126053379127358e-05, "loss": 0.1945, "num_tokens": 44100631.0, "step": 449 }, { "epoch": 0.6998444790046656, "grad_norm": 0.2494314572532233, "learning_rate": 3.122161846402897e-05, "loss": 0.1932, "num_tokens": 44196051.0, "step": 450 }, { "epoch": 0.7013996889580093, "grad_norm": 0.24804146253106418, "learning_rate": 3.1182644607426634e-05, "loss": 0.1899, "num_tokens": 44294355.0, "step": 451 }, { "epoch": 0.702954898911353, "grad_norm": 0.29282635293074266, "learning_rate": 3.114361246883256e-05, "loss": 0.1989, "num_tokens": 44392659.0, "step": 452 }, { "epoch": 0.7045101088646968, "grad_norm": 0.26707496336783715, "learning_rate": 3.110452229598264e-05, "loss": 0.1971, "num_tokens": 44490963.0, "step": 453 }, { "epoch": 0.7060653188180405, "grad_norm": 0.267881156115659, "learning_rate": 3.106537433698113e-05, "loss": 0.2036, "num_tokens": 44589267.0, "step": 454 }, { "epoch": 0.7076205287713841, "grad_norm": 0.2471625715441626, "learning_rate": 3.102616884029905e-05, "loss": 0.1882, "num_tokens": 44687571.0, "step": 455 }, { "epoch": 0.7091757387247278, "grad_norm": 0.2566848156960226, "learning_rate": 3.098690605477261e-05, "loss": 0.2097, "num_tokens": 44785875.0, "step": 456 }, { "epoch": 0.7107309486780715, "grad_norm": 0.24317637732593742, "learning_rate": 3.0947586229601606e-05, "loss": 0.1908, "num_tokens": 44884179.0, "step": 457 }, { "epoch": 0.7122861586314152, "grad_norm": 0.2402846178623033, "learning_rate": 3.09082096143479e-05, "loss": 0.1757, "num_tokens": 44982483.0, "step": 458 }, { "epoch": 0.713841368584759, "grad_norm": 0.26506382868800105, "learning_rate": 3.086877645893377e-05, "loss": 0.1918, "num_tokens": 45080787.0, "step": 459 }, { "epoch": 0.7153965785381027, "grad_norm": 0.2728582092358834, "learning_rate": 3.082928701364038e-05, "loss": 0.1957, "num_tokens": 45179091.0, "step": 460 }, { "epoch": 0.7169517884914464, "grad_norm": 0.27238167039867445, "learning_rate": 3.0789741529106123e-05, "loss": 0.1907, "num_tokens": 45277395.0, "step": 461 }, { "epoch": 0.71850699844479, "grad_norm": 0.284755503888087, "learning_rate": 3.075014025632512e-05, "loss": 0.2009, "num_tokens": 45375699.0, "step": 462 }, { "epoch": 0.7200622083981337, "grad_norm": 0.25861641975886623, "learning_rate": 3.0710483446645545e-05, "loss": 0.1861, "num_tokens": 45474003.0, "step": 463 }, { "epoch": 0.7216174183514774, "grad_norm": 0.2479692749206761, "learning_rate": 3.067077135176808e-05, "loss": 0.2014, "num_tokens": 45572307.0, "step": 464 }, { "epoch": 0.7231726283048211, "grad_norm": 0.24610427658734063, "learning_rate": 3.063100422374429e-05, "loss": 0.1902, "num_tokens": 45670611.0, "step": 465 }, { "epoch": 0.7247278382581649, "grad_norm": 0.23077187492537235, "learning_rate": 3.0591182314975046e-05, "loss": 0.185, "num_tokens": 45768915.0, "step": 466 }, { "epoch": 0.7262830482115086, "grad_norm": 0.25820714596942584, "learning_rate": 3.055130587820889e-05, "loss": 0.2093, "num_tokens": 45867219.0, "step": 467 }, { "epoch": 0.7278382581648523, "grad_norm": 0.24850244318937914, "learning_rate": 3.0511375166540473e-05, "loss": 0.1998, "num_tokens": 45965523.0, "step": 468 }, { "epoch": 0.7293934681181959, "grad_norm": 0.24986192439679675, "learning_rate": 3.047139043340892e-05, "loss": 0.1918, "num_tokens": 46063827.0, "step": 469 }, { "epoch": 0.7309486780715396, "grad_norm": 0.2360976740882969, "learning_rate": 3.043135193259623e-05, "loss": 0.1887, "num_tokens": 46162131.0, "step": 470 }, { "epoch": 0.7325038880248833, "grad_norm": 0.24351703755970078, "learning_rate": 3.0391259918225646e-05, "loss": 0.1928, "num_tokens": 46260435.0, "step": 471 }, { "epoch": 0.7340590979782271, "grad_norm": 0.4681980260670316, "learning_rate": 3.03511146447601e-05, "loss": 0.1924, "num_tokens": 46358739.0, "step": 472 }, { "epoch": 0.7356143079315708, "grad_norm": 0.2537518317864699, "learning_rate": 3.031091636700052e-05, "loss": 0.186, "num_tokens": 46457043.0, "step": 473 }, { "epoch": 0.7371695178849145, "grad_norm": 0.2540602189958833, "learning_rate": 3.0270665340084266e-05, "loss": 0.185, "num_tokens": 46555347.0, "step": 474 }, { "epoch": 0.7387247278382582, "grad_norm": 0.24777243964362722, "learning_rate": 3.0230361819483497e-05, "loss": 0.2049, "num_tokens": 46653651.0, "step": 475 }, { "epoch": 0.7402799377916018, "grad_norm": 0.27624374863243845, "learning_rate": 3.0190006061003544e-05, "loss": 0.1904, "num_tokens": 46751955.0, "step": 476 }, { "epoch": 0.7418351477449455, "grad_norm": 0.25181931073897273, "learning_rate": 3.014959832078129e-05, "loss": 0.1906, "num_tokens": 46850259.0, "step": 477 }, { "epoch": 0.7433903576982893, "grad_norm": 0.2474586033873101, "learning_rate": 3.0109138855283543e-05, "loss": 0.1872, "num_tokens": 46948563.0, "step": 478 }, { "epoch": 0.744945567651633, "grad_norm": 0.23482212802052954, "learning_rate": 3.0068627921305427e-05, "loss": 0.18, "num_tokens": 47046867.0, "step": 479 }, { "epoch": 0.7465007776049767, "grad_norm": 0.25399985536850495, "learning_rate": 3.002806577596871e-05, "loss": 0.1912, "num_tokens": 47145171.0, "step": 480 }, { "epoch": 0.7480559875583204, "grad_norm": 0.25775244490512805, "learning_rate": 2.998745267672021e-05, "loss": 0.1865, "num_tokens": 47243475.0, "step": 481 }, { "epoch": 0.749611197511664, "grad_norm": 0.24729295965356832, "learning_rate": 2.9946788881330145e-05, "loss": 0.1819, "num_tokens": 47341779.0, "step": 482 }, { "epoch": 0.7511664074650077, "grad_norm": 0.2633974211642851, "learning_rate": 2.9906074647890508e-05, "loss": 0.1976, "num_tokens": 47440083.0, "step": 483 }, { "epoch": 0.7527216174183515, "grad_norm": 0.24579641639829808, "learning_rate": 2.986531023481341e-05, "loss": 0.1865, "num_tokens": 47538387.0, "step": 484 }, { "epoch": 0.7542768273716952, "grad_norm": 0.27525400974385933, "learning_rate": 2.9824495900829445e-05, "loss": 0.1877, "num_tokens": 47636691.0, "step": 485 }, { "epoch": 0.7558320373250389, "grad_norm": 0.25509018604801825, "learning_rate": 2.978363190498608e-05, "loss": 0.1955, "num_tokens": 47734995.0, "step": 486 }, { "epoch": 0.7573872472783826, "grad_norm": 0.25018980195848045, "learning_rate": 2.9742718506645962e-05, "loss": 0.184, "num_tokens": 47833299.0, "step": 487 }, { "epoch": 0.7589424572317263, "grad_norm": 0.2763574328911098, "learning_rate": 2.9701755965485302e-05, "loss": 0.197, "num_tokens": 47931603.0, "step": 488 }, { "epoch": 0.76049766718507, "grad_norm": 0.2554349702863663, "learning_rate": 2.966074454149221e-05, "loss": 0.1876, "num_tokens": 48029907.0, "step": 489 }, { "epoch": 0.7620528771384136, "grad_norm": 0.33903714455160083, "learning_rate": 2.9619684494965068e-05, "loss": 0.2013, "num_tokens": 48128211.0, "step": 490 }, { "epoch": 0.7636080870917574, "grad_norm": 0.2800310989582483, "learning_rate": 2.9578576086510862e-05, "loss": 0.1884, "num_tokens": 48226515.0, "step": 491 }, { "epoch": 0.7651632970451011, "grad_norm": 0.27137385367029726, "learning_rate": 2.9537419577043514e-05, "loss": 0.1827, "num_tokens": 48324819.0, "step": 492 }, { "epoch": 0.7667185069984448, "grad_norm": 0.2604977486718509, "learning_rate": 2.949621522778227e-05, "loss": 0.1962, "num_tokens": 48423123.0, "step": 493 }, { "epoch": 0.7682737169517885, "grad_norm": 0.2905928278276323, "learning_rate": 2.9454963300249968e-05, "loss": 0.1769, "num_tokens": 48521427.0, "step": 494 }, { "epoch": 0.7698289269051322, "grad_norm": 0.24606855080144002, "learning_rate": 2.941366405627148e-05, "loss": 0.184, "num_tokens": 48619731.0, "step": 495 }, { "epoch": 0.7713841368584758, "grad_norm": 0.24226052098276832, "learning_rate": 2.937231775797196e-05, "loss": 0.1835, "num_tokens": 48718035.0, "step": 496 }, { "epoch": 0.7729393468118196, "grad_norm": 0.25306866774186176, "learning_rate": 2.9330924667775215e-05, "loss": 0.1918, "num_tokens": 48816339.0, "step": 497 }, { "epoch": 0.7744945567651633, "grad_norm": 0.25112572906214087, "learning_rate": 2.928948504840205e-05, "loss": 0.1849, "num_tokens": 48914643.0, "step": 498 }, { "epoch": 0.776049766718507, "grad_norm": 0.2466031613844019, "learning_rate": 2.9247999162868592e-05, "loss": 0.1872, "num_tokens": 49012947.0, "step": 499 }, { "epoch": 0.7776049766718507, "grad_norm": 0.25608198444540126, "learning_rate": 2.9206467274484596e-05, "loss": 0.1914, "num_tokens": 49105442.0, "step": 500 }, { "epoch": 0.7791601866251944, "grad_norm": 0.24737275844072482, "learning_rate": 2.9164889646851814e-05, "loss": 0.1744, "num_tokens": 49203746.0, "step": 501 }, { "epoch": 0.7807153965785381, "grad_norm": 0.2448934533905562, "learning_rate": 2.9123266543862308e-05, "loss": 0.1944, "num_tokens": 49302050.0, "step": 502 }, { "epoch": 0.7822706065318819, "grad_norm": 0.24120870382564183, "learning_rate": 2.9081598229696762e-05, "loss": 0.1938, "num_tokens": 49400354.0, "step": 503 }, { "epoch": 0.7838258164852255, "grad_norm": 0.23140998584921263, "learning_rate": 2.903988496882281e-05, "loss": 0.1861, "num_tokens": 49498658.0, "step": 504 }, { "epoch": 0.7853810264385692, "grad_norm": 0.23929251102143806, "learning_rate": 2.899812702599337e-05, "loss": 0.1894, "num_tokens": 49596962.0, "step": 505 }, { "epoch": 0.7869362363919129, "grad_norm": 0.24702258221390186, "learning_rate": 2.8956324666244954e-05, "loss": 0.1988, "num_tokens": 49695266.0, "step": 506 }, { "epoch": 0.7884914463452566, "grad_norm": 0.2476685144925104, "learning_rate": 2.891447815489598e-05, "loss": 0.1973, "num_tokens": 49793570.0, "step": 507 }, { "epoch": 0.7900466562986003, "grad_norm": 0.24625144911639574, "learning_rate": 2.8872587757545102e-05, "loss": 0.1879, "num_tokens": 49891874.0, "step": 508 }, { "epoch": 0.7916018662519441, "grad_norm": 0.23235402819806908, "learning_rate": 2.8830653740069518e-05, "loss": 0.1924, "num_tokens": 49990178.0, "step": 509 }, { "epoch": 0.7931570762052877, "grad_norm": 0.2585133874918373, "learning_rate": 2.8788676368623276e-05, "loss": 0.1946, "num_tokens": 50088482.0, "step": 510 }, { "epoch": 0.7947122861586314, "grad_norm": 0.2683691798928568, "learning_rate": 2.874665590963559e-05, "loss": 0.1901, "num_tokens": 50186786.0, "step": 511 }, { "epoch": 0.7962674961119751, "grad_norm": 0.2714133610676743, "learning_rate": 2.8704592629809164e-05, "loss": 0.1916, "num_tokens": 50285090.0, "step": 512 }, { "epoch": 0.7978227060653188, "grad_norm": 0.25653434213941745, "learning_rate": 2.866248679611846e-05, "loss": 0.1822, "num_tokens": 50383394.0, "step": 513 }, { "epoch": 0.7993779160186625, "grad_norm": 0.22513281157136983, "learning_rate": 2.8620338675808047e-05, "loss": 0.1791, "num_tokens": 50481698.0, "step": 514 }, { "epoch": 0.8009331259720062, "grad_norm": 0.23599073062900047, "learning_rate": 2.857814853639089e-05, "loss": 0.1768, "num_tokens": 50580002.0, "step": 515 }, { "epoch": 0.80248833592535, "grad_norm": 0.24660063902249849, "learning_rate": 2.8535916645646625e-05, "loss": 0.1979, "num_tokens": 50678306.0, "step": 516 }, { "epoch": 0.8040435458786936, "grad_norm": 0.24182261632595906, "learning_rate": 2.84936432716199e-05, "loss": 0.1938, "num_tokens": 50776610.0, "step": 517 }, { "epoch": 0.8055987558320373, "grad_norm": 0.2532248174191202, "learning_rate": 2.8451328682618658e-05, "loss": 0.1856, "num_tokens": 50874914.0, "step": 518 }, { "epoch": 0.807153965785381, "grad_norm": 0.2632756160552327, "learning_rate": 2.8408973147212417e-05, "loss": 0.188, "num_tokens": 50973218.0, "step": 519 }, { "epoch": 0.8087091757387247, "grad_norm": 0.24574709460605068, "learning_rate": 2.8366576934230603e-05, "loss": 0.1916, "num_tokens": 51071522.0, "step": 520 }, { "epoch": 0.8102643856920684, "grad_norm": 0.23571786575465006, "learning_rate": 2.83241403127608e-05, "loss": 0.1833, "num_tokens": 51169826.0, "step": 521 }, { "epoch": 0.8118195956454122, "grad_norm": 0.2457635784401683, "learning_rate": 2.8281663552147066e-05, "loss": 0.1776, "num_tokens": 51268130.0, "step": 522 }, { "epoch": 0.8133748055987559, "grad_norm": 0.2570631240854664, "learning_rate": 2.8239146921988246e-05, "loss": 0.185, "num_tokens": 51366434.0, "step": 523 }, { "epoch": 0.8149300155520995, "grad_norm": 0.24552097835452405, "learning_rate": 2.8196590692136215e-05, "loss": 0.1839, "num_tokens": 51464738.0, "step": 524 }, { "epoch": 0.8164852255054432, "grad_norm": 0.2545813029448225, "learning_rate": 2.8153995132694177e-05, "loss": 0.1883, "num_tokens": 51563042.0, "step": 525 }, { "epoch": 0.8180404354587869, "grad_norm": 0.25034743305200285, "learning_rate": 2.8111360514014965e-05, "loss": 0.1839, "num_tokens": 51661346.0, "step": 526 }, { "epoch": 0.8195956454121306, "grad_norm": 0.2977311169431757, "learning_rate": 2.8068687106699353e-05, "loss": 0.1999, "num_tokens": 51759650.0, "step": 527 }, { "epoch": 0.8211508553654744, "grad_norm": 0.2537978398328302, "learning_rate": 2.8025975181594252e-05, "loss": 0.1792, "num_tokens": 51857954.0, "step": 528 }, { "epoch": 0.8227060653188181, "grad_norm": 0.2551615313726019, "learning_rate": 2.7983225009791093e-05, "loss": 0.1883, "num_tokens": 51956258.0, "step": 529 }, { "epoch": 0.8242612752721618, "grad_norm": 0.24532955999802808, "learning_rate": 2.794043686262402e-05, "loss": 0.1897, "num_tokens": 52054562.0, "step": 530 }, { "epoch": 0.8258164852255054, "grad_norm": 0.2430685687502802, "learning_rate": 2.7897611011668227e-05, "loss": 0.1861, "num_tokens": 52152866.0, "step": 531 }, { "epoch": 0.8273716951788491, "grad_norm": 0.2867599917926755, "learning_rate": 2.785474772873821e-05, "loss": 0.1842, "num_tokens": 52251170.0, "step": 532 }, { "epoch": 0.8289269051321928, "grad_norm": 0.25465189181614767, "learning_rate": 2.7811847285886036e-05, "loss": 0.1886, "num_tokens": 52349474.0, "step": 533 }, { "epoch": 0.8304821150855366, "grad_norm": 0.2500223226771342, "learning_rate": 2.7768909955399634e-05, "loss": 0.2014, "num_tokens": 52447778.0, "step": 534 }, { "epoch": 0.8320373250388803, "grad_norm": 0.2522055983028013, "learning_rate": 2.7725936009801056e-05, "loss": 0.1885, "num_tokens": 52546082.0, "step": 535 }, { "epoch": 0.833592534992224, "grad_norm": 0.24365522631619138, "learning_rate": 2.7682925721844755e-05, "loss": 0.1914, "num_tokens": 52644386.0, "step": 536 }, { "epoch": 0.8351477449455676, "grad_norm": 0.25725437195323503, "learning_rate": 2.763987936451582e-05, "loss": 0.1853, "num_tokens": 52742690.0, "step": 537 }, { "epoch": 0.8367029548989113, "grad_norm": 0.2659315325527136, "learning_rate": 2.7596797211028316e-05, "loss": 0.1919, "num_tokens": 52840994.0, "step": 538 }, { "epoch": 0.838258164852255, "grad_norm": 0.2648446227879442, "learning_rate": 2.7553679534823467e-05, "loss": 0.1863, "num_tokens": 52939298.0, "step": 539 }, { "epoch": 0.8398133748055988, "grad_norm": 0.24041574300289495, "learning_rate": 2.7510526609567977e-05, "loss": 0.1889, "num_tokens": 53037602.0, "step": 540 }, { "epoch": 0.8413685847589425, "grad_norm": 0.2472188589644515, "learning_rate": 2.7467338709152275e-05, "loss": 0.1852, "num_tokens": 53135906.0, "step": 541 }, { "epoch": 0.8429237947122862, "grad_norm": 0.2604465188417931, "learning_rate": 2.7424116107688765e-05, "loss": 0.181, "num_tokens": 53234210.0, "step": 542 }, { "epoch": 0.8444790046656299, "grad_norm": 0.2610722379614401, "learning_rate": 2.738085907951011e-05, "loss": 0.1894, "num_tokens": 53332514.0, "step": 543 }, { "epoch": 0.8460342146189735, "grad_norm": 0.2461138923019881, "learning_rate": 2.733756789916747e-05, "loss": 0.1866, "num_tokens": 53430818.0, "step": 544 }, { "epoch": 0.8475894245723172, "grad_norm": 0.24449293437827638, "learning_rate": 2.7294242841428773e-05, "loss": 0.1874, "num_tokens": 53529122.0, "step": 545 }, { "epoch": 0.8491446345256609, "grad_norm": 0.23771234598603178, "learning_rate": 2.7250884181276963e-05, "loss": 0.1754, "num_tokens": 53627426.0, "step": 546 }, { "epoch": 0.8506998444790047, "grad_norm": 0.23596754483571186, "learning_rate": 2.720749219390826e-05, "loss": 0.1904, "num_tokens": 53725730.0, "step": 547 }, { "epoch": 0.8522550544323484, "grad_norm": 0.2584518270103045, "learning_rate": 2.716406715473041e-05, "loss": 0.1935, "num_tokens": 53824034.0, "step": 548 }, { "epoch": 0.8538102643856921, "grad_norm": 0.2377104323859995, "learning_rate": 2.7120609339360932e-05, "loss": 0.1804, "num_tokens": 53922338.0, "step": 549 }, { "epoch": 0.8553654743390358, "grad_norm": 0.26876785559623406, "learning_rate": 2.707711902362539e-05, "loss": 0.1847, "num_tokens": 54020483.0, "step": 550 }, { "epoch": 0.8569206842923794, "grad_norm": 0.24641637149980203, "learning_rate": 2.703359648355561e-05, "loss": 0.1827, "num_tokens": 54118787.0, "step": 551 }, { "epoch": 0.8584758942457231, "grad_norm": 0.2581413902462616, "learning_rate": 2.699004199538795e-05, "loss": 0.2034, "num_tokens": 54217091.0, "step": 552 }, { "epoch": 0.8600311041990669, "grad_norm": 0.2352111604704282, "learning_rate": 2.694645583556155e-05, "loss": 0.174, "num_tokens": 54315395.0, "step": 553 }, { "epoch": 0.8615863141524106, "grad_norm": 0.22876867091664588, "learning_rate": 2.6902838280716558e-05, "loss": 0.179, "num_tokens": 54413699.0, "step": 554 }, { "epoch": 0.8631415241057543, "grad_norm": 0.24697473335758263, "learning_rate": 2.6859189607692396e-05, "loss": 0.188, "num_tokens": 54512003.0, "step": 555 }, { "epoch": 0.864696734059098, "grad_norm": 0.23725709825256255, "learning_rate": 2.681551009352598e-05, "loss": 0.1852, "num_tokens": 54610307.0, "step": 556 }, { "epoch": 0.8662519440124417, "grad_norm": 0.245318895973701, "learning_rate": 2.6771800015449976e-05, "loss": 0.1887, "num_tokens": 54708611.0, "step": 557 }, { "epoch": 0.8678071539657853, "grad_norm": 0.23776770438374994, "learning_rate": 2.6728059650891057e-05, "loss": 0.192, "num_tokens": 54806915.0, "step": 558 }, { "epoch": 0.8693623639191291, "grad_norm": 0.23504769850002405, "learning_rate": 2.6684289277468093e-05, "loss": 0.181, "num_tokens": 54905219.0, "step": 559 }, { "epoch": 0.8709175738724728, "grad_norm": 0.2594777902070823, "learning_rate": 2.6640489172990457e-05, "loss": 0.1776, "num_tokens": 55003523.0, "step": 560 }, { "epoch": 0.8724727838258165, "grad_norm": 0.2534712982466307, "learning_rate": 2.6596659615456176e-05, "loss": 0.2059, "num_tokens": 55101827.0, "step": 561 }, { "epoch": 0.8740279937791602, "grad_norm": 0.24876607814548288, "learning_rate": 2.6552800883050253e-05, "loss": 0.1971, "num_tokens": 55200131.0, "step": 562 }, { "epoch": 0.8755832037325039, "grad_norm": 0.26342383050391777, "learning_rate": 2.650891325414286e-05, "loss": 0.1992, "num_tokens": 55298435.0, "step": 563 }, { "epoch": 0.8771384136858476, "grad_norm": 0.24370699301078524, "learning_rate": 2.6464997007287556e-05, "loss": 0.1831, "num_tokens": 55396739.0, "step": 564 }, { "epoch": 0.8786936236391913, "grad_norm": 0.24250749320935744, "learning_rate": 2.642105242121955e-05, "loss": 0.1848, "num_tokens": 55495043.0, "step": 565 }, { "epoch": 0.880248833592535, "grad_norm": 0.2474808263726014, "learning_rate": 2.6377079774853903e-05, "loss": 0.1855, "num_tokens": 55593347.0, "step": 566 }, { "epoch": 0.8818040435458787, "grad_norm": 0.2485277304425725, "learning_rate": 2.633307934728382e-05, "loss": 0.1849, "num_tokens": 55691651.0, "step": 567 }, { "epoch": 0.8833592534992224, "grad_norm": 0.27725439980777616, "learning_rate": 2.628905141777877e-05, "loss": 0.1971, "num_tokens": 55789955.0, "step": 568 }, { "epoch": 0.8849144634525661, "grad_norm": 0.24224774752907519, "learning_rate": 2.624499626578281e-05, "loss": 0.1867, "num_tokens": 55888259.0, "step": 569 }, { "epoch": 0.8864696734059098, "grad_norm": 0.25308621593147057, "learning_rate": 2.620091417091278e-05, "loss": 0.1838, "num_tokens": 55986563.0, "step": 570 }, { "epoch": 0.8880248833592534, "grad_norm": 0.8781894188002322, "learning_rate": 2.6156805412956518e-05, "loss": 0.2014, "num_tokens": 56084867.0, "step": 571 }, { "epoch": 0.8895800933125972, "grad_norm": 0.23755372287433352, "learning_rate": 2.6112670271871094e-05, "loss": 0.1785, "num_tokens": 56183171.0, "step": 572 }, { "epoch": 0.8911353032659409, "grad_norm": 0.25129578641198524, "learning_rate": 2.6068509027781028e-05, "loss": 0.1811, "num_tokens": 56281475.0, "step": 573 }, { "epoch": 0.8926905132192846, "grad_norm": 0.24505038750490782, "learning_rate": 2.6024321960976512e-05, "loss": 0.1857, "num_tokens": 56379779.0, "step": 574 }, { "epoch": 0.8942457231726283, "grad_norm": 0.2708900842557107, "learning_rate": 2.5980109351911646e-05, "loss": 0.1917, "num_tokens": 56478083.0, "step": 575 }, { "epoch": 0.895800933125972, "grad_norm": 0.2546478787072239, "learning_rate": 2.5935871481202634e-05, "loss": 0.1733, "num_tokens": 56576387.0, "step": 576 }, { "epoch": 0.8973561430793157, "grad_norm": 0.28270332309769747, "learning_rate": 2.589160862962602e-05, "loss": 0.2004, "num_tokens": 56674691.0, "step": 577 }, { "epoch": 0.8989113530326595, "grad_norm": 0.2426638450450087, "learning_rate": 2.58473210781169e-05, "loss": 0.1808, "num_tokens": 56772995.0, "step": 578 }, { "epoch": 0.9004665629860031, "grad_norm": 0.26316788657495793, "learning_rate": 2.580300910776715e-05, "loss": 0.1953, "num_tokens": 56871299.0, "step": 579 }, { "epoch": 0.9020217729393468, "grad_norm": 0.25397816123683553, "learning_rate": 2.5758672999823608e-05, "loss": 0.2074, "num_tokens": 56969603.0, "step": 580 }, { "epoch": 0.9035769828926905, "grad_norm": 0.23566103364734284, "learning_rate": 2.5714313035686333e-05, "loss": 0.1843, "num_tokens": 57067907.0, "step": 581 }, { "epoch": 0.9051321928460342, "grad_norm": 0.24299493192788618, "learning_rate": 2.5669929496906792e-05, "loss": 0.1831, "num_tokens": 57166211.0, "step": 582 }, { "epoch": 0.9066874027993779, "grad_norm": 0.2478586706024622, "learning_rate": 2.5625522665186084e-05, "loss": 0.1848, "num_tokens": 57264515.0, "step": 583 }, { "epoch": 0.9082426127527217, "grad_norm": 0.23906930720439762, "learning_rate": 2.558109282237314e-05, "loss": 0.1775, "num_tokens": 57362819.0, "step": 584 }, { "epoch": 0.9097978227060654, "grad_norm": 0.248211813376106, "learning_rate": 2.5536640250462945e-05, "loss": 0.1865, "num_tokens": 57461123.0, "step": 585 }, { "epoch": 0.911353032659409, "grad_norm": 0.2265577459874205, "learning_rate": 2.549216523159474e-05, "loss": 0.1749, "num_tokens": 57559427.0, "step": 586 }, { "epoch": 0.9129082426127527, "grad_norm": 0.2462401683124997, "learning_rate": 2.5447668048050257e-05, "loss": 0.1714, "num_tokens": 57657731.0, "step": 587 }, { "epoch": 0.9144634525660964, "grad_norm": 0.24896528714646432, "learning_rate": 2.5403148982251873e-05, "loss": 0.1798, "num_tokens": 57756035.0, "step": 588 }, { "epoch": 0.9160186625194401, "grad_norm": 0.2724868491773357, "learning_rate": 2.5358608316760886e-05, "loss": 0.1769, "num_tokens": 57854339.0, "step": 589 }, { "epoch": 0.9175738724727839, "grad_norm": 0.2755560259757669, "learning_rate": 2.531404633427565e-05, "loss": 0.1772, "num_tokens": 57952643.0, "step": 590 }, { "epoch": 0.9191290824261276, "grad_norm": 0.299596022417472, "learning_rate": 2.526946331762986e-05, "loss": 0.1751, "num_tokens": 58050947.0, "step": 591 }, { "epoch": 0.9206842923794712, "grad_norm": 0.24280402300981693, "learning_rate": 2.5224859549790672e-05, "loss": 0.1753, "num_tokens": 58149251.0, "step": 592 }, { "epoch": 0.9222395023328149, "grad_norm": 0.27877284576562394, "learning_rate": 2.5180235313856982e-05, "loss": 0.1767, "num_tokens": 58247555.0, "step": 593 }, { "epoch": 0.9237947122861586, "grad_norm": 0.235923499123493, "learning_rate": 2.513559089305758e-05, "loss": 0.1769, "num_tokens": 58345859.0, "step": 594 }, { "epoch": 0.9253499222395023, "grad_norm": 0.2475940357829114, "learning_rate": 2.5090926570749362e-05, "loss": 0.1801, "num_tokens": 58444163.0, "step": 595 }, { "epoch": 0.926905132192846, "grad_norm": 0.24145836330116133, "learning_rate": 2.504624263041557e-05, "loss": 0.189, "num_tokens": 58542467.0, "step": 596 }, { "epoch": 0.9284603421461898, "grad_norm": 0.24466159765364295, "learning_rate": 2.5001539355663915e-05, "loss": 0.1781, "num_tokens": 58640771.0, "step": 597 }, { "epoch": 0.9300155520995335, "grad_norm": 0.25958498192370033, "learning_rate": 2.495681703022486e-05, "loss": 0.1898, "num_tokens": 58739075.0, "step": 598 }, { "epoch": 0.9315707620528771, "grad_norm": 0.24123687838135593, "learning_rate": 2.491207593794977e-05, "loss": 0.1907, "num_tokens": 58837379.0, "step": 599 }, { "epoch": 0.9331259720062208, "grad_norm": 0.4572625877547282, "learning_rate": 2.4867316362809116e-05, "loss": 0.1828, "num_tokens": 58932707.0, "step": 600 }, { "epoch": 0.9346811819595645, "grad_norm": 0.2653398470981905, "learning_rate": 2.4822538588890694e-05, "loss": 0.1924, "num_tokens": 59031011.0, "step": 601 }, { "epoch": 0.9362363919129082, "grad_norm": 0.2528321687772901, "learning_rate": 2.4777742900397785e-05, "loss": 0.1841, "num_tokens": 59129315.0, "step": 602 }, { "epoch": 0.937791601866252, "grad_norm": 0.3001791769011213, "learning_rate": 2.4732929581647406e-05, "loss": 0.1954, "num_tokens": 59227619.0, "step": 603 }, { "epoch": 0.9393468118195957, "grad_norm": 0.2278857500430177, "learning_rate": 2.4688098917068436e-05, "loss": 0.1875, "num_tokens": 59325923.0, "step": 604 }, { "epoch": 0.9409020217729394, "grad_norm": 0.2742004664642663, "learning_rate": 2.4643251191199874e-05, "loss": 0.1843, "num_tokens": 59424227.0, "step": 605 }, { "epoch": 0.942457231726283, "grad_norm": 0.26877408417000354, "learning_rate": 2.4598386688688994e-05, "loss": 0.195, "num_tokens": 59522531.0, "step": 606 }, { "epoch": 0.9440124416796267, "grad_norm": 0.2310597265685783, "learning_rate": 2.4553505694289558e-05, "loss": 0.1821, "num_tokens": 59620835.0, "step": 607 }, { "epoch": 0.9455676516329704, "grad_norm": 0.27327746685313076, "learning_rate": 2.450860849286e-05, "loss": 0.1815, "num_tokens": 59719139.0, "step": 608 }, { "epoch": 0.9471228615863142, "grad_norm": 0.25212673757168175, "learning_rate": 2.446369536936161e-05, "loss": 0.1749, "num_tokens": 59817443.0, "step": 609 }, { "epoch": 0.9486780715396579, "grad_norm": 0.23773554989536358, "learning_rate": 2.4418766608856747e-05, "loss": 0.1763, "num_tokens": 59915747.0, "step": 610 }, { "epoch": 0.9502332814930016, "grad_norm": 0.24384001575398828, "learning_rate": 2.4373822496507014e-05, "loss": 0.1857, "num_tokens": 60014051.0, "step": 611 }, { "epoch": 0.9517884914463453, "grad_norm": 0.2549931843169707, "learning_rate": 2.4328863317571448e-05, "loss": 0.1798, "num_tokens": 60112355.0, "step": 612 }, { "epoch": 0.9533437013996889, "grad_norm": 0.23525760673277274, "learning_rate": 2.4283889357404718e-05, "loss": 0.1912, "num_tokens": 60210659.0, "step": 613 }, { "epoch": 0.9548989113530326, "grad_norm": 0.23997095825227063, "learning_rate": 2.4238900901455305e-05, "loss": 0.1721, "num_tokens": 60308963.0, "step": 614 }, { "epoch": 0.9564541213063764, "grad_norm": 0.2235324918384866, "learning_rate": 2.4193898235263703e-05, "loss": 0.1823, "num_tokens": 60407267.0, "step": 615 }, { "epoch": 0.9580093312597201, "grad_norm": 0.22180914216523645, "learning_rate": 2.4148881644460583e-05, "loss": 0.1785, "num_tokens": 60505571.0, "step": 616 }, { "epoch": 0.9595645412130638, "grad_norm": 0.24863367215106144, "learning_rate": 2.4103851414765008e-05, "loss": 0.1821, "num_tokens": 60603875.0, "step": 617 }, { "epoch": 0.9611197511664075, "grad_norm": 0.2445394118995308, "learning_rate": 2.4058807831982605e-05, "loss": 0.1764, "num_tokens": 60702179.0, "step": 618 }, { "epoch": 0.9626749611197511, "grad_norm": 0.23798681870972097, "learning_rate": 2.4013751182003744e-05, "loss": 0.1902, "num_tokens": 60800483.0, "step": 619 }, { "epoch": 0.9642301710730948, "grad_norm": 0.23847804737135286, "learning_rate": 2.3968681750801745e-05, "loss": 0.18, "num_tokens": 60898787.0, "step": 620 }, { "epoch": 0.9657853810264385, "grad_norm": 0.4447312919928524, "learning_rate": 2.3923599824431035e-05, "loss": 0.1898, "num_tokens": 60997091.0, "step": 621 }, { "epoch": 0.9673405909797823, "grad_norm": 0.24223591009603138, "learning_rate": 2.3878505689025363e-05, "loss": 0.1912, "num_tokens": 61095395.0, "step": 622 }, { "epoch": 0.968895800933126, "grad_norm": 0.23486631567518862, "learning_rate": 2.3833399630795958e-05, "loss": 0.1888, "num_tokens": 61193699.0, "step": 623 }, { "epoch": 0.9704510108864697, "grad_norm": 0.2934882983932006, "learning_rate": 2.3788281936029725e-05, "loss": 0.1798, "num_tokens": 61292003.0, "step": 624 }, { "epoch": 0.9720062208398134, "grad_norm": 0.2440619543227725, "learning_rate": 2.374315289108743e-05, "loss": 0.176, "num_tokens": 61390307.0, "step": 625 }, { "epoch": 0.973561430793157, "grad_norm": 0.2422383405970723, "learning_rate": 2.3698012782401865e-05, "loss": 0.1964, "num_tokens": 61488611.0, "step": 626 }, { "epoch": 0.9751166407465007, "grad_norm": 0.2329743577594852, "learning_rate": 2.3652861896476075e-05, "loss": 0.1828, "num_tokens": 61586915.0, "step": 627 }, { "epoch": 0.9766718506998445, "grad_norm": 0.27160868417601397, "learning_rate": 2.360770051988146e-05, "loss": 0.1814, "num_tokens": 61685219.0, "step": 628 }, { "epoch": 0.9782270606531882, "grad_norm": 0.24904038641250523, "learning_rate": 2.3562528939256048e-05, "loss": 0.1803, "num_tokens": 61783523.0, "step": 629 }, { "epoch": 0.9797822706065319, "grad_norm": 0.24131446400760403, "learning_rate": 2.3517347441302608e-05, "loss": 0.1773, "num_tokens": 61881827.0, "step": 630 }, { "epoch": 0.9813374805598756, "grad_norm": 0.25578943905615514, "learning_rate": 2.3472156312786857e-05, "loss": 0.1765, "num_tokens": 61980131.0, "step": 631 }, { "epoch": 0.9828926905132193, "grad_norm": 0.22911184397944495, "learning_rate": 2.3426955840535647e-05, "loss": 0.1807, "num_tokens": 62078435.0, "step": 632 }, { "epoch": 0.9844479004665629, "grad_norm": 0.25770543649409394, "learning_rate": 2.3381746311435122e-05, "loss": 0.1872, "num_tokens": 62176739.0, "step": 633 }, { "epoch": 0.9860031104199067, "grad_norm": 0.2567417172345955, "learning_rate": 2.3336528012428917e-05, "loss": 0.1873, "num_tokens": 62275043.0, "step": 634 }, { "epoch": 0.9875583203732504, "grad_norm": 0.23265754180848378, "learning_rate": 2.3291301230516325e-05, "loss": 0.1751, "num_tokens": 62373347.0, "step": 635 }, { "epoch": 0.9891135303265941, "grad_norm": 0.2425187955016214, "learning_rate": 2.3246066252750482e-05, "loss": 0.1853, "num_tokens": 62471651.0, "step": 636 }, { "epoch": 0.9906687402799378, "grad_norm": 0.3166509211508891, "learning_rate": 2.3200823366236548e-05, "loss": 0.1811, "num_tokens": 62569955.0, "step": 637 }, { "epoch": 0.9922239502332815, "grad_norm": 0.22530225549274288, "learning_rate": 2.315557285812986e-05, "loss": 0.1805, "num_tokens": 62668259.0, "step": 638 }, { "epoch": 0.9937791601866252, "grad_norm": 0.24079649595203478, "learning_rate": 2.3110315015634157e-05, "loss": 0.1933, "num_tokens": 62766563.0, "step": 639 }, { "epoch": 0.995334370139969, "grad_norm": 0.23534877986282188, "learning_rate": 2.3065050125999712e-05, "loss": 0.1875, "num_tokens": 62864867.0, "step": 640 }, { "epoch": 0.9968895800933126, "grad_norm": 0.2486401307791681, "learning_rate": 2.3019778476521535e-05, "loss": 0.1794, "num_tokens": 62963171.0, "step": 641 }, { "epoch": 0.9984447900466563, "grad_norm": 0.26374898153263554, "learning_rate": 2.297450035453752e-05, "loss": 0.1807, "num_tokens": 63061475.0, "step": 642 }, { "epoch": 1.0, "grad_norm": 0.2772096266812356, "learning_rate": 2.2929216047426667e-05, "loss": 0.1892, "num_tokens": 63148436.0, "step": 643 }, { "epoch": 1.0015552099533438, "grad_norm": 0.3514750862111504, "learning_rate": 2.2883925842607225e-05, "loss": 0.1449, "num_tokens": 63246740.0, "step": 644 }, { "epoch": 1.0031104199066874, "grad_norm": 0.21632188158235477, "learning_rate": 2.283863002753487e-05, "loss": 0.1231, "num_tokens": 63345044.0, "step": 645 }, { "epoch": 1.0046656298600312, "grad_norm": 0.2054519241846781, "learning_rate": 2.2793328889700884e-05, "loss": 0.1246, "num_tokens": 63443348.0, "step": 646 }, { "epoch": 1.0062208398133747, "grad_norm": 0.21605831136412817, "learning_rate": 2.2748022716630348e-05, "loss": 0.1254, "num_tokens": 63541652.0, "step": 647 }, { "epoch": 1.0077760497667185, "grad_norm": 0.23160152164780054, "learning_rate": 2.270271179588029e-05, "loss": 0.1371, "num_tokens": 63639956.0, "step": 648 }, { "epoch": 1.009331259720062, "grad_norm": 0.24301244443304176, "learning_rate": 2.2657396415037867e-05, "loss": 0.1236, "num_tokens": 63738260.0, "step": 649 }, { "epoch": 1.010886469673406, "grad_norm": 0.2709282090379499, "learning_rate": 2.2612076861718545e-05, "loss": 0.1341, "num_tokens": 63836564.0, "step": 650 }, { "epoch": 1.0124416796267497, "grad_norm": 0.25608807122384075, "learning_rate": 2.256675342356429e-05, "loss": 0.1215, "num_tokens": 63934868.0, "step": 651 }, { "epoch": 1.0139968895800933, "grad_norm": 0.2615771821590131, "learning_rate": 2.2521426388241704e-05, "loss": 0.1089, "num_tokens": 64033172.0, "step": 652 }, { "epoch": 1.015552099533437, "grad_norm": 0.2887802618291261, "learning_rate": 2.2476096043440236e-05, "loss": 0.1241, "num_tokens": 64131476.0, "step": 653 }, { "epoch": 1.0171073094867806, "grad_norm": 0.28265774240007374, "learning_rate": 2.2430762676870325e-05, "loss": 0.1241, "num_tokens": 64229780.0, "step": 654 }, { "epoch": 1.0186625194401244, "grad_norm": 0.27155862067600883, "learning_rate": 2.23854265762616e-05, "loss": 0.1281, "num_tokens": 64328084.0, "step": 655 }, { "epoch": 1.0202177293934682, "grad_norm": 0.25215630422224666, "learning_rate": 2.234008802936104e-05, "loss": 0.1217, "num_tokens": 64426388.0, "step": 656 }, { "epoch": 1.0217729393468118, "grad_norm": 0.2604459412170271, "learning_rate": 2.2294747323931146e-05, "loss": 0.122, "num_tokens": 64524692.0, "step": 657 }, { "epoch": 1.0233281493001556, "grad_norm": 0.2491558330542108, "learning_rate": 2.224940474774812e-05, "loss": 0.1242, "num_tokens": 64622996.0, "step": 658 }, { "epoch": 1.0248833592534992, "grad_norm": 0.24198167126606032, "learning_rate": 2.220406058860006e-05, "loss": 0.1167, "num_tokens": 64721300.0, "step": 659 }, { "epoch": 1.026438569206843, "grad_norm": 0.25233860153325693, "learning_rate": 2.215871513428508e-05, "loss": 0.1291, "num_tokens": 64819604.0, "step": 660 }, { "epoch": 1.0279937791601865, "grad_norm": 0.2412489356206769, "learning_rate": 2.2113368672609514e-05, "loss": 0.1281, "num_tokens": 64917908.0, "step": 661 }, { "epoch": 1.0295489891135303, "grad_norm": 0.2312229067743367, "learning_rate": 2.2068021491386124e-05, "loss": 0.1267, "num_tokens": 65016212.0, "step": 662 }, { "epoch": 1.0311041990668741, "grad_norm": 0.22663786529100793, "learning_rate": 2.202267387843221e-05, "loss": 0.1219, "num_tokens": 65114516.0, "step": 663 }, { "epoch": 1.0326594090202177, "grad_norm": 0.23157073920531296, "learning_rate": 2.1977326121567803e-05, "loss": 0.1306, "num_tokens": 65212820.0, "step": 664 }, { "epoch": 1.0342146189735615, "grad_norm": 0.229691446907342, "learning_rate": 2.1931978508613882e-05, "loss": 0.1211, "num_tokens": 65311124.0, "step": 665 }, { "epoch": 1.035769828926905, "grad_norm": 0.24226822465989611, "learning_rate": 2.188663132739049e-05, "loss": 0.1332, "num_tokens": 65409428.0, "step": 666 }, { "epoch": 1.0373250388802489, "grad_norm": 0.25493968921469323, "learning_rate": 2.1841284865714933e-05, "loss": 0.1271, "num_tokens": 65507732.0, "step": 667 }, { "epoch": 1.0388802488335926, "grad_norm": 0.22632500366224964, "learning_rate": 2.179593941139994e-05, "loss": 0.1221, "num_tokens": 65606036.0, "step": 668 }, { "epoch": 1.0404354587869362, "grad_norm": 0.23785889055333242, "learning_rate": 2.1750595252251874e-05, "loss": 0.1249, "num_tokens": 65704340.0, "step": 669 }, { "epoch": 1.04199066874028, "grad_norm": 0.3739109325466868, "learning_rate": 2.1705252676068863e-05, "loss": 0.1229, "num_tokens": 65802644.0, "step": 670 }, { "epoch": 1.0435458786936236, "grad_norm": 0.24369072816508394, "learning_rate": 2.165991197063897e-05, "loss": 0.1283, "num_tokens": 65900948.0, "step": 671 }, { "epoch": 1.0451010886469674, "grad_norm": 0.25751726349236603, "learning_rate": 2.161457342373841e-05, "loss": 0.1205, "num_tokens": 65999252.0, "step": 672 }, { "epoch": 1.046656298600311, "grad_norm": 0.25299260476784213, "learning_rate": 2.156923732312968e-05, "loss": 0.1308, "num_tokens": 66097556.0, "step": 673 }, { "epoch": 1.0482115085536547, "grad_norm": 0.2504986780560142, "learning_rate": 2.1523903956559776e-05, "loss": 0.1295, "num_tokens": 66195860.0, "step": 674 }, { "epoch": 1.0497667185069985, "grad_norm": 0.24562698667609756, "learning_rate": 2.14785736117583e-05, "loss": 0.1209, "num_tokens": 66294164.0, "step": 675 }, { "epoch": 1.0513219284603421, "grad_norm": 0.25990082064399606, "learning_rate": 2.143324657643571e-05, "loss": 0.1207, "num_tokens": 66392468.0, "step": 676 }, { "epoch": 1.052877138413686, "grad_norm": 0.22410138016957767, "learning_rate": 2.1387923138281458e-05, "loss": 0.1169, "num_tokens": 66490772.0, "step": 677 }, { "epoch": 1.0544323483670295, "grad_norm": 0.246550640984817, "learning_rate": 2.1342603584962142e-05, "loss": 0.1315, "num_tokens": 66589076.0, "step": 678 }, { "epoch": 1.0559875583203733, "grad_norm": 0.23696879973275062, "learning_rate": 2.1297288204119718e-05, "loss": 0.1234, "num_tokens": 66687380.0, "step": 679 }, { "epoch": 1.0575427682737168, "grad_norm": 0.2590397505850009, "learning_rate": 2.125197728336966e-05, "loss": 0.1274, "num_tokens": 66785684.0, "step": 680 }, { "epoch": 1.0590979782270606, "grad_norm": 0.23444431366781038, "learning_rate": 2.120667111029912e-05, "loss": 0.1245, "num_tokens": 66883988.0, "step": 681 }, { "epoch": 1.0606531881804044, "grad_norm": 0.30324864179208744, "learning_rate": 2.116136997246514e-05, "loss": 0.1258, "num_tokens": 66982292.0, "step": 682 }, { "epoch": 1.062208398133748, "grad_norm": 0.24666010170593083, "learning_rate": 2.1116074157392784e-05, "loss": 0.1313, "num_tokens": 67080596.0, "step": 683 }, { "epoch": 1.0637636080870918, "grad_norm": 0.30254288319665723, "learning_rate": 2.1070783952573332e-05, "loss": 0.1181, "num_tokens": 67178900.0, "step": 684 }, { "epoch": 1.0653188180404354, "grad_norm": 0.2260954864265992, "learning_rate": 2.1025499645462485e-05, "loss": 0.1222, "num_tokens": 67277204.0, "step": 685 }, { "epoch": 1.0668740279937792, "grad_norm": 0.25922910671896615, "learning_rate": 2.0980221523478478e-05, "loss": 0.129, "num_tokens": 67375508.0, "step": 686 }, { "epoch": 1.068429237947123, "grad_norm": 0.24830490711172187, "learning_rate": 2.0934949874000297e-05, "loss": 0.135, "num_tokens": 67473812.0, "step": 687 }, { "epoch": 1.0699844479004665, "grad_norm": 0.23190020812723014, "learning_rate": 2.088968498436585e-05, "loss": 0.1266, "num_tokens": 67572116.0, "step": 688 }, { "epoch": 1.0715396578538103, "grad_norm": 0.2722469510377753, "learning_rate": 2.084442714187015e-05, "loss": 0.1267, "num_tokens": 67670420.0, "step": 689 }, { "epoch": 1.073094867807154, "grad_norm": 0.23866104911903796, "learning_rate": 2.0799176633763465e-05, "loss": 0.1194, "num_tokens": 67768724.0, "step": 690 }, { "epoch": 1.0746500777604977, "grad_norm": 0.23217637599043478, "learning_rate": 2.075393374724953e-05, "loss": 0.1221, "num_tokens": 67867028.0, "step": 691 }, { "epoch": 1.0762052877138413, "grad_norm": 0.2326506189813573, "learning_rate": 2.070869876948368e-05, "loss": 0.1234, "num_tokens": 67965332.0, "step": 692 }, { "epoch": 1.077760497667185, "grad_norm": 0.2304457239829569, "learning_rate": 2.0663471987571085e-05, "loss": 0.1231, "num_tokens": 68061160.0, "step": 693 }, { "epoch": 1.0793157076205289, "grad_norm": 0.23676201690206153, "learning_rate": 2.0618253688564884e-05, "loss": 0.123, "num_tokens": 68159464.0, "step": 694 }, { "epoch": 1.0808709175738724, "grad_norm": 0.23191229755192025, "learning_rate": 2.0573044159464355e-05, "loss": 0.1195, "num_tokens": 68257768.0, "step": 695 }, { "epoch": 1.0824261275272162, "grad_norm": 0.26185133913839936, "learning_rate": 2.0527843687213146e-05, "loss": 0.1251, "num_tokens": 68356072.0, "step": 696 }, { "epoch": 1.0839813374805598, "grad_norm": 0.24054147655495378, "learning_rate": 2.0482652558697394e-05, "loss": 0.1231, "num_tokens": 68454376.0, "step": 697 }, { "epoch": 1.0855365474339036, "grad_norm": 0.25511662505293825, "learning_rate": 2.0437471060743958e-05, "loss": 0.1272, "num_tokens": 68552680.0, "step": 698 }, { "epoch": 1.0870917573872472, "grad_norm": 0.23637078357899033, "learning_rate": 2.0392299480118548e-05, "loss": 0.1271, "num_tokens": 68650984.0, "step": 699 }, { "epoch": 1.088646967340591, "grad_norm": 0.2383086004262525, "learning_rate": 2.0347138103523934e-05, "loss": 0.1181, "num_tokens": 68749288.0, "step": 700 }, { "epoch": 1.0902021772939348, "grad_norm": 0.25169401456965096, "learning_rate": 2.0301987217598137e-05, "loss": 0.1254, "num_tokens": 68847592.0, "step": 701 }, { "epoch": 1.0917573872472783, "grad_norm": 0.2431549903489215, "learning_rate": 2.0256847108912575e-05, "loss": 0.1208, "num_tokens": 68945896.0, "step": 702 }, { "epoch": 1.0933125972006221, "grad_norm": 0.2605485733148268, "learning_rate": 2.0211718063970287e-05, "loss": 0.129, "num_tokens": 69044200.0, "step": 703 }, { "epoch": 1.0948678071539657, "grad_norm": 0.23687864550710064, "learning_rate": 2.016660036920405e-05, "loss": 0.1193, "num_tokens": 69142504.0, "step": 704 }, { "epoch": 1.0964230171073095, "grad_norm": 0.2703392462401265, "learning_rate": 2.0121494310974646e-05, "loss": 0.1281, "num_tokens": 69240808.0, "step": 705 }, { "epoch": 1.0979782270606533, "grad_norm": 0.22084156426517076, "learning_rate": 2.0076400175568978e-05, "loss": 0.1184, "num_tokens": 69339112.0, "step": 706 }, { "epoch": 1.0995334370139969, "grad_norm": 0.2485304579586954, "learning_rate": 2.0031318249198267e-05, "loss": 0.1254, "num_tokens": 69437416.0, "step": 707 }, { "epoch": 1.1010886469673407, "grad_norm": 0.2350023548025591, "learning_rate": 1.9986248817996258e-05, "loss": 0.1253, "num_tokens": 69535720.0, "step": 708 }, { "epoch": 1.1026438569206842, "grad_norm": 0.23898852935629253, "learning_rate": 1.9941192168017404e-05, "loss": 0.1262, "num_tokens": 69634024.0, "step": 709 }, { "epoch": 1.104199066874028, "grad_norm": 0.24320923715040021, "learning_rate": 1.9896148585234994e-05, "loss": 0.123, "num_tokens": 69732328.0, "step": 710 }, { "epoch": 1.1057542768273716, "grad_norm": 0.25974016789188537, "learning_rate": 1.9851118355539426e-05, "loss": 0.1236, "num_tokens": 69830632.0, "step": 711 }, { "epoch": 1.1073094867807154, "grad_norm": 0.2497578454454358, "learning_rate": 1.9806101764736306e-05, "loss": 0.1276, "num_tokens": 69928936.0, "step": 712 }, { "epoch": 1.1088646967340592, "grad_norm": 0.27398906233888515, "learning_rate": 1.9761099098544704e-05, "loss": 0.1259, "num_tokens": 70027240.0, "step": 713 }, { "epoch": 1.1104199066874028, "grad_norm": 0.24514397106810126, "learning_rate": 1.9716110642595287e-05, "loss": 0.1205, "num_tokens": 70125544.0, "step": 714 }, { "epoch": 1.1119751166407466, "grad_norm": 0.2360533103644338, "learning_rate": 1.967113668242856e-05, "loss": 0.1161, "num_tokens": 70223848.0, "step": 715 }, { "epoch": 1.1135303265940901, "grad_norm": 0.2497735219273821, "learning_rate": 1.962617750349299e-05, "loss": 0.1267, "num_tokens": 70322152.0, "step": 716 }, { "epoch": 1.115085536547434, "grad_norm": 0.23906748650991658, "learning_rate": 1.9581233391143256e-05, "loss": 0.1203, "num_tokens": 70420456.0, "step": 717 }, { "epoch": 1.1166407465007775, "grad_norm": 0.2352896933145863, "learning_rate": 1.9536304630638393e-05, "loss": 0.12, "num_tokens": 70518760.0, "step": 718 }, { "epoch": 1.1181959564541213, "grad_norm": 0.2464004090757325, "learning_rate": 1.9491391507140004e-05, "loss": 0.1281, "num_tokens": 70617064.0, "step": 719 }, { "epoch": 1.119751166407465, "grad_norm": 0.2470063418212819, "learning_rate": 1.9446494305710445e-05, "loss": 0.1195, "num_tokens": 70715368.0, "step": 720 }, { "epoch": 1.1213063763608087, "grad_norm": 0.24652224187416974, "learning_rate": 1.9401613311311008e-05, "loss": 0.1263, "num_tokens": 70813672.0, "step": 721 }, { "epoch": 1.1228615863141524, "grad_norm": 0.24811850899880608, "learning_rate": 1.935674880880013e-05, "loss": 0.1195, "num_tokens": 70911976.0, "step": 722 }, { "epoch": 1.124416796267496, "grad_norm": 0.25903036287817277, "learning_rate": 1.9311901082931573e-05, "loss": 0.1266, "num_tokens": 71010280.0, "step": 723 }, { "epoch": 1.1259720062208398, "grad_norm": 0.29119827729673237, "learning_rate": 1.9267070418352597e-05, "loss": 0.1305, "num_tokens": 71108584.0, "step": 724 }, { "epoch": 1.1275272161741836, "grad_norm": 0.24073101379423606, "learning_rate": 1.9222257099602214e-05, "loss": 0.1182, "num_tokens": 71206888.0, "step": 725 }, { "epoch": 1.1290824261275272, "grad_norm": 0.23612577481079258, "learning_rate": 1.9177461411109312e-05, "loss": 0.1169, "num_tokens": 71305192.0, "step": 726 }, { "epoch": 1.130637636080871, "grad_norm": 0.24559914057143678, "learning_rate": 1.9132683637190893e-05, "loss": 0.1214, "num_tokens": 71403496.0, "step": 727 }, { "epoch": 1.1321928460342146, "grad_norm": 0.241040009048793, "learning_rate": 1.9087924062050235e-05, "loss": 0.1314, "num_tokens": 71501800.0, "step": 728 }, { "epoch": 1.1337480559875583, "grad_norm": 0.2572042988842975, "learning_rate": 1.9043182969775148e-05, "loss": 0.1241, "num_tokens": 71600104.0, "step": 729 }, { "epoch": 1.1353032659409021, "grad_norm": 0.23347717901127985, "learning_rate": 1.899846064433609e-05, "loss": 0.1239, "num_tokens": 71698408.0, "step": 730 }, { "epoch": 1.1368584758942457, "grad_norm": 0.2712155321845268, "learning_rate": 1.8953757369584443e-05, "loss": 0.1297, "num_tokens": 71796712.0, "step": 731 }, { "epoch": 1.1384136858475895, "grad_norm": 0.23110321057918357, "learning_rate": 1.8909073429250634e-05, "loss": 0.1204, "num_tokens": 71895016.0, "step": 732 }, { "epoch": 1.139968895800933, "grad_norm": 0.2302306318492987, "learning_rate": 1.8864409106942422e-05, "loss": 0.1193, "num_tokens": 71993320.0, "step": 733 }, { "epoch": 1.1415241057542769, "grad_norm": 0.2674876559977168, "learning_rate": 1.881976468614302e-05, "loss": 0.1326, "num_tokens": 72091624.0, "step": 734 }, { "epoch": 1.1430793157076204, "grad_norm": 0.2579839637616266, "learning_rate": 1.877514045020933e-05, "loss": 0.1218, "num_tokens": 72189928.0, "step": 735 }, { "epoch": 1.1446345256609642, "grad_norm": 0.23171583116423178, "learning_rate": 1.8730536682370147e-05, "loss": 0.1209, "num_tokens": 72288232.0, "step": 736 }, { "epoch": 1.1461897356143078, "grad_norm": 0.2322561742538261, "learning_rate": 1.8685953665724357e-05, "loss": 0.1205, "num_tokens": 72386536.0, "step": 737 }, { "epoch": 1.1477449455676516, "grad_norm": 0.27126832133009965, "learning_rate": 1.8641391683239126e-05, "loss": 0.1261, "num_tokens": 72484840.0, "step": 738 }, { "epoch": 1.1493001555209954, "grad_norm": 0.23278531169623862, "learning_rate": 1.8596851017748133e-05, "loss": 0.1261, "num_tokens": 72583144.0, "step": 739 }, { "epoch": 1.150855365474339, "grad_norm": 0.2503629516769014, "learning_rate": 1.855233195194975e-05, "loss": 0.1372, "num_tokens": 72681448.0, "step": 740 }, { "epoch": 1.1524105754276828, "grad_norm": 0.2337643029068019, "learning_rate": 1.8507834768405263e-05, "loss": 0.1197, "num_tokens": 72779752.0, "step": 741 }, { "epoch": 1.1539657853810263, "grad_norm": 0.2341884598936436, "learning_rate": 1.846335974953706e-05, "loss": 0.1189, "num_tokens": 72878056.0, "step": 742 }, { "epoch": 1.1555209953343701, "grad_norm": 0.24562276647682196, "learning_rate": 1.8418907177626865e-05, "loss": 0.1271, "num_tokens": 72972623.0, "step": 743 }, { "epoch": 1.157076205287714, "grad_norm": 0.26673354683432793, "learning_rate": 1.8374477334813922e-05, "loss": 0.1289, "num_tokens": 73070927.0, "step": 744 }, { "epoch": 1.1586314152410575, "grad_norm": 0.25111054255989135, "learning_rate": 1.833007050309321e-05, "loss": 0.1361, "num_tokens": 73169231.0, "step": 745 }, { "epoch": 1.1601866251944013, "grad_norm": 0.2313212187988624, "learning_rate": 1.8285686964313676e-05, "loss": 0.1216, "num_tokens": 73267535.0, "step": 746 }, { "epoch": 1.1617418351477449, "grad_norm": 0.22264221814370444, "learning_rate": 1.82413270001764e-05, "loss": 0.1155, "num_tokens": 73365839.0, "step": 747 }, { "epoch": 1.1632970451010887, "grad_norm": 0.24593066602504973, "learning_rate": 1.8196990892232856e-05, "loss": 0.1302, "num_tokens": 73464143.0, "step": 748 }, { "epoch": 1.1648522550544325, "grad_norm": 0.23326811405881098, "learning_rate": 1.81526789218831e-05, "loss": 0.1224, "num_tokens": 73562447.0, "step": 749 }, { "epoch": 1.166407465007776, "grad_norm": 0.23744640059211083, "learning_rate": 1.8108391370373982e-05, "loss": 0.1226, "num_tokens": 73660751.0, "step": 750 }, { "epoch": 1.1679626749611198, "grad_norm": 0.27392132171213424, "learning_rate": 1.806412851879737e-05, "loss": 0.1357, "num_tokens": 73759055.0, "step": 751 }, { "epoch": 1.1695178849144634, "grad_norm": 0.24440988714513176, "learning_rate": 1.801989064808836e-05, "loss": 0.1285, "num_tokens": 73857359.0, "step": 752 }, { "epoch": 1.1710730948678072, "grad_norm": 0.26329373230876585, "learning_rate": 1.7975678039023497e-05, "loss": 0.1236, "num_tokens": 73955663.0, "step": 753 }, { "epoch": 1.1726283048211508, "grad_norm": 0.2400300135577282, "learning_rate": 1.7931490972218978e-05, "loss": 0.1265, "num_tokens": 74053967.0, "step": 754 }, { "epoch": 1.1741835147744946, "grad_norm": 0.2667561063677596, "learning_rate": 1.7887329728128912e-05, "loss": 0.1403, "num_tokens": 74152271.0, "step": 755 }, { "epoch": 1.1757387247278381, "grad_norm": 0.2570901339302435, "learning_rate": 1.784319458704348e-05, "loss": 0.1305, "num_tokens": 74250575.0, "step": 756 }, { "epoch": 1.177293934681182, "grad_norm": 0.2699618039057867, "learning_rate": 1.7799085829087222e-05, "loss": 0.137, "num_tokens": 74348879.0, "step": 757 }, { "epoch": 1.1788491446345257, "grad_norm": 0.2500449381153466, "learning_rate": 1.7755003734217195e-05, "loss": 0.1307, "num_tokens": 74447183.0, "step": 758 }, { "epoch": 1.1804043545878693, "grad_norm": 0.22732412617532516, "learning_rate": 1.7710948582221234e-05, "loss": 0.1122, "num_tokens": 74545487.0, "step": 759 }, { "epoch": 1.181959564541213, "grad_norm": 0.2463299889302792, "learning_rate": 1.766692065271619e-05, "loss": 0.1268, "num_tokens": 74643791.0, "step": 760 }, { "epoch": 1.1835147744945567, "grad_norm": 0.24171461291825863, "learning_rate": 1.76229202251461e-05, "loss": 0.124, "num_tokens": 74742095.0, "step": 761 }, { "epoch": 1.1850699844479005, "grad_norm": 0.23746463868476872, "learning_rate": 1.757894757878046e-05, "loss": 0.1211, "num_tokens": 74840399.0, "step": 762 }, { "epoch": 1.1866251944012443, "grad_norm": 0.2324262173571099, "learning_rate": 1.7535002992712456e-05, "loss": 0.1199, "num_tokens": 74938703.0, "step": 763 }, { "epoch": 1.1881804043545878, "grad_norm": 0.25356359865980144, "learning_rate": 1.7491086745857146e-05, "loss": 0.1323, "num_tokens": 75037007.0, "step": 764 }, { "epoch": 1.1897356143079316, "grad_norm": 0.2441909139887883, "learning_rate": 1.7447199116949746e-05, "loss": 0.1275, "num_tokens": 75135311.0, "step": 765 }, { "epoch": 1.1912908242612752, "grad_norm": 0.24059373306048767, "learning_rate": 1.7403340384543833e-05, "loss": 0.1253, "num_tokens": 75233615.0, "step": 766 }, { "epoch": 1.192846034214619, "grad_norm": 0.24896753529092833, "learning_rate": 1.7359510827009555e-05, "loss": 0.1356, "num_tokens": 75331919.0, "step": 767 }, { "epoch": 1.1944012441679628, "grad_norm": 0.249716319882526, "learning_rate": 1.731571072253191e-05, "loss": 0.1222, "num_tokens": 75430223.0, "step": 768 }, { "epoch": 1.1959564541213064, "grad_norm": 0.24919391093610735, "learning_rate": 1.7271940349108948e-05, "loss": 0.1327, "num_tokens": 75528527.0, "step": 769 }, { "epoch": 1.1975116640746502, "grad_norm": 0.24717194509838064, "learning_rate": 1.722819998455003e-05, "loss": 0.1244, "num_tokens": 75626831.0, "step": 770 }, { "epoch": 1.1990668740279937, "grad_norm": 0.23155066728290422, "learning_rate": 1.7184489906474028e-05, "loss": 0.1182, "num_tokens": 75725135.0, "step": 771 }, { "epoch": 1.2006220839813375, "grad_norm": 0.3141980091067843, "learning_rate": 1.7140810392307607e-05, "loss": 0.1255, "num_tokens": 75823439.0, "step": 772 }, { "epoch": 1.202177293934681, "grad_norm": 0.23107919403987087, "learning_rate": 1.7097161719283444e-05, "loss": 0.1201, "num_tokens": 75921743.0, "step": 773 }, { "epoch": 1.2037325038880249, "grad_norm": 0.2321607839462832, "learning_rate": 1.7053544164438452e-05, "loss": 0.1204, "num_tokens": 76020047.0, "step": 774 }, { "epoch": 1.2052877138413687, "grad_norm": 0.2279579784510273, "learning_rate": 1.7009958004612054e-05, "loss": 0.1184, "num_tokens": 76118351.0, "step": 775 }, { "epoch": 1.2068429237947123, "grad_norm": 0.3038856164024539, "learning_rate": 1.6966403516444393e-05, "loss": 0.1188, "num_tokens": 76216655.0, "step": 776 }, { "epoch": 1.208398133748056, "grad_norm": 0.23800021576615515, "learning_rate": 1.692288097637462e-05, "loss": 0.1288, "num_tokens": 76314959.0, "step": 777 }, { "epoch": 1.2099533437013996, "grad_norm": 0.22840069237977112, "learning_rate": 1.6879390660639074e-05, "loss": 0.1192, "num_tokens": 76413263.0, "step": 778 }, { "epoch": 1.2115085536547434, "grad_norm": 0.2894767057185955, "learning_rate": 1.68359328452696e-05, "loss": 0.1188, "num_tokens": 76511567.0, "step": 779 }, { "epoch": 1.213063763608087, "grad_norm": 0.23471495121702973, "learning_rate": 1.6792507806091743e-05, "loss": 0.116, "num_tokens": 76609871.0, "step": 780 }, { "epoch": 1.2146189735614308, "grad_norm": 0.24657867368660616, "learning_rate": 1.674911581872304e-05, "loss": 0.1218, "num_tokens": 76708175.0, "step": 781 }, { "epoch": 1.2161741835147746, "grad_norm": 0.24683093437085896, "learning_rate": 1.6705757158571233e-05, "loss": 0.1253, "num_tokens": 76806479.0, "step": 782 }, { "epoch": 1.2177293934681181, "grad_norm": 0.23289694285465223, "learning_rate": 1.6662432100832532e-05, "loss": 0.1213, "num_tokens": 76904783.0, "step": 783 }, { "epoch": 1.219284603421462, "grad_norm": 0.24384941233776306, "learning_rate": 1.66191409204899e-05, "loss": 0.1238, "num_tokens": 77003087.0, "step": 784 }, { "epoch": 1.2208398133748055, "grad_norm": 0.29752985855902286, "learning_rate": 1.657588389231124e-05, "loss": 0.1231, "num_tokens": 77101391.0, "step": 785 }, { "epoch": 1.2223950233281493, "grad_norm": 0.2647633691225542, "learning_rate": 1.653266129084773e-05, "loss": 0.1221, "num_tokens": 77199695.0, "step": 786 }, { "epoch": 1.223950233281493, "grad_norm": 0.2648176847345265, "learning_rate": 1.6489473390432032e-05, "loss": 0.1284, "num_tokens": 77297999.0, "step": 787 }, { "epoch": 1.2255054432348367, "grad_norm": 0.7389131787556925, "learning_rate": 1.6446320465176532e-05, "loss": 0.1196, "num_tokens": 77396303.0, "step": 788 }, { "epoch": 1.2270606531881805, "grad_norm": 0.246282410090374, "learning_rate": 1.640320278897169e-05, "loss": 0.1261, "num_tokens": 77494607.0, "step": 789 }, { "epoch": 1.228615863141524, "grad_norm": 0.22465771283240696, "learning_rate": 1.6360120635484183e-05, "loss": 0.1169, "num_tokens": 77592911.0, "step": 790 }, { "epoch": 1.2301710730948678, "grad_norm": 0.380556526358376, "learning_rate": 1.6317074278155254e-05, "loss": 0.1193, "num_tokens": 77691215.0, "step": 791 }, { "epoch": 1.2317262830482114, "grad_norm": 0.24103339450301026, "learning_rate": 1.6274063990198947e-05, "loss": 0.1215, "num_tokens": 77789519.0, "step": 792 }, { "epoch": 1.2332814930015552, "grad_norm": 0.2557057173214443, "learning_rate": 1.623109004460037e-05, "loss": 0.1271, "num_tokens": 77882934.0, "step": 793 }, { "epoch": 1.234836702954899, "grad_norm": 0.23342993211464855, "learning_rate": 1.6188152714113976e-05, "loss": 0.1181, "num_tokens": 77981238.0, "step": 794 }, { "epoch": 1.2363919129082426, "grad_norm": 0.22879954032454894, "learning_rate": 1.61452522712618e-05, "loss": 0.1266, "num_tokens": 78079542.0, "step": 795 }, { "epoch": 1.2379471228615864, "grad_norm": 0.24167692625731477, "learning_rate": 1.6102388988331776e-05, "loss": 0.122, "num_tokens": 78177846.0, "step": 796 }, { "epoch": 1.23950233281493, "grad_norm": 0.2579635445791769, "learning_rate": 1.6059563137375988e-05, "loss": 0.1278, "num_tokens": 78276150.0, "step": 797 }, { "epoch": 1.2410575427682737, "grad_norm": 0.25854618993316164, "learning_rate": 1.6016774990208913e-05, "loss": 0.1227, "num_tokens": 78374454.0, "step": 798 }, { "epoch": 1.2426127527216173, "grad_norm": 0.23877678919770923, "learning_rate": 1.5974024818405753e-05, "loss": 0.127, "num_tokens": 78472758.0, "step": 799 }, { "epoch": 1.244167962674961, "grad_norm": 0.25846314983093316, "learning_rate": 1.5931312893300656e-05, "loss": 0.1318, "num_tokens": 78571062.0, "step": 800 }, { "epoch": 1.245723172628305, "grad_norm": 0.24890882998489833, "learning_rate": 1.5888639485985037e-05, "loss": 0.1349, "num_tokens": 78669366.0, "step": 801 }, { "epoch": 1.2472783825816485, "grad_norm": 0.27127167082689096, "learning_rate": 1.5846004867305835e-05, "loss": 0.1266, "num_tokens": 78767670.0, "step": 802 }, { "epoch": 1.2488335925349923, "grad_norm": 0.23768278689396913, "learning_rate": 1.5803409307863798e-05, "loss": 0.1243, "num_tokens": 78865974.0, "step": 803 }, { "epoch": 1.2503888024883358, "grad_norm": 0.266928391513493, "learning_rate": 1.5760853078011753e-05, "loss": 0.1211, "num_tokens": 78964278.0, "step": 804 }, { "epoch": 1.2519440124416796, "grad_norm": 0.24014316682466508, "learning_rate": 1.571833644785293e-05, "loss": 0.1279, "num_tokens": 79062582.0, "step": 805 }, { "epoch": 1.2534992223950234, "grad_norm": 0.2514791093290629, "learning_rate": 1.567585968723921e-05, "loss": 0.13, "num_tokens": 79160886.0, "step": 806 }, { "epoch": 1.255054432348367, "grad_norm": 0.2469730226833448, "learning_rate": 1.5633423065769403e-05, "loss": 0.1244, "num_tokens": 79259190.0, "step": 807 }, { "epoch": 1.2566096423017108, "grad_norm": 0.22580730207135863, "learning_rate": 1.559102685278759e-05, "loss": 0.1085, "num_tokens": 79357494.0, "step": 808 }, { "epoch": 1.2581648522550544, "grad_norm": 0.24329384380227762, "learning_rate": 1.5548671317381354e-05, "loss": 0.1291, "num_tokens": 79455798.0, "step": 809 }, { "epoch": 1.2597200622083982, "grad_norm": 0.241004066078562, "learning_rate": 1.550635672838011e-05, "loss": 0.1267, "num_tokens": 79554102.0, "step": 810 }, { "epoch": 1.261275272161742, "grad_norm": 0.26266103869258783, "learning_rate": 1.5464083354353384e-05, "loss": 0.1296, "num_tokens": 79652406.0, "step": 811 }, { "epoch": 1.2628304821150855, "grad_norm": 0.2431773896467409, "learning_rate": 1.5421851463609113e-05, "loss": 0.1298, "num_tokens": 79750710.0, "step": 812 }, { "epoch": 1.264385692068429, "grad_norm": 0.23548067769701464, "learning_rate": 1.537966132419195e-05, "loss": 0.1182, "num_tokens": 79849014.0, "step": 813 }, { "epoch": 1.265940902021773, "grad_norm": 0.23511545449355714, "learning_rate": 1.533751320388154e-05, "loss": 0.1245, "num_tokens": 79947318.0, "step": 814 }, { "epoch": 1.2674961119751167, "grad_norm": 0.23915024531214854, "learning_rate": 1.529540737019084e-05, "loss": 0.1264, "num_tokens": 80045622.0, "step": 815 }, { "epoch": 1.2690513219284603, "grad_norm": 0.27847669725925683, "learning_rate": 1.5253344090364416e-05, "loss": 0.1239, "num_tokens": 80143926.0, "step": 816 }, { "epoch": 1.270606531881804, "grad_norm": 0.23250091494171893, "learning_rate": 1.5211323631376732e-05, "loss": 0.1168, "num_tokens": 80242230.0, "step": 817 }, { "epoch": 1.2721617418351476, "grad_norm": 0.23757199674114146, "learning_rate": 1.5169346259930491e-05, "loss": 0.1168, "num_tokens": 80340534.0, "step": 818 }, { "epoch": 1.2737169517884914, "grad_norm": 0.24984269436254544, "learning_rate": 1.5127412242454904e-05, "loss": 0.1199, "num_tokens": 80438838.0, "step": 819 }, { "epoch": 1.2752721617418352, "grad_norm": 0.2478957433331349, "learning_rate": 1.508552184510402e-05, "loss": 0.1293, "num_tokens": 80537142.0, "step": 820 }, { "epoch": 1.2768273716951788, "grad_norm": 0.23895614251269662, "learning_rate": 1.504367533375505e-05, "loss": 0.1283, "num_tokens": 80635446.0, "step": 821 }, { "epoch": 1.2783825816485226, "grad_norm": 0.24521445375426792, "learning_rate": 1.5001872974006633e-05, "loss": 0.1186, "num_tokens": 80733750.0, "step": 822 }, { "epoch": 1.2799377916018662, "grad_norm": 0.24464695385618007, "learning_rate": 1.4960115031177198e-05, "loss": 0.1182, "num_tokens": 80832054.0, "step": 823 }, { "epoch": 1.28149300155521, "grad_norm": 0.2386558503795457, "learning_rate": 1.4918401770303245e-05, "loss": 0.1208, "num_tokens": 80930358.0, "step": 824 }, { "epoch": 1.2830482115085537, "grad_norm": 0.23488733306818604, "learning_rate": 1.4876733456137698e-05, "loss": 0.1176, "num_tokens": 81028662.0, "step": 825 }, { "epoch": 1.2846034214618973, "grad_norm": 0.39624654421722705, "learning_rate": 1.4835110353148192e-05, "loss": 0.1329, "num_tokens": 81126966.0, "step": 826 }, { "epoch": 1.2861586314152411, "grad_norm": 0.24292606989838864, "learning_rate": 1.4793532725515416e-05, "loss": 0.1297, "num_tokens": 81225270.0, "step": 827 }, { "epoch": 1.2877138413685847, "grad_norm": 0.30411927141562084, "learning_rate": 1.4752000837131415e-05, "loss": 0.1266, "num_tokens": 81323574.0, "step": 828 }, { "epoch": 1.2892690513219285, "grad_norm": 0.22830415259030062, "learning_rate": 1.4710514951597952e-05, "loss": 0.1166, "num_tokens": 81421878.0, "step": 829 }, { "epoch": 1.2908242612752723, "grad_norm": 0.23018643454118298, "learning_rate": 1.4669075332224787e-05, "loss": 0.1188, "num_tokens": 81520182.0, "step": 830 }, { "epoch": 1.2923794712286159, "grad_norm": 0.24183737998451765, "learning_rate": 1.4627682242028045e-05, "loss": 0.1211, "num_tokens": 81618486.0, "step": 831 }, { "epoch": 1.2939346811819596, "grad_norm": 0.22273440393357746, "learning_rate": 1.4586335943728523e-05, "loss": 0.1155, "num_tokens": 81716790.0, "step": 832 }, { "epoch": 1.2954898911353032, "grad_norm": 0.23672133220135774, "learning_rate": 1.4545036699750034e-05, "loss": 0.124, "num_tokens": 81815094.0, "step": 833 }, { "epoch": 1.297045101088647, "grad_norm": 0.2436568437446056, "learning_rate": 1.4503784772217744e-05, "loss": 0.1194, "num_tokens": 81913398.0, "step": 834 }, { "epoch": 1.2986003110419908, "grad_norm": 0.2476613652777395, "learning_rate": 1.4462580422956491e-05, "loss": 0.1205, "num_tokens": 82011702.0, "step": 835 }, { "epoch": 1.3001555209953344, "grad_norm": 0.24369627821322162, "learning_rate": 1.4421423913489143e-05, "loss": 0.1165, "num_tokens": 82110006.0, "step": 836 }, { "epoch": 1.301710730948678, "grad_norm": 0.2588646864365097, "learning_rate": 1.4380315505034932e-05, "loss": 0.1178, "num_tokens": 82208310.0, "step": 837 }, { "epoch": 1.3032659409020217, "grad_norm": 0.2335009881251776, "learning_rate": 1.4339255458507796e-05, "loss": 0.1196, "num_tokens": 82306614.0, "step": 838 }, { "epoch": 1.3048211508553655, "grad_norm": 0.23137795104371775, "learning_rate": 1.4298244034514706e-05, "loss": 0.1203, "num_tokens": 82404918.0, "step": 839 }, { "epoch": 1.3063763608087091, "grad_norm": 0.2557439548221492, "learning_rate": 1.425728149335404e-05, "loss": 0.1252, "num_tokens": 82503222.0, "step": 840 }, { "epoch": 1.307931570762053, "grad_norm": 0.24620534654424298, "learning_rate": 1.4216368095013926e-05, "loss": 0.1183, "num_tokens": 82601526.0, "step": 841 }, { "epoch": 1.3094867807153965, "grad_norm": 0.2588539923690533, "learning_rate": 1.4175504099170563e-05, "loss": 0.125, "num_tokens": 82699830.0, "step": 842 }, { "epoch": 1.3110419906687403, "grad_norm": 0.2557022543384923, "learning_rate": 1.4134689765186607e-05, "loss": 0.1245, "num_tokens": 82791674.0, "step": 843 }, { "epoch": 1.312597200622084, "grad_norm": 0.4238582546218328, "learning_rate": 1.4093925352109495e-05, "loss": 0.1268, "num_tokens": 82889978.0, "step": 844 }, { "epoch": 1.3141524105754276, "grad_norm": 0.24564890183290994, "learning_rate": 1.4053211118669858e-05, "loss": 0.1177, "num_tokens": 82988282.0, "step": 845 }, { "epoch": 1.3157076205287714, "grad_norm": 0.25477215360716404, "learning_rate": 1.4012547323279794e-05, "loss": 0.1224, "num_tokens": 83086586.0, "step": 846 }, { "epoch": 1.317262830482115, "grad_norm": 0.2533133888177759, "learning_rate": 1.3971934224031296e-05, "loss": 0.1187, "num_tokens": 83184890.0, "step": 847 }, { "epoch": 1.3188180404354588, "grad_norm": 0.2413527406907875, "learning_rate": 1.393137207869458e-05, "loss": 0.122, "num_tokens": 83283194.0, "step": 848 }, { "epoch": 1.3203732503888026, "grad_norm": 0.2561396587786084, "learning_rate": 1.389086114471646e-05, "loss": 0.1285, "num_tokens": 83381498.0, "step": 849 }, { "epoch": 1.3219284603421462, "grad_norm": 0.247970008100992, "learning_rate": 1.3850401679218717e-05, "loss": 0.1198, "num_tokens": 83479802.0, "step": 850 }, { "epoch": 1.32348367029549, "grad_norm": 0.25348385656518096, "learning_rate": 1.3809993938996464e-05, "loss": 0.1246, "num_tokens": 83578106.0, "step": 851 }, { "epoch": 1.3250388802488335, "grad_norm": 0.22945361935056377, "learning_rate": 1.3769638180516509e-05, "loss": 0.1138, "num_tokens": 83676410.0, "step": 852 }, { "epoch": 1.3265940902021773, "grad_norm": 0.23821152037787752, "learning_rate": 1.3729334659915736e-05, "loss": 0.1198, "num_tokens": 83774714.0, "step": 853 }, { "epoch": 1.3281493001555211, "grad_norm": 0.24973044825926788, "learning_rate": 1.3689083632999483e-05, "loss": 0.1139, "num_tokens": 83873018.0, "step": 854 }, { "epoch": 1.3297045101088647, "grad_norm": 0.23567382600681255, "learning_rate": 1.3648885355239907e-05, "loss": 0.1237, "num_tokens": 83971322.0, "step": 855 }, { "epoch": 1.3312597200622083, "grad_norm": 0.23425966798840697, "learning_rate": 1.3608740081774357e-05, "loss": 0.1208, "num_tokens": 84069626.0, "step": 856 }, { "epoch": 1.332814930015552, "grad_norm": 0.2333147596108081, "learning_rate": 1.356864806740378e-05, "loss": 0.1121, "num_tokens": 84167930.0, "step": 857 }, { "epoch": 1.3343701399688959, "grad_norm": 0.28520629923979945, "learning_rate": 1.3528609566591086e-05, "loss": 0.1216, "num_tokens": 84266234.0, "step": 858 }, { "epoch": 1.3359253499222394, "grad_norm": 0.23834727090539895, "learning_rate": 1.3488624833459538e-05, "loss": 0.1166, "num_tokens": 84364538.0, "step": 859 }, { "epoch": 1.3374805598755832, "grad_norm": 0.2563433944661087, "learning_rate": 1.3448694121791114e-05, "loss": 0.1255, "num_tokens": 84462842.0, "step": 860 }, { "epoch": 1.3390357698289268, "grad_norm": 0.26899923278093835, "learning_rate": 1.340881768502496e-05, "loss": 0.1223, "num_tokens": 84561146.0, "step": 861 }, { "epoch": 1.3405909797822706, "grad_norm": 0.26028667790612775, "learning_rate": 1.3368995776255713e-05, "loss": 0.1266, "num_tokens": 84659450.0, "step": 862 }, { "epoch": 1.3421461897356144, "grad_norm": 0.24559934257758978, "learning_rate": 1.3329228648231925e-05, "loss": 0.1171, "num_tokens": 84757754.0, "step": 863 }, { "epoch": 1.343701399688958, "grad_norm": 0.2526853461888656, "learning_rate": 1.328951655335446e-05, "loss": 0.1177, "num_tokens": 84856058.0, "step": 864 }, { "epoch": 1.3452566096423018, "grad_norm": 0.2512999250784762, "learning_rate": 1.3249859743674883e-05, "loss": 0.1276, "num_tokens": 84954362.0, "step": 865 }, { "epoch": 1.3468118195956453, "grad_norm": 0.28724240588996763, "learning_rate": 1.3210258470893886e-05, "loss": 0.1219, "num_tokens": 85052666.0, "step": 866 }, { "epoch": 1.3483670295489891, "grad_norm": 0.24139074668660324, "learning_rate": 1.3170712986359635e-05, "loss": 0.1202, "num_tokens": 85150970.0, "step": 867 }, { "epoch": 1.349922239502333, "grad_norm": 0.24129038102685887, "learning_rate": 1.3131223541066227e-05, "loss": 0.1218, "num_tokens": 85249274.0, "step": 868 }, { "epoch": 1.3514774494556765, "grad_norm": 0.24423792142416206, "learning_rate": 1.3091790385652105e-05, "loss": 0.1206, "num_tokens": 85347578.0, "step": 869 }, { "epoch": 1.3530326594090203, "grad_norm": 0.42670532758072466, "learning_rate": 1.3052413770398395e-05, "loss": 0.1225, "num_tokens": 85445882.0, "step": 870 }, { "epoch": 1.3545878693623639, "grad_norm": 0.22984454754663225, "learning_rate": 1.3013093945227398e-05, "loss": 0.1132, "num_tokens": 85544186.0, "step": 871 }, { "epoch": 1.3561430793157077, "grad_norm": 0.23702746439314373, "learning_rate": 1.2973831159700953e-05, "loss": 0.1247, "num_tokens": 85642490.0, "step": 872 }, { "epoch": 1.3576982892690515, "grad_norm": 0.2426891103146545, "learning_rate": 1.2934625663018876e-05, "loss": 0.1279, "num_tokens": 85740794.0, "step": 873 }, { "epoch": 1.359253499222395, "grad_norm": 0.24578232314102652, "learning_rate": 1.289547770401737e-05, "loss": 0.1265, "num_tokens": 85839098.0, "step": 874 }, { "epoch": 1.3608087091757386, "grad_norm": 0.24880626850790766, "learning_rate": 1.2856387531167452e-05, "loss": 0.123, "num_tokens": 85937402.0, "step": 875 }, { "epoch": 1.3623639191290824, "grad_norm": 0.23269080586404062, "learning_rate": 1.281735539257337e-05, "loss": 0.1173, "num_tokens": 86035706.0, "step": 876 }, { "epoch": 1.3639191290824262, "grad_norm": 0.48545706033577124, "learning_rate": 1.2778381535971032e-05, "loss": 0.1246, "num_tokens": 86134010.0, "step": 877 }, { "epoch": 1.3654743390357698, "grad_norm": 0.23236932043842862, "learning_rate": 1.2739466208726424e-05, "loss": 0.1226, "num_tokens": 86232314.0, "step": 878 }, { "epoch": 1.3670295489891136, "grad_norm": 0.22994098318024458, "learning_rate": 1.2700609657834067e-05, "loss": 0.1131, "num_tokens": 86330618.0, "step": 879 }, { "epoch": 1.3685847589424571, "grad_norm": 0.23135436302408957, "learning_rate": 1.2661812129915393e-05, "loss": 0.1163, "num_tokens": 86428922.0, "step": 880 }, { "epoch": 1.370139968895801, "grad_norm": 0.24313415643020572, "learning_rate": 1.2623073871217258e-05, "loss": 0.1238, "num_tokens": 86527226.0, "step": 881 }, { "epoch": 1.3716951788491447, "grad_norm": 0.2803771989927827, "learning_rate": 1.2584395127610317e-05, "loss": 0.1269, "num_tokens": 86625530.0, "step": 882 }, { "epoch": 1.3732503888024883, "grad_norm": 0.2329298826654858, "learning_rate": 1.2545776144587488e-05, "loss": 0.1185, "num_tokens": 86723834.0, "step": 883 }, { "epoch": 1.374805598755832, "grad_norm": 0.25789848707660035, "learning_rate": 1.2507217167262376e-05, "loss": 0.1287, "num_tokens": 86822138.0, "step": 884 }, { "epoch": 1.3763608087091757, "grad_norm": 0.23010707370662678, "learning_rate": 1.2468718440367766e-05, "loss": 0.1087, "num_tokens": 86920442.0, "step": 885 }, { "epoch": 1.3779160186625194, "grad_norm": 0.23740220719961977, "learning_rate": 1.2430280208254008e-05, "loss": 0.1189, "num_tokens": 87018746.0, "step": 886 }, { "epoch": 1.3794712286158632, "grad_norm": 0.2591224152640492, "learning_rate": 1.2391902714887494e-05, "loss": 0.1308, "num_tokens": 87117050.0, "step": 887 }, { "epoch": 1.3810264385692068, "grad_norm": 0.23235044509572297, "learning_rate": 1.2353586203849117e-05, "loss": 0.1119, "num_tokens": 87215354.0, "step": 888 }, { "epoch": 1.3825816485225506, "grad_norm": 0.24806120766249276, "learning_rate": 1.2315330918332716e-05, "loss": 0.1135, "num_tokens": 87313658.0, "step": 889 }, { "epoch": 1.3841368584758942, "grad_norm": 0.4019235972526062, "learning_rate": 1.2277137101143534e-05, "loss": 0.1229, "num_tokens": 87411962.0, "step": 890 }, { "epoch": 1.385692068429238, "grad_norm": 0.24029163637805007, "learning_rate": 1.2239004994696669e-05, "loss": 0.1137, "num_tokens": 87510266.0, "step": 891 }, { "epoch": 1.3872472783825818, "grad_norm": 0.24820296629792415, "learning_rate": 1.2200934841015539e-05, "loss": 0.1156, "num_tokens": 87608570.0, "step": 892 }, { "epoch": 1.3888024883359253, "grad_norm": 0.2518073300670305, "learning_rate": 1.2162926881730388e-05, "loss": 0.124, "num_tokens": 87702026.0, "step": 893 }, { "epoch": 1.390357698289269, "grad_norm": 0.2393480668091628, "learning_rate": 1.2124981358076653e-05, "loss": 0.1238, "num_tokens": 87800330.0, "step": 894 }, { "epoch": 1.3919129082426127, "grad_norm": 0.27466389097859856, "learning_rate": 1.2087098510893536e-05, "loss": 0.1286, "num_tokens": 87898634.0, "step": 895 }, { "epoch": 1.3934681181959565, "grad_norm": 0.2767912035632735, "learning_rate": 1.2049278580622423e-05, "loss": 0.1243, "num_tokens": 87996938.0, "step": 896 }, { "epoch": 1.3950233281493, "grad_norm": 0.24181948569174525, "learning_rate": 1.2011521807305373e-05, "loss": 0.1236, "num_tokens": 88095242.0, "step": 897 }, { "epoch": 1.3965785381026439, "grad_norm": 0.31330668330860956, "learning_rate": 1.1973828430583571e-05, "loss": 0.1139, "num_tokens": 88193546.0, "step": 898 }, { "epoch": 1.3981337480559874, "grad_norm": 0.24271888281118678, "learning_rate": 1.1936198689695846e-05, "loss": 0.1206, "num_tokens": 88291850.0, "step": 899 }, { "epoch": 1.3996889580093312, "grad_norm": 0.24064071539108423, "learning_rate": 1.1898632823477121e-05, "loss": 0.1216, "num_tokens": 88390154.0, "step": 900 }, { "epoch": 1.401244167962675, "grad_norm": 0.2213568202399043, "learning_rate": 1.1861131070356908e-05, "loss": 0.1169, "num_tokens": 88488458.0, "step": 901 }, { "epoch": 1.4027993779160186, "grad_norm": 0.22597326249218186, "learning_rate": 1.1823693668357809e-05, "loss": 0.1126, "num_tokens": 88586762.0, "step": 902 }, { "epoch": 1.4043545878693624, "grad_norm": 0.26146009891726973, "learning_rate": 1.178632085509398e-05, "loss": 0.1311, "num_tokens": 88685066.0, "step": 903 }, { "epoch": 1.405909797822706, "grad_norm": 0.24456911844386486, "learning_rate": 1.1749012867769625e-05, "loss": 0.1224, "num_tokens": 88783370.0, "step": 904 }, { "epoch": 1.4074650077760498, "grad_norm": 0.255018987718338, "learning_rate": 1.171176994317751e-05, "loss": 0.121, "num_tokens": 88881674.0, "step": 905 }, { "epoch": 1.4090202177293936, "grad_norm": 0.24652347379949763, "learning_rate": 1.1674592317697454e-05, "loss": 0.123, "num_tokens": 88979978.0, "step": 906 }, { "epoch": 1.4105754276827371, "grad_norm": 0.25599234514415975, "learning_rate": 1.1637480227294824e-05, "loss": 0.117, "num_tokens": 89078282.0, "step": 907 }, { "epoch": 1.412130637636081, "grad_norm": 0.23857636390062362, "learning_rate": 1.1600433907519015e-05, "loss": 0.122, "num_tokens": 89176586.0, "step": 908 }, { "epoch": 1.4136858475894245, "grad_norm": 0.23490696111608755, "learning_rate": 1.1563453593502014e-05, "loss": 0.1228, "num_tokens": 89274890.0, "step": 909 }, { "epoch": 1.4152410575427683, "grad_norm": 0.2469003691491552, "learning_rate": 1.1526539519956854e-05, "loss": 0.1254, "num_tokens": 89373194.0, "step": 910 }, { "epoch": 1.416796267496112, "grad_norm": 0.24449197125444116, "learning_rate": 1.1489691921176125e-05, "loss": 0.1196, "num_tokens": 89471498.0, "step": 911 }, { "epoch": 1.4183514774494557, "grad_norm": 0.2513049480711658, "learning_rate": 1.1452911031030527e-05, "loss": 0.1242, "num_tokens": 89569802.0, "step": 912 }, { "epoch": 1.4199066874027995, "grad_norm": 0.2382351384585437, "learning_rate": 1.1416197082967351e-05, "loss": 0.1138, "num_tokens": 89668106.0, "step": 913 }, { "epoch": 1.421461897356143, "grad_norm": 0.23481890584127435, "learning_rate": 1.1379550310009022e-05, "loss": 0.1207, "num_tokens": 89766410.0, "step": 914 }, { "epoch": 1.4230171073094868, "grad_norm": 0.23156612306403848, "learning_rate": 1.1342970944751585e-05, "loss": 0.1174, "num_tokens": 89864714.0, "step": 915 }, { "epoch": 1.4245723172628304, "grad_norm": 0.24938547880898287, "learning_rate": 1.1306459219363255e-05, "loss": 0.122, "num_tokens": 89963018.0, "step": 916 }, { "epoch": 1.4261275272161742, "grad_norm": 0.23686571233313733, "learning_rate": 1.1270015365582969e-05, "loss": 0.1157, "num_tokens": 90061322.0, "step": 917 }, { "epoch": 1.4276827371695178, "grad_norm": 0.24989757413370006, "learning_rate": 1.1233639614718839e-05, "loss": 0.1201, "num_tokens": 90159626.0, "step": 918 }, { "epoch": 1.4292379471228616, "grad_norm": 0.2324645060426858, "learning_rate": 1.1197332197646754e-05, "loss": 0.1113, "num_tokens": 90257930.0, "step": 919 }, { "epoch": 1.4307931570762054, "grad_norm": 0.2884270353675501, "learning_rate": 1.1161093344808891e-05, "loss": 0.124, "num_tokens": 90356234.0, "step": 920 }, { "epoch": 1.432348367029549, "grad_norm": 0.23580511311630953, "learning_rate": 1.1124923286212259e-05, "loss": 0.1235, "num_tokens": 90454538.0, "step": 921 }, { "epoch": 1.4339035769828927, "grad_norm": 0.24633037527015766, "learning_rate": 1.1088822251427205e-05, "loss": 0.1229, "num_tokens": 90552842.0, "step": 922 }, { "epoch": 1.4354587869362363, "grad_norm": 0.2454168997968038, "learning_rate": 1.1052790469586e-05, "loss": 0.1236, "num_tokens": 90651146.0, "step": 923 }, { "epoch": 1.43701399688958, "grad_norm": 0.24971389336428332, "learning_rate": 1.101682816938138e-05, "loss": 0.1211, "num_tokens": 90749450.0, "step": 924 }, { "epoch": 1.4385692068429239, "grad_norm": 0.34769220208660867, "learning_rate": 1.0980935579065061e-05, "loss": 0.1223, "num_tokens": 90847754.0, "step": 925 }, { "epoch": 1.4401244167962675, "grad_norm": 0.2366949502550198, "learning_rate": 1.0945112926446329e-05, "loss": 0.1214, "num_tokens": 90946058.0, "step": 926 }, { "epoch": 1.4416796267496113, "grad_norm": 0.24199015328646079, "learning_rate": 1.090936043889057e-05, "loss": 0.119, "num_tokens": 91044362.0, "step": 927 }, { "epoch": 1.4432348367029548, "grad_norm": 0.2503855307351663, "learning_rate": 1.0873678343317831e-05, "loss": 0.1246, "num_tokens": 91142666.0, "step": 928 }, { "epoch": 1.4447900466562986, "grad_norm": 0.23658027234389065, "learning_rate": 1.0838066866201392e-05, "loss": 0.1205, "num_tokens": 91240970.0, "step": 929 }, { "epoch": 1.4463452566096424, "grad_norm": 0.23068099289430136, "learning_rate": 1.0802526233566308e-05, "loss": 0.1114, "num_tokens": 91339274.0, "step": 930 }, { "epoch": 1.447900466562986, "grad_norm": 0.2429284015936713, "learning_rate": 1.0767056670988007e-05, "loss": 0.1141, "num_tokens": 91437578.0, "step": 931 }, { "epoch": 1.4494556765163298, "grad_norm": 0.2418996591152503, "learning_rate": 1.07316584035908e-05, "loss": 0.1241, "num_tokens": 91535882.0, "step": 932 }, { "epoch": 1.4510108864696734, "grad_norm": 0.22756790579331063, "learning_rate": 1.0696331656046533e-05, "loss": 0.1172, "num_tokens": 91634186.0, "step": 933 }, { "epoch": 1.4525660964230172, "grad_norm": 0.2329340184821911, "learning_rate": 1.0661076652573096e-05, "loss": 0.1204, "num_tokens": 91732490.0, "step": 934 }, { "epoch": 1.454121306376361, "grad_norm": 0.24157834427682787, "learning_rate": 1.0625893616933012e-05, "loss": 0.1172, "num_tokens": 91830794.0, "step": 935 }, { "epoch": 1.4556765163297045, "grad_norm": 0.2316894134218242, "learning_rate": 1.059078277243204e-05, "loss": 0.1185, "num_tokens": 91929098.0, "step": 936 }, { "epoch": 1.457231726283048, "grad_norm": 0.2272437673946785, "learning_rate": 1.0555744341917746e-05, "loss": 0.1169, "num_tokens": 92027402.0, "step": 937 }, { "epoch": 1.4587869362363919, "grad_norm": 0.2660914337388917, "learning_rate": 1.0520778547778078e-05, "loss": 0.1214, "num_tokens": 92125706.0, "step": 938 }, { "epoch": 1.4603421461897357, "grad_norm": 0.2390345363325179, "learning_rate": 1.0485885611939964e-05, "loss": 0.1222, "num_tokens": 92224010.0, "step": 939 }, { "epoch": 1.4618973561430793, "grad_norm": 0.24080498761185098, "learning_rate": 1.0451065755867896e-05, "loss": 0.1194, "num_tokens": 92322314.0, "step": 940 }, { "epoch": 1.463452566096423, "grad_norm": 0.22849886830127855, "learning_rate": 1.041631920056256e-05, "loss": 0.1139, "num_tokens": 92420618.0, "step": 941 }, { "epoch": 1.4650077760497666, "grad_norm": 0.22965429289530512, "learning_rate": 1.0381646166559363e-05, "loss": 0.1203, "num_tokens": 92518922.0, "step": 942 }, { "epoch": 1.4665629860031104, "grad_norm": 0.24947502133983523, "learning_rate": 1.0347046873927104e-05, "loss": 0.1156, "num_tokens": 92616912.0, "step": 943 }, { "epoch": 1.4681181959564542, "grad_norm": 0.23310022538921366, "learning_rate": 1.0312521542266536e-05, "loss": 0.118, "num_tokens": 92715216.0, "step": 944 }, { "epoch": 1.4696734059097978, "grad_norm": 0.2434242269071857, "learning_rate": 1.0278070390708992e-05, "loss": 0.1164, "num_tokens": 92813520.0, "step": 945 }, { "epoch": 1.4712286158631416, "grad_norm": 0.2446403270011219, "learning_rate": 1.0243693637914967e-05, "loss": 0.1202, "num_tokens": 92911824.0, "step": 946 }, { "epoch": 1.4727838258164851, "grad_norm": 0.24638779611320646, "learning_rate": 1.0209391502072767e-05, "loss": 0.1194, "num_tokens": 93010128.0, "step": 947 }, { "epoch": 1.474339035769829, "grad_norm": 0.24008804727529473, "learning_rate": 1.0175164200897103e-05, "loss": 0.1197, "num_tokens": 93108432.0, "step": 948 }, { "epoch": 1.4758942457231727, "grad_norm": 0.2406233597594623, "learning_rate": 1.0141011951627708e-05, "loss": 0.1183, "num_tokens": 93206736.0, "step": 949 }, { "epoch": 1.4774494556765163, "grad_norm": 0.23364753469696886, "learning_rate": 1.0106934971027967e-05, "loss": 0.1141, "num_tokens": 93305040.0, "step": 950 }, { "epoch": 1.47900466562986, "grad_norm": 0.22804612862015275, "learning_rate": 1.0072933475383533e-05, "loss": 0.1194, "num_tokens": 93403344.0, "step": 951 }, { "epoch": 1.4805598755832037, "grad_norm": 0.24097731211740045, "learning_rate": 1.0039007680500966e-05, "loss": 0.1202, "num_tokens": 93501648.0, "step": 952 }, { "epoch": 1.4821150855365475, "grad_norm": 0.22927571162364319, "learning_rate": 1.000515780170634e-05, "loss": 0.1202, "num_tokens": 93599952.0, "step": 953 }, { "epoch": 1.4836702954898913, "grad_norm": 0.2458276160622986, "learning_rate": 9.971384053843906e-06, "loss": 0.115, "num_tokens": 93698256.0, "step": 954 }, { "epoch": 1.4852255054432348, "grad_norm": 0.2457062102620966, "learning_rate": 9.937686651274712e-06, "loss": 0.1244, "num_tokens": 93796560.0, "step": 955 }, { "epoch": 1.4867807153965784, "grad_norm": 0.26202078573537935, "learning_rate": 9.904065807875227e-06, "loss": 0.1256, "num_tokens": 93894864.0, "step": 956 }, { "epoch": 1.4883359253499222, "grad_norm": 0.23669966744706547, "learning_rate": 9.870521737036031e-06, "loss": 0.1193, "num_tokens": 93993168.0, "step": 957 }, { "epoch": 1.489891135303266, "grad_norm": 0.2527891488424356, "learning_rate": 9.837054651660417e-06, "loss": 0.1259, "num_tokens": 94091472.0, "step": 958 }, { "epoch": 1.4914463452566096, "grad_norm": 0.23599048631775932, "learning_rate": 9.803664764163039e-06, "loss": 0.1135, "num_tokens": 94189776.0, "step": 959 }, { "epoch": 1.4930015552099534, "grad_norm": 0.2297069537016781, "learning_rate": 9.770352286468599e-06, "loss": 0.1182, "num_tokens": 94288080.0, "step": 960 }, { "epoch": 1.494556765163297, "grad_norm": 0.23302126746046317, "learning_rate": 9.737117430010467e-06, "loss": 0.1155, "num_tokens": 94386384.0, "step": 961 }, { "epoch": 1.4961119751166407, "grad_norm": 0.2303730379470508, "learning_rate": 9.703960405729374e-06, "loss": 0.1152, "num_tokens": 94484688.0, "step": 962 }, { "epoch": 1.4976671850699845, "grad_norm": 0.24277296943526766, "learning_rate": 9.670881424072016e-06, "loss": 0.1233, "num_tokens": 94582992.0, "step": 963 }, { "epoch": 1.499222395023328, "grad_norm": 0.2238902073588979, "learning_rate": 9.637880694989783e-06, "loss": 0.115, "num_tokens": 94681296.0, "step": 964 }, { "epoch": 1.500777604976672, "grad_norm": 0.23000847445886533, "learning_rate": 9.604958427937405e-06, "loss": 0.1183, "num_tokens": 94779600.0, "step": 965 }, { "epoch": 1.5023328149300155, "grad_norm": 0.2322664405531211, "learning_rate": 9.572114831871589e-06, "loss": 0.1182, "num_tokens": 94877904.0, "step": 966 }, { "epoch": 1.5038880248833593, "grad_norm": 0.227855455957449, "learning_rate": 9.539350115249734e-06, "loss": 0.1122, "num_tokens": 94976208.0, "step": 967 }, { "epoch": 1.505443234836703, "grad_norm": 0.2372283762656264, "learning_rate": 9.50666448602859e-06, "loss": 0.1212, "num_tokens": 95074512.0, "step": 968 }, { "epoch": 1.5069984447900466, "grad_norm": 0.33879625096896326, "learning_rate": 9.474058151662953e-06, "loss": 0.1103, "num_tokens": 95172816.0, "step": 969 }, { "epoch": 1.5085536547433902, "grad_norm": 0.2491375506505011, "learning_rate": 9.441531319104314e-06, "loss": 0.1213, "num_tokens": 95271120.0, "step": 970 }, { "epoch": 1.510108864696734, "grad_norm": 0.2420760385571861, "learning_rate": 9.409084194799588e-06, "loss": 0.1162, "num_tokens": 95369424.0, "step": 971 }, { "epoch": 1.5116640746500778, "grad_norm": 0.2417435228142586, "learning_rate": 9.376716984689772e-06, "loss": 0.1168, "num_tokens": 95467728.0, "step": 972 }, { "epoch": 1.5132192846034216, "grad_norm": 0.2506758364258815, "learning_rate": 9.34442989420866e-06, "loss": 0.1171, "num_tokens": 95566032.0, "step": 973 }, { "epoch": 1.5147744945567652, "grad_norm": 0.24400023116765582, "learning_rate": 9.31222312828152e-06, "loss": 0.1108, "num_tokens": 95664336.0, "step": 974 }, { "epoch": 1.5163297045101087, "grad_norm": 0.2555791605001545, "learning_rate": 9.28009689132381e-06, "loss": 0.1176, "num_tokens": 95762640.0, "step": 975 }, { "epoch": 1.5178849144634525, "grad_norm": 0.26908242509780067, "learning_rate": 9.24805138723987e-06, "loss": 0.1214, "num_tokens": 95860944.0, "step": 976 }, { "epoch": 1.5194401244167963, "grad_norm": 0.239108260246027, "learning_rate": 9.216086819421621e-06, "loss": 0.1236, "num_tokens": 95959248.0, "step": 977 }, { "epoch": 1.5209953343701401, "grad_norm": 0.2478559651689416, "learning_rate": 9.184203390747299e-06, "loss": 0.1217, "num_tokens": 96057552.0, "step": 978 }, { "epoch": 1.5225505443234837, "grad_norm": 0.24720346625287962, "learning_rate": 9.15240130358014e-06, "loss": 0.1305, "num_tokens": 96155856.0, "step": 979 }, { "epoch": 1.5241057542768273, "grad_norm": 0.2529950501795325, "learning_rate": 9.12068075976712e-06, "loss": 0.1181, "num_tokens": 96254160.0, "step": 980 }, { "epoch": 1.525660964230171, "grad_norm": 0.2727155131013765, "learning_rate": 9.08904196063765e-06, "loss": 0.1128, "num_tokens": 96352464.0, "step": 981 }, { "epoch": 1.5272161741835149, "grad_norm": 0.23877015529487056, "learning_rate": 9.057485107002313e-06, "loss": 0.1186, "num_tokens": 96450768.0, "step": 982 }, { "epoch": 1.5287713841368584, "grad_norm": 0.24620108829843318, "learning_rate": 9.026010399151598e-06, "loss": 0.1225, "num_tokens": 96549072.0, "step": 983 }, { "epoch": 1.5303265940902022, "grad_norm": 0.24023760322266094, "learning_rate": 8.994618036854594e-06, "loss": 0.1211, "num_tokens": 96647376.0, "step": 984 }, { "epoch": 1.5318818040435458, "grad_norm": 0.2273978251164774, "learning_rate": 8.963308219357758e-06, "loss": 0.1063, "num_tokens": 96745680.0, "step": 985 }, { "epoch": 1.5334370139968896, "grad_norm": 0.2317730463071571, "learning_rate": 8.93208114538365e-06, "loss": 0.1205, "num_tokens": 96843984.0, "step": 986 }, { "epoch": 1.5349922239502334, "grad_norm": 0.24684295124593486, "learning_rate": 8.900937013129633e-06, "loss": 0.1242, "num_tokens": 96942288.0, "step": 987 }, { "epoch": 1.536547433903577, "grad_norm": 0.22363323345962044, "learning_rate": 8.869876020266651e-06, "loss": 0.111, "num_tokens": 97040592.0, "step": 988 }, { "epoch": 1.5381026438569205, "grad_norm": 0.23856576073469396, "learning_rate": 8.838898363937986e-06, "loss": 0.1166, "num_tokens": 97138896.0, "step": 989 }, { "epoch": 1.5396578538102643, "grad_norm": 0.24335617381697627, "learning_rate": 8.808004240757952e-06, "loss": 0.1115, "num_tokens": 97237200.0, "step": 990 }, { "epoch": 1.5412130637636081, "grad_norm": 0.2346418336243782, "learning_rate": 8.777193846810703e-06, "loss": 0.1146, "num_tokens": 97335504.0, "step": 991 }, { "epoch": 1.542768273716952, "grad_norm": 0.22997663582623593, "learning_rate": 8.746467377648953e-06, "loss": 0.1117, "num_tokens": 97433808.0, "step": 992 }, { "epoch": 1.5443234836702955, "grad_norm": 0.2645231965406234, "learning_rate": 8.715825028292761e-06, "loss": 0.1182, "num_tokens": 97525408.0, "step": 993 }, { "epoch": 1.545878693623639, "grad_norm": 0.26992289630946864, "learning_rate": 8.685266993228266e-06, "loss": 0.1263, "num_tokens": 97623712.0, "step": 994 }, { "epoch": 1.5474339035769828, "grad_norm": 0.2444733573980823, "learning_rate": 8.654793466406473e-06, "loss": 0.1247, "num_tokens": 97722016.0, "step": 995 }, { "epoch": 1.5489891135303266, "grad_norm": 0.23487388030473644, "learning_rate": 8.624404641242014e-06, "loss": 0.1186, "num_tokens": 97820320.0, "step": 996 }, { "epoch": 1.5505443234836704, "grad_norm": 0.25117879785253444, "learning_rate": 8.594100710611928e-06, "loss": 0.1231, "num_tokens": 97918624.0, "step": 997 }, { "epoch": 1.552099533437014, "grad_norm": 0.23906917785688453, "learning_rate": 8.563881866854422e-06, "loss": 0.1183, "num_tokens": 98016928.0, "step": 998 }, { "epoch": 1.5536547433903576, "grad_norm": 0.23992428214318207, "learning_rate": 8.533748301767667e-06, "loss": 0.1201, "num_tokens": 98115232.0, "step": 999 }, { "epoch": 1.5552099533437014, "grad_norm": 0.24120738990122378, "learning_rate": 8.503700206608567e-06, "loss": 0.1196, "num_tokens": 98213536.0, "step": 1000 }, { "epoch": 1.5567651632970452, "grad_norm": 0.2386849852413916, "learning_rate": 8.473737772091549e-06, "loss": 0.1147, "num_tokens": 98311840.0, "step": 1001 }, { "epoch": 1.558320373250389, "grad_norm": 0.2619718890771935, "learning_rate": 8.443861188387357e-06, "loss": 0.1316, "num_tokens": 98410144.0, "step": 1002 }, { "epoch": 1.5598755832037325, "grad_norm": 0.24044825666597733, "learning_rate": 8.414070645121845e-06, "loss": 0.1208, "num_tokens": 98508448.0, "step": 1003 }, { "epoch": 1.5614307931570761, "grad_norm": 0.23073832021807839, "learning_rate": 8.384366331374772e-06, "loss": 0.1206, "num_tokens": 98606752.0, "step": 1004 }, { "epoch": 1.56298600311042, "grad_norm": 0.2365596942017463, "learning_rate": 8.354748435678587e-06, "loss": 0.1148, "num_tokens": 98705056.0, "step": 1005 }, { "epoch": 1.5645412130637637, "grad_norm": 0.23453793433250525, "learning_rate": 8.325217146017268e-06, "loss": 0.1205, "num_tokens": 98803360.0, "step": 1006 }, { "epoch": 1.5660964230171073, "grad_norm": 0.22161266911870406, "learning_rate": 8.295772649825093e-06, "loss": 0.1114, "num_tokens": 98901664.0, "step": 1007 }, { "epoch": 1.5676516329704508, "grad_norm": 0.2278706669782075, "learning_rate": 8.266415133985458e-06, "loss": 0.1141, "num_tokens": 98999968.0, "step": 1008 }, { "epoch": 1.5692068429237946, "grad_norm": 0.22166410066787945, "learning_rate": 8.237144784829709e-06, "loss": 0.1127, "num_tokens": 99098272.0, "step": 1009 }, { "epoch": 1.5707620528771384, "grad_norm": 0.2374572508514638, "learning_rate": 8.207961788135955e-06, "loss": 0.1153, "num_tokens": 99196576.0, "step": 1010 }, { "epoch": 1.5723172628304822, "grad_norm": 0.24029166641712962, "learning_rate": 8.17886632912785e-06, "loss": 0.1151, "num_tokens": 99294880.0, "step": 1011 }, { "epoch": 1.5738724727838258, "grad_norm": 0.24444261279650253, "learning_rate": 8.149858592473475e-06, "loss": 0.1235, "num_tokens": 99393184.0, "step": 1012 }, { "epoch": 1.5754276827371694, "grad_norm": 0.23349497252110385, "learning_rate": 8.120938762284146e-06, "loss": 0.1165, "num_tokens": 99491488.0, "step": 1013 }, { "epoch": 1.5769828926905132, "grad_norm": 0.23050700384783576, "learning_rate": 8.092107022113214e-06, "loss": 0.1132, "num_tokens": 99589792.0, "step": 1014 }, { "epoch": 1.578538102643857, "grad_norm": 0.2388528286164938, "learning_rate": 8.06336355495494e-06, "loss": 0.1111, "num_tokens": 99688096.0, "step": 1015 }, { "epoch": 1.5800933125972008, "grad_norm": 0.2238920378432902, "learning_rate": 8.034708543243322e-06, "loss": 0.1123, "num_tokens": 99786400.0, "step": 1016 }, { "epoch": 1.5816485225505443, "grad_norm": 0.2282049848829997, "learning_rate": 8.006142168850925e-06, "loss": 0.1112, "num_tokens": 99884704.0, "step": 1017 }, { "epoch": 1.583203732503888, "grad_norm": 0.23382478197821732, "learning_rate": 7.977664613087734e-06, "loss": 0.1118, "num_tokens": 99983008.0, "step": 1018 }, { "epoch": 1.5847589424572317, "grad_norm": 0.2581306319003927, "learning_rate": 7.949276056700012e-06, "loss": 0.1133, "num_tokens": 100081312.0, "step": 1019 }, { "epoch": 1.5863141524105755, "grad_norm": 0.24810156758640295, "learning_rate": 7.920976679869142e-06, "loss": 0.1161, "num_tokens": 100179616.0, "step": 1020 }, { "epoch": 1.5878693623639193, "grad_norm": 0.23758672617441118, "learning_rate": 7.892766662210489e-06, "loss": 0.1145, "num_tokens": 100277920.0, "step": 1021 }, { "epoch": 1.5894245723172629, "grad_norm": 0.23646892975212838, "learning_rate": 7.864646182772256e-06, "loss": 0.1179, "num_tokens": 100376224.0, "step": 1022 }, { "epoch": 1.5909797822706064, "grad_norm": 0.23492472192546082, "learning_rate": 7.836615420034346e-06, "loss": 0.1186, "num_tokens": 100474528.0, "step": 1023 }, { "epoch": 1.5925349922239502, "grad_norm": 0.24148032579454623, "learning_rate": 7.808674551907245e-06, "loss": 0.1082, "num_tokens": 100572832.0, "step": 1024 }, { "epoch": 1.594090202177294, "grad_norm": 0.24988134439693527, "learning_rate": 7.780823755730859e-06, "loss": 0.1166, "num_tokens": 100671136.0, "step": 1025 }, { "epoch": 1.5956454121306376, "grad_norm": 0.23944329187003088, "learning_rate": 7.753063208273429e-06, "loss": 0.1133, "num_tokens": 100769440.0, "step": 1026 }, { "epoch": 1.5972006220839814, "grad_norm": 0.24361745439351928, "learning_rate": 7.725393085730381e-06, "loss": 0.1173, "num_tokens": 100867744.0, "step": 1027 }, { "epoch": 1.598755832037325, "grad_norm": 0.24195238430872743, "learning_rate": 7.697813563723222e-06, "loss": 0.1212, "num_tokens": 100966048.0, "step": 1028 }, { "epoch": 1.6003110419906688, "grad_norm": 0.25313607741206406, "learning_rate": 7.670324817298414e-06, "loss": 0.1235, "num_tokens": 101064352.0, "step": 1029 }, { "epoch": 1.6018662519440126, "grad_norm": 0.2231112428646485, "learning_rate": 7.642927020926269e-06, "loss": 0.1136, "num_tokens": 101162656.0, "step": 1030 }, { "epoch": 1.6034214618973561, "grad_norm": 0.28674099279011156, "learning_rate": 7.6156203484998546e-06, "loss": 0.1197, "num_tokens": 101260960.0, "step": 1031 }, { "epoch": 1.6049766718506997, "grad_norm": 0.23990735718847947, "learning_rate": 7.588404973333852e-06, "loss": 0.118, "num_tokens": 101359264.0, "step": 1032 }, { "epoch": 1.6065318818040435, "grad_norm": 0.23226077606513393, "learning_rate": 7.5612810681635064e-06, "loss": 0.1181, "num_tokens": 101457568.0, "step": 1033 }, { "epoch": 1.6080870917573873, "grad_norm": 0.2338554584329969, "learning_rate": 7.534248805143487e-06, "loss": 0.1103, "num_tokens": 101555872.0, "step": 1034 }, { "epoch": 1.609642301710731, "grad_norm": 0.2521365080806678, "learning_rate": 7.507308355846833e-06, "loss": 0.1273, "num_tokens": 101654176.0, "step": 1035 }, { "epoch": 1.6111975116640747, "grad_norm": 0.22072260926399442, "learning_rate": 7.480459891263812e-06, "loss": 0.1133, "num_tokens": 101752480.0, "step": 1036 }, { "epoch": 1.6127527216174182, "grad_norm": 0.28112308527295793, "learning_rate": 7.453703581800904e-06, "loss": 0.1192, "num_tokens": 101850784.0, "step": 1037 }, { "epoch": 1.614307931570762, "grad_norm": 0.24131439042084155, "learning_rate": 7.427039597279667e-06, "loss": 0.1195, "num_tokens": 101949088.0, "step": 1038 }, { "epoch": 1.6158631415241058, "grad_norm": 0.2407311622700103, "learning_rate": 7.4004681069356655e-06, "loss": 0.1141, "num_tokens": 102047392.0, "step": 1039 }, { "epoch": 1.6174183514774496, "grad_norm": 0.2342434540014886, "learning_rate": 7.373989279417418e-06, "loss": 0.1063, "num_tokens": 102145696.0, "step": 1040 }, { "epoch": 1.6189735614307932, "grad_norm": 0.22518291273666124, "learning_rate": 7.347603282785315e-06, "loss": 0.1117, "num_tokens": 102244000.0, "step": 1041 }, { "epoch": 1.6205287713841368, "grad_norm": 0.21919665028775406, "learning_rate": 7.32131028451054e-06, "loss": 0.1146, "num_tokens": 102342304.0, "step": 1042 }, { "epoch": 1.6220839813374806, "grad_norm": 0.25237061438763414, "learning_rate": 7.295110451474034e-06, "loss": 0.1209, "num_tokens": 102437208.0, "step": 1043 }, { "epoch": 1.6236391912908243, "grad_norm": 0.2675731532575791, "learning_rate": 7.269003949965412e-06, "loss": 0.1284, "num_tokens": 102535512.0, "step": 1044 }, { "epoch": 1.625194401244168, "grad_norm": 0.2304753172939709, "learning_rate": 7.242990945681918e-06, "loss": 0.113, "num_tokens": 102633816.0, "step": 1045 }, { "epoch": 1.6267496111975117, "grad_norm": 0.2490745322710372, "learning_rate": 7.217071603727372e-06, "loss": 0.128, "num_tokens": 102732120.0, "step": 1046 }, { "epoch": 1.6283048211508553, "grad_norm": 0.25479731262653144, "learning_rate": 7.1912460886111234e-06, "loss": 0.1216, "num_tokens": 102830424.0, "step": 1047 }, { "epoch": 1.629860031104199, "grad_norm": 0.22670208367397873, "learning_rate": 7.1655145642470034e-06, "loss": 0.1114, "num_tokens": 102928728.0, "step": 1048 }, { "epoch": 1.6314152410575429, "grad_norm": 0.24038098960789286, "learning_rate": 7.1398771939522825e-06, "loss": 0.1188, "num_tokens": 103027032.0, "step": 1049 }, { "epoch": 1.6329704510108864, "grad_norm": 0.23795172322645147, "learning_rate": 7.114334140446643e-06, "loss": 0.1194, "num_tokens": 103125336.0, "step": 1050 }, { "epoch": 1.63452566096423, "grad_norm": 0.23628510632713184, "learning_rate": 7.088885565851143e-06, "loss": 0.1169, "num_tokens": 103223640.0, "step": 1051 }, { "epoch": 1.6360808709175738, "grad_norm": 0.25235924536645904, "learning_rate": 7.0635316316871795e-06, "loss": 0.1118, "num_tokens": 103321944.0, "step": 1052 }, { "epoch": 1.6376360808709176, "grad_norm": 0.3195977632742205, "learning_rate": 7.0382724988754755e-06, "loss": 0.1161, "num_tokens": 103420248.0, "step": 1053 }, { "epoch": 1.6391912908242614, "grad_norm": 0.23656344346945493, "learning_rate": 7.013108327735048e-06, "loss": 0.1147, "num_tokens": 103518552.0, "step": 1054 }, { "epoch": 1.640746500777605, "grad_norm": 0.29302298015694267, "learning_rate": 6.988039277982201e-06, "loss": 0.1136, "num_tokens": 103616856.0, "step": 1055 }, { "epoch": 1.6423017107309485, "grad_norm": 0.24697903754461725, "learning_rate": 6.96306550872949e-06, "loss": 0.1195, "num_tokens": 103715160.0, "step": 1056 }, { "epoch": 1.6438569206842923, "grad_norm": 0.25817833068577906, "learning_rate": 6.938187178484747e-06, "loss": 0.1147, "num_tokens": 103813464.0, "step": 1057 }, { "epoch": 1.6454121306376361, "grad_norm": 0.23021153233033995, "learning_rate": 6.913404445150045e-06, "loss": 0.1144, "num_tokens": 103911768.0, "step": 1058 }, { "epoch": 1.64696734059098, "grad_norm": 0.23671196723564147, "learning_rate": 6.888717466020713e-06, "loss": 0.1126, "num_tokens": 104010072.0, "step": 1059 }, { "epoch": 1.6485225505443235, "grad_norm": 0.24260815600929325, "learning_rate": 6.864126397784312e-06, "loss": 0.1137, "num_tokens": 104108376.0, "step": 1060 }, { "epoch": 1.650077760497667, "grad_norm": 0.2516172272902628, "learning_rate": 6.839631396519686e-06, "loss": 0.1192, "num_tokens": 104206680.0, "step": 1061 }, { "epoch": 1.6516329704510109, "grad_norm": 0.2748489620050858, "learning_rate": 6.815232617695933e-06, "loss": 0.1164, "num_tokens": 104304984.0, "step": 1062 }, { "epoch": 1.6531881804043547, "grad_norm": 0.255902129910511, "learning_rate": 6.790930216171418e-06, "loss": 0.1227, "num_tokens": 104403288.0, "step": 1063 }, { "epoch": 1.6547433903576982, "grad_norm": 0.23418417881746753, "learning_rate": 6.766724346192817e-06, "loss": 0.1104, "num_tokens": 104501592.0, "step": 1064 }, { "epoch": 1.656298600311042, "grad_norm": 0.24907267458801582, "learning_rate": 6.742615161394123e-06, "loss": 0.1148, "num_tokens": 104599896.0, "step": 1065 }, { "epoch": 1.6578538102643856, "grad_norm": 0.24530905558779278, "learning_rate": 6.7186028147956585e-06, "loss": 0.1118, "num_tokens": 104698200.0, "step": 1066 }, { "epoch": 1.6594090202177294, "grad_norm": 0.2501881466569968, "learning_rate": 6.6946874588031275e-06, "loss": 0.1227, "num_tokens": 104796504.0, "step": 1067 }, { "epoch": 1.6609642301710732, "grad_norm": 0.23065613501809226, "learning_rate": 6.670869245206635e-06, "loss": 0.1082, "num_tokens": 104894808.0, "step": 1068 }, { "epoch": 1.6625194401244168, "grad_norm": 0.22585994027439044, "learning_rate": 6.647148325179722e-06, "loss": 0.1112, "num_tokens": 104993112.0, "step": 1069 }, { "epoch": 1.6640746500777603, "grad_norm": 0.23594461140395456, "learning_rate": 6.623524849278416e-06, "loss": 0.1186, "num_tokens": 105091416.0, "step": 1070 }, { "epoch": 1.6656298600311041, "grad_norm": 0.2315620888643486, "learning_rate": 6.599998967440266e-06, "loss": 0.1162, "num_tokens": 105189720.0, "step": 1071 }, { "epoch": 1.667185069984448, "grad_norm": 0.2270588388118617, "learning_rate": 6.576570828983397e-06, "loss": 0.1136, "num_tokens": 105288024.0, "step": 1072 }, { "epoch": 1.6687402799377917, "grad_norm": 0.24554442490476408, "learning_rate": 6.553240582605551e-06, "loss": 0.113, "num_tokens": 105386328.0, "step": 1073 }, { "epoch": 1.6702954898911353, "grad_norm": 0.23610319576137076, "learning_rate": 6.530008376383158e-06, "loss": 0.1149, "num_tokens": 105484632.0, "step": 1074 }, { "epoch": 1.6718506998444789, "grad_norm": 0.2324571894249009, "learning_rate": 6.5068743577703895e-06, "loss": 0.1178, "num_tokens": 105582936.0, "step": 1075 }, { "epoch": 1.6734059097978227, "grad_norm": 0.3674994505336357, "learning_rate": 6.483838673598228e-06, "loss": 0.1147, "num_tokens": 105681240.0, "step": 1076 }, { "epoch": 1.6749611197511665, "grad_norm": 0.24243194197251203, "learning_rate": 6.460901470073518e-06, "loss": 0.1137, "num_tokens": 105779544.0, "step": 1077 }, { "epoch": 1.6765163297045103, "grad_norm": 0.24202700682014475, "learning_rate": 6.438062892778063e-06, "loss": 0.1169, "num_tokens": 105877848.0, "step": 1078 }, { "epoch": 1.6780715396578538, "grad_norm": 0.23118777068706425, "learning_rate": 6.415323086667682e-06, "loss": 0.1146, "num_tokens": 105976152.0, "step": 1079 }, { "epoch": 1.6796267496111974, "grad_norm": 0.24237012765005606, "learning_rate": 6.392682196071289e-06, "loss": 0.1194, "num_tokens": 106074456.0, "step": 1080 }, { "epoch": 1.6811819595645412, "grad_norm": 0.22524933301241987, "learning_rate": 6.370140364689999e-06, "loss": 0.1123, "num_tokens": 106172760.0, "step": 1081 }, { "epoch": 1.682737169517885, "grad_norm": 0.23160147762411873, "learning_rate": 6.3476977355961885e-06, "loss": 0.1142, "num_tokens": 106271064.0, "step": 1082 }, { "epoch": 1.6842923794712286, "grad_norm": 0.25601435313748966, "learning_rate": 6.325354451232612e-06, "loss": 0.1276, "num_tokens": 106369368.0, "step": 1083 }, { "epoch": 1.6858475894245724, "grad_norm": 0.24392332438817838, "learning_rate": 6.303110653411462e-06, "loss": 0.1165, "num_tokens": 106467672.0, "step": 1084 }, { "epoch": 1.687402799377916, "grad_norm": 0.2510114644237995, "learning_rate": 6.280966483313528e-06, "loss": 0.1184, "num_tokens": 106565976.0, "step": 1085 }, { "epoch": 1.6889580093312597, "grad_norm": 0.2463244702411794, "learning_rate": 6.25892208148724e-06, "loss": 0.1201, "num_tokens": 106664280.0, "step": 1086 }, { "epoch": 1.6905132192846035, "grad_norm": 0.2599952971498088, "learning_rate": 6.236977587847801e-06, "loss": 0.1222, "num_tokens": 106762584.0, "step": 1087 }, { "epoch": 1.692068429237947, "grad_norm": 0.22132402378330054, "learning_rate": 6.215133141676312e-06, "loss": 0.1062, "num_tokens": 106860888.0, "step": 1088 }, { "epoch": 1.6936236391912907, "grad_norm": 0.23204452657381527, "learning_rate": 6.193388881618868e-06, "loss": 0.1132, "num_tokens": 106959192.0, "step": 1089 }, { "epoch": 1.6951788491446345, "grad_norm": 0.24344645550692862, "learning_rate": 6.171744945685692e-06, "loss": 0.1212, "num_tokens": 107057496.0, "step": 1090 }, { "epoch": 1.6967340590979783, "grad_norm": 0.2493895025188908, "learning_rate": 6.15020147125024e-06, "loss": 0.1204, "num_tokens": 107155800.0, "step": 1091 }, { "epoch": 1.698289269051322, "grad_norm": 0.23713255201749872, "learning_rate": 6.128758595048347e-06, "loss": 0.1088, "num_tokens": 107254104.0, "step": 1092 }, { "epoch": 1.6998444790046656, "grad_norm": 0.24300805840221984, "learning_rate": 6.107416453177371e-06, "loss": 0.1128, "num_tokens": 107345455.0, "step": 1093 }, { "epoch": 1.7013996889580092, "grad_norm": 0.24171958197041804, "learning_rate": 6.086175181095282e-06, "loss": 0.1195, "num_tokens": 107443759.0, "step": 1094 }, { "epoch": 1.702954898911353, "grad_norm": 0.24169596782659672, "learning_rate": 6.065034913619855e-06, "loss": 0.1255, "num_tokens": 107542063.0, "step": 1095 }, { "epoch": 1.7045101088646968, "grad_norm": 0.24163773853825565, "learning_rate": 6.043995784927785e-06, "loss": 0.1153, "num_tokens": 107640367.0, "step": 1096 }, { "epoch": 1.7060653188180406, "grad_norm": 0.22984894909085157, "learning_rate": 6.023057928553832e-06, "loss": 0.1109, "num_tokens": 107738671.0, "step": 1097 }, { "epoch": 1.7076205287713841, "grad_norm": 0.2508054859039707, "learning_rate": 6.00222147739e-06, "loss": 0.1243, "num_tokens": 107836975.0, "step": 1098 }, { "epoch": 1.7091757387247277, "grad_norm": 0.23708560433656875, "learning_rate": 5.9814865636846685e-06, "loss": 0.1197, "num_tokens": 107935279.0, "step": 1099 }, { "epoch": 1.7107309486780715, "grad_norm": 0.23916646860094032, "learning_rate": 5.9608533190417584e-06, "loss": 0.1133, "num_tokens": 108033583.0, "step": 1100 }, { "epoch": 1.7122861586314153, "grad_norm": 0.23137797342531316, "learning_rate": 5.940321874419906e-06, "loss": 0.1175, "num_tokens": 108131887.0, "step": 1101 }, { "epoch": 1.713841368584759, "grad_norm": 0.23651926561339875, "learning_rate": 5.919892360131625e-06, "loss": 0.1135, "num_tokens": 108230191.0, "step": 1102 }, { "epoch": 1.7153965785381027, "grad_norm": 0.24220228293038878, "learning_rate": 5.899564905842478e-06, "loss": 0.1219, "num_tokens": 108328495.0, "step": 1103 }, { "epoch": 1.7169517884914463, "grad_norm": 0.23208119131930194, "learning_rate": 5.879339640570253e-06, "loss": 0.1157, "num_tokens": 108426799.0, "step": 1104 }, { "epoch": 1.71850699844479, "grad_norm": 0.2615251349354323, "learning_rate": 5.859216692684151e-06, "loss": 0.11, "num_tokens": 108525103.0, "step": 1105 }, { "epoch": 1.7200622083981338, "grad_norm": 0.24358547982252307, "learning_rate": 5.839196189903968e-06, "loss": 0.1173, "num_tokens": 108623407.0, "step": 1106 }, { "epoch": 1.7216174183514774, "grad_norm": 0.23969630993017224, "learning_rate": 5.819278259299286e-06, "loss": 0.1181, "num_tokens": 108721711.0, "step": 1107 }, { "epoch": 1.723172628304821, "grad_norm": 0.2446672393045858, "learning_rate": 5.799463027288647e-06, "loss": 0.1216, "num_tokens": 108820015.0, "step": 1108 }, { "epoch": 1.7247278382581648, "grad_norm": 0.2288207840815691, "learning_rate": 5.779750619638796e-06, "loss": 0.1138, "num_tokens": 108918319.0, "step": 1109 }, { "epoch": 1.7262830482115086, "grad_norm": 0.22953642893864118, "learning_rate": 5.7601411614638385e-06, "loss": 0.1147, "num_tokens": 109016623.0, "step": 1110 }, { "epoch": 1.7278382581648524, "grad_norm": 0.23482959929151484, "learning_rate": 5.740634777224455e-06, "loss": 0.116, "num_tokens": 109114927.0, "step": 1111 }, { "epoch": 1.729393468118196, "grad_norm": 0.2382311166730701, "learning_rate": 5.7212315907271346e-06, "loss": 0.1181, "num_tokens": 109213231.0, "step": 1112 }, { "epoch": 1.7309486780715395, "grad_norm": 0.2342526102089699, "learning_rate": 5.701931725123362e-06, "loss": 0.1173, "num_tokens": 109311535.0, "step": 1113 }, { "epoch": 1.7325038880248833, "grad_norm": 0.24198878305828778, "learning_rate": 5.682735302908857e-06, "loss": 0.1184, "num_tokens": 109409839.0, "step": 1114 }, { "epoch": 1.734059097978227, "grad_norm": 0.2969900690839277, "learning_rate": 5.663642445922777e-06, "loss": 0.1192, "num_tokens": 109508143.0, "step": 1115 }, { "epoch": 1.735614307931571, "grad_norm": 0.23485264728709546, "learning_rate": 5.644653275346954e-06, "loss": 0.1176, "num_tokens": 109606447.0, "step": 1116 }, { "epoch": 1.7371695178849145, "grad_norm": 0.2355832004213927, "learning_rate": 5.6257679117051386e-06, "loss": 0.1124, "num_tokens": 109704751.0, "step": 1117 }, { "epoch": 1.738724727838258, "grad_norm": 0.2629036360991254, "learning_rate": 5.606986474862207e-06, "loss": 0.1219, "num_tokens": 109803055.0, "step": 1118 }, { "epoch": 1.7402799377916018, "grad_norm": 0.22932639648123562, "learning_rate": 5.58830908402342e-06, "loss": 0.1146, "num_tokens": 109901359.0, "step": 1119 }, { "epoch": 1.7418351477449456, "grad_norm": 0.21651485315268038, "learning_rate": 5.569735857733663e-06, "loss": 0.1048, "num_tokens": 109999663.0, "step": 1120 }, { "epoch": 1.7433903576982894, "grad_norm": 0.266064002716142, "learning_rate": 5.551266913876693e-06, "loss": 0.1197, "num_tokens": 110097967.0, "step": 1121 }, { "epoch": 1.744945567651633, "grad_norm": 0.24103468071561196, "learning_rate": 5.53290236967438e-06, "loss": 0.1203, "num_tokens": 110196271.0, "step": 1122 }, { "epoch": 1.7465007776049766, "grad_norm": 0.23374617589014193, "learning_rate": 5.514642341685983e-06, "loss": 0.1163, "num_tokens": 110294575.0, "step": 1123 }, { "epoch": 1.7480559875583204, "grad_norm": 0.2324525252479243, "learning_rate": 5.496486945807394e-06, "loss": 0.1141, "num_tokens": 110392879.0, "step": 1124 }, { "epoch": 1.7496111975116642, "grad_norm": 0.4472443470428658, "learning_rate": 5.478436297270408e-06, "loss": 0.1129, "num_tokens": 110491183.0, "step": 1125 }, { "epoch": 1.7511664074650077, "grad_norm": 0.2647285280646164, "learning_rate": 5.4604905106419995e-06, "loss": 0.1146, "num_tokens": 110589487.0, "step": 1126 }, { "epoch": 1.7527216174183515, "grad_norm": 0.24938302620536143, "learning_rate": 5.442649699823575e-06, "loss": 0.1257, "num_tokens": 110687791.0, "step": 1127 }, { "epoch": 1.754276827371695, "grad_norm": 0.25733682377759565, "learning_rate": 5.424913978050268e-06, "loss": 0.1173, "num_tokens": 110786095.0, "step": 1128 }, { "epoch": 1.755832037325039, "grad_norm": 0.2345657288570333, "learning_rate": 5.407283457890216e-06, "loss": 0.1138, "num_tokens": 110884399.0, "step": 1129 }, { "epoch": 1.7573872472783827, "grad_norm": 0.23216447226462245, "learning_rate": 5.389758251243843e-06, "loss": 0.1119, "num_tokens": 110982703.0, "step": 1130 }, { "epoch": 1.7589424572317263, "grad_norm": 0.24635686337259693, "learning_rate": 5.372338469343153e-06, "loss": 0.119, "num_tokens": 111081007.0, "step": 1131 }, { "epoch": 1.7604976671850698, "grad_norm": 0.23676007626286039, "learning_rate": 5.355024222751015e-06, "loss": 0.1141, "num_tokens": 111179311.0, "step": 1132 }, { "epoch": 1.7620528771384136, "grad_norm": 0.23865154857910023, "learning_rate": 5.337815621360483e-06, "loss": 0.113, "num_tokens": 111277615.0, "step": 1133 }, { "epoch": 1.7636080870917574, "grad_norm": 0.24732142905764834, "learning_rate": 5.3207127743940725e-06, "loss": 0.1186, "num_tokens": 111375919.0, "step": 1134 }, { "epoch": 1.7651632970451012, "grad_norm": 0.2309438933031769, "learning_rate": 5.3037157904030755e-06, "loss": 0.1108, "num_tokens": 111474223.0, "step": 1135 }, { "epoch": 1.7667185069984448, "grad_norm": 0.22491733738803257, "learning_rate": 5.2868247772668835e-06, "loss": 0.1062, "num_tokens": 111572527.0, "step": 1136 }, { "epoch": 1.7682737169517884, "grad_norm": 0.2320843596216636, "learning_rate": 5.270039842192291e-06, "loss": 0.1105, "num_tokens": 111670831.0, "step": 1137 }, { "epoch": 1.7698289269051322, "grad_norm": 0.2343663878889808, "learning_rate": 5.2533610917128205e-06, "loss": 0.1157, "num_tokens": 111769135.0, "step": 1138 }, { "epoch": 1.771384136858476, "grad_norm": 0.24544927648299983, "learning_rate": 5.2367886316880325e-06, "loss": 0.1231, "num_tokens": 111867439.0, "step": 1139 }, { "epoch": 1.7729393468118197, "grad_norm": 0.24361999048078606, "learning_rate": 5.220322567302874e-06, "loss": 0.1138, "num_tokens": 111965743.0, "step": 1140 }, { "epoch": 1.7744945567651633, "grad_norm": 0.24426175196852432, "learning_rate": 5.2039630030670115e-06, "loss": 0.1175, "num_tokens": 112064047.0, "step": 1141 }, { "epoch": 1.776049766718507, "grad_norm": 0.244276265105335, "learning_rate": 5.187710042814138e-06, "loss": 0.116, "num_tokens": 112162351.0, "step": 1142 }, { "epoch": 1.7776049766718507, "grad_norm": 0.2657159871579296, "learning_rate": 5.1715637897013446e-06, "loss": 0.1171, "num_tokens": 112260417.0, "step": 1143 }, { "epoch": 1.7791601866251945, "grad_norm": 0.23369428820910815, "learning_rate": 5.155524346208456e-06, "loss": 0.117, "num_tokens": 112358721.0, "step": 1144 }, { "epoch": 1.780715396578538, "grad_norm": 0.23272198396286217, "learning_rate": 5.1395918141373795e-06, "loss": 0.1099, "num_tokens": 112457025.0, "step": 1145 }, { "epoch": 1.7822706065318819, "grad_norm": 0.23301105646379042, "learning_rate": 5.123766294611448e-06, "loss": 0.115, "num_tokens": 112555329.0, "step": 1146 }, { "epoch": 1.7838258164852254, "grad_norm": 0.24964959443498663, "learning_rate": 5.108047888074803e-06, "loss": 0.1217, "num_tokens": 112653633.0, "step": 1147 }, { "epoch": 1.7853810264385692, "grad_norm": 0.2404215519356634, "learning_rate": 5.092436694291736e-06, "loss": 0.117, "num_tokens": 112751937.0, "step": 1148 }, { "epoch": 1.786936236391913, "grad_norm": 0.22071897395479617, "learning_rate": 5.076932812346062e-06, "loss": 0.1082, "num_tokens": 112850241.0, "step": 1149 }, { "epoch": 1.7884914463452566, "grad_norm": 0.2350383377855435, "learning_rate": 5.0615363406404914e-06, "loss": 0.1189, "num_tokens": 112948545.0, "step": 1150 }, { "epoch": 1.7900466562986002, "grad_norm": 0.24730799846242238, "learning_rate": 5.046247376896008e-06, "loss": 0.1167, "num_tokens": 113046849.0, "step": 1151 }, { "epoch": 1.791601866251944, "grad_norm": 0.2433515699316054, "learning_rate": 5.031066018151238e-06, "loss": 0.1211, "num_tokens": 113145153.0, "step": 1152 }, { "epoch": 1.7931570762052877, "grad_norm": 0.23691409680355138, "learning_rate": 5.0159923607618495e-06, "loss": 0.1111, "num_tokens": 113243457.0, "step": 1153 }, { "epoch": 1.7947122861586315, "grad_norm": 0.24606930004965644, "learning_rate": 5.001026500399926e-06, "loss": 0.1189, "num_tokens": 113341761.0, "step": 1154 }, { "epoch": 1.7962674961119751, "grad_norm": 0.23567706906017602, "learning_rate": 4.986168532053376e-06, "loss": 0.1155, "num_tokens": 113440065.0, "step": 1155 }, { "epoch": 1.7978227060653187, "grad_norm": 0.24709924283298534, "learning_rate": 4.971418550025305e-06, "loss": 0.1229, "num_tokens": 113538369.0, "step": 1156 }, { "epoch": 1.7993779160186625, "grad_norm": 0.2520341126613842, "learning_rate": 4.956776647933449e-06, "loss": 0.1137, "num_tokens": 113636673.0, "step": 1157 }, { "epoch": 1.8009331259720063, "grad_norm": 0.2427471358933041, "learning_rate": 4.9422429187095586e-06, "loss": 0.1132, "num_tokens": 113734977.0, "step": 1158 }, { "epoch": 1.80248833592535, "grad_norm": 0.24552120510893494, "learning_rate": 4.927817454598804e-06, "loss": 0.121, "num_tokens": 113833281.0, "step": 1159 }, { "epoch": 1.8040435458786936, "grad_norm": 0.24996123522650368, "learning_rate": 4.913500347159209e-06, "loss": 0.1199, "num_tokens": 113931585.0, "step": 1160 }, { "epoch": 1.8055987558320372, "grad_norm": 0.2532293994946295, "learning_rate": 4.899291687261064e-06, "loss": 0.1124, "num_tokens": 114029889.0, "step": 1161 }, { "epoch": 1.807153965785381, "grad_norm": 0.22689220184704512, "learning_rate": 4.885191565086341e-06, "loss": 0.1079, "num_tokens": 114128193.0, "step": 1162 }, { "epoch": 1.8087091757387248, "grad_norm": 0.23776702672295633, "learning_rate": 4.87120007012812e-06, "loss": 0.1161, "num_tokens": 114226497.0, "step": 1163 }, { "epoch": 1.8102643856920684, "grad_norm": 0.2352505550382406, "learning_rate": 4.857317291190034e-06, "loss": 0.1181, "num_tokens": 114324801.0, "step": 1164 }, { "epoch": 1.8118195956454122, "grad_norm": 0.26100199887118003, "learning_rate": 4.8435433163857035e-06, "loss": 0.1254, "num_tokens": 114423105.0, "step": 1165 }, { "epoch": 1.8133748055987557, "grad_norm": 0.24134567356795733, "learning_rate": 4.829878233138161e-06, "loss": 0.1092, "num_tokens": 114521409.0, "step": 1166 }, { "epoch": 1.8149300155520995, "grad_norm": 0.230559532326562, "learning_rate": 4.816322128179316e-06, "loss": 0.11, "num_tokens": 114619713.0, "step": 1167 }, { "epoch": 1.8164852255054433, "grad_norm": 0.30048758898074923, "learning_rate": 4.802875087549388e-06, "loss": 0.1135, "num_tokens": 114718017.0, "step": 1168 }, { "epoch": 1.818040435458787, "grad_norm": 0.2511568611657092, "learning_rate": 4.7895371965963776e-06, "loss": 0.1134, "num_tokens": 114816321.0, "step": 1169 }, { "epoch": 1.8195956454121305, "grad_norm": 0.22394423200431665, "learning_rate": 4.776308539975507e-06, "loss": 0.1082, "num_tokens": 114914625.0, "step": 1170 }, { "epoch": 1.8211508553654743, "grad_norm": 0.22848108869302045, "learning_rate": 4.763189201648696e-06, "loss": 0.1181, "num_tokens": 115012929.0, "step": 1171 }, { "epoch": 1.822706065318818, "grad_norm": 0.31364749577083223, "learning_rate": 4.750179264884019e-06, "loss": 0.1197, "num_tokens": 115111233.0, "step": 1172 }, { "epoch": 1.8242612752721619, "grad_norm": 0.23470332667448177, "learning_rate": 4.73727881225519e-06, "loss": 0.1153, "num_tokens": 115209537.0, "step": 1173 }, { "epoch": 1.8258164852255054, "grad_norm": 0.23346116377677373, "learning_rate": 4.724487925641022e-06, "loss": 0.1079, "num_tokens": 115307841.0, "step": 1174 }, { "epoch": 1.827371695178849, "grad_norm": 0.25528632767843423, "learning_rate": 4.711806686224923e-06, "loss": 0.1114, "num_tokens": 115406145.0, "step": 1175 }, { "epoch": 1.8289269051321928, "grad_norm": 0.23561706434305543, "learning_rate": 4.69923517449437e-06, "loss": 0.116, "num_tokens": 115504449.0, "step": 1176 }, { "epoch": 1.8304821150855366, "grad_norm": 0.228552735444969, "learning_rate": 4.686773470240394e-06, "loss": 0.1073, "num_tokens": 115602753.0, "step": 1177 }, { "epoch": 1.8320373250388804, "grad_norm": 0.22563743629473423, "learning_rate": 4.6744216525570934e-06, "loss": 0.1075, "num_tokens": 115701057.0, "step": 1178 }, { "epoch": 1.833592534992224, "grad_norm": 0.24345501703428782, "learning_rate": 4.662179799841115e-06, "loss": 0.1155, "num_tokens": 115799361.0, "step": 1179 }, { "epoch": 1.8351477449455675, "grad_norm": 0.22949069548388754, "learning_rate": 4.650047989791151e-06, "loss": 0.111, "num_tokens": 115897665.0, "step": 1180 }, { "epoch": 1.8367029548989113, "grad_norm": 0.2287529714643926, "learning_rate": 4.6380262994074755e-06, "loss": 0.1126, "num_tokens": 115995969.0, "step": 1181 }, { "epoch": 1.8382581648522551, "grad_norm": 0.24074605882069178, "learning_rate": 4.626114804991422e-06, "loss": 0.1161, "num_tokens": 116094273.0, "step": 1182 }, { "epoch": 1.839813374805599, "grad_norm": 0.2523309290499349, "learning_rate": 4.614313582144913e-06, "loss": 0.1138, "num_tokens": 116192577.0, "step": 1183 }, { "epoch": 1.8413685847589425, "grad_norm": 0.2447889106514995, "learning_rate": 4.602622705769988e-06, "loss": 0.1105, "num_tokens": 116290881.0, "step": 1184 }, { "epoch": 1.842923794712286, "grad_norm": 0.24673899761095372, "learning_rate": 4.5910422500683135e-06, "loss": 0.1189, "num_tokens": 116389185.0, "step": 1185 }, { "epoch": 1.8444790046656299, "grad_norm": 0.25399767757677305, "learning_rate": 4.579572288540728e-06, "loss": 0.1264, "num_tokens": 116487489.0, "step": 1186 }, { "epoch": 1.8460342146189737, "grad_norm": 0.33116219150005394, "learning_rate": 4.568212893986752e-06, "loss": 0.1196, "num_tokens": 116585793.0, "step": 1187 }, { "epoch": 1.8475894245723172, "grad_norm": 0.2369346477579848, "learning_rate": 4.556964138504152e-06, "loss": 0.1152, "num_tokens": 116684097.0, "step": 1188 }, { "epoch": 1.8491446345256608, "grad_norm": 0.23616144698186053, "learning_rate": 4.545826093488473e-06, "loss": 0.1121, "num_tokens": 116782401.0, "step": 1189 }, { "epoch": 1.8506998444790046, "grad_norm": 0.23276591479938763, "learning_rate": 4.534798829632576e-06, "loss": 0.1112, "num_tokens": 116880705.0, "step": 1190 }, { "epoch": 1.8522550544323484, "grad_norm": 0.40627843409929476, "learning_rate": 4.523882416926199e-06, "loss": 0.1135, "num_tokens": 116979009.0, "step": 1191 }, { "epoch": 1.8538102643856922, "grad_norm": 0.24313095215882777, "learning_rate": 4.513076924655512e-06, "loss": 0.1138, "num_tokens": 117077313.0, "step": 1192 }, { "epoch": 1.8553654743390358, "grad_norm": 0.2504833965656308, "learning_rate": 4.50238242140268e-06, "loss": 0.1151, "num_tokens": 117170088.0, "step": 1193 }, { "epoch": 1.8569206842923793, "grad_norm": 0.2394460506524325, "learning_rate": 4.491798975045414e-06, "loss": 0.1109, "num_tokens": 117268392.0, "step": 1194 }, { "epoch": 1.8584758942457231, "grad_norm": 0.2366744721129791, "learning_rate": 4.48132665275656e-06, "loss": 0.1093, "num_tokens": 117366696.0, "step": 1195 }, { "epoch": 1.860031104199067, "grad_norm": 0.24378384519909443, "learning_rate": 4.4709655210036565e-06, "loss": 0.1188, "num_tokens": 117465000.0, "step": 1196 }, { "epoch": 1.8615863141524107, "grad_norm": 0.2413135107460472, "learning_rate": 4.46071564554852e-06, "loss": 0.1168, "num_tokens": 117563304.0, "step": 1197 }, { "epoch": 1.8631415241057543, "grad_norm": 0.24848700036351093, "learning_rate": 4.4505770914468304e-06, "loss": 0.1182, "num_tokens": 117661608.0, "step": 1198 }, { "epoch": 1.8646967340590979, "grad_norm": 0.23032879422468805, "learning_rate": 4.440549923047707e-06, "loss": 0.1054, "num_tokens": 117759912.0, "step": 1199 }, { "epoch": 1.8662519440124417, "grad_norm": 0.2330908948446889, "learning_rate": 4.430634203993314e-06, "loss": 0.1143, "num_tokens": 117858216.0, "step": 1200 }, { "epoch": 1.8678071539657854, "grad_norm": 0.2508979974143698, "learning_rate": 4.420829997218441e-06, "loss": 0.1133, "num_tokens": 117956520.0, "step": 1201 }, { "epoch": 1.8693623639191292, "grad_norm": 0.2683394856728626, "learning_rate": 4.411137364950122e-06, "loss": 0.1116, "num_tokens": 118054824.0, "step": 1202 }, { "epoch": 1.8709175738724728, "grad_norm": 0.2344317810409382, "learning_rate": 4.401556368707227e-06, "loss": 0.1138, "num_tokens": 118153128.0, "step": 1203 }, { "epoch": 1.8724727838258164, "grad_norm": 0.24268175787163057, "learning_rate": 4.39208706930007e-06, "loss": 0.1209, "num_tokens": 118251432.0, "step": 1204 }, { "epoch": 1.8740279937791602, "grad_norm": 0.21841862265228124, "learning_rate": 4.382729526830038e-06, "loss": 0.1092, "num_tokens": 118349736.0, "step": 1205 }, { "epoch": 1.875583203732504, "grad_norm": 0.23129030051493663, "learning_rate": 4.3734838006891945e-06, "loss": 0.1139, "num_tokens": 118448040.0, "step": 1206 }, { "epoch": 1.8771384136858476, "grad_norm": 0.2490145032942356, "learning_rate": 4.364349949559904e-06, "loss": 0.1159, "num_tokens": 118546344.0, "step": 1207 }, { "epoch": 1.8786936236391913, "grad_norm": 0.2372266632198088, "learning_rate": 4.355328031414468e-06, "loss": 0.1133, "num_tokens": 118644648.0, "step": 1208 }, { "epoch": 1.880248833592535, "grad_norm": 0.24577355577698717, "learning_rate": 4.346418103514753e-06, "loss": 0.1151, "num_tokens": 118742952.0, "step": 1209 }, { "epoch": 1.8818040435458787, "grad_norm": 0.2377445931089844, "learning_rate": 4.337620222411829e-06, "loss": 0.1132, "num_tokens": 118841256.0, "step": 1210 }, { "epoch": 1.8833592534992225, "grad_norm": 0.24103066992664912, "learning_rate": 4.328934443945599e-06, "loss": 0.1125, "num_tokens": 118939560.0, "step": 1211 }, { "epoch": 1.884914463452566, "grad_norm": 0.23656503087611813, "learning_rate": 4.320360823244458e-06, "loss": 0.1136, "num_tokens": 119037864.0, "step": 1212 }, { "epoch": 1.8864696734059097, "grad_norm": 0.23957142222733446, "learning_rate": 4.3118994147249475e-06, "loss": 0.1086, "num_tokens": 119136168.0, "step": 1213 }, { "epoch": 1.8880248833592534, "grad_norm": 0.2768296115064777, "learning_rate": 4.303550272091386e-06, "loss": 0.1117, "num_tokens": 119234472.0, "step": 1214 }, { "epoch": 1.8895800933125972, "grad_norm": 0.23956196732821528, "learning_rate": 4.2953134483355545e-06, "loss": 0.1097, "num_tokens": 119332776.0, "step": 1215 }, { "epoch": 1.891135303265941, "grad_norm": 0.23079200078868692, "learning_rate": 4.2871889957363454e-06, "loss": 0.108, "num_tokens": 119431080.0, "step": 1216 }, { "epoch": 1.8926905132192846, "grad_norm": 0.22677853561718056, "learning_rate": 4.279176965859438e-06, "loss": 0.1067, "num_tokens": 119529384.0, "step": 1217 }, { "epoch": 1.8942457231726282, "grad_norm": 0.2719799292116923, "learning_rate": 4.2712774095569595e-06, "loss": 0.1211, "num_tokens": 119627688.0, "step": 1218 }, { "epoch": 1.895800933125972, "grad_norm": 0.23538269950099025, "learning_rate": 4.2634903769671775e-06, "loss": 0.1113, "num_tokens": 119725992.0, "step": 1219 }, { "epoch": 1.8973561430793158, "grad_norm": 0.2784477540758658, "learning_rate": 4.255815917514174e-06, "loss": 0.1208, "num_tokens": 119824296.0, "step": 1220 }, { "epoch": 1.8989113530326596, "grad_norm": 0.23534043345850839, "learning_rate": 4.24825407990753e-06, "loss": 0.1085, "num_tokens": 119922600.0, "step": 1221 }, { "epoch": 1.9004665629860031, "grad_norm": 0.23793077281855352, "learning_rate": 4.240804912142022e-06, "loss": 0.1117, "num_tokens": 120020904.0, "step": 1222 }, { "epoch": 1.9020217729393467, "grad_norm": 0.24556316574507348, "learning_rate": 4.23346846149731e-06, "loss": 0.1145, "num_tokens": 120119208.0, "step": 1223 }, { "epoch": 1.9035769828926905, "grad_norm": 0.24136861437657953, "learning_rate": 4.226244774537645e-06, "loss": 0.1154, "num_tokens": 120217512.0, "step": 1224 }, { "epoch": 1.9051321928460343, "grad_norm": 0.23998783284118244, "learning_rate": 4.219133897111567e-06, "loss": 0.1093, "num_tokens": 120315816.0, "step": 1225 }, { "epoch": 1.9066874027993779, "grad_norm": 0.24361703415358707, "learning_rate": 4.212135874351618e-06, "loss": 0.1167, "num_tokens": 120414120.0, "step": 1226 }, { "epoch": 1.9082426127527217, "grad_norm": 0.25565577553716584, "learning_rate": 4.205250750674059e-06, "loss": 0.1076, "num_tokens": 120512424.0, "step": 1227 }, { "epoch": 1.9097978227060652, "grad_norm": 0.24048524340468672, "learning_rate": 4.198478569778577e-06, "loss": 0.105, "num_tokens": 120610728.0, "step": 1228 }, { "epoch": 1.911353032659409, "grad_norm": 0.23116163597071102, "learning_rate": 4.191819374648016e-06, "loss": 0.1125, "num_tokens": 120709032.0, "step": 1229 }, { "epoch": 1.9129082426127528, "grad_norm": 0.25437194786361916, "learning_rate": 4.1852732075481085e-06, "loss": 0.1107, "num_tokens": 120807336.0, "step": 1230 }, { "epoch": 1.9144634525660964, "grad_norm": 0.2441231452386611, "learning_rate": 4.17884011002719e-06, "loss": 0.1182, "num_tokens": 120905640.0, "step": 1231 }, { "epoch": 1.91601866251944, "grad_norm": 0.23070632989013276, "learning_rate": 4.172520122915959e-06, "loss": 0.1123, "num_tokens": 121003944.0, "step": 1232 }, { "epoch": 1.9175738724727838, "grad_norm": 0.24092054933977855, "learning_rate": 4.166313286327193e-06, "loss": 0.1191, "num_tokens": 121102248.0, "step": 1233 }, { "epoch": 1.9191290824261276, "grad_norm": 0.24489698001432195, "learning_rate": 4.160219639655519e-06, "loss": 0.1216, "num_tokens": 121200552.0, "step": 1234 }, { "epoch": 1.9206842923794714, "grad_norm": 0.244688740809069, "learning_rate": 4.154239221577138e-06, "loss": 0.1193, "num_tokens": 121298856.0, "step": 1235 }, { "epoch": 1.922239502332815, "grad_norm": 0.23070691819708222, "learning_rate": 4.148372070049601e-06, "loss": 0.1084, "num_tokens": 121397160.0, "step": 1236 }, { "epoch": 1.9237947122861585, "grad_norm": 0.23152147835176695, "learning_rate": 4.142618222311559e-06, "loss": 0.1097, "num_tokens": 121495464.0, "step": 1237 }, { "epoch": 1.9253499222395023, "grad_norm": 0.23504746955227834, "learning_rate": 4.13697771488252e-06, "loss": 0.1084, "num_tokens": 121593768.0, "step": 1238 }, { "epoch": 1.926905132192846, "grad_norm": 0.2328276733450882, "learning_rate": 4.1314505835626325e-06, "loss": 0.113, "num_tokens": 121692072.0, "step": 1239 }, { "epoch": 1.9284603421461899, "grad_norm": 0.25674406887787987, "learning_rate": 4.1260368634324454e-06, "loss": 0.1128, "num_tokens": 121790376.0, "step": 1240 }, { "epoch": 1.9300155520995335, "grad_norm": 0.24739278294115774, "learning_rate": 4.12073658885269e-06, "loss": 0.1167, "num_tokens": 121888680.0, "step": 1241 }, { "epoch": 1.931570762052877, "grad_norm": 0.2365548404560239, "learning_rate": 4.1155497934640634e-06, "loss": 0.1122, "num_tokens": 121986984.0, "step": 1242 }, { "epoch": 1.9331259720062208, "grad_norm": 0.2499534988915868, "learning_rate": 4.110476510187014e-06, "loss": 0.1208, "num_tokens": 122082464.0, "step": 1243 }, { "epoch": 1.9346811819595646, "grad_norm": 0.24768536155584056, "learning_rate": 4.105516771221528e-06, "loss": 0.1227, "num_tokens": 122180768.0, "step": 1244 }, { "epoch": 1.9362363919129082, "grad_norm": 0.26165293789524996, "learning_rate": 4.100670608046933e-06, "loss": 0.1088, "num_tokens": 122279072.0, "step": 1245 }, { "epoch": 1.937791601866252, "grad_norm": 0.2430745871769547, "learning_rate": 4.095938051421693e-06, "loss": 0.1194, "num_tokens": 122377376.0, "step": 1246 }, { "epoch": 1.9393468118195956, "grad_norm": 0.2291067848899989, "learning_rate": 4.091319131383215e-06, "loss": 0.1122, "num_tokens": 122475680.0, "step": 1247 }, { "epoch": 1.9409020217729394, "grad_norm": 0.24194234715963103, "learning_rate": 4.0868138772476545e-06, "loss": 0.117, "num_tokens": 122573984.0, "step": 1248 }, { "epoch": 1.9424572317262832, "grad_norm": 0.25264050914946873, "learning_rate": 4.082422317609737e-06, "loss": 0.1231, "num_tokens": 122672288.0, "step": 1249 }, { "epoch": 1.9440124416796267, "grad_norm": 0.27470362712649815, "learning_rate": 4.078144480342569e-06, "loss": 0.1137, "num_tokens": 122770592.0, "step": 1250 }, { "epoch": 1.9455676516329703, "grad_norm": 0.23763453973230397, "learning_rate": 4.073980392597468e-06, "loss": 0.1131, "num_tokens": 122868896.0, "step": 1251 }, { "epoch": 1.947122861586314, "grad_norm": 0.24786008510734983, "learning_rate": 4.069930080803783e-06, "loss": 0.1225, "num_tokens": 122967200.0, "step": 1252 }, { "epoch": 1.9486780715396579, "grad_norm": 0.2500438065587374, "learning_rate": 4.065993570668728e-06, "loss": 0.1172, "num_tokens": 123065504.0, "step": 1253 }, { "epoch": 1.9502332814930017, "grad_norm": 0.24951023921666932, "learning_rate": 4.062170887177228e-06, "loss": 0.1154, "num_tokens": 123163808.0, "step": 1254 }, { "epoch": 1.9517884914463453, "grad_norm": 0.28075312157935256, "learning_rate": 4.0584620545917465e-06, "loss": 0.1149, "num_tokens": 123262112.0, "step": 1255 }, { "epoch": 1.9533437013996888, "grad_norm": 0.24120377270061594, "learning_rate": 4.0548670964521445e-06, "loss": 0.1176, "num_tokens": 123360416.0, "step": 1256 }, { "epoch": 1.9548989113530326, "grad_norm": 0.2984896237835927, "learning_rate": 4.051386035575521e-06, "loss": 0.1134, "num_tokens": 123458720.0, "step": 1257 }, { "epoch": 1.9564541213063764, "grad_norm": 0.24505713990938602, "learning_rate": 4.048018894056071e-06, "loss": 0.1187, "num_tokens": 123557024.0, "step": 1258 }, { "epoch": 1.9580093312597202, "grad_norm": 0.23496403745550973, "learning_rate": 4.044765693264952e-06, "loss": 0.1186, "num_tokens": 123655328.0, "step": 1259 }, { "epoch": 1.9595645412130638, "grad_norm": 0.22563198116798694, "learning_rate": 4.041626453850136e-06, "loss": 0.112, "num_tokens": 123753632.0, "step": 1260 }, { "epoch": 1.9611197511664074, "grad_norm": 0.234911906973064, "learning_rate": 4.038601195736291e-06, "loss": 0.1158, "num_tokens": 123851936.0, "step": 1261 }, { "epoch": 1.9626749611197511, "grad_norm": 0.22380897466319002, "learning_rate": 4.035689938124649e-06, "loss": 0.1108, "num_tokens": 123950240.0, "step": 1262 }, { "epoch": 1.964230171073095, "grad_norm": 0.23830190916089342, "learning_rate": 4.03289269949288e-06, "loss": 0.1172, "num_tokens": 124048544.0, "step": 1263 }, { "epoch": 1.9657853810264385, "grad_norm": 0.24700822703944994, "learning_rate": 4.030209497594977e-06, "loss": 0.1152, "num_tokens": 124146848.0, "step": 1264 }, { "epoch": 1.9673405909797823, "grad_norm": 0.23141086058504215, "learning_rate": 4.027640349461152e-06, "loss": 0.1152, "num_tokens": 124245152.0, "step": 1265 }, { "epoch": 1.9688958009331259, "grad_norm": 0.2314391624406049, "learning_rate": 4.0251852713977155e-06, "loss": 0.1142, "num_tokens": 124343456.0, "step": 1266 }, { "epoch": 1.9704510108864697, "grad_norm": 0.36174066673637284, "learning_rate": 4.022844278986983e-06, "loss": 0.1167, "num_tokens": 124441760.0, "step": 1267 }, { "epoch": 1.9720062208398135, "grad_norm": 0.24271773385150686, "learning_rate": 4.020617387087165e-06, "loss": 0.1141, "num_tokens": 124540064.0, "step": 1268 }, { "epoch": 1.973561430793157, "grad_norm": 0.24267162853784024, "learning_rate": 4.018504609832286e-06, "loss": 0.1169, "num_tokens": 124638368.0, "step": 1269 }, { "epoch": 1.9751166407465006, "grad_norm": 0.2349292519164687, "learning_rate": 4.016505960632083e-06, "loss": 0.1081, "num_tokens": 124736672.0, "step": 1270 }, { "epoch": 1.9766718506998444, "grad_norm": 0.23559709818353639, "learning_rate": 4.014621452171927e-06, "loss": 0.1124, "num_tokens": 124834976.0, "step": 1271 }, { "epoch": 1.9782270606531882, "grad_norm": 0.2406066778931546, "learning_rate": 4.012851096412741e-06, "loss": 0.1152, "num_tokens": 124933280.0, "step": 1272 }, { "epoch": 1.979782270606532, "grad_norm": 0.25450938861495653, "learning_rate": 4.011194904590927e-06, "loss": 0.1145, "num_tokens": 125031584.0, "step": 1273 }, { "epoch": 1.9813374805598756, "grad_norm": 0.2415602671412981, "learning_rate": 4.009652887218286e-06, "loss": 0.1142, "num_tokens": 125129888.0, "step": 1274 }, { "epoch": 1.9828926905132191, "grad_norm": 0.23464801253396364, "learning_rate": 4.008225054081961e-06, "loss": 0.1137, "num_tokens": 125228192.0, "step": 1275 }, { "epoch": 1.984447900466563, "grad_norm": 0.23713232970634235, "learning_rate": 4.006911414244368e-06, "loss": 0.114, "num_tokens": 125326496.0, "step": 1276 }, { "epoch": 1.9860031104199067, "grad_norm": 0.23750854006569883, "learning_rate": 4.005711976043143e-06, "loss": 0.1113, "num_tokens": 125424800.0, "step": 1277 }, { "epoch": 1.9875583203732505, "grad_norm": 0.23882383171815755, "learning_rate": 4.0046267470910885e-06, "loss": 0.1157, "num_tokens": 125523104.0, "step": 1278 }, { "epoch": 1.989113530326594, "grad_norm": 0.24479990201033644, "learning_rate": 4.003655734276124e-06, "loss": 0.1148, "num_tokens": 125621408.0, "step": 1279 }, { "epoch": 1.9906687402799377, "grad_norm": 0.23669837415286826, "learning_rate": 4.002798943761236e-06, "loss": 0.1114, "num_tokens": 125719712.0, "step": 1280 }, { "epoch": 1.9922239502332815, "grad_norm": 0.22403265195411615, "learning_rate": 4.002056380984457e-06, "loss": 0.1091, "num_tokens": 125818016.0, "step": 1281 }, { "epoch": 1.9937791601866253, "grad_norm": 0.23043826806764098, "learning_rate": 4.0014280506588055e-06, "loss": 0.1099, "num_tokens": 125916320.0, "step": 1282 }, { "epoch": 1.995334370139969, "grad_norm": 0.3021439531697977, "learning_rate": 4.00091395677228e-06, "loss": 0.1145, "num_tokens": 126014624.0, "step": 1283 }, { "epoch": 1.9968895800933126, "grad_norm": 0.24254290976476686, "learning_rate": 4.000514102587821e-06, "loss": 0.115, "num_tokens": 126112928.0, "step": 1284 }, { "epoch": 1.9984447900466562, "grad_norm": 0.2908588057997786, "learning_rate": 4.000228490643293e-06, "loss": 0.1168, "num_tokens": 126211232.0, "step": 1285 }, { "epoch": 2.0, "grad_norm": 0.256738077508375, "learning_rate": 4.000057122751463e-06, "loss": 0.1148, "num_tokens": 126296872.0, "step": 1286 }, { "epoch": 2.0, "step": 1286, "total_flos": 2265829537480704.0, "train_loss": 0.166707560617019, "train_runtime": 22070.6322, "train_samples_per_second": 1.864, "train_steps_per_second": 0.058 } ], "logging_steps": 1, "max_steps": 1286, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2265829537480704.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }