{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.929260450160772, "eval_steps": 20, "global_step": 231, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.19299747049808502, "learning_rate": 2e-05, "loss": 1.3437, "step": 1 }, { "epoch": 0.01, "eval_loss": 1.3747950792312622, "eval_runtime": 11.0278, "eval_samples_per_second": 235.858, "eval_steps_per_second": 59.032, "step": 1 }, { "epoch": 0.03, "grad_norm": 0.19121848046779633, "learning_rate": 4e-05, "loss": 1.3845, "step": 2 }, { "epoch": 0.04, "grad_norm": 0.19551381468772888, "learning_rate": 6e-05, "loss": 1.3774, "step": 3 }, { "epoch": 0.05, "grad_norm": 0.19498884677886963, "learning_rate": 8e-05, "loss": 1.3771, "step": 4 }, { "epoch": 0.06, "grad_norm": 0.24242694675922394, "learning_rate": 0.0001, "loss": 1.3558, "step": 5 }, { "epoch": 0.08, "grad_norm": 0.1539306640625, "learning_rate": 0.00012, "loss": 1.2809, "step": 6 }, { "epoch": 0.09, "grad_norm": 0.22252629697322845, "learning_rate": 0.00014, "loss": 1.2982, "step": 7 }, { "epoch": 0.1, "grad_norm": 0.18322250247001648, "learning_rate": 0.00016, "loss": 1.2663, "step": 8 }, { "epoch": 0.12, "grad_norm": 0.144040048122406, "learning_rate": 0.00018, "loss": 1.245, "step": 9 }, { "epoch": 0.13, "grad_norm": 0.16758786141872406, "learning_rate": 0.0002, "loss": 1.2265, "step": 10 }, { "epoch": 0.14, "grad_norm": 0.12311970442533493, "learning_rate": 0.00019999444309209432, "loss": 1.2582, "step": 11 }, { "epoch": 0.15, "grad_norm": 0.1770094484090805, "learning_rate": 0.0001999777729859618, "loss": 1.2295, "step": 12 }, { "epoch": 0.17, "grad_norm": 0.1003749668598175, "learning_rate": 0.00019994999153428737, "loss": 1.202, "step": 13 }, { "epoch": 0.18, "grad_norm": 0.10201102495193481, "learning_rate": 0.00019991110182465032, "loss": 1.2693, "step": 14 }, { "epoch": 0.19, "grad_norm": 0.10821792483329773, "learning_rate": 0.0001998611081791814, "loss": 1.2339, "step": 15 }, { "epoch": 0.21, "grad_norm": 0.0722450315952301, "learning_rate": 0.00019980001615408228, "loss": 1.2122, "step": 16 }, { "epoch": 0.22, "grad_norm": 0.0798899456858635, "learning_rate": 0.00019972783253900808, "loss": 1.2107, "step": 17 }, { "epoch": 0.23, "grad_norm": 0.06952076405286789, "learning_rate": 0.00019964456535631286, "loss": 1.2585, "step": 18 }, { "epoch": 0.24, "grad_norm": 0.060726992785930634, "learning_rate": 0.00019955022386015792, "loss": 1.1975, "step": 19 }, { "epoch": 0.26, "grad_norm": 0.06560595333576202, "learning_rate": 0.00019944481853548335, "loss": 1.2174, "step": 20 }, { "epoch": 0.26, "eval_loss": 1.2183723449707031, "eval_runtime": 11.1201, "eval_samples_per_second": 233.9, "eval_steps_per_second": 58.543, "step": 20 }, { "epoch": 0.27, "grad_norm": 0.06686750054359436, "learning_rate": 0.00019932836109684286, "loss": 1.2287, "step": 21 }, { "epoch": 0.28, "grad_norm": 0.05899977311491966, "learning_rate": 0.0001992008644871016, "loss": 1.177, "step": 22 }, { "epoch": 0.3, "grad_norm": 0.06782848387956619, "learning_rate": 0.00019906234287599798, "loss": 1.248, "step": 23 }, { "epoch": 0.31, "grad_norm": 0.07001505047082901, "learning_rate": 0.00019891281165856873, "loss": 1.2467, "step": 24 }, { "epoch": 0.32, "grad_norm": 0.05859360471367836, "learning_rate": 0.00019875228745343794, "loss": 1.2126, "step": 25 }, { "epoch": 0.33, "grad_norm": 0.06061583012342453, "learning_rate": 0.00019858078810097002, "loss": 1.2395, "step": 26 }, { "epoch": 0.35, "grad_norm": 0.058314863592386246, "learning_rate": 0.00019839833266128724, "loss": 1.1868, "step": 27 }, { "epoch": 0.36, "grad_norm": 0.05482524260878563, "learning_rate": 0.00019820494141215104, "loss": 1.1378, "step": 28 }, { "epoch": 0.37, "grad_norm": 0.056380610913038254, "learning_rate": 0.00019800063584670863, "loss": 1.2303, "step": 29 }, { "epoch": 0.39, "grad_norm": 0.05643482878804207, "learning_rate": 0.00019778543867110426, "loss": 1.2546, "step": 30 }, { "epoch": 0.4, "grad_norm": 0.05201110243797302, "learning_rate": 0.00019755937380195568, "loss": 1.2279, "step": 31 }, { "epoch": 0.41, "grad_norm": 0.0630900114774704, "learning_rate": 0.00019732246636369605, "loss": 1.1497, "step": 32 }, { "epoch": 0.42, "grad_norm": 0.0517457090318203, "learning_rate": 0.0001970747426857817, "loss": 1.2043, "step": 33 }, { "epoch": 0.44, "grad_norm": 0.05967947468161583, "learning_rate": 0.00019681623029976588, "loss": 1.1867, "step": 34 }, { "epoch": 0.45, "grad_norm": 0.05957586690783501, "learning_rate": 0.00019654695793623907, "loss": 1.2171, "step": 35 }, { "epoch": 0.46, "grad_norm": 0.05819874256849289, "learning_rate": 0.00019626695552163578, "loss": 1.2242, "step": 36 }, { "epoch": 0.48, "grad_norm": 0.05496300384402275, "learning_rate": 0.0001959762541749086, "loss": 1.2311, "step": 37 }, { "epoch": 0.49, "grad_norm": 0.05849285423755646, "learning_rate": 0.00019567488620406983, "loss": 1.2037, "step": 38 }, { "epoch": 0.5, "grad_norm": 0.05827626585960388, "learning_rate": 0.00019536288510260056, "loss": 1.2251, "step": 39 }, { "epoch": 0.51, "grad_norm": 0.05698635056614876, "learning_rate": 0.00019504028554572864, "loss": 1.2468, "step": 40 }, { "epoch": 0.51, "eval_loss": 1.2000694274902344, "eval_runtime": 11.1133, "eval_samples_per_second": 234.043, "eval_steps_per_second": 58.578, "step": 40 }, { "epoch": 0.53, "grad_norm": 0.058647479861974716, "learning_rate": 0.00019470712338657458, "loss": 1.1732, "step": 41 }, { "epoch": 0.54, "grad_norm": 0.05649631842970848, "learning_rate": 0.00019436343565216711, "loss": 1.1747, "step": 42 }, { "epoch": 0.55, "grad_norm": 0.05524146929383278, "learning_rate": 0.000194009260539328, "loss": 1.2022, "step": 43 }, { "epoch": 0.57, "grad_norm": 0.05937836691737175, "learning_rate": 0.00019364463741042694, "loss": 1.1534, "step": 44 }, { "epoch": 0.58, "grad_norm": 0.061414506286382675, "learning_rate": 0.00019326960678900688, "loss": 1.215, "step": 45 }, { "epoch": 0.59, "grad_norm": 0.05350738391280174, "learning_rate": 0.00019288421035528028, "loss": 1.2095, "step": 46 }, { "epoch": 0.6, "grad_norm": 0.05849866196513176, "learning_rate": 0.000192488490941497, "loss": 1.1703, "step": 47 }, { "epoch": 0.62, "grad_norm": 0.05610804632306099, "learning_rate": 0.0001920824925271838, "loss": 1.1423, "step": 48 }, { "epoch": 0.63, "grad_norm": 0.053889110684394836, "learning_rate": 0.00019166626023425662, "loss": 1.21, "step": 49 }, { "epoch": 0.64, "grad_norm": 0.057582154870033264, "learning_rate": 0.00019123984032200586, "loss": 1.2279, "step": 50 }, { "epoch": 0.66, "grad_norm": 0.05618985369801521, "learning_rate": 0.00019080328018195513, "loss": 1.1671, "step": 51 }, { "epoch": 0.67, "grad_norm": 0.055953964591026306, "learning_rate": 0.00019035662833259432, "loss": 1.2564, "step": 52 }, { "epoch": 0.68, "grad_norm": 0.05924614518880844, "learning_rate": 0.00018989993441398726, "loss": 1.1691, "step": 53 }, { "epoch": 0.69, "grad_norm": 0.06068262457847595, "learning_rate": 0.00018943324918225494, "loss": 1.1482, "step": 54 }, { "epoch": 0.71, "grad_norm": 0.05965428054332733, "learning_rate": 0.00018895662450393438, "loss": 1.2091, "step": 55 }, { "epoch": 0.72, "grad_norm": 0.059606753289699554, "learning_rate": 0.00018847011335021449, "loss": 1.2394, "step": 56 }, { "epoch": 0.73, "grad_norm": 0.059509821236133575, "learning_rate": 0.00018797376979104872, "loss": 1.1737, "step": 57 }, { "epoch": 0.75, "grad_norm": 0.06392043828964233, "learning_rate": 0.0001874676489891461, "loss": 1.1871, "step": 58 }, { "epoch": 0.76, "grad_norm": 0.06026330962777138, "learning_rate": 0.00018695180719384029, "loss": 1.1912, "step": 59 }, { "epoch": 0.77, "grad_norm": 0.06576694548130035, "learning_rate": 0.00018642630173483832, "loss": 1.183, "step": 60 }, { "epoch": 0.77, "eval_loss": 1.1930629014968872, "eval_runtime": 11.089, "eval_samples_per_second": 234.556, "eval_steps_per_second": 58.707, "step": 60 }, { "epoch": 0.78, "grad_norm": 0.05714555084705353, "learning_rate": 0.00018589119101584898, "loss": 1.208, "step": 61 }, { "epoch": 0.8, "grad_norm": 0.059180356562137604, "learning_rate": 0.00018534653450809197, "loss": 1.2427, "step": 62 }, { "epoch": 0.81, "grad_norm": 0.05717024207115173, "learning_rate": 0.0001847923927436884, "loss": 1.1717, "step": 63 }, { "epoch": 0.82, "grad_norm": 0.06259780377149582, "learning_rate": 0.0001842288273089332, "loss": 1.2008, "step": 64 }, { "epoch": 0.84, "grad_norm": 0.05737931653857231, "learning_rate": 0.00018365590083745085, "loss": 1.1685, "step": 65 }, { "epoch": 0.85, "grad_norm": 0.06073923781514168, "learning_rate": 0.0001830736770032341, "loss": 1.1572, "step": 66 }, { "epoch": 0.86, "grad_norm": 0.059637729078531265, "learning_rate": 0.00018248222051356754, "loss": 1.1963, "step": 67 }, { "epoch": 0.87, "grad_norm": 0.06196809560060501, "learning_rate": 0.00018188159710183594, "loss": 1.1727, "step": 68 }, { "epoch": 0.89, "grad_norm": 0.07422679662704468, "learning_rate": 0.00018127187352021907, "loss": 1.1976, "step": 69 }, { "epoch": 0.9, "grad_norm": 0.05969575420022011, "learning_rate": 0.00018065311753227273, "loss": 1.1523, "step": 70 }, { "epoch": 0.91, "grad_norm": 0.06363425403833389, "learning_rate": 0.00018002539790539773, "loss": 1.1861, "step": 71 }, { "epoch": 0.93, "grad_norm": 0.059063684195280075, "learning_rate": 0.0001793887844031972, "loss": 1.2235, "step": 72 }, { "epoch": 0.94, "grad_norm": 0.0626487135887146, "learning_rate": 0.00017874334777772327, "loss": 1.19, "step": 73 }, { "epoch": 0.95, "grad_norm": 0.05672723054885864, "learning_rate": 0.00017808915976161362, "loss": 1.1781, "step": 74 }, { "epoch": 0.96, "grad_norm": 0.061358433216810226, "learning_rate": 0.00017742629306011944, "loss": 1.2017, "step": 75 }, { "epoch": 0.98, "grad_norm": 0.05795949324965477, "learning_rate": 0.000176754821343025, "loss": 1.1802, "step": 76 }, { "epoch": 0.99, "grad_norm": 0.06237269937992096, "learning_rate": 0.00017607481923646016, "loss": 1.1777, "step": 77 }, { "epoch": 1.0, "grad_norm": 0.05692945048213005, "learning_rate": 0.0001753863623146066, "loss": 1.1938, "step": 78 }, { "epoch": 1.02, "grad_norm": 0.059807874262332916, "learning_rate": 0.00017468952709129846, "loss": 1.1685, "step": 79 }, { "epoch": 1.01, "grad_norm": 0.07109411805868149, "learning_rate": 0.00017398439101151905, "loss": 1.1977, "step": 80 }, { "epoch": 1.01, "eval_loss": 1.1896089315414429, "eval_runtime": 11.089, "eval_samples_per_second": 234.558, "eval_steps_per_second": 58.707, "step": 80 }, { "epoch": 1.02, "grad_norm": 0.05896366387605667, "learning_rate": 0.00017327103244279348, "loss": 1.1763, "step": 81 }, { "epoch": 1.03, "grad_norm": 0.08269200474023819, "learning_rate": 0.00017254953066647913, "loss": 1.1641, "step": 82 }, { "epoch": 1.05, "grad_norm": 0.06595351547002792, "learning_rate": 0.00017181996586895454, "loss": 1.1634, "step": 83 }, { "epoch": 1.06, "grad_norm": 0.06604241579771042, "learning_rate": 0.0001710824191327075, "loss": 1.1239, "step": 84 }, { "epoch": 1.07, "grad_norm": 0.07021407783031464, "learning_rate": 0.00017033697242732377, "loss": 1.1669, "step": 85 }, { "epoch": 1.08, "grad_norm": 0.06747168302536011, "learning_rate": 0.00016958370860037717, "loss": 1.1433, "step": 86 }, { "epoch": 1.1, "grad_norm": 0.06354415416717529, "learning_rate": 0.00016882271136822206, "loss": 1.1654, "step": 87 }, { "epoch": 1.11, "grad_norm": 0.06672206521034241, "learning_rate": 0.0001680540653066891, "loss": 1.1496, "step": 88 }, { "epoch": 1.12, "grad_norm": 0.07277010381221771, "learning_rate": 0.00016727785584168581, "loss": 1.1536, "step": 89 }, { "epoch": 1.14, "grad_norm": 0.06562938541173935, "learning_rate": 0.0001664941692397025, "loss": 1.1426, "step": 90 }, { "epoch": 1.15, "grad_norm": 0.07519260793924332, "learning_rate": 0.00016570309259822453, "loss": 1.1224, "step": 91 }, { "epoch": 1.16, "grad_norm": 0.0678098201751709, "learning_rate": 0.00016490471383605288, "loss": 1.1504, "step": 92 }, { "epoch": 1.17, "grad_norm": 0.0652153342962265, "learning_rate": 0.0001640991216835326, "loss": 1.1371, "step": 93 }, { "epoch": 1.19, "grad_norm": 0.07604055106639862, "learning_rate": 0.0001632864056726917, "loss": 1.1339, "step": 94 }, { "epoch": 1.2, "grad_norm": 0.07262595742940903, "learning_rate": 0.00016246665612729074, "loss": 1.1725, "step": 95 }, { "epoch": 1.21, "grad_norm": 0.06982827931642532, "learning_rate": 0.00016163996415278424, "loss": 1.1351, "step": 96 }, { "epoch": 1.23, "grad_norm": 0.07324743270874023, "learning_rate": 0.00016080642162619565, "loss": 1.1496, "step": 97 }, { "epoch": 1.24, "grad_norm": 0.08407079428434372, "learning_rate": 0.00015996612118590603, "loss": 1.1549, "step": 98 }, { "epoch": 1.25, "grad_norm": 0.07180128991603851, "learning_rate": 0.00015911915622135862, "loss": 1.1311, "step": 99 }, { "epoch": 1.26, "grad_norm": 0.08804436773061752, "learning_rate": 0.00015826562086267956, "loss": 1.1471, "step": 100 }, { "epoch": 1.26, "eval_loss": 1.1906447410583496, "eval_runtime": 11.084, "eval_samples_per_second": 234.662, "eval_steps_per_second": 58.733, "step": 100 }, { "epoch": 1.28, "grad_norm": 0.08186957985162735, "learning_rate": 0.00015740560997021648, "loss": 1.1858, "step": 101 }, { "epoch": 1.29, "grad_norm": 0.07139725238084793, "learning_rate": 0.00015653921912399589, "loss": 1.1356, "step": 102 }, { "epoch": 1.3, "grad_norm": 0.07663066685199738, "learning_rate": 0.0001556665446131007, "loss": 1.1787, "step": 103 }, { "epoch": 1.32, "grad_norm": 0.07882555574178696, "learning_rate": 0.0001547876834249687, "loss": 1.1738, "step": 104 }, { "epoch": 1.33, "grad_norm": 0.07430710643529892, "learning_rate": 0.00015390273323461352, "loss": 1.1517, "step": 105 }, { "epoch": 1.34, "grad_norm": 0.07954781502485275, "learning_rate": 0.00015301179239376938, "loss": 1.1281, "step": 106 }, { "epoch": 1.35, "grad_norm": 0.07749734818935394, "learning_rate": 0.00015211495991996027, "loss": 1.1459, "step": 107 }, { "epoch": 1.37, "grad_norm": 0.08005411177873611, "learning_rate": 0.0001512123354854955, "loss": 1.1766, "step": 108 }, { "epoch": 1.38, "grad_norm": 0.07647199928760529, "learning_rate": 0.0001503040194063922, "loss": 1.1737, "step": 109 }, { "epoch": 1.39, "grad_norm": 0.0819963812828064, "learning_rate": 0.00014939011263122634, "loss": 1.1781, "step": 110 }, { "epoch": 1.41, "grad_norm": 0.08222784847021103, "learning_rate": 0.00014847071672991367, "loss": 1.1882, "step": 111 }, { "epoch": 1.42, "grad_norm": 0.07998190820217133, "learning_rate": 0.00014754593388242117, "loss": 1.1609, "step": 112 }, { "epoch": 1.43, "grad_norm": 0.08468078076839447, "learning_rate": 0.0001466158668674112, "loss": 1.1513, "step": 113 }, { "epoch": 1.44, "grad_norm": 0.09136669337749481, "learning_rate": 0.00014568061905081875, "loss": 1.1576, "step": 114 }, { "epoch": 1.46, "grad_norm": 0.07721763849258423, "learning_rate": 0.00014474029437436348, "loss": 1.128, "step": 115 }, { "epoch": 1.47, "grad_norm": 0.08751308917999268, "learning_rate": 0.00014379499734399798, "loss": 1.1504, "step": 116 }, { "epoch": 1.48, "grad_norm": 0.08155680447816849, "learning_rate": 0.0001428448330182931, "loss": 1.1022, "step": 117 }, { "epoch": 1.5, "grad_norm": 0.078422911465168, "learning_rate": 0.00014188990699676184, "loss": 1.1638, "step": 118 }, { "epoch": 1.51, "grad_norm": 0.08801288157701492, "learning_rate": 0.00014093032540812348, "loss": 1.1993, "step": 119 }, { "epoch": 1.52, "grad_norm": 0.08341657370328903, "learning_rate": 0.00013996619489850822, "loss": 1.1711, "step": 120 }, { "epoch": 1.52, "eval_loss": 1.1901248693466187, "eval_runtime": 11.1106, "eval_samples_per_second": 234.101, "eval_steps_per_second": 58.593, "step": 120 }, { "epoch": 1.53, "grad_norm": 0.0804174616932869, "learning_rate": 0.00013899762261960518, "loss": 1.1479, "step": 121 }, { "epoch": 1.55, "grad_norm": 0.07937135547399521, "learning_rate": 0.00013802471621675338, "loss": 1.1628, "step": 122 }, { "epoch": 1.56, "grad_norm": 0.08624235540628433, "learning_rate": 0.00013704758381697844, "loss": 1.1533, "step": 123 }, { "epoch": 1.57, "grad_norm": 0.0815419852733612, "learning_rate": 0.00013606633401697557, "loss": 1.1444, "step": 124 }, { "epoch": 1.59, "grad_norm": 0.07709202915430069, "learning_rate": 0.0001350810758710401, "loss": 1.1644, "step": 125 }, { "epoch": 1.6, "grad_norm": 0.08101613074541092, "learning_rate": 0.0001340919188789477, "loss": 1.1425, "step": 126 }, { "epoch": 1.61, "grad_norm": 0.08749125152826309, "learning_rate": 0.00013309897297378455, "loss": 1.1884, "step": 127 }, { "epoch": 1.62, "grad_norm": 0.081383615732193, "learning_rate": 0.00013210234850972964, "loss": 1.1658, "step": 128 }, { "epoch": 1.64, "grad_norm": 0.08141542971134186, "learning_rate": 0.00013110215624979025, "loss": 1.1531, "step": 129 }, { "epoch": 1.65, "grad_norm": 0.08194302022457123, "learning_rate": 0.0001300985073534919, "loss": 1.1593, "step": 130 }, { "epoch": 1.66, "grad_norm": 0.07621721178293228, "learning_rate": 0.0001290915133645243, "loss": 1.1529, "step": 131 }, { "epoch": 1.68, "grad_norm": 0.07566998898983002, "learning_rate": 0.00012808128619834461, "loss": 1.1842, "step": 132 }, { "epoch": 1.69, "grad_norm": 0.07865401357412338, "learning_rate": 0.00012706793812973941, "loss": 1.1718, "step": 133 }, { "epoch": 1.7, "grad_norm": 0.08009400218725204, "learning_rate": 0.00012605158178034654, "loss": 1.1261, "step": 134 }, { "epoch": 1.71, "grad_norm": 0.08229468762874603, "learning_rate": 0.00012503233010613865, "loss": 1.1576, "step": 135 }, { "epoch": 1.73, "grad_norm": 0.08450435101985931, "learning_rate": 0.00012401029638486953, "loss": 1.2012, "step": 136 }, { "epoch": 1.74, "grad_norm": 0.07879026234149933, "learning_rate": 0.00012298559420348437, "loss": 1.1856, "step": 137 }, { "epoch": 1.75, "grad_norm": 0.07778112590312958, "learning_rate": 0.0001219583374454963, "loss": 1.1626, "step": 138 }, { "epoch": 1.77, "grad_norm": 0.07738618552684784, "learning_rate": 0.00012092864027832933, "loss": 1.1496, "step": 139 }, { "epoch": 1.78, "grad_norm": 0.07988622784614563, "learning_rate": 0.00011989661714062999, "loss": 1.1749, "step": 140 }, { "epoch": 1.78, "eval_loss": 1.1883182525634766, "eval_runtime": 11.1351, "eval_samples_per_second": 233.586, "eval_steps_per_second": 58.464, "step": 140 }, { "epoch": 1.79, "grad_norm": 0.08023218810558319, "learning_rate": 0.00011886238272954897, "loss": 1.1623, "step": 141 }, { "epoch": 1.8, "grad_norm": 0.07769781351089478, "learning_rate": 0.0001178260519879937, "loss": 1.1498, "step": 142 }, { "epoch": 1.82, "grad_norm": 0.07680430263280869, "learning_rate": 0.0001167877400918541, "loss": 1.1341, "step": 143 }, { "epoch": 1.83, "grad_norm": 0.08009691536426544, "learning_rate": 0.0001157475624372018, "loss": 1.1662, "step": 144 }, { "epoch": 1.84, "grad_norm": 0.0748942419886589, "learning_rate": 0.00011470563462746541, "loss": 1.1499, "step": 145 }, { "epoch": 1.86, "grad_norm": 0.07798086851835251, "learning_rate": 0.0001136620724605827, "loss": 1.1493, "step": 146 }, { "epoch": 1.87, "grad_norm": 0.08360154926776886, "learning_rate": 0.00011261699191613066, "loss": 1.1474, "step": 147 }, { "epoch": 1.88, "grad_norm": 0.08265041559934616, "learning_rate": 0.00011157050914243614, "loss": 1.1274, "step": 148 }, { "epoch": 1.89, "grad_norm": 0.07973820716142654, "learning_rate": 0.00011052274044366711, "loss": 1.147, "step": 149 }, { "epoch": 1.91, "grad_norm": 0.07632620632648468, "learning_rate": 0.00010947380226690684, "loss": 1.1228, "step": 150 }, { "epoch": 1.92, "grad_norm": 0.0813874751329422, "learning_rate": 0.00010842381118921232, "loss": 1.1464, "step": 151 }, { "epoch": 1.93, "grad_norm": 0.08030478656291962, "learning_rate": 0.00010737288390465792, "loss": 1.1154, "step": 152 }, { "epoch": 1.95, "grad_norm": 0.07874114066362381, "learning_rate": 0.00010632113721136636, "loss": 1.1608, "step": 153 }, { "epoch": 1.96, "grad_norm": 0.0795789510011673, "learning_rate": 0.00010526868799852796, "loss": 1.0918, "step": 154 }, { "epoch": 1.97, "grad_norm": 0.07888780534267426, "learning_rate": 0.00010421565323340971, "loss": 1.1535, "step": 155 }, { "epoch": 1.98, "grad_norm": 0.08071374148130417, "learning_rate": 0.0001031621499483559, "loss": 1.1306, "step": 156 }, { "epoch": 2.0, "grad_norm": 0.0843919888138771, "learning_rate": 0.00010210829522778111, "loss": 1.1597, "step": 157 }, { "epoch": 2.01, "grad_norm": 0.07720304280519485, "learning_rate": 0.00010105420619515798, "loss": 1.1234, "step": 158 }, { "epoch": 2.0, "grad_norm": 0.07769129425287247, "learning_rate": 0.0001, "loss": 1.1594, "step": 159 }, { "epoch": 2.02, "grad_norm": 0.08473801612854004, "learning_rate": 9.894579380484204e-05, "loss": 1.1057, "step": 160 }, { "epoch": 2.02, "eval_loss": 1.1866793632507324, "eval_runtime": 11.0867, "eval_samples_per_second": 234.605, "eval_steps_per_second": 58.719, "step": 160 }, { "epoch": 2.03, "grad_norm": 0.08069656789302826, "learning_rate": 9.789170477221891e-05, "loss": 1.1069, "step": 161 }, { "epoch": 2.04, "grad_norm": 0.08355725556612015, "learning_rate": 9.683785005164411e-05, "loss": 1.076, "step": 162 }, { "epoch": 2.05, "grad_norm": 0.09702204912900925, "learning_rate": 9.57843467665903e-05, "loss": 1.105, "step": 163 }, { "epoch": 2.07, "grad_norm": 0.08148284256458282, "learning_rate": 9.473131200147205e-05, "loss": 1.0932, "step": 164 }, { "epoch": 2.08, "grad_norm": 0.08034205436706543, "learning_rate": 9.367886278863366e-05, "loss": 1.096, "step": 165 }, { "epoch": 2.09, "grad_norm": 0.08467669785022736, "learning_rate": 9.26271160953421e-05, "loss": 1.1349, "step": 166 }, { "epoch": 2.11, "grad_norm": 0.08533730357885361, "learning_rate": 9.157618881078772e-05, "loss": 1.1344, "step": 167 }, { "epoch": 2.12, "grad_norm": 0.09051905572414398, "learning_rate": 9.052619773309317e-05, "loss": 1.0794, "step": 168 }, { "epoch": 2.13, "grad_norm": 0.08847147971391678, "learning_rate": 8.947725955633294e-05, "loss": 1.1084, "step": 169 }, { "epoch": 2.14, "grad_norm": 0.08844807744026184, "learning_rate": 8.84294908575639e-05, "loss": 1.1123, "step": 170 }, { "epoch": 2.16, "grad_norm": 0.09571530669927597, "learning_rate": 8.738300808386935e-05, "loss": 1.0753, "step": 171 }, { "epoch": 2.17, "grad_norm": 0.09825875610113144, "learning_rate": 8.633792753941733e-05, "loss": 1.0986, "step": 172 }, { "epoch": 2.18, "grad_norm": 0.09977909922599792, "learning_rate": 8.529436537253458e-05, "loss": 1.1116, "step": 173 }, { "epoch": 2.2, "grad_norm": 0.090195432305336, "learning_rate": 8.425243756279824e-05, "loss": 1.0809, "step": 174 }, { "epoch": 2.21, "grad_norm": 0.09112106263637543, "learning_rate": 8.321225990814591e-05, "loss": 1.1074, "step": 175 }, { "epoch": 2.22, "grad_norm": 0.09786658734083176, "learning_rate": 8.217394801200631e-05, "loss": 1.1107, "step": 176 }, { "epoch": 2.23, "grad_norm": 0.0941900834441185, "learning_rate": 8.113761727045105e-05, "loss": 1.0686, "step": 177 }, { "epoch": 2.25, "grad_norm": 0.09430352598428726, "learning_rate": 8.010338285937006e-05, "loss": 1.0676, "step": 178 }, { "epoch": 2.26, "grad_norm": 0.09434497356414795, "learning_rate": 7.907135972167069e-05, "loss": 1.0885, "step": 179 }, { "epoch": 2.27, "grad_norm": 0.09693111479282379, "learning_rate": 7.804166255450373e-05, "loss": 1.103, "step": 180 }, { "epoch": 2.27, "eval_loss": 1.194423794746399, "eval_runtime": 11.0499, "eval_samples_per_second": 235.387, "eval_steps_per_second": 58.915, "step": 180 }, { "epoch": 2.29, "grad_norm": 0.09600096195936203, "learning_rate": 7.701440579651566e-05, "loss": 1.048, "step": 181 }, { "epoch": 2.3, "grad_norm": 0.09983541816473007, "learning_rate": 7.598970361513051e-05, "loss": 1.1519, "step": 182 }, { "epoch": 2.31, "grad_norm": 0.09226010739803314, "learning_rate": 7.496766989386136e-05, "loss": 1.0913, "step": 183 }, { "epoch": 2.32, "grad_norm": 0.09644059091806412, "learning_rate": 7.394841821965345e-05, "loss": 1.1105, "step": 184 }, { "epoch": 2.34, "grad_norm": 0.09558931738138199, "learning_rate": 7.293206187026061e-05, "loss": 1.1251, "step": 185 }, { "epoch": 2.35, "grad_norm": 0.09943023324012756, "learning_rate": 7.191871380165538e-05, "loss": 1.0779, "step": 186 }, { "epoch": 2.36, "grad_norm": 0.09804826229810715, "learning_rate": 7.090848663547574e-05, "loss": 1.1258, "step": 187 }, { "epoch": 2.38, "grad_norm": 0.10091326385736465, "learning_rate": 6.990149264650814e-05, "loss": 1.0677, "step": 188 }, { "epoch": 2.39, "grad_norm": 0.09859558939933777, "learning_rate": 6.889784375020978e-05, "loss": 1.1112, "step": 189 }, { "epoch": 2.4, "grad_norm": 0.1040605828166008, "learning_rate": 6.789765149027039e-05, "loss": 1.1375, "step": 190 }, { "epoch": 2.41, "grad_norm": 0.11397302150726318, "learning_rate": 6.690102702621548e-05, "loss": 1.1079, "step": 191 }, { "epoch": 2.43, "grad_norm": 0.11060205101966858, "learning_rate": 6.590808112105232e-05, "loss": 1.128, "step": 192 }, { "epoch": 2.44, "grad_norm": 0.10098471492528915, "learning_rate": 6.491892412895995e-05, "loss": 1.1437, "step": 193 }, { "epoch": 2.45, "grad_norm": 0.1036130040884018, "learning_rate": 6.393366598302446e-05, "loss": 1.1106, "step": 194 }, { "epoch": 2.47, "grad_norm": 0.10566971451044083, "learning_rate": 6.295241618302156e-05, "loss": 1.1209, "step": 195 }, { "epoch": 2.48, "grad_norm": 0.1123911514878273, "learning_rate": 6.197528378324665e-05, "loss": 1.133, "step": 196 }, { "epoch": 2.49, "grad_norm": 0.10308899730443954, "learning_rate": 6.100237738039484e-05, "loss": 1.0973, "step": 197 }, { "epoch": 2.5, "grad_norm": 0.10143700242042542, "learning_rate": 6.0033805101491794e-05, "loss": 1.118, "step": 198 }, { "epoch": 2.52, "grad_norm": 0.10845794528722763, "learning_rate": 5.9069674591876534e-05, "loss": 1.0832, "step": 199 }, { "epoch": 2.53, "grad_norm": 0.115594282746315, "learning_rate": 5.811009300323818e-05, "loss": 1.0881, "step": 200 }, { "epoch": 2.53, "eval_loss": 1.1978853940963745, "eval_runtime": 11.0803, "eval_samples_per_second": 234.74, "eval_steps_per_second": 58.753, "step": 200 }, { "epoch": 2.54, "grad_norm": 0.10974434018135071, "learning_rate": 5.7155166981706956e-05, "loss": 1.1413, "step": 201 }, { "epoch": 2.56, "grad_norm": 0.10608117282390594, "learning_rate": 5.620500265600206e-05, "loss": 1.1216, "step": 202 }, { "epoch": 2.57, "grad_norm": 0.1109042689204216, "learning_rate": 5.525970562563656e-05, "loss": 1.1106, "step": 203 }, { "epoch": 2.58, "grad_norm": 0.11192983388900757, "learning_rate": 5.431938094918132e-05, "loss": 1.1122, "step": 204 }, { "epoch": 2.59, "grad_norm": 0.10309618711471558, "learning_rate": 5.3384133132588784e-05, "loss": 1.1012, "step": 205 }, { "epoch": 2.61, "grad_norm": 0.09989263117313385, "learning_rate": 5.2454066117578815e-05, "loss": 1.1164, "step": 206 }, { "epoch": 2.62, "grad_norm": 0.1028265655040741, "learning_rate": 5.152928327008635e-05, "loss": 1.1335, "step": 207 }, { "epoch": 2.63, "grad_norm": 0.1162625402212143, "learning_rate": 5.060988736877366e-05, "loss": 1.1484, "step": 208 }, { "epoch": 2.65, "grad_norm": 0.10568027198314667, "learning_rate": 4.9695980593607817e-05, "loss": 1.0821, "step": 209 }, { "epoch": 2.66, "grad_norm": 0.10472647845745087, "learning_rate": 4.8787664514504504e-05, "loss": 1.0867, "step": 210 }, { "epoch": 2.67, "grad_norm": 0.10326943546533585, "learning_rate": 4.788504008003978e-05, "loss": 1.0787, "step": 211 }, { "epoch": 2.68, "grad_norm": 0.10474622249603271, "learning_rate": 4.698820760623064e-05, "loss": 1.1189, "step": 212 }, { "epoch": 2.7, "grad_norm": 0.10698090493679047, "learning_rate": 4.609726676538652e-05, "loss": 1.0987, "step": 213 }, { "epoch": 2.71, "grad_norm": 0.10720671713352203, "learning_rate": 4.521231657503132e-05, "loss": 1.0837, "step": 214 }, { "epoch": 2.72, "grad_norm": 0.10953236371278763, "learning_rate": 4.433345538689929e-05, "loss": 1.1136, "step": 215 }, { "epoch": 2.74, "grad_norm": 0.10939928144216537, "learning_rate": 4.346078087600412e-05, "loss": 1.1346, "step": 216 }, { "epoch": 2.75, "grad_norm": 0.10348028689622879, "learning_rate": 4.2594390029783534e-05, "loss": 1.0539, "step": 217 }, { "epoch": 2.76, "grad_norm": 0.10443831980228424, "learning_rate": 4.173437913732048e-05, "loss": 1.1246, "step": 218 }, { "epoch": 2.77, "grad_norm": 0.10903069376945496, "learning_rate": 4.088084377864135e-05, "loss": 1.0711, "step": 219 }, { "epoch": 2.79, "grad_norm": 0.10401046276092529, "learning_rate": 4.003387881409397e-05, "loss": 1.1117, "step": 220 }, { "epoch": 2.79, "eval_loss": 1.1959346532821655, "eval_runtime": 11.1605, "eval_samples_per_second": 233.054, "eval_steps_per_second": 58.331, "step": 220 }, { "epoch": 2.8, "grad_norm": 0.10746972262859344, "learning_rate": 3.9193578373804364e-05, "loss": 1.1183, "step": 221 }, { "epoch": 2.81, "grad_norm": 0.10168997198343277, "learning_rate": 3.836003584721577e-05, "loss": 1.1311, "step": 222 }, { "epoch": 2.83, "grad_norm": 0.10692451149225235, "learning_rate": 3.7533343872709294e-05, "loss": 1.1081, "step": 223 }, { "epoch": 2.84, "grad_norm": 0.1075156182050705, "learning_rate": 3.671359432730834e-05, "loss": 1.1085, "step": 224 }, { "epoch": 2.85, "grad_norm": 0.10297439247369766, "learning_rate": 3.5900878316467454e-05, "loss": 1.1376, "step": 225 }, { "epoch": 2.86, "grad_norm": 0.10721824318170547, "learning_rate": 3.509528616394716e-05, "loss": 1.1136, "step": 226 }, { "epoch": 2.88, "grad_norm": 0.11028096079826355, "learning_rate": 3.429690740177549e-05, "loss": 1.1318, "step": 227 }, { "epoch": 2.89, "grad_norm": 0.10864273458719254, "learning_rate": 3.350583076029754e-05, "loss": 1.1396, "step": 228 }, { "epoch": 2.9, "grad_norm": 0.1059827134013176, "learning_rate": 3.272214415831418e-05, "loss": 1.1048, "step": 229 }, { "epoch": 2.92, "grad_norm": 0.10945747792720795, "learning_rate": 3.1945934693310896e-05, "loss": 1.106, "step": 230 }, { "epoch": 2.93, "grad_norm": 0.10941856354475021, "learning_rate": 3.117728863177796e-05, "loss": 1.1144, "step": 231 } ], "logging_steps": 1, "max_steps": 308, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 77, "total_flos": 6.893136578000978e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }