{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9940119760479043, "eval_steps": 500, "global_step": 13500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022177866489243733, "grad_norm": 4.408344745635986, "learning_rate": 2.3645320197044334e-06, "loss": 1.7171, "step": 100 }, { "epoch": 0.04435573297848747, "grad_norm": 0.8857895135879517, "learning_rate": 4.8275862068965525e-06, "loss": 0.6279, "step": 200 }, { "epoch": 0.0665335994677312, "grad_norm": 2.241079807281494, "learning_rate": 7.290640394088671e-06, "loss": 0.5064, "step": 300 }, { "epoch": 0.08871146595697493, "grad_norm": 1.0983695983886719, "learning_rate": 9.75369458128079e-06, "loss": 0.4626, "step": 400 }, { "epoch": 0.11088933244621868, "grad_norm": 1.031287670135498, "learning_rate": 1.2216748768472909e-05, "loss": 0.423, "step": 500 }, { "epoch": 0.1330671989354624, "grad_norm": 1.961317777633667, "learning_rate": 1.4679802955665026e-05, "loss": 0.4184, "step": 600 }, { "epoch": 0.15524506542470615, "grad_norm": 1.3524340391159058, "learning_rate": 1.7142857142857142e-05, "loss": 0.397, "step": 700 }, { "epoch": 0.17742293191394987, "grad_norm": 1.7465412616729736, "learning_rate": 1.9605911330049263e-05, "loss": 0.3795, "step": 800 }, { "epoch": 0.1996007984031936, "grad_norm": 1.1473759412765503, "learning_rate": 1.986787259142745e-05, "loss": 0.3657, "step": 900 }, { "epoch": 0.22177866489243736, "grad_norm": 1.5489747524261475, "learning_rate": 1.9710578057412507e-05, "loss": 0.3726, "step": 1000 }, { "epoch": 0.24395653138168108, "grad_norm": 1.2034003734588623, "learning_rate": 1.9553283523397563e-05, "loss": 0.3502, "step": 1100 }, { "epoch": 0.2661343978709248, "grad_norm": 2.0690724849700928, "learning_rate": 1.939598898938262e-05, "loss": 0.3518, "step": 1200 }, { "epoch": 0.28831226436016855, "grad_norm": 1.9681050777435303, "learning_rate": 1.9238694455367677e-05, "loss": 0.3361, "step": 1300 }, { "epoch": 0.3104901308494123, "grad_norm": 1.3863286972045898, "learning_rate": 1.9081399921352733e-05, "loss": 0.3397, "step": 1400 }, { "epoch": 0.33266799733865604, "grad_norm": 0.9572322964668274, "learning_rate": 1.8924105387337793e-05, "loss": 0.3339, "step": 1500 }, { "epoch": 0.35484586382789973, "grad_norm": 0.892398476600647, "learning_rate": 1.8766810853322847e-05, "loss": 0.3218, "step": 1600 }, { "epoch": 0.3770237303171435, "grad_norm": 1.2381540536880493, "learning_rate": 1.8609516319307907e-05, "loss": 0.3255, "step": 1700 }, { "epoch": 0.3992015968063872, "grad_norm": 0.8742302060127258, "learning_rate": 1.8452221785292963e-05, "loss": 0.3166, "step": 1800 }, { "epoch": 0.421379463295631, "grad_norm": 1.0703165531158447, "learning_rate": 1.829492725127802e-05, "loss": 0.3098, "step": 1900 }, { "epoch": 0.4435573297848747, "grad_norm": 1.6606981754302979, "learning_rate": 1.8137632717263076e-05, "loss": 0.3102, "step": 2000 }, { "epoch": 0.4657351962741184, "grad_norm": 1.0174481868743896, "learning_rate": 1.7980338183248133e-05, "loss": 0.3061, "step": 2100 }, { "epoch": 0.48791306276336216, "grad_norm": 0.9234058856964111, "learning_rate": 1.7823043649233193e-05, "loss": 0.3023, "step": 2200 }, { "epoch": 0.5100909292526059, "grad_norm": 0.8972137570381165, "learning_rate": 1.7665749115218246e-05, "loss": 0.3062, "step": 2300 }, { "epoch": 0.5322687957418496, "grad_norm": 0.7803289890289307, "learning_rate": 1.7508454581203306e-05, "loss": 0.2996, "step": 2400 }, { "epoch": 0.5544466622310934, "grad_norm": 0.879205584526062, "learning_rate": 1.7351160047188363e-05, "loss": 0.303, "step": 2500 }, { "epoch": 0.5766245287203371, "grad_norm": 1.0589395761489868, "learning_rate": 1.719386551317342e-05, "loss": 0.2876, "step": 2600 }, { "epoch": 0.5988023952095808, "grad_norm": 0.9810135960578918, "learning_rate": 1.7036570979158476e-05, "loss": 0.2841, "step": 2700 }, { "epoch": 0.6209802616988246, "grad_norm": 0.835926353931427, "learning_rate": 1.6879276445143533e-05, "loss": 0.2861, "step": 2800 }, { "epoch": 0.6431581281880683, "grad_norm": 0.9618144631385803, "learning_rate": 1.672198191112859e-05, "loss": 0.2881, "step": 2900 }, { "epoch": 0.6653359946773121, "grad_norm": 1.2271337509155273, "learning_rate": 1.6564687377113646e-05, "loss": 0.2795, "step": 3000 }, { "epoch": 0.6875138611665558, "grad_norm": 0.933788537979126, "learning_rate": 1.6407392843098702e-05, "loss": 0.2758, "step": 3100 }, { "epoch": 0.7096917276557995, "grad_norm": 1.3361326456069946, "learning_rate": 1.6250098309083762e-05, "loss": 0.2755, "step": 3200 }, { "epoch": 0.7318695941450433, "grad_norm": 0.9134598970413208, "learning_rate": 1.6092803775068816e-05, "loss": 0.2693, "step": 3300 }, { "epoch": 0.754047460634287, "grad_norm": 0.8436419367790222, "learning_rate": 1.5935509241053876e-05, "loss": 0.2709, "step": 3400 }, { "epoch": 0.7762253271235308, "grad_norm": 0.7325775623321533, "learning_rate": 1.5778214707038932e-05, "loss": 0.2766, "step": 3500 }, { "epoch": 0.7984031936127745, "grad_norm": 0.9576388597488403, "learning_rate": 1.562092017302399e-05, "loss": 0.2683, "step": 3600 }, { "epoch": 0.8205810601020181, "grad_norm": 0.812353789806366, "learning_rate": 1.5463625639009045e-05, "loss": 0.2643, "step": 3700 }, { "epoch": 0.842758926591262, "grad_norm": 1.00551176071167, "learning_rate": 1.5306331104994102e-05, "loss": 0.2696, "step": 3800 }, { "epoch": 0.8649367930805056, "grad_norm": 0.7504218816757202, "learning_rate": 1.5149036570979159e-05, "loss": 0.262, "step": 3900 }, { "epoch": 0.8871146595697494, "grad_norm": 0.6838926076889038, "learning_rate": 1.4991742036964217e-05, "loss": 0.2582, "step": 4000 }, { "epoch": 0.9092925260589931, "grad_norm": 0.9068514108657837, "learning_rate": 1.4834447502949274e-05, "loss": 0.2613, "step": 4100 }, { "epoch": 0.9314703925482368, "grad_norm": 0.8156359791755676, "learning_rate": 1.4677152968934332e-05, "loss": 0.2575, "step": 4200 }, { "epoch": 0.9536482590374806, "grad_norm": 0.8061220049858093, "learning_rate": 1.4519858434919387e-05, "loss": 0.2512, "step": 4300 }, { "epoch": 0.9758261255267243, "grad_norm": 0.7665420174598694, "learning_rate": 1.4362563900904445e-05, "loss": 0.2551, "step": 4400 }, { "epoch": 0.998003992015968, "grad_norm": 1.094953179359436, "learning_rate": 1.4205269366889502e-05, "loss": 0.2515, "step": 4500 }, { "epoch": 1.0201818585052118, "grad_norm": 1.0698802471160889, "learning_rate": 1.4047974832874558e-05, "loss": 0.2425, "step": 4600 }, { "epoch": 1.0423597249944556, "grad_norm": 0.9805143475532532, "learning_rate": 1.3890680298859615e-05, "loss": 0.2353, "step": 4700 }, { "epoch": 1.0645375914836992, "grad_norm": 1.0466519594192505, "learning_rate": 1.3733385764844673e-05, "loss": 0.2449, "step": 4800 }, { "epoch": 1.086715457972943, "grad_norm": 0.9419561624526978, "learning_rate": 1.3576091230829728e-05, "loss": 0.2362, "step": 4900 }, { "epoch": 1.1088933244621868, "grad_norm": 0.9370637536048889, "learning_rate": 1.3418796696814786e-05, "loss": 0.2327, "step": 5000 }, { "epoch": 1.1310711909514304, "grad_norm": 0.7672102451324463, "learning_rate": 1.3261502162799845e-05, "loss": 0.2337, "step": 5100 }, { "epoch": 1.1532490574406742, "grad_norm": 1.0745601654052734, "learning_rate": 1.3104207628784901e-05, "loss": 0.24, "step": 5200 }, { "epoch": 1.175426923929918, "grad_norm": 1.0820897817611694, "learning_rate": 1.2946913094769958e-05, "loss": 0.2271, "step": 5300 }, { "epoch": 1.1976047904191618, "grad_norm": 1.155911922454834, "learning_rate": 1.2789618560755015e-05, "loss": 0.2361, "step": 5400 }, { "epoch": 1.2197826569084054, "grad_norm": 0.9654746651649475, "learning_rate": 1.2632324026740073e-05, "loss": 0.2389, "step": 5500 }, { "epoch": 1.2419605233976492, "grad_norm": 1.0573245286941528, "learning_rate": 1.2475029492725128e-05, "loss": 0.2264, "step": 5600 }, { "epoch": 1.264138389886893, "grad_norm": 1.3749500513076782, "learning_rate": 1.2317734958710186e-05, "loss": 0.229, "step": 5700 }, { "epoch": 1.2863162563761366, "grad_norm": 0.9389622211456299, "learning_rate": 1.2160440424695243e-05, "loss": 0.2277, "step": 5800 }, { "epoch": 1.3084941228653804, "grad_norm": 1.2547938823699951, "learning_rate": 1.20031458906803e-05, "loss": 0.2265, "step": 5900 }, { "epoch": 1.3306719893546242, "grad_norm": 1.1487092971801758, "learning_rate": 1.1845851356665356e-05, "loss": 0.2266, "step": 6000 }, { "epoch": 1.3528498558438677, "grad_norm": 0.6461149454116821, "learning_rate": 1.1688556822650414e-05, "loss": 0.2235, "step": 6100 }, { "epoch": 1.3750277223331115, "grad_norm": 0.8437641859054565, "learning_rate": 1.1531262288635473e-05, "loss": 0.2266, "step": 6200 }, { "epoch": 1.3972055888223553, "grad_norm": 0.8984001278877258, "learning_rate": 1.1373967754620527e-05, "loss": 0.2195, "step": 6300 }, { "epoch": 1.419383455311599, "grad_norm": 1.1755112409591675, "learning_rate": 1.1216673220605586e-05, "loss": 0.2168, "step": 6400 }, { "epoch": 1.4415613218008427, "grad_norm": 1.250999927520752, "learning_rate": 1.1059378686590642e-05, "loss": 0.2214, "step": 6500 }, { "epoch": 1.4637391882900865, "grad_norm": 1.2418690919876099, "learning_rate": 1.0902084152575699e-05, "loss": 0.2196, "step": 6600 }, { "epoch": 1.4859170547793301, "grad_norm": 0.9416905641555786, "learning_rate": 1.0744789618560756e-05, "loss": 0.2237, "step": 6700 }, { "epoch": 1.508094921268574, "grad_norm": 0.9549462199211121, "learning_rate": 1.0587495084545814e-05, "loss": 0.2231, "step": 6800 }, { "epoch": 1.5302727877578177, "grad_norm": 0.9897739291191101, "learning_rate": 1.0430200550530869e-05, "loss": 0.221, "step": 6900 }, { "epoch": 1.5524506542470613, "grad_norm": 1.0174314975738525, "learning_rate": 1.0272906016515927e-05, "loss": 0.2193, "step": 7000 }, { "epoch": 1.5746285207363053, "grad_norm": 0.8986598253250122, "learning_rate": 1.0115611482500984e-05, "loss": 0.2114, "step": 7100 }, { "epoch": 1.596806387225549, "grad_norm": 0.7662016749382019, "learning_rate": 9.95831694848604e-06, "loss": 0.2162, "step": 7200 }, { "epoch": 1.6189842537147925, "grad_norm": 0.875023603439331, "learning_rate": 9.801022414471097e-06, "loss": 0.2093, "step": 7300 }, { "epoch": 1.6411621202040365, "grad_norm": 1.059648036956787, "learning_rate": 9.643727880456155e-06, "loss": 0.2114, "step": 7400 }, { "epoch": 1.66333998669328, "grad_norm": 1.2008799314498901, "learning_rate": 9.486433346441212e-06, "loss": 0.2129, "step": 7500 }, { "epoch": 1.685517853182524, "grad_norm": 1.009397029876709, "learning_rate": 9.32913881242627e-06, "loss": 0.2069, "step": 7600 }, { "epoch": 1.7076957196717677, "grad_norm": 0.9461073875427246, "learning_rate": 9.171844278411327e-06, "loss": 0.2109, "step": 7700 }, { "epoch": 1.7298735861610113, "grad_norm": 0.7946839332580566, "learning_rate": 9.014549744396383e-06, "loss": 0.2051, "step": 7800 }, { "epoch": 1.752051452650255, "grad_norm": 1.0686787366867065, "learning_rate": 8.85725521038144e-06, "loss": 0.2114, "step": 7900 }, { "epoch": 1.7742293191394989, "grad_norm": 1.1309982538223267, "learning_rate": 8.699960676366497e-06, "loss": 0.2113, "step": 8000 }, { "epoch": 1.7964071856287425, "grad_norm": 0.8873094320297241, "learning_rate": 8.542666142351555e-06, "loss": 0.2032, "step": 8100 }, { "epoch": 1.8185850521179863, "grad_norm": 1.1685720682144165, "learning_rate": 8.385371608336611e-06, "loss": 0.2046, "step": 8200 }, { "epoch": 1.84076291860723, "grad_norm": 1.1391305923461914, "learning_rate": 8.228077074321668e-06, "loss": 0.2059, "step": 8300 }, { "epoch": 1.8629407850964737, "grad_norm": 1.0028046369552612, "learning_rate": 8.070782540306725e-06, "loss": 0.2051, "step": 8400 }, { "epoch": 1.8851186515857175, "grad_norm": 1.3470697402954102, "learning_rate": 7.913488006291781e-06, "loss": 0.2059, "step": 8500 }, { "epoch": 1.9072965180749613, "grad_norm": 1.290456771850586, "learning_rate": 7.75619347227684e-06, "loss": 0.1995, "step": 8600 }, { "epoch": 1.9294743845642048, "grad_norm": 0.7506065964698792, "learning_rate": 7.598898938261896e-06, "loss": 0.2011, "step": 8700 }, { "epoch": 1.9516522510534486, "grad_norm": 1.170919418334961, "learning_rate": 7.441604404246953e-06, "loss": 0.2017, "step": 8800 }, { "epoch": 1.9738301175426924, "grad_norm": 1.1888222694396973, "learning_rate": 7.28430987023201e-06, "loss": 0.1998, "step": 8900 }, { "epoch": 1.996007984031936, "grad_norm": 1.1401287317276, "learning_rate": 7.127015336217067e-06, "loss": 0.1996, "step": 9000 }, { "epoch": 2.01818585052118, "grad_norm": 1.0609304904937744, "learning_rate": 6.969720802202124e-06, "loss": 0.194, "step": 9100 }, { "epoch": 2.0403637170104236, "grad_norm": 0.7136222124099731, "learning_rate": 6.812426268187181e-06, "loss": 0.1907, "step": 9200 }, { "epoch": 2.062541583499667, "grad_norm": 0.9201442003250122, "learning_rate": 6.6551317341722375e-06, "loss": 0.1899, "step": 9300 }, { "epoch": 2.0847194499889112, "grad_norm": 1.034180998802185, "learning_rate": 6.497837200157295e-06, "loss": 0.1905, "step": 9400 }, { "epoch": 2.106897316478155, "grad_norm": 1.2538888454437256, "learning_rate": 6.340542666142352e-06, "loss": 0.1895, "step": 9500 }, { "epoch": 2.1290751829673984, "grad_norm": 1.1865867376327515, "learning_rate": 6.18324813212741e-06, "loss": 0.1903, "step": 9600 }, { "epoch": 2.1512530494566424, "grad_norm": 1.1879113912582397, "learning_rate": 6.0259535981124665e-06, "loss": 0.1827, "step": 9700 }, { "epoch": 2.173430915945886, "grad_norm": 0.959338903427124, "learning_rate": 5.868659064097523e-06, "loss": 0.1871, "step": 9800 }, { "epoch": 2.1956087824351296, "grad_norm": 1.0765694379806519, "learning_rate": 5.7113645300825806e-06, "loss": 0.1904, "step": 9900 }, { "epoch": 2.2177866489243736, "grad_norm": 1.1562960147857666, "learning_rate": 5.554069996067637e-06, "loss": 0.1852, "step": 10000 }, { "epoch": 2.239964515413617, "grad_norm": 1.1772807836532593, "learning_rate": 5.396775462052695e-06, "loss": 0.1875, "step": 10100 }, { "epoch": 2.2621423819028608, "grad_norm": 0.9771366715431213, "learning_rate": 5.239480928037751e-06, "loss": 0.1899, "step": 10200 }, { "epoch": 2.284320248392105, "grad_norm": 0.7828590273857117, "learning_rate": 5.082186394022808e-06, "loss": 0.1846, "step": 10300 }, { "epoch": 2.3064981148813484, "grad_norm": 1.0688682794570923, "learning_rate": 4.924891860007865e-06, "loss": 0.186, "step": 10400 }, { "epoch": 2.3286759813705924, "grad_norm": 1.2667362689971924, "learning_rate": 4.767597325992922e-06, "loss": 0.186, "step": 10500 }, { "epoch": 2.350853847859836, "grad_norm": 0.9742441177368164, "learning_rate": 4.610302791977979e-06, "loss": 0.1822, "step": 10600 }, { "epoch": 2.3730317143490796, "grad_norm": 0.8631011843681335, "learning_rate": 4.453008257963036e-06, "loss": 0.1789, "step": 10700 }, { "epoch": 2.3952095808383236, "grad_norm": 0.7579483985900879, "learning_rate": 4.2957137239480934e-06, "loss": 0.1865, "step": 10800 }, { "epoch": 2.417387447327567, "grad_norm": 0.8615408539772034, "learning_rate": 4.13841918993315e-06, "loss": 0.1805, "step": 10900 }, { "epoch": 2.4395653138168107, "grad_norm": 1.0644463300704956, "learning_rate": 3.9811246559182075e-06, "loss": 0.1849, "step": 11000 }, { "epoch": 2.4617431803060548, "grad_norm": 0.9933910965919495, "learning_rate": 3.823830121903264e-06, "loss": 0.1846, "step": 11100 }, { "epoch": 2.4839210467952983, "grad_norm": 1.011958360671997, "learning_rate": 3.666535587888321e-06, "loss": 0.1863, "step": 11200 }, { "epoch": 2.506098913284542, "grad_norm": 1.0306683778762817, "learning_rate": 3.5092410538733786e-06, "loss": 0.1853, "step": 11300 }, { "epoch": 2.528276779773786, "grad_norm": 1.0129719972610474, "learning_rate": 3.351946519858435e-06, "loss": 0.1855, "step": 11400 }, { "epoch": 2.5504546462630295, "grad_norm": 1.0215705633163452, "learning_rate": 3.1946519858434922e-06, "loss": 0.1867, "step": 11500 }, { "epoch": 2.572632512752273, "grad_norm": 1.202038288116455, "learning_rate": 3.0373574518285493e-06, "loss": 0.1839, "step": 11600 }, { "epoch": 2.594810379241517, "grad_norm": 1.19171142578125, "learning_rate": 2.8800629178136063e-06, "loss": 0.1776, "step": 11700 }, { "epoch": 2.6169882457307607, "grad_norm": 1.0898429155349731, "learning_rate": 2.7227683837986633e-06, "loss": 0.178, "step": 11800 }, { "epoch": 2.6391661122200043, "grad_norm": 1.005279779434204, "learning_rate": 2.56547384978372e-06, "loss": 0.1811, "step": 11900 }, { "epoch": 2.6613439787092483, "grad_norm": 1.0780277252197266, "learning_rate": 2.408179315768777e-06, "loss": 0.1831, "step": 12000 }, { "epoch": 2.683521845198492, "grad_norm": 1.318746566772461, "learning_rate": 2.252457727093984e-06, "loss": 0.1835, "step": 12100 }, { "epoch": 2.7056997116877355, "grad_norm": 1.289838433265686, "learning_rate": 2.0951631930790405e-06, "loss": 0.1813, "step": 12200 }, { "epoch": 2.7278775781769795, "grad_norm": 0.806324303150177, "learning_rate": 1.9378686590640976e-06, "loss": 0.1778, "step": 12300 }, { "epoch": 2.750055444666223, "grad_norm": 1.2230814695358276, "learning_rate": 1.7805741250491546e-06, "loss": 0.1797, "step": 12400 }, { "epoch": 2.7722333111554667, "grad_norm": 1.0323050022125244, "learning_rate": 1.6232795910342116e-06, "loss": 0.1832, "step": 12500 }, { "epoch": 2.7944111776447107, "grad_norm": 0.9353643655776978, "learning_rate": 1.4659850570192689e-06, "loss": 0.1828, "step": 12600 }, { "epoch": 2.8165890441339543, "grad_norm": 0.8385490775108337, "learning_rate": 1.3086905230043257e-06, "loss": 0.1763, "step": 12700 }, { "epoch": 2.838766910623198, "grad_norm": 0.9432787299156189, "learning_rate": 1.1513959889893827e-06, "loss": 0.18, "step": 12800 }, { "epoch": 2.860944777112442, "grad_norm": 1.0854963064193726, "learning_rate": 9.941014549744397e-07, "loss": 0.1786, "step": 12900 }, { "epoch": 2.8831226436016855, "grad_norm": 1.0914461612701416, "learning_rate": 8.368069209594968e-07, "loss": 0.1804, "step": 13000 }, { "epoch": 2.905300510090929, "grad_norm": 0.8744707703590393, "learning_rate": 6.795123869445537e-07, "loss": 0.1776, "step": 13100 }, { "epoch": 2.927478376580173, "grad_norm": 1.073390245437622, "learning_rate": 5.222178529296107e-07, "loss": 0.1797, "step": 13200 }, { "epoch": 2.9496562430694167, "grad_norm": 1.0887576341629028, "learning_rate": 3.6492331891466777e-07, "loss": 0.1791, "step": 13300 }, { "epoch": 2.9718341095586602, "grad_norm": 1.3841413259506226, "learning_rate": 2.0762878489972477e-07, "loss": 0.1792, "step": 13400 }, { "epoch": 2.9940119760479043, "grad_norm": 1.0988340377807617, "learning_rate": 5.033425088478176e-08, "loss": 0.1834, "step": 13500 } ], "logging_steps": 100, "max_steps": 13527, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.765106604499366e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }