{ "best_metric": 0.1550179123878479, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.12281276146822365, "eval_steps": 100, "global_step": 422, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002910255011095347, "grad_norm": 0.5728808045387268, "learning_rate": 2e-05, "loss": 0.6246, "step": 1 }, { "epoch": 0.0002910255011095347, "eval_loss": 0.6954387426376343, "eval_runtime": 1569.9362, "eval_samples_per_second": 7.373, "eval_steps_per_second": 1.843, "step": 1 }, { "epoch": 0.0005820510022190694, "grad_norm": 0.6031620502471924, "learning_rate": 4e-05, "loss": 0.6844, "step": 2 }, { "epoch": 0.0008730765033286041, "grad_norm": 0.592772364616394, "learning_rate": 6e-05, "loss": 0.6462, "step": 3 }, { "epoch": 0.0011641020044381389, "grad_norm": 0.6578194499015808, "learning_rate": 8e-05, "loss": 0.6733, "step": 4 }, { "epoch": 0.0014551275055476737, "grad_norm": 0.7925109267234802, "learning_rate": 0.0001, "loss": 0.6427, "step": 5 }, { "epoch": 0.0017461530066572083, "grad_norm": 0.877795934677124, "learning_rate": 0.00012, "loss": 0.5856, "step": 6 }, { "epoch": 0.002037178507766743, "grad_norm": 0.6358482837677002, "learning_rate": 0.00014, "loss": 0.4832, "step": 7 }, { "epoch": 0.0023282040088762777, "grad_norm": 0.562201201915741, "learning_rate": 0.00016, "loss": 0.412, "step": 8 }, { "epoch": 0.0026192295099858125, "grad_norm": 0.47625571489334106, "learning_rate": 0.00018, "loss": 0.3772, "step": 9 }, { "epoch": 0.0029102550110953473, "grad_norm": 0.2741421163082123, "learning_rate": 0.0002, "loss": 0.309, "step": 10 }, { "epoch": 0.003201280512204882, "grad_norm": 0.22830943763256073, "learning_rate": 0.00019999709281135722, "loss": 0.2466, "step": 11 }, { "epoch": 0.0034923060133144166, "grad_norm": 0.28191593289375305, "learning_rate": 0.00019998837141446378, "loss": 0.257, "step": 12 }, { "epoch": 0.0037833315144239514, "grad_norm": 0.18168823421001434, "learning_rate": 0.00019997383631641463, "loss": 0.2153, "step": 13 }, { "epoch": 0.004074357015533486, "grad_norm": 0.15139496326446533, "learning_rate": 0.00019995348836233516, "loss": 0.1961, "step": 14 }, { "epoch": 0.004365382516643021, "grad_norm": 0.14195305109024048, "learning_rate": 0.00019992732873533222, "loss": 0.2014, "step": 15 }, { "epoch": 0.004656408017752555, "grad_norm": 0.125483900308609, "learning_rate": 0.00019989535895642525, "loss": 0.1955, "step": 16 }, { "epoch": 0.004947433518862091, "grad_norm": 0.13404949009418488, "learning_rate": 0.00019985758088445773, "loss": 0.2072, "step": 17 }, { "epoch": 0.005238459019971625, "grad_norm": 0.1380385011434555, "learning_rate": 0.00019981399671598939, "loss": 0.1803, "step": 18 }, { "epoch": 0.005529484521081159, "grad_norm": 0.11921346187591553, "learning_rate": 0.00019976460898516818, "loss": 0.1949, "step": 19 }, { "epoch": 0.005820510022190695, "grad_norm": 0.12015374004840851, "learning_rate": 0.00019970942056358307, "loss": 0.1801, "step": 20 }, { "epoch": 0.006111535523300229, "grad_norm": 0.11771855503320694, "learning_rate": 0.00019964843466009714, "loss": 0.187, "step": 21 }, { "epoch": 0.006402561024409764, "grad_norm": 0.09964954853057861, "learning_rate": 0.00019958165482066094, "loss": 0.1856, "step": 22 }, { "epoch": 0.006693586525519299, "grad_norm": 0.09882669895887375, "learning_rate": 0.00019950908492810622, "loss": 0.1749, "step": 23 }, { "epoch": 0.006984612026628833, "grad_norm": 0.10145384073257446, "learning_rate": 0.0001994307292019204, "loss": 0.1966, "step": 24 }, { "epoch": 0.007275637527738368, "grad_norm": 0.097194142639637, "learning_rate": 0.000199346592198001, "loss": 0.1866, "step": 25 }, { "epoch": 0.007566663028847903, "grad_norm": 0.0885874480009079, "learning_rate": 0.0001992566788083908, "loss": 0.167, "step": 26 }, { "epoch": 0.007857688529957437, "grad_norm": 0.09368938952684402, "learning_rate": 0.0001991609942609936, "loss": 0.1761, "step": 27 }, { "epoch": 0.008148714031066972, "grad_norm": 0.09040655195713043, "learning_rate": 0.00019905954411926992, "loss": 0.1842, "step": 28 }, { "epoch": 0.008439739532176508, "grad_norm": 0.08293917775154114, "learning_rate": 0.00019895233428191377, "loss": 0.1615, "step": 29 }, { "epoch": 0.008730765033286042, "grad_norm": 0.09155099838972092, "learning_rate": 0.00019883937098250963, "loss": 0.1766, "step": 30 }, { "epoch": 0.009021790534395576, "grad_norm": 0.08348311483860016, "learning_rate": 0.00019872066078916988, "loss": 0.1489, "step": 31 }, { "epoch": 0.00931281603550511, "grad_norm": 0.08968468010425568, "learning_rate": 0.000198596210604153, "loss": 0.1978, "step": 32 }, { "epoch": 0.009603841536614645, "grad_norm": 0.08129172772169113, "learning_rate": 0.00019846602766346235, "loss": 0.1637, "step": 33 }, { "epoch": 0.009894867037724181, "grad_norm": 0.07663632184267044, "learning_rate": 0.00019833011953642525, "loss": 0.1746, "step": 34 }, { "epoch": 0.010185892538833716, "grad_norm": 0.07788729667663574, "learning_rate": 0.00019818849412525294, "loss": 0.1841, "step": 35 }, { "epoch": 0.01047691803994325, "grad_norm": 0.0812305361032486, "learning_rate": 0.00019804115966458115, "loss": 0.1938, "step": 36 }, { "epoch": 0.010767943541052784, "grad_norm": 0.08344654738903046, "learning_rate": 0.00019788812472099136, "loss": 0.175, "step": 37 }, { "epoch": 0.011058969042162319, "grad_norm": 0.08840890228748322, "learning_rate": 0.0001977293981925125, "loss": 0.2129, "step": 38 }, { "epoch": 0.011349994543271855, "grad_norm": 0.0735771432518959, "learning_rate": 0.00019756498930810382, "loss": 0.1713, "step": 39 }, { "epoch": 0.01164102004438139, "grad_norm": 0.07437434047460556, "learning_rate": 0.00019739490762711812, "loss": 0.1559, "step": 40 }, { "epoch": 0.011932045545490924, "grad_norm": 0.07056687772274017, "learning_rate": 0.00019721916303874605, "loss": 0.175, "step": 41 }, { "epoch": 0.012223071046600458, "grad_norm": 0.07450737059116364, "learning_rate": 0.00019703776576144105, "loss": 0.1911, "step": 42 }, { "epoch": 0.012514096547709993, "grad_norm": 0.07784494012594223, "learning_rate": 0.00019685072634232522, "loss": 0.1826, "step": 43 }, { "epoch": 0.012805122048819529, "grad_norm": 0.06952424347400665, "learning_rate": 0.00019665805565657603, "loss": 0.1661, "step": 44 }, { "epoch": 0.013096147549929063, "grad_norm": 0.08172561228275299, "learning_rate": 0.00019645976490679403, "loss": 0.1906, "step": 45 }, { "epoch": 0.013387173051038597, "grad_norm": 0.0775056853890419, "learning_rate": 0.0001962558656223516, "loss": 0.1821, "step": 46 }, { "epoch": 0.013678198552148132, "grad_norm": 0.07613001763820648, "learning_rate": 0.0001960463696587224, "loss": 0.1815, "step": 47 }, { "epoch": 0.013969224053257666, "grad_norm": 0.07065246999263763, "learning_rate": 0.00019583128919679215, "loss": 0.1691, "step": 48 }, { "epoch": 0.0142602495543672, "grad_norm": 0.06258969753980637, "learning_rate": 0.00019561063674215036, "loss": 0.156, "step": 49 }, { "epoch": 0.014551275055476737, "grad_norm": 0.06769802421331406, "learning_rate": 0.00019538442512436328, "loss": 0.1547, "step": 50 }, { "epoch": 0.014842300556586271, "grad_norm": 0.07802195101976395, "learning_rate": 0.00019515266749622778, "loss": 0.1827, "step": 51 }, { "epoch": 0.015133326057695805, "grad_norm": 0.070973701775074, "learning_rate": 0.00019491537733300676, "loss": 0.1553, "step": 52 }, { "epoch": 0.01542435155880534, "grad_norm": 0.07104197889566422, "learning_rate": 0.0001946725684316456, "loss": 0.1791, "step": 53 }, { "epoch": 0.015715377059914874, "grad_norm": 0.0786806121468544, "learning_rate": 0.00019442425490996988, "loss": 0.1875, "step": 54 }, { "epoch": 0.01600640256102441, "grad_norm": 0.06334872543811798, "learning_rate": 0.0001941704512058646, "loss": 0.1738, "step": 55 }, { "epoch": 0.016297428062133943, "grad_norm": 0.07314591854810715, "learning_rate": 0.0001939111720764347, "loss": 0.1454, "step": 56 }, { "epoch": 0.01658845356324348, "grad_norm": 0.07218274474143982, "learning_rate": 0.00019364643259714694, "loss": 0.1813, "step": 57 }, { "epoch": 0.016879479064353015, "grad_norm": 0.07224708050489426, "learning_rate": 0.00019337624816095358, "loss": 0.1849, "step": 58 }, { "epoch": 0.017170504565462548, "grad_norm": 0.069266676902771, "learning_rate": 0.00019310063447739698, "loss": 0.1659, "step": 59 }, { "epoch": 0.017461530066572084, "grad_norm": 0.06808722019195557, "learning_rate": 0.0001928196075716966, "loss": 0.1735, "step": 60 }, { "epoch": 0.017752555567681617, "grad_norm": 0.06713879108428955, "learning_rate": 0.00019253318378381704, "loss": 0.1592, "step": 61 }, { "epoch": 0.018043581068791153, "grad_norm": 0.07436138391494751, "learning_rate": 0.00019224137976751795, "loss": 0.1817, "step": 62 }, { "epoch": 0.01833460656990069, "grad_norm": 0.08002530038356781, "learning_rate": 0.00019194421248938575, "loss": 0.1746, "step": 63 }, { "epoch": 0.01862563207101022, "grad_norm": 0.076021708548069, "learning_rate": 0.00019164169922784716, "loss": 0.169, "step": 64 }, { "epoch": 0.018916657572119758, "grad_norm": 0.06513865292072296, "learning_rate": 0.00019133385757216459, "loss": 0.1562, "step": 65 }, { "epoch": 0.01920768307322929, "grad_norm": 0.07233449071645737, "learning_rate": 0.00019102070542141328, "loss": 0.1782, "step": 66 }, { "epoch": 0.019498708574338827, "grad_norm": 0.068657785654068, "learning_rate": 0.00019070226098344078, "loss": 0.162, "step": 67 }, { "epoch": 0.019789734075448363, "grad_norm": 0.07379890233278275, "learning_rate": 0.0001903785427738082, "loss": 0.1697, "step": 68 }, { "epoch": 0.020080759576557895, "grad_norm": 0.06616660207509995, "learning_rate": 0.00019004956961471355, "loss": 0.1379, "step": 69 }, { "epoch": 0.02037178507766743, "grad_norm": 0.0688454881310463, "learning_rate": 0.00018971536063389744, "loss": 0.1668, "step": 70 }, { "epoch": 0.020662810578776964, "grad_norm": 0.06237876042723656, "learning_rate": 0.00018937593526353096, "loss": 0.1414, "step": 71 }, { "epoch": 0.0209538360798865, "grad_norm": 0.06888283044099808, "learning_rate": 0.00018903131323908578, "loss": 0.1648, "step": 72 }, { "epoch": 0.021244861580996036, "grad_norm": 0.06820754706859589, "learning_rate": 0.00018868151459818658, "loss": 0.1705, "step": 73 }, { "epoch": 0.02153588708210557, "grad_norm": 0.07488477230072021, "learning_rate": 0.00018832655967944607, "loss": 0.1818, "step": 74 }, { "epoch": 0.021826912583215105, "grad_norm": 0.06322386115789413, "learning_rate": 0.00018796646912128245, "loss": 0.1606, "step": 75 }, { "epoch": 0.022117938084324638, "grad_norm": 0.06205340102314949, "learning_rate": 0.00018760126386071935, "loss": 0.1538, "step": 76 }, { "epoch": 0.022408963585434174, "grad_norm": 0.07324095070362091, "learning_rate": 0.00018723096513216842, "loss": 0.1647, "step": 77 }, { "epoch": 0.02269998908654371, "grad_norm": 0.08337172865867615, "learning_rate": 0.0001868555944661949, "loss": 0.1974, "step": 78 }, { "epoch": 0.022991014587653243, "grad_norm": 0.08376014232635498, "learning_rate": 0.00018647517368826545, "loss": 0.1718, "step": 79 }, { "epoch": 0.02328204008876278, "grad_norm": 0.07914192974567413, "learning_rate": 0.00018608972491747944, "loss": 0.1914, "step": 80 }, { "epoch": 0.02357306558987231, "grad_norm": 0.0706939846277237, "learning_rate": 0.00018569927056528263, "loss": 0.1597, "step": 81 }, { "epoch": 0.023864091090981848, "grad_norm": 0.0692586675286293, "learning_rate": 0.00018530383333416418, "loss": 0.1773, "step": 82 }, { "epoch": 0.024155116592091384, "grad_norm": 0.07192862033843994, "learning_rate": 0.00018490343621633659, "loss": 0.1756, "step": 83 }, { "epoch": 0.024446142093200916, "grad_norm": 0.07629089802503586, "learning_rate": 0.00018449810249239902, "loss": 0.1749, "step": 84 }, { "epoch": 0.024737167594310452, "grad_norm": 0.0677715539932251, "learning_rate": 0.00018408785572998336, "loss": 0.1579, "step": 85 }, { "epoch": 0.025028193095419985, "grad_norm": 0.07510597258806229, "learning_rate": 0.0001836727197823842, "loss": 0.1664, "step": 86 }, { "epoch": 0.02531921859652952, "grad_norm": 0.07853872328996658, "learning_rate": 0.00018325271878717186, "loss": 0.1611, "step": 87 }, { "epoch": 0.025610244097639057, "grad_norm": 0.06929906457662582, "learning_rate": 0.00018282787716478868, "loss": 0.1678, "step": 88 }, { "epoch": 0.02590126959874859, "grad_norm": 0.08053125441074371, "learning_rate": 0.00018239821961712953, "loss": 0.1891, "step": 89 }, { "epoch": 0.026192295099858126, "grad_norm": 0.07177354395389557, "learning_rate": 0.00018196377112610526, "loss": 0.1712, "step": 90 }, { "epoch": 0.02648332060096766, "grad_norm": 0.06295143812894821, "learning_rate": 0.00018152455695219025, "loss": 0.1483, "step": 91 }, { "epoch": 0.026774346102077195, "grad_norm": 0.0677083432674408, "learning_rate": 0.00018108060263295362, "loss": 0.162, "step": 92 }, { "epoch": 0.027065371603186728, "grad_norm": 0.075025275349617, "learning_rate": 0.0001806319339815745, "loss": 0.1755, "step": 93 }, { "epoch": 0.027356397104296264, "grad_norm": 0.07081899046897888, "learning_rate": 0.00018017857708534107, "loss": 0.1608, "step": 94 }, { "epoch": 0.0276474226054058, "grad_norm": 0.0698823481798172, "learning_rate": 0.0001797205583041337, "loss": 0.168, "step": 95 }, { "epoch": 0.027938448106515332, "grad_norm": 0.07225258648395538, "learning_rate": 0.00017925790426889235, "loss": 0.1584, "step": 96 }, { "epoch": 0.02822947360762487, "grad_norm": 0.06617313623428345, "learning_rate": 0.00017879064188006818, "loss": 0.1562, "step": 97 }, { "epoch": 0.0285204991087344, "grad_norm": 0.07458757609128952, "learning_rate": 0.00017831879830605937, "loss": 0.1643, "step": 98 }, { "epoch": 0.028811524609843937, "grad_norm": 0.07620100677013397, "learning_rate": 0.00017784240098163152, "loss": 0.1705, "step": 99 }, { "epoch": 0.029102550110953473, "grad_norm": 0.06909362226724625, "learning_rate": 0.00017736147760632248, "loss": 0.1441, "step": 100 }, { "epoch": 0.029102550110953473, "eval_loss": 0.16861361265182495, "eval_runtime": 1578.1522, "eval_samples_per_second": 7.335, "eval_steps_per_second": 1.834, "step": 100 }, { "epoch": 0.029393575612063006, "grad_norm": 0.06833357363939285, "learning_rate": 0.00017687605614283167, "loss": 0.1575, "step": 101 }, { "epoch": 0.029684601113172542, "grad_norm": 0.0781283900141716, "learning_rate": 0.0001763861648153945, "loss": 0.1809, "step": 102 }, { "epoch": 0.029975626614282075, "grad_norm": 0.06533481925725937, "learning_rate": 0.00017589183210814095, "loss": 0.1639, "step": 103 }, { "epoch": 0.03026665211539161, "grad_norm": 0.06966619938611984, "learning_rate": 0.00017539308676343973, "loss": 0.1554, "step": 104 }, { "epoch": 0.030557677616501147, "grad_norm": 0.065178282558918, "learning_rate": 0.00017488995778022686, "loss": 0.1477, "step": 105 }, { "epoch": 0.03084870311761068, "grad_norm": 0.06928521394729614, "learning_rate": 0.0001743824744123196, "loss": 0.1558, "step": 106 }, { "epoch": 0.031139728618720216, "grad_norm": 0.06974615901708603, "learning_rate": 0.00017387066616671572, "loss": 0.1536, "step": 107 }, { "epoch": 0.03143075411982975, "grad_norm": 0.07279954850673676, "learning_rate": 0.00017335456280187752, "loss": 0.1604, "step": 108 }, { "epoch": 0.031721779620939285, "grad_norm": 0.07937084138393402, "learning_rate": 0.00017283419432600184, "loss": 0.1835, "step": 109 }, { "epoch": 0.03201280512204882, "grad_norm": 0.07247325778007507, "learning_rate": 0.00017230959099527512, "loss": 0.1814, "step": 110 }, { "epoch": 0.03230383062315836, "grad_norm": 0.0691957026720047, "learning_rate": 0.00017178078331211432, "loss": 0.1497, "step": 111 }, { "epoch": 0.032594856124267886, "grad_norm": 0.06361576914787292, "learning_rate": 0.0001712478020233932, "loss": 0.1497, "step": 112 }, { "epoch": 0.03288588162537742, "grad_norm": 0.07963518798351288, "learning_rate": 0.00017071067811865476, "loss": 0.1695, "step": 113 }, { "epoch": 0.03317690712648696, "grad_norm": 0.07183556258678436, "learning_rate": 0.00017016944282830933, "loss": 0.1655, "step": 114 }, { "epoch": 0.033467932627596494, "grad_norm": 0.06393659859895706, "learning_rate": 0.00016962412762181869, "loss": 0.1578, "step": 115 }, { "epoch": 0.03375895812870603, "grad_norm": 0.0766880065202713, "learning_rate": 0.00016907476420586633, "loss": 0.1702, "step": 116 }, { "epoch": 0.03404998362981556, "grad_norm": 0.08281052857637405, "learning_rate": 0.00016852138452251388, "loss": 0.1889, "step": 117 }, { "epoch": 0.034341009130925096, "grad_norm": 0.07395404577255249, "learning_rate": 0.00016796402074734402, "loss": 0.166, "step": 118 }, { "epoch": 0.03463203463203463, "grad_norm": 0.06773527711629868, "learning_rate": 0.0001674027052875895, "loss": 0.1559, "step": 119 }, { "epoch": 0.03492306013314417, "grad_norm": 0.06645803898572922, "learning_rate": 0.00016683747078024888, "loss": 0.1568, "step": 120 }, { "epoch": 0.035214085634253704, "grad_norm": 0.07534414529800415, "learning_rate": 0.00016626835009018892, "loss": 0.1494, "step": 121 }, { "epoch": 0.03550511113536323, "grad_norm": 0.07698463648557663, "learning_rate": 0.00016569537630823383, "loss": 0.1731, "step": 122 }, { "epoch": 0.03579613663647277, "grad_norm": 0.07219712436199188, "learning_rate": 0.000165118582749241, "loss": 0.1609, "step": 123 }, { "epoch": 0.036087162137582306, "grad_norm": 0.06992946565151215, "learning_rate": 0.0001645380029501641, "loss": 0.1525, "step": 124 }, { "epoch": 0.03637818763869184, "grad_norm": 0.08205880224704742, "learning_rate": 0.00016395367066810313, "loss": 0.1477, "step": 125 }, { "epoch": 0.03666921313980138, "grad_norm": 0.07560513913631439, "learning_rate": 0.00016336561987834153, "loss": 0.1787, "step": 126 }, { "epoch": 0.03696023864091091, "grad_norm": 0.07634903490543365, "learning_rate": 0.00016277388477237086, "loss": 0.1622, "step": 127 }, { "epoch": 0.03725126414202044, "grad_norm": 0.07008732855319977, "learning_rate": 0.00016217849975590272, "loss": 0.1583, "step": 128 }, { "epoch": 0.03754228964312998, "grad_norm": 0.07009593397378922, "learning_rate": 0.00016157949944686827, "loss": 0.151, "step": 129 }, { "epoch": 0.037833315144239515, "grad_norm": 0.07215984910726547, "learning_rate": 0.00016097691867340545, "loss": 0.1551, "step": 130 }, { "epoch": 0.03812434064534905, "grad_norm": 0.07626433670520782, "learning_rate": 0.0001603707924718338, "loss": 0.1568, "step": 131 }, { "epoch": 0.03841536614645858, "grad_norm": 0.07062167674303055, "learning_rate": 0.00015976115608461758, "loss": 0.1689, "step": 132 }, { "epoch": 0.03870639164756812, "grad_norm": 0.06966700404882431, "learning_rate": 0.00015914804495831635, "loss": 0.1754, "step": 133 }, { "epoch": 0.03899741714867765, "grad_norm": 0.06686096638441086, "learning_rate": 0.00015853149474152423, "loss": 0.1698, "step": 134 }, { "epoch": 0.03928844264978719, "grad_norm": 0.07211221009492874, "learning_rate": 0.00015791154128279696, "loss": 0.1637, "step": 135 }, { "epoch": 0.039579468150896725, "grad_norm": 0.07641851156949997, "learning_rate": 0.00015728822062856758, "loss": 0.1805, "step": 136 }, { "epoch": 0.039870493652006254, "grad_norm": 0.07227915525436401, "learning_rate": 0.0001566615690210507, "loss": 0.1689, "step": 137 }, { "epoch": 0.04016151915311579, "grad_norm": 0.0694207176566124, "learning_rate": 0.00015603162289613503, "loss": 0.1491, "step": 138 }, { "epoch": 0.04045254465422533, "grad_norm": 0.06391049921512604, "learning_rate": 0.00015539841888126488, "loss": 0.1491, "step": 139 }, { "epoch": 0.04074357015533486, "grad_norm": 0.07112076133489609, "learning_rate": 0.0001547619937933108, "loss": 0.163, "step": 140 }, { "epoch": 0.0410345956564444, "grad_norm": 0.07369551062583923, "learning_rate": 0.00015412238463642845, "loss": 0.1587, "step": 141 }, { "epoch": 0.04132562115755393, "grad_norm": 0.07277291268110275, "learning_rate": 0.00015347962859990744, "loss": 0.1711, "step": 142 }, { "epoch": 0.041616646658663464, "grad_norm": 0.0772562026977539, "learning_rate": 0.00015283376305600866, "loss": 0.1757, "step": 143 }, { "epoch": 0.041907672159773, "grad_norm": 0.0680210217833519, "learning_rate": 0.00015218482555779165, "loss": 0.1739, "step": 144 }, { "epoch": 0.042198697660882536, "grad_norm": 0.06750776618719101, "learning_rate": 0.0001515328538369309, "loss": 0.1507, "step": 145 }, { "epoch": 0.04248972316199207, "grad_norm": 0.0778404027223587, "learning_rate": 0.00015087788580152206, "loss": 0.1668, "step": 146 }, { "epoch": 0.0427807486631016, "grad_norm": 0.07347714900970459, "learning_rate": 0.0001502199595338778, "loss": 0.1709, "step": 147 }, { "epoch": 0.04307177416421114, "grad_norm": 0.07480094581842422, "learning_rate": 0.00014955911328831355, "loss": 0.1639, "step": 148 }, { "epoch": 0.043362799665320674, "grad_norm": 0.07880245894193649, "learning_rate": 0.00014889538548892338, "loss": 0.1838, "step": 149 }, { "epoch": 0.04365382516643021, "grad_norm": 0.07331986725330353, "learning_rate": 0.00014822881472734562, "loss": 0.1686, "step": 150 }, { "epoch": 0.043944850667539746, "grad_norm": 0.062073007225990295, "learning_rate": 0.00014755943976051927, "loss": 0.1414, "step": 151 }, { "epoch": 0.044235876168649275, "grad_norm": 0.0803973376750946, "learning_rate": 0.00014688729950843035, "loss": 0.1627, "step": 152 }, { "epoch": 0.04452690166975881, "grad_norm": 0.07154098898172379, "learning_rate": 0.00014621243305184897, "loss": 0.159, "step": 153 }, { "epoch": 0.04481792717086835, "grad_norm": 0.07353903353214264, "learning_rate": 0.0001455348796300571, "loss": 0.1503, "step": 154 }, { "epoch": 0.045108952671977884, "grad_norm": 0.07070112973451614, "learning_rate": 0.00014485467863856703, "loss": 0.1387, "step": 155 }, { "epoch": 0.04539997817308742, "grad_norm": 0.0715903788805008, "learning_rate": 0.0001441718696268307, "loss": 0.1554, "step": 156 }, { "epoch": 0.04569100367419695, "grad_norm": 0.06608898937702179, "learning_rate": 0.00014348649229594017, "loss": 0.1416, "step": 157 }, { "epoch": 0.045982029175306485, "grad_norm": 0.07506071776151657, "learning_rate": 0.0001427985864963193, "loss": 0.1668, "step": 158 }, { "epoch": 0.04627305467641602, "grad_norm": 0.08147752285003662, "learning_rate": 0.00014210819222540663, "loss": 0.1625, "step": 159 }, { "epoch": 0.04656408017752556, "grad_norm": 0.0735524520277977, "learning_rate": 0.00014141534962532984, "loss": 0.1643, "step": 160 }, { "epoch": 0.046855105678635094, "grad_norm": 0.07707686722278595, "learning_rate": 0.00014072009898057173, "loss": 0.1609, "step": 161 }, { "epoch": 0.04714613117974462, "grad_norm": 0.07367052882909775, "learning_rate": 0.0001400224807156278, "loss": 0.1548, "step": 162 }, { "epoch": 0.04743715668085416, "grad_norm": 0.07454690337181091, "learning_rate": 0.00013932253539265604, "loss": 0.1646, "step": 163 }, { "epoch": 0.047728182181963695, "grad_norm": 0.07515886425971985, "learning_rate": 0.0001386203037091183, "loss": 0.1449, "step": 164 }, { "epoch": 0.04801920768307323, "grad_norm": 0.07659222930669785, "learning_rate": 0.00013791582649541403, "loss": 0.1555, "step": 165 }, { "epoch": 0.04831023318418277, "grad_norm": 0.06687572598457336, "learning_rate": 0.00013720914471250644, "loss": 0.142, "step": 166 }, { "epoch": 0.048601258685292296, "grad_norm": 0.06918177753686905, "learning_rate": 0.00013650029944954048, "loss": 0.15, "step": 167 }, { "epoch": 0.04889228418640183, "grad_norm": 0.07094506174325943, "learning_rate": 0.0001357893319214542, "loss": 0.1356, "step": 168 }, { "epoch": 0.04918330968751137, "grad_norm": 0.07885518670082092, "learning_rate": 0.000135076283466582, "loss": 0.1698, "step": 169 }, { "epoch": 0.049474335188620905, "grad_norm": 0.0780964344739914, "learning_rate": 0.00013436119554425133, "loss": 0.1672, "step": 170 }, { "epoch": 0.04976536068973044, "grad_norm": 0.06983709335327148, "learning_rate": 0.00013364410973237185, "loss": 0.1662, "step": 171 }, { "epoch": 0.05005638619083997, "grad_norm": 0.07986849546432495, "learning_rate": 0.00013292506772501819, "loss": 0.1456, "step": 172 }, { "epoch": 0.050347411691949506, "grad_norm": 0.07250676304101944, "learning_rate": 0.00013220411133000543, "loss": 0.1575, "step": 173 }, { "epoch": 0.05063843719305904, "grad_norm": 0.07436628639698029, "learning_rate": 0.0001314812824664585, "loss": 0.1374, "step": 174 }, { "epoch": 0.05092946269416858, "grad_norm": 0.07161043584346771, "learning_rate": 0.00013075662316237464, "loss": 0.1432, "step": 175 }, { "epoch": 0.051220488195278115, "grad_norm": 0.08502248674631119, "learning_rate": 0.0001300301755521798, "loss": 0.1709, "step": 176 }, { "epoch": 0.051511513696387644, "grad_norm": 0.07785065472126007, "learning_rate": 0.00012930198187427886, "loss": 0.1765, "step": 177 }, { "epoch": 0.05180253919749718, "grad_norm": 0.07903854548931122, "learning_rate": 0.0001285720844685996, "loss": 0.1662, "step": 178 }, { "epoch": 0.052093564698606716, "grad_norm": 0.08938921988010406, "learning_rate": 0.00012784052577413096, "loss": 0.1826, "step": 179 }, { "epoch": 0.05238459019971625, "grad_norm": 0.08065656572580338, "learning_rate": 0.00012710734832645557, "loss": 0.169, "step": 180 }, { "epoch": 0.05267561570082579, "grad_norm": 0.07795777171850204, "learning_rate": 0.00012637259475527634, "loss": 0.1762, "step": 181 }, { "epoch": 0.05296664120193532, "grad_norm": 0.0752585232257843, "learning_rate": 0.00012563630778193805, "loss": 0.1627, "step": 182 }, { "epoch": 0.053257666703044854, "grad_norm": 0.06943824142217636, "learning_rate": 0.0001248985302169432, "loss": 0.1509, "step": 183 }, { "epoch": 0.05354869220415439, "grad_norm": 0.07209432125091553, "learning_rate": 0.00012415930495746302, "loss": 0.1655, "step": 184 }, { "epoch": 0.053839717705263926, "grad_norm": 0.07933972030878067, "learning_rate": 0.00012341867498484303, "loss": 0.1677, "step": 185 }, { "epoch": 0.054130743206373455, "grad_norm": 0.07778981328010559, "learning_rate": 0.00012267668336210413, "loss": 0.1581, "step": 186 }, { "epoch": 0.05442176870748299, "grad_norm": 0.07811924070119858, "learning_rate": 0.00012193337323143867, "loss": 0.1495, "step": 187 }, { "epoch": 0.05471279420859253, "grad_norm": 0.07439761608839035, "learning_rate": 0.00012118878781170214, "loss": 0.154, "step": 188 }, { "epoch": 0.05500381970970206, "grad_norm": 0.07360168546438217, "learning_rate": 0.00012044297039589998, "loss": 0.1385, "step": 189 }, { "epoch": 0.0552948452108116, "grad_norm": 0.0793876051902771, "learning_rate": 0.00011969596434867063, "loss": 0.1493, "step": 190 }, { "epoch": 0.05558587071192113, "grad_norm": 0.07218817621469498, "learning_rate": 0.00011894781310376398, "loss": 0.1384, "step": 191 }, { "epoch": 0.055876896213030665, "grad_norm": 0.07586020231246948, "learning_rate": 0.00011819856016151615, "loss": 0.16, "step": 192 }, { "epoch": 0.0561679217141402, "grad_norm": 0.07959903031587601, "learning_rate": 0.00011744824908631997, "loss": 0.1615, "step": 193 }, { "epoch": 0.05645894721524974, "grad_norm": 0.0836108848452568, "learning_rate": 0.00011669692350409223, "loss": 0.161, "step": 194 }, { "epoch": 0.05674997271635927, "grad_norm": 0.06765826046466827, "learning_rate": 0.00011594462709973683, "loss": 0.1336, "step": 195 }, { "epoch": 0.0570409982174688, "grad_norm": 0.07990916073322296, "learning_rate": 0.00011519140361460509, "loss": 0.1503, "step": 196 }, { "epoch": 0.05733202371857834, "grad_norm": 0.07911231368780136, "learning_rate": 0.00011443729684395224, "loss": 0.1532, "step": 197 }, { "epoch": 0.057623049219687875, "grad_norm": 0.07620514929294586, "learning_rate": 0.00011368235063439103, "loss": 0.1462, "step": 198 }, { "epoch": 0.05791407472079741, "grad_norm": 0.0822930559515953, "learning_rate": 0.00011292660888134241, "loss": 0.1628, "step": 199 }, { "epoch": 0.05820510022190695, "grad_norm": 0.08735381811857224, "learning_rate": 0.00011217011552648316, "loss": 0.1841, "step": 200 }, { "epoch": 0.05820510022190695, "eval_loss": 0.16103102266788483, "eval_runtime": 1580.5853, "eval_samples_per_second": 7.323, "eval_steps_per_second": 1.831, "step": 200 }, { "epoch": 0.058496125723016476, "grad_norm": 0.07938691228628159, "learning_rate": 0.00011141291455519116, "loss": 0.1433, "step": 201 }, { "epoch": 0.05878715122412601, "grad_norm": 0.07802147418260574, "learning_rate": 0.00011065504999398762, "loss": 0.1553, "step": 202 }, { "epoch": 0.05907817672523555, "grad_norm": 0.09244146943092346, "learning_rate": 0.00010989656590797748, "loss": 0.1501, "step": 203 }, { "epoch": 0.059369202226345084, "grad_norm": 0.07255587726831436, "learning_rate": 0.00010913750639828711, "loss": 0.1428, "step": 204 }, { "epoch": 0.05966022772745462, "grad_norm": 0.06894674152135849, "learning_rate": 0.00010837791559950028, "loss": 0.1465, "step": 205 }, { "epoch": 0.05995125322856415, "grad_norm": 0.06965523213148117, "learning_rate": 0.00010761783767709182, "loss": 0.1407, "step": 206 }, { "epoch": 0.060242278729673686, "grad_norm": 0.07710455358028412, "learning_rate": 0.0001068573168248598, "loss": 0.1565, "step": 207 }, { "epoch": 0.06053330423078322, "grad_norm": 0.0719844400882721, "learning_rate": 0.00010609639726235591, "loss": 0.1483, "step": 208 }, { "epoch": 0.06082432973189276, "grad_norm": 0.08301186561584473, "learning_rate": 0.00010533512323231437, "loss": 0.167, "step": 209 }, { "epoch": 0.061115355233002294, "grad_norm": 0.07628195732831955, "learning_rate": 0.00010457353899807946, "loss": 0.1398, "step": 210 }, { "epoch": 0.06140638073411182, "grad_norm": 0.07327238470315933, "learning_rate": 0.00010381168884103188, "loss": 0.1499, "step": 211 }, { "epoch": 0.06169740623522136, "grad_norm": 0.07578903436660767, "learning_rate": 0.00010304961705801415, "loss": 0.1521, "step": 212 }, { "epoch": 0.061988431736330896, "grad_norm": 0.07652377337217331, "learning_rate": 0.00010228736795875489, "loss": 0.1555, "step": 213 }, { "epoch": 0.06227945723744043, "grad_norm": 0.08231547474861145, "learning_rate": 0.0001015249858632926, "loss": 0.1625, "step": 214 }, { "epoch": 0.06257048273854997, "grad_norm": 0.06857180595397949, "learning_rate": 0.00010076251509939866, "loss": 0.1421, "step": 215 }, { "epoch": 0.0628615082396595, "grad_norm": 0.0719117596745491, "learning_rate": 0.0001, "loss": 0.147, "step": 216 }, { "epoch": 0.06315253374076904, "grad_norm": 0.06776969879865646, "learning_rate": 9.923748490060135e-05, "loss": 0.1404, "step": 217 }, { "epoch": 0.06344355924187857, "grad_norm": 0.07235383242368698, "learning_rate": 9.847501413670742e-05, "loss": 0.1486, "step": 218 }, { "epoch": 0.0637345847429881, "grad_norm": 0.08163522928953171, "learning_rate": 9.771263204124514e-05, "loss": 0.1588, "step": 219 }, { "epoch": 0.06402561024409764, "grad_norm": 0.08176423609256744, "learning_rate": 9.695038294198589e-05, "loss": 0.1693, "step": 220 }, { "epoch": 0.06431663574520717, "grad_norm": 0.08553650230169296, "learning_rate": 9.618831115896815e-05, "loss": 0.1765, "step": 221 }, { "epoch": 0.06460766124631671, "grad_norm": 0.08697977662086487, "learning_rate": 9.542646100192056e-05, "loss": 0.1547, "step": 222 }, { "epoch": 0.06489868674742624, "grad_norm": 0.07338088750839233, "learning_rate": 9.466487676768563e-05, "loss": 0.1485, "step": 223 }, { "epoch": 0.06518971224853577, "grad_norm": 0.08859650790691376, "learning_rate": 9.390360273764411e-05, "loss": 0.1608, "step": 224 }, { "epoch": 0.06548073774964532, "grad_norm": 0.0738888531923294, "learning_rate": 9.314268317514024e-05, "loss": 0.145, "step": 225 }, { "epoch": 0.06577176325075484, "grad_norm": 0.07572459429502487, "learning_rate": 9.238216232290822e-05, "loss": 0.1477, "step": 226 }, { "epoch": 0.06606278875186439, "grad_norm": 0.07947902381420135, "learning_rate": 9.162208440049976e-05, "loss": 0.1571, "step": 227 }, { "epoch": 0.06635381425297392, "grad_norm": 0.08949624747037888, "learning_rate": 9.08624936017129e-05, "loss": 0.1767, "step": 228 }, { "epoch": 0.06664483975408345, "grad_norm": 0.07495246082544327, "learning_rate": 9.010343409202256e-05, "loss": 0.163, "step": 229 }, { "epoch": 0.06693586525519299, "grad_norm": 0.08242852240800858, "learning_rate": 8.93449500060124e-05, "loss": 0.1764, "step": 230 }, { "epoch": 0.06722689075630252, "grad_norm": 0.08443745970726013, "learning_rate": 8.858708544480887e-05, "loss": 0.1646, "step": 231 }, { "epoch": 0.06751791625741206, "grad_norm": 0.08463139832019806, "learning_rate": 8.782988447351685e-05, "loss": 0.158, "step": 232 }, { "epoch": 0.06780894175852159, "grad_norm": 0.0822853296995163, "learning_rate": 8.707339111865763e-05, "loss": 0.1477, "step": 233 }, { "epoch": 0.06809996725963112, "grad_norm": 0.07560363411903381, "learning_rate": 8.6317649365609e-05, "loss": 0.161, "step": 234 }, { "epoch": 0.06839099276074066, "grad_norm": 0.08058074116706848, "learning_rate": 8.556270315604778e-05, "loss": 0.16, "step": 235 }, { "epoch": 0.06868201826185019, "grad_norm": 0.08817370980978012, "learning_rate": 8.480859638539492e-05, "loss": 0.1781, "step": 236 }, { "epoch": 0.06897304376295973, "grad_norm": 0.07165003567934036, "learning_rate": 8.405537290026318e-05, "loss": 0.1274, "step": 237 }, { "epoch": 0.06926406926406926, "grad_norm": 0.07525242120027542, "learning_rate": 8.33030764959078e-05, "loss": 0.1491, "step": 238 }, { "epoch": 0.0695550947651788, "grad_norm": 0.07487063109874725, "learning_rate": 8.255175091368004e-05, "loss": 0.1566, "step": 239 }, { "epoch": 0.06984612026628834, "grad_norm": 0.08262135833501816, "learning_rate": 8.180143983848387e-05, "loss": 0.1615, "step": 240 }, { "epoch": 0.07013714576739787, "grad_norm": 0.07212772965431213, "learning_rate": 8.105218689623603e-05, "loss": 0.124, "step": 241 }, { "epoch": 0.07042817126850741, "grad_norm": 0.07601718604564667, "learning_rate": 8.030403565132942e-05, "loss": 0.1442, "step": 242 }, { "epoch": 0.07071919676961694, "grad_norm": 0.06825247406959534, "learning_rate": 7.955702960410007e-05, "loss": 0.1332, "step": 243 }, { "epoch": 0.07101022227072647, "grad_norm": 0.08394462615251541, "learning_rate": 7.881121218829787e-05, "loss": 0.1574, "step": 244 }, { "epoch": 0.07130124777183601, "grad_norm": 0.073182113468647, "learning_rate": 7.806662676856133e-05, "loss": 0.1487, "step": 245 }, { "epoch": 0.07159227327294554, "grad_norm": 0.07771697640419006, "learning_rate": 7.732331663789592e-05, "loss": 0.1441, "step": 246 }, { "epoch": 0.07188329877405508, "grad_norm": 0.08099354058504105, "learning_rate": 7.658132501515702e-05, "loss": 0.1517, "step": 247 }, { "epoch": 0.07217432427516461, "grad_norm": 0.08727878332138062, "learning_rate": 7.584069504253703e-05, "loss": 0.17, "step": 248 }, { "epoch": 0.07246534977627414, "grad_norm": 0.07737427949905396, "learning_rate": 7.510146978305683e-05, "loss": 0.1481, "step": 249 }, { "epoch": 0.07275637527738368, "grad_norm": 0.08307147771120071, "learning_rate": 7.436369221806201e-05, "loss": 0.1623, "step": 250 }, { "epoch": 0.07304740077849321, "grad_norm": 0.08056792616844177, "learning_rate": 7.362740524472372e-05, "loss": 0.1443, "step": 251 }, { "epoch": 0.07333842627960276, "grad_norm": 0.09201047569513321, "learning_rate": 7.289265167354449e-05, "loss": 0.168, "step": 252 }, { "epoch": 0.07362945178071229, "grad_norm": 0.08724138140678406, "learning_rate": 7.215947422586906e-05, "loss": 0.1566, "step": 253 }, { "epoch": 0.07392047728182181, "grad_norm": 0.07930216193199158, "learning_rate": 7.142791553140045e-05, "loss": 0.1461, "step": 254 }, { "epoch": 0.07421150278293136, "grad_norm": 0.07905029505491257, "learning_rate": 7.069801812572117e-05, "loss": 0.1364, "step": 255 }, { "epoch": 0.07450252828404089, "grad_norm": 0.08945164084434509, "learning_rate": 6.996982444782021e-05, "loss": 0.1698, "step": 256 }, { "epoch": 0.07479355378515043, "grad_norm": 0.08269284665584564, "learning_rate": 6.92433768376254e-05, "loss": 0.1662, "step": 257 }, { "epoch": 0.07508457928625996, "grad_norm": 0.08096525073051453, "learning_rate": 6.851871753354153e-05, "loss": 0.1526, "step": 258 }, { "epoch": 0.07537560478736949, "grad_norm": 0.08036118000745773, "learning_rate": 6.77958886699946e-05, "loss": 0.1507, "step": 259 }, { "epoch": 0.07566663028847903, "grad_norm": 0.07475791871547699, "learning_rate": 6.707493227498188e-05, "loss": 0.1565, "step": 260 }, { "epoch": 0.07595765578958856, "grad_norm": 0.07425666600465775, "learning_rate": 6.635589026762818e-05, "loss": 0.1474, "step": 261 }, { "epoch": 0.0762486812906981, "grad_norm": 0.08315514773130417, "learning_rate": 6.563880445574873e-05, "loss": 0.1601, "step": 262 }, { "epoch": 0.07653970679180763, "grad_norm": 0.08405305445194244, "learning_rate": 6.492371653341804e-05, "loss": 0.1584, "step": 263 }, { "epoch": 0.07683073229291716, "grad_norm": 0.08118040859699249, "learning_rate": 6.421066807854584e-05, "loss": 0.1517, "step": 264 }, { "epoch": 0.0771217577940267, "grad_norm": 0.07679029554128647, "learning_rate": 6.349970055045956e-05, "loss": 0.1503, "step": 265 }, { "epoch": 0.07741278329513623, "grad_norm": 0.07175418734550476, "learning_rate": 6.279085528749359e-05, "loss": 0.1449, "step": 266 }, { "epoch": 0.07770380879624578, "grad_norm": 0.07590077817440033, "learning_rate": 6.208417350458598e-05, "loss": 0.1384, "step": 267 }, { "epoch": 0.0779948342973553, "grad_norm": 0.09335010498762131, "learning_rate": 6.137969629088173e-05, "loss": 0.1557, "step": 268 }, { "epoch": 0.07828585979846484, "grad_norm": 0.0851510614156723, "learning_rate": 6.067746460734398e-05, "loss": 0.1643, "step": 269 }, { "epoch": 0.07857688529957438, "grad_norm": 0.0860925167798996, "learning_rate": 5.9977519284372194e-05, "loss": 0.1663, "step": 270 }, { "epoch": 0.07886791080068391, "grad_norm": 0.0843532383441925, "learning_rate": 5.927990101942828e-05, "loss": 0.1587, "step": 271 }, { "epoch": 0.07915893630179345, "grad_norm": 0.08070237189531326, "learning_rate": 5.8584650374670135e-05, "loss": 0.141, "step": 272 }, { "epoch": 0.07944996180290298, "grad_norm": 0.07470858842134476, "learning_rate": 5.789180777459337e-05, "loss": 0.1256, "step": 273 }, { "epoch": 0.07974098730401251, "grad_norm": 0.08068849891424179, "learning_rate": 5.720141350368072e-05, "loss": 0.1539, "step": 274 }, { "epoch": 0.08003201280512205, "grad_norm": 0.08972500264644623, "learning_rate": 5.651350770405983e-05, "loss": 0.1493, "step": 275 }, { "epoch": 0.08032303830623158, "grad_norm": 0.09065806120634079, "learning_rate": 5.582813037316927e-05, "loss": 0.1556, "step": 276 }, { "epoch": 0.08061406380734112, "grad_norm": 0.0844668298959732, "learning_rate": 5.5145321361432956e-05, "loss": 0.1526, "step": 277 }, { "epoch": 0.08090508930845065, "grad_norm": 0.08198230713605881, "learning_rate": 5.446512036994287e-05, "loss": 0.1468, "step": 278 }, { "epoch": 0.08119611480956018, "grad_norm": 0.09235311299562454, "learning_rate": 5.3787566948151056e-05, "loss": 0.1693, "step": 279 }, { "epoch": 0.08148714031066973, "grad_norm": 0.08648033440113068, "learning_rate": 5.3112700491569666e-05, "loss": 0.1598, "step": 280 }, { "epoch": 0.08177816581177925, "grad_norm": 0.09177742153406143, "learning_rate": 5.244056023948075e-05, "loss": 0.1662, "step": 281 }, { "epoch": 0.0820691913128888, "grad_norm": 0.08776862174272537, "learning_rate": 5.177118527265438e-05, "loss": 0.1519, "step": 282 }, { "epoch": 0.08236021681399833, "grad_norm": 0.09411810338497162, "learning_rate": 5.1104614511076645e-05, "loss": 0.1618, "step": 283 }, { "epoch": 0.08265124231510786, "grad_norm": 0.1135680079460144, "learning_rate": 5.044088671168644e-05, "loss": 0.1503, "step": 284 }, { "epoch": 0.0829422678162174, "grad_norm": 0.10310947149991989, "learning_rate": 4.9780040466122235e-05, "loss": 0.1486, "step": 285 }, { "epoch": 0.08323329331732693, "grad_norm": 0.0861528143286705, "learning_rate": 4.912211419847794e-05, "loss": 0.1419, "step": 286 }, { "epoch": 0.08352431881843647, "grad_norm": 0.09601995348930359, "learning_rate": 4.846714616306908e-05, "loss": 0.1606, "step": 287 }, { "epoch": 0.083815344319546, "grad_norm": 0.08836833387613297, "learning_rate": 4.7815174442208354e-05, "loss": 0.1555, "step": 288 }, { "epoch": 0.08410636982065553, "grad_norm": 0.0788690447807312, "learning_rate": 4.716623694399134e-05, "loss": 0.1482, "step": 289 }, { "epoch": 0.08439739532176507, "grad_norm": 0.08364184200763702, "learning_rate": 4.652037140009259e-05, "loss": 0.1487, "step": 290 }, { "epoch": 0.0846884208228746, "grad_norm": 0.083168163895607, "learning_rate": 4.587761536357152e-05, "loss": 0.1483, "step": 291 }, { "epoch": 0.08497944632398415, "grad_norm": 0.0829937607049942, "learning_rate": 4.523800620668921e-05, "loss": 0.1481, "step": 292 }, { "epoch": 0.08527047182509367, "grad_norm": 0.08684185147285461, "learning_rate": 4.4601581118735105e-05, "loss": 0.1575, "step": 293 }, { "epoch": 0.0855614973262032, "grad_norm": 0.08446146547794342, "learning_rate": 4.3968377103865024e-05, "loss": 0.1464, "step": 294 }, { "epoch": 0.08585252282731275, "grad_norm": 0.08333813399076462, "learning_rate": 4.333843097894932e-05, "loss": 0.1365, "step": 295 }, { "epoch": 0.08614354832842228, "grad_norm": 0.09695811569690704, "learning_rate": 4.271177937143245e-05, "loss": 0.1671, "step": 296 }, { "epoch": 0.08643457382953182, "grad_norm": 0.09023378789424896, "learning_rate": 4.2088458717203085e-05, "loss": 0.1518, "step": 297 }, { "epoch": 0.08672559933064135, "grad_norm": 0.08972320705652237, "learning_rate": 4.146850525847579e-05, "loss": 0.157, "step": 298 }, { "epoch": 0.08701662483175088, "grad_norm": 0.10669073462486267, "learning_rate": 4.0851955041683675e-05, "loss": 0.1722, "step": 299 }, { "epoch": 0.08730765033286042, "grad_norm": 0.08736032247543335, "learning_rate": 4.023884391538244e-05, "loss": 0.1416, "step": 300 }, { "epoch": 0.08730765033286042, "eval_loss": 0.15649166703224182, "eval_runtime": 1579.1525, "eval_samples_per_second": 7.33, "eval_steps_per_second": 1.833, "step": 300 }, { "epoch": 0.08759867583396995, "grad_norm": 0.08116644620895386, "learning_rate": 3.9629207528166224e-05, "loss": 0.1422, "step": 301 }, { "epoch": 0.08788970133507949, "grad_norm": 0.07979750633239746, "learning_rate": 3.902308132659457e-05, "loss": 0.1464, "step": 302 }, { "epoch": 0.08818072683618902, "grad_norm": 0.07950086891651154, "learning_rate": 3.842050055313174e-05, "loss": 0.1352, "step": 303 }, { "epoch": 0.08847175233729855, "grad_norm": 0.0806957557797432, "learning_rate": 3.7821500244097274e-05, "loss": 0.1462, "step": 304 }, { "epoch": 0.0887627778384081, "grad_norm": 0.09410198032855988, "learning_rate": 3.722611522762917e-05, "loss": 0.1696, "step": 305 }, { "epoch": 0.08905380333951762, "grad_norm": 0.08685276657342911, "learning_rate": 3.663438012165848e-05, "loss": 0.1552, "step": 306 }, { "epoch": 0.08934482884062717, "grad_norm": 0.08622679114341736, "learning_rate": 3.604632933189691e-05, "loss": 0.1558, "step": 307 }, { "epoch": 0.0896358543417367, "grad_norm": 0.09552394598722458, "learning_rate": 3.5461997049835914e-05, "loss": 0.1573, "step": 308 }, { "epoch": 0.08992687984284622, "grad_norm": 0.07481776177883148, "learning_rate": 3.488141725075901e-05, "loss": 0.141, "step": 309 }, { "epoch": 0.09021790534395577, "grad_norm": 0.08296847343444824, "learning_rate": 3.430462369176619e-05, "loss": 0.1506, "step": 310 }, { "epoch": 0.0905089308450653, "grad_norm": 0.08685876429080963, "learning_rate": 3.373164990981108e-05, "loss": 0.1571, "step": 311 }, { "epoch": 0.09079995634617484, "grad_norm": 0.10209972411394119, "learning_rate": 3.316252921975116e-05, "loss": 0.1594, "step": 312 }, { "epoch": 0.09109098184728437, "grad_norm": 0.08648335188627243, "learning_rate": 3.259729471241051e-05, "loss": 0.1376, "step": 313 }, { "epoch": 0.0913820073483939, "grad_norm": 0.09473145008087158, "learning_rate": 3.203597925265598e-05, "loss": 0.1729, "step": 314 }, { "epoch": 0.09167303284950344, "grad_norm": 0.08934331685304642, "learning_rate": 3.1478615477486114e-05, "loss": 0.1494, "step": 315 }, { "epoch": 0.09196405835061297, "grad_norm": 0.08760195225477219, "learning_rate": 3.092523579413372e-05, "loss": 0.1459, "step": 316 }, { "epoch": 0.09225508385172251, "grad_norm": 0.08313330262899399, "learning_rate": 3.0375872378181337e-05, "loss": 0.1433, "step": 317 }, { "epoch": 0.09254610935283204, "grad_norm": 0.09257054328918457, "learning_rate": 2.98305571716907e-05, "loss": 0.1601, "step": 318 }, { "epoch": 0.09283713485394157, "grad_norm": 0.09015868604183197, "learning_rate": 2.9289321881345254e-05, "loss": 0.1628, "step": 319 }, { "epoch": 0.09312816035505112, "grad_norm": 0.08400508761405945, "learning_rate": 2.875219797660681e-05, "loss": 0.1429, "step": 320 }, { "epoch": 0.09341918585616064, "grad_norm": 0.08535855263471603, "learning_rate": 2.821921668788571e-05, "loss": 0.148, "step": 321 }, { "epoch": 0.09371021135727019, "grad_norm": 0.07879780232906342, "learning_rate": 2.769040900472488e-05, "loss": 0.1419, "step": 322 }, { "epoch": 0.09400123685837972, "grad_norm": 0.08892536163330078, "learning_rate": 2.71658056739982e-05, "loss": 0.1575, "step": 323 }, { "epoch": 0.09429226235948925, "grad_norm": 0.09279028326272964, "learning_rate": 2.6645437198122502e-05, "loss": 0.1407, "step": 324 }, { "epoch": 0.09458328786059879, "grad_norm": 0.07712056487798691, "learning_rate": 2.612933383328432e-05, "loss": 0.1439, "step": 325 }, { "epoch": 0.09487431336170832, "grad_norm": 0.09146617352962494, "learning_rate": 2.5617525587680402e-05, "loss": 0.1688, "step": 326 }, { "epoch": 0.09516533886281786, "grad_norm": 0.09416982531547546, "learning_rate": 2.5110042219773178e-05, "loss": 0.1466, "step": 327 }, { "epoch": 0.09545636436392739, "grad_norm": 0.10706603527069092, "learning_rate": 2.4606913236560282e-05, "loss": 0.1436, "step": 328 }, { "epoch": 0.09574738986503692, "grad_norm": 0.09682357311248779, "learning_rate": 2.410816789185907e-05, "loss": 0.1628, "step": 329 }, { "epoch": 0.09603841536614646, "grad_norm": 0.09372398257255554, "learning_rate": 2.3613835184605525e-05, "loss": 0.1587, "step": 330 }, { "epoch": 0.09632944086725599, "grad_norm": 0.08698038756847382, "learning_rate": 2.3123943857168318e-05, "loss": 0.1519, "step": 331 }, { "epoch": 0.09662046636836553, "grad_norm": 0.0826997235417366, "learning_rate": 2.2638522393677563e-05, "loss": 0.1492, "step": 332 }, { "epoch": 0.09691149186947506, "grad_norm": 0.08685287833213806, "learning_rate": 2.2157599018368492e-05, "loss": 0.1433, "step": 333 }, { "epoch": 0.09720251737058459, "grad_norm": 0.08258447051048279, "learning_rate": 2.1681201693940668e-05, "loss": 0.1425, "step": 334 }, { "epoch": 0.09749354287169414, "grad_norm": 0.09492287784814835, "learning_rate": 2.1209358119931845e-05, "loss": 0.1617, "step": 335 }, { "epoch": 0.09778456837280367, "grad_norm": 0.08668515086174011, "learning_rate": 2.074209573110769e-05, "loss": 0.1477, "step": 336 }, { "epoch": 0.09807559387391321, "grad_norm": 0.09318174421787262, "learning_rate": 2.027944169586633e-05, "loss": 0.156, "step": 337 }, { "epoch": 0.09836661937502274, "grad_norm": 0.08594143390655518, "learning_rate": 1.982142291465896e-05, "loss": 0.1671, "step": 338 }, { "epoch": 0.09865764487613227, "grad_norm": 0.08649858832359314, "learning_rate": 1.9368066018425503e-05, "loss": 0.1318, "step": 339 }, { "epoch": 0.09894867037724181, "grad_norm": 0.08002059906721115, "learning_rate": 1.891939736704641e-05, "loss": 0.1444, "step": 340 }, { "epoch": 0.09923969587835134, "grad_norm": 0.08685939759016037, "learning_rate": 1.8475443047809782e-05, "loss": 0.1385, "step": 341 }, { "epoch": 0.09953072137946088, "grad_norm": 0.08656252920627594, "learning_rate": 1.8036228873894746e-05, "loss": 0.1565, "step": 342 }, { "epoch": 0.09982174688057041, "grad_norm": 0.09303918480873108, "learning_rate": 1.760178038287048e-05, "loss": 0.1529, "step": 343 }, { "epoch": 0.10011277238167994, "grad_norm": 0.08067046105861664, "learning_rate": 1.7172122835211337e-05, "loss": 0.1541, "step": 344 }, { "epoch": 0.10040379788278948, "grad_norm": 0.09442534297704697, "learning_rate": 1.674728121282819e-05, "loss": 0.1632, "step": 345 }, { "epoch": 0.10069482338389901, "grad_norm": 0.08498077839612961, "learning_rate": 1.6327280217615792e-05, "loss": 0.1589, "step": 346 }, { "epoch": 0.10098584888500856, "grad_norm": 0.0830565020442009, "learning_rate": 1.591214427001667e-05, "loss": 0.1389, "step": 347 }, { "epoch": 0.10127687438611808, "grad_norm": 0.08560646325349808, "learning_rate": 1.5501897507601014e-05, "loss": 0.1328, "step": 348 }, { "epoch": 0.10156789988722761, "grad_norm": 0.07962379604578018, "learning_rate": 1.5096563783663432e-05, "loss": 0.1426, "step": 349 }, { "epoch": 0.10185892538833716, "grad_norm": 0.08420272916555405, "learning_rate": 1.4696166665835853e-05, "loss": 0.1587, "step": 350 }, { "epoch": 0.10214995088944669, "grad_norm": 0.09415718913078308, "learning_rate": 1.4300729434717396e-05, "loss": 0.1711, "step": 351 }, { "epoch": 0.10244097639055623, "grad_norm": 0.08557581901550293, "learning_rate": 1.3910275082520573e-05, "loss": 0.1483, "step": 352 }, { "epoch": 0.10273200189166576, "grad_norm": 0.10299764573574066, "learning_rate": 1.3524826311734551e-05, "loss": 0.1402, "step": 353 }, { "epoch": 0.10302302739277529, "grad_norm": 0.08215977996587753, "learning_rate": 1.3144405533805138e-05, "loss": 0.1445, "step": 354 }, { "epoch": 0.10331405289388483, "grad_norm": 0.08453574031591415, "learning_rate": 1.2769034867831586e-05, "loss": 0.147, "step": 355 }, { "epoch": 0.10360507839499436, "grad_norm": 0.09578870236873627, "learning_rate": 1.2398736139280686e-05, "loss": 0.1537, "step": 356 }, { "epoch": 0.1038961038961039, "grad_norm": 0.08585759252309799, "learning_rate": 1.2033530878717548e-05, "loss": 0.1386, "step": 357 }, { "epoch": 0.10418712939721343, "grad_norm": 0.08463095873594284, "learning_rate": 1.167344032055394e-05, "loss": 0.1399, "step": 358 }, { "epoch": 0.10447815489832296, "grad_norm": 0.09077376872301102, "learning_rate": 1.1318485401813439e-05, "loss": 0.178, "step": 359 }, { "epoch": 0.1047691803994325, "grad_norm": 0.07958751171827316, "learning_rate": 1.096868676091425e-05, "loss": 0.1379, "step": 360 }, { "epoch": 0.10506020590054203, "grad_norm": 0.09366265684366226, "learning_rate": 1.0624064736469053e-05, "loss": 0.1503, "step": 361 }, { "epoch": 0.10535123140165158, "grad_norm": 0.07501017302274704, "learning_rate": 1.02846393661026e-05, "loss": 0.1343, "step": 362 }, { "epoch": 0.1056422569027611, "grad_norm": 0.0942973643541336, "learning_rate": 9.950430385286491e-06, "loss": 0.1677, "step": 363 }, { "epoch": 0.10593328240387063, "grad_norm": 0.08338665217161179, "learning_rate": 9.62145722619182e-06, "loss": 0.1475, "step": 364 }, { "epoch": 0.10622430790498018, "grad_norm": 0.09208852797746658, "learning_rate": 9.297739016559226e-06, "loss": 0.1569, "step": 365 }, { "epoch": 0.10651533340608971, "grad_norm": 0.08696365356445312, "learning_rate": 8.979294578586738e-06, "loss": 0.1644, "step": 366 }, { "epoch": 0.10680635890719924, "grad_norm": 0.09445630013942719, "learning_rate": 8.666142427835444e-06, "loss": 0.1605, "step": 367 }, { "epoch": 0.10709738440830878, "grad_norm": 0.08878560364246368, "learning_rate": 8.358300772152849e-06, "loss": 0.1659, "step": 368 }, { "epoch": 0.10738840990941831, "grad_norm": 0.09253693372011185, "learning_rate": 8.055787510614288e-06, "loss": 0.1549, "step": 369 }, { "epoch": 0.10767943541052785, "grad_norm": 0.09546509385108948, "learning_rate": 7.758620232482084e-06, "loss": 0.1604, "step": 370 }, { "epoch": 0.10797046091163738, "grad_norm": 0.08155137300491333, "learning_rate": 7.46681621618297e-06, "loss": 0.1396, "step": 371 }, { "epoch": 0.10826148641274691, "grad_norm": 0.0899728313088417, "learning_rate": 7.180392428303395e-06, "loss": 0.158, "step": 372 }, { "epoch": 0.10855251191385645, "grad_norm": 0.08545383810997009, "learning_rate": 6.8993655226030405e-06, "loss": 0.1545, "step": 373 }, { "epoch": 0.10884353741496598, "grad_norm": 0.0924701988697052, "learning_rate": 6.623751839046455e-06, "loss": 0.1569, "step": 374 }, { "epoch": 0.10913456291607553, "grad_norm": 0.0837499350309372, "learning_rate": 6.353567402853056e-06, "loss": 0.1595, "step": 375 }, { "epoch": 0.10942558841718505, "grad_norm": 0.08741695433855057, "learning_rate": 6.0888279235653214e-06, "loss": 0.1582, "step": 376 }, { "epoch": 0.10971661391829458, "grad_norm": 0.08700203150510788, "learning_rate": 5.82954879413542e-06, "loss": 0.1552, "step": 377 }, { "epoch": 0.11000763941940413, "grad_norm": 0.07841800898313522, "learning_rate": 5.575745090030138e-06, "loss": 0.1462, "step": 378 }, { "epoch": 0.11029866492051366, "grad_norm": 0.07555332779884338, "learning_rate": 5.327431568354402e-06, "loss": 0.1358, "step": 379 }, { "epoch": 0.1105896904216232, "grad_norm": 0.07444937527179718, "learning_rate": 5.084622666993244e-06, "loss": 0.1383, "step": 380 }, { "epoch": 0.11088071592273273, "grad_norm": 0.08368539065122604, "learning_rate": 4.847332503772228e-06, "loss": 0.1368, "step": 381 }, { "epoch": 0.11117174142384226, "grad_norm": 0.08841046690940857, "learning_rate": 4.61557487563673e-06, "loss": 0.1504, "step": 382 }, { "epoch": 0.1114627669249518, "grad_norm": 0.08533196896314621, "learning_rate": 4.389363257849632e-06, "loss": 0.1449, "step": 383 }, { "epoch": 0.11175379242606133, "grad_norm": 0.09036096930503845, "learning_rate": 4.168710803207865e-06, "loss": 0.1555, "step": 384 }, { "epoch": 0.11204481792717087, "grad_norm": 0.08787363022565842, "learning_rate": 3.953630341277604e-06, "loss": 0.1483, "step": 385 }, { "epoch": 0.1123358434282804, "grad_norm": 0.09143967181444168, "learning_rate": 3.7441343776484117e-06, "loss": 0.1519, "step": 386 }, { "epoch": 0.11262686892938993, "grad_norm": 0.08185972273349762, "learning_rate": 3.540235093205979e-06, "loss": 0.1382, "step": 387 }, { "epoch": 0.11291789443049947, "grad_norm": 0.08177818357944489, "learning_rate": 3.3419443434240083e-06, "loss": 0.1439, "step": 388 }, { "epoch": 0.113208919931609, "grad_norm": 0.09348271787166595, "learning_rate": 3.1492736576747893e-06, "loss": 0.1582, "step": 389 }, { "epoch": 0.11349994543271855, "grad_norm": 0.08014369755983353, "learning_rate": 2.9622342385589254e-06, "loss": 0.1426, "step": 390 }, { "epoch": 0.11379097093382808, "grad_norm": 0.1089087501168251, "learning_rate": 2.7808369612539407e-06, "loss": 0.1504, "step": 391 }, { "epoch": 0.1140819964349376, "grad_norm": 0.0827077105641365, "learning_rate": 2.6050923728818787e-06, "loss": 0.14, "step": 392 }, { "epoch": 0.11437302193604715, "grad_norm": 0.08474422991275787, "learning_rate": 2.4350106918962e-06, "loss": 0.1501, "step": 393 }, { "epoch": 0.11466404743715668, "grad_norm": 0.09697262942790985, "learning_rate": 2.2706018074875045e-06, "loss": 0.1648, "step": 394 }, { "epoch": 0.11495507293826622, "grad_norm": 0.08163170516490936, "learning_rate": 2.111875279008657e-06, "loss": 0.1571, "step": 395 }, { "epoch": 0.11524609843937575, "grad_norm": 0.09289243817329407, "learning_rate": 1.9588403354188325e-06, "loss": 0.1667, "step": 396 }, { "epoch": 0.11553712394048528, "grad_norm": 0.09270413219928741, "learning_rate": 1.811505874747066e-06, "loss": 0.1641, "step": 397 }, { "epoch": 0.11582814944159482, "grad_norm": 0.08141297101974487, "learning_rate": 1.6698804635747579e-06, "loss": 0.1534, "step": 398 }, { "epoch": 0.11611917494270435, "grad_norm": 0.09712623059749603, "learning_rate": 1.5339723365376479e-06, "loss": 0.1538, "step": 399 }, { "epoch": 0.1164102004438139, "grad_norm": 0.08977963030338287, "learning_rate": 1.4037893958469995e-06, "loss": 0.153, "step": 400 }, { "epoch": 0.1164102004438139, "eval_loss": 0.1550179123878479, "eval_runtime": 1577.2866, "eval_samples_per_second": 7.339, "eval_steps_per_second": 1.835, "step": 400 }, { "epoch": 0.11670122594492342, "grad_norm": 0.08147388696670532, "learning_rate": 1.2793392108301439e-06, "loss": 0.1476, "step": 401 }, { "epoch": 0.11699225144603295, "grad_norm": 0.09103590250015259, "learning_rate": 1.160629017490389e-06, "loss": 0.162, "step": 402 }, { "epoch": 0.1172832769471425, "grad_norm": 0.08841648697853088, "learning_rate": 1.0476657180862325e-06, "loss": 0.1679, "step": 403 }, { "epoch": 0.11757430244825202, "grad_norm": 0.08707400411367416, "learning_rate": 9.404558807301067e-07, "loss": 0.1487, "step": 404 }, { "epoch": 0.11786532794936157, "grad_norm": 0.09094386547803879, "learning_rate": 8.390057390064265e-07, "loss": 0.1487, "step": 405 }, { "epoch": 0.1181563534504711, "grad_norm": 0.0876602977514267, "learning_rate": 7.433211916092142e-07, "loss": 0.1473, "step": 406 }, { "epoch": 0.11844737895158063, "grad_norm": 0.08664330095052719, "learning_rate": 6.534078019990398e-07, "loss": 0.1517, "step": 407 }, { "epoch": 0.11873840445269017, "grad_norm": 0.07475589960813522, "learning_rate": 5.69270798079613e-07, "loss": 0.1372, "step": 408 }, { "epoch": 0.1190294299537997, "grad_norm": 0.07260264456272125, "learning_rate": 4.909150718937717e-07, "loss": 0.1234, "step": 409 }, { "epoch": 0.11932045545490924, "grad_norm": 0.0814119204878807, "learning_rate": 4.1834517933907467e-07, "loss": 0.149, "step": 410 }, { "epoch": 0.11961148095601877, "grad_norm": 0.10258995741605759, "learning_rate": 3.5156533990285956e-07, "loss": 0.16, "step": 411 }, { "epoch": 0.1199025064571283, "grad_norm": 0.07404506951570511, "learning_rate": 2.9057943641693785e-07, "loss": 0.1342, "step": 412 }, { "epoch": 0.12019353195823784, "grad_norm": 0.08446568995714188, "learning_rate": 2.3539101483184278e-07, "loss": 0.1473, "step": 413 }, { "epoch": 0.12048455745934737, "grad_norm": 0.08428740501403809, "learning_rate": 1.8600328401061629e-07, "loss": 0.1522, "step": 414 }, { "epoch": 0.12077558296045691, "grad_norm": 0.0767434611916542, "learning_rate": 1.4241911554225828e-07, "loss": 0.1447, "step": 415 }, { "epoch": 0.12106660846156644, "grad_norm": 0.07769133895635605, "learning_rate": 1.0464104357477133e-07, "loss": 0.1407, "step": 416 }, { "epoch": 0.12135763396267597, "grad_norm": 0.08757521212100983, "learning_rate": 7.267126466777852e-08, "loss": 0.1437, "step": 417 }, { "epoch": 0.12164865946378552, "grad_norm": 0.10252358764410019, "learning_rate": 4.651163766484779e-08, "loss": 0.1719, "step": 418 }, { "epoch": 0.12193968496489505, "grad_norm": 0.08464578539133072, "learning_rate": 2.6163683585389565e-08, "loss": 0.1565, "step": 419 }, { "epoch": 0.12223071046600459, "grad_norm": 0.08833827823400497, "learning_rate": 1.1628585536216374e-08, "loss": 0.1483, "step": 420 }, { "epoch": 0.12252173596711412, "grad_norm": 0.08319747447967529, "learning_rate": 2.907188642786718e-09, "loss": 0.1472, "step": 421 }, { "epoch": 0.12281276146822365, "grad_norm": 0.0925559550523758, "learning_rate": 0.0, "loss": 0.1534, "step": 422 } ], "logging_steps": 1, "max_steps": 422, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3277580175561196e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }