{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9983597594313833,
"eval_steps": 500,
"global_step": 1371,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002186987424822307,
"grad_norm": 0.11989043653011322,
"learning_rate": 2.1739130434782607e-06,
"loss": 0.7588,
"step": 1
},
{
"epoch": 0.004373974849644614,
"grad_norm": 0.08302941918373108,
"learning_rate": 4.347826086956521e-06,
"loss": 0.8145,
"step": 2
},
{
"epoch": 0.0065609622744669215,
"grad_norm": 0.15307161211967468,
"learning_rate": 6.521739130434782e-06,
"loss": 0.8127,
"step": 3
},
{
"epoch": 0.008747949699289229,
"grad_norm": 0.13161885738372803,
"learning_rate": 8.695652173913043e-06,
"loss": 0.6707,
"step": 4
},
{
"epoch": 0.010934937124111536,
"grad_norm": 0.09451252222061157,
"learning_rate": 1.0869565217391303e-05,
"loss": 0.7497,
"step": 5
},
{
"epoch": 0.013121924548933843,
"grad_norm": 0.0813838317990303,
"learning_rate": 1.3043478260869564e-05,
"loss": 1.0007,
"step": 6
},
{
"epoch": 0.01530891197375615,
"grad_norm": 0.12192627787590027,
"learning_rate": 1.5217391304347826e-05,
"loss": 0.6703,
"step": 7
},
{
"epoch": 0.017495899398578457,
"grad_norm": 0.14730937778949738,
"learning_rate": 1.7391304347826085e-05,
"loss": 0.9552,
"step": 8
},
{
"epoch": 0.019682886823400764,
"grad_norm": 0.13510680198669434,
"learning_rate": 1.9565217391304346e-05,
"loss": 0.9591,
"step": 9
},
{
"epoch": 0.02186987424822307,
"grad_norm": 0.11157332360744476,
"learning_rate": 2.1739130434782607e-05,
"loss": 0.9358,
"step": 10
},
{
"epoch": 0.02405686167304538,
"grad_norm": 0.11157120019197464,
"learning_rate": 2.3913043478260864e-05,
"loss": 0.8377,
"step": 11
},
{
"epoch": 0.026243849097867686,
"grad_norm": 0.13191162049770355,
"learning_rate": 2.6086956521739128e-05,
"loss": 0.8974,
"step": 12
},
{
"epoch": 0.028430836522689993,
"grad_norm": 0.14399488270282745,
"learning_rate": 2.826086956521739e-05,
"loss": 0.778,
"step": 13
},
{
"epoch": 0.0306178239475123,
"grad_norm": 0.11593582481145859,
"learning_rate": 3.0434782608695653e-05,
"loss": 0.9507,
"step": 14
},
{
"epoch": 0.03280481137233461,
"grad_norm": 0.16411006450653076,
"learning_rate": 3.260869565217391e-05,
"loss": 0.6949,
"step": 15
},
{
"epoch": 0.034991798797156914,
"grad_norm": 0.13450156152248383,
"learning_rate": 3.478260869565217e-05,
"loss": 0.8162,
"step": 16
},
{
"epoch": 0.037178786221979225,
"grad_norm": 0.12586522102355957,
"learning_rate": 3.695652173913043e-05,
"loss": 0.8776,
"step": 17
},
{
"epoch": 0.03936577364680153,
"grad_norm": 0.10510208457708359,
"learning_rate": 3.913043478260869e-05,
"loss": 0.7852,
"step": 18
},
{
"epoch": 0.04155276107162384,
"grad_norm": 0.12737107276916504,
"learning_rate": 4.130434782608695e-05,
"loss": 0.9647,
"step": 19
},
{
"epoch": 0.04373974849644614,
"grad_norm": 0.1500634402036667,
"learning_rate": 4.3478260869565214e-05,
"loss": 0.7532,
"step": 20
},
{
"epoch": 0.045926735921268454,
"grad_norm": 0.16161426901817322,
"learning_rate": 4.5652173913043474e-05,
"loss": 0.811,
"step": 21
},
{
"epoch": 0.04811372334609076,
"grad_norm": 0.1249527782201767,
"learning_rate": 4.782608695652173e-05,
"loss": 0.795,
"step": 22
},
{
"epoch": 0.05030071077091307,
"grad_norm": 0.1505545973777771,
"learning_rate": 4.9999999999999996e-05,
"loss": 0.9194,
"step": 23
},
{
"epoch": 0.05248769819573537,
"grad_norm": 0.13624198734760284,
"learning_rate": 5.2173913043478256e-05,
"loss": 0.97,
"step": 24
},
{
"epoch": 0.05467468562055768,
"grad_norm": 0.15684515237808228,
"learning_rate": 5.434782608695652e-05,
"loss": 0.6862,
"step": 25
},
{
"epoch": 0.056861673045379986,
"grad_norm": 0.14302442967891693,
"learning_rate": 5.652173913043478e-05,
"loss": 0.8062,
"step": 26
},
{
"epoch": 0.0590486604702023,
"grad_norm": 0.23029306530952454,
"learning_rate": 5.869565217391304e-05,
"loss": 0.9101,
"step": 27
},
{
"epoch": 0.0612356478950246,
"grad_norm": 0.24247854948043823,
"learning_rate": 6.0869565217391306e-05,
"loss": 0.8779,
"step": 28
},
{
"epoch": 0.0634226353198469,
"grad_norm": 0.1507425308227539,
"learning_rate": 6.304347826086956e-05,
"loss": 0.7181,
"step": 29
},
{
"epoch": 0.06560962274466922,
"grad_norm": 0.18965087831020355,
"learning_rate": 6.521739130434782e-05,
"loss": 0.8163,
"step": 30
},
{
"epoch": 0.06779661016949153,
"grad_norm": 0.2104681432247162,
"learning_rate": 6.739130434782608e-05,
"loss": 0.9495,
"step": 31
},
{
"epoch": 0.06998359759431383,
"grad_norm": 0.21606619656085968,
"learning_rate": 6.956521739130434e-05,
"loss": 0.9565,
"step": 32
},
{
"epoch": 0.07217058501913615,
"grad_norm": 0.2107428014278412,
"learning_rate": 7.17391304347826e-05,
"loss": 0.7743,
"step": 33
},
{
"epoch": 0.07435757244395845,
"grad_norm": 0.3160182535648346,
"learning_rate": 7.391304347826086e-05,
"loss": 1.0056,
"step": 34
},
{
"epoch": 0.07654455986878075,
"grad_norm": 0.2970617115497589,
"learning_rate": 7.608695652173912e-05,
"loss": 0.8122,
"step": 35
},
{
"epoch": 0.07873154729360306,
"grad_norm": 0.17866499722003937,
"learning_rate": 7.826086956521738e-05,
"loss": 0.7953,
"step": 36
},
{
"epoch": 0.08091853471842538,
"grad_norm": 0.32111942768096924,
"learning_rate": 8.043478260869566e-05,
"loss": 0.9121,
"step": 37
},
{
"epoch": 0.08310552214324768,
"grad_norm": 0.20938844978809357,
"learning_rate": 8.26086956521739e-05,
"loss": 0.887,
"step": 38
},
{
"epoch": 0.08529250956806998,
"grad_norm": 0.27339646220207214,
"learning_rate": 8.478260869565217e-05,
"loss": 0.7808,
"step": 39
},
{
"epoch": 0.08747949699289229,
"grad_norm": 0.19005413353443146,
"learning_rate": 8.695652173913043e-05,
"loss": 0.6723,
"step": 40
},
{
"epoch": 0.0896664844177146,
"grad_norm": 0.19314634799957275,
"learning_rate": 8.913043478260869e-05,
"loss": 0.8384,
"step": 41
},
{
"epoch": 0.09185347184253691,
"grad_norm": 0.21565446257591248,
"learning_rate": 9.130434782608695e-05,
"loss": 0.7402,
"step": 42
},
{
"epoch": 0.09404045926735921,
"grad_norm": 0.3733920753002167,
"learning_rate": 9.347826086956521e-05,
"loss": 0.9476,
"step": 43
},
{
"epoch": 0.09622744669218151,
"grad_norm": 0.3119434714317322,
"learning_rate": 9.565217391304346e-05,
"loss": 0.7324,
"step": 44
},
{
"epoch": 0.09841443411700383,
"grad_norm": 0.20734310150146484,
"learning_rate": 9.782608695652173e-05,
"loss": 0.6521,
"step": 45
},
{
"epoch": 0.10060142154182614,
"grad_norm": 0.2809116840362549,
"learning_rate": 9.999999999999999e-05,
"loss": 0.7374,
"step": 46
},
{
"epoch": 0.10278840896664844,
"grad_norm": 0.2248832732439041,
"learning_rate": 0.00010217391304347825,
"loss": 0.7822,
"step": 47
},
{
"epoch": 0.10497539639147074,
"grad_norm": 0.26310572028160095,
"learning_rate": 0.00010434782608695651,
"loss": 0.844,
"step": 48
},
{
"epoch": 0.10716238381629306,
"grad_norm": 0.20629820227622986,
"learning_rate": 0.00010652173913043477,
"loss": 0.9024,
"step": 49
},
{
"epoch": 0.10934937124111536,
"grad_norm": 0.40926942229270935,
"learning_rate": 0.00010869565217391303,
"loss": 0.8497,
"step": 50
},
{
"epoch": 0.11153635866593767,
"grad_norm": 0.34393706917762756,
"learning_rate": 0.00011086956521739128,
"loss": 0.8326,
"step": 51
},
{
"epoch": 0.11372334609075997,
"grad_norm": 0.25371822714805603,
"learning_rate": 0.00011304347826086956,
"loss": 1.0089,
"step": 52
},
{
"epoch": 0.11591033351558229,
"grad_norm": 0.3484710454940796,
"learning_rate": 0.00011521739130434782,
"loss": 0.7667,
"step": 53
},
{
"epoch": 0.1180973209404046,
"grad_norm": 0.5894125699996948,
"learning_rate": 0.00011739130434782608,
"loss": 0.7977,
"step": 54
},
{
"epoch": 0.1202843083652269,
"grad_norm": 0.29829731583595276,
"learning_rate": 0.00011956521739130434,
"loss": 0.7545,
"step": 55
},
{
"epoch": 0.1224712957900492,
"grad_norm": 0.4180648922920227,
"learning_rate": 0.00012173913043478261,
"loss": 0.9833,
"step": 56
},
{
"epoch": 0.12465828321487152,
"grad_norm": 0.24174439907073975,
"learning_rate": 0.00012391304347826086,
"loss": 0.5948,
"step": 57
},
{
"epoch": 0.1268452706396938,
"grad_norm": 0.253364235162735,
"learning_rate": 0.00012608695652173912,
"loss": 0.7528,
"step": 58
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.31262415647506714,
"learning_rate": 0.00012826086956521738,
"loss": 0.7635,
"step": 59
},
{
"epoch": 0.13121924548933844,
"grad_norm": 0.2893831729888916,
"learning_rate": 0.00013043478260869564,
"loss": 0.7426,
"step": 60
},
{
"epoch": 0.13340623291416073,
"grad_norm": 0.26717469096183777,
"learning_rate": 0.0001326086956521739,
"loss": 0.7747,
"step": 61
},
{
"epoch": 0.13559322033898305,
"grad_norm": 0.3445766270160675,
"learning_rate": 0.00013478260869565216,
"loss": 0.802,
"step": 62
},
{
"epoch": 0.13778020776380537,
"grad_norm": 0.3893512487411499,
"learning_rate": 0.00013695652173913042,
"loss": 1.0112,
"step": 63
},
{
"epoch": 0.13996719518862766,
"grad_norm": 0.2807013988494873,
"learning_rate": 0.00013913043478260868,
"loss": 0.832,
"step": 64
},
{
"epoch": 0.14215418261344998,
"grad_norm": 0.3300040662288666,
"learning_rate": 0.00014130434782608694,
"loss": 0.8425,
"step": 65
},
{
"epoch": 0.1443411700382723,
"grad_norm": 0.3051323890686035,
"learning_rate": 0.0001434782608695652,
"loss": 0.7218,
"step": 66
},
{
"epoch": 0.14652815746309458,
"grad_norm": 0.25623396039009094,
"learning_rate": 0.00014565217391304347,
"loss": 0.7398,
"step": 67
},
{
"epoch": 0.1487151448879169,
"grad_norm": 0.3793390989303589,
"learning_rate": 0.00014782608695652173,
"loss": 0.7293,
"step": 68
},
{
"epoch": 0.1509021323127392,
"grad_norm": 0.3046607971191406,
"learning_rate": 0.00015,
"loss": 0.7507,
"step": 69
},
{
"epoch": 0.1530891197375615,
"grad_norm": 0.23061273992061615,
"learning_rate": 0.00015217391304347825,
"loss": 0.6682,
"step": 70
},
{
"epoch": 0.15527610716238383,
"grad_norm": 0.3328089714050293,
"learning_rate": 0.00015434782608695648,
"loss": 0.6736,
"step": 71
},
{
"epoch": 0.15746309458720611,
"grad_norm": 0.4419778287410736,
"learning_rate": 0.00015652173913043477,
"loss": 0.8789,
"step": 72
},
{
"epoch": 0.15965008201202843,
"grad_norm": 0.3310529291629791,
"learning_rate": 0.00015869565217391303,
"loss": 0.8108,
"step": 73
},
{
"epoch": 0.16183706943685075,
"grad_norm": 0.4529496729373932,
"learning_rate": 0.00016086956521739132,
"loss": 1.0239,
"step": 74
},
{
"epoch": 0.16402405686167304,
"grad_norm": 0.3741857707500458,
"learning_rate": 0.00016304347826086955,
"loss": 0.7601,
"step": 75
},
{
"epoch": 0.16621104428649536,
"grad_norm": 0.2660742700099945,
"learning_rate": 0.0001652173913043478,
"loss": 0.7989,
"step": 76
},
{
"epoch": 0.16839803171131765,
"grad_norm": 0.28130316734313965,
"learning_rate": 0.00016739130434782607,
"loss": 0.8459,
"step": 77
},
{
"epoch": 0.17058501913613996,
"grad_norm": 0.3322678804397583,
"learning_rate": 0.00016956521739130433,
"loss": 0.7567,
"step": 78
},
{
"epoch": 0.17277200656096228,
"grad_norm": 0.30039381980895996,
"learning_rate": 0.0001717391304347826,
"loss": 0.7353,
"step": 79
},
{
"epoch": 0.17495899398578457,
"grad_norm": 0.30451035499572754,
"learning_rate": 0.00017391304347826085,
"loss": 0.7913,
"step": 80
},
{
"epoch": 0.1771459814106069,
"grad_norm": 0.30815356969833374,
"learning_rate": 0.00017608695652173914,
"loss": 0.7766,
"step": 81
},
{
"epoch": 0.1793329688354292,
"grad_norm": 0.5257038474082947,
"learning_rate": 0.00017826086956521738,
"loss": 0.7486,
"step": 82
},
{
"epoch": 0.1815199562602515,
"grad_norm": 0.22373591363430023,
"learning_rate": 0.00018043478260869564,
"loss": 0.79,
"step": 83
},
{
"epoch": 0.18370694368507381,
"grad_norm": 0.21466179192066193,
"learning_rate": 0.0001826086956521739,
"loss": 0.6091,
"step": 84
},
{
"epoch": 0.1858939311098961,
"grad_norm": 0.3204774558544159,
"learning_rate": 0.00018478260869565216,
"loss": 1.015,
"step": 85
},
{
"epoch": 0.18808091853471842,
"grad_norm": 0.272977739572525,
"learning_rate": 0.00018695652173913042,
"loss": 0.7317,
"step": 86
},
{
"epoch": 0.19026790595954074,
"grad_norm": 0.32803332805633545,
"learning_rate": 0.00018913043478260868,
"loss": 0.7552,
"step": 87
},
{
"epoch": 0.19245489338436303,
"grad_norm": 0.308023065328598,
"learning_rate": 0.0001913043478260869,
"loss": 0.7058,
"step": 88
},
{
"epoch": 0.19464188080918535,
"grad_norm": 0.2604801654815674,
"learning_rate": 0.0001934782608695652,
"loss": 0.6967,
"step": 89
},
{
"epoch": 0.19682886823400766,
"grad_norm": 0.3489021062850952,
"learning_rate": 0.00019565217391304346,
"loss": 0.7518,
"step": 90
},
{
"epoch": 0.19901585565882995,
"grad_norm": 0.6137279272079468,
"learning_rate": 0.00019782608695652172,
"loss": 0.635,
"step": 91
},
{
"epoch": 0.20120284308365227,
"grad_norm": 0.41480115056037903,
"learning_rate": 0.00019999999999999998,
"loss": 0.8928,
"step": 92
},
{
"epoch": 0.2033898305084746,
"grad_norm": 0.22284042835235596,
"learning_rate": 0.00020217391304347824,
"loss": 0.5862,
"step": 93
},
{
"epoch": 0.20557681793329688,
"grad_norm": 0.233658567070961,
"learning_rate": 0.0002043478260869565,
"loss": 0.8148,
"step": 94
},
{
"epoch": 0.2077638053581192,
"grad_norm": 0.21716511249542236,
"learning_rate": 0.00020652173913043474,
"loss": 0.6474,
"step": 95
},
{
"epoch": 0.2099507927829415,
"grad_norm": 0.506393551826477,
"learning_rate": 0.00020869565217391303,
"loss": 0.7149,
"step": 96
},
{
"epoch": 0.2121377802077638,
"grad_norm": 0.3504016697406769,
"learning_rate": 0.00021086956521739129,
"loss": 0.647,
"step": 97
},
{
"epoch": 0.21432476763258612,
"grad_norm": 0.28688108921051025,
"learning_rate": 0.00021304347826086955,
"loss": 0.6584,
"step": 98
},
{
"epoch": 0.2165117550574084,
"grad_norm": 0.35572630167007446,
"learning_rate": 0.0002152173913043478,
"loss": 0.8177,
"step": 99
},
{
"epoch": 0.21869874248223073,
"grad_norm": 0.30645623803138733,
"learning_rate": 0.00021739130434782607,
"loss": 0.7421,
"step": 100
},
{
"epoch": 0.22088572990705305,
"grad_norm": 0.480013370513916,
"learning_rate": 0.00021956521739130433,
"loss": 0.7542,
"step": 101
},
{
"epoch": 0.22307271733187534,
"grad_norm": 0.23101027309894562,
"learning_rate": 0.00022173913043478256,
"loss": 0.81,
"step": 102
},
{
"epoch": 0.22525970475669765,
"grad_norm": 0.37322309613227844,
"learning_rate": 0.00022391304347826085,
"loss": 0.8879,
"step": 103
},
{
"epoch": 0.22744669218151994,
"grad_norm": 1.5672107934951782,
"learning_rate": 0.0002260869565217391,
"loss": 0.7838,
"step": 104
},
{
"epoch": 0.22963367960634226,
"grad_norm": 0.5281320810317993,
"learning_rate": 0.0002282608695652174,
"loss": 0.7246,
"step": 105
},
{
"epoch": 0.23182066703116458,
"grad_norm": 0.597309947013855,
"learning_rate": 0.00023043478260869563,
"loss": 0.6229,
"step": 106
},
{
"epoch": 0.23400765445598687,
"grad_norm": 0.29928773641586304,
"learning_rate": 0.0002326086956521739,
"loss": 0.779,
"step": 107
},
{
"epoch": 0.2361946418808092,
"grad_norm": 0.3042626678943634,
"learning_rate": 0.00023478260869565215,
"loss": 0.6647,
"step": 108
},
{
"epoch": 0.2383816293056315,
"grad_norm": 0.3099993169307709,
"learning_rate": 0.00023695652173913041,
"loss": 0.8173,
"step": 109
},
{
"epoch": 0.2405686167304538,
"grad_norm": 0.21835339069366455,
"learning_rate": 0.00023913043478260867,
"loss": 0.7145,
"step": 110
},
{
"epoch": 0.2427556041552761,
"grad_norm": 0.2737351357936859,
"learning_rate": 0.00024130434782608694,
"loss": 0.754,
"step": 111
},
{
"epoch": 0.2449425915800984,
"grad_norm": 0.2737314999103546,
"learning_rate": 0.00024347826086956522,
"loss": 0.6692,
"step": 112
},
{
"epoch": 0.24712957900492072,
"grad_norm": 0.369526743888855,
"learning_rate": 0.00024565217391304343,
"loss": 0.7039,
"step": 113
},
{
"epoch": 0.24931656642974304,
"grad_norm": 0.2262083888053894,
"learning_rate": 0.0002478260869565217,
"loss": 0.6004,
"step": 114
},
{
"epoch": 0.25150355385456535,
"grad_norm": 0.42596694827079773,
"learning_rate": 0.00025,
"loss": 0.8972,
"step": 115
},
{
"epoch": 0.2536905412793876,
"grad_norm": 0.4870564043521881,
"learning_rate": 0.00025217391304347824,
"loss": 0.7305,
"step": 116
},
{
"epoch": 0.25587752870420993,
"grad_norm": 0.3326433598995209,
"learning_rate": 0.00025434782608695647,
"loss": 0.7079,
"step": 117
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.3588925004005432,
"learning_rate": 0.00025652173913043476,
"loss": 0.7682,
"step": 118
},
{
"epoch": 0.26025150355385457,
"grad_norm": 0.2966621518135071,
"learning_rate": 0.00025869565217391305,
"loss": 0.8244,
"step": 119
},
{
"epoch": 0.2624384909786769,
"grad_norm": 0.2213324010372162,
"learning_rate": 0.0002608695652173913,
"loss": 0.841,
"step": 120
},
{
"epoch": 0.2646254784034992,
"grad_norm": 0.28340932726860046,
"learning_rate": 0.00026304347826086957,
"loss": 0.7646,
"step": 121
},
{
"epoch": 0.26681246582832147,
"grad_norm": 0.3026011884212494,
"learning_rate": 0.0002652173913043478,
"loss": 0.8269,
"step": 122
},
{
"epoch": 0.2689994532531438,
"grad_norm": 0.3213091194629669,
"learning_rate": 0.00026739130434782604,
"loss": 0.7456,
"step": 123
},
{
"epoch": 0.2711864406779661,
"grad_norm": 0.24254000186920166,
"learning_rate": 0.0002695652173913043,
"loss": 0.786,
"step": 124
},
{
"epoch": 0.2733734281027884,
"grad_norm": 0.22490260004997253,
"learning_rate": 0.0002717391304347826,
"loss": 0.8288,
"step": 125
},
{
"epoch": 0.27556041552761074,
"grad_norm": 0.2039777934551239,
"learning_rate": 0.00027391304347826085,
"loss": 0.7204,
"step": 126
},
{
"epoch": 0.277747402952433,
"grad_norm": 0.2281191200017929,
"learning_rate": 0.0002760869565217391,
"loss": 0.5744,
"step": 127
},
{
"epoch": 0.2799343903772553,
"grad_norm": 0.33240583539009094,
"learning_rate": 0.00027826086956521737,
"loss": 0.6398,
"step": 128
},
{
"epoch": 0.28212137780207763,
"grad_norm": 0.38755086064338684,
"learning_rate": 0.00028043478260869565,
"loss": 0.6739,
"step": 129
},
{
"epoch": 0.28430836522689995,
"grad_norm": 0.5284032821655273,
"learning_rate": 0.0002826086956521739,
"loss": 1.0215,
"step": 130
},
{
"epoch": 0.28649535265172227,
"grad_norm": 0.8248558044433594,
"learning_rate": 0.0002847826086956521,
"loss": 0.6937,
"step": 131
},
{
"epoch": 0.2886823400765446,
"grad_norm": 0.264347106218338,
"learning_rate": 0.0002869565217391304,
"loss": 0.6745,
"step": 132
},
{
"epoch": 0.29086932750136685,
"grad_norm": 0.24335810542106628,
"learning_rate": 0.00028913043478260864,
"loss": 0.8085,
"step": 133
},
{
"epoch": 0.29305631492618917,
"grad_norm": 0.2641212046146393,
"learning_rate": 0.00029130434782608693,
"loss": 0.6991,
"step": 134
},
{
"epoch": 0.2952433023510115,
"grad_norm": 0.2698618769645691,
"learning_rate": 0.0002934782608695652,
"loss": 0.7643,
"step": 135
},
{
"epoch": 0.2974302897758338,
"grad_norm": 0.24988499283790588,
"learning_rate": 0.00029565217391304345,
"loss": 0.8905,
"step": 136
},
{
"epoch": 0.2996172772006561,
"grad_norm": 0.2180056869983673,
"learning_rate": 0.0002978260869565217,
"loss": 0.7743,
"step": 137
},
{
"epoch": 0.3018042646254784,
"grad_norm": 0.23834429681301117,
"learning_rate": 0.0003,
"loss": 0.6164,
"step": 138
},
{
"epoch": 0.3039912520503007,
"grad_norm": 0.33471837639808655,
"learning_rate": 0.00029975669099756687,
"loss": 0.9367,
"step": 139
},
{
"epoch": 0.306178239475123,
"grad_norm": 0.22311441600322723,
"learning_rate": 0.0002995133819951338,
"loss": 0.8235,
"step": 140
},
{
"epoch": 0.30836522689994533,
"grad_norm": 0.16766682267189026,
"learning_rate": 0.0002992700729927007,
"loss": 0.6212,
"step": 141
},
{
"epoch": 0.31055221432476765,
"grad_norm": 0.21076077222824097,
"learning_rate": 0.0002990267639902676,
"loss": 0.7472,
"step": 142
},
{
"epoch": 0.3127392017495899,
"grad_norm": 0.33612027764320374,
"learning_rate": 0.0002987834549878345,
"loss": 0.7475,
"step": 143
},
{
"epoch": 0.31492618917441223,
"grad_norm": 0.2724473476409912,
"learning_rate": 0.0002985401459854014,
"loss": 0.7422,
"step": 144
},
{
"epoch": 0.31711317659923455,
"grad_norm": 0.23170293867588043,
"learning_rate": 0.0002982968369829683,
"loss": 0.7233,
"step": 145
},
{
"epoch": 0.31930016402405687,
"grad_norm": 0.2461654394865036,
"learning_rate": 0.00029805352798053527,
"loss": 0.6717,
"step": 146
},
{
"epoch": 0.3214871514488792,
"grad_norm": 0.2988247573375702,
"learning_rate": 0.00029781021897810217,
"loss": 0.8926,
"step": 147
},
{
"epoch": 0.3236741388737015,
"grad_norm": 0.18185736238956451,
"learning_rate": 0.00029756690997566907,
"loss": 0.6663,
"step": 148
},
{
"epoch": 0.32586112629852376,
"grad_norm": 0.276687890291214,
"learning_rate": 0.000297323600973236,
"loss": 0.6903,
"step": 149
},
{
"epoch": 0.3280481137233461,
"grad_norm": 0.3481093645095825,
"learning_rate": 0.0002970802919708029,
"loss": 0.7468,
"step": 150
},
{
"epoch": 0.3302351011481684,
"grad_norm": 0.21930567920207977,
"learning_rate": 0.0002968369829683698,
"loss": 0.6268,
"step": 151
},
{
"epoch": 0.3324220885729907,
"grad_norm": 0.18267425894737244,
"learning_rate": 0.0002965936739659367,
"loss": 0.7194,
"step": 152
},
{
"epoch": 0.33460907599781303,
"grad_norm": 0.7277535200119019,
"learning_rate": 0.0002963503649635036,
"loss": 0.7393,
"step": 153
},
{
"epoch": 0.3367960634226353,
"grad_norm": 0.3378921151161194,
"learning_rate": 0.0002961070559610705,
"loss": 0.7413,
"step": 154
},
{
"epoch": 0.3389830508474576,
"grad_norm": 0.20400595664978027,
"learning_rate": 0.00029586374695863746,
"loss": 0.7604,
"step": 155
},
{
"epoch": 0.34117003827227993,
"grad_norm": 0.3428679406642914,
"learning_rate": 0.00029562043795620436,
"loss": 0.6905,
"step": 156
},
{
"epoch": 0.34335702569710225,
"grad_norm": 0.25741925835609436,
"learning_rate": 0.00029537712895377126,
"loss": 0.8333,
"step": 157
},
{
"epoch": 0.34554401312192456,
"grad_norm": 0.2198708951473236,
"learning_rate": 0.00029513381995133816,
"loss": 0.7183,
"step": 158
},
{
"epoch": 0.3477310005467469,
"grad_norm": 0.2663215696811676,
"learning_rate": 0.0002948905109489051,
"loss": 0.6736,
"step": 159
},
{
"epoch": 0.34991798797156914,
"grad_norm": 0.26539289951324463,
"learning_rate": 0.000294647201946472,
"loss": 0.7691,
"step": 160
},
{
"epoch": 0.35210497539639146,
"grad_norm": 0.21398472785949707,
"learning_rate": 0.0002944038929440389,
"loss": 0.7259,
"step": 161
},
{
"epoch": 0.3542919628212138,
"grad_norm": 0.27584224939346313,
"learning_rate": 0.0002941605839416058,
"loss": 0.7451,
"step": 162
},
{
"epoch": 0.3564789502460361,
"grad_norm": 0.27322661876678467,
"learning_rate": 0.0002939172749391727,
"loss": 0.7429,
"step": 163
},
{
"epoch": 0.3586659376708584,
"grad_norm": 0.3097633421421051,
"learning_rate": 0.0002936739659367396,
"loss": 0.7925,
"step": 164
},
{
"epoch": 0.3608529250956807,
"grad_norm": 0.235543355345726,
"learning_rate": 0.00029343065693430656,
"loss": 0.6892,
"step": 165
},
{
"epoch": 0.363039912520503,
"grad_norm": 0.34558114409446716,
"learning_rate": 0.00029318734793187345,
"loss": 0.8239,
"step": 166
},
{
"epoch": 0.3652268999453253,
"grad_norm": 0.5169651508331299,
"learning_rate": 0.00029294403892944035,
"loss": 0.5348,
"step": 167
},
{
"epoch": 0.36741388737014763,
"grad_norm": 0.4853683412075043,
"learning_rate": 0.0002927007299270073,
"loss": 0.7482,
"step": 168
},
{
"epoch": 0.36960087479496995,
"grad_norm": 0.3244207203388214,
"learning_rate": 0.0002924574209245742,
"loss": 0.6755,
"step": 169
},
{
"epoch": 0.3717878622197922,
"grad_norm": 0.3096265494823456,
"learning_rate": 0.0002922141119221411,
"loss": 0.8395,
"step": 170
},
{
"epoch": 0.3739748496446145,
"grad_norm": 0.21022038161754608,
"learning_rate": 0.000291970802919708,
"loss": 0.7376,
"step": 171
},
{
"epoch": 0.37616183706943684,
"grad_norm": 0.23877666890621185,
"learning_rate": 0.0002917274939172749,
"loss": 0.7051,
"step": 172
},
{
"epoch": 0.37834882449425916,
"grad_norm": 0.4041813015937805,
"learning_rate": 0.0002914841849148418,
"loss": 0.6341,
"step": 173
},
{
"epoch": 0.3805358119190815,
"grad_norm": 0.45476263761520386,
"learning_rate": 0.00029124087591240875,
"loss": 0.6939,
"step": 174
},
{
"epoch": 0.3827227993439038,
"grad_norm": 0.3100184202194214,
"learning_rate": 0.00029099756690997565,
"loss": 0.6321,
"step": 175
},
{
"epoch": 0.38490978676872606,
"grad_norm": 0.31327834725379944,
"learning_rate": 0.00029075425790754255,
"loss": 0.623,
"step": 176
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.23366397619247437,
"learning_rate": 0.0002905109489051095,
"loss": 0.6799,
"step": 177
},
{
"epoch": 0.3892837616183707,
"grad_norm": 0.312284380197525,
"learning_rate": 0.0002902676399026764,
"loss": 0.6979,
"step": 178
},
{
"epoch": 0.391470749043193,
"grad_norm": 0.39591529965400696,
"learning_rate": 0.0002900243309002433,
"loss": 0.8571,
"step": 179
},
{
"epoch": 0.39365773646801533,
"grad_norm": 0.22407367825508118,
"learning_rate": 0.0002897810218978102,
"loss": 0.7724,
"step": 180
},
{
"epoch": 0.3958447238928376,
"grad_norm": 0.41758400201797485,
"learning_rate": 0.0002895377128953771,
"loss": 0.5597,
"step": 181
},
{
"epoch": 0.3980317113176599,
"grad_norm": 0.22731241583824158,
"learning_rate": 0.000289294403892944,
"loss": 0.7618,
"step": 182
},
{
"epoch": 0.4002186987424822,
"grad_norm": 0.24491345882415771,
"learning_rate": 0.00028905109489051094,
"loss": 0.6777,
"step": 183
},
{
"epoch": 0.40240568616730454,
"grad_norm": 0.2861243188381195,
"learning_rate": 0.00028880778588807784,
"loss": 0.8928,
"step": 184
},
{
"epoch": 0.40459267359212686,
"grad_norm": 0.30325135588645935,
"learning_rate": 0.00028856447688564474,
"loss": 0.6794,
"step": 185
},
{
"epoch": 0.4067796610169492,
"grad_norm": 0.22165870666503906,
"learning_rate": 0.0002883211678832117,
"loss": 0.7288,
"step": 186
},
{
"epoch": 0.40896664844177144,
"grad_norm": 0.265067994594574,
"learning_rate": 0.0002880778588807786,
"loss": 0.6641,
"step": 187
},
{
"epoch": 0.41115363586659376,
"grad_norm": 0.3085087835788727,
"learning_rate": 0.0002878345498783455,
"loss": 0.7916,
"step": 188
},
{
"epoch": 0.4133406232914161,
"grad_norm": 0.30947744846343994,
"learning_rate": 0.0002875912408759124,
"loss": 0.834,
"step": 189
},
{
"epoch": 0.4155276107162384,
"grad_norm": 0.2581535875797272,
"learning_rate": 0.0002873479318734793,
"loss": 0.6255,
"step": 190
},
{
"epoch": 0.4177145981410607,
"grad_norm": 0.24718667566776276,
"learning_rate": 0.0002871046228710462,
"loss": 0.7883,
"step": 191
},
{
"epoch": 0.419901585565883,
"grad_norm": 0.2618321180343628,
"learning_rate": 0.00028686131386861314,
"loss": 0.6922,
"step": 192
},
{
"epoch": 0.4220885729907053,
"grad_norm": 0.24760881066322327,
"learning_rate": 0.00028661800486618004,
"loss": 0.7304,
"step": 193
},
{
"epoch": 0.4242755604155276,
"grad_norm": 0.27126792073249817,
"learning_rate": 0.00028637469586374693,
"loss": 0.5676,
"step": 194
},
{
"epoch": 0.4264625478403499,
"grad_norm": 0.1799423098564148,
"learning_rate": 0.00028613138686131383,
"loss": 0.7223,
"step": 195
},
{
"epoch": 0.42864953526517224,
"grad_norm": 0.2653333246707916,
"learning_rate": 0.0002858880778588808,
"loss": 0.7486,
"step": 196
},
{
"epoch": 0.4308365226899945,
"grad_norm": 0.17445164918899536,
"learning_rate": 0.0002856447688564477,
"loss": 0.6661,
"step": 197
},
{
"epoch": 0.4330235101148168,
"grad_norm": 0.20842154324054718,
"learning_rate": 0.0002854014598540146,
"loss": 0.5784,
"step": 198
},
{
"epoch": 0.43521049753963914,
"grad_norm": 0.2216557264328003,
"learning_rate": 0.0002851581508515815,
"loss": 0.8205,
"step": 199
},
{
"epoch": 0.43739748496446146,
"grad_norm": 0.3524712920188904,
"learning_rate": 0.0002849148418491484,
"loss": 0.8784,
"step": 200
},
{
"epoch": 0.4395844723892838,
"grad_norm": 0.22435776889324188,
"learning_rate": 0.0002846715328467153,
"loss": 0.7975,
"step": 201
},
{
"epoch": 0.4417714598141061,
"grad_norm": 0.33707621693611145,
"learning_rate": 0.00028442822384428223,
"loss": 0.8767,
"step": 202
},
{
"epoch": 0.44395844723892836,
"grad_norm": 0.20236724615097046,
"learning_rate": 0.00028418491484184913,
"loss": 0.6695,
"step": 203
},
{
"epoch": 0.4461454346637507,
"grad_norm": 0.26543137431144714,
"learning_rate": 0.000283941605839416,
"loss": 0.7137,
"step": 204
},
{
"epoch": 0.448332422088573,
"grad_norm": 0.21210715174674988,
"learning_rate": 0.000283698296836983,
"loss": 0.8809,
"step": 205
},
{
"epoch": 0.4505194095133953,
"grad_norm": 0.21614502370357513,
"learning_rate": 0.0002834549878345499,
"loss": 0.6771,
"step": 206
},
{
"epoch": 0.4527063969382176,
"grad_norm": 0.30795833468437195,
"learning_rate": 0.0002832116788321168,
"loss": 0.6966,
"step": 207
},
{
"epoch": 0.4548933843630399,
"grad_norm": 0.4060954749584198,
"learning_rate": 0.0002829683698296837,
"loss": 0.7059,
"step": 208
},
{
"epoch": 0.4570803717878622,
"grad_norm": 0.24772609770298004,
"learning_rate": 0.00028272506082725057,
"loss": 0.6992,
"step": 209
},
{
"epoch": 0.4592673592126845,
"grad_norm": 0.2909943461418152,
"learning_rate": 0.00028248175182481747,
"loss": 0.8624,
"step": 210
},
{
"epoch": 0.46145434663750684,
"grad_norm": 0.2036535143852234,
"learning_rate": 0.0002822384428223844,
"loss": 0.7753,
"step": 211
},
{
"epoch": 0.46364133406232916,
"grad_norm": 0.1994384229183197,
"learning_rate": 0.0002819951338199513,
"loss": 0.7294,
"step": 212
},
{
"epoch": 0.4658283214871515,
"grad_norm": 0.2482912242412567,
"learning_rate": 0.0002817518248175182,
"loss": 0.6213,
"step": 213
},
{
"epoch": 0.46801530891197374,
"grad_norm": 0.42890939116477966,
"learning_rate": 0.0002815085158150851,
"loss": 0.8935,
"step": 214
},
{
"epoch": 0.47020229633679606,
"grad_norm": 0.24268397688865662,
"learning_rate": 0.000281265206812652,
"loss": 0.6253,
"step": 215
},
{
"epoch": 0.4723892837616184,
"grad_norm": 0.3331579267978668,
"learning_rate": 0.00028102189781021897,
"loss": 0.7022,
"step": 216
},
{
"epoch": 0.4745762711864407,
"grad_norm": 0.34377002716064453,
"learning_rate": 0.00028077858880778587,
"loss": 0.8386,
"step": 217
},
{
"epoch": 0.476763258611263,
"grad_norm": 0.2543483078479767,
"learning_rate": 0.00028053527980535277,
"loss": 0.6084,
"step": 218
},
{
"epoch": 0.47895024603608527,
"grad_norm": 0.30651986598968506,
"learning_rate": 0.00028029197080291966,
"loss": 0.7624,
"step": 219
},
{
"epoch": 0.4811372334609076,
"grad_norm": 0.3476787209510803,
"learning_rate": 0.0002800486618004866,
"loss": 0.822,
"step": 220
},
{
"epoch": 0.4833242208857299,
"grad_norm": 0.3727283477783203,
"learning_rate": 0.0002798053527980535,
"loss": 0.7416,
"step": 221
},
{
"epoch": 0.4855112083105522,
"grad_norm": 0.3289774954319,
"learning_rate": 0.0002795620437956204,
"loss": 0.8264,
"step": 222
},
{
"epoch": 0.48769819573537454,
"grad_norm": 0.26083284616470337,
"learning_rate": 0.0002793187347931873,
"loss": 0.6279,
"step": 223
},
{
"epoch": 0.4898851831601968,
"grad_norm": 0.2844780683517456,
"learning_rate": 0.0002790754257907542,
"loss": 0.6315,
"step": 224
},
{
"epoch": 0.4920721705850191,
"grad_norm": 0.3443123996257782,
"learning_rate": 0.0002788321167883211,
"loss": 0.6538,
"step": 225
},
{
"epoch": 0.49425915800984144,
"grad_norm": 0.23209474980831146,
"learning_rate": 0.00027858880778588806,
"loss": 0.7205,
"step": 226
},
{
"epoch": 0.49644614543466375,
"grad_norm": 0.26261788606643677,
"learning_rate": 0.00027834549878345496,
"loss": 0.7253,
"step": 227
},
{
"epoch": 0.4986331328594861,
"grad_norm": 0.28650718927383423,
"learning_rate": 0.00027810218978102186,
"loss": 0.889,
"step": 228
},
{
"epoch": 0.5008201202843083,
"grad_norm": 0.2478565275669098,
"learning_rate": 0.0002778588807785888,
"loss": 0.7619,
"step": 229
},
{
"epoch": 0.5030071077091307,
"grad_norm": 0.17673347890377045,
"learning_rate": 0.0002776155717761557,
"loss": 0.8684,
"step": 230
},
{
"epoch": 0.505194095133953,
"grad_norm": 0.28806573152542114,
"learning_rate": 0.0002773722627737226,
"loss": 0.7499,
"step": 231
},
{
"epoch": 0.5073810825587752,
"grad_norm": 0.2507832646369934,
"learning_rate": 0.0002771289537712895,
"loss": 0.9297,
"step": 232
},
{
"epoch": 0.5095680699835976,
"grad_norm": 0.29228198528289795,
"learning_rate": 0.0002768856447688564,
"loss": 0.8578,
"step": 233
},
{
"epoch": 0.5117550574084199,
"grad_norm": 0.5378915667533875,
"learning_rate": 0.0002766423357664233,
"loss": 0.8647,
"step": 234
},
{
"epoch": 0.5139420448332422,
"grad_norm": 0.6002528071403503,
"learning_rate": 0.0002763990267639902,
"loss": 0.8368,
"step": 235
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.19659245014190674,
"learning_rate": 0.00027615571776155715,
"loss": 0.6983,
"step": 236
},
{
"epoch": 0.5183160196828869,
"grad_norm": 0.2815648913383484,
"learning_rate": 0.00027591240875912405,
"loss": 0.7741,
"step": 237
},
{
"epoch": 0.5205030071077091,
"grad_norm": 0.2534239888191223,
"learning_rate": 0.00027566909975669095,
"loss": 0.9392,
"step": 238
},
{
"epoch": 0.5226899945325314,
"grad_norm": 0.30477020144462585,
"learning_rate": 0.0002754257907542579,
"loss": 0.7839,
"step": 239
},
{
"epoch": 0.5248769819573538,
"grad_norm": 0.321443647146225,
"learning_rate": 0.0002751824817518248,
"loss": 0.8445,
"step": 240
},
{
"epoch": 0.527063969382176,
"grad_norm": 0.3917739689350128,
"learning_rate": 0.0002749391727493917,
"loss": 0.6641,
"step": 241
},
{
"epoch": 0.5292509568069984,
"grad_norm": 0.2380986511707306,
"learning_rate": 0.0002746958637469586,
"loss": 0.8242,
"step": 242
},
{
"epoch": 0.5314379442318207,
"grad_norm": 0.1695939153432846,
"learning_rate": 0.0002744525547445255,
"loss": 0.7013,
"step": 243
},
{
"epoch": 0.5336249316566429,
"grad_norm": 0.24696393311023712,
"learning_rate": 0.0002742092457420924,
"loss": 0.8488,
"step": 244
},
{
"epoch": 0.5358119190814653,
"grad_norm": 0.2278507500886917,
"learning_rate": 0.00027396593673965935,
"loss": 0.7894,
"step": 245
},
{
"epoch": 0.5379989065062876,
"grad_norm": 0.41331958770751953,
"learning_rate": 0.00027372262773722625,
"loss": 0.8343,
"step": 246
},
{
"epoch": 0.5401858939311099,
"grad_norm": 0.29076704382896423,
"learning_rate": 0.00027347931873479315,
"loss": 0.995,
"step": 247
},
{
"epoch": 0.5423728813559322,
"grad_norm": 0.23243111371994019,
"learning_rate": 0.0002732360097323601,
"loss": 0.7456,
"step": 248
},
{
"epoch": 0.5445598687807545,
"grad_norm": 0.21154357492923737,
"learning_rate": 0.000272992700729927,
"loss": 0.6853,
"step": 249
},
{
"epoch": 0.5467468562055768,
"grad_norm": 0.24274934828281403,
"learning_rate": 0.0002727493917274939,
"loss": 0.6452,
"step": 250
},
{
"epoch": 0.5489338436303991,
"grad_norm": 0.37139129638671875,
"learning_rate": 0.0002725060827250608,
"loss": 0.7449,
"step": 251
},
{
"epoch": 0.5511208310552215,
"grad_norm": 0.17621925473213196,
"learning_rate": 0.0002722627737226277,
"loss": 0.6824,
"step": 252
},
{
"epoch": 0.5533078184800437,
"grad_norm": 0.19210177659988403,
"learning_rate": 0.0002720194647201946,
"loss": 0.6186,
"step": 253
},
{
"epoch": 0.555494805904866,
"grad_norm": 0.21780337393283844,
"learning_rate": 0.00027177615571776154,
"loss": 0.663,
"step": 254
},
{
"epoch": 0.5576817933296884,
"grad_norm": 0.21192163228988647,
"learning_rate": 0.00027153284671532844,
"loss": 0.8801,
"step": 255
},
{
"epoch": 0.5598687807545106,
"grad_norm": 0.27523308992385864,
"learning_rate": 0.00027128953771289534,
"loss": 0.6769,
"step": 256
},
{
"epoch": 0.562055768179333,
"grad_norm": 0.24207553267478943,
"learning_rate": 0.0002710462287104623,
"loss": 0.4965,
"step": 257
},
{
"epoch": 0.5642427556041553,
"grad_norm": 0.33707237243652344,
"learning_rate": 0.0002708029197080292,
"loss": 0.7787,
"step": 258
},
{
"epoch": 0.5664297430289775,
"grad_norm": 0.2669321596622467,
"learning_rate": 0.0002705596107055961,
"loss": 1.0172,
"step": 259
},
{
"epoch": 0.5686167304537999,
"grad_norm": 0.26386845111846924,
"learning_rate": 0.000270316301703163,
"loss": 0.6477,
"step": 260
},
{
"epoch": 0.5708037178786222,
"grad_norm": 0.304721474647522,
"learning_rate": 0.0002700729927007299,
"loss": 0.8301,
"step": 261
},
{
"epoch": 0.5729907053034445,
"grad_norm": 0.20255905389785767,
"learning_rate": 0.0002698296836982968,
"loss": 0.5643,
"step": 262
},
{
"epoch": 0.5751776927282668,
"grad_norm": 0.2723388671875,
"learning_rate": 0.00026958637469586374,
"loss": 0.6883,
"step": 263
},
{
"epoch": 0.5773646801530892,
"grad_norm": 0.27381351590156555,
"learning_rate": 0.00026934306569343063,
"loss": 0.808,
"step": 264
},
{
"epoch": 0.5795516675779114,
"grad_norm": 0.25915855169296265,
"learning_rate": 0.00026909975669099753,
"loss": 0.722,
"step": 265
},
{
"epoch": 0.5817386550027337,
"grad_norm": 0.22392873466014862,
"learning_rate": 0.0002688564476885645,
"loss": 0.6744,
"step": 266
},
{
"epoch": 0.5839256424275561,
"grad_norm": 0.2078748643398285,
"learning_rate": 0.0002686131386861314,
"loss": 0.8127,
"step": 267
},
{
"epoch": 0.5861126298523783,
"grad_norm": 0.18671007454395294,
"learning_rate": 0.0002683698296836983,
"loss": 0.6276,
"step": 268
},
{
"epoch": 0.5882996172772007,
"grad_norm": 0.3014012575149536,
"learning_rate": 0.0002681265206812652,
"loss": 0.7543,
"step": 269
},
{
"epoch": 0.590486604702023,
"grad_norm": 0.23588421940803528,
"learning_rate": 0.0002678832116788321,
"loss": 0.8301,
"step": 270
},
{
"epoch": 0.5926735921268452,
"grad_norm": 0.37635311484336853,
"learning_rate": 0.000267639902676399,
"loss": 0.8239,
"step": 271
},
{
"epoch": 0.5948605795516676,
"grad_norm": 0.23310554027557373,
"learning_rate": 0.0002673965936739659,
"loss": 0.8723,
"step": 272
},
{
"epoch": 0.5970475669764899,
"grad_norm": 0.47537633776664734,
"learning_rate": 0.00026715328467153283,
"loss": 0.7915,
"step": 273
},
{
"epoch": 0.5992345544013122,
"grad_norm": 0.2815110981464386,
"learning_rate": 0.0002669099756690997,
"loss": 0.8004,
"step": 274
},
{
"epoch": 0.6014215418261345,
"grad_norm": 0.19834642112255096,
"learning_rate": 0.0002666666666666666,
"loss": 0.7457,
"step": 275
},
{
"epoch": 0.6036085292509568,
"grad_norm": 0.5626861453056335,
"learning_rate": 0.0002664233576642336,
"loss": 0.6196,
"step": 276
},
{
"epoch": 0.6057955166757791,
"grad_norm": 0.2784450054168701,
"learning_rate": 0.0002661800486618005,
"loss": 0.6365,
"step": 277
},
{
"epoch": 0.6079825041006014,
"grad_norm": 0.23809124529361725,
"learning_rate": 0.0002659367396593674,
"loss": 0.7889,
"step": 278
},
{
"epoch": 0.6101694915254238,
"grad_norm": 0.25168001651763916,
"learning_rate": 0.0002656934306569343,
"loss": 0.6327,
"step": 279
},
{
"epoch": 0.612356478950246,
"grad_norm": 0.2970046401023865,
"learning_rate": 0.00026545012165450117,
"loss": 0.6913,
"step": 280
},
{
"epoch": 0.6145434663750683,
"grad_norm": 0.3090710937976837,
"learning_rate": 0.00026520681265206807,
"loss": 0.7131,
"step": 281
},
{
"epoch": 0.6167304537998907,
"grad_norm": 0.2775273621082306,
"learning_rate": 0.000264963503649635,
"loss": 0.8556,
"step": 282
},
{
"epoch": 0.6189174412247129,
"grad_norm": 0.3191220164299011,
"learning_rate": 0.0002647201946472019,
"loss": 0.8762,
"step": 283
},
{
"epoch": 0.6211044286495353,
"grad_norm": 0.2520481050014496,
"learning_rate": 0.0002644768856447688,
"loss": 0.6358,
"step": 284
},
{
"epoch": 0.6232914160743576,
"grad_norm": 0.31783685088157654,
"learning_rate": 0.00026423357664233577,
"loss": 0.773,
"step": 285
},
{
"epoch": 0.6254784034991798,
"grad_norm": 0.33624374866485596,
"learning_rate": 0.00026399026763990267,
"loss": 0.963,
"step": 286
},
{
"epoch": 0.6276653909240022,
"grad_norm": 0.3576049208641052,
"learning_rate": 0.00026374695863746957,
"loss": 0.6658,
"step": 287
},
{
"epoch": 0.6298523783488245,
"grad_norm": 0.2659110426902771,
"learning_rate": 0.00026350364963503647,
"loss": 0.6662,
"step": 288
},
{
"epoch": 0.6320393657736468,
"grad_norm": 0.3657420575618744,
"learning_rate": 0.00026326034063260337,
"loss": 0.9873,
"step": 289
},
{
"epoch": 0.6342263531984691,
"grad_norm": 0.24509188532829285,
"learning_rate": 0.00026301703163017026,
"loss": 0.7795,
"step": 290
},
{
"epoch": 0.6364133406232915,
"grad_norm": 0.24286092817783356,
"learning_rate": 0.0002627737226277372,
"loss": 0.7611,
"step": 291
},
{
"epoch": 0.6386003280481137,
"grad_norm": 0.2804836332798004,
"learning_rate": 0.0002625304136253041,
"loss": 0.759,
"step": 292
},
{
"epoch": 0.640787315472936,
"grad_norm": 0.3322978615760803,
"learning_rate": 0.000262287104622871,
"loss": 0.6943,
"step": 293
},
{
"epoch": 0.6429743028977584,
"grad_norm": 0.2114831805229187,
"learning_rate": 0.00026204379562043797,
"loss": 0.6729,
"step": 294
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.2177094966173172,
"learning_rate": 0.00026180048661800486,
"loss": 0.7916,
"step": 295
},
{
"epoch": 0.647348277747403,
"grad_norm": 0.2582005560398102,
"learning_rate": 0.00026155717761557176,
"loss": 0.7655,
"step": 296
},
{
"epoch": 0.6495352651722253,
"grad_norm": 0.2613639831542969,
"learning_rate": 0.00026131386861313866,
"loss": 0.6482,
"step": 297
},
{
"epoch": 0.6517222525970475,
"grad_norm": 0.2764948606491089,
"learning_rate": 0.00026107055961070556,
"loss": 0.7022,
"step": 298
},
{
"epoch": 0.6539092400218699,
"grad_norm": 0.20186789333820343,
"learning_rate": 0.00026082725060827246,
"loss": 0.7853,
"step": 299
},
{
"epoch": 0.6560962274466922,
"grad_norm": 0.3178173303604126,
"learning_rate": 0.0002605839416058394,
"loss": 0.8393,
"step": 300
},
{
"epoch": 0.6582832148715145,
"grad_norm": 0.35939186811447144,
"learning_rate": 0.0002603406326034063,
"loss": 0.7078,
"step": 301
},
{
"epoch": 0.6604702022963368,
"grad_norm": 0.3983876407146454,
"learning_rate": 0.0002600973236009732,
"loss": 0.8271,
"step": 302
},
{
"epoch": 0.6626571897211591,
"grad_norm": 0.19504043459892273,
"learning_rate": 0.00025985401459854016,
"loss": 0.7748,
"step": 303
},
{
"epoch": 0.6648441771459814,
"grad_norm": 0.21278342604637146,
"learning_rate": 0.00025961070559610706,
"loss": 0.8016,
"step": 304
},
{
"epoch": 0.6670311645708037,
"grad_norm": 0.29927191138267517,
"learning_rate": 0.00025936739659367396,
"loss": 0.844,
"step": 305
},
{
"epoch": 0.6692181519956261,
"grad_norm": 0.22748655080795288,
"learning_rate": 0.00025912408759124085,
"loss": 0.6786,
"step": 306
},
{
"epoch": 0.6714051394204483,
"grad_norm": 0.21796458959579468,
"learning_rate": 0.00025888077858880775,
"loss": 0.8343,
"step": 307
},
{
"epoch": 0.6735921268452706,
"grad_norm": 0.26962918043136597,
"learning_rate": 0.00025863746958637465,
"loss": 0.8058,
"step": 308
},
{
"epoch": 0.675779114270093,
"grad_norm": 0.2169698178768158,
"learning_rate": 0.00025839416058394155,
"loss": 0.8341,
"step": 309
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.5226082801818848,
"learning_rate": 0.0002581508515815085,
"loss": 0.8038,
"step": 310
},
{
"epoch": 0.6801530891197376,
"grad_norm": 0.2540872395038605,
"learning_rate": 0.0002579075425790754,
"loss": 0.6485,
"step": 311
},
{
"epoch": 0.6823400765445599,
"grad_norm": 0.2758027911186218,
"learning_rate": 0.0002576642335766423,
"loss": 0.7258,
"step": 312
},
{
"epoch": 0.6845270639693821,
"grad_norm": 0.3712478280067444,
"learning_rate": 0.00025742092457420925,
"loss": 1.0087,
"step": 313
},
{
"epoch": 0.6867140513942045,
"grad_norm": 0.29959022998809814,
"learning_rate": 0.00025717761557177615,
"loss": 0.7344,
"step": 314
},
{
"epoch": 0.6889010388190268,
"grad_norm": 0.29603782296180725,
"learning_rate": 0.00025693430656934305,
"loss": 0.7633,
"step": 315
},
{
"epoch": 0.6910880262438491,
"grad_norm": 0.26212218403816223,
"learning_rate": 0.00025669099756690995,
"loss": 0.7762,
"step": 316
},
{
"epoch": 0.6932750136686714,
"grad_norm": 0.2501971423625946,
"learning_rate": 0.00025644768856447685,
"loss": 0.6449,
"step": 317
},
{
"epoch": 0.6954620010934938,
"grad_norm": 0.20236985385417938,
"learning_rate": 0.00025620437956204374,
"loss": 0.6661,
"step": 318
},
{
"epoch": 0.697648988518316,
"grad_norm": 0.28867748379707336,
"learning_rate": 0.0002559610705596107,
"loss": 0.7168,
"step": 319
},
{
"epoch": 0.6998359759431383,
"grad_norm": 0.25392022728919983,
"learning_rate": 0.0002557177615571776,
"loss": 0.8255,
"step": 320
},
{
"epoch": 0.7020229633679607,
"grad_norm": 0.2739144563674927,
"learning_rate": 0.0002554744525547445,
"loss": 0.8782,
"step": 321
},
{
"epoch": 0.7042099507927829,
"grad_norm": 0.3195747137069702,
"learning_rate": 0.00025523114355231145,
"loss": 0.7681,
"step": 322
},
{
"epoch": 0.7063969382176053,
"grad_norm": 0.6262739300727844,
"learning_rate": 0.00025498783454987834,
"loss": 0.6497,
"step": 323
},
{
"epoch": 0.7085839256424276,
"grad_norm": 0.18836063146591187,
"learning_rate": 0.00025474452554744524,
"loss": 0.6773,
"step": 324
},
{
"epoch": 0.7107709130672498,
"grad_norm": 0.428913950920105,
"learning_rate": 0.00025450121654501214,
"loss": 0.6359,
"step": 325
},
{
"epoch": 0.7129579004920722,
"grad_norm": 0.2561635375022888,
"learning_rate": 0.00025425790754257904,
"loss": 0.6768,
"step": 326
},
{
"epoch": 0.7151448879168945,
"grad_norm": 0.2519037425518036,
"learning_rate": 0.00025401459854014594,
"loss": 0.941,
"step": 327
},
{
"epoch": 0.7173318753417168,
"grad_norm": 0.22086481750011444,
"learning_rate": 0.0002537712895377129,
"loss": 0.6448,
"step": 328
},
{
"epoch": 0.7195188627665391,
"grad_norm": 0.3844771385192871,
"learning_rate": 0.0002535279805352798,
"loss": 0.6043,
"step": 329
},
{
"epoch": 0.7217058501913614,
"grad_norm": 0.2547963857650757,
"learning_rate": 0.0002532846715328467,
"loss": 0.9912,
"step": 330
},
{
"epoch": 0.7238928376161837,
"grad_norm": 0.40474840998649597,
"learning_rate": 0.00025304136253041364,
"loss": 0.5905,
"step": 331
},
{
"epoch": 0.726079825041006,
"grad_norm": 0.20748649537563324,
"learning_rate": 0.00025279805352798054,
"loss": 0.6245,
"step": 332
},
{
"epoch": 0.7282668124658284,
"grad_norm": 0.29902809858322144,
"learning_rate": 0.00025255474452554744,
"loss": 0.7478,
"step": 333
},
{
"epoch": 0.7304537998906506,
"grad_norm": 0.21671514213085175,
"learning_rate": 0.00025231143552311433,
"loss": 0.5296,
"step": 334
},
{
"epoch": 0.7326407873154729,
"grad_norm": 0.1979508250951767,
"learning_rate": 0.00025206812652068123,
"loss": 0.5523,
"step": 335
},
{
"epoch": 0.7348277747402953,
"grad_norm": 0.25213825702667236,
"learning_rate": 0.00025182481751824813,
"loss": 0.9787,
"step": 336
},
{
"epoch": 0.7370147621651175,
"grad_norm": 0.32967931032180786,
"learning_rate": 0.0002515815085158151,
"loss": 0.7161,
"step": 337
},
{
"epoch": 0.7392017495899399,
"grad_norm": 0.30640098452568054,
"learning_rate": 0.000251338199513382,
"loss": 0.9517,
"step": 338
},
{
"epoch": 0.7413887370147622,
"grad_norm": 0.1820855438709259,
"learning_rate": 0.0002510948905109489,
"loss": 0.6219,
"step": 339
},
{
"epoch": 0.7435757244395844,
"grad_norm": 0.29584068059921265,
"learning_rate": 0.00025085158150851583,
"loss": 0.7692,
"step": 340
},
{
"epoch": 0.7457627118644068,
"grad_norm": 0.3015952408313751,
"learning_rate": 0.00025060827250608273,
"loss": 0.812,
"step": 341
},
{
"epoch": 0.747949699289229,
"grad_norm": 0.364886611700058,
"learning_rate": 0.00025036496350364963,
"loss": 0.7881,
"step": 342
},
{
"epoch": 0.7501366867140514,
"grad_norm": 0.2170587182044983,
"learning_rate": 0.00025012165450121653,
"loss": 0.6989,
"step": 343
},
{
"epoch": 0.7523236741388737,
"grad_norm": 0.23260867595672607,
"learning_rate": 0.00024987834549878343,
"loss": 0.6581,
"step": 344
},
{
"epoch": 0.7545106615636961,
"grad_norm": 0.36740902066230774,
"learning_rate": 0.0002496350364963503,
"loss": 0.9984,
"step": 345
},
{
"epoch": 0.7566976489885183,
"grad_norm": 0.6248576641082764,
"learning_rate": 0.0002493917274939172,
"loss": 0.9879,
"step": 346
},
{
"epoch": 0.7588846364133406,
"grad_norm": 0.44404783844947815,
"learning_rate": 0.0002491484184914842,
"loss": 0.616,
"step": 347
},
{
"epoch": 0.761071623838163,
"grad_norm": 0.2840265929698944,
"learning_rate": 0.0002489051094890511,
"loss": 0.9053,
"step": 348
},
{
"epoch": 0.7632586112629852,
"grad_norm": 0.34335142374038696,
"learning_rate": 0.000248661800486618,
"loss": 0.7877,
"step": 349
},
{
"epoch": 0.7654455986878076,
"grad_norm": 0.28032955527305603,
"learning_rate": 0.0002484184914841849,
"loss": 0.5934,
"step": 350
},
{
"epoch": 0.7676325861126299,
"grad_norm": 0.35794079303741455,
"learning_rate": 0.0002481751824817518,
"loss": 0.736,
"step": 351
},
{
"epoch": 0.7698195735374521,
"grad_norm": 0.1937468945980072,
"learning_rate": 0.0002479318734793187,
"loss": 0.7268,
"step": 352
},
{
"epoch": 0.7720065609622745,
"grad_norm": 0.2442459911108017,
"learning_rate": 0.0002476885644768856,
"loss": 0.9092,
"step": 353
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.2178357094526291,
"learning_rate": 0.0002474452554744525,
"loss": 0.832,
"step": 354
},
{
"epoch": 0.7763805358119191,
"grad_norm": 0.2904297113418579,
"learning_rate": 0.0002472019464720194,
"loss": 0.6973,
"step": 355
},
{
"epoch": 0.7785675232367414,
"grad_norm": 0.2849595248699188,
"learning_rate": 0.00024695863746958637,
"loss": 0.8439,
"step": 356
},
{
"epoch": 0.7807545106615636,
"grad_norm": 0.30786654353141785,
"learning_rate": 0.00024671532846715327,
"loss": 0.8282,
"step": 357
},
{
"epoch": 0.782941498086386,
"grad_norm": 0.2731088697910309,
"learning_rate": 0.00024647201946472017,
"loss": 0.7614,
"step": 358
},
{
"epoch": 0.7851284855112083,
"grad_norm": 0.2967981696128845,
"learning_rate": 0.0002462287104622871,
"loss": 0.7059,
"step": 359
},
{
"epoch": 0.7873154729360307,
"grad_norm": 0.2427809238433838,
"learning_rate": 0.000245985401459854,
"loss": 0.5235,
"step": 360
},
{
"epoch": 0.7895024603608529,
"grad_norm": 0.3543761074542999,
"learning_rate": 0.0002457420924574209,
"loss": 0.6882,
"step": 361
},
{
"epoch": 0.7916894477856752,
"grad_norm": 0.2084377259016037,
"learning_rate": 0.0002454987834549878,
"loss": 0.6333,
"step": 362
},
{
"epoch": 0.7938764352104976,
"grad_norm": 0.3653489649295807,
"learning_rate": 0.0002452554744525547,
"loss": 0.8776,
"step": 363
},
{
"epoch": 0.7960634226353198,
"grad_norm": 0.2806954085826874,
"learning_rate": 0.0002450121654501216,
"loss": 0.7464,
"step": 364
},
{
"epoch": 0.7982504100601422,
"grad_norm": 0.3652292788028717,
"learning_rate": 0.00024476885644768856,
"loss": 0.93,
"step": 365
},
{
"epoch": 0.8004373974849645,
"grad_norm": 0.24262574315071106,
"learning_rate": 0.00024452554744525546,
"loss": 0.8502,
"step": 366
},
{
"epoch": 0.8026243849097867,
"grad_norm": 0.273867666721344,
"learning_rate": 0.00024428223844282236,
"loss": 0.9274,
"step": 367
},
{
"epoch": 0.8048113723346091,
"grad_norm": 0.21722102165222168,
"learning_rate": 0.0002440389294403893,
"loss": 0.8045,
"step": 368
},
{
"epoch": 0.8069983597594313,
"grad_norm": 0.19634899497032166,
"learning_rate": 0.00024379562043795619,
"loss": 0.7424,
"step": 369
},
{
"epoch": 0.8091853471842537,
"grad_norm": 0.27201011776924133,
"learning_rate": 0.00024355231143552308,
"loss": 0.797,
"step": 370
},
{
"epoch": 0.811372334609076,
"grad_norm": 0.254142165184021,
"learning_rate": 0.00024330900243309,
"loss": 0.6142,
"step": 371
},
{
"epoch": 0.8135593220338984,
"grad_norm": 0.7009087204933167,
"learning_rate": 0.0002430656934306569,
"loss": 0.6703,
"step": 372
},
{
"epoch": 0.8157463094587206,
"grad_norm": 0.2147742360830307,
"learning_rate": 0.0002428223844282238,
"loss": 0.8446,
"step": 373
},
{
"epoch": 0.8179332968835429,
"grad_norm": 0.18214701116085052,
"learning_rate": 0.00024257907542579076,
"loss": 0.6536,
"step": 374
},
{
"epoch": 0.8201202843083653,
"grad_norm": 0.22022093832492828,
"learning_rate": 0.00024233576642335766,
"loss": 0.7452,
"step": 375
},
{
"epoch": 0.8223072717331875,
"grad_norm": 0.19220127165317535,
"learning_rate": 0.00024209245742092456,
"loss": 0.699,
"step": 376
},
{
"epoch": 0.8244942591580099,
"grad_norm": 0.26980119943618774,
"learning_rate": 0.00024184914841849148,
"loss": 0.8433,
"step": 377
},
{
"epoch": 0.8266812465828322,
"grad_norm": 0.1975000947713852,
"learning_rate": 0.00024160583941605838,
"loss": 0.5667,
"step": 378
},
{
"epoch": 0.8288682340076544,
"grad_norm": 0.28691354393959045,
"learning_rate": 0.00024136253041362528,
"loss": 0.764,
"step": 379
},
{
"epoch": 0.8310552214324768,
"grad_norm": 0.23176266252994537,
"learning_rate": 0.0002411192214111922,
"loss": 0.5348,
"step": 380
},
{
"epoch": 0.833242208857299,
"grad_norm": 0.2583778202533722,
"learning_rate": 0.0002408759124087591,
"loss": 0.8583,
"step": 381
},
{
"epoch": 0.8354291962821214,
"grad_norm": 0.1877242922782898,
"learning_rate": 0.000240632603406326,
"loss": 0.6818,
"step": 382
},
{
"epoch": 0.8376161837069437,
"grad_norm": 0.3764333724975586,
"learning_rate": 0.0002403892944038929,
"loss": 0.8631,
"step": 383
},
{
"epoch": 0.839803171131766,
"grad_norm": 0.30223846435546875,
"learning_rate": 0.00024014598540145985,
"loss": 0.7702,
"step": 384
},
{
"epoch": 0.8419901585565883,
"grad_norm": 0.43627509474754333,
"learning_rate": 0.00023990267639902675,
"loss": 0.8994,
"step": 385
},
{
"epoch": 0.8441771459814106,
"grad_norm": 0.2544715404510498,
"learning_rate": 0.00023965936739659365,
"loss": 0.6475,
"step": 386
},
{
"epoch": 0.846364133406233,
"grad_norm": 0.23747164011001587,
"learning_rate": 0.00023941605839416057,
"loss": 0.7199,
"step": 387
},
{
"epoch": 0.8485511208310552,
"grad_norm": 0.3392624855041504,
"learning_rate": 0.00023917274939172747,
"loss": 0.763,
"step": 388
},
{
"epoch": 0.8507381082558775,
"grad_norm": 0.25245627760887146,
"learning_rate": 0.00023892944038929437,
"loss": 0.7532,
"step": 389
},
{
"epoch": 0.8529250956806999,
"grad_norm": 0.2674003839492798,
"learning_rate": 0.0002386861313868613,
"loss": 0.599,
"step": 390
},
{
"epoch": 0.8551120831055221,
"grad_norm": 0.27161166071891785,
"learning_rate": 0.0002384428223844282,
"loss": 0.9355,
"step": 391
},
{
"epoch": 0.8572990705303445,
"grad_norm": 0.18150918185710907,
"learning_rate": 0.0002381995133819951,
"loss": 0.6056,
"step": 392
},
{
"epoch": 0.8594860579551667,
"grad_norm": 0.22968190908432007,
"learning_rate": 0.00023795620437956204,
"loss": 0.767,
"step": 393
},
{
"epoch": 0.861673045379989,
"grad_norm": 0.21685199439525604,
"learning_rate": 0.00023771289537712894,
"loss": 0.7246,
"step": 394
},
{
"epoch": 0.8638600328048114,
"grad_norm": 0.26542550325393677,
"learning_rate": 0.00023746958637469584,
"loss": 0.7106,
"step": 395
},
{
"epoch": 0.8660470202296336,
"grad_norm": 0.23525013029575348,
"learning_rate": 0.00023722627737226277,
"loss": 0.6958,
"step": 396
},
{
"epoch": 0.868234007654456,
"grad_norm": 0.20633290708065033,
"learning_rate": 0.00023698296836982967,
"loss": 0.643,
"step": 397
},
{
"epoch": 0.8704209950792783,
"grad_norm": 0.21550309658050537,
"learning_rate": 0.00023673965936739656,
"loss": 0.7449,
"step": 398
},
{
"epoch": 0.8726079825041007,
"grad_norm": 0.2124805748462677,
"learning_rate": 0.0002364963503649635,
"loss": 0.7398,
"step": 399
},
{
"epoch": 0.8747949699289229,
"grad_norm": 0.21294209361076355,
"learning_rate": 0.0002362530413625304,
"loss": 0.7934,
"step": 400
},
{
"epoch": 0.8769819573537452,
"grad_norm": 0.36196568608283997,
"learning_rate": 0.00023600973236009729,
"loss": 0.7848,
"step": 401
},
{
"epoch": 0.8791689447785676,
"grad_norm": 0.27596211433410645,
"learning_rate": 0.0002357664233576642,
"loss": 0.7286,
"step": 402
},
{
"epoch": 0.8813559322033898,
"grad_norm": 0.27594348788261414,
"learning_rate": 0.00023552311435523114,
"loss": 0.8247,
"step": 403
},
{
"epoch": 0.8835429196282122,
"grad_norm": 0.2970782518386841,
"learning_rate": 0.00023527980535279804,
"loss": 0.7548,
"step": 404
},
{
"epoch": 0.8857299070530344,
"grad_norm": 0.39152461290359497,
"learning_rate": 0.00023503649635036496,
"loss": 0.8263,
"step": 405
},
{
"epoch": 0.8879168944778567,
"grad_norm": 0.42587387561798096,
"learning_rate": 0.00023479318734793186,
"loss": 0.9905,
"step": 406
},
{
"epoch": 0.8901038819026791,
"grad_norm": 0.314147412776947,
"learning_rate": 0.00023454987834549876,
"loss": 0.6665,
"step": 407
},
{
"epoch": 0.8922908693275013,
"grad_norm": 0.34058940410614014,
"learning_rate": 0.00023430656934306568,
"loss": 0.7359,
"step": 408
},
{
"epoch": 0.8944778567523237,
"grad_norm": 0.2528778612613678,
"learning_rate": 0.00023406326034063258,
"loss": 0.693,
"step": 409
},
{
"epoch": 0.896664844177146,
"grad_norm": 0.17990703880786896,
"learning_rate": 0.00023381995133819948,
"loss": 0.7565,
"step": 410
},
{
"epoch": 0.8988518316019682,
"grad_norm": 0.17062903940677643,
"learning_rate": 0.0002335766423357664,
"loss": 0.7891,
"step": 411
},
{
"epoch": 0.9010388190267906,
"grad_norm": 0.3442295789718628,
"learning_rate": 0.0002333333333333333,
"loss": 0.6173,
"step": 412
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.45662209391593933,
"learning_rate": 0.0002330900243309002,
"loss": 0.796,
"step": 413
},
{
"epoch": 0.9054127938764353,
"grad_norm": 0.17335475981235504,
"learning_rate": 0.00023284671532846715,
"loss": 0.6825,
"step": 414
},
{
"epoch": 0.9075997813012575,
"grad_norm": 0.22652967274188995,
"learning_rate": 0.00023260340632603405,
"loss": 0.7512,
"step": 415
},
{
"epoch": 0.9097867687260798,
"grad_norm": 0.349649041891098,
"learning_rate": 0.00023236009732360095,
"loss": 0.8205,
"step": 416
},
{
"epoch": 0.9119737561509021,
"grad_norm": 0.18699604272842407,
"learning_rate": 0.00023211678832116788,
"loss": 0.6451,
"step": 417
},
{
"epoch": 0.9141607435757244,
"grad_norm": 0.2398325353860855,
"learning_rate": 0.00023187347931873478,
"loss": 0.6891,
"step": 418
},
{
"epoch": 0.9163477310005468,
"grad_norm": 0.22116120159626007,
"learning_rate": 0.00023163017031630167,
"loss": 0.6765,
"step": 419
},
{
"epoch": 0.918534718425369,
"grad_norm": 0.24642986059188843,
"learning_rate": 0.00023138686131386857,
"loss": 0.6119,
"step": 420
},
{
"epoch": 0.9207217058501913,
"grad_norm": 0.2329958975315094,
"learning_rate": 0.0002311435523114355,
"loss": 0.7286,
"step": 421
},
{
"epoch": 0.9229086932750137,
"grad_norm": 0.5355735421180725,
"learning_rate": 0.0002309002433090024,
"loss": 0.79,
"step": 422
},
{
"epoch": 0.9250956806998359,
"grad_norm": 0.4554167091846466,
"learning_rate": 0.0002306569343065693,
"loss": 0.6942,
"step": 423
},
{
"epoch": 0.9272826681246583,
"grad_norm": 0.2831968367099762,
"learning_rate": 0.00023041362530413625,
"loss": 0.7531,
"step": 424
},
{
"epoch": 0.9294696555494806,
"grad_norm": 0.2321235090494156,
"learning_rate": 0.00023017031630170315,
"loss": 0.6902,
"step": 425
},
{
"epoch": 0.931656642974303,
"grad_norm": 0.4006916880607605,
"learning_rate": 0.00022992700729927004,
"loss": 0.6725,
"step": 426
},
{
"epoch": 0.9338436303991252,
"grad_norm": 0.3189490735530853,
"learning_rate": 0.00022968369829683697,
"loss": 0.769,
"step": 427
},
{
"epoch": 0.9360306178239475,
"grad_norm": 0.4294585585594177,
"learning_rate": 0.00022944038929440387,
"loss": 0.8656,
"step": 428
},
{
"epoch": 0.9382176052487698,
"grad_norm": 0.34347137808799744,
"learning_rate": 0.00022919708029197077,
"loss": 0.5948,
"step": 429
},
{
"epoch": 0.9404045926735921,
"grad_norm": 0.21789056062698364,
"learning_rate": 0.0002289537712895377,
"loss": 0.8035,
"step": 430
},
{
"epoch": 0.9425915800984145,
"grad_norm": 0.1835460364818573,
"learning_rate": 0.0002287104622871046,
"loss": 0.6128,
"step": 431
},
{
"epoch": 0.9447785675232367,
"grad_norm": 0.3390374183654785,
"learning_rate": 0.0002284671532846715,
"loss": 0.7788,
"step": 432
},
{
"epoch": 0.946965554948059,
"grad_norm": 0.23330353200435638,
"learning_rate": 0.00022822384428223844,
"loss": 0.7653,
"step": 433
},
{
"epoch": 0.9491525423728814,
"grad_norm": 0.2357734590768814,
"learning_rate": 0.00022798053527980534,
"loss": 0.765,
"step": 434
},
{
"epoch": 0.9513395297977036,
"grad_norm": 0.2517554759979248,
"learning_rate": 0.00022773722627737224,
"loss": 0.7815,
"step": 435
},
{
"epoch": 0.953526517222526,
"grad_norm": 0.23417727649211884,
"learning_rate": 0.00022749391727493916,
"loss": 0.9801,
"step": 436
},
{
"epoch": 0.9557135046473483,
"grad_norm": 0.256149023771286,
"learning_rate": 0.00022725060827250606,
"loss": 0.734,
"step": 437
},
{
"epoch": 0.9579004920721705,
"grad_norm": 0.31608134508132935,
"learning_rate": 0.00022700729927007296,
"loss": 0.707,
"step": 438
},
{
"epoch": 0.9600874794969929,
"grad_norm": 0.23100577294826508,
"learning_rate": 0.00022676399026763989,
"loss": 0.6734,
"step": 439
},
{
"epoch": 0.9622744669218152,
"grad_norm": 0.27026960253715515,
"learning_rate": 0.00022652068126520678,
"loss": 0.7884,
"step": 440
},
{
"epoch": 0.9644614543466375,
"grad_norm": 0.24245603382587433,
"learning_rate": 0.00022627737226277368,
"loss": 0.5405,
"step": 441
},
{
"epoch": 0.9666484417714598,
"grad_norm": 0.25354650616645813,
"learning_rate": 0.00022603406326034064,
"loss": 0.629,
"step": 442
},
{
"epoch": 0.9688354291962821,
"grad_norm": 0.35559025406837463,
"learning_rate": 0.00022579075425790753,
"loss": 0.5673,
"step": 443
},
{
"epoch": 0.9710224166211044,
"grad_norm": 0.18353384733200073,
"learning_rate": 0.00022554744525547443,
"loss": 0.7391,
"step": 444
},
{
"epoch": 0.9732094040459267,
"grad_norm": 0.20255619287490845,
"learning_rate": 0.00022530413625304136,
"loss": 0.605,
"step": 445
},
{
"epoch": 0.9753963914707491,
"grad_norm": 0.24910545349121094,
"learning_rate": 0.00022506082725060826,
"loss": 0.7387,
"step": 446
},
{
"epoch": 0.9775833788955713,
"grad_norm": 0.30054211616516113,
"learning_rate": 0.00022481751824817515,
"loss": 0.7649,
"step": 447
},
{
"epoch": 0.9797703663203936,
"grad_norm": 0.2318667322397232,
"learning_rate": 0.00022457420924574208,
"loss": 0.6788,
"step": 448
},
{
"epoch": 0.981957353745216,
"grad_norm": 0.27025488018989563,
"learning_rate": 0.00022433090024330898,
"loss": 0.8761,
"step": 449
},
{
"epoch": 0.9841443411700382,
"grad_norm": 0.324431836605072,
"learning_rate": 0.00022408759124087588,
"loss": 0.5286,
"step": 450
},
{
"epoch": 0.9863313285948606,
"grad_norm": 0.22321289777755737,
"learning_rate": 0.00022384428223844283,
"loss": 0.9685,
"step": 451
},
{
"epoch": 0.9885183160196829,
"grad_norm": 0.348459929227829,
"learning_rate": 0.00022360097323600973,
"loss": 0.9153,
"step": 452
},
{
"epoch": 0.9907053034445052,
"grad_norm": 0.24513466656208038,
"learning_rate": 0.00022335766423357663,
"loss": 0.7944,
"step": 453
},
{
"epoch": 0.9928922908693275,
"grad_norm": 0.296447217464447,
"learning_rate": 0.00022311435523114355,
"loss": 0.7568,
"step": 454
},
{
"epoch": 0.9950792782941498,
"grad_norm": 0.27960076928138733,
"learning_rate": 0.00022287104622871045,
"loss": 0.6744,
"step": 455
},
{
"epoch": 0.9972662657189721,
"grad_norm": 0.2234726995229721,
"learning_rate": 0.00022262773722627735,
"loss": 0.8226,
"step": 456
},
{
"epoch": 0.9994532531437944,
"grad_norm": 0.20796756446361542,
"learning_rate": 0.00022238442822384425,
"loss": 0.6815,
"step": 457
},
{
"epoch": 1.0016402405686167,
"grad_norm": 0.4041379392147064,
"learning_rate": 0.00022214111922141117,
"loss": 0.814,
"step": 458
},
{
"epoch": 1.003827227993439,
"grad_norm": 0.2340199053287506,
"learning_rate": 0.00022189781021897807,
"loss": 0.9068,
"step": 459
},
{
"epoch": 1.0060142154182614,
"grad_norm": 0.24355943500995636,
"learning_rate": 0.00022165450121654497,
"loss": 0.8377,
"step": 460
},
{
"epoch": 1.0082012028430836,
"grad_norm": 0.27959203720092773,
"learning_rate": 0.00022141119221411192,
"loss": 0.6917,
"step": 461
},
{
"epoch": 1.010388190267906,
"grad_norm": 0.28080224990844727,
"learning_rate": 0.00022116788321167882,
"loss": 0.6356,
"step": 462
},
{
"epoch": 1.0125751776927283,
"grad_norm": 0.48801225423812866,
"learning_rate": 0.00022092457420924572,
"loss": 0.5904,
"step": 463
},
{
"epoch": 1.0147621651175505,
"grad_norm": 0.22513045370578766,
"learning_rate": 0.00022068126520681264,
"loss": 1.0814,
"step": 464
},
{
"epoch": 1.0169491525423728,
"grad_norm": 0.24892054498195648,
"learning_rate": 0.00022043795620437954,
"loss": 0.682,
"step": 465
},
{
"epoch": 1.0191361399671952,
"grad_norm": 0.27827882766723633,
"learning_rate": 0.00022019464720194644,
"loss": 0.5133,
"step": 466
},
{
"epoch": 1.0213231273920176,
"grad_norm": 0.22580872476100922,
"learning_rate": 0.00021995133819951337,
"loss": 0.6408,
"step": 467
},
{
"epoch": 1.0235101148168397,
"grad_norm": 0.27323248982429504,
"learning_rate": 0.00021970802919708026,
"loss": 0.6774,
"step": 468
},
{
"epoch": 1.025697102241662,
"grad_norm": 0.2104388028383255,
"learning_rate": 0.00021946472019464716,
"loss": 0.7655,
"step": 469
},
{
"epoch": 1.0278840896664845,
"grad_norm": 0.26010340452194214,
"learning_rate": 0.00021922141119221412,
"loss": 0.6855,
"step": 470
},
{
"epoch": 1.0300710770913066,
"grad_norm": 0.22332607209682465,
"learning_rate": 0.00021897810218978101,
"loss": 0.8742,
"step": 471
},
{
"epoch": 1.032258064516129,
"grad_norm": 0.22284770011901855,
"learning_rate": 0.0002187347931873479,
"loss": 0.7075,
"step": 472
},
{
"epoch": 1.0344450519409514,
"grad_norm": 0.32503169775009155,
"learning_rate": 0.00021849148418491484,
"loss": 0.8198,
"step": 473
},
{
"epoch": 1.0366320393657737,
"grad_norm": 0.2516832947731018,
"learning_rate": 0.00021824817518248174,
"loss": 0.6606,
"step": 474
},
{
"epoch": 1.038819026790596,
"grad_norm": 0.20064838230609894,
"learning_rate": 0.00021800486618004863,
"loss": 0.6696,
"step": 475
},
{
"epoch": 1.0410060142154183,
"grad_norm": 0.24873629212379456,
"learning_rate": 0.00021776155717761556,
"loss": 0.8343,
"step": 476
},
{
"epoch": 1.0431930016402406,
"grad_norm": 0.23766379058361053,
"learning_rate": 0.00021751824817518246,
"loss": 0.6831,
"step": 477
},
{
"epoch": 1.0453799890650628,
"grad_norm": 0.24385926127433777,
"learning_rate": 0.00021727493917274936,
"loss": 0.6712,
"step": 478
},
{
"epoch": 1.0475669764898852,
"grad_norm": 0.3146672546863556,
"learning_rate": 0.00021703163017031628,
"loss": 0.6183,
"step": 479
},
{
"epoch": 1.0497539639147075,
"grad_norm": 0.25711727142333984,
"learning_rate": 0.0002167883211678832,
"loss": 0.6252,
"step": 480
},
{
"epoch": 1.0519409513395297,
"grad_norm": 0.2440115511417389,
"learning_rate": 0.0002165450121654501,
"loss": 0.7278,
"step": 481
},
{
"epoch": 1.054127938764352,
"grad_norm": 0.2689894735813141,
"learning_rate": 0.00021630170316301703,
"loss": 0.8418,
"step": 482
},
{
"epoch": 1.0563149261891744,
"grad_norm": 0.2136611044406891,
"learning_rate": 0.00021605839416058393,
"loss": 0.6313,
"step": 483
},
{
"epoch": 1.0585019136139968,
"grad_norm": 0.2452273964881897,
"learning_rate": 0.00021581508515815083,
"loss": 0.8624,
"step": 484
},
{
"epoch": 1.060688901038819,
"grad_norm": 0.24893832206726074,
"learning_rate": 0.00021557177615571775,
"loss": 0.7416,
"step": 485
},
{
"epoch": 1.0628758884636413,
"grad_norm": 0.25064295530319214,
"learning_rate": 0.00021532846715328465,
"loss": 0.7699,
"step": 486
},
{
"epoch": 1.0650628758884637,
"grad_norm": 0.20812906324863434,
"learning_rate": 0.00021508515815085155,
"loss": 0.6415,
"step": 487
},
{
"epoch": 1.0672498633132859,
"grad_norm": 0.1655895859003067,
"learning_rate": 0.00021484184914841848,
"loss": 0.5422,
"step": 488
},
{
"epoch": 1.0694368507381082,
"grad_norm": 0.32013434171676636,
"learning_rate": 0.00021459854014598537,
"loss": 0.7758,
"step": 489
},
{
"epoch": 1.0716238381629306,
"grad_norm": 0.3376011252403259,
"learning_rate": 0.00021435523114355227,
"loss": 0.829,
"step": 490
},
{
"epoch": 1.0738108255877528,
"grad_norm": 0.3153345584869385,
"learning_rate": 0.00021411192214111923,
"loss": 0.7714,
"step": 491
},
{
"epoch": 1.0759978130125751,
"grad_norm": 0.3034818470478058,
"learning_rate": 0.00021386861313868612,
"loss": 0.6347,
"step": 492
},
{
"epoch": 1.0781848004373975,
"grad_norm": 0.2922978699207306,
"learning_rate": 0.00021362530413625302,
"loss": 0.7736,
"step": 493
},
{
"epoch": 1.0803717878622199,
"grad_norm": 0.2873200476169586,
"learning_rate": 0.00021338199513381992,
"loss": 0.7169,
"step": 494
},
{
"epoch": 1.082558775287042,
"grad_norm": 0.19887448847293854,
"learning_rate": 0.00021313868613138685,
"loss": 0.591,
"step": 495
},
{
"epoch": 1.0847457627118644,
"grad_norm": 0.2438717931509018,
"learning_rate": 0.00021289537712895374,
"loss": 0.7372,
"step": 496
},
{
"epoch": 1.0869327501366868,
"grad_norm": 0.2844999432563782,
"learning_rate": 0.00021265206812652064,
"loss": 0.9492,
"step": 497
},
{
"epoch": 1.089119737561509,
"grad_norm": 0.23038767278194427,
"learning_rate": 0.00021240875912408757,
"loss": 0.6491,
"step": 498
},
{
"epoch": 1.0913067249863313,
"grad_norm": 0.25681063532829285,
"learning_rate": 0.00021216545012165447,
"loss": 0.7385,
"step": 499
},
{
"epoch": 1.0934937124111537,
"grad_norm": 0.26198524236679077,
"learning_rate": 0.00021192214111922137,
"loss": 0.6631,
"step": 500
},
{
"epoch": 1.095680699835976,
"grad_norm": 0.2462042272090912,
"learning_rate": 0.00021167883211678832,
"loss": 0.6845,
"step": 501
},
{
"epoch": 1.0978676872607982,
"grad_norm": 0.4053664803504944,
"learning_rate": 0.00021143552311435522,
"loss": 0.8192,
"step": 502
},
{
"epoch": 1.1000546746856206,
"grad_norm": 0.1960192620754242,
"learning_rate": 0.00021119221411192211,
"loss": 0.654,
"step": 503
},
{
"epoch": 1.102241662110443,
"grad_norm": 0.288463294506073,
"learning_rate": 0.00021094890510948904,
"loss": 0.845,
"step": 504
},
{
"epoch": 1.104428649535265,
"grad_norm": 0.2577453553676605,
"learning_rate": 0.00021070559610705594,
"loss": 0.7532,
"step": 505
},
{
"epoch": 1.1066156369600875,
"grad_norm": 0.2428467571735382,
"learning_rate": 0.00021046228710462284,
"loss": 0.633,
"step": 506
},
{
"epoch": 1.1088026243849098,
"grad_norm": 0.2504101097583771,
"learning_rate": 0.00021021897810218976,
"loss": 0.7633,
"step": 507
},
{
"epoch": 1.110989611809732,
"grad_norm": 0.30137497186660767,
"learning_rate": 0.00020997566909975666,
"loss": 0.7516,
"step": 508
},
{
"epoch": 1.1131765992345544,
"grad_norm": 0.26197975873947144,
"learning_rate": 0.00020973236009732356,
"loss": 0.772,
"step": 509
},
{
"epoch": 1.1153635866593767,
"grad_norm": 0.21030549705028534,
"learning_rate": 0.0002094890510948905,
"loss": 0.656,
"step": 510
},
{
"epoch": 1.117550574084199,
"grad_norm": 0.32491016387939453,
"learning_rate": 0.0002092457420924574,
"loss": 0.6437,
"step": 511
},
{
"epoch": 1.1197375615090213,
"grad_norm": 0.35852229595184326,
"learning_rate": 0.0002090024330900243,
"loss": 0.6878,
"step": 512
},
{
"epoch": 1.1219245489338436,
"grad_norm": 0.2437012642621994,
"learning_rate": 0.00020875912408759123,
"loss": 0.7602,
"step": 513
},
{
"epoch": 1.124111536358666,
"grad_norm": 0.30889564752578735,
"learning_rate": 0.00020851581508515813,
"loss": 0.8807,
"step": 514
},
{
"epoch": 1.1262985237834882,
"grad_norm": 0.24090994894504547,
"learning_rate": 0.00020827250608272503,
"loss": 0.6094,
"step": 515
},
{
"epoch": 1.1284855112083105,
"grad_norm": 0.22549685835838318,
"learning_rate": 0.00020802919708029196,
"loss": 0.6548,
"step": 516
},
{
"epoch": 1.130672498633133,
"grad_norm": 0.21927274763584137,
"learning_rate": 0.00020778588807785885,
"loss": 0.5024,
"step": 517
},
{
"epoch": 1.132859486057955,
"grad_norm": 0.2773030996322632,
"learning_rate": 0.00020754257907542575,
"loss": 0.7162,
"step": 518
},
{
"epoch": 1.1350464734827774,
"grad_norm": 0.23646964132785797,
"learning_rate": 0.0002072992700729927,
"loss": 0.495,
"step": 519
},
{
"epoch": 1.1372334609075998,
"grad_norm": 0.18650543689727783,
"learning_rate": 0.0002070559610705596,
"loss": 0.6832,
"step": 520
},
{
"epoch": 1.1394204483324222,
"grad_norm": 0.2712174952030182,
"learning_rate": 0.0002068126520681265,
"loss": 0.6178,
"step": 521
},
{
"epoch": 1.1416074357572443,
"grad_norm": 0.5166855454444885,
"learning_rate": 0.00020656934306569343,
"loss": 0.7423,
"step": 522
},
{
"epoch": 1.1437944231820667,
"grad_norm": 0.23658710718154907,
"learning_rate": 0.00020632603406326033,
"loss": 0.823,
"step": 523
},
{
"epoch": 1.145981410606889,
"grad_norm": 0.2502736747264862,
"learning_rate": 0.00020608272506082722,
"loss": 0.7652,
"step": 524
},
{
"epoch": 1.1481683980317112,
"grad_norm": 0.3579782545566559,
"learning_rate": 0.00020583941605839415,
"loss": 0.6607,
"step": 525
},
{
"epoch": 1.1503553854565336,
"grad_norm": 0.23584862053394318,
"learning_rate": 0.00020559610705596105,
"loss": 0.5478,
"step": 526
},
{
"epoch": 1.152542372881356,
"grad_norm": 0.20075763761997223,
"learning_rate": 0.00020535279805352795,
"loss": 0.4904,
"step": 527
},
{
"epoch": 1.1547293603061783,
"grad_norm": 0.28536489605903625,
"learning_rate": 0.0002051094890510949,
"loss": 0.725,
"step": 528
},
{
"epoch": 1.1569163477310005,
"grad_norm": 0.2919155955314636,
"learning_rate": 0.0002048661800486618,
"loss": 0.7854,
"step": 529
},
{
"epoch": 1.1591033351558229,
"grad_norm": 0.2859315574169159,
"learning_rate": 0.0002046228710462287,
"loss": 0.7588,
"step": 530
},
{
"epoch": 1.1612903225806452,
"grad_norm": 0.2310762107372284,
"learning_rate": 0.0002043795620437956,
"loss": 0.7313,
"step": 531
},
{
"epoch": 1.1634773100054674,
"grad_norm": 0.37531688809394836,
"learning_rate": 0.00020413625304136252,
"loss": 0.7386,
"step": 532
},
{
"epoch": 1.1656642974302898,
"grad_norm": 0.2388879358768463,
"learning_rate": 0.00020389294403892942,
"loss": 0.6976,
"step": 533
},
{
"epoch": 1.1678512848551121,
"grad_norm": 0.35468119382858276,
"learning_rate": 0.00020364963503649632,
"loss": 0.7769,
"step": 534
},
{
"epoch": 1.1700382722799345,
"grad_norm": 0.35036739706993103,
"learning_rate": 0.00020340632603406324,
"loss": 0.7023,
"step": 535
},
{
"epoch": 1.1722252597047567,
"grad_norm": 0.22455590963363647,
"learning_rate": 0.00020316301703163014,
"loss": 0.6198,
"step": 536
},
{
"epoch": 1.174412247129579,
"grad_norm": 0.2568056881427765,
"learning_rate": 0.00020291970802919704,
"loss": 0.8131,
"step": 537
},
{
"epoch": 1.1765992345544014,
"grad_norm": 0.2159530222415924,
"learning_rate": 0.000202676399026764,
"loss": 0.608,
"step": 538
},
{
"epoch": 1.1787862219792236,
"grad_norm": 0.3671428859233856,
"learning_rate": 0.0002024330900243309,
"loss": 0.7317,
"step": 539
},
{
"epoch": 1.180973209404046,
"grad_norm": 0.40387099981307983,
"learning_rate": 0.0002021897810218978,
"loss": 0.7829,
"step": 540
},
{
"epoch": 1.1831601968288683,
"grad_norm": 0.23750804364681244,
"learning_rate": 0.00020194647201946471,
"loss": 0.7261,
"step": 541
},
{
"epoch": 1.1853471842536905,
"grad_norm": 0.29545098543167114,
"learning_rate": 0.0002017031630170316,
"loss": 0.641,
"step": 542
},
{
"epoch": 1.1875341716785128,
"grad_norm": 0.28032809495925903,
"learning_rate": 0.0002014598540145985,
"loss": 0.5683,
"step": 543
},
{
"epoch": 1.1897211591033352,
"grad_norm": 0.42475053668022156,
"learning_rate": 0.00020121654501216544,
"loss": 0.7681,
"step": 544
},
{
"epoch": 1.1919081465281574,
"grad_norm": 0.3492116928100586,
"learning_rate": 0.00020097323600973233,
"loss": 0.6798,
"step": 545
},
{
"epoch": 1.1940951339529797,
"grad_norm": 0.358916699886322,
"learning_rate": 0.00020072992700729923,
"loss": 0.7502,
"step": 546
},
{
"epoch": 1.196282121377802,
"grad_norm": 0.27878785133361816,
"learning_rate": 0.00020048661800486619,
"loss": 0.7625,
"step": 547
},
{
"epoch": 1.1984691088026245,
"grad_norm": 0.29086047410964966,
"learning_rate": 0.00020024330900243308,
"loss": 0.6944,
"step": 548
},
{
"epoch": 1.2006560962274466,
"grad_norm": 0.2969072759151459,
"learning_rate": 0.00019999999999999998,
"loss": 0.7105,
"step": 549
},
{
"epoch": 1.202843083652269,
"grad_norm": 0.38667795062065125,
"learning_rate": 0.0001997566909975669,
"loss": 0.7046,
"step": 550
},
{
"epoch": 1.2050300710770914,
"grad_norm": 0.26905378699302673,
"learning_rate": 0.0001995133819951338,
"loss": 0.8177,
"step": 551
},
{
"epoch": 1.2072170585019135,
"grad_norm": 0.25222644209861755,
"learning_rate": 0.0001992700729927007,
"loss": 0.7232,
"step": 552
},
{
"epoch": 1.209404045926736,
"grad_norm": 0.23291464149951935,
"learning_rate": 0.00019902676399026763,
"loss": 0.6135,
"step": 553
},
{
"epoch": 1.2115910333515583,
"grad_norm": 0.24224941432476044,
"learning_rate": 0.00019878345498783453,
"loss": 0.6832,
"step": 554
},
{
"epoch": 1.2137780207763806,
"grad_norm": 0.2552938759326935,
"learning_rate": 0.00019854014598540143,
"loss": 0.7707,
"step": 555
},
{
"epoch": 1.2159650082012028,
"grad_norm": 0.3016825318336487,
"learning_rate": 0.00019829683698296835,
"loss": 0.6199,
"step": 556
},
{
"epoch": 1.2181519956260252,
"grad_norm": 0.2980547547340393,
"learning_rate": 0.00019805352798053528,
"loss": 0.7232,
"step": 557
},
{
"epoch": 1.2203389830508475,
"grad_norm": 0.3470471203327179,
"learning_rate": 0.00019781021897810218,
"loss": 0.6665,
"step": 558
},
{
"epoch": 1.2225259704756697,
"grad_norm": 0.2844526171684265,
"learning_rate": 0.0001975669099756691,
"loss": 0.5931,
"step": 559
},
{
"epoch": 1.224712957900492,
"grad_norm": 0.2751246988773346,
"learning_rate": 0.000197323600973236,
"loss": 0.6265,
"step": 560
},
{
"epoch": 1.2268999453253144,
"grad_norm": 0.2560863792896271,
"learning_rate": 0.0001970802919708029,
"loss": 0.6442,
"step": 561
},
{
"epoch": 1.2290869327501368,
"grad_norm": 0.28800928592681885,
"learning_rate": 0.00019683698296836982,
"loss": 0.7135,
"step": 562
},
{
"epoch": 1.231273920174959,
"grad_norm": 0.44916409254074097,
"learning_rate": 0.00019659367396593672,
"loss": 0.654,
"step": 563
},
{
"epoch": 1.2334609075997813,
"grad_norm": 0.28822582960128784,
"learning_rate": 0.00019635036496350362,
"loss": 0.7907,
"step": 564
},
{
"epoch": 1.2356478950246037,
"grad_norm": 0.3168655037879944,
"learning_rate": 0.00019610705596107055,
"loss": 0.6821,
"step": 565
},
{
"epoch": 1.2378348824494259,
"grad_norm": 0.24087372422218323,
"learning_rate": 0.00019586374695863744,
"loss": 0.5753,
"step": 566
},
{
"epoch": 1.2400218698742482,
"grad_norm": 0.28054556250572205,
"learning_rate": 0.00019562043795620434,
"loss": 0.7782,
"step": 567
},
{
"epoch": 1.2422088572990706,
"grad_norm": 0.2647920250892639,
"learning_rate": 0.00019537712895377127,
"loss": 0.672,
"step": 568
},
{
"epoch": 1.2443958447238928,
"grad_norm": 0.2773146331310272,
"learning_rate": 0.0001951338199513382,
"loss": 0.6951,
"step": 569
},
{
"epoch": 1.2465828321487151,
"grad_norm": 0.22990505397319794,
"learning_rate": 0.0001948905109489051,
"loss": 0.8364,
"step": 570
},
{
"epoch": 1.2487698195735375,
"grad_norm": 0.27569764852523804,
"learning_rate": 0.000194647201946472,
"loss": 0.7833,
"step": 571
},
{
"epoch": 1.2509568069983596,
"grad_norm": 0.2720679044723511,
"learning_rate": 0.00019440389294403892,
"loss": 0.6844,
"step": 572
},
{
"epoch": 1.253143794423182,
"grad_norm": 0.31944793462753296,
"learning_rate": 0.00019416058394160581,
"loss": 0.7761,
"step": 573
},
{
"epoch": 1.2553307818480044,
"grad_norm": 0.3249347507953644,
"learning_rate": 0.0001939172749391727,
"loss": 0.6429,
"step": 574
},
{
"epoch": 1.2575177692728268,
"grad_norm": 0.3601590692996979,
"learning_rate": 0.00019367396593673964,
"loss": 0.7387,
"step": 575
},
{
"epoch": 1.259704756697649,
"grad_norm": 0.30120986700057983,
"learning_rate": 0.00019343065693430654,
"loss": 0.7797,
"step": 576
},
{
"epoch": 1.2618917441224713,
"grad_norm": 0.2647385895252228,
"learning_rate": 0.00019318734793187344,
"loss": 0.6112,
"step": 577
},
{
"epoch": 1.2640787315472937,
"grad_norm": 0.2170192301273346,
"learning_rate": 0.0001929440389294404,
"loss": 0.6963,
"step": 578
},
{
"epoch": 1.2662657189721158,
"grad_norm": 0.23418468236923218,
"learning_rate": 0.0001927007299270073,
"loss": 0.7496,
"step": 579
},
{
"epoch": 1.2684527063969382,
"grad_norm": 0.29596206545829773,
"learning_rate": 0.00019245742092457418,
"loss": 0.8172,
"step": 580
},
{
"epoch": 1.2706396938217606,
"grad_norm": 0.2754702568054199,
"learning_rate": 0.0001922141119221411,
"loss": 0.6895,
"step": 581
},
{
"epoch": 1.272826681246583,
"grad_norm": 0.2041543573141098,
"learning_rate": 0.000191970802919708,
"loss": 0.7623,
"step": 582
},
{
"epoch": 1.275013668671405,
"grad_norm": 0.3801957964897156,
"learning_rate": 0.0001917274939172749,
"loss": 0.634,
"step": 583
},
{
"epoch": 1.2772006560962275,
"grad_norm": 0.39465653896331787,
"learning_rate": 0.00019148418491484183,
"loss": 0.6114,
"step": 584
},
{
"epoch": 1.2793876435210498,
"grad_norm": 0.36799028515815735,
"learning_rate": 0.00019124087591240873,
"loss": 0.757,
"step": 585
},
{
"epoch": 1.281574630945872,
"grad_norm": 0.2876284718513489,
"learning_rate": 0.00019099756690997563,
"loss": 0.6992,
"step": 586
},
{
"epoch": 1.2837616183706944,
"grad_norm": 0.4593120813369751,
"learning_rate": 0.00019075425790754258,
"loss": 0.6095,
"step": 587
},
{
"epoch": 1.2859486057955167,
"grad_norm": 0.24458545446395874,
"learning_rate": 0.00019051094890510948,
"loss": 0.5724,
"step": 588
},
{
"epoch": 1.288135593220339,
"grad_norm": 0.22930872440338135,
"learning_rate": 0.00019026763990267638,
"loss": 0.5479,
"step": 589
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.32167893648147583,
"learning_rate": 0.0001900243309002433,
"loss": 0.7158,
"step": 590
},
{
"epoch": 1.2925095680699836,
"grad_norm": 0.2847557067871094,
"learning_rate": 0.0001897810218978102,
"loss": 0.6545,
"step": 591
},
{
"epoch": 1.2946965554948058,
"grad_norm": 0.24358853697776794,
"learning_rate": 0.0001895377128953771,
"loss": 0.7497,
"step": 592
},
{
"epoch": 1.2968835429196282,
"grad_norm": 0.26657119393348694,
"learning_rate": 0.00018929440389294403,
"loss": 0.6816,
"step": 593
},
{
"epoch": 1.2990705303444505,
"grad_norm": 0.3368627727031708,
"learning_rate": 0.00018905109489051093,
"loss": 0.613,
"step": 594
},
{
"epoch": 1.301257517769273,
"grad_norm": 0.28971466422080994,
"learning_rate": 0.00018880778588807782,
"loss": 0.814,
"step": 595
},
{
"epoch": 1.3034445051940953,
"grad_norm": 0.3216496706008911,
"learning_rate": 0.00018856447688564478,
"loss": 0.7116,
"step": 596
},
{
"epoch": 1.3056314926189174,
"grad_norm": 0.25016555190086365,
"learning_rate": 0.00018832116788321167,
"loss": 0.7034,
"step": 597
},
{
"epoch": 1.3078184800437398,
"grad_norm": 0.2602551579475403,
"learning_rate": 0.00018807785888077857,
"loss": 0.6624,
"step": 598
},
{
"epoch": 1.310005467468562,
"grad_norm": 0.1847269982099533,
"learning_rate": 0.0001878345498783455,
"loss": 0.6645,
"step": 599
},
{
"epoch": 1.3121924548933843,
"grad_norm": 0.20593389868736267,
"learning_rate": 0.0001875912408759124,
"loss": 0.6471,
"step": 600
},
{
"epoch": 1.3143794423182067,
"grad_norm": 0.2651140093803406,
"learning_rate": 0.0001873479318734793,
"loss": 0.6743,
"step": 601
},
{
"epoch": 1.316566429743029,
"grad_norm": 0.3243972659111023,
"learning_rate": 0.00018710462287104622,
"loss": 0.662,
"step": 602
},
{
"epoch": 1.3187534171678512,
"grad_norm": 0.24702341854572296,
"learning_rate": 0.00018686131386861312,
"loss": 0.746,
"step": 603
},
{
"epoch": 1.3209404045926736,
"grad_norm": 0.25382477045059204,
"learning_rate": 0.00018661800486618002,
"loss": 0.7115,
"step": 604
},
{
"epoch": 1.323127392017496,
"grad_norm": 0.26453620195388794,
"learning_rate": 0.00018637469586374697,
"loss": 0.5843,
"step": 605
},
{
"epoch": 1.3253143794423181,
"grad_norm": 0.25161460041999817,
"learning_rate": 0.00018613138686131387,
"loss": 0.7831,
"step": 606
},
{
"epoch": 1.3275013668671405,
"grad_norm": 0.2947143316268921,
"learning_rate": 0.00018588807785888077,
"loss": 0.6277,
"step": 607
},
{
"epoch": 1.3296883542919629,
"grad_norm": 0.25893881916999817,
"learning_rate": 0.00018564476885644767,
"loss": 0.6816,
"step": 608
},
{
"epoch": 1.3318753417167852,
"grad_norm": 0.3958803713321686,
"learning_rate": 0.0001854014598540146,
"loss": 0.8033,
"step": 609
},
{
"epoch": 1.3340623291416074,
"grad_norm": 0.28083765506744385,
"learning_rate": 0.0001851581508515815,
"loss": 0.6587,
"step": 610
},
{
"epoch": 1.3362493165664298,
"grad_norm": 0.26417723298072815,
"learning_rate": 0.0001849148418491484,
"loss": 0.6867,
"step": 611
},
{
"epoch": 1.3384363039912521,
"grad_norm": 0.2628178000450134,
"learning_rate": 0.0001846715328467153,
"loss": 0.6275,
"step": 612
},
{
"epoch": 1.3406232914160743,
"grad_norm": 0.20500022172927856,
"learning_rate": 0.0001844282238442822,
"loss": 0.6152,
"step": 613
},
{
"epoch": 1.3428102788408967,
"grad_norm": 0.22486689686775208,
"learning_rate": 0.0001841849148418491,
"loss": 0.5407,
"step": 614
},
{
"epoch": 1.344997266265719,
"grad_norm": 0.3170478641986847,
"learning_rate": 0.00018394160583941606,
"loss": 0.7176,
"step": 615
},
{
"epoch": 1.3471842536905414,
"grad_norm": 0.34868374466896057,
"learning_rate": 0.00018369829683698296,
"loss": 0.5815,
"step": 616
},
{
"epoch": 1.3493712411153636,
"grad_norm": 0.2484477013349533,
"learning_rate": 0.00018345498783454986,
"loss": 0.6613,
"step": 617
},
{
"epoch": 1.351558228540186,
"grad_norm": 0.2799300253391266,
"learning_rate": 0.00018321167883211678,
"loss": 0.6685,
"step": 618
},
{
"epoch": 1.353745215965008,
"grad_norm": 0.28434398770332336,
"learning_rate": 0.00018296836982968368,
"loss": 0.7881,
"step": 619
},
{
"epoch": 1.3559322033898304,
"grad_norm": 0.25863373279571533,
"learning_rate": 0.00018272506082725058,
"loss": 0.7325,
"step": 620
},
{
"epoch": 1.3581191908146528,
"grad_norm": 0.3039908707141876,
"learning_rate": 0.0001824817518248175,
"loss": 0.8676,
"step": 621
},
{
"epoch": 1.3603061782394752,
"grad_norm": 0.29525163769721985,
"learning_rate": 0.0001822384428223844,
"loss": 0.8909,
"step": 622
},
{
"epoch": 1.3624931656642976,
"grad_norm": 0.475063294172287,
"learning_rate": 0.0001819951338199513,
"loss": 0.6882,
"step": 623
},
{
"epoch": 1.3646801530891197,
"grad_norm": 0.22500012814998627,
"learning_rate": 0.00018175182481751826,
"loss": 0.6354,
"step": 624
},
{
"epoch": 1.366867140513942,
"grad_norm": 0.24890188872814178,
"learning_rate": 0.00018150851581508515,
"loss": 0.5322,
"step": 625
},
{
"epoch": 1.3690541279387642,
"grad_norm": 0.24399027228355408,
"learning_rate": 0.00018126520681265205,
"loss": 0.7255,
"step": 626
},
{
"epoch": 1.3712411153635866,
"grad_norm": 0.32299381494522095,
"learning_rate": 0.00018102189781021898,
"loss": 0.5199,
"step": 627
},
{
"epoch": 1.373428102788409,
"grad_norm": 0.4946720600128174,
"learning_rate": 0.00018077858880778588,
"loss": 0.7099,
"step": 628
},
{
"epoch": 1.3756150902132314,
"grad_norm": 0.47641122341156006,
"learning_rate": 0.00018053527980535278,
"loss": 0.752,
"step": 629
},
{
"epoch": 1.3778020776380535,
"grad_norm": 0.3367193937301636,
"learning_rate": 0.0001802919708029197,
"loss": 0.7196,
"step": 630
},
{
"epoch": 1.3799890650628759,
"grad_norm": 0.27993133664131165,
"learning_rate": 0.0001800486618004866,
"loss": 0.7357,
"step": 631
},
{
"epoch": 1.3821760524876983,
"grad_norm": 0.27575206756591797,
"learning_rate": 0.0001798053527980535,
"loss": 0.6148,
"step": 632
},
{
"epoch": 1.3843630399125204,
"grad_norm": 0.33214282989501953,
"learning_rate": 0.00017956204379562042,
"loss": 0.771,
"step": 633
},
{
"epoch": 1.3865500273373428,
"grad_norm": 0.2970830798149109,
"learning_rate": 0.00017931873479318735,
"loss": 0.6882,
"step": 634
},
{
"epoch": 1.3887370147621652,
"grad_norm": 0.3435869812965393,
"learning_rate": 0.00017907542579075425,
"loss": 0.6992,
"step": 635
},
{
"epoch": 1.3909240021869875,
"grad_norm": 0.3328729569911957,
"learning_rate": 0.00017883211678832117,
"loss": 0.6594,
"step": 636
},
{
"epoch": 1.3931109896118097,
"grad_norm": 0.3031856119632721,
"learning_rate": 0.00017858880778588807,
"loss": 0.642,
"step": 637
},
{
"epoch": 1.395297977036632,
"grad_norm": 0.2761346399784088,
"learning_rate": 0.00017834549878345497,
"loss": 1.0442,
"step": 638
},
{
"epoch": 1.3974849644614544,
"grad_norm": 0.34098902344703674,
"learning_rate": 0.0001781021897810219,
"loss": 0.9509,
"step": 639
},
{
"epoch": 1.3996719518862766,
"grad_norm": 0.4181225299835205,
"learning_rate": 0.0001778588807785888,
"loss": 0.6521,
"step": 640
},
{
"epoch": 1.401858939311099,
"grad_norm": 0.2533126473426819,
"learning_rate": 0.0001776155717761557,
"loss": 0.6221,
"step": 641
},
{
"epoch": 1.4040459267359213,
"grad_norm": 0.25691646337509155,
"learning_rate": 0.00017737226277372262,
"loss": 0.5691,
"step": 642
},
{
"epoch": 1.4062329141607437,
"grad_norm": 0.2649155557155609,
"learning_rate": 0.00017712895377128952,
"loss": 0.614,
"step": 643
},
{
"epoch": 1.4084199015855658,
"grad_norm": 0.32973209023475647,
"learning_rate": 0.00017688564476885641,
"loss": 0.878,
"step": 644
},
{
"epoch": 1.4106068890103882,
"grad_norm": 0.3559141755104065,
"learning_rate": 0.00017664233576642334,
"loss": 0.7954,
"step": 645
},
{
"epoch": 1.4127938764352104,
"grad_norm": 0.2913306653499603,
"learning_rate": 0.00017639902676399026,
"loss": 0.735,
"step": 646
},
{
"epoch": 1.4149808638600327,
"grad_norm": 0.24183817207813263,
"learning_rate": 0.00017615571776155716,
"loss": 0.5965,
"step": 647
},
{
"epoch": 1.4171678512848551,
"grad_norm": 0.2638205885887146,
"learning_rate": 0.00017591240875912406,
"loss": 0.6843,
"step": 648
},
{
"epoch": 1.4193548387096775,
"grad_norm": 0.23057186603546143,
"learning_rate": 0.000175669099756691,
"loss": 0.7453,
"step": 649
},
{
"epoch": 1.4215418261344999,
"grad_norm": 0.22737360000610352,
"learning_rate": 0.00017542579075425789,
"loss": 0.5423,
"step": 650
},
{
"epoch": 1.423728813559322,
"grad_norm": 0.25872430205345154,
"learning_rate": 0.00017518248175182478,
"loss": 0.7591,
"step": 651
},
{
"epoch": 1.4259158009841444,
"grad_norm": 0.2998059391975403,
"learning_rate": 0.0001749391727493917,
"loss": 0.6222,
"step": 652
},
{
"epoch": 1.4281027884089665,
"grad_norm": 0.21351587772369385,
"learning_rate": 0.0001746958637469586,
"loss": 0.7082,
"step": 653
},
{
"epoch": 1.430289775833789,
"grad_norm": 0.34969425201416016,
"learning_rate": 0.0001744525547445255,
"loss": 0.6319,
"step": 654
},
{
"epoch": 1.4324767632586113,
"grad_norm": 0.2845169007778168,
"learning_rate": 0.00017420924574209246,
"loss": 0.6965,
"step": 655
},
{
"epoch": 1.4346637506834337,
"grad_norm": 0.2735065221786499,
"learning_rate": 0.00017396593673965936,
"loss": 0.6866,
"step": 656
},
{
"epoch": 1.4368507381082558,
"grad_norm": 0.2701031267642975,
"learning_rate": 0.00017372262773722626,
"loss": 0.8098,
"step": 657
},
{
"epoch": 1.4390377255330782,
"grad_norm": 0.319159597158432,
"learning_rate": 0.00017347931873479318,
"loss": 0.6627,
"step": 658
},
{
"epoch": 1.4412247129579006,
"grad_norm": 0.24762673676013947,
"learning_rate": 0.00017323600973236008,
"loss": 0.8179,
"step": 659
},
{
"epoch": 1.4434117003827227,
"grad_norm": 0.26977255940437317,
"learning_rate": 0.00017299270072992698,
"loss": 0.5487,
"step": 660
},
{
"epoch": 1.445598687807545,
"grad_norm": 0.25042101740837097,
"learning_rate": 0.0001727493917274939,
"loss": 0.9502,
"step": 661
},
{
"epoch": 1.4477856752323675,
"grad_norm": 0.28913062810897827,
"learning_rate": 0.0001725060827250608,
"loss": 0.7216,
"step": 662
},
{
"epoch": 1.4499726626571898,
"grad_norm": 0.3237348198890686,
"learning_rate": 0.0001722627737226277,
"loss": 0.7644,
"step": 663
},
{
"epoch": 1.452159650082012,
"grad_norm": 0.34338346123695374,
"learning_rate": 0.00017201946472019465,
"loss": 0.9851,
"step": 664
},
{
"epoch": 1.4543466375068343,
"grad_norm": 0.1985798180103302,
"learning_rate": 0.00017177615571776155,
"loss": 0.649,
"step": 665
},
{
"epoch": 1.4565336249316567,
"grad_norm": 0.2959745526313782,
"learning_rate": 0.00017153284671532845,
"loss": 0.8134,
"step": 666
},
{
"epoch": 1.4587206123564789,
"grad_norm": 0.28383585810661316,
"learning_rate": 0.00017128953771289537,
"loss": 0.6864,
"step": 667
},
{
"epoch": 1.4609075997813012,
"grad_norm": 0.35177820920944214,
"learning_rate": 0.00017104622871046227,
"loss": 0.779,
"step": 668
},
{
"epoch": 1.4630945872061236,
"grad_norm": 0.27833032608032227,
"learning_rate": 0.00017080291970802917,
"loss": 0.7377,
"step": 669
},
{
"epoch": 1.465281574630946,
"grad_norm": 0.26814982295036316,
"learning_rate": 0.0001705596107055961,
"loss": 0.6367,
"step": 670
},
{
"epoch": 1.4674685620557681,
"grad_norm": 0.29226943850517273,
"learning_rate": 0.000170316301703163,
"loss": 0.6674,
"step": 671
},
{
"epoch": 1.4696555494805905,
"grad_norm": 0.23404401540756226,
"learning_rate": 0.0001700729927007299,
"loss": 0.6187,
"step": 672
},
{
"epoch": 1.4718425369054127,
"grad_norm": 0.1943274289369583,
"learning_rate": 0.00016982968369829685,
"loss": 0.7886,
"step": 673
},
{
"epoch": 1.474029524330235,
"grad_norm": 0.2543155550956726,
"learning_rate": 0.00016958637469586374,
"loss": 0.8211,
"step": 674
},
{
"epoch": 1.4762165117550574,
"grad_norm": 0.34419891238212585,
"learning_rate": 0.00016934306569343064,
"loss": 0.7097,
"step": 675
},
{
"epoch": 1.4784034991798798,
"grad_norm": 0.3277907371520996,
"learning_rate": 0.00016909975669099757,
"loss": 0.6725,
"step": 676
},
{
"epoch": 1.4805904866047022,
"grad_norm": 0.21943743526935577,
"learning_rate": 0.00016885644768856447,
"loss": 0.6246,
"step": 677
},
{
"epoch": 1.4827774740295243,
"grad_norm": 0.6248902678489685,
"learning_rate": 0.00016861313868613137,
"loss": 0.8422,
"step": 678
},
{
"epoch": 1.4849644614543467,
"grad_norm": 0.3430839478969574,
"learning_rate": 0.0001683698296836983,
"loss": 0.7539,
"step": 679
},
{
"epoch": 1.4871514488791688,
"grad_norm": 0.25437131524086,
"learning_rate": 0.0001681265206812652,
"loss": 0.8793,
"step": 680
},
{
"epoch": 1.4893384363039912,
"grad_norm": 0.44833317399024963,
"learning_rate": 0.0001678832116788321,
"loss": 0.7591,
"step": 681
},
{
"epoch": 1.4915254237288136,
"grad_norm": 0.359467089176178,
"learning_rate": 0.00016763990267639899,
"loss": 0.6912,
"step": 682
},
{
"epoch": 1.493712411153636,
"grad_norm": 0.3209226429462433,
"learning_rate": 0.00016739659367396594,
"loss": 0.6292,
"step": 683
},
{
"epoch": 1.495899398578458,
"grad_norm": 0.30807530879974365,
"learning_rate": 0.00016715328467153284,
"loss": 0.7619,
"step": 684
},
{
"epoch": 1.4980863860032805,
"grad_norm": 0.38420820236206055,
"learning_rate": 0.00016690997566909974,
"loss": 0.7212,
"step": 685
},
{
"epoch": 1.5002733734281026,
"grad_norm": 0.27499136328697205,
"learning_rate": 0.00016666666666666666,
"loss": 0.7246,
"step": 686
},
{
"epoch": 1.502460360852925,
"grad_norm": 0.3359529376029968,
"learning_rate": 0.00016642335766423356,
"loss": 0.7988,
"step": 687
},
{
"epoch": 1.5046473482777474,
"grad_norm": 0.2965240180492401,
"learning_rate": 0.00016618004866180046,
"loss": 0.5721,
"step": 688
},
{
"epoch": 1.5068343357025697,
"grad_norm": 0.35766786336898804,
"learning_rate": 0.00016593673965936738,
"loss": 0.8168,
"step": 689
},
{
"epoch": 1.5090213231273921,
"grad_norm": 0.2500085234642029,
"learning_rate": 0.00016569343065693428,
"loss": 0.7125,
"step": 690
},
{
"epoch": 1.5112083105522143,
"grad_norm": 0.4028027355670929,
"learning_rate": 0.00016545012165450118,
"loss": 0.8912,
"step": 691
},
{
"epoch": 1.5133952979770366,
"grad_norm": 0.365488737821579,
"learning_rate": 0.00016520681265206813,
"loss": 0.8114,
"step": 692
},
{
"epoch": 1.5155822854018588,
"grad_norm": 0.2998720109462738,
"learning_rate": 0.00016496350364963503,
"loss": 0.7185,
"step": 693
},
{
"epoch": 1.5177692728266812,
"grad_norm": 0.31432968378067017,
"learning_rate": 0.00016472019464720193,
"loss": 0.6455,
"step": 694
},
{
"epoch": 1.5199562602515035,
"grad_norm": 0.23023012280464172,
"learning_rate": 0.00016447688564476886,
"loss": 0.5255,
"step": 695
},
{
"epoch": 1.522143247676326,
"grad_norm": 0.3279372453689575,
"learning_rate": 0.00016423357664233575,
"loss": 0.696,
"step": 696
},
{
"epoch": 1.5243302351011483,
"grad_norm": 0.3116084635257721,
"learning_rate": 0.00016399026763990265,
"loss": 0.6297,
"step": 697
},
{
"epoch": 1.5265172225259704,
"grad_norm": 0.2646781802177429,
"learning_rate": 0.00016374695863746958,
"loss": 0.7854,
"step": 698
},
{
"epoch": 1.5287042099507928,
"grad_norm": 0.29048752784729004,
"learning_rate": 0.00016350364963503648,
"loss": 0.6409,
"step": 699
},
{
"epoch": 1.530891197375615,
"grad_norm": 0.2570263743400574,
"learning_rate": 0.00016326034063260337,
"loss": 0.6613,
"step": 700
},
{
"epoch": 1.5330781848004373,
"grad_norm": 0.3784395456314087,
"learning_rate": 0.00016301703163017033,
"loss": 0.5857,
"step": 701
},
{
"epoch": 1.5352651722252597,
"grad_norm": 0.3324502110481262,
"learning_rate": 0.00016277372262773723,
"loss": 0.7317,
"step": 702
},
{
"epoch": 1.537452159650082,
"grad_norm": 0.2623542249202728,
"learning_rate": 0.00016253041362530412,
"loss": 0.648,
"step": 703
},
{
"epoch": 1.5396391470749045,
"grad_norm": 0.31035107374191284,
"learning_rate": 0.00016228710462287105,
"loss": 0.8125,
"step": 704
},
{
"epoch": 1.5418261344997266,
"grad_norm": 0.35497644543647766,
"learning_rate": 0.00016204379562043795,
"loss": 0.7798,
"step": 705
},
{
"epoch": 1.544013121924549,
"grad_norm": 0.4693346321582794,
"learning_rate": 0.00016180048661800485,
"loss": 0.7838,
"step": 706
},
{
"epoch": 1.5462001093493711,
"grad_norm": 0.2803730368614197,
"learning_rate": 0.00016155717761557177,
"loss": 0.9113,
"step": 707
},
{
"epoch": 1.5483870967741935,
"grad_norm": 0.3578079342842102,
"learning_rate": 0.00016131386861313867,
"loss": 0.6923,
"step": 708
},
{
"epoch": 1.5505740841990159,
"grad_norm": 0.29390111565589905,
"learning_rate": 0.00016107055961070557,
"loss": 0.8407,
"step": 709
},
{
"epoch": 1.5527610716238383,
"grad_norm": 0.32291004061698914,
"learning_rate": 0.0001608272506082725,
"loss": 0.8082,
"step": 710
},
{
"epoch": 1.5549480590486606,
"grad_norm": 0.2640690803527832,
"learning_rate": 0.00016058394160583942,
"loss": 0.6813,
"step": 711
},
{
"epoch": 1.5571350464734828,
"grad_norm": 0.32076698541641235,
"learning_rate": 0.00016034063260340632,
"loss": 0.8319,
"step": 712
},
{
"epoch": 1.559322033898305,
"grad_norm": 0.29734277725219727,
"learning_rate": 0.00016009732360097324,
"loss": 0.9649,
"step": 713
},
{
"epoch": 1.5615090213231273,
"grad_norm": 0.3353315591812134,
"learning_rate": 0.00015985401459854014,
"loss": 0.6102,
"step": 714
},
{
"epoch": 1.5636960087479497,
"grad_norm": 0.24924345314502716,
"learning_rate": 0.00015961070559610704,
"loss": 0.6868,
"step": 715
},
{
"epoch": 1.565882996172772,
"grad_norm": 0.21561355888843536,
"learning_rate": 0.00015936739659367397,
"loss": 0.6087,
"step": 716
},
{
"epoch": 1.5680699835975944,
"grad_norm": 0.28856387734413147,
"learning_rate": 0.00015912408759124086,
"loss": 0.7849,
"step": 717
},
{
"epoch": 1.5702569710224166,
"grad_norm": 0.2342023402452469,
"learning_rate": 0.00015888077858880776,
"loss": 0.8097,
"step": 718
},
{
"epoch": 1.572443958447239,
"grad_norm": 0.27620434761047363,
"learning_rate": 0.00015863746958637466,
"loss": 0.6495,
"step": 719
},
{
"epoch": 1.574630945872061,
"grad_norm": 0.3575909733772278,
"learning_rate": 0.00015839416058394159,
"loss": 0.5667,
"step": 720
},
{
"epoch": 1.5768179332968835,
"grad_norm": 0.29075026512145996,
"learning_rate": 0.00015815085158150848,
"loss": 0.734,
"step": 721
},
{
"epoch": 1.5790049207217058,
"grad_norm": 0.317648321390152,
"learning_rate": 0.0001579075425790754,
"loss": 0.6881,
"step": 722
},
{
"epoch": 1.5811919081465282,
"grad_norm": 0.2477569282054901,
"learning_rate": 0.00015766423357664234,
"loss": 0.7097,
"step": 723
},
{
"epoch": 1.5833788955713506,
"grad_norm": 0.2733086347579956,
"learning_rate": 0.00015742092457420923,
"loss": 0.4836,
"step": 724
},
{
"epoch": 1.5855658829961727,
"grad_norm": 0.32278919219970703,
"learning_rate": 0.00015717761557177613,
"loss": 0.6931,
"step": 725
},
{
"epoch": 1.587752870420995,
"grad_norm": 0.2804641127586365,
"learning_rate": 0.00015693430656934306,
"loss": 0.6908,
"step": 726
},
{
"epoch": 1.5899398578458173,
"grad_norm": 0.28953608870506287,
"learning_rate": 0.00015669099756690996,
"loss": 0.7086,
"step": 727
},
{
"epoch": 1.5921268452706396,
"grad_norm": 0.21297629177570343,
"learning_rate": 0.00015644768856447685,
"loss": 0.6663,
"step": 728
},
{
"epoch": 1.594313832695462,
"grad_norm": 0.23495450615882874,
"learning_rate": 0.00015620437956204378,
"loss": 0.7177,
"step": 729
},
{
"epoch": 1.5965008201202844,
"grad_norm": 0.4271846413612366,
"learning_rate": 0.00015596107055961068,
"loss": 0.9376,
"step": 730
},
{
"epoch": 1.5986878075451068,
"grad_norm": 0.3190995156764984,
"learning_rate": 0.00015571776155717758,
"loss": 0.5957,
"step": 731
},
{
"epoch": 1.600874794969929,
"grad_norm": 0.3533025085926056,
"learning_rate": 0.00015547445255474453,
"loss": 0.8295,
"step": 732
},
{
"epoch": 1.6030617823947513,
"grad_norm": 0.48731425404548645,
"learning_rate": 0.00015523114355231143,
"loss": 0.7024,
"step": 733
},
{
"epoch": 1.6052487698195734,
"grad_norm": 0.2876966595649719,
"learning_rate": 0.00015498783454987833,
"loss": 0.6858,
"step": 734
},
{
"epoch": 1.6074357572443958,
"grad_norm": 0.2668203115463257,
"learning_rate": 0.00015474452554744525,
"loss": 0.7548,
"step": 735
},
{
"epoch": 1.6096227446692182,
"grad_norm": 0.3176876902580261,
"learning_rate": 0.00015450121654501215,
"loss": 0.7124,
"step": 736
},
{
"epoch": 1.6118097320940405,
"grad_norm": 0.3083260655403137,
"learning_rate": 0.00015425790754257905,
"loss": 0.682,
"step": 737
},
{
"epoch": 1.613996719518863,
"grad_norm": 0.38110706210136414,
"learning_rate": 0.00015401459854014597,
"loss": 0.9364,
"step": 738
},
{
"epoch": 1.616183706943685,
"grad_norm": 0.2112010270357132,
"learning_rate": 0.00015377128953771287,
"loss": 0.6111,
"step": 739
},
{
"epoch": 1.6183706943685072,
"grad_norm": 0.320754736661911,
"learning_rate": 0.00015352798053527977,
"loss": 0.8463,
"step": 740
},
{
"epoch": 1.6205576817933296,
"grad_norm": 0.2661709785461426,
"learning_rate": 0.00015328467153284672,
"loss": 0.6922,
"step": 741
},
{
"epoch": 1.622744669218152,
"grad_norm": 0.28991788625717163,
"learning_rate": 0.00015304136253041362,
"loss": 0.683,
"step": 742
},
{
"epoch": 1.6249316566429743,
"grad_norm": 0.23085246980190277,
"learning_rate": 0.00015279805352798052,
"loss": 0.6098,
"step": 743
},
{
"epoch": 1.6271186440677967,
"grad_norm": 0.3355705440044403,
"learning_rate": 0.00015255474452554745,
"loss": 0.7358,
"step": 744
},
{
"epoch": 1.6293056314926189,
"grad_norm": 0.2608512341976166,
"learning_rate": 0.00015231143552311434,
"loss": 0.6872,
"step": 745
},
{
"epoch": 1.6314926189174412,
"grad_norm": 0.28092092275619507,
"learning_rate": 0.00015206812652068124,
"loss": 0.7605,
"step": 746
},
{
"epoch": 1.6336796063422634,
"grad_norm": 0.3571244776248932,
"learning_rate": 0.00015182481751824817,
"loss": 0.5481,
"step": 747
},
{
"epoch": 1.6358665937670858,
"grad_norm": 0.30611398816108704,
"learning_rate": 0.00015158150851581507,
"loss": 0.6696,
"step": 748
},
{
"epoch": 1.6380535811919081,
"grad_norm": 0.32783061265945435,
"learning_rate": 0.00015133819951338196,
"loss": 0.8286,
"step": 749
},
{
"epoch": 1.6402405686167305,
"grad_norm": 0.2778065502643585,
"learning_rate": 0.00015109489051094892,
"loss": 0.6223,
"step": 750
},
{
"epoch": 1.6424275560415529,
"grad_norm": 0.2809867262840271,
"learning_rate": 0.00015085158150851582,
"loss": 0.4979,
"step": 751
},
{
"epoch": 1.644614543466375,
"grad_norm": 0.3469402492046356,
"learning_rate": 0.00015060827250608271,
"loss": 0.7277,
"step": 752
},
{
"epoch": 1.6468015308911974,
"grad_norm": 0.33360373973846436,
"learning_rate": 0.00015036496350364964,
"loss": 0.7133,
"step": 753
},
{
"epoch": 1.6489885183160196,
"grad_norm": 0.24966338276863098,
"learning_rate": 0.00015012165450121654,
"loss": 0.8344,
"step": 754
},
{
"epoch": 1.651175505740842,
"grad_norm": 0.35595226287841797,
"learning_rate": 0.00014987834549878344,
"loss": 0.5492,
"step": 755
},
{
"epoch": 1.6533624931656643,
"grad_norm": 0.36205926537513733,
"learning_rate": 0.00014963503649635036,
"loss": 0.6962,
"step": 756
},
{
"epoch": 1.6555494805904867,
"grad_norm": 0.3373574912548065,
"learning_rate": 0.00014939172749391726,
"loss": 0.9455,
"step": 757
},
{
"epoch": 1.657736468015309,
"grad_norm": 0.2560804486274719,
"learning_rate": 0.00014914841849148416,
"loss": 0.6532,
"step": 758
},
{
"epoch": 1.6599234554401312,
"grad_norm": 0.3424091339111328,
"learning_rate": 0.00014890510948905108,
"loss": 0.7255,
"step": 759
},
{
"epoch": 1.6621104428649536,
"grad_norm": 0.3578891456127167,
"learning_rate": 0.000148661800486618,
"loss": 0.689,
"step": 760
},
{
"epoch": 1.6642974302897757,
"grad_norm": 0.2998923659324646,
"learning_rate": 0.0001484184914841849,
"loss": 0.8305,
"step": 761
},
{
"epoch": 1.666484417714598,
"grad_norm": 0.29691943526268005,
"learning_rate": 0.0001481751824817518,
"loss": 0.5745,
"step": 762
},
{
"epoch": 1.6686714051394205,
"grad_norm": 0.26453182101249695,
"learning_rate": 0.00014793187347931873,
"loss": 0.6202,
"step": 763
},
{
"epoch": 1.6708583925642428,
"grad_norm": 0.24131835997104645,
"learning_rate": 0.00014768856447688563,
"loss": 0.8149,
"step": 764
},
{
"epoch": 1.6730453799890652,
"grad_norm": 0.5507832169532776,
"learning_rate": 0.00014744525547445256,
"loss": 0.7544,
"step": 765
},
{
"epoch": 1.6752323674138874,
"grad_norm": 0.3100571930408478,
"learning_rate": 0.00014720194647201945,
"loss": 0.6096,
"step": 766
},
{
"epoch": 1.6774193548387095,
"grad_norm": 0.40742942690849304,
"learning_rate": 0.00014695863746958635,
"loss": 0.8001,
"step": 767
},
{
"epoch": 1.679606342263532,
"grad_norm": 0.26272064447402954,
"learning_rate": 0.00014671532846715328,
"loss": 0.6614,
"step": 768
},
{
"epoch": 1.6817933296883543,
"grad_norm": 0.3485982418060303,
"learning_rate": 0.00014647201946472018,
"loss": 0.7596,
"step": 769
},
{
"epoch": 1.6839803171131766,
"grad_norm": 0.3311547636985779,
"learning_rate": 0.0001462287104622871,
"loss": 0.808,
"step": 770
},
{
"epoch": 1.686167304537999,
"grad_norm": 0.28489449620246887,
"learning_rate": 0.000145985401459854,
"loss": 0.683,
"step": 771
},
{
"epoch": 1.6883542919628212,
"grad_norm": 0.23958906531333923,
"learning_rate": 0.0001457420924574209,
"loss": 0.619,
"step": 772
},
{
"epoch": 1.6905412793876435,
"grad_norm": 0.2665773034095764,
"learning_rate": 0.00014549878345498782,
"loss": 0.7169,
"step": 773
},
{
"epoch": 1.6927282668124657,
"grad_norm": 0.33576110005378723,
"learning_rate": 0.00014525547445255475,
"loss": 0.7457,
"step": 774
},
{
"epoch": 1.694915254237288,
"grad_norm": 0.3103754222393036,
"learning_rate": 0.00014501216545012165,
"loss": 0.7083,
"step": 775
},
{
"epoch": 1.6971022416621104,
"grad_norm": 0.27746620774269104,
"learning_rate": 0.00014476885644768855,
"loss": 0.7648,
"step": 776
},
{
"epoch": 1.6992892290869328,
"grad_norm": 0.3597886264324188,
"learning_rate": 0.00014452554744525547,
"loss": 0.8173,
"step": 777
},
{
"epoch": 1.7014762165117552,
"grad_norm": 0.2408217489719391,
"learning_rate": 0.00014428223844282237,
"loss": 0.5872,
"step": 778
},
{
"epoch": 1.7036632039365773,
"grad_norm": 0.24239328503608704,
"learning_rate": 0.0001440389294403893,
"loss": 0.6311,
"step": 779
},
{
"epoch": 1.7058501913613997,
"grad_norm": 0.4606420695781708,
"learning_rate": 0.0001437956204379562,
"loss": 0.6742,
"step": 780
},
{
"epoch": 1.7080371787862219,
"grad_norm": 0.2773914933204651,
"learning_rate": 0.0001435523114355231,
"loss": 0.4933,
"step": 781
},
{
"epoch": 1.7102241662110442,
"grad_norm": 0.33102571964263916,
"learning_rate": 0.00014330900243309002,
"loss": 0.7694,
"step": 782
},
{
"epoch": 1.7124111536358666,
"grad_norm": 0.3455331027507782,
"learning_rate": 0.00014306569343065692,
"loss": 0.5662,
"step": 783
},
{
"epoch": 1.714598141060689,
"grad_norm": 0.28522560000419617,
"learning_rate": 0.00014282238442822384,
"loss": 0.799,
"step": 784
},
{
"epoch": 1.7167851284855113,
"grad_norm": 0.3302403688430786,
"learning_rate": 0.00014257907542579074,
"loss": 0.8366,
"step": 785
},
{
"epoch": 1.7189721159103335,
"grad_norm": 0.2695009410381317,
"learning_rate": 0.00014233576642335764,
"loss": 0.5889,
"step": 786
},
{
"epoch": 1.7211591033351559,
"grad_norm": 0.2292398363351822,
"learning_rate": 0.00014209245742092456,
"loss": 0.519,
"step": 787
},
{
"epoch": 1.723346090759978,
"grad_norm": 0.2863897383213043,
"learning_rate": 0.0001418491484184915,
"loss": 0.6394,
"step": 788
},
{
"epoch": 1.7255330781848004,
"grad_norm": 1.8092900514602661,
"learning_rate": 0.0001416058394160584,
"loss": 0.6393,
"step": 789
},
{
"epoch": 1.7277200656096228,
"grad_norm": 0.3296603262424469,
"learning_rate": 0.00014136253041362529,
"loss": 0.7414,
"step": 790
},
{
"epoch": 1.7299070530344451,
"grad_norm": 0.36179548501968384,
"learning_rate": 0.0001411192214111922,
"loss": 0.7689,
"step": 791
},
{
"epoch": 1.7320940404592675,
"grad_norm": 0.3196108937263489,
"learning_rate": 0.0001408759124087591,
"loss": 0.681,
"step": 792
},
{
"epoch": 1.7342810278840897,
"grad_norm": 0.3329809010028839,
"learning_rate": 0.000140632603406326,
"loss": 0.7421,
"step": 793
},
{
"epoch": 1.7364680153089118,
"grad_norm": 0.22216172516345978,
"learning_rate": 0.00014038929440389293,
"loss": 0.6421,
"step": 794
},
{
"epoch": 1.7386550027337342,
"grad_norm": 0.33266568183898926,
"learning_rate": 0.00014014598540145983,
"loss": 0.5699,
"step": 795
},
{
"epoch": 1.7408419901585566,
"grad_norm": 0.3858932852745056,
"learning_rate": 0.00013990267639902676,
"loss": 0.7368,
"step": 796
},
{
"epoch": 1.743028977583379,
"grad_norm": 0.3091468811035156,
"learning_rate": 0.00013965936739659366,
"loss": 0.6334,
"step": 797
},
{
"epoch": 1.7452159650082013,
"grad_norm": 0.3596084415912628,
"learning_rate": 0.00013941605839416055,
"loss": 0.6,
"step": 798
},
{
"epoch": 1.7474029524330235,
"grad_norm": 0.2971950173377991,
"learning_rate": 0.00013917274939172748,
"loss": 0.6638,
"step": 799
},
{
"epoch": 1.7495899398578458,
"grad_norm": 0.36204877495765686,
"learning_rate": 0.0001389294403892944,
"loss": 0.6704,
"step": 800
},
{
"epoch": 1.751776927282668,
"grad_norm": 0.25178369879722595,
"learning_rate": 0.0001386861313868613,
"loss": 0.6057,
"step": 801
},
{
"epoch": 1.7539639147074904,
"grad_norm": 0.2541144788265228,
"learning_rate": 0.0001384428223844282,
"loss": 0.6294,
"step": 802
},
{
"epoch": 1.7561509021323127,
"grad_norm": 0.31337326765060425,
"learning_rate": 0.0001381995133819951,
"loss": 0.7991,
"step": 803
},
{
"epoch": 1.758337889557135,
"grad_norm": 0.8276956081390381,
"learning_rate": 0.00013795620437956203,
"loss": 0.9111,
"step": 804
},
{
"epoch": 1.7605248769819575,
"grad_norm": 0.2656904458999634,
"learning_rate": 0.00013771289537712895,
"loss": 0.7048,
"step": 805
},
{
"epoch": 1.7627118644067796,
"grad_norm": 0.3123759627342224,
"learning_rate": 0.00013746958637469585,
"loss": 0.816,
"step": 806
},
{
"epoch": 1.764898851831602,
"grad_norm": 0.28710535168647766,
"learning_rate": 0.00013722627737226275,
"loss": 0.7998,
"step": 807
},
{
"epoch": 1.7670858392564242,
"grad_norm": 0.28171730041503906,
"learning_rate": 0.00013698296836982967,
"loss": 0.6835,
"step": 808
},
{
"epoch": 1.7692728266812465,
"grad_norm": 0.42397668957710266,
"learning_rate": 0.00013673965936739657,
"loss": 0.6875,
"step": 809
},
{
"epoch": 1.771459814106069,
"grad_norm": 0.309830904006958,
"learning_rate": 0.0001364963503649635,
"loss": 0.7446,
"step": 810
},
{
"epoch": 1.7736468015308913,
"grad_norm": 0.3108932375907898,
"learning_rate": 0.0001362530413625304,
"loss": 0.6415,
"step": 811
},
{
"epoch": 1.7758337889557136,
"grad_norm": 0.34336167573928833,
"learning_rate": 0.0001360097323600973,
"loss": 0.688,
"step": 812
},
{
"epoch": 1.7780207763805358,
"grad_norm": 0.2871513366699219,
"learning_rate": 0.00013576642335766422,
"loss": 0.8814,
"step": 813
},
{
"epoch": 1.7802077638053582,
"grad_norm": 0.24412307143211365,
"learning_rate": 0.00013552311435523115,
"loss": 0.6767,
"step": 814
},
{
"epoch": 1.7823947512301803,
"grad_norm": 0.3574623167514801,
"learning_rate": 0.00013527980535279804,
"loss": 0.7016,
"step": 815
},
{
"epoch": 1.7845817386550027,
"grad_norm": 0.4434225261211395,
"learning_rate": 0.00013503649635036494,
"loss": 0.6373,
"step": 816
},
{
"epoch": 1.786768726079825,
"grad_norm": 0.5134851932525635,
"learning_rate": 0.00013479318734793187,
"loss": 0.6622,
"step": 817
},
{
"epoch": 1.7889557135046474,
"grad_norm": 0.4768081307411194,
"learning_rate": 0.00013454987834549877,
"loss": 0.7665,
"step": 818
},
{
"epoch": 1.7911427009294698,
"grad_norm": 0.2798459231853485,
"learning_rate": 0.0001343065693430657,
"loss": 0.6625,
"step": 819
},
{
"epoch": 1.793329688354292,
"grad_norm": 0.27218303084373474,
"learning_rate": 0.0001340632603406326,
"loss": 0.6266,
"step": 820
},
{
"epoch": 1.7955166757791141,
"grad_norm": 0.287860244512558,
"learning_rate": 0.0001338199513381995,
"loss": 0.9758,
"step": 821
},
{
"epoch": 1.7977036632039365,
"grad_norm": 0.26204392313957214,
"learning_rate": 0.00013357664233576641,
"loss": 0.532,
"step": 822
},
{
"epoch": 1.7998906506287589,
"grad_norm": 0.29923009872436523,
"learning_rate": 0.0001333333333333333,
"loss": 0.6961,
"step": 823
},
{
"epoch": 1.8020776380535812,
"grad_norm": 0.34140443801879883,
"learning_rate": 0.00013309002433090024,
"loss": 0.8296,
"step": 824
},
{
"epoch": 1.8042646254784036,
"grad_norm": 0.2605873644351959,
"learning_rate": 0.00013284671532846714,
"loss": 0.8329,
"step": 825
},
{
"epoch": 1.8064516129032258,
"grad_norm": 0.36522653698921204,
"learning_rate": 0.00013260340632603403,
"loss": 0.8552,
"step": 826
},
{
"epoch": 1.8086386003280481,
"grad_norm": 0.29043689370155334,
"learning_rate": 0.00013236009732360096,
"loss": 0.7261,
"step": 827
},
{
"epoch": 1.8108255877528703,
"grad_norm": 0.2861742675304413,
"learning_rate": 0.00013211678832116789,
"loss": 0.596,
"step": 828
},
{
"epoch": 1.8130125751776927,
"grad_norm": 0.34066513180732727,
"learning_rate": 0.00013187347931873478,
"loss": 0.8127,
"step": 829
},
{
"epoch": 1.815199562602515,
"grad_norm": 0.3166887164115906,
"learning_rate": 0.00013163017031630168,
"loss": 0.7491,
"step": 830
},
{
"epoch": 1.8173865500273374,
"grad_norm": 0.36282384395599365,
"learning_rate": 0.0001313868613138686,
"loss": 0.7511,
"step": 831
},
{
"epoch": 1.8195735374521598,
"grad_norm": 0.36424878239631653,
"learning_rate": 0.0001311435523114355,
"loss": 0.938,
"step": 832
},
{
"epoch": 1.821760524876982,
"grad_norm": 0.3587567210197449,
"learning_rate": 0.00013090024330900243,
"loss": 0.8294,
"step": 833
},
{
"epoch": 1.8239475123018043,
"grad_norm": 0.3000282049179077,
"learning_rate": 0.00013065693430656933,
"loss": 0.7178,
"step": 834
},
{
"epoch": 1.8261344997266264,
"grad_norm": 0.2934707999229431,
"learning_rate": 0.00013041362530413623,
"loss": 0.7185,
"step": 835
},
{
"epoch": 1.8283214871514488,
"grad_norm": 0.26312437653541565,
"learning_rate": 0.00013017031630170315,
"loss": 0.6128,
"step": 836
},
{
"epoch": 1.8305084745762712,
"grad_norm": 0.27557966113090515,
"learning_rate": 0.00012992700729927008,
"loss": 0.6751,
"step": 837
},
{
"epoch": 1.8326954620010936,
"grad_norm": 0.296512633562088,
"learning_rate": 0.00012968369829683698,
"loss": 0.8259,
"step": 838
},
{
"epoch": 1.834882449425916,
"grad_norm": 0.4524163007736206,
"learning_rate": 0.00012944038929440388,
"loss": 0.6811,
"step": 839
},
{
"epoch": 1.837069436850738,
"grad_norm": 0.32787275314331055,
"learning_rate": 0.00012919708029197077,
"loss": 0.6882,
"step": 840
},
{
"epoch": 1.8392564242755605,
"grad_norm": 0.26250511407852173,
"learning_rate": 0.0001289537712895377,
"loss": 0.6858,
"step": 841
},
{
"epoch": 1.8414434117003826,
"grad_norm": 0.32813650369644165,
"learning_rate": 0.00012871046228710463,
"loss": 0.5929,
"step": 842
},
{
"epoch": 1.843630399125205,
"grad_norm": 0.3023451864719391,
"learning_rate": 0.00012846715328467152,
"loss": 0.7795,
"step": 843
},
{
"epoch": 1.8458173865500274,
"grad_norm": 0.3112645745277405,
"learning_rate": 0.00012822384428223842,
"loss": 0.517,
"step": 844
},
{
"epoch": 1.8480043739748497,
"grad_norm": 0.6681469678878784,
"learning_rate": 0.00012798053527980535,
"loss": 0.7089,
"step": 845
},
{
"epoch": 1.850191361399672,
"grad_norm": 0.2592954933643341,
"learning_rate": 0.00012773722627737225,
"loss": 0.7007,
"step": 846
},
{
"epoch": 1.8523783488244943,
"grad_norm": 0.31619131565093994,
"learning_rate": 0.00012749391727493917,
"loss": 0.4884,
"step": 847
},
{
"epoch": 1.8545653362493164,
"grad_norm": 0.3551687002182007,
"learning_rate": 0.00012725060827250607,
"loss": 0.5677,
"step": 848
},
{
"epoch": 1.8567523236741388,
"grad_norm": 0.32219335436820984,
"learning_rate": 0.00012700729927007297,
"loss": 0.6744,
"step": 849
},
{
"epoch": 1.8589393110989612,
"grad_norm": 0.28793492913246155,
"learning_rate": 0.0001267639902676399,
"loss": 0.6258,
"step": 850
},
{
"epoch": 1.8611262985237835,
"grad_norm": 0.382720410823822,
"learning_rate": 0.00012652068126520682,
"loss": 0.7977,
"step": 851
},
{
"epoch": 1.863313285948606,
"grad_norm": 0.33804479241371155,
"learning_rate": 0.00012627737226277372,
"loss": 0.7254,
"step": 852
},
{
"epoch": 1.865500273373428,
"grad_norm": 0.3259097635746002,
"learning_rate": 0.00012603406326034062,
"loss": 0.8729,
"step": 853
},
{
"epoch": 1.8676872607982504,
"grad_norm": 0.3584567606449127,
"learning_rate": 0.00012579075425790754,
"loss": 0.7337,
"step": 854
},
{
"epoch": 1.8698742482230726,
"grad_norm": 0.336674302816391,
"learning_rate": 0.00012554744525547444,
"loss": 0.6829,
"step": 855
},
{
"epoch": 1.872061235647895,
"grad_norm": 0.49990177154541016,
"learning_rate": 0.00012530413625304137,
"loss": 0.7793,
"step": 856
},
{
"epoch": 1.8742482230727173,
"grad_norm": 0.31498992443084717,
"learning_rate": 0.00012506082725060826,
"loss": 0.7355,
"step": 857
},
{
"epoch": 1.8764352104975397,
"grad_norm": 0.3050641119480133,
"learning_rate": 0.00012481751824817516,
"loss": 0.6473,
"step": 858
},
{
"epoch": 1.878622197922362,
"grad_norm": 0.27067434787750244,
"learning_rate": 0.0001245742092457421,
"loss": 0.6639,
"step": 859
},
{
"epoch": 1.8808091853471842,
"grad_norm": 0.29407691955566406,
"learning_rate": 0.000124330900243309,
"loss": 0.8002,
"step": 860
},
{
"epoch": 1.8829961727720066,
"grad_norm": 0.3786459267139435,
"learning_rate": 0.0001240875912408759,
"loss": 0.8694,
"step": 861
},
{
"epoch": 1.8851831601968287,
"grad_norm": 0.3678539991378784,
"learning_rate": 0.0001238442822384428,
"loss": 0.7188,
"step": 862
},
{
"epoch": 1.8873701476216511,
"grad_norm": 0.3660300076007843,
"learning_rate": 0.0001236009732360097,
"loss": 0.7348,
"step": 863
},
{
"epoch": 1.8895571350464735,
"grad_norm": 0.34265831112861633,
"learning_rate": 0.00012335766423357663,
"loss": 0.7046,
"step": 864
},
{
"epoch": 1.8917441224712959,
"grad_norm": 0.3664507567882538,
"learning_rate": 0.00012311435523114356,
"loss": 0.777,
"step": 865
},
{
"epoch": 1.8939311098961182,
"grad_norm": 0.36169371008872986,
"learning_rate": 0.00012287104622871046,
"loss": 0.6797,
"step": 866
},
{
"epoch": 1.8961180973209404,
"grad_norm": 0.2904834449291229,
"learning_rate": 0.00012262773722627736,
"loss": 0.6406,
"step": 867
},
{
"epoch": 1.8983050847457628,
"grad_norm": 0.3194887340068817,
"learning_rate": 0.00012238442822384428,
"loss": 0.7477,
"step": 868
},
{
"epoch": 1.900492072170585,
"grad_norm": 0.24546030163764954,
"learning_rate": 0.00012214111922141118,
"loss": 0.6013,
"step": 869
},
{
"epoch": 1.9026790595954073,
"grad_norm": 0.2817955017089844,
"learning_rate": 0.00012189781021897809,
"loss": 0.7813,
"step": 870
},
{
"epoch": 1.9048660470202297,
"grad_norm": 0.28798621892929077,
"learning_rate": 0.000121654501216545,
"loss": 0.6312,
"step": 871
},
{
"epoch": 1.907053034445052,
"grad_norm": 0.22041471302509308,
"learning_rate": 0.0001214111922141119,
"loss": 0.6671,
"step": 872
},
{
"epoch": 1.9092400218698744,
"grad_norm": 0.45332956314086914,
"learning_rate": 0.00012116788321167883,
"loss": 0.7519,
"step": 873
},
{
"epoch": 1.9114270092946966,
"grad_norm": 0.2907330393791199,
"learning_rate": 0.00012092457420924574,
"loss": 0.7048,
"step": 874
},
{
"epoch": 1.9136139967195187,
"grad_norm": 0.3308665156364441,
"learning_rate": 0.00012068126520681264,
"loss": 0.6583,
"step": 875
},
{
"epoch": 1.915800984144341,
"grad_norm": 0.314803808927536,
"learning_rate": 0.00012043795620437955,
"loss": 0.7902,
"step": 876
},
{
"epoch": 1.9179879715691635,
"grad_norm": 0.47894173860549927,
"learning_rate": 0.00012019464720194645,
"loss": 0.7153,
"step": 877
},
{
"epoch": 1.9201749589939858,
"grad_norm": 0.2984611392021179,
"learning_rate": 0.00011995133819951337,
"loss": 0.6093,
"step": 878
},
{
"epoch": 1.9223619464188082,
"grad_norm": 0.5481080412864685,
"learning_rate": 0.00011970802919708029,
"loss": 0.7026,
"step": 879
},
{
"epoch": 1.9245489338436303,
"grad_norm": 0.4306366443634033,
"learning_rate": 0.00011946472019464718,
"loss": 0.8093,
"step": 880
},
{
"epoch": 1.9267359212684527,
"grad_norm": 0.4765607416629791,
"learning_rate": 0.0001192214111922141,
"loss": 0.8378,
"step": 881
},
{
"epoch": 1.9289229086932749,
"grad_norm": 0.29230380058288574,
"learning_rate": 0.00011897810218978102,
"loss": 0.812,
"step": 882
},
{
"epoch": 1.9311098961180972,
"grad_norm": 0.27519696950912476,
"learning_rate": 0.00011873479318734792,
"loss": 0.7204,
"step": 883
},
{
"epoch": 1.9332968835429196,
"grad_norm": 0.43257808685302734,
"learning_rate": 0.00011849148418491483,
"loss": 0.7484,
"step": 884
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.34764620661735535,
"learning_rate": 0.00011824817518248174,
"loss": 0.7835,
"step": 885
},
{
"epoch": 1.9376708583925644,
"grad_norm": 0.2872960567474365,
"learning_rate": 0.00011800486618004864,
"loss": 0.6871,
"step": 886
},
{
"epoch": 1.9398578458173865,
"grad_norm": 0.3657885491847992,
"learning_rate": 0.00011776155717761557,
"loss": 0.7439,
"step": 887
},
{
"epoch": 1.942044833242209,
"grad_norm": 0.3176083564758301,
"learning_rate": 0.00011751824817518248,
"loss": 0.6768,
"step": 888
},
{
"epoch": 1.944231820667031,
"grad_norm": 0.2851628363132477,
"learning_rate": 0.00011727493917274938,
"loss": 0.6673,
"step": 889
},
{
"epoch": 1.9464188080918534,
"grad_norm": 0.2601426839828491,
"learning_rate": 0.00011703163017031629,
"loss": 0.6025,
"step": 890
},
{
"epoch": 1.9486057955166758,
"grad_norm": 0.282064288854599,
"learning_rate": 0.0001167883211678832,
"loss": 0.7084,
"step": 891
},
{
"epoch": 1.9507927829414982,
"grad_norm": 0.2761860191822052,
"learning_rate": 0.0001165450121654501,
"loss": 0.7596,
"step": 892
},
{
"epoch": 1.9529797703663205,
"grad_norm": 0.28319042921066284,
"learning_rate": 0.00011630170316301703,
"loss": 0.6179,
"step": 893
},
{
"epoch": 1.9551667577911427,
"grad_norm": 0.3847699761390686,
"learning_rate": 0.00011605839416058394,
"loss": 0.7964,
"step": 894
},
{
"epoch": 1.957353745215965,
"grad_norm": 0.5719382762908936,
"learning_rate": 0.00011581508515815084,
"loss": 0.7848,
"step": 895
},
{
"epoch": 1.9595407326407872,
"grad_norm": 0.24546296894550323,
"learning_rate": 0.00011557177615571775,
"loss": 0.7404,
"step": 896
},
{
"epoch": 1.9617277200656096,
"grad_norm": 0.2359631359577179,
"learning_rate": 0.00011532846715328465,
"loss": 0.6091,
"step": 897
},
{
"epoch": 1.963914707490432,
"grad_norm": 0.23529179394245148,
"learning_rate": 0.00011508515815085157,
"loss": 0.7032,
"step": 898
},
{
"epoch": 1.9661016949152543,
"grad_norm": 0.32363957166671753,
"learning_rate": 0.00011484184914841848,
"loss": 0.7238,
"step": 899
},
{
"epoch": 1.9682886823400767,
"grad_norm": 0.24427059292793274,
"learning_rate": 0.00011459854014598538,
"loss": 0.6704,
"step": 900
},
{
"epoch": 1.9704756697648989,
"grad_norm": 0.39608168601989746,
"learning_rate": 0.0001143552311435523,
"loss": 0.7251,
"step": 901
},
{
"epoch": 1.972662657189721,
"grad_norm": 0.2778458297252655,
"learning_rate": 0.00011411192214111922,
"loss": 0.6907,
"step": 902
},
{
"epoch": 1.9748496446145434,
"grad_norm": 0.38359907269477844,
"learning_rate": 0.00011386861313868612,
"loss": 0.792,
"step": 903
},
{
"epoch": 1.9770366320393657,
"grad_norm": 0.2692561149597168,
"learning_rate": 0.00011362530413625303,
"loss": 0.505,
"step": 904
},
{
"epoch": 1.9792236194641881,
"grad_norm": 0.35147660970687866,
"learning_rate": 0.00011338199513381994,
"loss": 0.6847,
"step": 905
},
{
"epoch": 1.9814106068890105,
"grad_norm": 0.3441888689994812,
"learning_rate": 0.00011313868613138684,
"loss": 0.7633,
"step": 906
},
{
"epoch": 1.9835975943138326,
"grad_norm": 0.22528661787509918,
"learning_rate": 0.00011289537712895377,
"loss": 0.6367,
"step": 907
},
{
"epoch": 1.985784581738655,
"grad_norm": 0.34356188774108887,
"learning_rate": 0.00011265206812652068,
"loss": 0.8377,
"step": 908
},
{
"epoch": 1.9879715691634772,
"grad_norm": 0.3173167109489441,
"learning_rate": 0.00011240875912408758,
"loss": 0.6651,
"step": 909
},
{
"epoch": 1.9901585565882995,
"grad_norm": 0.2497638314962387,
"learning_rate": 0.00011216545012165449,
"loss": 0.7402,
"step": 910
},
{
"epoch": 1.992345544013122,
"grad_norm": 0.28941065073013306,
"learning_rate": 0.00011192214111922141,
"loss": 0.7328,
"step": 911
},
{
"epoch": 1.9945325314379443,
"grad_norm": 0.3209066092967987,
"learning_rate": 0.00011167883211678831,
"loss": 0.6639,
"step": 912
},
{
"epoch": 1.9967195188627667,
"grad_norm": 0.2646278142929077,
"learning_rate": 0.00011143552311435522,
"loss": 0.6795,
"step": 913
},
{
"epoch": 1.9989065062875888,
"grad_norm": 0.25543129444122314,
"learning_rate": 0.00011119221411192212,
"loss": 0.711,
"step": 914
},
{
"epoch": 2.001093493712411,
"grad_norm": 0.37120577692985535,
"learning_rate": 0.00011094890510948904,
"loss": 0.909,
"step": 915
},
{
"epoch": 2.0032804811372333,
"grad_norm": 0.20501375198364258,
"learning_rate": 0.00011070559610705596,
"loss": 0.5982,
"step": 916
},
{
"epoch": 2.0054674685620557,
"grad_norm": 0.2816307544708252,
"learning_rate": 0.00011046228710462286,
"loss": 0.6477,
"step": 917
},
{
"epoch": 2.007654455986878,
"grad_norm": 0.23481379449367523,
"learning_rate": 0.00011021897810218977,
"loss": 0.701,
"step": 918
},
{
"epoch": 2.0098414434117005,
"grad_norm": 0.22269988059997559,
"learning_rate": 0.00010997566909975668,
"loss": 0.4909,
"step": 919
},
{
"epoch": 2.012028430836523,
"grad_norm": 0.22761498391628265,
"learning_rate": 0.00010973236009732358,
"loss": 0.5446,
"step": 920
},
{
"epoch": 2.014215418261345,
"grad_norm": 0.38109347224235535,
"learning_rate": 0.00010948905109489051,
"loss": 0.7502,
"step": 921
},
{
"epoch": 2.016402405686167,
"grad_norm": 0.26273003220558167,
"learning_rate": 0.00010924574209245742,
"loss": 0.8272,
"step": 922
},
{
"epoch": 2.0185893931109895,
"grad_norm": 0.2501181960105896,
"learning_rate": 0.00010900243309002432,
"loss": 0.6668,
"step": 923
},
{
"epoch": 2.020776380535812,
"grad_norm": 0.2221994698047638,
"learning_rate": 0.00010875912408759123,
"loss": 0.5899,
"step": 924
},
{
"epoch": 2.0229633679606343,
"grad_norm": 0.26471519470214844,
"learning_rate": 0.00010851581508515814,
"loss": 0.491,
"step": 925
},
{
"epoch": 2.0251503553854566,
"grad_norm": 0.29527121782302856,
"learning_rate": 0.00010827250608272505,
"loss": 0.6478,
"step": 926
},
{
"epoch": 2.027337342810279,
"grad_norm": 0.2646641135215759,
"learning_rate": 0.00010802919708029196,
"loss": 0.6052,
"step": 927
},
{
"epoch": 2.029524330235101,
"grad_norm": 0.2731557786464691,
"learning_rate": 0.00010778588807785888,
"loss": 0.7211,
"step": 928
},
{
"epoch": 2.0317113176599233,
"grad_norm": 0.32770606875419617,
"learning_rate": 0.00010754257907542578,
"loss": 0.777,
"step": 929
},
{
"epoch": 2.0338983050847457,
"grad_norm": 0.2406987100839615,
"learning_rate": 0.00010729927007299269,
"loss": 0.6697,
"step": 930
},
{
"epoch": 2.036085292509568,
"grad_norm": 0.2938626706600189,
"learning_rate": 0.00010705596107055961,
"loss": 0.7645,
"step": 931
},
{
"epoch": 2.0382722799343904,
"grad_norm": 0.25775012373924255,
"learning_rate": 0.00010681265206812651,
"loss": 0.721,
"step": 932
},
{
"epoch": 2.040459267359213,
"grad_norm": 0.3010717034339905,
"learning_rate": 0.00010656934306569342,
"loss": 0.565,
"step": 933
},
{
"epoch": 2.042646254784035,
"grad_norm": 0.27577218413352966,
"learning_rate": 0.00010632603406326032,
"loss": 0.5764,
"step": 934
},
{
"epoch": 2.044833242208857,
"grad_norm": 0.3049190938472748,
"learning_rate": 0.00010608272506082723,
"loss": 0.8492,
"step": 935
},
{
"epoch": 2.0470202296336795,
"grad_norm": 0.3621160686016083,
"learning_rate": 0.00010583941605839416,
"loss": 0.668,
"step": 936
},
{
"epoch": 2.049207217058502,
"grad_norm": 0.28885042667388916,
"learning_rate": 0.00010559610705596106,
"loss": 0.6898,
"step": 937
},
{
"epoch": 2.051394204483324,
"grad_norm": 0.38116586208343506,
"learning_rate": 0.00010535279805352797,
"loss": 0.8778,
"step": 938
},
{
"epoch": 2.0535811919081466,
"grad_norm": 0.3027772903442383,
"learning_rate": 0.00010510948905109488,
"loss": 0.6428,
"step": 939
},
{
"epoch": 2.055768179332969,
"grad_norm": 0.20893897116184235,
"learning_rate": 0.00010486618004866178,
"loss": 0.6471,
"step": 940
},
{
"epoch": 2.0579551667577913,
"grad_norm": 0.281434565782547,
"learning_rate": 0.0001046228710462287,
"loss": 0.6593,
"step": 941
},
{
"epoch": 2.0601421541826133,
"grad_norm": 0.3276302218437195,
"learning_rate": 0.00010437956204379562,
"loss": 0.6077,
"step": 942
},
{
"epoch": 2.0623291416074356,
"grad_norm": 0.35327035188674927,
"learning_rate": 0.00010413625304136252,
"loss": 0.5687,
"step": 943
},
{
"epoch": 2.064516129032258,
"grad_norm": 0.3210618197917938,
"learning_rate": 0.00010389294403892943,
"loss": 0.6685,
"step": 944
},
{
"epoch": 2.0667031164570804,
"grad_norm": 0.25362011790275574,
"learning_rate": 0.00010364963503649635,
"loss": 0.5067,
"step": 945
},
{
"epoch": 2.0688901038819028,
"grad_norm": 0.2774200439453125,
"learning_rate": 0.00010340632603406325,
"loss": 0.7696,
"step": 946
},
{
"epoch": 2.071077091306725,
"grad_norm": 0.39397120475769043,
"learning_rate": 0.00010316301703163016,
"loss": 0.7109,
"step": 947
},
{
"epoch": 2.0732640787315475,
"grad_norm": 0.2712627947330475,
"learning_rate": 0.00010291970802919708,
"loss": 0.5855,
"step": 948
},
{
"epoch": 2.0754510661563694,
"grad_norm": 0.20961184799671173,
"learning_rate": 0.00010267639902676397,
"loss": 0.6223,
"step": 949
},
{
"epoch": 2.077638053581192,
"grad_norm": 0.35785865783691406,
"learning_rate": 0.0001024330900243309,
"loss": 0.6426,
"step": 950
},
{
"epoch": 2.079825041006014,
"grad_norm": 0.30317097902297974,
"learning_rate": 0.0001021897810218978,
"loss": 0.5881,
"step": 951
},
{
"epoch": 2.0820120284308365,
"grad_norm": 0.2647455632686615,
"learning_rate": 0.00010194647201946471,
"loss": 0.4753,
"step": 952
},
{
"epoch": 2.084199015855659,
"grad_norm": 0.2377641350030899,
"learning_rate": 0.00010170316301703162,
"loss": 0.7245,
"step": 953
},
{
"epoch": 2.0863860032804813,
"grad_norm": 0.4126327633857727,
"learning_rate": 0.00010145985401459852,
"loss": 0.7418,
"step": 954
},
{
"epoch": 2.0885729907053037,
"grad_norm": 0.372079998254776,
"learning_rate": 0.00010121654501216545,
"loss": 0.5861,
"step": 955
},
{
"epoch": 2.0907599781301256,
"grad_norm": 0.35693153738975525,
"learning_rate": 0.00010097323600973236,
"loss": 0.63,
"step": 956
},
{
"epoch": 2.092946965554948,
"grad_norm": 0.3220914304256439,
"learning_rate": 0.00010072992700729926,
"loss": 0.6541,
"step": 957
},
{
"epoch": 2.0951339529797703,
"grad_norm": 0.28749874234199524,
"learning_rate": 0.00010048661800486617,
"loss": 0.5944,
"step": 958
},
{
"epoch": 2.0973209404045927,
"grad_norm": 0.27125856280326843,
"learning_rate": 0.00010024330900243309,
"loss": 0.546,
"step": 959
},
{
"epoch": 2.099507927829415,
"grad_norm": 0.32414090633392334,
"learning_rate": 9.999999999999999e-05,
"loss": 0.5295,
"step": 960
},
{
"epoch": 2.1016949152542375,
"grad_norm": 0.37579938769340515,
"learning_rate": 9.97566909975669e-05,
"loss": 0.6202,
"step": 961
},
{
"epoch": 2.1038819026790594,
"grad_norm": 0.3326401710510254,
"learning_rate": 9.951338199513382e-05,
"loss": 0.5674,
"step": 962
},
{
"epoch": 2.1060688901038818,
"grad_norm": 0.2777692377567291,
"learning_rate": 9.927007299270071e-05,
"loss": 0.5297,
"step": 963
},
{
"epoch": 2.108255877528704,
"grad_norm": 0.3658103942871094,
"learning_rate": 9.902676399026764e-05,
"loss": 0.6001,
"step": 964
},
{
"epoch": 2.1104428649535265,
"grad_norm": 0.30180448293685913,
"learning_rate": 9.878345498783455e-05,
"loss": 0.627,
"step": 965
},
{
"epoch": 2.112629852378349,
"grad_norm": 0.3160865604877472,
"learning_rate": 9.854014598540145e-05,
"loss": 0.6583,
"step": 966
},
{
"epoch": 2.1148168398031713,
"grad_norm": 0.38876181840896606,
"learning_rate": 9.829683698296836e-05,
"loss": 0.7201,
"step": 967
},
{
"epoch": 2.1170038272279936,
"grad_norm": 0.32533615827560425,
"learning_rate": 9.805352798053527e-05,
"loss": 0.5814,
"step": 968
},
{
"epoch": 2.1191908146528156,
"grad_norm": 0.2723495662212372,
"learning_rate": 9.781021897810217e-05,
"loss": 0.7299,
"step": 969
},
{
"epoch": 2.121377802077638,
"grad_norm": 0.3380286693572998,
"learning_rate": 9.75669099756691e-05,
"loss": 0.8313,
"step": 970
},
{
"epoch": 2.1235647895024603,
"grad_norm": 0.3675851821899414,
"learning_rate": 9.7323600973236e-05,
"loss": 0.5859,
"step": 971
},
{
"epoch": 2.1257517769272827,
"grad_norm": 0.32205119729042053,
"learning_rate": 9.708029197080291e-05,
"loss": 0.78,
"step": 972
},
{
"epoch": 2.127938764352105,
"grad_norm": 0.3244129419326782,
"learning_rate": 9.683698296836982e-05,
"loss": 0.6777,
"step": 973
},
{
"epoch": 2.1301257517769274,
"grad_norm": 0.3449605405330658,
"learning_rate": 9.659367396593672e-05,
"loss": 0.654,
"step": 974
},
{
"epoch": 2.13231273920175,
"grad_norm": 0.3051266670227051,
"learning_rate": 9.635036496350364e-05,
"loss": 0.6204,
"step": 975
},
{
"epoch": 2.1344997266265717,
"grad_norm": 0.29881876707077026,
"learning_rate": 9.610705596107056e-05,
"loss": 0.4543,
"step": 976
},
{
"epoch": 2.136686714051394,
"grad_norm": 0.2953018546104431,
"learning_rate": 9.586374695863745e-05,
"loss": 0.7972,
"step": 977
},
{
"epoch": 2.1388737014762165,
"grad_norm": 0.3214372992515564,
"learning_rate": 9.562043795620437e-05,
"loss": 0.6216,
"step": 978
},
{
"epoch": 2.141060688901039,
"grad_norm": 0.31700441241264343,
"learning_rate": 9.537712895377129e-05,
"loss": 0.5708,
"step": 979
},
{
"epoch": 2.143247676325861,
"grad_norm": 0.3516302704811096,
"learning_rate": 9.513381995133819e-05,
"loss": 0.7428,
"step": 980
},
{
"epoch": 2.1454346637506836,
"grad_norm": 0.278621643781662,
"learning_rate": 9.48905109489051e-05,
"loss": 0.5118,
"step": 981
},
{
"epoch": 2.1476216511755055,
"grad_norm": 0.39558589458465576,
"learning_rate": 9.464720194647201e-05,
"loss": 0.6228,
"step": 982
},
{
"epoch": 2.149808638600328,
"grad_norm": 0.2623763382434845,
"learning_rate": 9.440389294403891e-05,
"loss": 0.5621,
"step": 983
},
{
"epoch": 2.1519956260251503,
"grad_norm": 0.3559738099575043,
"learning_rate": 9.416058394160584e-05,
"loss": 0.6367,
"step": 984
},
{
"epoch": 2.1541826134499726,
"grad_norm": 0.34260550141334534,
"learning_rate": 9.391727493917275e-05,
"loss": 0.6587,
"step": 985
},
{
"epoch": 2.156369600874795,
"grad_norm": 0.3602772057056427,
"learning_rate": 9.367396593673965e-05,
"loss": 0.6749,
"step": 986
},
{
"epoch": 2.1585565882996174,
"grad_norm": 0.4492672383785248,
"learning_rate": 9.343065693430656e-05,
"loss": 0.6159,
"step": 987
},
{
"epoch": 2.1607435757244398,
"grad_norm": 0.30676203966140747,
"learning_rate": 9.318734793187348e-05,
"loss": 0.7105,
"step": 988
},
{
"epoch": 2.1629305631492617,
"grad_norm": 0.2810410261154175,
"learning_rate": 9.294403892944038e-05,
"loss": 0.7091,
"step": 989
},
{
"epoch": 2.165117550574084,
"grad_norm": 0.3161092698574066,
"learning_rate": 9.27007299270073e-05,
"loss": 0.6866,
"step": 990
},
{
"epoch": 2.1673045379989064,
"grad_norm": 0.30391326546669006,
"learning_rate": 9.24574209245742e-05,
"loss": 0.6473,
"step": 991
},
{
"epoch": 2.169491525423729,
"grad_norm": 0.33336496353149414,
"learning_rate": 9.22141119221411e-05,
"loss": 0.7565,
"step": 992
},
{
"epoch": 2.171678512848551,
"grad_norm": 0.27083349227905273,
"learning_rate": 9.197080291970803e-05,
"loss": 0.602,
"step": 993
},
{
"epoch": 2.1738655002733736,
"grad_norm": 0.3847806751728058,
"learning_rate": 9.172749391727493e-05,
"loss": 0.6034,
"step": 994
},
{
"epoch": 2.176052487698196,
"grad_norm": 0.334309846162796,
"learning_rate": 9.148418491484184e-05,
"loss": 0.7368,
"step": 995
},
{
"epoch": 2.178239475123018,
"grad_norm": 0.4568588435649872,
"learning_rate": 9.124087591240875e-05,
"loss": 0.6723,
"step": 996
},
{
"epoch": 2.1804264625478402,
"grad_norm": 0.23190492391586304,
"learning_rate": 9.099756690997565e-05,
"loss": 0.5024,
"step": 997
},
{
"epoch": 2.1826134499726626,
"grad_norm": 0.4212368130683899,
"learning_rate": 9.075425790754258e-05,
"loss": 0.5137,
"step": 998
},
{
"epoch": 2.184800437397485,
"grad_norm": 0.3017450273036957,
"learning_rate": 9.051094890510949e-05,
"loss": 0.659,
"step": 999
},
{
"epoch": 2.1869874248223073,
"grad_norm": 0.32203611731529236,
"learning_rate": 9.026763990267639e-05,
"loss": 0.6198,
"step": 1000
},
{
"epoch": 2.1891744122471297,
"grad_norm": 0.308056503534317,
"learning_rate": 9.00243309002433e-05,
"loss": 0.5798,
"step": 1001
},
{
"epoch": 2.191361399671952,
"grad_norm": 0.32163482904434204,
"learning_rate": 8.978102189781021e-05,
"loss": 0.4909,
"step": 1002
},
{
"epoch": 2.193548387096774,
"grad_norm": 0.28082406520843506,
"learning_rate": 8.953771289537712e-05,
"loss": 0.5911,
"step": 1003
},
{
"epoch": 2.1957353745215964,
"grad_norm": 0.3853447139263153,
"learning_rate": 8.929440389294404e-05,
"loss": 0.601,
"step": 1004
},
{
"epoch": 2.1979223619464188,
"grad_norm": 0.27736788988113403,
"learning_rate": 8.905109489051095e-05,
"loss": 0.5391,
"step": 1005
},
{
"epoch": 2.200109349371241,
"grad_norm": 0.3074529767036438,
"learning_rate": 8.880778588807785e-05,
"loss": 0.5264,
"step": 1006
},
{
"epoch": 2.2022963367960635,
"grad_norm": 0.34355053305625916,
"learning_rate": 8.856447688564476e-05,
"loss": 0.5479,
"step": 1007
},
{
"epoch": 2.204483324220886,
"grad_norm": 0.25875043869018555,
"learning_rate": 8.832116788321167e-05,
"loss": 0.5133,
"step": 1008
},
{
"epoch": 2.2066703116457083,
"grad_norm": 0.4600970447063446,
"learning_rate": 8.807785888077858e-05,
"loss": 0.7145,
"step": 1009
},
{
"epoch": 2.20885729907053,
"grad_norm": 0.4292985796928406,
"learning_rate": 8.78345498783455e-05,
"loss": 0.8484,
"step": 1010
},
{
"epoch": 2.2110442864953526,
"grad_norm": 0.38896313309669495,
"learning_rate": 8.759124087591239e-05,
"loss": 0.8592,
"step": 1011
},
{
"epoch": 2.213231273920175,
"grad_norm": 0.32829031348228455,
"learning_rate": 8.73479318734793e-05,
"loss": 0.711,
"step": 1012
},
{
"epoch": 2.2154182613449973,
"grad_norm": 0.32850679755210876,
"learning_rate": 8.710462287104623e-05,
"loss": 0.6644,
"step": 1013
},
{
"epoch": 2.2176052487698197,
"grad_norm": 0.3872655928134918,
"learning_rate": 8.686131386861313e-05,
"loss": 0.7039,
"step": 1014
},
{
"epoch": 2.219792236194642,
"grad_norm": 0.39074549078941345,
"learning_rate": 8.661800486618004e-05,
"loss": 0.6316,
"step": 1015
},
{
"epoch": 2.221979223619464,
"grad_norm": 0.33514949679374695,
"learning_rate": 8.637469586374695e-05,
"loss": 0.7362,
"step": 1016
},
{
"epoch": 2.2241662110442864,
"grad_norm": 0.37822842597961426,
"learning_rate": 8.613138686131385e-05,
"loss": 0.8549,
"step": 1017
},
{
"epoch": 2.2263531984691087,
"grad_norm": 0.2988075911998749,
"learning_rate": 8.588807785888078e-05,
"loss": 0.6768,
"step": 1018
},
{
"epoch": 2.228540185893931,
"grad_norm": 0.3298238515853882,
"learning_rate": 8.564476885644769e-05,
"loss": 0.661,
"step": 1019
},
{
"epoch": 2.2307271733187535,
"grad_norm": 0.3168882429599762,
"learning_rate": 8.540145985401459e-05,
"loss": 0.5899,
"step": 1020
},
{
"epoch": 2.232914160743576,
"grad_norm": 0.32149139046669006,
"learning_rate": 8.51581508515815e-05,
"loss": 0.6377,
"step": 1021
},
{
"epoch": 2.235101148168398,
"grad_norm": 0.3840494453907013,
"learning_rate": 8.491484184914842e-05,
"loss": 0.5914,
"step": 1022
},
{
"epoch": 2.23728813559322,
"grad_norm": 0.36953312158584595,
"learning_rate": 8.467153284671532e-05,
"loss": 0.6954,
"step": 1023
},
{
"epoch": 2.2394751230180425,
"grad_norm": 0.3132734000682831,
"learning_rate": 8.442822384428223e-05,
"loss": 0.6778,
"step": 1024
},
{
"epoch": 2.241662110442865,
"grad_norm": 0.3022383153438568,
"learning_rate": 8.418491484184915e-05,
"loss": 0.5681,
"step": 1025
},
{
"epoch": 2.2438490978676873,
"grad_norm": 0.33297014236450195,
"learning_rate": 8.394160583941604e-05,
"loss": 1.0015,
"step": 1026
},
{
"epoch": 2.2460360852925096,
"grad_norm": 0.2536577582359314,
"learning_rate": 8.369829683698297e-05,
"loss": 0.6535,
"step": 1027
},
{
"epoch": 2.248223072717332,
"grad_norm": 0.3168553113937378,
"learning_rate": 8.345498783454987e-05,
"loss": 0.4617,
"step": 1028
},
{
"epoch": 2.250410060142154,
"grad_norm": 0.41692110896110535,
"learning_rate": 8.321167883211678e-05,
"loss": 0.6289,
"step": 1029
},
{
"epoch": 2.2525970475669763,
"grad_norm": 0.31276077032089233,
"learning_rate": 8.296836982968369e-05,
"loss": 0.6558,
"step": 1030
},
{
"epoch": 2.2547840349917987,
"grad_norm": 0.382587730884552,
"learning_rate": 8.272506082725059e-05,
"loss": 0.7024,
"step": 1031
},
{
"epoch": 2.256971022416621,
"grad_norm": 0.37239089608192444,
"learning_rate": 8.248175182481752e-05,
"loss": 0.6428,
"step": 1032
},
{
"epoch": 2.2591580098414434,
"grad_norm": 0.3444945216178894,
"learning_rate": 8.223844282238443e-05,
"loss": 0.8301,
"step": 1033
},
{
"epoch": 2.261344997266266,
"grad_norm": 0.32943612337112427,
"learning_rate": 8.199513381995133e-05,
"loss": 0.8259,
"step": 1034
},
{
"epoch": 2.263531984691088,
"grad_norm": 0.3256615996360779,
"learning_rate": 8.175182481751824e-05,
"loss": 0.5633,
"step": 1035
},
{
"epoch": 2.26571897211591,
"grad_norm": 0.38470467925071716,
"learning_rate": 8.150851581508516e-05,
"loss": 0.8342,
"step": 1036
},
{
"epoch": 2.2679059595407325,
"grad_norm": 0.3568199872970581,
"learning_rate": 8.126520681265206e-05,
"loss": 0.6949,
"step": 1037
},
{
"epoch": 2.270092946965555,
"grad_norm": 0.4587413966655731,
"learning_rate": 8.102189781021897e-05,
"loss": 0.855,
"step": 1038
},
{
"epoch": 2.2722799343903772,
"grad_norm": 0.3806265890598297,
"learning_rate": 8.077858880778589e-05,
"loss": 0.7383,
"step": 1039
},
{
"epoch": 2.2744669218151996,
"grad_norm": 0.34413963556289673,
"learning_rate": 8.053527980535278e-05,
"loss": 0.7618,
"step": 1040
},
{
"epoch": 2.276653909240022,
"grad_norm": 0.41507622599601746,
"learning_rate": 8.029197080291971e-05,
"loss": 0.6976,
"step": 1041
},
{
"epoch": 2.2788408966648444,
"grad_norm": 0.3527161777019501,
"learning_rate": 8.004866180048662e-05,
"loss": 0.6337,
"step": 1042
},
{
"epoch": 2.2810278840896663,
"grad_norm": 0.405584454536438,
"learning_rate": 7.980535279805352e-05,
"loss": 0.8183,
"step": 1043
},
{
"epoch": 2.2832148715144887,
"grad_norm": 0.41590583324432373,
"learning_rate": 7.956204379562043e-05,
"loss": 0.8062,
"step": 1044
},
{
"epoch": 2.285401858939311,
"grad_norm": 0.41613471508026123,
"learning_rate": 7.931873479318733e-05,
"loss": 0.6246,
"step": 1045
},
{
"epoch": 2.2875888463641334,
"grad_norm": 0.44034960865974426,
"learning_rate": 7.907542579075424e-05,
"loss": 0.8375,
"step": 1046
},
{
"epoch": 2.2897758337889558,
"grad_norm": 0.3828635811805725,
"learning_rate": 7.883211678832117e-05,
"loss": 0.8442,
"step": 1047
},
{
"epoch": 2.291962821213778,
"grad_norm": 0.3389468491077423,
"learning_rate": 7.858880778588807e-05,
"loss": 0.7997,
"step": 1048
},
{
"epoch": 2.2941498086386005,
"grad_norm": 0.33413904905319214,
"learning_rate": 7.834549878345498e-05,
"loss": 0.6141,
"step": 1049
},
{
"epoch": 2.2963367960634224,
"grad_norm": 0.32505419850349426,
"learning_rate": 7.810218978102189e-05,
"loss": 0.5001,
"step": 1050
},
{
"epoch": 2.298523783488245,
"grad_norm": 0.3244943618774414,
"learning_rate": 7.785888077858879e-05,
"loss": 0.6723,
"step": 1051
},
{
"epoch": 2.300710770913067,
"grad_norm": 0.3737221658229828,
"learning_rate": 7.761557177615571e-05,
"loss": 0.7168,
"step": 1052
},
{
"epoch": 2.3028977583378896,
"grad_norm": 0.4390661120414734,
"learning_rate": 7.737226277372263e-05,
"loss": 0.5277,
"step": 1053
},
{
"epoch": 2.305084745762712,
"grad_norm": 0.42460954189300537,
"learning_rate": 7.712895377128952e-05,
"loss": 0.7353,
"step": 1054
},
{
"epoch": 2.3072717331875343,
"grad_norm": 0.3381803035736084,
"learning_rate": 7.688564476885644e-05,
"loss": 0.6313,
"step": 1055
},
{
"epoch": 2.3094587206123567,
"grad_norm": 0.33968648314476013,
"learning_rate": 7.664233576642336e-05,
"loss": 0.5752,
"step": 1056
},
{
"epoch": 2.3116457080371786,
"grad_norm": 0.34770649671554565,
"learning_rate": 7.639902676399026e-05,
"loss": 0.7087,
"step": 1057
},
{
"epoch": 2.313832695462001,
"grad_norm": 0.27934038639068604,
"learning_rate": 7.615571776155717e-05,
"loss": 0.5717,
"step": 1058
},
{
"epoch": 2.3160196828868234,
"grad_norm": 0.35276851058006287,
"learning_rate": 7.591240875912408e-05,
"loss": 0.5339,
"step": 1059
},
{
"epoch": 2.3182066703116457,
"grad_norm": 0.31707894802093506,
"learning_rate": 7.566909975669098e-05,
"loss": 0.5097,
"step": 1060
},
{
"epoch": 2.320393657736468,
"grad_norm": 0.47757935523986816,
"learning_rate": 7.542579075425791e-05,
"loss": 0.7004,
"step": 1061
},
{
"epoch": 2.3225806451612905,
"grad_norm": 0.3273807764053345,
"learning_rate": 7.518248175182482e-05,
"loss": 0.6859,
"step": 1062
},
{
"epoch": 2.324767632586113,
"grad_norm": 0.30111655592918396,
"learning_rate": 7.493917274939172e-05,
"loss": 0.4916,
"step": 1063
},
{
"epoch": 2.326954620010935,
"grad_norm": 0.33053281903266907,
"learning_rate": 7.469586374695863e-05,
"loss": 0.6866,
"step": 1064
},
{
"epoch": 2.329141607435757,
"grad_norm": 0.34993547201156616,
"learning_rate": 7.445255474452554e-05,
"loss": 0.6471,
"step": 1065
},
{
"epoch": 2.3313285948605795,
"grad_norm": 0.2865176200866699,
"learning_rate": 7.420924574209245e-05,
"loss": 0.4927,
"step": 1066
},
{
"epoch": 2.333515582285402,
"grad_norm": 0.43209540843963623,
"learning_rate": 7.396593673965937e-05,
"loss": 0.6368,
"step": 1067
},
{
"epoch": 2.3357025697102243,
"grad_norm": 0.3290870189666748,
"learning_rate": 7.372262773722628e-05,
"loss": 0.739,
"step": 1068
},
{
"epoch": 2.3378895571350466,
"grad_norm": 0.3443828225135803,
"learning_rate": 7.347931873479318e-05,
"loss": 0.8401,
"step": 1069
},
{
"epoch": 2.340076544559869,
"grad_norm": 0.32021573185920715,
"learning_rate": 7.323600973236009e-05,
"loss": 0.7726,
"step": 1070
},
{
"epoch": 2.342263531984691,
"grad_norm": 0.46182501316070557,
"learning_rate": 7.2992700729927e-05,
"loss": 0.9029,
"step": 1071
},
{
"epoch": 2.3444505194095133,
"grad_norm": 0.35512760281562805,
"learning_rate": 7.274939172749391e-05,
"loss": 0.6847,
"step": 1072
},
{
"epoch": 2.3466375068343357,
"grad_norm": 0.380140483379364,
"learning_rate": 7.250608272506082e-05,
"loss": 0.7038,
"step": 1073
},
{
"epoch": 2.348824494259158,
"grad_norm": 0.32431280612945557,
"learning_rate": 7.226277372262774e-05,
"loss": 0.5294,
"step": 1074
},
{
"epoch": 2.3510114816839804,
"grad_norm": 0.2768891453742981,
"learning_rate": 7.201946472019465e-05,
"loss": 0.5286,
"step": 1075
},
{
"epoch": 2.353198469108803,
"grad_norm": 0.3334331214427948,
"learning_rate": 7.177615571776155e-05,
"loss": 0.6415,
"step": 1076
},
{
"epoch": 2.3553854565336247,
"grad_norm": 0.41533592343330383,
"learning_rate": 7.153284671532846e-05,
"loss": 0.6295,
"step": 1077
},
{
"epoch": 2.357572443958447,
"grad_norm": 0.42005178332328796,
"learning_rate": 7.128953771289537e-05,
"loss": 0.8451,
"step": 1078
},
{
"epoch": 2.3597594313832695,
"grad_norm": 0.39049747586250305,
"learning_rate": 7.104622871046228e-05,
"loss": 0.8351,
"step": 1079
},
{
"epoch": 2.361946418808092,
"grad_norm": 0.33119314908981323,
"learning_rate": 7.08029197080292e-05,
"loss": 0.5981,
"step": 1080
},
{
"epoch": 2.3641334062329142,
"grad_norm": 0.4426044225692749,
"learning_rate": 7.05596107055961e-05,
"loss": 0.671,
"step": 1081
},
{
"epoch": 2.3663203936577366,
"grad_norm": 0.3445340096950531,
"learning_rate": 7.0316301703163e-05,
"loss": 0.6182,
"step": 1082
},
{
"epoch": 2.3685073810825585,
"grad_norm": 0.35596704483032227,
"learning_rate": 7.007299270072992e-05,
"loss": 0.7591,
"step": 1083
},
{
"epoch": 2.370694368507381,
"grad_norm": 0.39532068371772766,
"learning_rate": 6.982968369829683e-05,
"loss": 0.5479,
"step": 1084
},
{
"epoch": 2.3728813559322033,
"grad_norm": 0.3580004572868347,
"learning_rate": 6.958637469586374e-05,
"loss": 0.796,
"step": 1085
},
{
"epoch": 2.3750683433570257,
"grad_norm": 0.5314396023750305,
"learning_rate": 6.934306569343065e-05,
"loss": 0.5986,
"step": 1086
},
{
"epoch": 2.377255330781848,
"grad_norm": 0.5284639596939087,
"learning_rate": 6.909975669099755e-05,
"loss": 0.7934,
"step": 1087
},
{
"epoch": 2.3794423182066704,
"grad_norm": 0.38761386275291443,
"learning_rate": 6.885644768856448e-05,
"loss": 0.6072,
"step": 1088
},
{
"epoch": 2.3816293056314928,
"grad_norm": 0.3381224572658539,
"learning_rate": 6.861313868613137e-05,
"loss": 0.6392,
"step": 1089
},
{
"epoch": 2.3838162930563147,
"grad_norm": 0.3654699921607971,
"learning_rate": 6.836982968369829e-05,
"loss": 0.6068,
"step": 1090
},
{
"epoch": 2.386003280481137,
"grad_norm": 0.343288779258728,
"learning_rate": 6.81265206812652e-05,
"loss": 0.868,
"step": 1091
},
{
"epoch": 2.3881902679059595,
"grad_norm": 0.3624615967273712,
"learning_rate": 6.788321167883211e-05,
"loss": 0.6408,
"step": 1092
},
{
"epoch": 2.390377255330782,
"grad_norm": 0.3863930404186249,
"learning_rate": 6.763990267639902e-05,
"loss": 0.5778,
"step": 1093
},
{
"epoch": 2.392564242755604,
"grad_norm": 0.34366974234580994,
"learning_rate": 6.739659367396593e-05,
"loss": 0.6983,
"step": 1094
},
{
"epoch": 2.3947512301804266,
"grad_norm": 0.34117886424064636,
"learning_rate": 6.715328467153285e-05,
"loss": 0.6472,
"step": 1095
},
{
"epoch": 2.396938217605249,
"grad_norm": 0.3547564148902893,
"learning_rate": 6.690997566909974e-05,
"loss": 0.5363,
"step": 1096
},
{
"epoch": 2.399125205030071,
"grad_norm": 0.31432420015335083,
"learning_rate": 6.666666666666666e-05,
"loss": 0.5539,
"step": 1097
},
{
"epoch": 2.4013121924548932,
"grad_norm": 0.45095062255859375,
"learning_rate": 6.642335766423357e-05,
"loss": 0.6494,
"step": 1098
},
{
"epoch": 2.4034991798797156,
"grad_norm": 1.0102994441986084,
"learning_rate": 6.618004866180048e-05,
"loss": 0.988,
"step": 1099
},
{
"epoch": 2.405686167304538,
"grad_norm": 0.5170231461524963,
"learning_rate": 6.593673965936739e-05,
"loss": 0.8045,
"step": 1100
},
{
"epoch": 2.4078731547293604,
"grad_norm": 0.2993682622909546,
"learning_rate": 6.56934306569343e-05,
"loss": 0.5887,
"step": 1101
},
{
"epoch": 2.4100601421541827,
"grad_norm": 0.29023849964141846,
"learning_rate": 6.545012165450122e-05,
"loss": 0.6123,
"step": 1102
},
{
"epoch": 2.412247129579005,
"grad_norm": 0.4196130335330963,
"learning_rate": 6.520681265206811e-05,
"loss": 0.6444,
"step": 1103
},
{
"epoch": 2.414434117003827,
"grad_norm": 0.43228599429130554,
"learning_rate": 6.496350364963504e-05,
"loss": 0.7432,
"step": 1104
},
{
"epoch": 2.4166211044286494,
"grad_norm": 0.3056860566139221,
"learning_rate": 6.472019464720194e-05,
"loss": 0.6673,
"step": 1105
},
{
"epoch": 2.418808091853472,
"grad_norm": 0.4213399887084961,
"learning_rate": 6.447688564476885e-05,
"loss": 0.798,
"step": 1106
},
{
"epoch": 2.420995079278294,
"grad_norm": 0.4033665060997009,
"learning_rate": 6.423357664233576e-05,
"loss": 0.7835,
"step": 1107
},
{
"epoch": 2.4231820667031165,
"grad_norm": 0.35071858763694763,
"learning_rate": 6.399026763990267e-05,
"loss": 0.7173,
"step": 1108
},
{
"epoch": 2.425369054127939,
"grad_norm": 0.36336860060691833,
"learning_rate": 6.374695863746959e-05,
"loss": 0.6904,
"step": 1109
},
{
"epoch": 2.4275560415527613,
"grad_norm": 0.4012874662876129,
"learning_rate": 6.350364963503648e-05,
"loss": 0.6062,
"step": 1110
},
{
"epoch": 2.429743028977583,
"grad_norm": 0.3614816665649414,
"learning_rate": 6.326034063260341e-05,
"loss": 0.7757,
"step": 1111
},
{
"epoch": 2.4319300164024056,
"grad_norm": 0.34320759773254395,
"learning_rate": 6.301703163017031e-05,
"loss": 0.6789,
"step": 1112
},
{
"epoch": 2.434117003827228,
"grad_norm": 0.3566221594810486,
"learning_rate": 6.277372262773722e-05,
"loss": 0.7995,
"step": 1113
},
{
"epoch": 2.4363039912520503,
"grad_norm": 0.35487961769104004,
"learning_rate": 6.253041362530413e-05,
"loss": 0.6536,
"step": 1114
},
{
"epoch": 2.4384909786768727,
"grad_norm": 0.3311222195625305,
"learning_rate": 6.228710462287104e-05,
"loss": 0.589,
"step": 1115
},
{
"epoch": 2.440677966101695,
"grad_norm": 0.36649906635284424,
"learning_rate": 6.204379562043796e-05,
"loss": 0.7062,
"step": 1116
},
{
"epoch": 2.4428649535265174,
"grad_norm": 0.36625346541404724,
"learning_rate": 6.180048661800485e-05,
"loss": 0.6585,
"step": 1117
},
{
"epoch": 2.4450519409513394,
"grad_norm": 0.47065046429634094,
"learning_rate": 6.155717761557178e-05,
"loss": 0.8547,
"step": 1118
},
{
"epoch": 2.4472389283761617,
"grad_norm": 0.3721199333667755,
"learning_rate": 6.131386861313868e-05,
"loss": 0.7003,
"step": 1119
},
{
"epoch": 2.449425915800984,
"grad_norm": 0.3814185559749603,
"learning_rate": 6.107055961070559e-05,
"loss": 0.6616,
"step": 1120
},
{
"epoch": 2.4516129032258065,
"grad_norm": 0.34303221106529236,
"learning_rate": 6.08272506082725e-05,
"loss": 0.7311,
"step": 1121
},
{
"epoch": 2.453799890650629,
"grad_norm": 0.31710198521614075,
"learning_rate": 6.0583941605839414e-05,
"loss": 0.6767,
"step": 1122
},
{
"epoch": 2.4559868780754512,
"grad_norm": 0.378255158662796,
"learning_rate": 6.034063260340632e-05,
"loss": 0.5758,
"step": 1123
},
{
"epoch": 2.4581738655002736,
"grad_norm": 0.3049505949020386,
"learning_rate": 6.0097323600973225e-05,
"loss": 0.7468,
"step": 1124
},
{
"epoch": 2.4603608529250955,
"grad_norm": 0.31383493542671204,
"learning_rate": 5.985401459854014e-05,
"loss": 0.5064,
"step": 1125
},
{
"epoch": 2.462547840349918,
"grad_norm": 0.4120381474494934,
"learning_rate": 5.961070559610705e-05,
"loss": 0.5933,
"step": 1126
},
{
"epoch": 2.4647348277747403,
"grad_norm": 0.41584497690200806,
"learning_rate": 5.936739659367396e-05,
"loss": 0.6191,
"step": 1127
},
{
"epoch": 2.4669218151995627,
"grad_norm": 0.4834405481815338,
"learning_rate": 5.912408759124087e-05,
"loss": 0.6092,
"step": 1128
},
{
"epoch": 2.469108802624385,
"grad_norm": 0.30698856711387634,
"learning_rate": 5.8880778588807784e-05,
"loss": 0.6318,
"step": 1129
},
{
"epoch": 2.4712957900492074,
"grad_norm": 0.42027831077575684,
"learning_rate": 5.863746958637469e-05,
"loss": 0.5981,
"step": 1130
},
{
"epoch": 2.4734827774740293,
"grad_norm": 0.46082839369773865,
"learning_rate": 5.83941605839416e-05,
"loss": 0.7592,
"step": 1131
},
{
"epoch": 2.4756697648988517,
"grad_norm": 0.3530132472515106,
"learning_rate": 5.815085158150851e-05,
"loss": 0.6589,
"step": 1132
},
{
"epoch": 2.477856752323674,
"grad_norm": 0.40325507521629333,
"learning_rate": 5.790754257907542e-05,
"loss": 0.6136,
"step": 1133
},
{
"epoch": 2.4800437397484965,
"grad_norm": 0.5407168865203857,
"learning_rate": 5.7664233576642324e-05,
"loss": 0.818,
"step": 1134
},
{
"epoch": 2.482230727173319,
"grad_norm": 0.3995073139667511,
"learning_rate": 5.742092457420924e-05,
"loss": 0.7405,
"step": 1135
},
{
"epoch": 2.484417714598141,
"grad_norm": 0.327036052942276,
"learning_rate": 5.717761557177615e-05,
"loss": 0.5611,
"step": 1136
},
{
"epoch": 2.486604702022963,
"grad_norm": 0.4143662750720978,
"learning_rate": 5.693430656934306e-05,
"loss": 0.7194,
"step": 1137
},
{
"epoch": 2.4887916894477855,
"grad_norm": 0.37465140223503113,
"learning_rate": 5.669099756690997e-05,
"loss": 0.8684,
"step": 1138
},
{
"epoch": 2.490978676872608,
"grad_norm": 0.3546184301376343,
"learning_rate": 5.644768856447688e-05,
"loss": 0.5464,
"step": 1139
},
{
"epoch": 2.4931656642974303,
"grad_norm": 0.5521944165229797,
"learning_rate": 5.620437956204379e-05,
"loss": 0.6143,
"step": 1140
},
{
"epoch": 2.4953526517222526,
"grad_norm": 0.3398590385913849,
"learning_rate": 5.596107055961071e-05,
"loss": 0.7098,
"step": 1141
},
{
"epoch": 2.497539639147075,
"grad_norm": 0.28899359703063965,
"learning_rate": 5.571776155717761e-05,
"loss": 0.6263,
"step": 1142
},
{
"epoch": 2.4997266265718974,
"grad_norm": 0.3622675836086273,
"learning_rate": 5.547445255474452e-05,
"loss": 0.5183,
"step": 1143
},
{
"epoch": 2.5019136139967193,
"grad_norm": 0.3359682261943817,
"learning_rate": 5.523114355231143e-05,
"loss": 0.7125,
"step": 1144
},
{
"epoch": 2.5041006014215417,
"grad_norm": 0.42786240577697754,
"learning_rate": 5.498783454987834e-05,
"loss": 0.6445,
"step": 1145
},
{
"epoch": 2.506287588846364,
"grad_norm": 0.340658575296402,
"learning_rate": 5.4744525547445253e-05,
"loss": 0.5709,
"step": 1146
},
{
"epoch": 2.5084745762711864,
"grad_norm": 0.3030422031879425,
"learning_rate": 5.450121654501216e-05,
"loss": 0.5894,
"step": 1147
},
{
"epoch": 2.510661563696009,
"grad_norm": 0.4911826550960541,
"learning_rate": 5.425790754257907e-05,
"loss": 0.6198,
"step": 1148
},
{
"epoch": 2.512848551120831,
"grad_norm": 0.3828030824661255,
"learning_rate": 5.401459854014598e-05,
"loss": 0.7856,
"step": 1149
},
{
"epoch": 2.5150355385456535,
"grad_norm": 0.354000449180603,
"learning_rate": 5.377128953771289e-05,
"loss": 0.5489,
"step": 1150
},
{
"epoch": 2.5172225259704755,
"grad_norm": 0.2972152829170227,
"learning_rate": 5.3527980535279806e-05,
"loss": 0.773,
"step": 1151
},
{
"epoch": 2.519409513395298,
"grad_norm": 0.3820708394050598,
"learning_rate": 5.328467153284671e-05,
"loss": 0.6889,
"step": 1152
},
{
"epoch": 2.52159650082012,
"grad_norm": 0.3476285934448242,
"learning_rate": 5.304136253041362e-05,
"loss": 0.5365,
"step": 1153
},
{
"epoch": 2.5237834882449426,
"grad_norm": 0.36393001675605774,
"learning_rate": 5.279805352798053e-05,
"loss": 0.6012,
"step": 1154
},
{
"epoch": 2.525970475669765,
"grad_norm": 0.3589417338371277,
"learning_rate": 5.255474452554744e-05,
"loss": 0.6502,
"step": 1155
},
{
"epoch": 2.5281574630945873,
"grad_norm": 0.34018373489379883,
"learning_rate": 5.231143552311435e-05,
"loss": 0.6489,
"step": 1156
},
{
"epoch": 2.5303444505194097,
"grad_norm": 0.40649306774139404,
"learning_rate": 5.206812652068126e-05,
"loss": 0.6107,
"step": 1157
},
{
"epoch": 2.5325314379442316,
"grad_norm": 0.3748558759689331,
"learning_rate": 5.1824817518248176e-05,
"loss": 0.5517,
"step": 1158
},
{
"epoch": 2.534718425369054,
"grad_norm": 0.4162946939468384,
"learning_rate": 5.158150851581508e-05,
"loss": 0.5658,
"step": 1159
},
{
"epoch": 2.5369054127938764,
"grad_norm": 0.40900272130966187,
"learning_rate": 5.133819951338199e-05,
"loss": 0.6965,
"step": 1160
},
{
"epoch": 2.5390924002186988,
"grad_norm": 0.4511730372905731,
"learning_rate": 5.10948905109489e-05,
"loss": 0.7305,
"step": 1161
},
{
"epoch": 2.541279387643521,
"grad_norm": 0.4122026860713959,
"learning_rate": 5.085158150851581e-05,
"loss": 0.6032,
"step": 1162
},
{
"epoch": 2.5434663750683435,
"grad_norm": 0.33657750487327576,
"learning_rate": 5.060827250608272e-05,
"loss": 0.6772,
"step": 1163
},
{
"epoch": 2.545653362493166,
"grad_norm": 0.3611637353897095,
"learning_rate": 5.036496350364963e-05,
"loss": 0.7829,
"step": 1164
},
{
"epoch": 2.547840349917988,
"grad_norm": 0.3221738040447235,
"learning_rate": 5.0121654501216546e-05,
"loss": 0.656,
"step": 1165
},
{
"epoch": 2.55002733734281,
"grad_norm": 0.30915001034736633,
"learning_rate": 4.987834549878345e-05,
"loss": 0.55,
"step": 1166
},
{
"epoch": 2.5522143247676325,
"grad_norm": 0.3413131535053253,
"learning_rate": 4.963503649635036e-05,
"loss": 0.7515,
"step": 1167
},
{
"epoch": 2.554401312192455,
"grad_norm": 0.4244505763053894,
"learning_rate": 4.9391727493917275e-05,
"loss": 0.7202,
"step": 1168
},
{
"epoch": 2.5565882996172773,
"grad_norm": 0.2993778586387634,
"learning_rate": 4.914841849148418e-05,
"loss": 0.4497,
"step": 1169
},
{
"epoch": 2.5587752870420997,
"grad_norm": 0.43434271216392517,
"learning_rate": 4.8905109489051086e-05,
"loss": 0.591,
"step": 1170
},
{
"epoch": 2.560962274466922,
"grad_norm": 0.35246193408966064,
"learning_rate": 4.8661800486618e-05,
"loss": 0.537,
"step": 1171
},
{
"epoch": 2.563149261891744,
"grad_norm": 0.37283191084861755,
"learning_rate": 4.841849148418491e-05,
"loss": 0.5856,
"step": 1172
},
{
"epoch": 2.5653362493165663,
"grad_norm": 0.39839670062065125,
"learning_rate": 4.817518248175182e-05,
"loss": 0.4996,
"step": 1173
},
{
"epoch": 2.5675232367413887,
"grad_norm": 0.4315820634365082,
"learning_rate": 4.793187347931873e-05,
"loss": 0.7119,
"step": 1174
},
{
"epoch": 2.569710224166211,
"grad_norm": 0.4408882260322571,
"learning_rate": 4.7688564476885646e-05,
"loss": 0.7059,
"step": 1175
},
{
"epoch": 2.5718972115910335,
"grad_norm": 0.4746418595314026,
"learning_rate": 4.744525547445255e-05,
"loss": 0.6944,
"step": 1176
},
{
"epoch": 2.5740841990158554,
"grad_norm": 0.31449419260025024,
"learning_rate": 4.7201946472019456e-05,
"loss": 0.7469,
"step": 1177
},
{
"epoch": 2.576271186440678,
"grad_norm": 0.4608743190765381,
"learning_rate": 4.6958637469586375e-05,
"loss": 0.4727,
"step": 1178
},
{
"epoch": 2.5784581738655,
"grad_norm": 0.3578025996685028,
"learning_rate": 4.671532846715328e-05,
"loss": 0.8796,
"step": 1179
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.3281157612800598,
"learning_rate": 4.647201946472019e-05,
"loss": 0.5228,
"step": 1180
},
{
"epoch": 2.582832148715145,
"grad_norm": 0.34412261843681335,
"learning_rate": 4.62287104622871e-05,
"loss": 0.6171,
"step": 1181
},
{
"epoch": 2.5850191361399673,
"grad_norm": 0.32819414138793945,
"learning_rate": 4.5985401459854016e-05,
"loss": 0.6381,
"step": 1182
},
{
"epoch": 2.5872061235647896,
"grad_norm": 0.42394185066223145,
"learning_rate": 4.574209245742092e-05,
"loss": 0.6248,
"step": 1183
},
{
"epoch": 2.5893931109896116,
"grad_norm": 0.3938983082771301,
"learning_rate": 4.5498783454987826e-05,
"loss": 0.688,
"step": 1184
},
{
"epoch": 2.5915800984144344,
"grad_norm": 0.35975101590156555,
"learning_rate": 4.5255474452554745e-05,
"loss": 0.6196,
"step": 1185
},
{
"epoch": 2.5937670858392563,
"grad_norm": 0.5351125597953796,
"learning_rate": 4.501216545012165e-05,
"loss": 0.6542,
"step": 1186
},
{
"epoch": 2.5959540732640787,
"grad_norm": 0.31686198711395264,
"learning_rate": 4.476885644768856e-05,
"loss": 0.7063,
"step": 1187
},
{
"epoch": 2.598141060688901,
"grad_norm": 0.2979380786418915,
"learning_rate": 4.4525547445255474e-05,
"loss": 0.5374,
"step": 1188
},
{
"epoch": 2.6003280481137234,
"grad_norm": 0.3495193123817444,
"learning_rate": 4.428223844282238e-05,
"loss": 0.6217,
"step": 1189
},
{
"epoch": 2.602515035538546,
"grad_norm": 0.3886531591415405,
"learning_rate": 4.403892944038929e-05,
"loss": 0.5628,
"step": 1190
},
{
"epoch": 2.6047020229633677,
"grad_norm": 0.3585399091243744,
"learning_rate": 4.3795620437956196e-05,
"loss": 0.6921,
"step": 1191
},
{
"epoch": 2.6068890103881905,
"grad_norm": 0.3813333809375763,
"learning_rate": 4.3552311435523115e-05,
"loss": 0.6603,
"step": 1192
},
{
"epoch": 2.6090759978130125,
"grad_norm": 0.4587854743003845,
"learning_rate": 4.330900243309002e-05,
"loss": 0.7274,
"step": 1193
},
{
"epoch": 2.611262985237835,
"grad_norm": 0.4350600242614746,
"learning_rate": 4.3065693430656925e-05,
"loss": 0.6628,
"step": 1194
},
{
"epoch": 2.613449972662657,
"grad_norm": 0.3220929205417633,
"learning_rate": 4.2822384428223844e-05,
"loss": 0.6057,
"step": 1195
},
{
"epoch": 2.6156369600874796,
"grad_norm": 0.54576575756073,
"learning_rate": 4.257907542579075e-05,
"loss": 0.693,
"step": 1196
},
{
"epoch": 2.617823947512302,
"grad_norm": 0.393766850233078,
"learning_rate": 4.233576642335766e-05,
"loss": 0.6226,
"step": 1197
},
{
"epoch": 2.620010934937124,
"grad_norm": 0.3243195116519928,
"learning_rate": 4.209245742092457e-05,
"loss": 0.7465,
"step": 1198
},
{
"epoch": 2.6221979223619463,
"grad_norm": 0.3847908079624176,
"learning_rate": 4.1849148418491485e-05,
"loss": 0.4963,
"step": 1199
},
{
"epoch": 2.6243849097867686,
"grad_norm": 0.40093564987182617,
"learning_rate": 4.160583941605839e-05,
"loss": 0.7138,
"step": 1200
},
{
"epoch": 2.626571897211591,
"grad_norm": 0.4176326096057892,
"learning_rate": 4.1362530413625295e-05,
"loss": 0.4808,
"step": 1201
},
{
"epoch": 2.6287588846364134,
"grad_norm": 0.3477429151535034,
"learning_rate": 4.1119221411192214e-05,
"loss": 0.6285,
"step": 1202
},
{
"epoch": 2.6309458720612358,
"grad_norm": 0.4201376736164093,
"learning_rate": 4.087591240875912e-05,
"loss": 1.0551,
"step": 1203
},
{
"epoch": 2.633132859486058,
"grad_norm": 0.4241773188114166,
"learning_rate": 4.063260340632603e-05,
"loss": 0.6991,
"step": 1204
},
{
"epoch": 2.63531984691088,
"grad_norm": 0.5858724117279053,
"learning_rate": 4.038929440389294e-05,
"loss": 0.6912,
"step": 1205
},
{
"epoch": 2.6375068343357024,
"grad_norm": 0.3396605849266052,
"learning_rate": 4.0145985401459855e-05,
"loss": 0.5062,
"step": 1206
},
{
"epoch": 2.639693821760525,
"grad_norm": 0.3286657929420471,
"learning_rate": 3.990267639902676e-05,
"loss": 0.678,
"step": 1207
},
{
"epoch": 2.641880809185347,
"grad_norm": 0.3253632187843323,
"learning_rate": 3.9659367396593665e-05,
"loss": 0.5769,
"step": 1208
},
{
"epoch": 2.6440677966101696,
"grad_norm": 0.39935943484306335,
"learning_rate": 3.9416058394160584e-05,
"loss": 0.6078,
"step": 1209
},
{
"epoch": 2.646254784034992,
"grad_norm": 0.38090863823890686,
"learning_rate": 3.917274939172749e-05,
"loss": 0.6195,
"step": 1210
},
{
"epoch": 2.6484417714598143,
"grad_norm": 0.3816772401332855,
"learning_rate": 3.8929440389294394e-05,
"loss": 0.6636,
"step": 1211
},
{
"epoch": 2.6506287588846362,
"grad_norm": 0.354041188955307,
"learning_rate": 3.868613138686131e-05,
"loss": 0.6017,
"step": 1212
},
{
"epoch": 2.6528157463094586,
"grad_norm": 0.38338416814804077,
"learning_rate": 3.844282238442822e-05,
"loss": 0.5642,
"step": 1213
},
{
"epoch": 2.655002733734281,
"grad_norm": 0.4089908003807068,
"learning_rate": 3.819951338199513e-05,
"loss": 0.7222,
"step": 1214
},
{
"epoch": 2.6571897211591033,
"grad_norm": 0.44963401556015015,
"learning_rate": 3.795620437956204e-05,
"loss": 0.613,
"step": 1215
},
{
"epoch": 2.6593767085839257,
"grad_norm": 0.2840285003185272,
"learning_rate": 3.7712895377128954e-05,
"loss": 0.6435,
"step": 1216
},
{
"epoch": 2.661563696008748,
"grad_norm": 0.39185985922813416,
"learning_rate": 3.746958637469586e-05,
"loss": 0.7633,
"step": 1217
},
{
"epoch": 2.6637506834335705,
"grad_norm": 0.3823552131652832,
"learning_rate": 3.722627737226277e-05,
"loss": 0.6632,
"step": 1218
},
{
"epoch": 2.6659376708583924,
"grad_norm": 0.4937818646430969,
"learning_rate": 3.698296836982968e-05,
"loss": 0.8944,
"step": 1219
},
{
"epoch": 2.6681246582832148,
"grad_norm": 0.38062620162963867,
"learning_rate": 3.673965936739659e-05,
"loss": 0.7507,
"step": 1220
},
{
"epoch": 2.670311645708037,
"grad_norm": 0.34089863300323486,
"learning_rate": 3.64963503649635e-05,
"loss": 0.6276,
"step": 1221
},
{
"epoch": 2.6724986331328595,
"grad_norm": 0.45665138959884644,
"learning_rate": 3.625304136253041e-05,
"loss": 0.6801,
"step": 1222
},
{
"epoch": 2.674685620557682,
"grad_norm": 0.5102551579475403,
"learning_rate": 3.6009732360097324e-05,
"loss": 0.5385,
"step": 1223
},
{
"epoch": 2.6768726079825043,
"grad_norm": 0.4079155921936035,
"learning_rate": 3.576642335766423e-05,
"loss": 0.7165,
"step": 1224
},
{
"epoch": 2.6790595954073266,
"grad_norm": 0.3809445798397064,
"learning_rate": 3.552311435523114e-05,
"loss": 0.6695,
"step": 1225
},
{
"epoch": 2.6812465828321486,
"grad_norm": 0.44514816999435425,
"learning_rate": 3.527980535279805e-05,
"loss": 0.732,
"step": 1226
},
{
"epoch": 2.683433570256971,
"grad_norm": 0.40891462564468384,
"learning_rate": 3.503649635036496e-05,
"loss": 0.9004,
"step": 1227
},
{
"epoch": 2.6856205576817933,
"grad_norm": 0.44487065076828003,
"learning_rate": 3.479318734793187e-05,
"loss": 0.4452,
"step": 1228
},
{
"epoch": 2.6878075451066157,
"grad_norm": 0.27980828285217285,
"learning_rate": 3.4549878345498775e-05,
"loss": 0.6259,
"step": 1229
},
{
"epoch": 2.689994532531438,
"grad_norm": 0.37272408604621887,
"learning_rate": 3.430656934306569e-05,
"loss": 0.7493,
"step": 1230
},
{
"epoch": 2.69218151995626,
"grad_norm": 0.4146464169025421,
"learning_rate": 3.40632603406326e-05,
"loss": 0.5103,
"step": 1231
},
{
"epoch": 2.694368507381083,
"grad_norm": 0.350233793258667,
"learning_rate": 3.381995133819951e-05,
"loss": 0.6766,
"step": 1232
},
{
"epoch": 2.6965554948059047,
"grad_norm": 0.49093326926231384,
"learning_rate": 3.357664233576642e-05,
"loss": 0.6934,
"step": 1233
},
{
"epoch": 2.698742482230727,
"grad_norm": 0.4598555266857147,
"learning_rate": 3.333333333333333e-05,
"loss": 0.6618,
"step": 1234
},
{
"epoch": 2.7009294696555495,
"grad_norm": 0.4397393465042114,
"learning_rate": 3.309002433090024e-05,
"loss": 0.5864,
"step": 1235
},
{
"epoch": 2.703116457080372,
"grad_norm": 0.43458834290504456,
"learning_rate": 3.284671532846715e-05,
"loss": 0.6955,
"step": 1236
},
{
"epoch": 2.705303444505194,
"grad_norm": 0.3657298684120178,
"learning_rate": 3.260340632603406e-05,
"loss": 0.651,
"step": 1237
},
{
"epoch": 2.707490431930016,
"grad_norm": 0.4210680425167084,
"learning_rate": 3.236009732360097e-05,
"loss": 0.5718,
"step": 1238
},
{
"epoch": 2.709677419354839,
"grad_norm": 0.3858646750450134,
"learning_rate": 3.211678832116788e-05,
"loss": 0.6649,
"step": 1239
},
{
"epoch": 2.711864406779661,
"grad_norm": 0.4130675494670868,
"learning_rate": 3.187347931873479e-05,
"loss": 0.6539,
"step": 1240
},
{
"epoch": 2.7140513942044833,
"grad_norm": 0.246662899851799,
"learning_rate": 3.1630170316301705e-05,
"loss": 0.5551,
"step": 1241
},
{
"epoch": 2.7162383816293056,
"grad_norm": 0.3459307551383972,
"learning_rate": 3.138686131386861e-05,
"loss": 0.4788,
"step": 1242
},
{
"epoch": 2.718425369054128,
"grad_norm": 0.4324615001678467,
"learning_rate": 3.114355231143552e-05,
"loss": 0.7828,
"step": 1243
},
{
"epoch": 2.7206123564789504,
"grad_norm": 0.5233476758003235,
"learning_rate": 3.090024330900243e-05,
"loss": 0.4262,
"step": 1244
},
{
"epoch": 2.7227993439037723,
"grad_norm": 0.35397472977638245,
"learning_rate": 3.065693430656934e-05,
"loss": 0.688,
"step": 1245
},
{
"epoch": 2.724986331328595,
"grad_norm": 0.37005069851875305,
"learning_rate": 3.041362530413625e-05,
"loss": 0.6592,
"step": 1246
},
{
"epoch": 2.727173318753417,
"grad_norm": 0.4533984661102295,
"learning_rate": 3.017031630170316e-05,
"loss": 0.6367,
"step": 1247
},
{
"epoch": 2.7293603061782394,
"grad_norm": 0.32724103331565857,
"learning_rate": 2.992700729927007e-05,
"loss": 0.5874,
"step": 1248
},
{
"epoch": 2.731547293603062,
"grad_norm": 0.3568969666957855,
"learning_rate": 2.968369829683698e-05,
"loss": 0.8173,
"step": 1249
},
{
"epoch": 2.733734281027884,
"grad_norm": 0.3268612325191498,
"learning_rate": 2.9440389294403892e-05,
"loss": 0.4827,
"step": 1250
},
{
"epoch": 2.7359212684527066,
"grad_norm": 0.30471158027648926,
"learning_rate": 2.91970802919708e-05,
"loss": 0.7108,
"step": 1251
},
{
"epoch": 2.7381082558775285,
"grad_norm": 0.3290720582008362,
"learning_rate": 2.895377128953771e-05,
"loss": 0.639,
"step": 1252
},
{
"epoch": 2.740295243302351,
"grad_norm": 0.35110557079315186,
"learning_rate": 2.871046228710462e-05,
"loss": 0.5367,
"step": 1253
},
{
"epoch": 2.7424822307271732,
"grad_norm": 0.26838091015815735,
"learning_rate": 2.846715328467153e-05,
"loss": 0.801,
"step": 1254
},
{
"epoch": 2.7446692181519956,
"grad_norm": 0.3596297800540924,
"learning_rate": 2.822384428223844e-05,
"loss": 0.6018,
"step": 1255
},
{
"epoch": 2.746856205576818,
"grad_norm": 0.4146590530872345,
"learning_rate": 2.7980535279805354e-05,
"loss": 0.7548,
"step": 1256
},
{
"epoch": 2.7490431930016404,
"grad_norm": 0.5210931897163391,
"learning_rate": 2.773722627737226e-05,
"loss": 0.6514,
"step": 1257
},
{
"epoch": 2.7512301804264627,
"grad_norm": 0.37990838289260864,
"learning_rate": 2.749391727493917e-05,
"loss": 0.6275,
"step": 1258
},
{
"epoch": 2.7534171678512847,
"grad_norm": 0.41597574949264526,
"learning_rate": 2.725060827250608e-05,
"loss": 0.7675,
"step": 1259
},
{
"epoch": 2.755604155276107,
"grad_norm": 0.4515291452407837,
"learning_rate": 2.700729927007299e-05,
"loss": 0.6756,
"step": 1260
},
{
"epoch": 2.7577911427009294,
"grad_norm": 0.418295294046402,
"learning_rate": 2.6763990267639903e-05,
"loss": 0.6417,
"step": 1261
},
{
"epoch": 2.7599781301257518,
"grad_norm": 0.34704264998435974,
"learning_rate": 2.652068126520681e-05,
"loss": 0.8996,
"step": 1262
},
{
"epoch": 2.762165117550574,
"grad_norm": 0.3458947241306305,
"learning_rate": 2.627737226277372e-05,
"loss": 0.8436,
"step": 1263
},
{
"epoch": 2.7643521049753965,
"grad_norm": 0.39911675453186035,
"learning_rate": 2.603406326034063e-05,
"loss": 0.5799,
"step": 1264
},
{
"epoch": 2.766539092400219,
"grad_norm": 0.2880173623561859,
"learning_rate": 2.579075425790754e-05,
"loss": 0.5253,
"step": 1265
},
{
"epoch": 2.768726079825041,
"grad_norm": 0.35598114132881165,
"learning_rate": 2.554744525547445e-05,
"loss": 0.6593,
"step": 1266
},
{
"epoch": 2.770913067249863,
"grad_norm": 0.34010377526283264,
"learning_rate": 2.530413625304136e-05,
"loss": 0.6076,
"step": 1267
},
{
"epoch": 2.7731000546746856,
"grad_norm": 0.37857237458229065,
"learning_rate": 2.5060827250608273e-05,
"loss": 0.7757,
"step": 1268
},
{
"epoch": 2.775287042099508,
"grad_norm": 0.6945297718048096,
"learning_rate": 2.481751824817518e-05,
"loss": 0.7243,
"step": 1269
},
{
"epoch": 2.7774740295243303,
"grad_norm": 0.3066571354866028,
"learning_rate": 2.457420924574209e-05,
"loss": 0.6558,
"step": 1270
},
{
"epoch": 2.7796610169491527,
"grad_norm": 0.42167848348617554,
"learning_rate": 2.4330900243309e-05,
"loss": 0.6929,
"step": 1271
},
{
"epoch": 2.781848004373975,
"grad_norm": 0.4334861934185028,
"learning_rate": 2.408759124087591e-05,
"loss": 0.6516,
"step": 1272
},
{
"epoch": 2.784034991798797,
"grad_norm": 0.39597228169441223,
"learning_rate": 2.3844282238442823e-05,
"loss": 0.688,
"step": 1273
},
{
"epoch": 2.7862219792236194,
"grad_norm": 0.36653244495391846,
"learning_rate": 2.3600973236009728e-05,
"loss": 0.7899,
"step": 1274
},
{
"epoch": 2.7884089666484417,
"grad_norm": 0.4496842622756958,
"learning_rate": 2.335766423357664e-05,
"loss": 0.7682,
"step": 1275
},
{
"epoch": 2.790595954073264,
"grad_norm": 0.5105994343757629,
"learning_rate": 2.311435523114355e-05,
"loss": 0.6332,
"step": 1276
},
{
"epoch": 2.7927829414980865,
"grad_norm": 0.30159294605255127,
"learning_rate": 2.287104622871046e-05,
"loss": 0.6215,
"step": 1277
},
{
"epoch": 2.794969928922909,
"grad_norm": 0.44565349817276,
"learning_rate": 2.2627737226277372e-05,
"loss": 0.8171,
"step": 1278
},
{
"epoch": 2.7971569163477312,
"grad_norm": 0.48561230301856995,
"learning_rate": 2.238442822384428e-05,
"loss": 0.7251,
"step": 1279
},
{
"epoch": 2.799343903772553,
"grad_norm": 0.4640182554721832,
"learning_rate": 2.214111922141119e-05,
"loss": 0.8137,
"step": 1280
},
{
"epoch": 2.8015308911973755,
"grad_norm": 0.34384575486183167,
"learning_rate": 2.1897810218978098e-05,
"loss": 0.7161,
"step": 1281
},
{
"epoch": 2.803717878622198,
"grad_norm": 0.3967885971069336,
"learning_rate": 2.165450121654501e-05,
"loss": 0.6331,
"step": 1282
},
{
"epoch": 2.8059048660470203,
"grad_norm": 0.4139404892921448,
"learning_rate": 2.1411192214111922e-05,
"loss": 0.7716,
"step": 1283
},
{
"epoch": 2.8080918534718426,
"grad_norm": 0.5906177163124084,
"learning_rate": 2.116788321167883e-05,
"loss": 0.8308,
"step": 1284
},
{
"epoch": 2.8102788408966646,
"grad_norm": 0.3923112452030182,
"learning_rate": 2.0924574209245742e-05,
"loss": 0.5808,
"step": 1285
},
{
"epoch": 2.8124658283214874,
"grad_norm": 0.376613050699234,
"learning_rate": 2.0681265206812648e-05,
"loss": 0.4945,
"step": 1286
},
{
"epoch": 2.8146528157463093,
"grad_norm": 0.39711064100265503,
"learning_rate": 2.043795620437956e-05,
"loss": 0.9447,
"step": 1287
},
{
"epoch": 2.8168398031711317,
"grad_norm": 0.49172040820121765,
"learning_rate": 2.019464720194647e-05,
"loss": 0.5981,
"step": 1288
},
{
"epoch": 2.819026790595954,
"grad_norm": 0.3777097165584564,
"learning_rate": 1.995133819951338e-05,
"loss": 0.5527,
"step": 1289
},
{
"epoch": 2.8212137780207764,
"grad_norm": 0.3420855700969696,
"learning_rate": 1.9708029197080292e-05,
"loss": 0.591,
"step": 1290
},
{
"epoch": 2.823400765445599,
"grad_norm": 0.3033166825771332,
"learning_rate": 1.9464720194647197e-05,
"loss": 0.4902,
"step": 1291
},
{
"epoch": 2.8255877528704207,
"grad_norm": 0.3743399679660797,
"learning_rate": 1.922141119221411e-05,
"loss": 0.72,
"step": 1292
},
{
"epoch": 2.8277747402952436,
"grad_norm": 0.43312016129493713,
"learning_rate": 1.897810218978102e-05,
"loss": 0.5847,
"step": 1293
},
{
"epoch": 2.8299617277200655,
"grad_norm": 0.4334290623664856,
"learning_rate": 1.873479318734793e-05,
"loss": 0.737,
"step": 1294
},
{
"epoch": 2.832148715144888,
"grad_norm": 0.3262549340724945,
"learning_rate": 1.849148418491484e-05,
"loss": 0.6188,
"step": 1295
},
{
"epoch": 2.8343357025697102,
"grad_norm": 0.3808232247829437,
"learning_rate": 1.824817518248175e-05,
"loss": 0.8153,
"step": 1296
},
{
"epoch": 2.8365226899945326,
"grad_norm": 0.35475462675094604,
"learning_rate": 1.8004866180048662e-05,
"loss": 0.5671,
"step": 1297
},
{
"epoch": 2.838709677419355,
"grad_norm": 0.38812217116355896,
"learning_rate": 1.776155717761557e-05,
"loss": 0.6323,
"step": 1298
},
{
"epoch": 2.840896664844177,
"grad_norm": 0.3561973571777344,
"learning_rate": 1.751824817518248e-05,
"loss": 0.6919,
"step": 1299
},
{
"epoch": 2.8430836522689997,
"grad_norm": 0.31703197956085205,
"learning_rate": 1.7274939172749388e-05,
"loss": 0.6856,
"step": 1300
},
{
"epoch": 2.8452706396938217,
"grad_norm": 0.41529974341392517,
"learning_rate": 1.70316301703163e-05,
"loss": 0.7612,
"step": 1301
},
{
"epoch": 2.847457627118644,
"grad_norm": 0.42857563495635986,
"learning_rate": 1.678832116788321e-05,
"loss": 0.8243,
"step": 1302
},
{
"epoch": 2.8496446145434664,
"grad_norm": 0.4402436912059784,
"learning_rate": 1.654501216545012e-05,
"loss": 0.6149,
"step": 1303
},
{
"epoch": 2.8518316019682888,
"grad_norm": 0.5396206378936768,
"learning_rate": 1.630170316301703e-05,
"loss": 0.623,
"step": 1304
},
{
"epoch": 2.854018589393111,
"grad_norm": 0.3337330222129822,
"learning_rate": 1.605839416058394e-05,
"loss": 0.6207,
"step": 1305
},
{
"epoch": 2.856205576817933,
"grad_norm": 0.47766539454460144,
"learning_rate": 1.5815085158150852e-05,
"loss": 0.7012,
"step": 1306
},
{
"epoch": 2.8583925642427555,
"grad_norm": 0.3661979138851166,
"learning_rate": 1.557177615571776e-05,
"loss": 0.6951,
"step": 1307
},
{
"epoch": 2.860579551667578,
"grad_norm": 0.32364702224731445,
"learning_rate": 1.532846715328467e-05,
"loss": 0.5451,
"step": 1308
},
{
"epoch": 2.8627665390924,
"grad_norm": 0.4927031695842743,
"learning_rate": 1.508515815085158e-05,
"loss": 0.6483,
"step": 1309
},
{
"epoch": 2.8649535265172226,
"grad_norm": 0.3563484847545624,
"learning_rate": 1.484184914841849e-05,
"loss": 0.6751,
"step": 1310
},
{
"epoch": 2.867140513942045,
"grad_norm": 0.3271696865558624,
"learning_rate": 1.45985401459854e-05,
"loss": 0.5288,
"step": 1311
},
{
"epoch": 2.8693275013668673,
"grad_norm": 0.3783499300479889,
"learning_rate": 1.435523114355231e-05,
"loss": 0.7292,
"step": 1312
},
{
"epoch": 2.8715144887916892,
"grad_norm": 0.39892178773880005,
"learning_rate": 1.411192214111922e-05,
"loss": 0.7258,
"step": 1313
},
{
"epoch": 2.8737014762165116,
"grad_norm": 0.27586114406585693,
"learning_rate": 1.386861313868613e-05,
"loss": 0.4122,
"step": 1314
},
{
"epoch": 2.875888463641334,
"grad_norm": 0.4590570330619812,
"learning_rate": 1.362530413625304e-05,
"loss": 0.7205,
"step": 1315
},
{
"epoch": 2.8780754510661564,
"grad_norm": 0.34512102603912354,
"learning_rate": 1.3381995133819952e-05,
"loss": 0.7402,
"step": 1316
},
{
"epoch": 2.8802624384909787,
"grad_norm": 0.4092288613319397,
"learning_rate": 1.313868613138686e-05,
"loss": 0.7668,
"step": 1317
},
{
"epoch": 2.882449425915801,
"grad_norm": 0.4686785638332367,
"learning_rate": 1.289537712895377e-05,
"loss": 0.5874,
"step": 1318
},
{
"epoch": 2.8846364133406235,
"grad_norm": 0.341987669467926,
"learning_rate": 1.265206812652068e-05,
"loss": 0.7645,
"step": 1319
},
{
"epoch": 2.8868234007654454,
"grad_norm": 0.6410381197929382,
"learning_rate": 1.240875912408759e-05,
"loss": 0.7446,
"step": 1320
},
{
"epoch": 2.889010388190268,
"grad_norm": 0.4242047965526581,
"learning_rate": 1.21654501216545e-05,
"loss": 0.5989,
"step": 1321
},
{
"epoch": 2.89119737561509,
"grad_norm": 0.3659310042858124,
"learning_rate": 1.1922141119221411e-05,
"loss": 0.6532,
"step": 1322
},
{
"epoch": 2.8933843630399125,
"grad_norm": 0.40684065222740173,
"learning_rate": 1.167883211678832e-05,
"loss": 0.657,
"step": 1323
},
{
"epoch": 2.895571350464735,
"grad_norm": 0.47506752610206604,
"learning_rate": 1.143552311435523e-05,
"loss": 0.4426,
"step": 1324
},
{
"epoch": 2.8977583378895573,
"grad_norm": 0.3505801260471344,
"learning_rate": 1.119221411192214e-05,
"loss": 0.724,
"step": 1325
},
{
"epoch": 2.8999453253143797,
"grad_norm": 0.4182322025299072,
"learning_rate": 1.0948905109489049e-05,
"loss": 0.6425,
"step": 1326
},
{
"epoch": 2.9021323127392016,
"grad_norm": 0.5423049330711365,
"learning_rate": 1.0705596107055961e-05,
"loss": 0.6135,
"step": 1327
},
{
"epoch": 2.904319300164024,
"grad_norm": 0.47435280680656433,
"learning_rate": 1.0462287104622871e-05,
"loss": 0.6161,
"step": 1328
},
{
"epoch": 2.9065062875888463,
"grad_norm": 0.30286717414855957,
"learning_rate": 1.021897810218978e-05,
"loss": 0.5494,
"step": 1329
},
{
"epoch": 2.9086932750136687,
"grad_norm": 0.34891781210899353,
"learning_rate": 9.97566909975669e-06,
"loss": 0.8073,
"step": 1330
},
{
"epoch": 2.910880262438491,
"grad_norm": 0.3608086109161377,
"learning_rate": 9.732360097323599e-06,
"loss": 0.6207,
"step": 1331
},
{
"epoch": 2.9130672498633134,
"grad_norm": 0.2914386987686157,
"learning_rate": 9.48905109489051e-06,
"loss": 0.6153,
"step": 1332
},
{
"epoch": 2.915254237288136,
"grad_norm": 0.4532075822353363,
"learning_rate": 9.24574209245742e-06,
"loss": 0.8057,
"step": 1333
},
{
"epoch": 2.9174412247129577,
"grad_norm": 0.47955191135406494,
"learning_rate": 9.002433090024331e-06,
"loss": 0.7378,
"step": 1334
},
{
"epoch": 2.91962821213778,
"grad_norm": 0.3728046715259552,
"learning_rate": 8.75912408759124e-06,
"loss": 0.5957,
"step": 1335
},
{
"epoch": 2.9218151995626025,
"grad_norm": 0.39728742837905884,
"learning_rate": 8.51581508515815e-06,
"loss": 0.7254,
"step": 1336
},
{
"epoch": 2.924002186987425,
"grad_norm": 0.375864714384079,
"learning_rate": 8.27250608272506e-06,
"loss": 0.7013,
"step": 1337
},
{
"epoch": 2.9261891744122472,
"grad_norm": 0.3625723719596863,
"learning_rate": 8.02919708029197e-06,
"loss": 0.866,
"step": 1338
},
{
"epoch": 2.928376161837069,
"grad_norm": 0.46779105067253113,
"learning_rate": 7.78588807785888e-06,
"loss": 0.7114,
"step": 1339
},
{
"epoch": 2.930563149261892,
"grad_norm": 0.3270869851112366,
"learning_rate": 7.54257907542579e-06,
"loss": 0.6085,
"step": 1340
},
{
"epoch": 2.932750136686714,
"grad_norm": 0.3992483913898468,
"learning_rate": 7.2992700729927e-06,
"loss": 0.6498,
"step": 1341
},
{
"epoch": 2.9349371241115363,
"grad_norm": 0.41171202063560486,
"learning_rate": 7.05596107055961e-06,
"loss": 0.7382,
"step": 1342
},
{
"epoch": 2.9371241115363587,
"grad_norm": 0.7751166224479675,
"learning_rate": 6.81265206812652e-06,
"loss": 0.8629,
"step": 1343
},
{
"epoch": 2.939311098961181,
"grad_norm": 0.558593213558197,
"learning_rate": 6.56934306569343e-06,
"loss": 0.9791,
"step": 1344
},
{
"epoch": 2.9414980863860034,
"grad_norm": 0.40517720580101013,
"learning_rate": 6.32603406326034e-06,
"loss": 0.6608,
"step": 1345
},
{
"epoch": 2.9436850738108253,
"grad_norm": 0.44248199462890625,
"learning_rate": 6.08272506082725e-06,
"loss": 0.5619,
"step": 1346
},
{
"epoch": 2.945872061235648,
"grad_norm": 0.3731604814529419,
"learning_rate": 5.83941605839416e-06,
"loss": 0.6585,
"step": 1347
},
{
"epoch": 2.94805904866047,
"grad_norm": 0.524138867855072,
"learning_rate": 5.59610705596107e-06,
"loss": 0.5278,
"step": 1348
},
{
"epoch": 2.9502460360852925,
"grad_norm": 0.31725287437438965,
"learning_rate": 5.3527980535279805e-06,
"loss": 0.7118,
"step": 1349
},
{
"epoch": 2.952433023510115,
"grad_norm": 0.3865452706813812,
"learning_rate": 5.10948905109489e-06,
"loss": 0.6209,
"step": 1350
},
{
"epoch": 2.954620010934937,
"grad_norm": 0.36308881640434265,
"learning_rate": 4.866180048661799e-06,
"loss": 0.5582,
"step": 1351
},
{
"epoch": 2.9568069983597596,
"grad_norm": 0.4439944922924042,
"learning_rate": 4.62287104622871e-06,
"loss": 0.587,
"step": 1352
},
{
"epoch": 2.9589939857845815,
"grad_norm": 0.44962093234062195,
"learning_rate": 4.37956204379562e-06,
"loss": 0.7883,
"step": 1353
},
{
"epoch": 2.9611809732094043,
"grad_norm": 0.6172670722007751,
"learning_rate": 4.13625304136253e-06,
"loss": 0.7554,
"step": 1354
},
{
"epoch": 2.9633679606342263,
"grad_norm": 0.4022207260131836,
"learning_rate": 3.89294403892944e-06,
"loss": 0.7109,
"step": 1355
},
{
"epoch": 2.9655549480590486,
"grad_norm": 0.4858662486076355,
"learning_rate": 3.64963503649635e-06,
"loss": 0.7308,
"step": 1356
},
{
"epoch": 2.967741935483871,
"grad_norm": 0.4918728768825531,
"learning_rate": 3.40632603406326e-06,
"loss": 0.7418,
"step": 1357
},
{
"epoch": 2.9699289229086934,
"grad_norm": 0.5118703842163086,
"learning_rate": 3.16301703163017e-06,
"loss": 0.6361,
"step": 1358
},
{
"epoch": 2.9721159103335157,
"grad_norm": 0.4407196044921875,
"learning_rate": 2.91970802919708e-06,
"loss": 0.6971,
"step": 1359
},
{
"epoch": 2.9743028977583377,
"grad_norm": 0.33856332302093506,
"learning_rate": 2.6763990267639902e-06,
"loss": 0.5766,
"step": 1360
},
{
"epoch": 2.97648988518316,
"grad_norm": 0.45704513788223267,
"learning_rate": 2.4330900243308996e-06,
"loss": 0.6431,
"step": 1361
},
{
"epoch": 2.9786768726079824,
"grad_norm": 0.3669881224632263,
"learning_rate": 2.18978102189781e-06,
"loss": 0.5637,
"step": 1362
},
{
"epoch": 2.980863860032805,
"grad_norm": 0.33307334780693054,
"learning_rate": 1.94647201946472e-06,
"loss": 0.6372,
"step": 1363
},
{
"epoch": 2.983050847457627,
"grad_norm": 0.3178769052028656,
"learning_rate": 1.70316301703163e-06,
"loss": 0.8674,
"step": 1364
},
{
"epoch": 2.9852378348824495,
"grad_norm": 0.4288700222969055,
"learning_rate": 1.45985401459854e-06,
"loss": 0.7514,
"step": 1365
},
{
"epoch": 2.987424822307272,
"grad_norm": 0.3283116817474365,
"learning_rate": 1.2165450121654498e-06,
"loss": 0.5816,
"step": 1366
},
{
"epoch": 2.989611809732094,
"grad_norm": 0.3714343011379242,
"learning_rate": 9.7323600973236e-07,
"loss": 0.7904,
"step": 1367
},
{
"epoch": 2.991798797156916,
"grad_norm": 0.7103442549705505,
"learning_rate": 7.2992700729927e-07,
"loss": 0.7292,
"step": 1368
},
{
"epoch": 2.9939857845817386,
"grad_norm": 0.34076127409935,
"learning_rate": 4.8661800486618e-07,
"loss": 0.6302,
"step": 1369
},
{
"epoch": 2.996172772006561,
"grad_norm": 0.424398809671402,
"learning_rate": 2.4330900243309e-07,
"loss": 0.781,
"step": 1370
},
{
"epoch": 2.9983597594313833,
"grad_norm": 0.39384347200393677,
"learning_rate": 0.0,
"loss": 0.5505,
"step": 1371
},
{
"epoch": 2.9983597594313833,
"step": 1371,
"total_flos": 4.3228174920083046e+17,
"train_loss": 0.7109334499926396,
"train_runtime": 1998.4313,
"train_samples_per_second": 10.983,
"train_steps_per_second": 0.686
}
],
"logging_steps": 1.0,
"max_steps": 1371,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.3228174920083046e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}