{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 37, "global_step": 294, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003401360544217687, "grad_norm": 106.7094005171616, "learning_rate": 0.0, "loss": 2.9268, "step": 1 }, { "epoch": 0.003401360544217687, "eval_loss": 2.5302913188934326, "eval_runtime": 3.7953, "eval_samples_per_second": 14.492, "eval_steps_per_second": 1.054, "step": 1 }, { "epoch": 0.006802721088435374, "grad_norm": 57.97506009705182, "learning_rate": 6.89655172413793e-08, "loss": 2.0122, "step": 2 }, { "epoch": 0.01020408163265306, "grad_norm": 116.23413141145363, "learning_rate": 1.379310344827586e-07, "loss": 2.6743, "step": 3 }, { "epoch": 0.013605442176870748, "grad_norm": 21.262801374024775, "learning_rate": 2.0689655172413793e-07, "loss": 2.0743, "step": 4 }, { "epoch": 0.017006802721088437, "grad_norm": 59.319984755304056, "learning_rate": 2.758620689655172e-07, "loss": 2.2775, "step": 5 }, { "epoch": 0.02040816326530612, "grad_norm": 159.51320885432614, "learning_rate": 3.4482758620689656e-07, "loss": 2.1337, "step": 6 }, { "epoch": 0.023809523809523808, "grad_norm": 87.93970940325055, "learning_rate": 4.1379310344827586e-07, "loss": 1.9061, "step": 7 }, { "epoch": 0.027210884353741496, "grad_norm": 61.133777808660895, "learning_rate": 4.827586206896552e-07, "loss": 1.8118, "step": 8 }, { "epoch": 0.030612244897959183, "grad_norm": 48.65887299035499, "learning_rate": 5.517241379310344e-07, "loss": 3.4095, "step": 9 }, { "epoch": 0.034013605442176874, "grad_norm": 30.592687909719288, "learning_rate": 6.206896551724138e-07, "loss": 2.2398, "step": 10 }, { "epoch": 0.03741496598639456, "grad_norm": 74.15295766799099, "learning_rate": 6.896551724137931e-07, "loss": 3.4425, "step": 11 }, { "epoch": 0.04081632653061224, "grad_norm": 34.94892634385338, "learning_rate": 7.586206896551724e-07, "loss": 2.5405, "step": 12 }, { "epoch": 0.04421768707482993, "grad_norm": 26.538521745061775, "learning_rate": 8.275862068965517e-07, "loss": 1.9614, "step": 13 }, { "epoch": 0.047619047619047616, "grad_norm": 52.23979896259082, "learning_rate": 8.96551724137931e-07, "loss": 2.9785, "step": 14 }, { "epoch": 0.05102040816326531, "grad_norm": 30.812143999051266, "learning_rate": 9.655172413793103e-07, "loss": 2.0185, "step": 15 }, { "epoch": 0.05442176870748299, "grad_norm": 41.48478088374125, "learning_rate": 1.0344827586206896e-06, "loss": 2.1126, "step": 16 }, { "epoch": 0.05782312925170068, "grad_norm": 29.347588210089675, "learning_rate": 1.1034482758620688e-06, "loss": 2.2078, "step": 17 }, { "epoch": 0.061224489795918366, "grad_norm": 28.947554594850924, "learning_rate": 1.172413793103448e-06, "loss": 2.442, "step": 18 }, { "epoch": 0.06462585034013606, "grad_norm": 32.28592513881342, "learning_rate": 1.2413793103448275e-06, "loss": 2.8683, "step": 19 }, { "epoch": 0.06802721088435375, "grad_norm": 38.97631997775744, "learning_rate": 1.3103448275862068e-06, "loss": 2.4376, "step": 20 }, { "epoch": 0.07142857142857142, "grad_norm": 43.775478156068516, "learning_rate": 1.3793103448275862e-06, "loss": 2.4167, "step": 21 }, { "epoch": 0.07482993197278912, "grad_norm": 30.904260805899465, "learning_rate": 1.4482758620689655e-06, "loss": 2.6971, "step": 22 }, { "epoch": 0.0782312925170068, "grad_norm": 48.202871069183985, "learning_rate": 1.5172413793103447e-06, "loss": 2.5093, "step": 23 }, { "epoch": 0.08163265306122448, "grad_norm": 55.067186300198706, "learning_rate": 1.5862068965517242e-06, "loss": 2.0053, "step": 24 }, { "epoch": 0.08503401360544217, "grad_norm": 38.486811757681096, "learning_rate": 1.6551724137931035e-06, "loss": 2.2475, "step": 25 }, { "epoch": 0.08843537414965986, "grad_norm": 90.78568630900098, "learning_rate": 1.7241379310344825e-06, "loss": 3.8342, "step": 26 }, { "epoch": 0.09183673469387756, "grad_norm": 23.32050516158788, "learning_rate": 1.793103448275862e-06, "loss": 2.2496, "step": 27 }, { "epoch": 0.09523809523809523, "grad_norm": 25.01047005218693, "learning_rate": 1.8620689655172412e-06, "loss": 2.6991, "step": 28 }, { "epoch": 0.09863945578231292, "grad_norm": 27.40209208002175, "learning_rate": 1.9310344827586207e-06, "loss": 2.7017, "step": 29 }, { "epoch": 0.10204081632653061, "grad_norm": 16.372774250078056, "learning_rate": 2e-06, "loss": 2.1315, "step": 30 }, { "epoch": 0.1054421768707483, "grad_norm": 34.32100924763162, "learning_rate": 1.999984207714351e-06, "loss": 2.4298, "step": 31 }, { "epoch": 0.10884353741496598, "grad_norm": 49.15042168439896, "learning_rate": 1.9999368313561964e-06, "loss": 3.1687, "step": 32 }, { "epoch": 0.11224489795918367, "grad_norm": 27.553221322487154, "learning_rate": 1.9998578724218984e-06, "loss": 2.307, "step": 33 }, { "epoch": 0.11564625850340136, "grad_norm": 25.29898708562965, "learning_rate": 1.999747333405341e-06, "loss": 2.6711, "step": 34 }, { "epoch": 0.11904761904761904, "grad_norm": 35.13639034121329, "learning_rate": 1.9996052177978517e-06, "loss": 2.2923, "step": 35 }, { "epoch": 0.12244897959183673, "grad_norm": 61.904951168823246, "learning_rate": 1.999431530088091e-06, "loss": 3.0837, "step": 36 }, { "epoch": 0.12585034013605442, "grad_norm": 43.72931173152359, "learning_rate": 1.9992262757619108e-06, "loss": 2.9055, "step": 37 }, { "epoch": 0.12585034013605442, "eval_loss": 2.2881884574890137, "eval_runtime": 3.7387, "eval_samples_per_second": 14.711, "eval_steps_per_second": 1.07, "step": 37 }, { "epoch": 0.1292517006802721, "grad_norm": 75.128224809043, "learning_rate": 1.9989894613021807e-06, "loss": 3.9717, "step": 38 }, { "epoch": 0.1326530612244898, "grad_norm": 6.423556290490496, "learning_rate": 1.998721094188584e-06, "loss": 1.6634, "step": 39 }, { "epoch": 0.1360544217687075, "grad_norm": 8.952452652609857, "learning_rate": 1.9984211828973816e-06, "loss": 2.1183, "step": 40 }, { "epoch": 0.13945578231292516, "grad_norm": 12.837161899787583, "learning_rate": 1.998089736901142e-06, "loss": 2.1306, "step": 41 }, { "epoch": 0.14285714285714285, "grad_norm": 7.2779063942957825, "learning_rate": 1.9977267666684456e-06, "loss": 1.9831, "step": 42 }, { "epoch": 0.14625850340136054, "grad_norm": 30.288569770228293, "learning_rate": 1.9973322836635515e-06, "loss": 2.1869, "step": 43 }, { "epoch": 0.14965986394557823, "grad_norm": 11.672608976353168, "learning_rate": 1.996906300346036e-06, "loss": 1.9566, "step": 44 }, { "epoch": 0.15306122448979592, "grad_norm": 14.837719065187358, "learning_rate": 1.9964488301704e-06, "loss": 2.2152, "step": 45 }, { "epoch": 0.1564625850340136, "grad_norm": 18.558600033713702, "learning_rate": 1.9959598875856427e-06, "loss": 2.06, "step": 46 }, { "epoch": 0.1598639455782313, "grad_norm": 17.161073648503006, "learning_rate": 1.995439488034806e-06, "loss": 2.0463, "step": 47 }, { "epoch": 0.16326530612244897, "grad_norm": 10.944090642041195, "learning_rate": 1.994887647954486e-06, "loss": 1.9676, "step": 48 }, { "epoch": 0.16666666666666666, "grad_norm": 30.260773919516463, "learning_rate": 1.9943043847743164e-06, "loss": 2.4235, "step": 49 }, { "epoch": 0.17006802721088435, "grad_norm": 17.95874457178673, "learning_rate": 1.9936897169164135e-06, "loss": 2.4211, "step": 50 }, { "epoch": 0.17346938775510204, "grad_norm": 29.32804844947439, "learning_rate": 1.993043663794799e-06, "loss": 2.2786, "step": 51 }, { "epoch": 0.17687074829931973, "grad_norm": 31.224760731119037, "learning_rate": 1.9923662458147826e-06, "loss": 2.8374, "step": 52 }, { "epoch": 0.18027210884353742, "grad_norm": 4.5045539325043205, "learning_rate": 1.9916574843723217e-06, "loss": 1.6301, "step": 53 }, { "epoch": 0.1836734693877551, "grad_norm": 10.827050277516674, "learning_rate": 1.9909174018533427e-06, "loss": 2.0554, "step": 54 }, { "epoch": 0.1870748299319728, "grad_norm": 17.063187262605883, "learning_rate": 1.990146021633034e-06, "loss": 2.4202, "step": 55 }, { "epoch": 0.19047619047619047, "grad_norm": 3.946679947433292, "learning_rate": 1.98934336807511e-06, "loss": 1.7808, "step": 56 }, { "epoch": 0.19387755102040816, "grad_norm": 8.431222224384186, "learning_rate": 1.9885094665310388e-06, "loss": 1.7766, "step": 57 }, { "epoch": 0.19727891156462585, "grad_norm": 32.28667139462841, "learning_rate": 1.9876443433392433e-06, "loss": 2.2299, "step": 58 }, { "epoch": 0.20068027210884354, "grad_norm": 11.950555724182584, "learning_rate": 1.986748025824268e-06, "loss": 1.928, "step": 59 }, { "epoch": 0.20408163265306123, "grad_norm": 3.6059136679066977, "learning_rate": 1.985820542295918e-06, "loss": 1.7761, "step": 60 }, { "epoch": 0.20748299319727892, "grad_norm": 41.40947345983446, "learning_rate": 1.984861922048363e-06, "loss": 2.6704, "step": 61 }, { "epoch": 0.2108843537414966, "grad_norm": 30.634237938465816, "learning_rate": 1.983872195359212e-06, "loss": 2.7336, "step": 62 }, { "epoch": 0.21428571428571427, "grad_norm": 3.760013022701194, "learning_rate": 1.9828513934885587e-06, "loss": 1.8831, "step": 63 }, { "epoch": 0.21768707482993196, "grad_norm": 37.34059674722221, "learning_rate": 1.981799548677993e-06, "loss": 2.27, "step": 64 }, { "epoch": 0.22108843537414966, "grad_norm": 11.009700618421736, "learning_rate": 1.980716694149581e-06, "loss": 1.9265, "step": 65 }, { "epoch": 0.22448979591836735, "grad_norm": 17.609147027884987, "learning_rate": 1.9796028641048194e-06, "loss": 2.3411, "step": 66 }, { "epoch": 0.22789115646258504, "grad_norm": 17.432142291951372, "learning_rate": 1.978458093723553e-06, "loss": 2.2213, "step": 67 }, { "epoch": 0.23129251700680273, "grad_norm": 14.11664326231067, "learning_rate": 1.9772824191628632e-06, "loss": 2.0831, "step": 68 }, { "epoch": 0.23469387755102042, "grad_norm": 37.456025944063875, "learning_rate": 1.9760758775559273e-06, "loss": 2.7494, "step": 69 }, { "epoch": 0.23809523809523808, "grad_norm": 16.30994509129653, "learning_rate": 1.974838507010844e-06, "loss": 2.118, "step": 70 }, { "epoch": 0.24149659863945577, "grad_norm": 25.92468917111241, "learning_rate": 1.9735703466094324e-06, "loss": 2.1656, "step": 71 }, { "epoch": 0.24489795918367346, "grad_norm": 17.23253832018251, "learning_rate": 1.972271436405994e-06, "loss": 2.0787, "step": 72 }, { "epoch": 0.24829931972789115, "grad_norm": 6.286286593272188, "learning_rate": 1.970941817426052e-06, "loss": 1.7458, "step": 73 }, { "epoch": 0.25170068027210885, "grad_norm": 20.87004487229478, "learning_rate": 1.969581531665051e-06, "loss": 2.364, "step": 74 }, { "epoch": 0.25170068027210885, "eval_loss": 2.240875482559204, "eval_runtime": 3.7328, "eval_samples_per_second": 14.734, "eval_steps_per_second": 1.072, "step": 74 }, { "epoch": 0.25510204081632654, "grad_norm": 22.83815781491435, "learning_rate": 1.968190622087034e-06, "loss": 2.2176, "step": 75 }, { "epoch": 0.2585034013605442, "grad_norm": 39.2204163613504, "learning_rate": 1.9667691326232835e-06, "loss": 2.605, "step": 76 }, { "epoch": 0.2619047619047619, "grad_norm": 9.599486970591897, "learning_rate": 1.965317108170935e-06, "loss": 2.1652, "step": 77 }, { "epoch": 0.2653061224489796, "grad_norm": 3.7571781853463175, "learning_rate": 1.9638345945915586e-06, "loss": 1.6055, "step": 78 }, { "epoch": 0.2687074829931973, "grad_norm": 7.064670527473922, "learning_rate": 1.962321638709709e-06, "loss": 1.9937, "step": 79 }, { "epoch": 0.272108843537415, "grad_norm": 28.207901160479654, "learning_rate": 1.9607782883114506e-06, "loss": 2.2552, "step": 80 }, { "epoch": 0.2755102040816326, "grad_norm": 15.991872570963396, "learning_rate": 1.959204592142843e-06, "loss": 2.1559, "step": 81 }, { "epoch": 0.2789115646258503, "grad_norm": 13.401822104278665, "learning_rate": 1.957600599908406e-06, "loss": 2.1652, "step": 82 }, { "epoch": 0.282312925170068, "grad_norm": 14.708704691038701, "learning_rate": 1.9559663622695455e-06, "loss": 1.9673, "step": 83 }, { "epoch": 0.2857142857142857, "grad_norm": 3.3458550475032105, "learning_rate": 1.954301930842958e-06, "loss": 1.6917, "step": 84 }, { "epoch": 0.2891156462585034, "grad_norm": 3.479853146114766, "learning_rate": 1.9526073581989955e-06, "loss": 1.624, "step": 85 }, { "epoch": 0.2925170068027211, "grad_norm": 25.10854427551898, "learning_rate": 1.950882697860009e-06, "loss": 2.3626, "step": 86 }, { "epoch": 0.29591836734693877, "grad_norm": 14.389114459997433, "learning_rate": 1.9491280042986562e-06, "loss": 2.0549, "step": 87 }, { "epoch": 0.29931972789115646, "grad_norm": 17.72897272235088, "learning_rate": 1.9473433329361802e-06, "loss": 2.4525, "step": 88 }, { "epoch": 0.30272108843537415, "grad_norm": 8.212788560084723, "learning_rate": 1.945528740140662e-06, "loss": 2.1368, "step": 89 }, { "epoch": 0.30612244897959184, "grad_norm": 26.76274867022125, "learning_rate": 1.943684283225236e-06, "loss": 2.3735, "step": 90 }, { "epoch": 0.30952380952380953, "grad_norm": 23.71630229663243, "learning_rate": 1.941810020446284e-06, "loss": 2.6005, "step": 91 }, { "epoch": 0.3129251700680272, "grad_norm": 22.889738702248234, "learning_rate": 1.9399060110015917e-06, "loss": 2.6924, "step": 92 }, { "epoch": 0.3163265306122449, "grad_norm": 32.54631787971477, "learning_rate": 1.9379723150284814e-06, "loss": 2.5301, "step": 93 }, { "epoch": 0.3197278911564626, "grad_norm": 3.6877224549117344, "learning_rate": 1.936008993601912e-06, "loss": 1.6556, "step": 94 }, { "epoch": 0.3231292517006803, "grad_norm": 33.682920637388364, "learning_rate": 1.934016108732548e-06, "loss": 2.3709, "step": 95 }, { "epoch": 0.32653061224489793, "grad_norm": 19.342157148675135, "learning_rate": 1.9319937233648045e-06, "loss": 1.8713, "step": 96 }, { "epoch": 0.3299319727891156, "grad_norm": 36.9446891807536, "learning_rate": 1.929941901374856e-06, "loss": 3.1666, "step": 97 }, { "epoch": 0.3333333333333333, "grad_norm": 12.769242612326224, "learning_rate": 1.9278607075686205e-06, "loss": 2.2024, "step": 98 }, { "epoch": 0.336734693877551, "grad_norm": 7.569149644914372, "learning_rate": 1.9257502076797123e-06, "loss": 1.8434, "step": 99 }, { "epoch": 0.3401360544217687, "grad_norm": 18.672166864254265, "learning_rate": 1.9236104683673653e-06, "loss": 2.6262, "step": 100 }, { "epoch": 0.3435374149659864, "grad_norm": 7.251393661314555, "learning_rate": 1.9214415572143284e-06, "loss": 1.8447, "step": 101 }, { "epoch": 0.3469387755102041, "grad_norm": 25.8588617341962, "learning_rate": 1.919243542724731e-06, "loss": 2.3528, "step": 102 }, { "epoch": 0.35034013605442177, "grad_norm": 21.00339285362203, "learning_rate": 1.917016494321918e-06, "loss": 2.462, "step": 103 }, { "epoch": 0.35374149659863946, "grad_norm": 19.533037226832878, "learning_rate": 1.9147604823462585e-06, "loss": 2.3057, "step": 104 }, { "epoch": 0.35714285714285715, "grad_norm": 3.1087327492999286, "learning_rate": 1.9124755780529243e-06, "loss": 1.6935, "step": 105 }, { "epoch": 0.36054421768707484, "grad_norm": 35.707396347148176, "learning_rate": 1.910161853609637e-06, "loss": 2.3652, "step": 106 }, { "epoch": 0.36394557823129253, "grad_norm": 16.694934440145225, "learning_rate": 1.9078193820943916e-06, "loss": 2.6014, "step": 107 }, { "epoch": 0.3673469387755102, "grad_norm": 12.946146725042743, "learning_rate": 1.9054482374931466e-06, "loss": 1.9379, "step": 108 }, { "epoch": 0.3707482993197279, "grad_norm": 8.740650008889842, "learning_rate": 1.9030484946974878e-06, "loss": 1.9414, "step": 109 }, { "epoch": 0.3741496598639456, "grad_norm": 23.13581690576701, "learning_rate": 1.9006202295022629e-06, "loss": 2.4563, "step": 110 }, { "epoch": 0.37755102040816324, "grad_norm": 10.00026809536462, "learning_rate": 1.8981635186031869e-06, "loss": 1.8384, "step": 111 }, { "epoch": 0.37755102040816324, "eval_loss": 2.2185332775115967, "eval_runtime": 3.7603, "eval_samples_per_second": 14.626, "eval_steps_per_second": 1.064, "step": 111 }, { "epoch": 0.38095238095238093, "grad_norm": 26.376801704138895, "learning_rate": 1.89567843959442e-06, "loss": 3.095, "step": 112 }, { "epoch": 0.3843537414965986, "grad_norm": 31.801160647661863, "learning_rate": 1.8931650709661176e-06, "loss": 2.4186, "step": 113 }, { "epoch": 0.3877551020408163, "grad_norm": 3.7202396333724406, "learning_rate": 1.8906234921019504e-06, "loss": 1.8483, "step": 114 }, { "epoch": 0.391156462585034, "grad_norm": 20.22060079238643, "learning_rate": 1.8880537832765975e-06, "loss": 2.1247, "step": 115 }, { "epoch": 0.3945578231292517, "grad_norm": 29.233218070907714, "learning_rate": 1.8854560256532098e-06, "loss": 2.3962, "step": 116 }, { "epoch": 0.3979591836734694, "grad_norm": 12.311196195760077, "learning_rate": 1.882830301280849e-06, "loss": 1.9291, "step": 117 }, { "epoch": 0.4013605442176871, "grad_norm": 24.022251844658836, "learning_rate": 1.880176693091893e-06, "loss": 2.0967, "step": 118 }, { "epoch": 0.40476190476190477, "grad_norm": 15.5145598820515, "learning_rate": 1.8774952848994193e-06, "loss": 2.0164, "step": 119 }, { "epoch": 0.40816326530612246, "grad_norm": 18.669552144287866, "learning_rate": 1.874786161394556e-06, "loss": 1.9074, "step": 120 }, { "epoch": 0.41156462585034015, "grad_norm": 20.221669243742017, "learning_rate": 1.8720494081438077e-06, "loss": 2.0693, "step": 121 }, { "epoch": 0.41496598639455784, "grad_norm": 40.16853982486705, "learning_rate": 1.8692851115863521e-06, "loss": 2.7133, "step": 122 }, { "epoch": 0.41836734693877553, "grad_norm": 28.130765299643805, "learning_rate": 1.8664933590313116e-06, "loss": 2.3678, "step": 123 }, { "epoch": 0.4217687074829932, "grad_norm": 3.285521259165442, "learning_rate": 1.8636742386549936e-06, "loss": 1.643, "step": 124 }, { "epoch": 0.42517006802721086, "grad_norm": 14.918765530830019, "learning_rate": 1.8608278394981065e-06, "loss": 2.2832, "step": 125 }, { "epoch": 0.42857142857142855, "grad_norm": 3.221047286582191, "learning_rate": 1.8579542514629471e-06, "loss": 1.7598, "step": 126 }, { "epoch": 0.43197278911564624, "grad_norm": 30.02563146393063, "learning_rate": 1.8550535653105621e-06, "loss": 2.2684, "step": 127 }, { "epoch": 0.43537414965986393, "grad_norm": 14.894051195947721, "learning_rate": 1.8521258726578802e-06, "loss": 2.2898, "step": 128 }, { "epoch": 0.4387755102040816, "grad_norm": 31.346174242632404, "learning_rate": 1.849171265974818e-06, "loss": 2.4443, "step": 129 }, { "epoch": 0.4421768707482993, "grad_norm": 18.396976082720574, "learning_rate": 1.846189838581362e-06, "loss": 2.4081, "step": 130 }, { "epoch": 0.445578231292517, "grad_norm": 11.300098238275778, "learning_rate": 1.843181684644617e-06, "loss": 1.9707, "step": 131 }, { "epoch": 0.4489795918367347, "grad_norm": 9.311622064720812, "learning_rate": 1.8401468991758364e-06, "loss": 2.0055, "step": 132 }, { "epoch": 0.4523809523809524, "grad_norm": 17.268118260619143, "learning_rate": 1.837085578027418e-06, "loss": 2.1029, "step": 133 }, { "epoch": 0.4557823129251701, "grad_norm": 13.534018757700077, "learning_rate": 1.833997817889878e-06, "loss": 1.6714, "step": 134 }, { "epoch": 0.45918367346938777, "grad_norm": 25.67291091851184, "learning_rate": 1.8308837162887962e-06, "loss": 2.0809, "step": 135 }, { "epoch": 0.46258503401360546, "grad_norm": 16.78554391811326, "learning_rate": 1.827743371581737e-06, "loss": 2.095, "step": 136 }, { "epoch": 0.46598639455782315, "grad_norm": 7.0895304724541175, "learning_rate": 1.8245768829551415e-06, "loss": 2.0924, "step": 137 }, { "epoch": 0.46938775510204084, "grad_norm": 28.325113542255774, "learning_rate": 1.8213843504211956e-06, "loss": 2.2312, "step": 138 }, { "epoch": 0.47278911564625853, "grad_norm": 19.627621449351967, "learning_rate": 1.8181658748146709e-06, "loss": 2.1092, "step": 139 }, { "epoch": 0.47619047619047616, "grad_norm": 3.253642214201976, "learning_rate": 1.8149215577897394e-06, "loss": 1.8119, "step": 140 }, { "epoch": 0.47959183673469385, "grad_norm": 22.194249754011054, "learning_rate": 1.8116515018167635e-06, "loss": 1.8086, "step": 141 }, { "epoch": 0.48299319727891155, "grad_norm": 3.291628206622755, "learning_rate": 1.8083558101790595e-06, "loss": 1.6961, "step": 142 }, { "epoch": 0.48639455782312924, "grad_norm": 30.333797331495706, "learning_rate": 1.8050345869696346e-06, "loss": 2.4649, "step": 143 }, { "epoch": 0.4897959183673469, "grad_norm": 35.46381155966904, "learning_rate": 1.8016879370879004e-06, "loss": 2.375, "step": 144 }, { "epoch": 0.4931972789115646, "grad_norm": 10.065027530577671, "learning_rate": 1.798315966236358e-06, "loss": 1.7088, "step": 145 }, { "epoch": 0.4965986394557823, "grad_norm": 31.969238069641904, "learning_rate": 1.794918780917262e-06, "loss": 2.2722, "step": 146 }, { "epoch": 0.5, "grad_norm": 3.1706943713916287, "learning_rate": 1.791496488429254e-06, "loss": 1.5129, "step": 147 }, { "epoch": 0.5034013605442177, "grad_norm": 40.129409477941664, "learning_rate": 1.7880491968639751e-06, "loss": 2.8429, "step": 148 }, { "epoch": 0.5034013605442177, "eval_loss": 2.2053215503692627, "eval_runtime": 3.8702, "eval_samples_per_second": 14.211, "eval_steps_per_second": 1.034, "step": 148 }, { "epoch": 0.5068027210884354, "grad_norm": 26.985890370710862, "learning_rate": 1.7845770151026513e-06, "loss": 2.3221, "step": 149 }, { "epoch": 0.5102040816326531, "grad_norm": 34.746114296368646, "learning_rate": 1.7810800528126553e-06, "loss": 2.3499, "step": 150 }, { "epoch": 0.5136054421768708, "grad_norm": 3.902076154967714, "learning_rate": 1.7775584204440416e-06, "loss": 1.7411, "step": 151 }, { "epoch": 0.5170068027210885, "grad_norm": 27.80193827038684, "learning_rate": 1.7740122292260594e-06, "loss": 2.2895, "step": 152 }, { "epoch": 0.5204081632653061, "grad_norm": 3.4114906810600685, "learning_rate": 1.7704415911636375e-06, "loss": 1.5119, "step": 153 }, { "epoch": 0.5238095238095238, "grad_norm": 9.505522369554297, "learning_rate": 1.7668466190338483e-06, "loss": 1.844, "step": 154 }, { "epoch": 0.5272108843537415, "grad_norm": 36.46998151934392, "learning_rate": 1.7632274263823457e-06, "loss": 2.4713, "step": 155 }, { "epoch": 0.5306122448979592, "grad_norm": 17.765108257489125, "learning_rate": 1.759584127519778e-06, "loss": 2.2811, "step": 156 }, { "epoch": 0.5340136054421769, "grad_norm": 14.148223114236801, "learning_rate": 1.7559168375181775e-06, "loss": 1.8442, "step": 157 }, { "epoch": 0.5374149659863946, "grad_norm": 9.76402372234183, "learning_rate": 1.7522256722073273e-06, "loss": 1.8945, "step": 158 }, { "epoch": 0.5408163265306123, "grad_norm": 16.450896799860217, "learning_rate": 1.748510748171101e-06, "loss": 1.9574, "step": 159 }, { "epoch": 0.54421768707483, "grad_norm": 3.912613042056259, "learning_rate": 1.7447721827437819e-06, "loss": 1.6032, "step": 160 }, { "epoch": 0.5476190476190477, "grad_norm": 33.305605159021646, "learning_rate": 1.7410100940063558e-06, "loss": 2.4057, "step": 161 }, { "epoch": 0.5510204081632653, "grad_norm": 38.319973023280475, "learning_rate": 1.7372246007827833e-06, "loss": 2.5925, "step": 162 }, { "epoch": 0.5544217687074829, "grad_norm": 17.216523524482163, "learning_rate": 1.7334158226362446e-06, "loss": 2.0324, "step": 163 }, { "epoch": 0.5578231292517006, "grad_norm": 4.9862323362748535, "learning_rate": 1.7295838798653649e-06, "loss": 1.7436, "step": 164 }, { "epoch": 0.5612244897959183, "grad_norm": 4.0759355613648625, "learning_rate": 1.7257288935004132e-06, "loss": 1.7034, "step": 165 }, { "epoch": 0.564625850340136, "grad_norm": 16.519960341878562, "learning_rate": 1.7218509852994822e-06, "loss": 2.115, "step": 166 }, { "epoch": 0.5680272108843537, "grad_norm": 17.37824200525593, "learning_rate": 1.7179502777446392e-06, "loss": 2.0609, "step": 167 }, { "epoch": 0.5714285714285714, "grad_norm": 39.604264809847564, "learning_rate": 1.7140268940380605e-06, "loss": 2.3861, "step": 168 }, { "epoch": 0.5748299319727891, "grad_norm": 17.489048911326037, "learning_rate": 1.7100809580981384e-06, "loss": 1.9979, "step": 169 }, { "epoch": 0.5782312925170068, "grad_norm": 6.642641185839537, "learning_rate": 1.7061125945555679e-06, "loss": 1.7533, "step": 170 }, { "epoch": 0.5816326530612245, "grad_norm": 41.437166409250736, "learning_rate": 1.70212192874941e-06, "loss": 2.8676, "step": 171 }, { "epoch": 0.5850340136054422, "grad_norm": 12.285090452877482, "learning_rate": 1.6981090867231336e-06, "loss": 1.8715, "step": 172 }, { "epoch": 0.5884353741496599, "grad_norm": 20.351266920257437, "learning_rate": 1.694074195220634e-06, "loss": 2.5238, "step": 173 }, { "epoch": 0.5918367346938775, "grad_norm": 13.128678816386138, "learning_rate": 1.6900173816822289e-06, "loss": 1.7191, "step": 174 }, { "epoch": 0.5952380952380952, "grad_norm": 3.1331026154409565, "learning_rate": 1.6859387742406358e-06, "loss": 1.7885, "step": 175 }, { "epoch": 0.5986394557823129, "grad_norm": 12.273944679120639, "learning_rate": 1.6818385017169212e-06, "loss": 1.9361, "step": 176 }, { "epoch": 0.6020408163265306, "grad_norm": 18.988287394873876, "learning_rate": 1.6777166936164354e-06, "loss": 2.118, "step": 177 }, { "epoch": 0.6054421768707483, "grad_norm": 13.330413347581118, "learning_rate": 1.6735734801247202e-06, "loss": 1.9923, "step": 178 }, { "epoch": 0.608843537414966, "grad_norm": 8.528660885149025, "learning_rate": 1.6694089921033976e-06, "loss": 1.6938, "step": 179 }, { "epoch": 0.6122448979591837, "grad_norm": 28.049589150374253, "learning_rate": 1.6652233610860364e-06, "loss": 2.4092, "step": 180 }, { "epoch": 0.6156462585034014, "grad_norm": 19.077236893577115, "learning_rate": 1.6610167192739978e-06, "loss": 2.3235, "step": 181 }, { "epoch": 0.6190476190476191, "grad_norm": 23.109888095114325, "learning_rate": 1.6567891995322603e-06, "loss": 2.2678, "step": 182 }, { "epoch": 0.6224489795918368, "grad_norm": 19.456776496200867, "learning_rate": 1.6525409353852221e-06, "loss": 2.2764, "step": 183 }, { "epoch": 0.6258503401360545, "grad_norm": 9.82404206796416, "learning_rate": 1.6482720610124856e-06, "loss": 1.8034, "step": 184 }, { "epoch": 0.6292517006802721, "grad_norm": 24.2061776724548, "learning_rate": 1.6439827112446173e-06, "loss": 2.161, "step": 185 }, { "epoch": 0.6292517006802721, "eval_loss": 2.194326400756836, "eval_runtime": 3.7428, "eval_samples_per_second": 14.695, "eval_steps_per_second": 1.069, "step": 185 }, { "epoch": 0.6326530612244898, "grad_norm": 30.469163171671003, "learning_rate": 1.6396730215588912e-06, "loss": 2.2773, "step": 186 }, { "epoch": 0.6360544217687075, "grad_norm": 3.646917584621385, "learning_rate": 1.6353431280750082e-06, "loss": 1.5989, "step": 187 }, { "epoch": 0.6394557823129252, "grad_norm": 30.30266588230692, "learning_rate": 1.6309931675507978e-06, "loss": 2.6169, "step": 188 }, { "epoch": 0.6428571428571429, "grad_norm": 14.371186117614542, "learning_rate": 1.6266232773778983e-06, "loss": 1.9241, "step": 189 }, { "epoch": 0.6462585034013606, "grad_norm": 18.71258411403636, "learning_rate": 1.6222335955774176e-06, "loss": 2.1737, "step": 190 }, { "epoch": 0.6496598639455783, "grad_norm": 3.2723339662931585, "learning_rate": 1.617824260795573e-06, "loss": 1.8075, "step": 191 }, { "epoch": 0.6530612244897959, "grad_norm": 16.496061968286824, "learning_rate": 1.6133954122993139e-06, "loss": 2.0147, "step": 192 }, { "epoch": 0.6564625850340136, "grad_norm": 3.2013079969624805, "learning_rate": 1.608947189971921e-06, "loss": 1.6798, "step": 193 }, { "epoch": 0.6598639455782312, "grad_norm": 20.981814890242124, "learning_rate": 1.6044797343085898e-06, "loss": 2.0425, "step": 194 }, { "epoch": 0.6632653061224489, "grad_norm": 50.879018823375965, "learning_rate": 1.599993186411992e-06, "loss": 3.8504, "step": 195 }, { "epoch": 0.6666666666666666, "grad_norm": 3.283241794235971, "learning_rate": 1.59548768798782e-06, "loss": 1.4971, "step": 196 }, { "epoch": 0.6700680272108843, "grad_norm": 12.706772022061763, "learning_rate": 1.5909633813403092e-06, "loss": 1.9318, "step": 197 }, { "epoch": 0.673469387755102, "grad_norm": 7.747043673117189, "learning_rate": 1.5864204093677463e-06, "loss": 1.8641, "step": 198 }, { "epoch": 0.6768707482993197, "grad_norm": 12.685665761738797, "learning_rate": 1.5818589155579529e-06, "loss": 2.0781, "step": 199 }, { "epoch": 0.6802721088435374, "grad_norm": 8.183695796856302, "learning_rate": 1.5772790439837555e-06, "loss": 2.1112, "step": 200 }, { "epoch": 0.6836734693877551, "grad_norm": 3.6436475976280605, "learning_rate": 1.572680939298435e-06, "loss": 1.504, "step": 201 }, { "epoch": 0.6870748299319728, "grad_norm": 7.765753459491514, "learning_rate": 1.5680647467311555e-06, "loss": 1.6113, "step": 202 }, { "epoch": 0.6904761904761905, "grad_norm": 27.059590789587673, "learning_rate": 1.563430612082382e-06, "loss": 2.3797, "step": 203 }, { "epoch": 0.6938775510204082, "grad_norm": 17.865181616406808, "learning_rate": 1.5587786817192687e-06, "loss": 2.2287, "step": 204 }, { "epoch": 0.6972789115646258, "grad_norm": 11.50437842198177, "learning_rate": 1.5541091025710434e-06, "loss": 2.2926, "step": 205 }, { "epoch": 0.7006802721088435, "grad_norm": 18.03962056520961, "learning_rate": 1.5494220221243607e-06, "loss": 2.3374, "step": 206 }, { "epoch": 0.7040816326530612, "grad_norm": 19.808732477248256, "learning_rate": 1.5447175884186478e-06, "loss": 2.3215, "step": 207 }, { "epoch": 0.7074829931972789, "grad_norm": 21.35228597761302, "learning_rate": 1.539995950041426e-06, "loss": 2.2378, "step": 208 }, { "epoch": 0.7108843537414966, "grad_norm": 14.090932946927257, "learning_rate": 1.5352572561236197e-06, "loss": 2.22, "step": 209 }, { "epoch": 0.7142857142857143, "grad_norm": 22.22875395969964, "learning_rate": 1.5305016563348443e-06, "loss": 2.44, "step": 210 }, { "epoch": 0.717687074829932, "grad_norm": 12.732771656478363, "learning_rate": 1.5257293008786807e-06, "loss": 2.0598, "step": 211 }, { "epoch": 0.7210884353741497, "grad_norm": 3.3024595151809777, "learning_rate": 1.5209403404879303e-06, "loss": 1.8514, "step": 212 }, { "epoch": 0.7244897959183674, "grad_norm": 31.041628605811148, "learning_rate": 1.5161349264198535e-06, "loss": 2.4225, "step": 213 }, { "epoch": 0.7278911564625851, "grad_norm": 11.866017531018645, "learning_rate": 1.511313210451394e-06, "loss": 1.9747, "step": 214 }, { "epoch": 0.7312925170068028, "grad_norm": 23.77867996796224, "learning_rate": 1.5064753448743832e-06, "loss": 2.0971, "step": 215 }, { "epoch": 0.7346938775510204, "grad_norm": 28.640512428374876, "learning_rate": 1.5016214824907314e-06, "loss": 2.2247, "step": 216 }, { "epoch": 0.7380952380952381, "grad_norm": 9.463317499162777, "learning_rate": 1.4967517766076015e-06, "loss": 1.9511, "step": 217 }, { "epoch": 0.7414965986394558, "grad_norm": 3.6132074342008336, "learning_rate": 1.4918663810325659e-06, "loss": 1.5643, "step": 218 }, { "epoch": 0.7448979591836735, "grad_norm": 12.274299577611806, "learning_rate": 1.4869654500687492e-06, "loss": 2.0865, "step": 219 }, { "epoch": 0.7482993197278912, "grad_norm": 9.577269499797044, "learning_rate": 1.4820491385099555e-06, "loss": 2.1494, "step": 220 }, { "epoch": 0.7517006802721088, "grad_norm": 13.665325186622818, "learning_rate": 1.477117601635777e-06, "loss": 2.0676, "step": 221 }, { "epoch": 0.7551020408163265, "grad_norm": 12.044556166373619, "learning_rate": 1.4721709952066923e-06, "loss": 1.7408, "step": 222 }, { "epoch": 0.7551020408163265, "eval_loss": 2.1867611408233643, "eval_runtime": 3.7388, "eval_samples_per_second": 14.711, "eval_steps_per_second": 1.07, "step": 222 }, { "epoch": 0.7585034013605442, "grad_norm": 32.0866216128451, "learning_rate": 1.4672094754591449e-06, "loss": 2.6444, "step": 223 }, { "epoch": 0.7619047619047619, "grad_norm": 26.272890838528287, "learning_rate": 1.4622331991006082e-06, "loss": 2.0286, "step": 224 }, { "epoch": 0.7653061224489796, "grad_norm": 10.948966043777636, "learning_rate": 1.4572423233046385e-06, "loss": 1.8924, "step": 225 }, { "epoch": 0.7687074829931972, "grad_norm": 10.041220633719293, "learning_rate": 1.4522370057059079e-06, "loss": 1.8589, "step": 226 }, { "epoch": 0.7721088435374149, "grad_norm": 19.90849856575333, "learning_rate": 1.447217404395227e-06, "loss": 2.4632, "step": 227 }, { "epoch": 0.7755102040816326, "grad_norm": 3.3718807752757134, "learning_rate": 1.4421836779145511e-06, "loss": 1.7402, "step": 228 }, { "epoch": 0.7789115646258503, "grad_norm": 33.99543346002537, "learning_rate": 1.4371359852519734e-06, "loss": 2.9081, "step": 229 }, { "epoch": 0.782312925170068, "grad_norm": 12.446391408704297, "learning_rate": 1.4320744858367024e-06, "loss": 2.0828, "step": 230 }, { "epoch": 0.7857142857142857, "grad_norm": 26.19952152880794, "learning_rate": 1.4269993395340277e-06, "loss": 2.2178, "step": 231 }, { "epoch": 0.7891156462585034, "grad_norm": 36.07799078718175, "learning_rate": 1.4219107066402692e-06, "loss": 2.6926, "step": 232 }, { "epoch": 0.7925170068027211, "grad_norm": 11.216785179837261, "learning_rate": 1.4168087478777152e-06, "loss": 2.0393, "step": 233 }, { "epoch": 0.7959183673469388, "grad_norm": 17.659830496744974, "learning_rate": 1.4116936243895466e-06, "loss": 2.1082, "step": 234 }, { "epoch": 0.7993197278911565, "grad_norm": 17.001892765923902, "learning_rate": 1.406565497734745e-06, "loss": 1.9051, "step": 235 }, { "epoch": 0.8027210884353742, "grad_norm": 31.896056687773818, "learning_rate": 1.4014245298829935e-06, "loss": 2.702, "step": 236 }, { "epoch": 0.8061224489795918, "grad_norm": 6.972810630357569, "learning_rate": 1.3962708832095568e-06, "loss": 1.9466, "step": 237 }, { "epoch": 0.8095238095238095, "grad_norm": 17.689383441039308, "learning_rate": 1.3911047204901558e-06, "loss": 2.3425, "step": 238 }, { "epoch": 0.8129251700680272, "grad_norm": 16.46834046227904, "learning_rate": 1.385926204895826e-06, "loss": 2.1545, "step": 239 }, { "epoch": 0.8163265306122449, "grad_norm": 21.69161139742313, "learning_rate": 1.3807354999877614e-06, "loss": 2.3222, "step": 240 }, { "epoch": 0.8197278911564626, "grad_norm": 3.411794366451801, "learning_rate": 1.3755327697121522e-06, "loss": 1.6492, "step": 241 }, { "epoch": 0.8231292517006803, "grad_norm": 13.113564486849809, "learning_rate": 1.3703181783950031e-06, "loss": 2.0212, "step": 242 }, { "epoch": 0.826530612244898, "grad_norm": 14.798483657902382, "learning_rate": 1.3650918907369452e-06, "loss": 2.1974, "step": 243 }, { "epoch": 0.8299319727891157, "grad_norm": 10.19780084250851, "learning_rate": 1.3598540718080345e-06, "loss": 1.8543, "step": 244 }, { "epoch": 0.8333333333333334, "grad_norm": 30.023251305313995, "learning_rate": 1.3546048870425354e-06, "loss": 2.2387, "step": 245 }, { "epoch": 0.8367346938775511, "grad_norm": 22.321684071392564, "learning_rate": 1.3493445022336994e-06, "loss": 2.4305, "step": 246 }, { "epoch": 0.8401360544217688, "grad_norm": 34.98925650288134, "learning_rate": 1.3440730835285247e-06, "loss": 2.4364, "step": 247 }, { "epoch": 0.8435374149659864, "grad_norm": 3.161092974878791, "learning_rate": 1.3387907974225116e-06, "loss": 1.4885, "step": 248 }, { "epoch": 0.8469387755102041, "grad_norm": 50.11899935337027, "learning_rate": 1.3334978107544024e-06, "loss": 2.3332, "step": 249 }, { "epoch": 0.8503401360544217, "grad_norm": 15.05206270554561, "learning_rate": 1.3281942907009112e-06, "loss": 2.2131, "step": 250 }, { "epoch": 0.8537414965986394, "grad_norm": 24.869549840961, "learning_rate": 1.3228804047714462e-06, "loss": 2.2264, "step": 251 }, { "epoch": 0.8571428571428571, "grad_norm": 16.049594008906414, "learning_rate": 1.317556320802816e-06, "loss": 1.7228, "step": 252 }, { "epoch": 0.8605442176870748, "grad_norm": 14.258214783846427, "learning_rate": 1.31222220695393e-06, "loss": 1.999, "step": 253 }, { "epoch": 0.8639455782312925, "grad_norm": 3.3063413494205474, "learning_rate": 1.3068782317004874e-06, "loss": 1.4607, "step": 254 }, { "epoch": 0.8673469387755102, "grad_norm": 8.831787955552995, "learning_rate": 1.3015245638296563e-06, "loss": 2.1192, "step": 255 }, { "epoch": 0.8707482993197279, "grad_norm": 3.121872417027736, "learning_rate": 1.296161372434741e-06, "loss": 1.5467, "step": 256 }, { "epoch": 0.8741496598639455, "grad_norm": 33.22351218100941, "learning_rate": 1.2907888269098416e-06, "loss": 2.3588, "step": 257 }, { "epoch": 0.8775510204081632, "grad_norm": 3.188560179185641, "learning_rate": 1.2854070969445064e-06, "loss": 1.5405, "step": 258 }, { "epoch": 0.8809523809523809, "grad_norm": 21.318069352021737, "learning_rate": 1.2800163525183688e-06, "loss": 2.2063, "step": 259 }, { "epoch": 0.8809523809523809, "eval_loss": 2.1820290088653564, "eval_runtime": 3.8534, "eval_samples_per_second": 14.273, "eval_steps_per_second": 1.038, "step": 259 }, { "epoch": 0.8843537414965986, "grad_norm": 8.243323927611506, "learning_rate": 1.2746167638957805e-06, "loss": 1.8474, "step": 260 }, { "epoch": 0.8877551020408163, "grad_norm": 28.909948439715215, "learning_rate": 1.2692085016204333e-06, "loss": 2.2626, "step": 261 }, { "epoch": 0.891156462585034, "grad_norm": 3.0722449835450116, "learning_rate": 1.2637917365099725e-06, "loss": 1.6435, "step": 262 }, { "epoch": 0.8945578231292517, "grad_norm": 29.871491992872432, "learning_rate": 1.2583666396506023e-06, "loss": 2.1498, "step": 263 }, { "epoch": 0.8979591836734694, "grad_norm": 2.977539901133042, "learning_rate": 1.2529333823916806e-06, "loss": 1.7024, "step": 264 }, { "epoch": 0.9013605442176871, "grad_norm": 16.47476152363902, "learning_rate": 1.2474921363403094e-06, "loss": 2.532, "step": 265 }, { "epoch": 0.9047619047619048, "grad_norm": 13.022051400004793, "learning_rate": 1.2420430733559124e-06, "loss": 1.8884, "step": 266 }, { "epoch": 0.9081632653061225, "grad_norm": 8.97804602434911, "learning_rate": 1.2365863655448075e-06, "loss": 1.7885, "step": 267 }, { "epoch": 0.9115646258503401, "grad_norm": 16.047174726202446, "learning_rate": 1.2311221852547721e-06, "loss": 2.3363, "step": 268 }, { "epoch": 0.9149659863945578, "grad_norm": 3.5763323384852765, "learning_rate": 1.2256507050695977e-06, "loss": 1.701, "step": 269 }, { "epoch": 0.9183673469387755, "grad_norm": 26.929796973835796, "learning_rate": 1.220172097803641e-06, "loss": 2.3601, "step": 270 }, { "epoch": 0.9217687074829932, "grad_norm": 22.50281840057178, "learning_rate": 1.2146865364963633e-06, "loss": 2.0693, "step": 271 }, { "epoch": 0.9251700680272109, "grad_norm": 11.62602578923058, "learning_rate": 1.2091941944068665e-06, "loss": 1.9123, "step": 272 }, { "epoch": 0.9285714285714286, "grad_norm": 16.841220035990798, "learning_rate": 1.2036952450084214e-06, "loss": 2.2163, "step": 273 }, { "epoch": 0.9319727891156463, "grad_norm": 18.055133543008612, "learning_rate": 1.1981898619829879e-06, "loss": 2.2485, "step": 274 }, { "epoch": 0.935374149659864, "grad_norm": 26.45820099458286, "learning_rate": 1.1926782192157273e-06, "loss": 2.1845, "step": 275 }, { "epoch": 0.9387755102040817, "grad_norm": 3.334955291200548, "learning_rate": 1.1871604907895148e-06, "loss": 1.7059, "step": 276 }, { "epoch": 0.9421768707482994, "grad_norm": 19.511242339983163, "learning_rate": 1.1816368509794364e-06, "loss": 2.3601, "step": 277 }, { "epoch": 0.9455782312925171, "grad_norm": 21.146925953072365, "learning_rate": 1.1761074742472882e-06, "loss": 1.9957, "step": 278 }, { "epoch": 0.9489795918367347, "grad_norm": 3.5535024021194452, "learning_rate": 1.1705725352360633e-06, "loss": 1.9249, "step": 279 }, { "epoch": 0.9523809523809523, "grad_norm": 13.348912305071467, "learning_rate": 1.165032208764438e-06, "loss": 2.0641, "step": 280 }, { "epoch": 0.95578231292517, "grad_norm": 12.61033318044152, "learning_rate": 1.1594866698212483e-06, "loss": 2.169, "step": 281 }, { "epoch": 0.9591836734693877, "grad_norm": 28.256325358544956, "learning_rate": 1.1539360935599644e-06, "loss": 2.0952, "step": 282 }, { "epoch": 0.9625850340136054, "grad_norm": 12.61302060729169, "learning_rate": 1.1483806552931582e-06, "loss": 1.9411, "step": 283 }, { "epoch": 0.9659863945578231, "grad_norm": 8.711391665501074, "learning_rate": 1.142820530486966e-06, "loss": 1.7633, "step": 284 }, { "epoch": 0.9693877551020408, "grad_norm": 35.95958496013491, "learning_rate": 1.1372558947555455e-06, "loss": 2.1904, "step": 285 }, { "epoch": 0.9727891156462585, "grad_norm": 3.429092657849847, "learning_rate": 1.131686923855531e-06, "loss": 1.8276, "step": 286 }, { "epoch": 0.9761904761904762, "grad_norm": 12.871658288368948, "learning_rate": 1.1261137936804811e-06, "loss": 2.0911, "step": 287 }, { "epoch": 0.9795918367346939, "grad_norm": 13.217001333800638, "learning_rate": 1.1205366802553228e-06, "loss": 1.9614, "step": 288 }, { "epoch": 0.9829931972789115, "grad_norm": 24.712172909538513, "learning_rate": 1.1149557597307934e-06, "loss": 2.0412, "step": 289 }, { "epoch": 0.9863945578231292, "grad_norm": 10.412944718560512, "learning_rate": 1.1093712083778746e-06, "loss": 1.7787, "step": 290 }, { "epoch": 0.9897959183673469, "grad_norm": 15.631851389191027, "learning_rate": 1.1037832025822265e-06, "loss": 2.3362, "step": 291 }, { "epoch": 0.9931972789115646, "grad_norm": 12.135256117907334, "learning_rate": 1.098191918838617e-06, "loss": 2.0212, "step": 292 }, { "epoch": 0.9965986394557823, "grad_norm": 13.057522322919077, "learning_rate": 1.0925975337453462e-06, "loss": 2.2842, "step": 293 }, { "epoch": 1.0, "grad_norm": 17.565324685523922, "learning_rate": 1.0870002239986686e-06, "loss": 2.5002, "step": 294 } ], "logging_steps": 1, "max_steps": 588, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 294, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 95887829237760.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }