| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 37, | |
| "global_step": 294, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.003401360544217687, | |
| "grad_norm": 106.7094005171616, | |
| "learning_rate": 0.0, | |
| "loss": 2.9268, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.003401360544217687, | |
| "eval_loss": 2.5302913188934326, | |
| "eval_runtime": 3.7953, | |
| "eval_samples_per_second": 14.492, | |
| "eval_steps_per_second": 1.054, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.006802721088435374, | |
| "grad_norm": 57.97506009705182, | |
| "learning_rate": 6.89655172413793e-08, | |
| "loss": 2.0122, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.01020408163265306, | |
| "grad_norm": 116.23413141145363, | |
| "learning_rate": 1.379310344827586e-07, | |
| "loss": 2.6743, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.013605442176870748, | |
| "grad_norm": 21.262801374024775, | |
| "learning_rate": 2.0689655172413793e-07, | |
| "loss": 2.0743, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.017006802721088437, | |
| "grad_norm": 59.319984755304056, | |
| "learning_rate": 2.758620689655172e-07, | |
| "loss": 2.2775, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02040816326530612, | |
| "grad_norm": 159.51320885432614, | |
| "learning_rate": 3.4482758620689656e-07, | |
| "loss": 2.1337, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.023809523809523808, | |
| "grad_norm": 87.93970940325055, | |
| "learning_rate": 4.1379310344827586e-07, | |
| "loss": 1.9061, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.027210884353741496, | |
| "grad_norm": 61.133777808660895, | |
| "learning_rate": 4.827586206896552e-07, | |
| "loss": 1.8118, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.030612244897959183, | |
| "grad_norm": 48.65887299035499, | |
| "learning_rate": 5.517241379310344e-07, | |
| "loss": 3.4095, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.034013605442176874, | |
| "grad_norm": 30.592687909719288, | |
| "learning_rate": 6.206896551724138e-07, | |
| "loss": 2.2398, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03741496598639456, | |
| "grad_norm": 74.15295766799099, | |
| "learning_rate": 6.896551724137931e-07, | |
| "loss": 3.4425, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.04081632653061224, | |
| "grad_norm": 34.94892634385338, | |
| "learning_rate": 7.586206896551724e-07, | |
| "loss": 2.5405, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.04421768707482993, | |
| "grad_norm": 26.538521745061775, | |
| "learning_rate": 8.275862068965517e-07, | |
| "loss": 1.9614, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.047619047619047616, | |
| "grad_norm": 52.23979896259082, | |
| "learning_rate": 8.96551724137931e-07, | |
| "loss": 2.9785, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.05102040816326531, | |
| "grad_norm": 30.812143999051266, | |
| "learning_rate": 9.655172413793103e-07, | |
| "loss": 2.0185, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.05442176870748299, | |
| "grad_norm": 41.48478088374125, | |
| "learning_rate": 1.0344827586206896e-06, | |
| "loss": 2.1126, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.05782312925170068, | |
| "grad_norm": 29.347588210089675, | |
| "learning_rate": 1.1034482758620688e-06, | |
| "loss": 2.2078, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.061224489795918366, | |
| "grad_norm": 28.947554594850924, | |
| "learning_rate": 1.172413793103448e-06, | |
| "loss": 2.442, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.06462585034013606, | |
| "grad_norm": 32.28592513881342, | |
| "learning_rate": 1.2413793103448275e-06, | |
| "loss": 2.8683, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.06802721088435375, | |
| "grad_norm": 38.97631997775744, | |
| "learning_rate": 1.3103448275862068e-06, | |
| "loss": 2.4376, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 43.775478156068516, | |
| "learning_rate": 1.3793103448275862e-06, | |
| "loss": 2.4167, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.07482993197278912, | |
| "grad_norm": 30.904260805899465, | |
| "learning_rate": 1.4482758620689655e-06, | |
| "loss": 2.6971, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.0782312925170068, | |
| "grad_norm": 48.202871069183985, | |
| "learning_rate": 1.5172413793103447e-06, | |
| "loss": 2.5093, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.08163265306122448, | |
| "grad_norm": 55.067186300198706, | |
| "learning_rate": 1.5862068965517242e-06, | |
| "loss": 2.0053, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.08503401360544217, | |
| "grad_norm": 38.486811757681096, | |
| "learning_rate": 1.6551724137931035e-06, | |
| "loss": 2.2475, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.08843537414965986, | |
| "grad_norm": 90.78568630900098, | |
| "learning_rate": 1.7241379310344825e-06, | |
| "loss": 3.8342, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.09183673469387756, | |
| "grad_norm": 23.32050516158788, | |
| "learning_rate": 1.793103448275862e-06, | |
| "loss": 2.2496, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.09523809523809523, | |
| "grad_norm": 25.01047005218693, | |
| "learning_rate": 1.8620689655172412e-06, | |
| "loss": 2.6991, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.09863945578231292, | |
| "grad_norm": 27.40209208002175, | |
| "learning_rate": 1.9310344827586207e-06, | |
| "loss": 2.7017, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.10204081632653061, | |
| "grad_norm": 16.372774250078056, | |
| "learning_rate": 2e-06, | |
| "loss": 2.1315, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1054421768707483, | |
| "grad_norm": 34.32100924763162, | |
| "learning_rate": 1.999984207714351e-06, | |
| "loss": 2.4298, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.10884353741496598, | |
| "grad_norm": 49.15042168439896, | |
| "learning_rate": 1.9999368313561964e-06, | |
| "loss": 3.1687, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.11224489795918367, | |
| "grad_norm": 27.553221322487154, | |
| "learning_rate": 1.9998578724218984e-06, | |
| "loss": 2.307, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.11564625850340136, | |
| "grad_norm": 25.29898708562965, | |
| "learning_rate": 1.999747333405341e-06, | |
| "loss": 2.6711, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.11904761904761904, | |
| "grad_norm": 35.13639034121329, | |
| "learning_rate": 1.9996052177978517e-06, | |
| "loss": 2.2923, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.12244897959183673, | |
| "grad_norm": 61.904951168823246, | |
| "learning_rate": 1.999431530088091e-06, | |
| "loss": 3.0837, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.12585034013605442, | |
| "grad_norm": 43.72931173152359, | |
| "learning_rate": 1.9992262757619108e-06, | |
| "loss": 2.9055, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.12585034013605442, | |
| "eval_loss": 2.2881884574890137, | |
| "eval_runtime": 3.7387, | |
| "eval_samples_per_second": 14.711, | |
| "eval_steps_per_second": 1.07, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.1292517006802721, | |
| "grad_norm": 75.128224809043, | |
| "learning_rate": 1.9989894613021807e-06, | |
| "loss": 3.9717, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.1326530612244898, | |
| "grad_norm": 6.423556290490496, | |
| "learning_rate": 1.998721094188584e-06, | |
| "loss": 1.6634, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.1360544217687075, | |
| "grad_norm": 8.952452652609857, | |
| "learning_rate": 1.9984211828973816e-06, | |
| "loss": 2.1183, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.13945578231292516, | |
| "grad_norm": 12.837161899787583, | |
| "learning_rate": 1.998089736901142e-06, | |
| "loss": 2.1306, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 7.2779063942957825, | |
| "learning_rate": 1.9977267666684456e-06, | |
| "loss": 1.9831, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.14625850340136054, | |
| "grad_norm": 30.288569770228293, | |
| "learning_rate": 1.9973322836635515e-06, | |
| "loss": 2.1869, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.14965986394557823, | |
| "grad_norm": 11.672608976353168, | |
| "learning_rate": 1.996906300346036e-06, | |
| "loss": 1.9566, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.15306122448979592, | |
| "grad_norm": 14.837719065187358, | |
| "learning_rate": 1.9964488301704e-06, | |
| "loss": 2.2152, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.1564625850340136, | |
| "grad_norm": 18.558600033713702, | |
| "learning_rate": 1.9959598875856427e-06, | |
| "loss": 2.06, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.1598639455782313, | |
| "grad_norm": 17.161073648503006, | |
| "learning_rate": 1.995439488034806e-06, | |
| "loss": 2.0463, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.16326530612244897, | |
| "grad_norm": 10.944090642041195, | |
| "learning_rate": 1.994887647954486e-06, | |
| "loss": 1.9676, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 30.260773919516463, | |
| "learning_rate": 1.9943043847743164e-06, | |
| "loss": 2.4235, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.17006802721088435, | |
| "grad_norm": 17.95874457178673, | |
| "learning_rate": 1.9936897169164135e-06, | |
| "loss": 2.4211, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.17346938775510204, | |
| "grad_norm": 29.32804844947439, | |
| "learning_rate": 1.993043663794799e-06, | |
| "loss": 2.2786, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.17687074829931973, | |
| "grad_norm": 31.224760731119037, | |
| "learning_rate": 1.9923662458147826e-06, | |
| "loss": 2.8374, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.18027210884353742, | |
| "grad_norm": 4.5045539325043205, | |
| "learning_rate": 1.9916574843723217e-06, | |
| "loss": 1.6301, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.1836734693877551, | |
| "grad_norm": 10.827050277516674, | |
| "learning_rate": 1.9909174018533427e-06, | |
| "loss": 2.0554, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.1870748299319728, | |
| "grad_norm": 17.063187262605883, | |
| "learning_rate": 1.990146021633034e-06, | |
| "loss": 2.4202, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 3.946679947433292, | |
| "learning_rate": 1.98934336807511e-06, | |
| "loss": 1.7808, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.19387755102040816, | |
| "grad_norm": 8.431222224384186, | |
| "learning_rate": 1.9885094665310388e-06, | |
| "loss": 1.7766, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.19727891156462585, | |
| "grad_norm": 32.28667139462841, | |
| "learning_rate": 1.9876443433392433e-06, | |
| "loss": 2.2299, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.20068027210884354, | |
| "grad_norm": 11.950555724182584, | |
| "learning_rate": 1.986748025824268e-06, | |
| "loss": 1.928, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.20408163265306123, | |
| "grad_norm": 3.6059136679066977, | |
| "learning_rate": 1.985820542295918e-06, | |
| "loss": 1.7761, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.20748299319727892, | |
| "grad_norm": 41.40947345983446, | |
| "learning_rate": 1.984861922048363e-06, | |
| "loss": 2.6704, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.2108843537414966, | |
| "grad_norm": 30.634237938465816, | |
| "learning_rate": 1.983872195359212e-06, | |
| "loss": 2.7336, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 3.760013022701194, | |
| "learning_rate": 1.9828513934885587e-06, | |
| "loss": 1.8831, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.21768707482993196, | |
| "grad_norm": 37.34059674722221, | |
| "learning_rate": 1.981799548677993e-06, | |
| "loss": 2.27, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.22108843537414966, | |
| "grad_norm": 11.009700618421736, | |
| "learning_rate": 1.980716694149581e-06, | |
| "loss": 1.9265, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.22448979591836735, | |
| "grad_norm": 17.609147027884987, | |
| "learning_rate": 1.9796028641048194e-06, | |
| "loss": 2.3411, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.22789115646258504, | |
| "grad_norm": 17.432142291951372, | |
| "learning_rate": 1.978458093723553e-06, | |
| "loss": 2.2213, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.23129251700680273, | |
| "grad_norm": 14.11664326231067, | |
| "learning_rate": 1.9772824191628632e-06, | |
| "loss": 2.0831, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.23469387755102042, | |
| "grad_norm": 37.456025944063875, | |
| "learning_rate": 1.9760758775559273e-06, | |
| "loss": 2.7494, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.23809523809523808, | |
| "grad_norm": 16.30994509129653, | |
| "learning_rate": 1.974838507010844e-06, | |
| "loss": 2.118, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.24149659863945577, | |
| "grad_norm": 25.92468917111241, | |
| "learning_rate": 1.9735703466094324e-06, | |
| "loss": 2.1656, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.24489795918367346, | |
| "grad_norm": 17.23253832018251, | |
| "learning_rate": 1.972271436405994e-06, | |
| "loss": 2.0787, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.24829931972789115, | |
| "grad_norm": 6.286286593272188, | |
| "learning_rate": 1.970941817426052e-06, | |
| "loss": 1.7458, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.25170068027210885, | |
| "grad_norm": 20.87004487229478, | |
| "learning_rate": 1.969581531665051e-06, | |
| "loss": 2.364, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.25170068027210885, | |
| "eval_loss": 2.240875482559204, | |
| "eval_runtime": 3.7328, | |
| "eval_samples_per_second": 14.734, | |
| "eval_steps_per_second": 1.072, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.25510204081632654, | |
| "grad_norm": 22.83815781491435, | |
| "learning_rate": 1.968190622087034e-06, | |
| "loss": 2.2176, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.2585034013605442, | |
| "grad_norm": 39.2204163613504, | |
| "learning_rate": 1.9667691326232835e-06, | |
| "loss": 2.605, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.2619047619047619, | |
| "grad_norm": 9.599486970591897, | |
| "learning_rate": 1.965317108170935e-06, | |
| "loss": 2.1652, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.2653061224489796, | |
| "grad_norm": 3.7571781853463175, | |
| "learning_rate": 1.9638345945915586e-06, | |
| "loss": 1.6055, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.2687074829931973, | |
| "grad_norm": 7.064670527473922, | |
| "learning_rate": 1.962321638709709e-06, | |
| "loss": 1.9937, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.272108843537415, | |
| "grad_norm": 28.207901160479654, | |
| "learning_rate": 1.9607782883114506e-06, | |
| "loss": 2.2552, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2755102040816326, | |
| "grad_norm": 15.991872570963396, | |
| "learning_rate": 1.959204592142843e-06, | |
| "loss": 2.1559, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.2789115646258503, | |
| "grad_norm": 13.401822104278665, | |
| "learning_rate": 1.957600599908406e-06, | |
| "loss": 2.1652, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.282312925170068, | |
| "grad_norm": 14.708704691038701, | |
| "learning_rate": 1.9559663622695455e-06, | |
| "loss": 1.9673, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 3.3458550475032105, | |
| "learning_rate": 1.954301930842958e-06, | |
| "loss": 1.6917, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.2891156462585034, | |
| "grad_norm": 3.479853146114766, | |
| "learning_rate": 1.9526073581989955e-06, | |
| "loss": 1.624, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.2925170068027211, | |
| "grad_norm": 25.10854427551898, | |
| "learning_rate": 1.950882697860009e-06, | |
| "loss": 2.3626, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.29591836734693877, | |
| "grad_norm": 14.389114459997433, | |
| "learning_rate": 1.9491280042986562e-06, | |
| "loss": 2.0549, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.29931972789115646, | |
| "grad_norm": 17.72897272235088, | |
| "learning_rate": 1.9473433329361802e-06, | |
| "loss": 2.4525, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.30272108843537415, | |
| "grad_norm": 8.212788560084723, | |
| "learning_rate": 1.945528740140662e-06, | |
| "loss": 2.1368, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.30612244897959184, | |
| "grad_norm": 26.76274867022125, | |
| "learning_rate": 1.943684283225236e-06, | |
| "loss": 2.3735, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.30952380952380953, | |
| "grad_norm": 23.71630229663243, | |
| "learning_rate": 1.941810020446284e-06, | |
| "loss": 2.6005, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.3129251700680272, | |
| "grad_norm": 22.889738702248234, | |
| "learning_rate": 1.9399060110015917e-06, | |
| "loss": 2.6924, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.3163265306122449, | |
| "grad_norm": 32.54631787971477, | |
| "learning_rate": 1.9379723150284814e-06, | |
| "loss": 2.5301, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.3197278911564626, | |
| "grad_norm": 3.6877224549117344, | |
| "learning_rate": 1.936008993601912e-06, | |
| "loss": 1.6556, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.3231292517006803, | |
| "grad_norm": 33.682920637388364, | |
| "learning_rate": 1.934016108732548e-06, | |
| "loss": 2.3709, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.32653061224489793, | |
| "grad_norm": 19.342157148675135, | |
| "learning_rate": 1.9319937233648045e-06, | |
| "loss": 1.8713, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.3299319727891156, | |
| "grad_norm": 36.9446891807536, | |
| "learning_rate": 1.929941901374856e-06, | |
| "loss": 3.1666, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 12.769242612326224, | |
| "learning_rate": 1.9278607075686205e-06, | |
| "loss": 2.2024, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.336734693877551, | |
| "grad_norm": 7.569149644914372, | |
| "learning_rate": 1.9257502076797123e-06, | |
| "loss": 1.8434, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.3401360544217687, | |
| "grad_norm": 18.672166864254265, | |
| "learning_rate": 1.9236104683673653e-06, | |
| "loss": 2.6262, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3435374149659864, | |
| "grad_norm": 7.251393661314555, | |
| "learning_rate": 1.9214415572143284e-06, | |
| "loss": 1.8447, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.3469387755102041, | |
| "grad_norm": 25.8588617341962, | |
| "learning_rate": 1.919243542724731e-06, | |
| "loss": 2.3528, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.35034013605442177, | |
| "grad_norm": 21.00339285362203, | |
| "learning_rate": 1.917016494321918e-06, | |
| "loss": 2.462, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.35374149659863946, | |
| "grad_norm": 19.533037226832878, | |
| "learning_rate": 1.9147604823462585e-06, | |
| "loss": 2.3057, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 3.1087327492999286, | |
| "learning_rate": 1.9124755780529243e-06, | |
| "loss": 1.6935, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.36054421768707484, | |
| "grad_norm": 35.707396347148176, | |
| "learning_rate": 1.910161853609637e-06, | |
| "loss": 2.3652, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.36394557823129253, | |
| "grad_norm": 16.694934440145225, | |
| "learning_rate": 1.9078193820943916e-06, | |
| "loss": 2.6014, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.3673469387755102, | |
| "grad_norm": 12.946146725042743, | |
| "learning_rate": 1.9054482374931466e-06, | |
| "loss": 1.9379, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.3707482993197279, | |
| "grad_norm": 8.740650008889842, | |
| "learning_rate": 1.9030484946974878e-06, | |
| "loss": 1.9414, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.3741496598639456, | |
| "grad_norm": 23.13581690576701, | |
| "learning_rate": 1.9006202295022629e-06, | |
| "loss": 2.4563, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.37755102040816324, | |
| "grad_norm": 10.00026809536462, | |
| "learning_rate": 1.8981635186031869e-06, | |
| "loss": 1.8384, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.37755102040816324, | |
| "eval_loss": 2.2185332775115967, | |
| "eval_runtime": 3.7603, | |
| "eval_samples_per_second": 14.626, | |
| "eval_steps_per_second": 1.064, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 26.376801704138895, | |
| "learning_rate": 1.89567843959442e-06, | |
| "loss": 3.095, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.3843537414965986, | |
| "grad_norm": 31.801160647661863, | |
| "learning_rate": 1.8931650709661176e-06, | |
| "loss": 2.4186, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.3877551020408163, | |
| "grad_norm": 3.7202396333724406, | |
| "learning_rate": 1.8906234921019504e-06, | |
| "loss": 1.8483, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.391156462585034, | |
| "grad_norm": 20.22060079238643, | |
| "learning_rate": 1.8880537832765975e-06, | |
| "loss": 2.1247, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3945578231292517, | |
| "grad_norm": 29.233218070907714, | |
| "learning_rate": 1.8854560256532098e-06, | |
| "loss": 2.3962, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.3979591836734694, | |
| "grad_norm": 12.311196195760077, | |
| "learning_rate": 1.882830301280849e-06, | |
| "loss": 1.9291, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.4013605442176871, | |
| "grad_norm": 24.022251844658836, | |
| "learning_rate": 1.880176693091893e-06, | |
| "loss": 2.0967, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.40476190476190477, | |
| "grad_norm": 15.5145598820515, | |
| "learning_rate": 1.8774952848994193e-06, | |
| "loss": 2.0164, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.40816326530612246, | |
| "grad_norm": 18.669552144287866, | |
| "learning_rate": 1.874786161394556e-06, | |
| "loss": 1.9074, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.41156462585034015, | |
| "grad_norm": 20.221669243742017, | |
| "learning_rate": 1.8720494081438077e-06, | |
| "loss": 2.0693, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.41496598639455784, | |
| "grad_norm": 40.16853982486705, | |
| "learning_rate": 1.8692851115863521e-06, | |
| "loss": 2.7133, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.41836734693877553, | |
| "grad_norm": 28.130765299643805, | |
| "learning_rate": 1.8664933590313116e-06, | |
| "loss": 2.3678, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.4217687074829932, | |
| "grad_norm": 3.285521259165442, | |
| "learning_rate": 1.8636742386549936e-06, | |
| "loss": 1.643, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.42517006802721086, | |
| "grad_norm": 14.918765530830019, | |
| "learning_rate": 1.8608278394981065e-06, | |
| "loss": 2.2832, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 3.221047286582191, | |
| "learning_rate": 1.8579542514629471e-06, | |
| "loss": 1.7598, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.43197278911564624, | |
| "grad_norm": 30.02563146393063, | |
| "learning_rate": 1.8550535653105621e-06, | |
| "loss": 2.2684, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.43537414965986393, | |
| "grad_norm": 14.894051195947721, | |
| "learning_rate": 1.8521258726578802e-06, | |
| "loss": 2.2898, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.4387755102040816, | |
| "grad_norm": 31.346174242632404, | |
| "learning_rate": 1.849171265974818e-06, | |
| "loss": 2.4443, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.4421768707482993, | |
| "grad_norm": 18.396976082720574, | |
| "learning_rate": 1.846189838581362e-06, | |
| "loss": 2.4081, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.445578231292517, | |
| "grad_norm": 11.300098238275778, | |
| "learning_rate": 1.843181684644617e-06, | |
| "loss": 1.9707, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.4489795918367347, | |
| "grad_norm": 9.311622064720812, | |
| "learning_rate": 1.8401468991758364e-06, | |
| "loss": 2.0055, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.4523809523809524, | |
| "grad_norm": 17.268118260619143, | |
| "learning_rate": 1.837085578027418e-06, | |
| "loss": 2.1029, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.4557823129251701, | |
| "grad_norm": 13.534018757700077, | |
| "learning_rate": 1.833997817889878e-06, | |
| "loss": 1.6714, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.45918367346938777, | |
| "grad_norm": 25.67291091851184, | |
| "learning_rate": 1.8308837162887962e-06, | |
| "loss": 2.0809, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.46258503401360546, | |
| "grad_norm": 16.78554391811326, | |
| "learning_rate": 1.827743371581737e-06, | |
| "loss": 2.095, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.46598639455782315, | |
| "grad_norm": 7.0895304724541175, | |
| "learning_rate": 1.8245768829551415e-06, | |
| "loss": 2.0924, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.46938775510204084, | |
| "grad_norm": 28.325113542255774, | |
| "learning_rate": 1.8213843504211956e-06, | |
| "loss": 2.2312, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.47278911564625853, | |
| "grad_norm": 19.627621449351967, | |
| "learning_rate": 1.8181658748146709e-06, | |
| "loss": 2.1092, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.47619047619047616, | |
| "grad_norm": 3.253642214201976, | |
| "learning_rate": 1.8149215577897394e-06, | |
| "loss": 1.8119, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.47959183673469385, | |
| "grad_norm": 22.194249754011054, | |
| "learning_rate": 1.8116515018167635e-06, | |
| "loss": 1.8086, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.48299319727891155, | |
| "grad_norm": 3.291628206622755, | |
| "learning_rate": 1.8083558101790595e-06, | |
| "loss": 1.6961, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.48639455782312924, | |
| "grad_norm": 30.333797331495706, | |
| "learning_rate": 1.8050345869696346e-06, | |
| "loss": 2.4649, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.4897959183673469, | |
| "grad_norm": 35.46381155966904, | |
| "learning_rate": 1.8016879370879004e-06, | |
| "loss": 2.375, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.4931972789115646, | |
| "grad_norm": 10.065027530577671, | |
| "learning_rate": 1.798315966236358e-06, | |
| "loss": 1.7088, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.4965986394557823, | |
| "grad_norm": 31.969238069641904, | |
| "learning_rate": 1.794918780917262e-06, | |
| "loss": 2.2722, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 3.1706943713916287, | |
| "learning_rate": 1.791496488429254e-06, | |
| "loss": 1.5129, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.5034013605442177, | |
| "grad_norm": 40.129409477941664, | |
| "learning_rate": 1.7880491968639751e-06, | |
| "loss": 2.8429, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.5034013605442177, | |
| "eval_loss": 2.2053215503692627, | |
| "eval_runtime": 3.8702, | |
| "eval_samples_per_second": 14.211, | |
| "eval_steps_per_second": 1.034, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.5068027210884354, | |
| "grad_norm": 26.985890370710862, | |
| "learning_rate": 1.7845770151026513e-06, | |
| "loss": 2.3221, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.5102040816326531, | |
| "grad_norm": 34.746114296368646, | |
| "learning_rate": 1.7810800528126553e-06, | |
| "loss": 2.3499, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.5136054421768708, | |
| "grad_norm": 3.902076154967714, | |
| "learning_rate": 1.7775584204440416e-06, | |
| "loss": 1.7411, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.5170068027210885, | |
| "grad_norm": 27.80193827038684, | |
| "learning_rate": 1.7740122292260594e-06, | |
| "loss": 2.2895, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.5204081632653061, | |
| "grad_norm": 3.4114906810600685, | |
| "learning_rate": 1.7704415911636375e-06, | |
| "loss": 1.5119, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.5238095238095238, | |
| "grad_norm": 9.505522369554297, | |
| "learning_rate": 1.7668466190338483e-06, | |
| "loss": 1.844, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.5272108843537415, | |
| "grad_norm": 36.46998151934392, | |
| "learning_rate": 1.7632274263823457e-06, | |
| "loss": 2.4713, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.5306122448979592, | |
| "grad_norm": 17.765108257489125, | |
| "learning_rate": 1.759584127519778e-06, | |
| "loss": 2.2811, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.5340136054421769, | |
| "grad_norm": 14.148223114236801, | |
| "learning_rate": 1.7559168375181775e-06, | |
| "loss": 1.8442, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.5374149659863946, | |
| "grad_norm": 9.76402372234183, | |
| "learning_rate": 1.7522256722073273e-06, | |
| "loss": 1.8945, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.5408163265306123, | |
| "grad_norm": 16.450896799860217, | |
| "learning_rate": 1.748510748171101e-06, | |
| "loss": 1.9574, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.54421768707483, | |
| "grad_norm": 3.912613042056259, | |
| "learning_rate": 1.7447721827437819e-06, | |
| "loss": 1.6032, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5476190476190477, | |
| "grad_norm": 33.305605159021646, | |
| "learning_rate": 1.7410100940063558e-06, | |
| "loss": 2.4057, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.5510204081632653, | |
| "grad_norm": 38.319973023280475, | |
| "learning_rate": 1.7372246007827833e-06, | |
| "loss": 2.5925, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.5544217687074829, | |
| "grad_norm": 17.216523524482163, | |
| "learning_rate": 1.7334158226362446e-06, | |
| "loss": 2.0324, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.5578231292517006, | |
| "grad_norm": 4.9862323362748535, | |
| "learning_rate": 1.7295838798653649e-06, | |
| "loss": 1.7436, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.5612244897959183, | |
| "grad_norm": 4.0759355613648625, | |
| "learning_rate": 1.7257288935004132e-06, | |
| "loss": 1.7034, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.564625850340136, | |
| "grad_norm": 16.519960341878562, | |
| "learning_rate": 1.7218509852994822e-06, | |
| "loss": 2.115, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.5680272108843537, | |
| "grad_norm": 17.37824200525593, | |
| "learning_rate": 1.7179502777446392e-06, | |
| "loss": 2.0609, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 39.604264809847564, | |
| "learning_rate": 1.7140268940380605e-06, | |
| "loss": 2.3861, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.5748299319727891, | |
| "grad_norm": 17.489048911326037, | |
| "learning_rate": 1.7100809580981384e-06, | |
| "loss": 1.9979, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.5782312925170068, | |
| "grad_norm": 6.642641185839537, | |
| "learning_rate": 1.7061125945555679e-06, | |
| "loss": 1.7533, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5816326530612245, | |
| "grad_norm": 41.437166409250736, | |
| "learning_rate": 1.70212192874941e-06, | |
| "loss": 2.8676, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.5850340136054422, | |
| "grad_norm": 12.285090452877482, | |
| "learning_rate": 1.6981090867231336e-06, | |
| "loss": 1.8715, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.5884353741496599, | |
| "grad_norm": 20.351266920257437, | |
| "learning_rate": 1.694074195220634e-06, | |
| "loss": 2.5238, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.5918367346938775, | |
| "grad_norm": 13.128678816386138, | |
| "learning_rate": 1.6900173816822289e-06, | |
| "loss": 1.7191, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.5952380952380952, | |
| "grad_norm": 3.1331026154409565, | |
| "learning_rate": 1.6859387742406358e-06, | |
| "loss": 1.7885, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.5986394557823129, | |
| "grad_norm": 12.273944679120639, | |
| "learning_rate": 1.6818385017169212e-06, | |
| "loss": 1.9361, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.6020408163265306, | |
| "grad_norm": 18.988287394873876, | |
| "learning_rate": 1.6777166936164354e-06, | |
| "loss": 2.118, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.6054421768707483, | |
| "grad_norm": 13.330413347581118, | |
| "learning_rate": 1.6735734801247202e-06, | |
| "loss": 1.9923, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.608843537414966, | |
| "grad_norm": 8.528660885149025, | |
| "learning_rate": 1.6694089921033976e-06, | |
| "loss": 1.6938, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.6122448979591837, | |
| "grad_norm": 28.049589150374253, | |
| "learning_rate": 1.6652233610860364e-06, | |
| "loss": 2.4092, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.6156462585034014, | |
| "grad_norm": 19.077236893577115, | |
| "learning_rate": 1.6610167192739978e-06, | |
| "loss": 2.3235, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.6190476190476191, | |
| "grad_norm": 23.109888095114325, | |
| "learning_rate": 1.6567891995322603e-06, | |
| "loss": 2.2678, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.6224489795918368, | |
| "grad_norm": 19.456776496200867, | |
| "learning_rate": 1.6525409353852221e-06, | |
| "loss": 2.2764, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.6258503401360545, | |
| "grad_norm": 9.82404206796416, | |
| "learning_rate": 1.6482720610124856e-06, | |
| "loss": 1.8034, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.6292517006802721, | |
| "grad_norm": 24.2061776724548, | |
| "learning_rate": 1.6439827112446173e-06, | |
| "loss": 2.161, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.6292517006802721, | |
| "eval_loss": 2.194326400756836, | |
| "eval_runtime": 3.7428, | |
| "eval_samples_per_second": 14.695, | |
| "eval_steps_per_second": 1.069, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.6326530612244898, | |
| "grad_norm": 30.469163171671003, | |
| "learning_rate": 1.6396730215588912e-06, | |
| "loss": 2.2773, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.6360544217687075, | |
| "grad_norm": 3.646917584621385, | |
| "learning_rate": 1.6353431280750082e-06, | |
| "loss": 1.5989, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.6394557823129252, | |
| "grad_norm": 30.30266588230692, | |
| "learning_rate": 1.6309931675507978e-06, | |
| "loss": 2.6169, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.6428571428571429, | |
| "grad_norm": 14.371186117614542, | |
| "learning_rate": 1.6266232773778983e-06, | |
| "loss": 1.9241, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.6462585034013606, | |
| "grad_norm": 18.71258411403636, | |
| "learning_rate": 1.6222335955774176e-06, | |
| "loss": 2.1737, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6496598639455783, | |
| "grad_norm": 3.2723339662931585, | |
| "learning_rate": 1.617824260795573e-06, | |
| "loss": 1.8075, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.6530612244897959, | |
| "grad_norm": 16.496061968286824, | |
| "learning_rate": 1.6133954122993139e-06, | |
| "loss": 2.0147, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.6564625850340136, | |
| "grad_norm": 3.2013079969624805, | |
| "learning_rate": 1.608947189971921e-06, | |
| "loss": 1.6798, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.6598639455782312, | |
| "grad_norm": 20.981814890242124, | |
| "learning_rate": 1.6044797343085898e-06, | |
| "loss": 2.0425, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.6632653061224489, | |
| "grad_norm": 50.879018823375965, | |
| "learning_rate": 1.599993186411992e-06, | |
| "loss": 3.8504, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 3.283241794235971, | |
| "learning_rate": 1.59548768798782e-06, | |
| "loss": 1.4971, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.6700680272108843, | |
| "grad_norm": 12.706772022061763, | |
| "learning_rate": 1.5909633813403092e-06, | |
| "loss": 1.9318, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.673469387755102, | |
| "grad_norm": 7.747043673117189, | |
| "learning_rate": 1.5864204093677463e-06, | |
| "loss": 1.8641, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.6768707482993197, | |
| "grad_norm": 12.685665761738797, | |
| "learning_rate": 1.5818589155579529e-06, | |
| "loss": 2.0781, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.6802721088435374, | |
| "grad_norm": 8.183695796856302, | |
| "learning_rate": 1.5772790439837555e-06, | |
| "loss": 2.1112, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6836734693877551, | |
| "grad_norm": 3.6436475976280605, | |
| "learning_rate": 1.572680939298435e-06, | |
| "loss": 1.504, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.6870748299319728, | |
| "grad_norm": 7.765753459491514, | |
| "learning_rate": 1.5680647467311555e-06, | |
| "loss": 1.6113, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.6904761904761905, | |
| "grad_norm": 27.059590789587673, | |
| "learning_rate": 1.563430612082382e-06, | |
| "loss": 2.3797, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.6938775510204082, | |
| "grad_norm": 17.865181616406808, | |
| "learning_rate": 1.5587786817192687e-06, | |
| "loss": 2.2287, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.6972789115646258, | |
| "grad_norm": 11.50437842198177, | |
| "learning_rate": 1.5541091025710434e-06, | |
| "loss": 2.2926, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.7006802721088435, | |
| "grad_norm": 18.03962056520961, | |
| "learning_rate": 1.5494220221243607e-06, | |
| "loss": 2.3374, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.7040816326530612, | |
| "grad_norm": 19.808732477248256, | |
| "learning_rate": 1.5447175884186478e-06, | |
| "loss": 2.3215, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.7074829931972789, | |
| "grad_norm": 21.35228597761302, | |
| "learning_rate": 1.539995950041426e-06, | |
| "loss": 2.2378, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.7108843537414966, | |
| "grad_norm": 14.090932946927257, | |
| "learning_rate": 1.5352572561236197e-06, | |
| "loss": 2.22, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 22.22875395969964, | |
| "learning_rate": 1.5305016563348443e-06, | |
| "loss": 2.44, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.717687074829932, | |
| "grad_norm": 12.732771656478363, | |
| "learning_rate": 1.5257293008786807e-06, | |
| "loss": 2.0598, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.7210884353741497, | |
| "grad_norm": 3.3024595151809777, | |
| "learning_rate": 1.5209403404879303e-06, | |
| "loss": 1.8514, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.7244897959183674, | |
| "grad_norm": 31.041628605811148, | |
| "learning_rate": 1.5161349264198535e-06, | |
| "loss": 2.4225, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.7278911564625851, | |
| "grad_norm": 11.866017531018645, | |
| "learning_rate": 1.511313210451394e-06, | |
| "loss": 1.9747, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.7312925170068028, | |
| "grad_norm": 23.77867996796224, | |
| "learning_rate": 1.5064753448743832e-06, | |
| "loss": 2.0971, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.7346938775510204, | |
| "grad_norm": 28.640512428374876, | |
| "learning_rate": 1.5016214824907314e-06, | |
| "loss": 2.2247, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.7380952380952381, | |
| "grad_norm": 9.463317499162777, | |
| "learning_rate": 1.4967517766076015e-06, | |
| "loss": 1.9511, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.7414965986394558, | |
| "grad_norm": 3.6132074342008336, | |
| "learning_rate": 1.4918663810325659e-06, | |
| "loss": 1.5643, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.7448979591836735, | |
| "grad_norm": 12.274299577611806, | |
| "learning_rate": 1.4869654500687492e-06, | |
| "loss": 2.0865, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.7482993197278912, | |
| "grad_norm": 9.577269499797044, | |
| "learning_rate": 1.4820491385099555e-06, | |
| "loss": 2.1494, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7517006802721088, | |
| "grad_norm": 13.665325186622818, | |
| "learning_rate": 1.477117601635777e-06, | |
| "loss": 2.0676, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.7551020408163265, | |
| "grad_norm": 12.044556166373619, | |
| "learning_rate": 1.4721709952066923e-06, | |
| "loss": 1.7408, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.7551020408163265, | |
| "eval_loss": 2.1867611408233643, | |
| "eval_runtime": 3.7388, | |
| "eval_samples_per_second": 14.711, | |
| "eval_steps_per_second": 1.07, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.7585034013605442, | |
| "grad_norm": 32.0866216128451, | |
| "learning_rate": 1.4672094754591449e-06, | |
| "loss": 2.6444, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 26.272890838528287, | |
| "learning_rate": 1.4622331991006082e-06, | |
| "loss": 2.0286, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.7653061224489796, | |
| "grad_norm": 10.948966043777636, | |
| "learning_rate": 1.4572423233046385e-06, | |
| "loss": 1.8924, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.7687074829931972, | |
| "grad_norm": 10.041220633719293, | |
| "learning_rate": 1.4522370057059079e-06, | |
| "loss": 1.8589, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.7721088435374149, | |
| "grad_norm": 19.90849856575333, | |
| "learning_rate": 1.447217404395227e-06, | |
| "loss": 2.4632, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.7755102040816326, | |
| "grad_norm": 3.3718807752757134, | |
| "learning_rate": 1.4421836779145511e-06, | |
| "loss": 1.7402, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.7789115646258503, | |
| "grad_norm": 33.99543346002537, | |
| "learning_rate": 1.4371359852519734e-06, | |
| "loss": 2.9081, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.782312925170068, | |
| "grad_norm": 12.446391408704297, | |
| "learning_rate": 1.4320744858367024e-06, | |
| "loss": 2.0828, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7857142857142857, | |
| "grad_norm": 26.19952152880794, | |
| "learning_rate": 1.4269993395340277e-06, | |
| "loss": 2.2178, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.7891156462585034, | |
| "grad_norm": 36.07799078718175, | |
| "learning_rate": 1.4219107066402692e-06, | |
| "loss": 2.6926, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.7925170068027211, | |
| "grad_norm": 11.216785179837261, | |
| "learning_rate": 1.4168087478777152e-06, | |
| "loss": 2.0393, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.7959183673469388, | |
| "grad_norm": 17.659830496744974, | |
| "learning_rate": 1.4116936243895466e-06, | |
| "loss": 2.1082, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.7993197278911565, | |
| "grad_norm": 17.001892765923902, | |
| "learning_rate": 1.406565497734745e-06, | |
| "loss": 1.9051, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.8027210884353742, | |
| "grad_norm": 31.896056687773818, | |
| "learning_rate": 1.4014245298829935e-06, | |
| "loss": 2.702, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.8061224489795918, | |
| "grad_norm": 6.972810630357569, | |
| "learning_rate": 1.3962708832095568e-06, | |
| "loss": 1.9466, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.8095238095238095, | |
| "grad_norm": 17.689383441039308, | |
| "learning_rate": 1.3911047204901558e-06, | |
| "loss": 2.3425, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.8129251700680272, | |
| "grad_norm": 16.46834046227904, | |
| "learning_rate": 1.385926204895826e-06, | |
| "loss": 2.1545, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.8163265306122449, | |
| "grad_norm": 21.69161139742313, | |
| "learning_rate": 1.3807354999877614e-06, | |
| "loss": 2.3222, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.8197278911564626, | |
| "grad_norm": 3.411794366451801, | |
| "learning_rate": 1.3755327697121522e-06, | |
| "loss": 1.6492, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.8231292517006803, | |
| "grad_norm": 13.113564486849809, | |
| "learning_rate": 1.3703181783950031e-06, | |
| "loss": 2.0212, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.826530612244898, | |
| "grad_norm": 14.798483657902382, | |
| "learning_rate": 1.3650918907369452e-06, | |
| "loss": 2.1974, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.8299319727891157, | |
| "grad_norm": 10.19780084250851, | |
| "learning_rate": 1.3598540718080345e-06, | |
| "loss": 1.8543, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 30.023251305313995, | |
| "learning_rate": 1.3546048870425354e-06, | |
| "loss": 2.2387, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.8367346938775511, | |
| "grad_norm": 22.321684071392564, | |
| "learning_rate": 1.3493445022336994e-06, | |
| "loss": 2.4305, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.8401360544217688, | |
| "grad_norm": 34.98925650288134, | |
| "learning_rate": 1.3440730835285247e-06, | |
| "loss": 2.4364, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.8435374149659864, | |
| "grad_norm": 3.161092974878791, | |
| "learning_rate": 1.3387907974225116e-06, | |
| "loss": 1.4885, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.8469387755102041, | |
| "grad_norm": 50.11899935337027, | |
| "learning_rate": 1.3334978107544024e-06, | |
| "loss": 2.3332, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.8503401360544217, | |
| "grad_norm": 15.05206270554561, | |
| "learning_rate": 1.3281942907009112e-06, | |
| "loss": 2.2131, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.8537414965986394, | |
| "grad_norm": 24.869549840961, | |
| "learning_rate": 1.3228804047714462e-06, | |
| "loss": 2.2264, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 16.049594008906414, | |
| "learning_rate": 1.317556320802816e-06, | |
| "loss": 1.7228, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.8605442176870748, | |
| "grad_norm": 14.258214783846427, | |
| "learning_rate": 1.31222220695393e-06, | |
| "loss": 1.999, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.8639455782312925, | |
| "grad_norm": 3.3063413494205474, | |
| "learning_rate": 1.3068782317004874e-06, | |
| "loss": 1.4607, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.8673469387755102, | |
| "grad_norm": 8.831787955552995, | |
| "learning_rate": 1.3015245638296563e-06, | |
| "loss": 2.1192, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.8707482993197279, | |
| "grad_norm": 3.121872417027736, | |
| "learning_rate": 1.296161372434741e-06, | |
| "loss": 1.5467, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.8741496598639455, | |
| "grad_norm": 33.22351218100941, | |
| "learning_rate": 1.2907888269098416e-06, | |
| "loss": 2.3588, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.8775510204081632, | |
| "grad_norm": 3.188560179185641, | |
| "learning_rate": 1.2854070969445064e-06, | |
| "loss": 1.5405, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.8809523809523809, | |
| "grad_norm": 21.318069352021737, | |
| "learning_rate": 1.2800163525183688e-06, | |
| "loss": 2.2063, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.8809523809523809, | |
| "eval_loss": 2.1820290088653564, | |
| "eval_runtime": 3.8534, | |
| "eval_samples_per_second": 14.273, | |
| "eval_steps_per_second": 1.038, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.8843537414965986, | |
| "grad_norm": 8.243323927611506, | |
| "learning_rate": 1.2746167638957805e-06, | |
| "loss": 1.8474, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8877551020408163, | |
| "grad_norm": 28.909948439715215, | |
| "learning_rate": 1.2692085016204333e-06, | |
| "loss": 2.2626, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.891156462585034, | |
| "grad_norm": 3.0722449835450116, | |
| "learning_rate": 1.2637917365099725e-06, | |
| "loss": 1.6435, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.8945578231292517, | |
| "grad_norm": 29.871491992872432, | |
| "learning_rate": 1.2583666396506023e-06, | |
| "loss": 2.1498, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.8979591836734694, | |
| "grad_norm": 2.977539901133042, | |
| "learning_rate": 1.2529333823916806e-06, | |
| "loss": 1.7024, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.9013605442176871, | |
| "grad_norm": 16.47476152363902, | |
| "learning_rate": 1.2474921363403094e-06, | |
| "loss": 2.532, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.9047619047619048, | |
| "grad_norm": 13.022051400004793, | |
| "learning_rate": 1.2420430733559124e-06, | |
| "loss": 1.8884, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.9081632653061225, | |
| "grad_norm": 8.97804602434911, | |
| "learning_rate": 1.2365863655448075e-06, | |
| "loss": 1.7885, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.9115646258503401, | |
| "grad_norm": 16.047174726202446, | |
| "learning_rate": 1.2311221852547721e-06, | |
| "loss": 2.3363, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.9149659863945578, | |
| "grad_norm": 3.5763323384852765, | |
| "learning_rate": 1.2256507050695977e-06, | |
| "loss": 1.701, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.9183673469387755, | |
| "grad_norm": 26.929796973835796, | |
| "learning_rate": 1.220172097803641e-06, | |
| "loss": 2.3601, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.9217687074829932, | |
| "grad_norm": 22.50281840057178, | |
| "learning_rate": 1.2146865364963633e-06, | |
| "loss": 2.0693, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.9251700680272109, | |
| "grad_norm": 11.62602578923058, | |
| "learning_rate": 1.2091941944068665e-06, | |
| "loss": 1.9123, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.9285714285714286, | |
| "grad_norm": 16.841220035990798, | |
| "learning_rate": 1.2036952450084214e-06, | |
| "loss": 2.2163, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.9319727891156463, | |
| "grad_norm": 18.055133543008612, | |
| "learning_rate": 1.1981898619829879e-06, | |
| "loss": 2.2485, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.935374149659864, | |
| "grad_norm": 26.45820099458286, | |
| "learning_rate": 1.1926782192157273e-06, | |
| "loss": 2.1845, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.9387755102040817, | |
| "grad_norm": 3.334955291200548, | |
| "learning_rate": 1.1871604907895148e-06, | |
| "loss": 1.7059, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.9421768707482994, | |
| "grad_norm": 19.511242339983163, | |
| "learning_rate": 1.1816368509794364e-06, | |
| "loss": 2.3601, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.9455782312925171, | |
| "grad_norm": 21.146925953072365, | |
| "learning_rate": 1.1761074742472882e-06, | |
| "loss": 1.9957, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.9489795918367347, | |
| "grad_norm": 3.5535024021194452, | |
| "learning_rate": 1.1705725352360633e-06, | |
| "loss": 1.9249, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 13.348912305071467, | |
| "learning_rate": 1.165032208764438e-06, | |
| "loss": 2.0641, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.95578231292517, | |
| "grad_norm": 12.61033318044152, | |
| "learning_rate": 1.1594866698212483e-06, | |
| "loss": 2.169, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.9591836734693877, | |
| "grad_norm": 28.256325358544956, | |
| "learning_rate": 1.1539360935599644e-06, | |
| "loss": 2.0952, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.9625850340136054, | |
| "grad_norm": 12.61302060729169, | |
| "learning_rate": 1.1483806552931582e-06, | |
| "loss": 1.9411, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.9659863945578231, | |
| "grad_norm": 8.711391665501074, | |
| "learning_rate": 1.142820530486966e-06, | |
| "loss": 1.7633, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.9693877551020408, | |
| "grad_norm": 35.95958496013491, | |
| "learning_rate": 1.1372558947555455e-06, | |
| "loss": 2.1904, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.9727891156462585, | |
| "grad_norm": 3.429092657849847, | |
| "learning_rate": 1.131686923855531e-06, | |
| "loss": 1.8276, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.9761904761904762, | |
| "grad_norm": 12.871658288368948, | |
| "learning_rate": 1.1261137936804811e-06, | |
| "loss": 2.0911, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.9795918367346939, | |
| "grad_norm": 13.217001333800638, | |
| "learning_rate": 1.1205366802553228e-06, | |
| "loss": 1.9614, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.9829931972789115, | |
| "grad_norm": 24.712172909538513, | |
| "learning_rate": 1.1149557597307934e-06, | |
| "loss": 2.0412, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.9863945578231292, | |
| "grad_norm": 10.412944718560512, | |
| "learning_rate": 1.1093712083778746e-06, | |
| "loss": 1.7787, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.9897959183673469, | |
| "grad_norm": 15.631851389191027, | |
| "learning_rate": 1.1037832025822265e-06, | |
| "loss": 2.3362, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.9931972789115646, | |
| "grad_norm": 12.135256117907334, | |
| "learning_rate": 1.098191918838617e-06, | |
| "loss": 2.0212, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.9965986394557823, | |
| "grad_norm": 13.057522322919077, | |
| "learning_rate": 1.0925975337453462e-06, | |
| "loss": 2.2842, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 17.565324685523922, | |
| "learning_rate": 1.0870002239986686e-06, | |
| "loss": 2.5002, | |
| "step": 294 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 588, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 294, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 95887829237760.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |