{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9973216459702945,
  "eval_steps": 100,
  "global_step": 1026,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009739469198928659,
      "grad_norm": 5.16649192443276,
      "learning_rate": 4.854368932038835e-07,
      "loss": 0.956,
      "mean_token_accuracy": 0.7770209729671478,
      "step": 5
    },
    {
      "epoch": 0.019478938397857318,
      "grad_norm": 4.487616874759018,
      "learning_rate": 9.70873786407767e-07,
      "loss": 0.9595,
      "mean_token_accuracy": 0.774706457555294,
      "step": 10
    },
    {
      "epoch": 0.029218407596785977,
      "grad_norm": 1.9918521253065147,
      "learning_rate": 1.4563106796116506e-06,
      "loss": 0.9087,
      "mean_token_accuracy": 0.7794785097241401,
      "step": 15
    },
    {
      "epoch": 0.038957876795714635,
      "grad_norm": 2.140741610293528,
      "learning_rate": 1.941747572815534e-06,
      "loss": 0.8516,
      "mean_token_accuracy": 0.7845764443278312,
      "step": 20
    },
    {
      "epoch": 0.048697345994643294,
      "grad_norm": 1.577104941303547,
      "learning_rate": 2.427184466019418e-06,
      "loss": 0.7835,
      "mean_token_accuracy": 0.795551997423172,
      "step": 25
    },
    {
      "epoch": 0.05843681519357195,
      "grad_norm": 0.9851513931833208,
      "learning_rate": 2.912621359223301e-06,
      "loss": 0.7536,
      "mean_token_accuracy": 0.801626966893673,
      "step": 30
    },
    {
      "epoch": 0.0681762843925006,
      "grad_norm": 0.7665830684245213,
      "learning_rate": 3.398058252427185e-06,
      "loss": 0.7207,
      "mean_token_accuracy": 0.8073863789439202,
      "step": 35
    },
    {
      "epoch": 0.07791575359142927,
      "grad_norm": 0.6623786102965908,
      "learning_rate": 3.883495145631068e-06,
      "loss": 0.6976,
      "mean_token_accuracy": 0.8124146014451981,
      "step": 40
    },
    {
      "epoch": 0.08765522279035792,
      "grad_norm": 0.5409465013517523,
      "learning_rate": 4.368932038834952e-06,
      "loss": 0.6791,
      "mean_token_accuracy": 0.8153241157531739,
      "step": 45
    },
    {
      "epoch": 0.09739469198928659,
      "grad_norm": 0.533911697336328,
      "learning_rate": 4.854368932038836e-06,
      "loss": 0.6494,
      "mean_token_accuracy": 0.8212776482105255,
      "step": 50
    },
    {
      "epoch": 0.10713416118821524,
      "grad_norm": 0.46335668910229383,
      "learning_rate": 5.3398058252427185e-06,
      "loss": 0.6374,
      "mean_token_accuracy": 0.8242872759699822,
      "step": 55
    },
    {
      "epoch": 0.1168736303871439,
      "grad_norm": 0.48547854119926936,
      "learning_rate": 5.825242718446602e-06,
      "loss": 0.6321,
      "mean_token_accuracy": 0.8252887204289436,
      "step": 60
    },
    {
      "epoch": 0.12661309958607256,
      "grad_norm": 0.4864652293922326,
      "learning_rate": 6.310679611650487e-06,
      "loss": 0.6246,
      "mean_token_accuracy": 0.8261281028389931,
      "step": 65
    },
    {
      "epoch": 0.1363525687850012,
      "grad_norm": 0.48690138801181443,
      "learning_rate": 6.79611650485437e-06,
      "loss": 0.6168,
      "mean_token_accuracy": 0.8278339207172394,
      "step": 70
    },
    {
      "epoch": 0.14609203798392986,
      "grad_norm": 0.4337713878880355,
      "learning_rate": 7.2815533980582534e-06,
      "loss": 0.5925,
      "mean_token_accuracy": 0.8333460614085197,
      "step": 75
    },
    {
      "epoch": 0.15583150718285854,
      "grad_norm": 0.45660620623082415,
      "learning_rate": 7.766990291262136e-06,
      "loss": 0.5973,
      "mean_token_accuracy": 0.8320671111345291,
      "step": 80
    },
    {
      "epoch": 0.1655709763817872,
      "grad_norm": 0.4424374164027964,
      "learning_rate": 8.25242718446602e-06,
      "loss": 0.5873,
      "mean_token_accuracy": 0.8338468298316002,
      "step": 85
    },
    {
      "epoch": 0.17531044558071585,
      "grad_norm": 0.4949541791856808,
      "learning_rate": 8.737864077669904e-06,
      "loss": 0.5846,
      "mean_token_accuracy": 0.8346328064799309,
      "step": 90
    },
    {
      "epoch": 0.1850499147796445,
      "grad_norm": 0.44716595203588544,
      "learning_rate": 9.223300970873788e-06,
      "loss": 0.574,
      "mean_token_accuracy": 0.8365551233291626,
      "step": 95
    },
    {
      "epoch": 0.19478938397857318,
      "grad_norm": 0.40922461158607365,
      "learning_rate": 9.708737864077671e-06,
      "loss": 0.5745,
      "mean_token_accuracy": 0.8366570115089417,
      "step": 100
    },
    {
      "epoch": 0.20452885317750183,
      "grad_norm": 0.5016773528604923,
      "learning_rate": 9.99988415036596e-06,
      "loss": 0.5649,
      "mean_token_accuracy": 0.8382768034934998,
      "step": 105
    },
    {
      "epoch": 0.21426832237643048,
      "grad_norm": 0.5010899629730204,
      "learning_rate": 9.99858090363555e-06,
      "loss": 0.569,
      "mean_token_accuracy": 0.8377967774868011,
      "step": 110
    },
    {
      "epoch": 0.22400779157535913,
      "grad_norm": 0.5010895934649102,
      "learning_rate": 9.995829976834402e-06,
      "loss": 0.5654,
      "mean_token_accuracy": 0.8383171066641808,
      "step": 115
    },
    {
      "epoch": 0.2337472607742878,
      "grad_norm": 0.5864787846174829,
      "learning_rate": 9.99163216668102e-06,
      "loss": 0.5703,
      "mean_token_accuracy": 0.8369988009333611,
      "step": 120
    },
    {
      "epoch": 0.24348672997321646,
      "grad_norm": 0.5502383423638442,
      "learning_rate": 9.985988688937684e-06,
      "loss": 0.5632,
      "mean_token_accuracy": 0.8387595832347869,
      "step": 125
    },
    {
      "epoch": 0.2532261991721451,
      "grad_norm": 0.4838385873501471,
      "learning_rate": 9.978901178058333e-06,
      "loss": 0.5472,
      "mean_token_accuracy": 0.8424718379974365,
      "step": 130
    },
    {
      "epoch": 0.26296566837107377,
      "grad_norm": 0.5371306966614764,
      "learning_rate": 9.970371686715205e-06,
      "loss": 0.5431,
      "mean_token_accuracy": 0.8432716697454452,
      "step": 135
    },
    {
      "epoch": 0.2727051375700024,
      "grad_norm": 0.5584506793424302,
      "learning_rate": 9.960402685204347e-06,
      "loss": 0.5516,
      "mean_token_accuracy": 0.8408507108688354,
      "step": 140
    },
    {
      "epoch": 0.2824446067689311,
      "grad_norm": 0.5467853450677332,
      "learning_rate": 9.948997060730161e-06,
      "loss": 0.5464,
      "mean_token_accuracy": 0.8424971371889114,
      "step": 145
    },
    {
      "epoch": 0.2921840759678597,
      "grad_norm": 0.48818575924287444,
      "learning_rate": 9.936158116569231e-06,
      "loss": 0.5489,
      "mean_token_accuracy": 0.8420207649469376,
      "step": 150
    },
    {
      "epoch": 0.30192354516678843,
      "grad_norm": 0.46723378629583207,
      "learning_rate": 9.921889571113629e-06,
      "loss": 0.5462,
      "mean_token_accuracy": 0.8419363871216774,
      "step": 155
    },
    {
      "epoch": 0.3116630143657171,
      "grad_norm": 0.561719955013033,
      "learning_rate": 9.906195556793996e-06,
      "loss": 0.546,
      "mean_token_accuracy": 0.8423524782061577,
      "step": 160
    },
    {
      "epoch": 0.32140248356464574,
      "grad_norm": 0.625160714021079,
      "learning_rate": 9.889080618882719e-06,
      "loss": 0.5335,
      "mean_token_accuracy": 0.8451830595731735,
      "step": 165
    },
    {
      "epoch": 0.3311419527635744,
      "grad_norm": 0.5878540451317104,
      "learning_rate": 9.870549714177538e-06,
      "loss": 0.5463,
      "mean_token_accuracy": 0.8417419150471688,
      "step": 170
    },
    {
      "epoch": 0.34088142196250304,
      "grad_norm": 0.6154889939045929,
      "learning_rate": 9.850608209565967e-06,
      "loss": 0.5327,
      "mean_token_accuracy": 0.8450453072786331,
      "step": 175
    },
    {
      "epoch": 0.3506208911614317,
      "grad_norm": 0.4329465610848603,
      "learning_rate": 9.829261880470941e-06,
      "loss": 0.5392,
      "mean_token_accuracy": 0.8434989348053932,
      "step": 180
    },
    {
      "epoch": 0.36036036036036034,
      "grad_norm": 0.5971707984256834,
      "learning_rate": 9.806516909178161e-06,
      "loss": 0.5324,
      "mean_token_accuracy": 0.84499292075634,
      "step": 185
    },
    {
      "epoch": 0.370099829559289,
      "grad_norm": 0.44190322710346147,
      "learning_rate": 9.78237988304557e-06,
      "loss": 0.5332,
      "mean_token_accuracy": 0.845010556280613,
      "step": 190
    },
    {
      "epoch": 0.3798392987582177,
      "grad_norm": 0.4258724885095506,
      "learning_rate": 9.756857792595555e-06,
      "loss": 0.5319,
      "mean_token_accuracy": 0.845000034570694,
      "step": 195
    },
    {
      "epoch": 0.38957876795714635,
      "grad_norm": 0.4729261486427116,
      "learning_rate": 9.729958029490353e-06,
      "loss": 0.5336,
      "mean_token_accuracy": 0.8447421163320541,
      "step": 200
    },
    {
      "epoch": 0.399318237156075,
      "grad_norm": 0.4528083565277609,
      "learning_rate": 9.701688384391296e-06,
      "loss": 0.5347,
      "mean_token_accuracy": 0.8443498685956001,
      "step": 205
    },
    {
      "epoch": 0.40905770635500366,
      "grad_norm": 0.5259149339810829,
      "learning_rate": 9.672057044702492e-06,
      "loss": 0.5199,
      "mean_token_accuracy": 0.848120279610157,
      "step": 210
    },
    {
      "epoch": 0.4187971755539323,
      "grad_norm": 0.46836949308672154,
      "learning_rate": 9.641072592199599e-06,
      "loss": 0.5219,
      "mean_token_accuracy": 0.8473641723394394,
      "step": 215
    },
    {
      "epoch": 0.42853664475286096,
      "grad_norm": 0.5302502163183013,
      "learning_rate": 9.608744000544392e-06,
      "loss": 0.5174,
      "mean_token_accuracy": 0.8485160410404206,
      "step": 220
    },
    {
      "epoch": 0.4382761139517896,
      "grad_norm": 0.51343914785562,
      "learning_rate": 9.575080632685832e-06,
      "loss": 0.5239,
      "mean_token_accuracy": 0.846913392841816,
      "step": 225
    },
    {
      "epoch": 0.44801558315071827,
      "grad_norm": 0.5925559680424145,
      "learning_rate": 9.54009223814837e-06,
      "loss": 0.5277,
      "mean_token_accuracy": 0.8454070091247559,
      "step": 230
    },
    {
      "epoch": 0.4577550523496469,
      "grad_norm": 0.426110283810703,
      "learning_rate": 9.503788950208324e-06,
      "loss": 0.5215,
      "mean_token_accuracy": 0.8473676040768623,
      "step": 235
    },
    {
      "epoch": 0.4674945215485756,
      "grad_norm": 0.5303378941560718,
      "learning_rate": 9.466181282959083e-06,
      "loss": 0.5282,
      "mean_token_accuracy": 0.8465007901191711,
      "step": 240
    },
    {
      "epoch": 0.4772339907475043,
      "grad_norm": 0.4965315113021109,
      "learning_rate": 9.427280128266049e-06,
      "loss": 0.5179,
      "mean_token_accuracy": 0.8484022691845894,
      "step": 245
    },
    {
      "epoch": 0.48697345994643293,
      "grad_norm": 0.44982624201366644,
      "learning_rate": 9.387096752612144e-06,
      "loss": 0.5224,
      "mean_token_accuracy": 0.8463509559631348,
      "step": 250
    },
    {
      "epoch": 0.4967129291453616,
      "grad_norm": 0.4623459721033935,
      "learning_rate": 9.345642793834825e-06,
      "loss": 0.5271,
      "mean_token_accuracy": 0.8463737353682518,
      "step": 255
    },
    {
      "epoch": 0.5064523983442902,
      "grad_norm": 0.5825923961149249,
      "learning_rate": 9.302930257755579e-06,
      "loss": 0.53,
      "mean_token_accuracy": 0.8450088694691658,
      "step": 260
    },
    {
      "epoch": 0.5161918675432189,
      "grad_norm": 0.45940800329264964,
      "learning_rate": 9.258971514702789e-06,
      "loss": 0.507,
      "mean_token_accuracy": 0.8508246764540672,
      "step": 265
    },
    {
      "epoch": 0.5259313367421475,
      "grad_norm": 0.4465364801256697,
      "learning_rate": 9.213779295929082e-06,
      "loss": 0.5087,
      "mean_token_accuracy": 0.8500014141201973,
      "step": 270
    },
    {
      "epoch": 0.5356708059410762,
      "grad_norm": 0.4686695606038613,
      "learning_rate": 9.167366689924116e-06,
      "loss": 0.5163,
      "mean_token_accuracy": 0.8484609499573708,
      "step": 275
    },
    {
      "epoch": 0.5454102751400048,
      "grad_norm": 0.5572145736502692,
      "learning_rate": 9.119747138623925e-06,
      "loss": 0.5221,
      "mean_token_accuracy": 0.8470426678657532,
      "step": 280
    },
    {
      "epoch": 0.5551497443389335,
      "grad_norm": 0.4765827682250088,
      "learning_rate": 9.070934433517872e-06,
      "loss": 0.5068,
      "mean_token_accuracy": 0.8509115263819694,
      "step": 285
    },
    {
      "epoch": 0.5648892135378621,
      "grad_norm": 0.49934327207360063,
      "learning_rate": 9.020942711654404e-06,
      "loss": 0.5106,
      "mean_token_accuracy": 0.8498208403587342,
      "step": 290
    },
    {
      "epoch": 0.5746286827367908,
      "grad_norm": 0.5101324493724546,
      "learning_rate": 8.969786451546691e-06,
      "loss": 0.5123,
      "mean_token_accuracy": 0.8496938437223435,
      "step": 295
    },
    {
      "epoch": 0.5843681519357194,
      "grad_norm": 0.5089021103596151,
      "learning_rate": 8.917480468979387e-06,
      "loss": 0.5128,
      "mean_token_accuracy": 0.8487787261605263,
      "step": 300
    },
    {
      "epoch": 0.5941076211346482,
      "grad_norm": 0.5501775930667125,
      "learning_rate": 8.864039912717713e-06,
      "loss": 0.5123,
      "mean_token_accuracy": 0.849444879591465,
      "step": 305
    },
    {
      "epoch": 0.6038470903335769,
      "grad_norm": 0.41563520519489466,
      "learning_rate": 8.809480260120096e-06,
      "loss": 0.5048,
      "mean_token_accuracy": 0.8513683333992959,
      "step": 310
    },
    {
      "epoch": 0.6135865595325055,
      "grad_norm": 0.4999791105897695,
      "learning_rate": 8.753817312655642e-06,
      "loss": 0.514,
      "mean_token_accuracy": 0.8484693005681038,
      "step": 315
    },
    {
      "epoch": 0.6233260287314342,
      "grad_norm": 0.48088239973601826,
      "learning_rate": 8.697067191327748e-06,
      "loss": 0.5114,
      "mean_token_accuracy": 0.8495015501976013,
      "step": 320
    },
    {
      "epoch": 0.6330654979303628,
      "grad_norm": 0.4440141428026334,
      "learning_rate": 8.639246332005163e-06,
      "loss": 0.5064,
      "mean_token_accuracy": 0.8507678374648094,
      "step": 325
    },
    {
      "epoch": 0.6428049671292915,
      "grad_norm": 0.5288324873519791,
      "learning_rate": 8.580371480661857e-06,
      "loss": 0.5024,
      "mean_token_accuracy": 0.8514497712254524,
      "step": 330
    },
    {
      "epoch": 0.6525444363282201,
      "grad_norm": 0.4563934748637001,
      "learning_rate": 8.520459688527091e-06,
      "loss": 0.5108,
      "mean_token_accuracy": 0.849525935947895,
      "step": 335
    },
    {
      "epoch": 0.6622839055271488,
      "grad_norm": 0.4756063591717307,
      "learning_rate": 8.459528307147066e-06,
      "loss": 0.51,
      "mean_token_accuracy": 0.8501726359128952,
      "step": 340
    },
    {
      "epoch": 0.6720233747260774,
      "grad_norm": 0.5732019741491505,
      "learning_rate": 8.397594983359591e-06,
      "loss": 0.5062,
      "mean_token_accuracy": 0.850397090613842,
      "step": 345
    },
    {
      "epoch": 0.6817628439250061,
      "grad_norm": 0.5267653966986828,
      "learning_rate": 8.334677654183254e-06,
      "loss": 0.5065,
      "mean_token_accuracy": 0.8505988359451294,
      "step": 350
    },
    {
      "epoch": 0.6915023131239347,
      "grad_norm": 0.4796149747636043,
      "learning_rate": 8.27079454162252e-06,
      "loss": 0.5028,
      "mean_token_accuracy": 0.851087860763073,
      "step": 355
    },
    {
      "epoch": 0.7012417823228634,
      "grad_norm": 0.4809967856664383,
      "learning_rate": 8.205964147390313e-06,
      "loss": 0.5084,
      "mean_token_accuracy": 0.8496400877833367,
      "step": 360
    },
    {
      "epoch": 0.710981251521792,
      "grad_norm": 0.4348534092140185,
      "learning_rate": 8.140205247549583e-06,
      "loss": 0.4983,
      "mean_token_accuracy": 0.8522587567567825,
      "step": 365
    },
    {
      "epoch": 0.7207207207207207,
      "grad_norm": 0.4799630529257426,
      "learning_rate": 8.073536887075417e-06,
      "loss": 0.5119,
      "mean_token_accuracy": 0.8493492469191551,
      "step": 370
    },
    {
      "epoch": 0.7304601899196493,
      "grad_norm": 0.5305052950944196,
      "learning_rate": 8.005978374339264e-06,
      "loss": 0.4946,
      "mean_token_accuracy": 0.8531364649534225,
      "step": 375
    },
    {
      "epoch": 0.740199659118578,
      "grad_norm": 0.46401531374429855,
      "learning_rate": 7.937549275516882e-06,
      "loss": 0.493,
      "mean_token_accuracy": 0.8535278528928757,
      "step": 380
    },
    {
      "epoch": 0.7499391283175066,
      "grad_norm": 0.46753323308197925,
      "learning_rate": 7.868269408921614e-06,
      "loss": 0.504,
      "mean_token_accuracy": 0.8511819407343865,
      "step": 385
    },
    {
      "epoch": 0.7596785975164354,
      "grad_norm": 0.43897436674183826,
      "learning_rate": 7.798158839264645e-06,
      "loss": 0.4983,
      "mean_token_accuracy": 0.8521765768527985,
      "step": 390
    },
    {
      "epoch": 0.7694180667153641,
      "grad_norm": 0.4030095081507055,
      "learning_rate": 7.7272378718439e-06,
      "loss": 0.5092,
      "mean_token_accuracy": 0.8500049978494644,
      "step": 395
    },
    {
      "epoch": 0.7791575359142927,
      "grad_norm": 0.393465790048579,
      "learning_rate": 7.655527046663254e-06,
      "loss": 0.5022,
      "mean_token_accuracy": 0.8510702222585678,
      "step": 400
    },
    {
      "epoch": 0.7888970051132214,
      "grad_norm": 0.49269595855244275,
      "learning_rate": 7.5830471324837765e-06,
      "loss": 0.4945,
      "mean_token_accuracy": 0.8529314771294594,
      "step": 405
    },
    {
      "epoch": 0.79863647431215,
      "grad_norm": 0.4729001525106615,
      "learning_rate": 7.5098191208087144e-06,
      "loss": 0.5,
      "mean_token_accuracy": 0.8525989070534706,
      "step": 410
    },
    {
      "epoch": 0.8083759435110787,
      "grad_norm": 0.44720074409020516,
      "learning_rate": 7.4358642198039835e-06,
      "loss": 0.4946,
      "mean_token_accuracy": 0.8530281245708465,
      "step": 415
    },
    {
      "epoch": 0.8181154127100073,
      "grad_norm": 0.4192773521625985,
      "learning_rate": 7.36120384815588e-06,
      "loss": 0.4927,
      "mean_token_accuracy": 0.8539465010166168,
      "step": 420
    },
    {
      "epoch": 0.827854881908936,
      "grad_norm": 0.41037683317185314,
      "learning_rate": 7.285859628867851e-06,
      "loss": 0.4952,
      "mean_token_accuracy": 0.8532393842935562,
      "step": 425
    },
    {
      "epoch": 0.8375943511078646,
      "grad_norm": 0.4536782151813187,
      "learning_rate": 7.209853382998077e-06,
      "loss": 0.4983,
      "mean_token_accuracy": 0.8528945103287697,
      "step": 430
    },
    {
      "epoch": 0.8473338203067933,
      "grad_norm": 0.4895833290473578,
      "learning_rate": 7.133207123339689e-06,
      "loss": 0.4939,
      "mean_token_accuracy": 0.853129243850708,
      "step": 435
    },
    {
      "epoch": 0.8570732895057219,
      "grad_norm": 0.4771689532642086,
      "learning_rate": 7.055943048045476e-06,
      "loss": 0.5002,
      "mean_token_accuracy": 0.8518661975860595,
      "step": 440
    },
    {
      "epoch": 0.8668127587046506,
      "grad_norm": 0.4439177979495324,
      "learning_rate": 6.978083534198878e-06,
      "loss": 0.4977,
      "mean_token_accuracy": 0.8526063248515129,
      "step": 445
    },
    {
      "epoch": 0.8765522279035792,
      "grad_norm": 0.45795986197220584,
      "learning_rate": 6.899651131333194e-06,
      "loss": 0.4876,
      "mean_token_accuracy": 0.8546572834253311,
      "step": 450
    },
    {
      "epoch": 0.8862916971025079,
      "grad_norm": 0.38084088770564956,
      "learning_rate": 6.82066855490081e-06,
      "loss": 0.5009,
      "mean_token_accuracy": 0.851148933172226,
      "step": 455
    },
    {
      "epoch": 0.8960311663014365,
      "grad_norm": 0.4552737931829716,
      "learning_rate": 6.741158679694403e-06,
      "loss": 0.4968,
      "mean_token_accuracy": 0.8524380102753639,
      "step": 460
    },
    {
      "epoch": 0.9057706355003652,
      "grad_norm": 0.38831398930391275,
      "learning_rate": 6.661144533221974e-06,
      "loss": 0.4897,
      "mean_token_accuracy": 0.8537176489830017,
      "step": 465
    },
    {
      "epoch": 0.9155101046992938,
      "grad_norm": 0.42794585960634596,
      "learning_rate": 6.58064928903767e-06,
      "loss": 0.4942,
      "mean_token_accuracy": 0.8529506504535675,
      "step": 470
    },
    {
      "epoch": 0.9252495738982226,
      "grad_norm": 0.42890975667323755,
      "learning_rate": 6.499696260030297e-06,
      "loss": 0.5064,
      "mean_token_accuracy": 0.8502799227833748,
      "step": 475
    },
    {
      "epoch": 0.9349890430971513,
      "grad_norm": 0.4478332747744812,
      "learning_rate": 6.418308891671484e-06,
      "loss": 0.4855,
      "mean_token_accuracy": 0.8555838361382484,
      "step": 480
    },
    {
      "epoch": 0.9447285122960799,
      "grad_norm": 0.435211802978247,
      "learning_rate": 6.336510755225447e-06,
      "loss": 0.4835,
      "mean_token_accuracy": 0.8558964654803276,
      "step": 485
    },
    {
      "epoch": 0.9544679814950086,
      "grad_norm": 0.4462726381437146,
      "learning_rate": 6.25432554092232e-06,
      "loss": 0.4898,
      "mean_token_accuracy": 0.8544702440500259,
      "step": 490
    },
    {
      "epoch": 0.9642074506939372,
      "grad_norm": 0.44503915866443544,
      "learning_rate": 6.171777051097037e-06,
      "loss": 0.4858,
      "mean_token_accuracy": 0.8554443955421448,
      "step": 495
    },
    {
      "epoch": 0.9739469198928659,
      "grad_norm": 0.47966098493432496,
      "learning_rate": 6.088889193295738e-06,
      "loss": 0.4929,
      "mean_token_accuracy": 0.8535514727234841,
      "step": 500
    },
    {
      "epoch": 0.9836863890917945,
      "grad_norm": 0.5437585793823457,
      "learning_rate": 6.005685973351708e-06,
      "loss": 0.4931,
      "mean_token_accuracy": 0.8531181156635285,
      "step": 505
    },
    {
      "epoch": 0.9934258582907232,
      "grad_norm": 0.4124694681433172,
      "learning_rate": 5.922191488432857e-06,
      "loss": 0.4917,
      "mean_token_accuracy": 0.8535135626792908,
      "step": 510
    },
    {
      "epoch": 1.0019478938397857,
      "grad_norm": 0.803029283965446,
      "learning_rate": 5.838429920062734e-06,
      "loss": 0.4727,
      "mean_token_accuracy": 0.8567549926894051,
      "step": 515
    },
    {
      "epoch": 1.0116873630387144,
      "grad_norm": 0.3927635667112885,
      "learning_rate": 5.754425527117118e-06,
      "loss": 0.4479,
      "mean_token_accuracy": 0.864441742002964,
      "step": 520
    },
    {
      "epoch": 1.021426832237643,
      "grad_norm": 0.45928387456979786,
      "learning_rate": 5.670202638798213e-06,
      "loss": 0.4598,
      "mean_token_accuracy": 0.8615871027112008,
      "step": 525
    },
    {
      "epoch": 1.0311663014365717,
      "grad_norm": 0.517389580126703,
      "learning_rate": 5.585785647588458e-06,
      "loss": 0.4572,
      "mean_token_accuracy": 0.8620010375976562,
      "step": 530
    },
    {
      "epoch": 1.0409057706355003,
      "grad_norm": 0.37886924917283415,
      "learning_rate": 5.501199002186024e-06,
      "loss": 0.455,
      "mean_token_accuracy": 0.8625968441367149,
      "step": 535
    },
    {
      "epoch": 1.050645239834429,
      "grad_norm": 0.39773153996958316,
      "learning_rate": 5.416467200424032e-06,
      "loss": 0.45,
      "mean_token_accuracy": 0.8637859463691712,
      "step": 540
    },
    {
      "epoch": 1.0603847090333576,
      "grad_norm": 0.40508395528042257,
      "learning_rate": 5.33161478217552e-06,
      "loss": 0.4516,
      "mean_token_accuracy": 0.8634377360343933,
      "step": 545
    },
    {
      "epoch": 1.0701241782322863,
      "grad_norm": 0.37429603209812456,
      "learning_rate": 5.246666322246267e-06,
      "loss": 0.4445,
      "mean_token_accuracy": 0.8651037693023682,
      "step": 550
    },
    {
      "epoch": 1.079863647431215,
      "grad_norm": 0.47125189912053295,
      "learning_rate": 5.1616464232574635e-06,
      "loss": 0.4626,
      "mean_token_accuracy": 0.8602706581354141,
      "step": 555
    },
    {
      "epoch": 1.0896031166301436,
      "grad_norm": 0.435383825468146,
      "learning_rate": 5.076579708520355e-06,
      "loss": 0.4497,
      "mean_token_accuracy": 0.8635805040597916,
      "step": 560
    },
    {
      "epoch": 1.0993425858290724,
      "grad_norm": 0.412421290706311,
      "learning_rate": 4.991490814904888e-06,
      "loss": 0.4378,
      "mean_token_accuracy": 0.8668369174003601,
      "step": 565
    },
    {
      "epoch": 1.109082055028001,
      "grad_norm": 0.41209358309681204,
      "learning_rate": 4.906404385704402e-06,
      "loss": 0.4525,
      "mean_token_accuracy": 0.8637306377291679,
      "step": 570
    },
    {
      "epoch": 1.1188215242269297,
      "grad_norm": 0.3821281290056491,
      "learning_rate": 4.82134506349851e-06,
      "loss": 0.4564,
      "mean_token_accuracy": 0.8625956058502198,
      "step": 575
    },
    {
      "epoch": 1.1285609934258582,
      "grad_norm": 0.34980242368851916,
      "learning_rate": 4.736337483016138e-06,
      "loss": 0.4513,
      "mean_token_accuracy": 0.8634121060371399,
      "step": 580
    },
    {
      "epoch": 1.138300462624787,
      "grad_norm": 0.39297932611221925,
      "learning_rate": 4.651406264000871e-06,
      "loss": 0.4512,
      "mean_token_accuracy": 0.8632175624370575,
      "step": 585
    },
    {
      "epoch": 1.1480399318237156,
      "grad_norm": 0.4317512663581136,
      "learning_rate": 4.5665760040806174e-06,
      "loss": 0.4558,
      "mean_token_accuracy": 0.8621700823307037,
      "step": 590
    },
    {
      "epoch": 1.1577794010226443,
      "grad_norm": 0.4166519413744608,
      "learning_rate": 4.481871271643698e-06,
      "loss": 0.4543,
      "mean_token_accuracy": 0.8628953084349632,
      "step": 595
    },
    {
      "epoch": 1.1675188702215729,
      "grad_norm": 0.3817664896689048,
      "learning_rate": 4.397316598723385e-06,
      "loss": 0.4599,
      "mean_token_accuracy": 0.8614401906728745,
      "step": 600
    },
    {
      "epoch": 1.1772583394205016,
      "grad_norm": 0.39223028227054235,
      "learning_rate": 4.312936473892984e-06,
      "loss": 0.4559,
      "mean_token_accuracy": 0.8621881052851676,
      "step": 605
    },
    {
      "epoch": 1.1869978086194302,
      "grad_norm": 0.39870192645434044,
      "learning_rate": 4.228755335173488e-06,
      "loss": 0.4554,
      "mean_token_accuracy": 0.8622724115848541,
      "step": 610
    },
    {
      "epoch": 1.196737277818359,
      "grad_norm": 0.3968827579618214,
      "learning_rate": 4.1447975629559e-06,
      "loss": 0.4496,
      "mean_token_accuracy": 0.8639340966939926,
      "step": 615
    },
    {
      "epoch": 1.2064767470172875,
      "grad_norm": 0.3525974840125754,
      "learning_rate": 4.061087472940204e-06,
      "loss": 0.4468,
      "mean_token_accuracy": 0.8643500834703446,
      "step": 620
    },
    {
      "epoch": 1.2162162162162162,
      "grad_norm": 0.34784808537613615,
      "learning_rate": 3.977649309093113e-06,
      "loss": 0.4463,
      "mean_token_accuracy": 0.8645710095763206,
      "step": 625
    },
    {
      "epoch": 1.225955685415145,
      "grad_norm": 0.3588527258157974,
      "learning_rate": 3.89450723662657e-06,
      "loss": 0.4517,
      "mean_token_accuracy": 0.863483439385891,
      "step": 630
    },
    {
      "epoch": 1.2356951546140735,
      "grad_norm": 0.495886348965792,
      "learning_rate": 3.8116853349990574e-06,
      "loss": 0.4507,
      "mean_token_accuracy": 0.8636451244354248,
      "step": 635
    },
    {
      "epoch": 1.245434623813002,
      "grad_norm": 0.3842625748309975,
      "learning_rate": 3.729207590941753e-06,
      "loss": 0.4377,
      "mean_token_accuracy": 0.8670691177248955,
      "step": 640
    },
    {
      "epoch": 1.2551740930119308,
      "grad_norm": 0.4426936471853252,
      "learning_rate": 3.647097891511536e-06,
      "loss": 0.4495,
      "mean_token_accuracy": 0.8640359625220299,
      "step": 645
    },
    {
      "epoch": 1.2649135622108596,
      "grad_norm": 0.3707914127374137,
      "learning_rate": 3.565380017172854e-06,
      "loss": 0.4397,
      "mean_token_accuracy": 0.8666136890649796,
      "step": 650
    },
    {
      "epoch": 1.2746530314097881,
      "grad_norm": 0.35267532874755536,
      "learning_rate": 3.4840776349104755e-06,
      "loss": 0.4539,
      "mean_token_accuracy": 0.8626947477459908,
      "step": 655
    },
    {
      "epoch": 1.284392500608717,
      "grad_norm": 0.3749743227529099,
      "learning_rate": 3.4032142913750956e-06,
      "loss": 0.4497,
      "mean_token_accuracy": 0.8637418314814568,
      "step": 660
    },
    {
      "epoch": 1.2941319698076454,
      "grad_norm": 0.37600006002277636,
      "learning_rate": 3.322813406063794e-06,
      "loss": 0.4559,
      "mean_token_accuracy": 0.8622224271297455,
      "step": 665
    },
    {
      "epoch": 1.3038714390065742,
      "grad_norm": 0.4005804398308692,
      "learning_rate": 3.242898264537331e-06,
      "loss": 0.4521,
      "mean_token_accuracy": 0.8632362619042396,
      "step": 670
    },
    {
      "epoch": 1.3136109082055027,
      "grad_norm": 0.38630202996272256,
      "learning_rate": 3.1634920116762175e-06,
      "loss": 0.4499,
      "mean_token_accuracy": 0.8635998621582985,
      "step": 675
    },
    {
      "epoch": 1.3233503774044315,
      "grad_norm": 0.36180685239531096,
      "learning_rate": 3.0846176449775363e-06,
      "loss": 0.4508,
      "mean_token_accuracy": 0.8636634424328804,
      "step": 680
    },
    {
      "epoch": 1.33308984660336,
      "grad_norm": 0.3357931590954893,
      "learning_rate": 3.0062980078944515e-06,
      "loss": 0.4379,
      "mean_token_accuracy": 0.8665053129196167,
      "step": 685
    },
    {
      "epoch": 1.3428293158022888,
      "grad_norm": 0.35662293115272325,
      "learning_rate": 2.9285557832203328e-06,
      "loss": 0.4458,
      "mean_token_accuracy": 0.8648849859833717,
      "step": 690
    },
    {
      "epoch": 1.3525687850012174,
      "grad_norm": 0.35254913645932934,
      "learning_rate": 2.851413486519388e-06,
      "loss": 0.4413,
      "mean_token_accuracy": 0.8654858738183975,
      "step": 695
    },
    {
      "epoch": 1.3623082542001461,
      "grad_norm": 0.3448432687225989,
      "learning_rate": 2.774893459605766e-06,
      "loss": 0.4431,
      "mean_token_accuracy": 0.86555365473032,
      "step": 700
    },
    {
      "epoch": 1.3720477233990747,
      "grad_norm": 0.35405617501926967,
      "learning_rate": 2.69901786407295e-06,
      "loss": 0.444,
      "mean_token_accuracy": 0.8651208564639091,
      "step": 705
    },
    {
      "epoch": 1.3817871925980034,
      "grad_norm": 0.36859590132072323,
      "learning_rate": 2.6238086748753587e-06,
      "loss": 0.456,
      "mean_token_accuracy": 0.8627100110054016,
      "step": 710
    },
    {
      "epoch": 1.3915266617969322,
      "grad_norm": 0.35246470987862855,
      "learning_rate": 2.5492876739639912e-06,
      "loss": 0.4533,
      "mean_token_accuracy": 0.8628792524337768,
      "step": 715
    },
    {
      "epoch": 1.4012661309958607,
      "grad_norm": 0.36351033252725057,
      "learning_rate": 2.475476443977996e-06,
      "loss": 0.4469,
      "mean_token_accuracy": 0.8642540082335473,
      "step": 720
    },
    {
      "epoch": 1.4110056001947893,
      "grad_norm": 0.35938233389923585,
      "learning_rate": 2.40239636199393e-06,
      "loss": 0.4451,
      "mean_token_accuracy": 0.8650069192051888,
      "step": 725
    },
    {
      "epoch": 1.420745069393718,
      "grad_norm": 0.3673326064574297,
      "learning_rate": 2.3300685933345656e-06,
      "loss": 0.4485,
      "mean_token_accuracy": 0.8642319470643998,
      "step": 730
    },
    {
      "epoch": 1.4304845385926468,
      "grad_norm": 0.37166399663475924,
      "learning_rate": 2.2585140854390432e-06,
      "loss": 0.4496,
      "mean_token_accuracy": 0.8641144469380379,
      "step": 735
    },
    {
      "epoch": 1.4402240077915753,
      "grad_norm": 0.34969080725493995,
      "learning_rate": 2.187753561796097e-06,
      "loss": 0.449,
      "mean_token_accuracy": 0.8638374775648117,
      "step": 740
    },
    {
      "epoch": 1.449963476990504,
      "grad_norm": 0.33346605175419364,
      "learning_rate": 2.117807515942163e-06,
      "loss": 0.4487,
      "mean_token_accuracy": 0.8639461770653725,
      "step": 745
    },
    {
      "epoch": 1.4597029461894326,
      "grad_norm": 0.3402282068550279,
      "learning_rate": 2.0486962055260744e-06,
      "loss": 0.4381,
      "mean_token_accuracy": 0.8668898791074753,
      "step": 750
    },
    {
      "epoch": 1.4694424153883614,
      "grad_norm": 0.3454615445905308,
      "learning_rate": 1.9804396464420798e-06,
      "loss": 0.4407,
      "mean_token_accuracy": 0.8662415385246277,
      "step": 755
    },
    {
      "epoch": 1.47918188458729,
      "grad_norm": 0.3815154168732918,
      "learning_rate": 1.9130576070328695e-06,
      "loss": 0.4502,
      "mean_token_accuracy": 0.863922019302845,
      "step": 760
    },
    {
      "epoch": 1.4889213537862187,
      "grad_norm": 0.35782414062256107,
      "learning_rate": 1.8465696023643115e-06,
      "loss": 0.4484,
      "mean_token_accuracy": 0.8640252217650414,
      "step": 765
    },
    {
      "epoch": 1.4986608229851472,
      "grad_norm": 0.34966179386580715,
      "learning_rate": 1.7809948885735295e-06,
      "loss": 0.4476,
      "mean_token_accuracy": 0.8640663206577301,
      "step": 770
    },
    {
      "epoch": 1.508400292184076,
      "grad_norm": 0.3283826370466805,
      "learning_rate": 1.7163524572919748e-06,
      "loss": 0.4535,
      "mean_token_accuracy": 0.8629004299640656,
      "step": 775
    },
    {
      "epoch": 1.5181397613830048,
      "grad_norm": 0.3309412101835561,
      "learning_rate": 1.6526610301451028e-06,
      "loss": 0.4374,
      "mean_token_accuracy": 0.8666589662432671,
      "step": 780
    },
    {
      "epoch": 1.5278792305819333,
      "grad_norm": 0.35820157894670385,
      "learning_rate": 1.5899390533302538e-06,
      "loss": 0.4387,
      "mean_token_accuracy": 0.8667290091514588,
      "step": 785
    },
    {
      "epoch": 1.5376186997808619,
      "grad_norm": 0.34472450626785944,
      "learning_rate": 1.5282046922742876e-06,
      "loss": 0.4502,
      "mean_token_accuracy": 0.8635128363966942,
      "step": 790
    },
    {
      "epoch": 1.5473581689797906,
      "grad_norm": 0.31690650065908227,
      "learning_rate": 1.4674758263725614e-06,
      "loss": 0.4461,
      "mean_token_accuracy": 0.8644041374325753,
      "step": 795
    },
    {
      "epoch": 1.5570976381787194,
      "grad_norm": 0.3261992242400567,
      "learning_rate": 1.4077700438107183e-06,
      "loss": 0.445,
      "mean_token_accuracy": 0.8651425749063492,
      "step": 800
    },
    {
      "epoch": 1.566837107377648,
      "grad_norm": 0.37132210271715693,
      "learning_rate": 1.3491046364708294e-06,
      "loss": 0.445,
      "mean_token_accuracy": 0.8648254871368408,
      "step": 805
    },
    {
      "epoch": 1.5765765765765765,
      "grad_norm": 0.34739884194127096,
      "learning_rate": 1.2914965949233572e-06,
      "loss": 0.4474,
      "mean_token_accuracy": 0.8643729150295257,
      "step": 810
    },
    {
      "epoch": 1.5863160457755052,
      "grad_norm": 0.3092847696386134,
      "learning_rate": 1.2349626035063705e-06,
      "loss": 0.4389,
      "mean_token_accuracy": 0.8666358023881913,
      "step": 815
    },
    {
      "epoch": 1.596055514974434,
      "grad_norm": 0.33368514787450476,
      "learning_rate": 1.1795190354934587e-06,
      "loss": 0.4635,
      "mean_token_accuracy": 0.8606240957975387,
      "step": 820
    },
    {
      "epoch": 1.6057949841733625,
      "grad_norm": 0.3355701607651583,
      "learning_rate": 1.1251819483517334e-06,
      "loss": 0.4469,
      "mean_token_accuracy": 0.8647612199187279,
      "step": 825
    },
    {
      "epoch": 1.615534453372291,
      "grad_norm": 0.3215469441235877,
      "learning_rate": 1.0719670790912928e-06,
      "loss": 0.4479,
      "mean_token_accuracy": 0.8641064539551735,
      "step": 830
    },
    {
      "epoch": 1.6252739225712198,
      "grad_norm": 0.3331920310980409,
      "learning_rate": 1.019889839707498e-06,
      "loss": 0.447,
      "mean_token_accuracy": 0.8645351231098175,
      "step": 835
    },
    {
      "epoch": 1.6350133917701486,
      "grad_norm": 0.3311460501397384,
      "learning_rate": 9.689653127173743e-07,
      "loss": 0.4548,
      "mean_token_accuracy": 0.8624306350946427,
      "step": 840
    },
    {
      "epoch": 1.6447528609690771,
      "grad_norm": 0.3387701231769685,
      "learning_rate": 9.192082467914465e-07,
      "loss": 0.444,
      "mean_token_accuracy": 0.8649628892540931,
      "step": 845
    },
    {
      "epoch": 1.654492330168006,
      "grad_norm": 0.33969198086970775,
      "learning_rate": 8.706330524822548e-07,
      "loss": 0.4413,
      "mean_token_accuracy": 0.865912164747715,
      "step": 850
    },
    {
      "epoch": 1.6642317993669344,
      "grad_norm": 0.32626435288941413,
      "learning_rate": 8.232537980507848e-07,
      "loss": 0.4454,
      "mean_token_accuracy": 0.8650734156370163,
      "step": 855
    },
    {
      "epoch": 1.6739712685658632,
      "grad_norm": 0.32534723430048623,
      "learning_rate": 7.770842053920585e-07,
      "loss": 0.4424,
      "mean_token_accuracy": 0.8653772249817848,
      "step": 860
    },
    {
      "epoch": 1.683710737764792,
      "grad_norm": 0.33973250353406415,
      "learning_rate": 7.321376460610136e-07,
      "loss": 0.4398,
      "mean_token_accuracy": 0.865781269967556,
      "step": 865
    },
    {
      "epoch": 1.6934502069637205,
      "grad_norm": 0.31477206750092435,
      "learning_rate": 6.884271373998608e-07,
      "loss": 0.4402,
      "mean_token_accuracy": 0.8660067468881607,
      "step": 870
    },
    {
      "epoch": 1.703189676162649,
      "grad_norm": 0.3537664329765484,
      "learning_rate": 6.459653387680248e-07,
      "loss": 0.4426,
      "mean_token_accuracy": 0.8656087100505829,
      "step": 875
    },
    {
      "epoch": 1.7129291453615778,
      "grad_norm": 0.32858889216418335,
      "learning_rate": 6.047645478757635e-07,
      "loss": 0.4367,
      "mean_token_accuracy": 0.8670719146728516,
      "step": 880
    },
    {
      "epoch": 1.7226686145605066,
      "grad_norm": 0.32864732265316704,
      "learning_rate": 5.648366972225222e-07,
      "loss": 0.4527,
      "mean_token_accuracy": 0.8630405649542808,
      "step": 885
    },
    {
      "epoch": 1.7324080837594351,
      "grad_norm": 0.3173505795175411,
      "learning_rate": 5.261933506410722e-07,
      "loss": 0.4401,
      "mean_token_accuracy": 0.8665132194757461,
      "step": 890
    },
    {
      "epoch": 1.7421475529583637,
      "grad_norm": 0.36040952773775503,
      "learning_rate": 4.888456999484098e-07,
      "loss": 0.4465,
      "mean_token_accuracy": 0.8646954327821732,
      "step": 895
    },
    {
      "epoch": 1.7518870221572924,
      "grad_norm": 0.31604383070212816,
      "learning_rate": 4.528045617044019e-07,
      "loss": 0.443,
      "mean_token_accuracy": 0.8652036920189857,
      "step": 900
    },
    {
      "epoch": 1.7616264913562212,
      "grad_norm": 0.31389323834734445,
      "learning_rate": 4.180803740791156e-07,
      "loss": 0.4426,
      "mean_token_accuracy": 0.8656812936067582,
      "step": 905
    },
    {
      "epoch": 1.7713659605551497,
      "grad_norm": 0.3097110381509705,
      "learning_rate": 3.846831938297324e-07,
      "loss": 0.4468,
      "mean_token_accuracy": 0.8643257409334183,
      "step": 910
    },
    {
      "epoch": 1.7811054297540783,
      "grad_norm": 0.31179540512244613,
      "learning_rate": 3.5262269338792623e-07,
      "loss": 0.4447,
      "mean_token_accuracy": 0.8651932507753373,
      "step": 915
    },
    {
      "epoch": 1.790844898953007,
      "grad_norm": 0.33502042463366694,
      "learning_rate": 3.219081580585548e-07,
      "loss": 0.4508,
      "mean_token_accuracy": 0.863429008424282,
      "step": 920
    },
    {
      "epoch": 1.8005843681519358,
      "grad_norm": 0.3164739894198197,
      "learning_rate": 2.9254848333046817e-07,
      "loss": 0.4528,
      "mean_token_accuracy": 0.8630515649914742,
      "step": 925
    },
    {
      "epoch": 1.8103238373508643,
      "grad_norm": 0.3030075688047627,
      "learning_rate": 2.645521723002037e-07,
      "loss": 0.4507,
      "mean_token_accuracy": 0.8635053560137749,
      "step": 930
    },
    {
      "epoch": 1.8200633065497929,
      "grad_norm": 0.3371646202292577,
      "learning_rate": 2.3792733320934348e-07,
      "loss": 0.4441,
      "mean_token_accuracy": 0.8654543533921242,
      "step": 935
    },
    {
      "epoch": 1.8298027757487216,
      "grad_norm": 0.3074575906934633,
      "learning_rate": 2.12681677096217e-07,
      "loss": 0.4351,
      "mean_token_accuracy": 0.8675669968128205,
      "step": 940
    },
    {
      "epoch": 1.8395422449476504,
      "grad_norm": 0.30631898093344184,
      "learning_rate": 1.888225155626433e-07,
      "loss": 0.444,
      "mean_token_accuracy": 0.8653656959533691,
      "step": 945
    },
    {
      "epoch": 1.8492817141465792,
      "grad_norm": 0.2958594503607962,
      "learning_rate": 1.6635675865635859e-07,
      "loss": 0.4568,
      "mean_token_accuracy": 0.8619549512863159,
      "step": 950
    },
    {
      "epoch": 1.8590211833455077,
      "grad_norm": 0.31258753396965294,
      "learning_rate": 1.4529091286973994e-07,
      "loss": 0.444,
      "mean_token_accuracy": 0.8656645834445953,
      "step": 955
    },
    {
      "epoch": 1.8687606525444362,
      "grad_norm": 0.30999125345039114,
      "learning_rate": 1.2563107925540774e-07,
      "loss": 0.4444,
      "mean_token_accuracy": 0.8654705569148063,
      "step": 960
    },
    {
      "epoch": 1.878500121743365,
      "grad_norm": 0.3160340098992061,
      "learning_rate": 1.0738295165924783e-07,
      "loss": 0.4459,
      "mean_token_accuracy": 0.8648449763655662,
      "step": 965
    },
    {
      "epoch": 1.8882395909422938,
      "grad_norm": 0.3187758679651324,
      "learning_rate": 9.055181507137245e-08,
      "loss": 0.444,
      "mean_token_accuracy": 0.8654528453946113,
      "step": 970
    },
    {
      "epoch": 1.8979790601412223,
      "grad_norm": 0.33617665839091787,
      "learning_rate": 7.514254409549005e-08,
      "loss": 0.4481,
      "mean_token_accuracy": 0.8641701668500901,
      "step": 975
    },
    {
      "epoch": 1.9077185293401508,
      "grad_norm": 0.31232672260705024,
      "learning_rate": 6.115960153712963e-08,
      "loss": 0.4414,
      "mean_token_accuracy": 0.8660656422376632,
      "step": 980
    },
    {
      "epoch": 1.9174579985390796,
      "grad_norm": 0.303119007372895,
      "learning_rate": 4.860703711113246e-08,
      "loss": 0.4362,
      "mean_token_accuracy": 0.8676238685846329,
      "step": 985
    },
    {
      "epoch": 1.9271974677380084,
      "grad_norm": 0.30341116951092534,
      "learning_rate": 3.748848626878132e-08,
      "loss": 0.4508,
      "mean_token_accuracy": 0.8636912703514099,
      "step": 990
    },
    {
      "epoch": 1.936936936936937,
      "grad_norm": 0.31515978754450974,
      "learning_rate": 2.7807169144906108e-08,
      "loss": 0.4528,
      "mean_token_accuracy": 0.8630853027105332,
      "step": 995
    },
    {
      "epoch": 1.9466764061358655,
      "grad_norm": 0.30748596132213357,
      "learning_rate": 1.9565889625275945e-08,
      "loss": 0.4558,
      "mean_token_accuracy": 0.8628044292330742,
      "step": 1000
    },
    {
      "epoch": 1.9564158753347942,
      "grad_norm": 0.30248992557073057,
      "learning_rate": 1.2767034534540978e-08,
      "loss": 0.4467,
      "mean_token_accuracy": 0.86439578384161,
      "step": 1005
    },
    {
      "epoch": 1.966155344533723,
      "grad_norm": 0.3256862744869616,
      "learning_rate": 7.412572944965335e-09,
      "loss": 0.4548,
      "mean_token_accuracy": 0.8625592529773712,
      "step": 1010
    },
    {
      "epoch": 1.9758948137326515,
      "grad_norm": 0.3147340514506444,
      "learning_rate": 3.5040556061483043e-09,
      "loss": 0.4422,
      "mean_token_accuracy": 0.8657747611403466,
      "step": 1015
    },
    {
      "epoch": 1.98563428293158,
      "grad_norm": 0.32764257560082577,
      "learning_rate": 1.0426144958985974e-09,
      "loss": 0.4419,
      "mean_token_accuracy": 0.8660721600055694,
      "step": 1020
    },
    {
      "epoch": 1.9953737521305088,
      "grad_norm": 0.3098108021812636,
      "learning_rate": 2.8962492393258546e-11,
      "loss": 0.4452,
      "mean_token_accuracy": 0.8650736406445503,
      "step": 1025
    },
    {
      "epoch": 1.9973216459702945,
      "mean_token_accuracy": 0.8681519404053688,
      "step": 1026,
      "total_flos": 1074983740637184.0,
      "train_loss": 0.49954891321022377,
      "train_runtime": 168401.8035,
      "train_samples_per_second": 0.78,
      "train_steps_per_second": 0.006
    }
  ],
  "logging_steps": 5,
  "max_steps": 1026,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1074983740637184.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
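A minimal sketch of how a log like this might be inspected offline, assuming the JSON above is saved as `trainer_state.json` (the filename and the plotting choices are illustrative assumptions, not part of the original file):

```python
import json

import matplotlib.pyplot as plt

# Assumed path: a hypothetical filename for the JSON document above.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the periodic logging entries; the final summary record
# (the one carrying "train_loss"/"train_runtime") has no "loss" key.
logs = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in logs]
loss = [e["loss"] for e in logs]
lr = [e["learning_rate"] for e in logs]

# Plot training loss and the learning-rate schedule side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(steps, loss)
ax1.set_xlabel("step")
ax1.set_ylabel("training loss")
ax2.plot(steps, lr)
ax2.set_xlabel("step")
ax2.set_ylabel("learning rate")
fig.tight_layout()
plt.show()
```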