| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.99979354483518, | |
| "eval_steps": 200, | |
| "global_step": 3632, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00027527355309338655, | |
| "grad_norm": 47.970261255811366, | |
| "learning_rate": 2.7472527472527476e-08, | |
| "loss": 2.9286, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0013763677654669328, | |
| "grad_norm": 197.0659220078, | |
| "learning_rate": 1.3736263736263737e-07, | |
| "loss": 3.3156, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0027527355309338655, | |
| "grad_norm": 155.2306778030855, | |
| "learning_rate": 2.7472527472527475e-07, | |
| "loss": 3.1021, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.004129103296400798, | |
| "grad_norm": 40.67122345119242, | |
| "learning_rate": 4.120879120879121e-07, | |
| "loss": 2.9314, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.005505471061867731, | |
| "grad_norm": 59.59101654287074, | |
| "learning_rate": 5.494505494505495e-07, | |
| "loss": 3.0831, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.006881838827334664, | |
| "grad_norm": 105.45909774967997, | |
| "learning_rate": 6.868131868131869e-07, | |
| "loss": 3.0525, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.008258206592801597, | |
| "grad_norm": 34.27227049204782, | |
| "learning_rate": 8.241758241758242e-07, | |
| "loss": 3.1182, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.00963457435826853, | |
| "grad_norm": 127.02820304897055, | |
| "learning_rate": 9.615384615384617e-07, | |
| "loss": 2.9352, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.011010942123735462, | |
| "grad_norm": 64.04507215334816, | |
| "learning_rate": 1.098901098901099e-06, | |
| "loss": 2.9873, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.012387309889202395, | |
| "grad_norm": 23.999491442433914, | |
| "learning_rate": 1.2362637362637365e-06, | |
| "loss": 2.667, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.013763677654669328, | |
| "grad_norm": 34.50645980021284, | |
| "learning_rate": 1.3736263736263738e-06, | |
| "loss": 2.6091, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.01514004542013626, | |
| "grad_norm": 46.64865828445111, | |
| "learning_rate": 1.510989010989011e-06, | |
| "loss": 2.4922, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.016516413185603193, | |
| "grad_norm": 21.789171957916576, | |
| "learning_rate": 1.6483516483516484e-06, | |
| "loss": 2.4031, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.017892780951070126, | |
| "grad_norm": 38.24383608079909, | |
| "learning_rate": 1.7857142857142859e-06, | |
| "loss": 2.3918, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.01926914871653706, | |
| "grad_norm": 15.503348119198089, | |
| "learning_rate": 1.9230769230769234e-06, | |
| "loss": 2.2549, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.02064551648200399, | |
| "grad_norm": 12.557848365372337, | |
| "learning_rate": 2.0604395604395607e-06, | |
| "loss": 2.254, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.022021884247470924, | |
| "grad_norm": 13.411610689976131, | |
| "learning_rate": 2.197802197802198e-06, | |
| "loss": 2.2895, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.023398252012937857, | |
| "grad_norm": 11.512956200014314, | |
| "learning_rate": 2.3351648351648353e-06, | |
| "loss": 1.9875, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.02477461977840479, | |
| "grad_norm": 10.834444585244098, | |
| "learning_rate": 2.472527472527473e-06, | |
| "loss": 1.9547, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.026150987543871723, | |
| "grad_norm": 10.311358732225472, | |
| "learning_rate": 2.6098901098901103e-06, | |
| "loss": 1.9957, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.027527355309338655, | |
| "grad_norm": 8.779165691485517, | |
| "learning_rate": 2.7472527472527476e-06, | |
| "loss": 1.8935, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.028903723074805588, | |
| "grad_norm": 7.066737844316085, | |
| "learning_rate": 2.8846153846153845e-06, | |
| "loss": 1.7438, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.03028009084027252, | |
| "grad_norm": 7.678771227526979, | |
| "learning_rate": 3.021978021978022e-06, | |
| "loss": 1.6582, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.03165645860573946, | |
| "grad_norm": 6.076422896577125, | |
| "learning_rate": 3.1593406593406595e-06, | |
| "loss": 1.6652, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.033032826371206386, | |
| "grad_norm": 6.807996481097765, | |
| "learning_rate": 3.2967032967032968e-06, | |
| "loss": 1.6854, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.03440919413667332, | |
| "grad_norm": 5.621259165131704, | |
| "learning_rate": 3.4340659340659345e-06, | |
| "loss": 1.6461, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.03578556190214025, | |
| "grad_norm": 4.18571348105954, | |
| "learning_rate": 3.5714285714285718e-06, | |
| "loss": 1.5098, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.03716192966760719, | |
| "grad_norm": 4.80488403899634, | |
| "learning_rate": 3.708791208791209e-06, | |
| "loss": 1.5253, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.03853829743307412, | |
| "grad_norm": 5.012572093969651, | |
| "learning_rate": 3.846153846153847e-06, | |
| "loss": 1.5984, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.039914665198541054, | |
| "grad_norm": 4.552751574433519, | |
| "learning_rate": 3.983516483516483e-06, | |
| "loss": 1.5606, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.04129103296400798, | |
| "grad_norm": 4.3599791639751535, | |
| "learning_rate": 4.120879120879121e-06, | |
| "loss": 1.4992, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.04266740072947492, | |
| "grad_norm": 3.488042700729171, | |
| "learning_rate": 4.258241758241759e-06, | |
| "loss": 1.4261, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.04404376849494185, | |
| "grad_norm": 3.8333617793891577, | |
| "learning_rate": 4.395604395604396e-06, | |
| "loss": 1.4617, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.045420136260408785, | |
| "grad_norm": 3.0548962932241848, | |
| "learning_rate": 4.532967032967033e-06, | |
| "loss": 1.4781, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.046796504025875714, | |
| "grad_norm": 5.5346794342223, | |
| "learning_rate": 4.6703296703296706e-06, | |
| "loss": 1.4395, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.04817287179134265, | |
| "grad_norm": 3.685964787103759, | |
| "learning_rate": 4.807692307692308e-06, | |
| "loss": 1.3802, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.04954923955680958, | |
| "grad_norm": 4.058757049675143, | |
| "learning_rate": 4.945054945054946e-06, | |
| "loss": 1.4061, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.050925607322276516, | |
| "grad_norm": 2.9156243613759965, | |
| "learning_rate": 4.999990671457219e-06, | |
| "loss": 1.3884, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.052301975087743445, | |
| "grad_norm": 2.8044219675737225, | |
| "learning_rate": 4.999933663947887e-06, | |
| "loss": 1.3079, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.05367834285321038, | |
| "grad_norm": 2.5580737825571354, | |
| "learning_rate": 4.999824832633327e-06, | |
| "loss": 1.2732, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.05505471061867731, | |
| "grad_norm": 2.5474355608388106, | |
| "learning_rate": 4.999664179769621e-06, | |
| "loss": 1.3348, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.05505471061867731, | |
| "eval_loss": 1.2703925371170044, | |
| "eval_runtime": 37.5835, | |
| "eval_samples_per_second": 133.037, | |
| "eval_steps_per_second": 2.102, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.05643107838414425, | |
| "grad_norm": 2.5400210842014985, | |
| "learning_rate": 4.999451708687114e-06, | |
| "loss": 1.2628, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.057807446149611176, | |
| "grad_norm": 2.9134202135126803, | |
| "learning_rate": 4.999187423790347e-06, | |
| "loss": 1.3461, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.05918381391507811, | |
| "grad_norm": 2.979982911881612, | |
| "learning_rate": 4.9988713305579665e-06, | |
| "loss": 1.279, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.06056018168054504, | |
| "grad_norm": 2.3341235353360754, | |
| "learning_rate": 4.998503435542605e-06, | |
| "loss": 1.2791, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.06193654944601198, | |
| "grad_norm": 2.553595077194149, | |
| "learning_rate": 4.9980837463707545e-06, | |
| "loss": 1.2591, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.06331291721147891, | |
| "grad_norm": 2.1559374955549675, | |
| "learning_rate": 4.997612271742601e-06, | |
| "loss": 1.2184, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.06468928497694584, | |
| "grad_norm": 2.143686811503016, | |
| "learning_rate": 4.9970890214318494e-06, | |
| "loss": 1.2676, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.06606565274241277, | |
| "grad_norm": 1.75568515745194, | |
| "learning_rate": 4.996514006285514e-06, | |
| "loss": 1.2013, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.0674420205078797, | |
| "grad_norm": 2.4736934108040027, | |
| "learning_rate": 4.995887238223703e-06, | |
| "loss": 1.2801, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.06881838827334665, | |
| "grad_norm": 1.9847695711171895, | |
| "learning_rate": 4.99520873023936e-06, | |
| "loss": 1.2026, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.07019475603881357, | |
| "grad_norm": 2.3258055796641592, | |
| "learning_rate": 4.994478496398007e-06, | |
| "loss": 1.2115, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.0715711238042805, | |
| "grad_norm": 1.8207346260417532, | |
| "learning_rate": 4.993696551837444e-06, | |
| "loss": 1.2521, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.07294749156974743, | |
| "grad_norm": 1.901002680506845, | |
| "learning_rate": 4.9928629127674375e-06, | |
| "loss": 1.1437, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.07432385933521438, | |
| "grad_norm": 2.1411204432050925, | |
| "learning_rate": 4.991977596469385e-06, | |
| "loss": 1.1638, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0757002271006813, | |
| "grad_norm": 1.6695878470807146, | |
| "learning_rate": 4.991040621295959e-06, | |
| "loss": 1.1406, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.07707659486614823, | |
| "grad_norm": 1.9670297948191877, | |
| "learning_rate": 4.990052006670722e-06, | |
| "loss": 1.1152, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.07845296263161516, | |
| "grad_norm": 1.8164273632468562, | |
| "learning_rate": 4.989011773087725e-06, | |
| "loss": 1.154, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.07982933039708211, | |
| "grad_norm": 1.4090788234057618, | |
| "learning_rate": 4.9879199421110865e-06, | |
| "loss": 1.0789, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.08120569816254904, | |
| "grad_norm": 1.5318431537024289, | |
| "learning_rate": 4.9867765363745426e-06, | |
| "loss": 1.156, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.08258206592801597, | |
| "grad_norm": 1.5662411802833258, | |
| "learning_rate": 4.9855815795809735e-06, | |
| "loss": 1.2253, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0839584336934829, | |
| "grad_norm": 1.487588263963499, | |
| "learning_rate": 4.984335096501922e-06, | |
| "loss": 1.1697, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.08533480145894984, | |
| "grad_norm": 1.5795690517983507, | |
| "learning_rate": 4.983037112977072e-06, | |
| "loss": 1.1747, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.08671116922441677, | |
| "grad_norm": 1.5801325920576266, | |
| "learning_rate": 4.981687655913716e-06, | |
| "loss": 1.1812, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.0880875369898837, | |
| "grad_norm": 1.5881269944242014, | |
| "learning_rate": 4.980286753286196e-06, | |
| "loss": 1.1158, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.08946390475535063, | |
| "grad_norm": 1.3649187456591194, | |
| "learning_rate": 4.978834434135323e-06, | |
| "loss": 1.0911, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.09084027252081757, | |
| "grad_norm": 1.3003473744310392, | |
| "learning_rate": 4.977330728567778e-06, | |
| "loss": 1.0947, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.0922166402862845, | |
| "grad_norm": 2.0679161756500832, | |
| "learning_rate": 4.975775667755489e-06, | |
| "loss": 1.1364, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.09359300805175143, | |
| "grad_norm": 1.5739073785267514, | |
| "learning_rate": 4.974169283934976e-06, | |
| "loss": 1.172, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.09496937581721836, | |
| "grad_norm": 1.7213917188161827, | |
| "learning_rate": 4.972511610406693e-06, | |
| "loss": 1.1608, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.0963457435826853, | |
| "grad_norm": 1.4537784803209515, | |
| "learning_rate": 4.970802681534331e-06, | |
| "loss": 1.1647, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.09772211134815223, | |
| "grad_norm": 1.3731655227058528, | |
| "learning_rate": 4.969042532744109e-06, | |
| "loss": 1.0853, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.09909847911361916, | |
| "grad_norm": 1.2363319744461196, | |
| "learning_rate": 4.967231200524037e-06, | |
| "loss": 1.0066, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.10047484687908609, | |
| "grad_norm": 1.5759080996624, | |
| "learning_rate": 4.965368722423166e-06, | |
| "loss": 1.1516, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.10185121464455303, | |
| "grad_norm": 1.3507882575141734, | |
| "learning_rate": 4.9634551370507985e-06, | |
| "loss": 1.1073, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.10322758241001996, | |
| "grad_norm": 1.277730572175811, | |
| "learning_rate": 4.961490484075698e-06, | |
| "loss": 1.1298, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.10460395017548689, | |
| "grad_norm": 1.4535170267483315, | |
| "learning_rate": 4.9594748042252635e-06, | |
| "loss": 1.1084, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.10598031794095382, | |
| "grad_norm": 1.6186734387061241, | |
| "learning_rate": 4.957408139284682e-06, | |
| "loss": 1.1102, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.10735668570642076, | |
| "grad_norm": 1.132537102297945, | |
| "learning_rate": 4.9552905320960685e-06, | |
| "loss": 1.065, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.10873305347188769, | |
| "grad_norm": 1.3026196221634236, | |
| "learning_rate": 4.9531220265575714e-06, | |
| "loss": 1.1021, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.11010942123735462, | |
| "grad_norm": 1.3077867242379508, | |
| "learning_rate": 4.950902667622468e-06, | |
| "loss": 1.0411, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.11010942123735462, | |
| "eval_loss": 1.0434951782226562, | |
| "eval_runtime": 37.5694, | |
| "eval_samples_per_second": 133.087, | |
| "eval_steps_per_second": 2.103, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.11148578900282155, | |
| "grad_norm": 1.6148360944145506, | |
| "learning_rate": 4.948632501298228e-06, | |
| "loss": 1.0545, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.1128621567682885, | |
| "grad_norm": 1.316099506488652, | |
| "learning_rate": 4.9463115746455656e-06, | |
| "loss": 1.0593, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.11423852453375542, | |
| "grad_norm": 1.4454673179124138, | |
| "learning_rate": 4.943939935777455e-06, | |
| "loss": 1.0217, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.11561489229922235, | |
| "grad_norm": 1.2512294045462884, | |
| "learning_rate": 4.941517633858142e-06, | |
| "loss": 1.1085, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.11699126006468928, | |
| "grad_norm": 1.0101964296414214, | |
| "learning_rate": 4.93904471910212e-06, | |
| "loss": 1.0003, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.11836762783015622, | |
| "grad_norm": 1.1944857398005146, | |
| "learning_rate": 4.936521242773091e-06, | |
| "loss": 1.1296, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.11974399559562315, | |
| "grad_norm": 1.2974835007262053, | |
| "learning_rate": 4.933947257182901e-06, | |
| "loss": 1.1402, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.12112036336109008, | |
| "grad_norm": 1.6118257673114658, | |
| "learning_rate": 4.931322815690457e-06, | |
| "loss": 1.0763, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.12249673112655701, | |
| "grad_norm": 1.224531658843626, | |
| "learning_rate": 4.92864797270062e-06, | |
| "loss": 1.1138, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.12387309889202396, | |
| "grad_norm": 1.1659984082065094, | |
| "learning_rate": 4.925922783663079e-06, | |
| "loss": 1.0189, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.12524946665749087, | |
| "grad_norm": 8.709307393389583, | |
| "learning_rate": 4.923147305071199e-06, | |
| "loss": 1.0822, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.12662583442295783, | |
| "grad_norm": 1.1509719961863358, | |
| "learning_rate": 4.9203215944608515e-06, | |
| "loss": 1.0373, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.12800220218842476, | |
| "grad_norm": 1.0234613424480405, | |
| "learning_rate": 4.917445710409221e-06, | |
| "loss": 1.037, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.1293785699538917, | |
| "grad_norm": 1.1018659479000739, | |
| "learning_rate": 4.914519712533592e-06, | |
| "loss": 1.088, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.13075493771935862, | |
| "grad_norm": 0.9721625793315171, | |
| "learning_rate": 4.911543661490111e-06, | |
| "loss": 1.05, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.13213130548482555, | |
| "grad_norm": 0.9710335336779444, | |
| "learning_rate": 4.908517618972532e-06, | |
| "loss": 1.0123, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.13350767325029247, | |
| "grad_norm": 1.5166549245825005, | |
| "learning_rate": 4.905441647710932e-06, | |
| "loss": 1.05, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.1348840410157594, | |
| "grad_norm": 0.8785700675801744, | |
| "learning_rate": 4.90231581147042e-06, | |
| "loss": 1.032, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.13626040878122633, | |
| "grad_norm": 1.5060433665040545, | |
| "learning_rate": 4.899140175049806e-06, | |
| "loss": 1.0196, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.1376367765466933, | |
| "grad_norm": 1.0487460758379659, | |
| "learning_rate": 4.895914804280262e-06, | |
| "loss": 1.089, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.13901314431216022, | |
| "grad_norm": 1.3023717508397745, | |
| "learning_rate": 4.892639766023957e-06, | |
| "loss": 1.022, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.14038951207762715, | |
| "grad_norm": 1.0972179140074843, | |
| "learning_rate": 4.889315128172669e-06, | |
| "loss": 1.0049, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.14176587984309408, | |
| "grad_norm": 0.8796434076141977, | |
| "learning_rate": 4.885940959646383e-06, | |
| "loss": 0.9685, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.143142247608561, | |
| "grad_norm": 0.9834949701746455, | |
| "learning_rate": 4.882517330391854e-06, | |
| "loss": 1.0246, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.14451861537402794, | |
| "grad_norm": 1.138039557393203, | |
| "learning_rate": 4.879044311381164e-06, | |
| "loss": 1.1077, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.14589498313949487, | |
| "grad_norm": 1.085162105922894, | |
| "learning_rate": 4.875521974610247e-06, | |
| "loss": 1.0675, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.1472713509049618, | |
| "grad_norm": 0.871775704668988, | |
| "learning_rate": 4.8719503930973995e-06, | |
| "loss": 1.019, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.14864771867042875, | |
| "grad_norm": 0.9961249615759942, | |
| "learning_rate": 4.868329640881764e-06, | |
| "loss": 1.0749, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.15002408643589568, | |
| "grad_norm": 1.1040648660006491, | |
| "learning_rate": 4.864659793021795e-06, | |
| "loss": 1.0435, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.1514004542013626, | |
| "grad_norm": 0.9631823018123004, | |
| "learning_rate": 4.860940925593703e-06, | |
| "loss": 0.9509, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.15277682196682954, | |
| "grad_norm": 1.0645503004444963, | |
| "learning_rate": 4.8571731156898785e-06, | |
| "loss": 1.0142, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.15415318973229647, | |
| "grad_norm": 1.0599242806058304, | |
| "learning_rate": 4.8533564414172915e-06, | |
| "loss": 0.9796, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.1555295574977634, | |
| "grad_norm": 1.0803032824855876, | |
| "learning_rate": 4.849490981895877e-06, | |
| "loss": 0.9508, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.15690592526323033, | |
| "grad_norm": 0.8429127532715366, | |
| "learning_rate": 4.845576817256888e-06, | |
| "loss": 0.9975, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.15828229302869726, | |
| "grad_norm": 0.8648413729480695, | |
| "learning_rate": 4.841614028641241e-06, | |
| "loss": 1.0446, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.15965866079416421, | |
| "grad_norm": 0.766164996990341, | |
| "learning_rate": 4.83760269819783e-06, | |
| "loss": 1.0017, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.16103502855963114, | |
| "grad_norm": 1.1710332971275654, | |
| "learning_rate": 4.833542909081824e-06, | |
| "loss": 0.977, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.16241139632509807, | |
| "grad_norm": 0.8805076425003365, | |
| "learning_rate": 4.829434745452944e-06, | |
| "loss": 1.0282, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.163787764090565, | |
| "grad_norm": 0.9917312981640594, | |
| "learning_rate": 4.82527829247372e-06, | |
| "loss": 1.0007, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.16516413185603193, | |
| "grad_norm": 0.9418229318080311, | |
| "learning_rate": 4.821073636307719e-06, | |
| "loss": 1.0483, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.16516413185603193, | |
| "eval_loss": 0.9694015383720398, | |
| "eval_runtime": 37.5636, | |
| "eval_samples_per_second": 133.108, | |
| "eval_steps_per_second": 2.103, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.16654049962149886, | |
| "grad_norm": 1.151675240339901, | |
| "learning_rate": 4.81682086411777e-06, | |
| "loss": 1.0735, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.1679168673869658, | |
| "grad_norm": 0.8785348232118754, | |
| "learning_rate": 4.812520064064146e-06, | |
| "loss": 0.9803, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.16929323515243272, | |
| "grad_norm": 0.7903326312503011, | |
| "learning_rate": 4.8081713253027415e-06, | |
| "loss": 1.0074, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.17066960291789968, | |
| "grad_norm": 1.1239070075461262, | |
| "learning_rate": 4.803774737983226e-06, | |
| "loss": 0.9774, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.1720459706833666, | |
| "grad_norm": 0.8575748601821118, | |
| "learning_rate": 4.799330393247173e-06, | |
| "loss": 0.9554, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.17342233844883354, | |
| "grad_norm": 0.7681124502085931, | |
| "learning_rate": 4.7948383832261665e-06, | |
| "loss": 0.9925, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.17479870621430046, | |
| "grad_norm": 0.9533830559317458, | |
| "learning_rate": 4.790298801039901e-06, | |
| "loss": 0.9942, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.1761750739797674, | |
| "grad_norm": 0.9451172948449857, | |
| "learning_rate": 4.785711740794241e-06, | |
| "loss": 1.0296, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.17755144174523432, | |
| "grad_norm": 0.9522113767917231, | |
| "learning_rate": 4.781077297579278e-06, | |
| "loss": 0.9792, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.17892780951070125, | |
| "grad_norm": 0.671435197412033, | |
| "learning_rate": 4.776395567467353e-06, | |
| "loss": 0.967, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.18030417727616818, | |
| "grad_norm": 0.775668527281842, | |
| "learning_rate": 4.7716666475110686e-06, | |
| "loss": 1.0187, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.18168054504163514, | |
| "grad_norm": 0.8387050001267089, | |
| "learning_rate": 4.766890635741278e-06, | |
| "loss": 1.0319, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.18305691280710207, | |
| "grad_norm": 0.824748710334662, | |
| "learning_rate": 4.762067631165049e-06, | |
| "loss": 0.9293, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.184433280572569, | |
| "grad_norm": 0.9799352793752129, | |
| "learning_rate": 4.757197733763615e-06, | |
| "loss": 0.9157, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.18580964833803593, | |
| "grad_norm": 0.8362870057769561, | |
| "learning_rate": 4.7522810444903004e-06, | |
| "loss": 0.949, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.18718601610350286, | |
| "grad_norm": 0.9321760596367203, | |
| "learning_rate": 4.7473176652684276e-06, | |
| "loss": 0.9901, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.18856238386896979, | |
| "grad_norm": 0.7717121636862594, | |
| "learning_rate": 4.742307698989207e-06, | |
| "loss": 1.0114, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.18993875163443671, | |
| "grad_norm": 1.0300652464671185, | |
| "learning_rate": 4.7372512495096005e-06, | |
| "loss": 1.0247, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.19131511939990364, | |
| "grad_norm": 0.7755223484828447, | |
| "learning_rate": 4.732148421650171e-06, | |
| "loss": 0.9337, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.1926914871653706, | |
| "grad_norm": 1.0381026277362817, | |
| "learning_rate": 4.7269993211929086e-06, | |
| "loss": 0.9709, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.19406785493083753, | |
| "grad_norm": 0.7702377174214647, | |
| "learning_rate": 4.721804054879036e-06, | |
| "loss": 0.9726, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.19544422269630446, | |
| "grad_norm": 0.9053621913087226, | |
| "learning_rate": 4.7165627304068e-06, | |
| "loss": 0.953, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.1968205904617714, | |
| "grad_norm": 1.003117563447582, | |
| "learning_rate": 4.711275456429235e-06, | |
| "loss": 0.9849, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.19819695822723832, | |
| "grad_norm": 0.9099542081572461, | |
| "learning_rate": 4.70594234255191e-06, | |
| "loss": 0.9901, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.19957332599270525, | |
| "grad_norm": 1.0432447959538196, | |
| "learning_rate": 4.700563499330664e-06, | |
| "loss": 0.9535, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.20094969375817218, | |
| "grad_norm": 0.8237571322857683, | |
| "learning_rate": 4.695139038269303e-06, | |
| "loss": 0.9535, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.2023260615236391, | |
| "grad_norm": 0.9877142384272304, | |
| "learning_rate": 4.689669071817296e-06, | |
| "loss": 0.9509, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.20370242928910606, | |
| "grad_norm": 0.681443730837864, | |
| "learning_rate": 4.684153713367442e-06, | |
| "loss": 0.917, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.205078797054573, | |
| "grad_norm": 1.0414452790533368, | |
| "learning_rate": 4.678593077253521e-06, | |
| "loss": 0.9662, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.20645516482003992, | |
| "grad_norm": 0.7632284159752581, | |
| "learning_rate": 4.672987278747919e-06, | |
| "loss": 0.9588, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.20783153258550685, | |
| "grad_norm": 0.7404164880344434, | |
| "learning_rate": 4.667336434059246e-06, | |
| "loss": 0.9426, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.20920790035097378, | |
| "grad_norm": 0.7658934106898954, | |
| "learning_rate": 4.661640660329918e-06, | |
| "loss": 0.9787, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.2105842681164407, | |
| "grad_norm": 0.7794575989249981, | |
| "learning_rate": 4.655900075633736e-06, | |
| "loss": 0.9341, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.21196063588190764, | |
| "grad_norm": 0.681238788669416, | |
| "learning_rate": 4.650114798973434e-06, | |
| "loss": 0.9734, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.21333700364737457, | |
| "grad_norm": 1.0301580037782345, | |
| "learning_rate": 4.644284950278217e-06, | |
| "loss": 0.9438, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.21471337141284152, | |
| "grad_norm": 1.3078635807263586, | |
| "learning_rate": 4.638410650401267e-06, | |
| "loss": 0.9335, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.21608973917830845, | |
| "grad_norm": 0.730305470715918, | |
| "learning_rate": 4.632492021117245e-06, | |
| "loss": 0.9164, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.21746610694377538, | |
| "grad_norm": 1.1199081512447784, | |
| "learning_rate": 4.626529185119763e-06, | |
| "loss": 0.9451, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.2188424747092423, | |
| "grad_norm": 1.0679579821267784, | |
| "learning_rate": 4.620522266018841e-06, | |
| "loss": 0.9914, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.22021884247470924, | |
| "grad_norm": 0.6094802947859647, | |
| "learning_rate": 4.614471388338346e-06, | |
| "loss": 0.8801, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.22021884247470924, | |
| "eval_loss": 0.9227399230003357, | |
| "eval_runtime": 37.5694, | |
| "eval_samples_per_second": 133.087, | |
| "eval_steps_per_second": 2.103, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.22159521024017617, | |
| "grad_norm": 0.8418029822067341, | |
| "learning_rate": 4.60837667751341e-06, | |
| "loss": 0.8924, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.2229715780056431, | |
| "grad_norm": 0.822305082880161, | |
| "learning_rate": 4.602238259887825e-06, | |
| "loss": 0.9395, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.22434794577111003, | |
| "grad_norm": 0.6380622331066487, | |
| "learning_rate": 4.596056262711434e-06, | |
| "loss": 0.9366, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.225724313536577, | |
| "grad_norm": 0.816860005626295, | |
| "learning_rate": 4.5898308141374835e-06, | |
| "loss": 0.9472, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.22710068130204392, | |
| "grad_norm": 0.6422659239711008, | |
| "learning_rate": 4.583562043219972e-06, | |
| "loss": 0.9558, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.22847704906751085, | |
| "grad_norm": 0.7575538911525173, | |
| "learning_rate": 4.577250079910973e-06, | |
| "loss": 0.933, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.22985341683297777, | |
| "grad_norm": 0.72595045850496, | |
| "learning_rate": 4.57089505505794e-06, | |
| "loss": 0.9754, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.2312297845984447, | |
| "grad_norm": 0.9692773154744749, | |
| "learning_rate": 4.564497100400998e-06, | |
| "loss": 0.9833, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.23260615236391163, | |
| "grad_norm": 0.8045647953099883, | |
| "learning_rate": 4.558056348570209e-06, | |
| "loss": 0.918, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.23398252012937856, | |
| "grad_norm": 0.8493707730777622, | |
| "learning_rate": 4.551572933082823e-06, | |
| "loss": 0.9389, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2353588878948455, | |
| "grad_norm": 0.7008145118663581, | |
| "learning_rate": 4.545046988340509e-06, | |
| "loss": 0.8909, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.23673525566031245, | |
| "grad_norm": 0.9052644910175894, | |
| "learning_rate": 4.538478649626575e-06, | |
| "loss": 0.9574, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.23811162342577938, | |
| "grad_norm": 0.785016020446548, | |
| "learning_rate": 4.531868053103153e-06, | |
| "loss": 1.0396, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.2394879911912463, | |
| "grad_norm": 0.8758044752350663, | |
| "learning_rate": 4.52521533580839e-06, | |
| "loss": 0.8471, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.24086435895671324, | |
| "grad_norm": 0.7552178534994997, | |
| "learning_rate": 4.518520635653594e-06, | |
| "loss": 0.973, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.24224072672218017, | |
| "grad_norm": 0.6477878269303148, | |
| "learning_rate": 4.5117840914203805e-06, | |
| "loss": 0.93, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.2436170944876471, | |
| "grad_norm": 0.7740483767304198, | |
| "learning_rate": 4.5050058427578e-06, | |
| "loss": 0.8919, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.24499346225311402, | |
| "grad_norm": 0.5175079894302275, | |
| "learning_rate": 4.498186030179434e-06, | |
| "loss": 0.9334, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.24636983001858095, | |
| "grad_norm": 0.7470864740423165, | |
| "learning_rate": 4.491324795060491e-06, | |
| "loss": 0.9059, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.2477461977840479, | |
| "grad_norm": 0.5912054028857261, | |
| "learning_rate": 4.4844222796348705e-06, | |
| "loss": 0.9406, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.24912256554951484, | |
| "grad_norm": 0.7632257634965951, | |
| "learning_rate": 4.477478626992214e-06, | |
| "loss": 0.9365, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.25049893331498174, | |
| "grad_norm": 0.938994254636935, | |
| "learning_rate": 4.47049398107494e-06, | |
| "loss": 0.8971, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.2518753010804487, | |
| "grad_norm": 0.5444477069055039, | |
| "learning_rate": 4.4634684866752665e-06, | |
| "loss": 0.9098, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.25325166884591566, | |
| "grad_norm": 0.9370323101958166, | |
| "learning_rate": 4.456402289432196e-06, | |
| "loss": 0.988, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.25462803661138256, | |
| "grad_norm": 0.656323325746701, | |
| "learning_rate": 4.44929553582851e-06, | |
| "loss": 0.9647, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.2560044043768495, | |
| "grad_norm": 0.8566812639937329, | |
| "learning_rate": 4.442148373187722e-06, | |
| "loss": 0.9587, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.2573807721423164, | |
| "grad_norm": 0.623784946912734, | |
| "learning_rate": 4.434960949671028e-06, | |
| "loss": 0.8996, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.2587571399077834, | |
| "grad_norm": 0.556551458145588, | |
| "learning_rate": 4.427733414274238e-06, | |
| "loss": 0.8582, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.2601335076732503, | |
| "grad_norm": 0.6165188495979379, | |
| "learning_rate": 4.420465916824681e-06, | |
| "loss": 0.9263, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.26150987543871723, | |
| "grad_norm": 0.5388586078056022, | |
| "learning_rate": 4.413158607978104e-06, | |
| "loss": 0.8803, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.26288624320418413, | |
| "grad_norm": 0.6313491960276545, | |
| "learning_rate": 4.405811639215547e-06, | |
| "loss": 0.9321, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.2642626109696511, | |
| "grad_norm": 0.7175326284846463, | |
| "learning_rate": 4.398425162840202e-06, | |
| "loss": 0.921, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.26563897873511805, | |
| "grad_norm": 0.7423069055487866, | |
| "learning_rate": 4.390999331974257e-06, | |
| "loss": 0.9461, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.26701534650058495, | |
| "grad_norm": 0.8331321575994248, | |
| "learning_rate": 4.383534300555722e-06, | |
| "loss": 0.962, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.2683917142660519, | |
| "grad_norm": 0.7921869343480575, | |
| "learning_rate": 4.376030223335237e-06, | |
| "loss": 0.8739, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.2697680820315188, | |
| "grad_norm": 0.8214762995889284, | |
| "learning_rate": 4.368487255872864e-06, | |
| "loss": 0.9187, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.27114444979698576, | |
| "grad_norm": 0.5484449386313469, | |
| "learning_rate": 4.360905554534864e-06, | |
| "loss": 0.8698, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.27252081756245267, | |
| "grad_norm": 0.7491545097860447, | |
| "learning_rate": 4.35328527649045e-06, | |
| "loss": 0.865, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.2738971853279196, | |
| "grad_norm": 0.7804679526519547, | |
| "learning_rate": 4.3456265797085375e-06, | |
| "loss": 0.9351, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.2752735530933866, | |
| "grad_norm": 0.5938120679599327, | |
| "learning_rate": 4.3379296229544635e-06, | |
| "loss": 0.8996, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2752735530933866, | |
| "eval_loss": 0.8887820243835449, | |
| "eval_runtime": 37.567, | |
| "eval_samples_per_second": 133.095, | |
| "eval_steps_per_second": 2.103, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2766499208588535, | |
| "grad_norm": 0.5199466760472583, | |
| "learning_rate": 4.330194565786696e-06, | |
| "loss": 0.9159, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.27802628862432044, | |
| "grad_norm": 0.5983605277568362, | |
| "learning_rate": 4.322421568553529e-06, | |
| "loss": 0.9187, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.27940265638978734, | |
| "grad_norm": 0.6616302612956438, | |
| "learning_rate": 4.314610792389757e-06, | |
| "loss": 0.958, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.2807790241552543, | |
| "grad_norm": 0.4539525897659009, | |
| "learning_rate": 4.30676239921333e-06, | |
| "loss": 0.8607, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.2821553919207212, | |
| "grad_norm": 0.665268425928804, | |
| "learning_rate": 4.298876551722007e-06, | |
| "loss": 0.8738, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.28353175968618816, | |
| "grad_norm": 0.5882893400505045, | |
| "learning_rate": 4.290953413389977e-06, | |
| "loss": 0.8947, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.28490812745165506, | |
| "grad_norm": 0.5736299385687077, | |
| "learning_rate": 4.282993148464467e-06, | |
| "loss": 0.9378, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.286284495217122, | |
| "grad_norm": 0.7145463884016067, | |
| "learning_rate": 4.2749959219623434e-06, | |
| "loss": 0.9029, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.28766086298258897, | |
| "grad_norm": 0.5441170060136393, | |
| "learning_rate": 4.266961899666689e-06, | |
| "loss": 0.9119, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.2890372307480559, | |
| "grad_norm": 0.7902074547549074, | |
| "learning_rate": 4.2588912481233666e-06, | |
| "loss": 0.9143, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.29041359851352283, | |
| "grad_norm": 0.6019716832259343, | |
| "learning_rate": 4.250784134637564e-06, | |
| "loss": 0.8692, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.29178996627898973, | |
| "grad_norm": 0.660623434550896, | |
| "learning_rate": 4.242640727270329e-06, | |
| "loss": 0.935, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.2931663340444567, | |
| "grad_norm": 0.5545068926773359, | |
| "learning_rate": 4.234461194835083e-06, | |
| "loss": 0.9124, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.2945427018099236, | |
| "grad_norm": 0.6474889256130907, | |
| "learning_rate": 4.2262457068941245e-06, | |
| "loss": 0.9003, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.29591906957539055, | |
| "grad_norm": 0.8379201054192673, | |
| "learning_rate": 4.217994433755112e-06, | |
| "loss": 0.8946, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.2972954373408575, | |
| "grad_norm": 0.5968271233441237, | |
| "learning_rate": 4.209707546467531e-06, | |
| "loss": 0.906, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.2986718051063244, | |
| "grad_norm": 0.5971995063991833, | |
| "learning_rate": 4.201385216819155e-06, | |
| "loss": 0.9148, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.30004817287179136, | |
| "grad_norm": 0.6868895981296808, | |
| "learning_rate": 4.193027617332476e-06, | |
| "loss": 0.8785, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.30142454063725826, | |
| "grad_norm": 0.5652277468935024, | |
| "learning_rate": 4.184634921261136e-06, | |
| "loss": 0.9108, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.3028009084027252, | |
| "grad_norm": 0.5880419121074434, | |
| "learning_rate": 4.176207302586329e-06, | |
| "loss": 0.8955, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3041772761681921, | |
| "grad_norm": 0.7043218507759722, | |
| "learning_rate": 4.1677449360132e-06, | |
| "loss": 0.9431, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.3055536439336591, | |
| "grad_norm": 0.615367412623154, | |
| "learning_rate": 4.159247996967216e-06, | |
| "loss": 0.9234, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.306930011699126, | |
| "grad_norm": 0.7427039646128769, | |
| "learning_rate": 4.150716661590538e-06, | |
| "loss": 0.8887, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.30830637946459294, | |
| "grad_norm": 0.7310077258792612, | |
| "learning_rate": 4.142151106738364e-06, | |
| "loss": 0.8959, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.3096827472300599, | |
| "grad_norm": 0.5946178494604679, | |
| "learning_rate": 4.133551509975264e-06, | |
| "loss": 0.8957, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.3110591149955268, | |
| "grad_norm": 0.4040536498559825, | |
| "learning_rate": 4.124918049571499e-06, | |
| "loss": 0.8815, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.31243548276099375, | |
| "grad_norm": 0.510452525326286, | |
| "learning_rate": 4.1162509044993264e-06, | |
| "loss": 0.8413, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.31381185052646066, | |
| "grad_norm": 1.095591544246511, | |
| "learning_rate": 4.107550254429289e-06, | |
| "loss": 0.8945, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.3151882182919276, | |
| "grad_norm": 0.6346832255515471, | |
| "learning_rate": 4.09881627972649e-06, | |
| "loss": 0.8879, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.3165645860573945, | |
| "grad_norm": 0.6680761432639554, | |
| "learning_rate": 4.090049161446855e-06, | |
| "loss": 0.9161, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.31794095382286147, | |
| "grad_norm": 0.696714218990553, | |
| "learning_rate": 4.081249081333381e-06, | |
| "loss": 0.9182, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.31931732158832843, | |
| "grad_norm": 1.0720208443645471, | |
| "learning_rate": 4.07241622181236e-06, | |
| "loss": 0.9112, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.32069368935379533, | |
| "grad_norm": 0.6189592447847784, | |
| "learning_rate": 4.063550765989609e-06, | |
| "loss": 0.9185, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.3220700571192623, | |
| "grad_norm": 0.7117706954107574, | |
| "learning_rate": 4.054652897646666e-06, | |
| "loss": 0.8858, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.3234464248847292, | |
| "grad_norm": 0.6362209727557948, | |
| "learning_rate": 4.0457228012369855e-06, | |
| "loss": 0.8753, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.32482279265019615, | |
| "grad_norm": 0.817603348475789, | |
| "learning_rate": 4.036760661882109e-06, | |
| "loss": 0.8376, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.32619916041566305, | |
| "grad_norm": 0.7216789765837882, | |
| "learning_rate": 4.027766665367833e-06, | |
| "loss": 0.9097, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.32757552818113, | |
| "grad_norm": 0.7604270143327692, | |
| "learning_rate": 4.0187409981403525e-06, | |
| "loss": 0.8924, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.3289518959465969, | |
| "grad_norm": 0.5264426392037226, | |
| "learning_rate": 4.009683847302401e-06, | |
| "loss": 0.8908, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.33032826371206386, | |
| "grad_norm": 0.6167807303641967, | |
| "learning_rate": 4.00059540060937e-06, | |
| "loss": 0.8682, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.33032826371206386, | |
| "eval_loss": 0.8648103475570679, | |
| "eval_runtime": 37.566, | |
| "eval_samples_per_second": 133.099, | |
| "eval_steps_per_second": 2.103, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3317046314775308, | |
| "grad_norm": 0.47612017363637654, | |
| "learning_rate": 3.991475846465415e-06, | |
| "loss": 0.8904, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.3330809992429977, | |
| "grad_norm": 0.724998856357164, | |
| "learning_rate": 3.982325373919549e-06, | |
| "loss": 0.9, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.3344573670084647, | |
| "grad_norm": 0.4223709120139991, | |
| "learning_rate": 3.973144172661731e-06, | |
| "loss": 0.8838, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.3358337347739316, | |
| "grad_norm": 0.5379099635161511, | |
| "learning_rate": 3.963932433018924e-06, | |
| "loss": 0.9138, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.33721010253939854, | |
| "grad_norm": 0.6682983804147051, | |
| "learning_rate": 3.954690345951156e-06, | |
| "loss": 0.8771, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.33858647030486544, | |
| "grad_norm": 0.7454287773417096, | |
| "learning_rate": 3.945418103047558e-06, | |
| "loss": 0.8805, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.3399628380703324, | |
| "grad_norm": 0.7063066134977648, | |
| "learning_rate": 3.936115896522395e-06, | |
| "loss": 0.8563, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.34133920583579935, | |
| "grad_norm": 0.6645576290102445, | |
| "learning_rate": 3.92678391921108e-06, | |
| "loss": 0.9031, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.34271557360126625, | |
| "grad_norm": 0.37691206208242306, | |
| "learning_rate": 3.917422364566175e-06, | |
| "loss": 0.8369, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.3440919413667332, | |
| "grad_norm": 0.41960300904843606, | |
| "learning_rate": 3.908031426653383e-06, | |
| "loss": 0.9235, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.3454683091322001, | |
| "grad_norm": 0.5867963483036928, | |
| "learning_rate": 3.898611300147525e-06, | |
| "loss": 0.8511, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.34684467689766707, | |
| "grad_norm": 0.7099329212172958, | |
| "learning_rate": 3.889162180328504e-06, | |
| "loss": 0.9318, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.34822104466313397, | |
| "grad_norm": 0.615124184951399, | |
| "learning_rate": 3.879684263077255e-06, | |
| "loss": 0.8774, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.34959741242860093, | |
| "grad_norm": 0.5769333758225725, | |
| "learning_rate": 3.870177744871686e-06, | |
| "loss": 0.8878, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.35097378019406783, | |
| "grad_norm": 0.7797713242735278, | |
| "learning_rate": 3.860642822782605e-06, | |
| "loss": 0.8559, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.3523501479595348, | |
| "grad_norm": 0.5476182116265297, | |
| "learning_rate": 3.851079694469636e-06, | |
| "loss": 0.8503, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.35372651572500174, | |
| "grad_norm": 0.5593250703560667, | |
| "learning_rate": 3.841488558177118e-06, | |
| "loss": 0.8666, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.35510288349046865, | |
| "grad_norm": 0.5174533433417139, | |
| "learning_rate": 3.831869612729999e-06, | |
| "loss": 0.88, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.3564792512559356, | |
| "grad_norm": 0.5259246199350045, | |
| "learning_rate": 3.822223057529712e-06, | |
| "loss": 0.8522, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.3578556190214025, | |
| "grad_norm": 0.9561871466566957, | |
| "learning_rate": 3.8125490925500426e-06, | |
| "loss": 0.8947, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.35923198678686946, | |
| "grad_norm": 0.5703094998753527, | |
| "learning_rate": 3.8028479183329816e-06, | |
| "loss": 0.8721, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.36060835455233636, | |
| "grad_norm": 0.7959291859516844, | |
| "learning_rate": 3.793119735984572e-06, | |
| "loss": 0.903, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.3619847223178033, | |
| "grad_norm": 0.6289787758596613, | |
| "learning_rate": 3.7833647471707345e-06, | |
| "loss": 0.8642, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.3633610900832703, | |
| "grad_norm": 0.5823311703497488, | |
| "learning_rate": 3.773583154113092e-06, | |
| "loss": 0.8812, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.3647374578487372, | |
| "grad_norm": 0.5458593624626218, | |
| "learning_rate": 3.7637751595847734e-06, | |
| "loss": 0.8848, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.36611382561420414, | |
| "grad_norm": 0.5422663450938033, | |
| "learning_rate": 3.7539409669062138e-06, | |
| "loss": 0.8546, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.36749019337967104, | |
| "grad_norm": 0.5470752416021339, | |
| "learning_rate": 3.744080779940937e-06, | |
| "loss": 0.8803, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.368866561145138, | |
| "grad_norm": 0.6784499815024887, | |
| "learning_rate": 3.7341948030913293e-06, | |
| "loss": 0.8431, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.3702429289106049, | |
| "grad_norm": 0.5173768738501396, | |
| "learning_rate": 3.7242832412944047e-06, | |
| "loss": 0.923, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.37161929667607185, | |
| "grad_norm": 0.5139718864020647, | |
| "learning_rate": 3.714346300017555e-06, | |
| "loss": 0.925, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.37299566444153875, | |
| "grad_norm": 0.5535987985827241, | |
| "learning_rate": 3.7043841852542884e-06, | |
| "loss": 0.816, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.3743720322070057, | |
| "grad_norm": 0.6220599394236812, | |
| "learning_rate": 3.6943971035199642e-06, | |
| "loss": 0.8975, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.37574839997247267, | |
| "grad_norm": 0.6424558337857292, | |
| "learning_rate": 3.684385261847506e-06, | |
| "loss": 0.8696, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.37712476773793957, | |
| "grad_norm": 0.6193220187952029, | |
| "learning_rate": 3.674348867783115e-06, | |
| "loss": 0.9187, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.3785011355034065, | |
| "grad_norm": 0.7948870412022867, | |
| "learning_rate": 3.6642881293819643e-06, | |
| "loss": 0.8794, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.37987750326887343, | |
| "grad_norm": 0.4277830946620168, | |
| "learning_rate": 3.654203255203886e-06, | |
| "loss": 0.8369, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.3812538710343404, | |
| "grad_norm": 0.5346386829006233, | |
| "learning_rate": 3.6440944543090505e-06, | |
| "loss": 0.8175, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.3826302387998073, | |
| "grad_norm": 0.5567956320600921, | |
| "learning_rate": 3.633961936253628e-06, | |
| "loss": 0.9047, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.38400660656527424, | |
| "grad_norm": 0.48101573175427903, | |
| "learning_rate": 3.623805911085452e-06, | |
| "loss": 0.8312, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.3853829743307412, | |
| "grad_norm": 0.5357232714738431, | |
| "learning_rate": 3.613626589339653e-06, | |
| "loss": 0.8757, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.3853829743307412, | |
| "eval_loss": 0.846756637096405, | |
| "eval_runtime": 37.5777, | |
| "eval_samples_per_second": 133.058, | |
| "eval_steps_per_second": 2.102, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.3867593420962081, | |
| "grad_norm": 0.4727877531161341, | |
| "learning_rate": 3.6034241820343086e-06, | |
| "loss": 0.8599, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.38813570986167506, | |
| "grad_norm": 0.4454834707158075, | |
| "learning_rate": 3.5931989006660567e-06, | |
| "loss": 0.9158, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.38951207762714196, | |
| "grad_norm": 0.6273941979012451, | |
| "learning_rate": 3.582950957205718e-06, | |
| "loss": 0.8325, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.3908884453926089, | |
| "grad_norm": 0.4418868761207735, | |
| "learning_rate": 3.5726805640939e-06, | |
| "loss": 0.8455, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.3922648131580758, | |
| "grad_norm": 0.459223577643327, | |
| "learning_rate": 3.562387934236593e-06, | |
| "loss": 0.8554, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.3936411809235428, | |
| "grad_norm": 0.6308918701832376, | |
| "learning_rate": 3.552073281000757e-06, | |
| "loss": 0.905, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.3950175486890097, | |
| "grad_norm": 0.5375942052449636, | |
| "learning_rate": 3.541736818209897e-06, | |
| "loss": 0.8989, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.39639391645447664, | |
| "grad_norm": 0.45898548069155615, | |
| "learning_rate": 3.5313787601396328e-06, | |
| "loss": 0.8568, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.3977702842199436, | |
| "grad_norm": 0.6621205621192929, | |
| "learning_rate": 3.5209993215132556e-06, | |
| "loss": 0.8988, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.3991466519854105, | |
| "grad_norm": 0.5810221477718872, | |
| "learning_rate": 3.510598717497276e-06, | |
| "loss": 0.8574, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.40052301975087745, | |
| "grad_norm": 0.49897461838543894, | |
| "learning_rate": 3.5001771636969677e-06, | |
| "loss": 0.8677, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.40189938751634435, | |
| "grad_norm": 0.343456686019272, | |
| "learning_rate": 3.4897348761518913e-06, | |
| "loss": 0.8568, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.4032757552818113, | |
| "grad_norm": 0.5890854774221135, | |
| "learning_rate": 3.4792720713314223e-06, | |
| "loss": 0.8084, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.4046521230472782, | |
| "grad_norm": 0.5050249705187615, | |
| "learning_rate": 3.4687889661302577e-06, | |
| "loss": 0.822, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.40602849081274517, | |
| "grad_norm": 0.6261040635433304, | |
| "learning_rate": 3.458285777863926e-06, | |
| "loss": 0.8983, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.4074048585782121, | |
| "grad_norm": 0.6179599241399432, | |
| "learning_rate": 3.4477627242642782e-06, | |
| "loss": 0.8186, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.408781226343679, | |
| "grad_norm": 0.4118673045686313, | |
| "learning_rate": 3.4372200234749735e-06, | |
| "loss": 0.8005, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.410157594109146, | |
| "grad_norm": 0.5523708195732413, | |
| "learning_rate": 3.4266578940469605e-06, | |
| "loss": 0.8231, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.4115339618746129, | |
| "grad_norm": 0.6262009345044368, | |
| "learning_rate": 3.416076554933944e-06, | |
| "loss": 0.8134, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.41291032964007984, | |
| "grad_norm": 0.5855645975272682, | |
| "learning_rate": 3.4054762254878477e-06, | |
| "loss": 0.8583, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.41428669740554674, | |
| "grad_norm": 0.5148476566821822, | |
| "learning_rate": 3.394857125454267e-06, | |
| "loss": 0.8362, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.4156630651710137, | |
| "grad_norm": 0.5372672904747088, | |
| "learning_rate": 3.3842194749679086e-06, | |
| "loss": 0.8381, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.4170394329364806, | |
| "grad_norm": 0.6533529330850166, | |
| "learning_rate": 3.373563494548037e-06, | |
| "loss": 0.8884, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.41841580070194756, | |
| "grad_norm": 0.7855188602038081, | |
| "learning_rate": 3.3628894050938945e-06, | |
| "loss": 0.8554, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.4197921684674145, | |
| "grad_norm": 0.7164029448250899, | |
| "learning_rate": 3.352197427880126e-06, | |
| "loss": 0.8902, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.4211685362328814, | |
| "grad_norm": 0.5654733251747366, | |
| "learning_rate": 3.3414877845521904e-06, | |
| "loss": 0.8858, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.4225449039983484, | |
| "grad_norm": 0.7161901522114019, | |
| "learning_rate": 3.3307606971217665e-06, | |
| "loss": 0.8793, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.4239212717638153, | |
| "grad_norm": 0.37921076195825976, | |
| "learning_rate": 3.320016387962151e-06, | |
| "loss": 0.8133, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.42529763952928223, | |
| "grad_norm": 0.57460942583143, | |
| "learning_rate": 3.309255079803647e-06, | |
| "loss": 0.8308, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.42667400729474914, | |
| "grad_norm": 0.6237565822649527, | |
| "learning_rate": 3.29847699572895e-06, | |
| "loss": 0.9122, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4280503750602161, | |
| "grad_norm": 0.6604400360810607, | |
| "learning_rate": 3.2876823591685214e-06, | |
| "loss": 0.7869, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.42942674282568305, | |
| "grad_norm": 0.7964686696906034, | |
| "learning_rate": 3.276871393895954e-06, | |
| "loss": 0.8302, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.43080311059114995, | |
| "grad_norm": 0.7353573467891227, | |
| "learning_rate": 3.2660443240233387e-06, | |
| "loss": 0.8878, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.4321794783566169, | |
| "grad_norm": 0.45353882448336647, | |
| "learning_rate": 3.2552013739966147e-06, | |
| "loss": 0.8555, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.4335558461220838, | |
| "grad_norm": 0.600983398689409, | |
| "learning_rate": 3.24434276859092e-06, | |
| "loss": 0.8003, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.43493221388755077, | |
| "grad_norm": 0.33468339832513494, | |
| "learning_rate": 3.233468732905927e-06, | |
| "loss": 0.7919, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.43630858165301767, | |
| "grad_norm": 0.664686119618901, | |
| "learning_rate": 3.222579492361179e-06, | |
| "loss": 0.8585, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.4376849494184846, | |
| "grad_norm": 0.580736295078617, | |
| "learning_rate": 3.21167527269142e-06, | |
| "loss": 0.8537, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.4390613171839515, | |
| "grad_norm": 0.4795429342164793, | |
| "learning_rate": 3.2007562999419094e-06, | |
| "loss": 0.8691, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.4404376849494185, | |
| "grad_norm": 0.5533419912378857, | |
| "learning_rate": 3.189822800463742e-06, | |
| "loss": 0.8441, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.4404376849494185, | |
| "eval_loss": 0.8311466574668884, | |
| "eval_runtime": 37.5777, | |
| "eval_samples_per_second": 133.058, | |
| "eval_steps_per_second": 2.102, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.44181405271488544, | |
| "grad_norm": 0.44901727418791887, | |
| "learning_rate": 3.1788750009091473e-06, | |
| "loss": 0.8785, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.44319042048035234, | |
| "grad_norm": 0.7054761131009049, | |
| "learning_rate": 3.167913128226803e-06, | |
| "loss": 0.8442, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.4445667882458193, | |
| "grad_norm": 0.6338208711460487, | |
| "learning_rate": 3.156937409657119e-06, | |
| "loss": 0.8968, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.4459431560112862, | |
| "grad_norm": 0.6103638077202322, | |
| "learning_rate": 3.145948072727535e-06, | |
| "loss": 0.8823, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.44731952377675316, | |
| "grad_norm": 0.5814180130346099, | |
| "learning_rate": 3.134945345247797e-06, | |
| "loss": 0.8224, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.44869589154222006, | |
| "grad_norm": 0.6704683844032799, | |
| "learning_rate": 3.123929455305239e-06, | |
| "loss": 0.8797, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.450072259307687, | |
| "grad_norm": 0.4798892663438211, | |
| "learning_rate": 3.1129006312600558e-06, | |
| "loss": 0.8386, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.451448627073154, | |
| "grad_norm": 0.5497411331632864, | |
| "learning_rate": 3.101859101740565e-06, | |
| "loss": 0.858, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.4528249948386209, | |
| "grad_norm": 0.5856897243913328, | |
| "learning_rate": 3.09080509563847e-06, | |
| "loss": 0.8904, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.45420136260408783, | |
| "grad_norm": 0.47574657706442885, | |
| "learning_rate": 3.079738842104115e-06, | |
| "loss": 0.831, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.45557773036955473, | |
| "grad_norm": 0.7356917244396622, | |
| "learning_rate": 3.0686605705417337e-06, | |
| "loss": 0.8638, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.4569540981350217, | |
| "grad_norm": 0.4609027088459466, | |
| "learning_rate": 3.057570510604696e-06, | |
| "loss": 0.8342, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.4583304659004886, | |
| "grad_norm": 0.649733338307217, | |
| "learning_rate": 3.0464688921907436e-06, | |
| "loss": 0.844, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.45970683366595555, | |
| "grad_norm": 0.6452557057054518, | |
| "learning_rate": 3.035355945437228e-06, | |
| "loss": 0.901, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.46108320143142245, | |
| "grad_norm": 0.48053889170013253, | |
| "learning_rate": 3.0242319007163373e-06, | |
| "loss": 0.8237, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.4624595691968894, | |
| "grad_norm": 0.5156070687409189, | |
| "learning_rate": 3.01309698863032e-06, | |
| "loss": 0.7832, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.46383593696235637, | |
| "grad_norm": 0.6487171872307684, | |
| "learning_rate": 3.001951440006708e-06, | |
| "loss": 0.8302, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.46521230472782327, | |
| "grad_norm": 0.4088236693992105, | |
| "learning_rate": 2.9907954858935277e-06, | |
| "loss": 0.7978, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.4665886724932902, | |
| "grad_norm": 0.6166137125554756, | |
| "learning_rate": 2.9796293575545143e-06, | |
| "loss": 0.8327, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.4679650402587571, | |
| "grad_norm": 0.5324800896120733, | |
| "learning_rate": 2.9684532864643123e-06, | |
| "loss": 0.8497, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.4693414080242241, | |
| "grad_norm": 0.5330587838888724, | |
| "learning_rate": 2.957267504303682e-06, | |
| "loss": 0.8318, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.470717775789691, | |
| "grad_norm": 0.3066813832965278, | |
| "learning_rate": 2.946072242954695e-06, | |
| "loss": 0.7959, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.47209414355515794, | |
| "grad_norm": 0.4174047766855664, | |
| "learning_rate": 2.934867734495927e-06, | |
| "loss": 0.8157, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.4734705113206249, | |
| "grad_norm": 0.5513234180318742, | |
| "learning_rate": 2.9236542111976468e-06, | |
| "loss": 0.8657, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.4748468790860918, | |
| "grad_norm": 0.36186756893473565, | |
| "learning_rate": 2.9124319055170012e-06, | |
| "loss": 0.8108, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.47622324685155876, | |
| "grad_norm": 0.48483961573426937, | |
| "learning_rate": 2.9012010500931966e-06, | |
| "loss": 0.8532, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.47759961461702566, | |
| "grad_norm": 0.656595154954433, | |
| "learning_rate": 2.8899618777426763e-06, | |
| "loss": 0.8186, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.4789759823824926, | |
| "grad_norm": 0.45682292991009654, | |
| "learning_rate": 2.878714621454294e-06, | |
| "loss": 0.8507, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.4803523501479595, | |
| "grad_norm": 0.4456324138614082, | |
| "learning_rate": 2.867459514384485e-06, | |
| "loss": 0.8809, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.4817287179134265, | |
| "grad_norm": 0.5388546054688201, | |
| "learning_rate": 2.856196789852429e-06, | |
| "loss": 0.8236, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.4831050856788934, | |
| "grad_norm": 0.660462229832185, | |
| "learning_rate": 2.84492668133522e-06, | |
| "loss": 0.8338, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.48448145344436033, | |
| "grad_norm": 0.5278146557932175, | |
| "learning_rate": 2.833649422463019e-06, | |
| "loss": 0.814, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.4858578212098273, | |
| "grad_norm": 0.5470942669154103, | |
| "learning_rate": 2.8223652470142184e-06, | |
| "loss": 0.8183, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.4872341889752942, | |
| "grad_norm": 0.505713995433775, | |
| "learning_rate": 2.8110743889105874e-06, | |
| "loss": 0.8387, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.48861055674076115, | |
| "grad_norm": 0.4875452733503727, | |
| "learning_rate": 2.79977708221243e-06, | |
| "loss": 0.7981, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.48998692450622805, | |
| "grad_norm": 0.43916802590802495, | |
| "learning_rate": 2.7884735611137288e-06, | |
| "loss": 0.8532, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.491363292271695, | |
| "grad_norm": 0.33369528684284744, | |
| "learning_rate": 2.777164059937292e-06, | |
| "loss": 0.8408, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.4927396600371619, | |
| "grad_norm": 0.7202736685154576, | |
| "learning_rate": 2.765848813129895e-06, | |
| "loss": 0.8532, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.49411602780262887, | |
| "grad_norm": 0.42511937449619386, | |
| "learning_rate": 2.7545280552574204e-06, | |
| "loss": 0.8224, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.4954923955680958, | |
| "grad_norm": 0.545135520242412, | |
| "learning_rate": 2.7432020209999956e-06, | |
| "loss": 0.8197, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.4954923955680958, | |
| "eval_loss": 0.8205735087394714, | |
| "eval_runtime": 37.5781, | |
| "eval_samples_per_second": 133.056, | |
| "eval_steps_per_second": 2.102, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.4968687633335627, | |
| "grad_norm": 0.48066576255326976, | |
| "learning_rate": 2.7318709451471288e-06, | |
| "loss": 0.8239, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.4982451310990297, | |
| "grad_norm": 0.7847734357041681, | |
| "learning_rate": 2.7205350625928383e-06, | |
| "loss": 0.9108, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.4996214988644966, | |
| "grad_norm": 0.7016473795626382, | |
| "learning_rate": 2.70919460833079e-06, | |
| "loss": 0.8367, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.5009978666299635, | |
| "grad_norm": 0.6178351624669732, | |
| "learning_rate": 2.697849817449415e-06, | |
| "loss": 0.8282, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.5023742343954305, | |
| "grad_norm": 0.4091504823861622, | |
| "learning_rate": 2.6865009251270506e-06, | |
| "loss": 0.8526, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.5037506021608974, | |
| "grad_norm": 0.6183413041352451, | |
| "learning_rate": 2.6751481666270513e-06, | |
| "loss": 0.8473, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.5051269699263643, | |
| "grad_norm": 0.606173243841225, | |
| "learning_rate": 2.6637917772929213e-06, | |
| "loss": 0.8567, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.5065033376918313, | |
| "grad_norm": 0.6132486268789019, | |
| "learning_rate": 2.65243199254343e-06, | |
| "loss": 0.8325, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.5078797054572982, | |
| "grad_norm": 0.4854570121858536, | |
| "learning_rate": 2.6410690478677353e-06, | |
| "loss": 0.7892, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.5092560732227651, | |
| "grad_norm": 0.37412561464355937, | |
| "learning_rate": 2.6297031788205004e-06, | |
| "loss": 0.8094, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.510632440988232, | |
| "grad_norm": 0.6203401035441314, | |
| "learning_rate": 2.618334621017009e-06, | |
| "loss": 0.822, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.512008808753699, | |
| "grad_norm": 0.653767694616689, | |
| "learning_rate": 2.6069636101282862e-06, | |
| "loss": 0.8367, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.5133851765191659, | |
| "grad_norm": 0.6245289959361092, | |
| "learning_rate": 2.595590381876209e-06, | |
| "loss": 0.8328, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.5147615442846328, | |
| "grad_norm": 0.4457307365521416, | |
| "learning_rate": 2.584215172028618e-06, | |
| "loss": 0.8312, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.5161379120500997, | |
| "grad_norm": 0.4815075541618031, | |
| "learning_rate": 2.572838216394434e-06, | |
| "loss": 0.8686, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.5175142798155667, | |
| "grad_norm": 0.5702137712078795, | |
| "learning_rate": 2.561459750818769e-06, | |
| "loss": 0.8347, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.5188906475810336, | |
| "grad_norm": 0.48523555192405865, | |
| "learning_rate": 2.5500800111780357e-06, | |
| "loss": 0.8036, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.5202670153465005, | |
| "grad_norm": 0.3430136628045643, | |
| "learning_rate": 2.5386992333750565e-06, | |
| "loss": 0.8291, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.5216433831119676, | |
| "grad_norm": 0.470576052419782, | |
| "learning_rate": 2.5273176533341777e-06, | |
| "loss": 0.77, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.5230197508774345, | |
| "grad_norm": 0.4479209182378322, | |
| "learning_rate": 2.5159355069963744e-06, | |
| "loss": 0.8091, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5243961186429014, | |
| "grad_norm": 0.5763895685381267, | |
| "learning_rate": 2.5045530303143604e-06, | |
| "loss": 0.863, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.5257724864083683, | |
| "grad_norm": 0.6080627036981748, | |
| "learning_rate": 2.4931704592477e-06, | |
| "loss": 0.8713, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.5271488541738353, | |
| "grad_norm": 0.5799585453878151, | |
| "learning_rate": 2.4817880297579134e-06, | |
| "loss": 0.7895, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.5285252219393022, | |
| "grad_norm": 0.6516081569612333, | |
| "learning_rate": 2.4704059778035823e-06, | |
| "loss": 0.8062, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.5299015897047691, | |
| "grad_norm": 0.5284447836149975, | |
| "learning_rate": 2.459024539335467e-06, | |
| "loss": 0.8549, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.5312779574702361, | |
| "grad_norm": 0.5029144016269022, | |
| "learning_rate": 2.447643950291608e-06, | |
| "loss": 0.8279, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.532654325235703, | |
| "grad_norm": 0.6658834516119427, | |
| "learning_rate": 2.4362644465924367e-06, | |
| "loss": 0.8335, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.5340306930011699, | |
| "grad_norm": 0.5953435916222078, | |
| "learning_rate": 2.4248862641358865e-06, | |
| "loss": 0.7918, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.5354070607666368, | |
| "grad_norm": 0.37670098774520216, | |
| "learning_rate": 2.4135096387925e-06, | |
| "loss": 0.8638, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.5367834285321038, | |
| "grad_norm": 0.511664128571869, | |
| "learning_rate": 2.4021348064005417e-06, | |
| "loss": 0.8377, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5381597962975707, | |
| "grad_norm": 0.42161706759354106, | |
| "learning_rate": 2.3907620027611083e-06, | |
| "loss": 0.83, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.5395361640630376, | |
| "grad_norm": 0.3040178118533278, | |
| "learning_rate": 2.3793914636332394e-06, | |
| "loss": 0.8746, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.5409125318285046, | |
| "grad_norm": 0.689149159117422, | |
| "learning_rate": 2.3680234247290305e-06, | |
| "loss": 0.8247, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.5422888995939715, | |
| "grad_norm": 0.44356157035761207, | |
| "learning_rate": 2.3566581217087496e-06, | |
| "loss": 0.8277, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.5436652673594384, | |
| "grad_norm": 0.5918309406851234, | |
| "learning_rate": 2.3452957901759486e-06, | |
| "loss": 0.8025, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.5450416351249053, | |
| "grad_norm": 0.4997809391017281, | |
| "learning_rate": 2.333936665672579e-06, | |
| "loss": 0.835, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.5464180028903723, | |
| "grad_norm": 0.49766628433257887, | |
| "learning_rate": 2.3225809836741118e-06, | |
| "loss": 0.7756, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.5477943706558392, | |
| "grad_norm": 0.40407922865092843, | |
| "learning_rate": 2.3112289795846537e-06, | |
| "loss": 0.7967, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.5491707384213061, | |
| "grad_norm": 0.6308739524640976, | |
| "learning_rate": 2.2998808887320697e-06, | |
| "loss": 0.781, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.5505471061867732, | |
| "grad_norm": 0.5743558152124677, | |
| "learning_rate": 2.2885369463631003e-06, | |
| "loss": 0.7807, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5505471061867732, | |
| "eval_loss": 0.8089554905891418, | |
| "eval_runtime": 37.5623, | |
| "eval_samples_per_second": 133.112, | |
| "eval_steps_per_second": 2.103, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5519234739522401, | |
| "grad_norm": 0.5814715096200206, | |
| "learning_rate": 2.277197387638491e-06, | |
| "loss": 0.8329, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.553299841717707, | |
| "grad_norm": 0.5479788170588639, | |
| "learning_rate": 2.265862447628111e-06, | |
| "loss": 0.8742, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.5546762094831739, | |
| "grad_norm": 0.3062291452983948, | |
| "learning_rate": 2.254532361306085e-06, | |
| "loss": 0.7671, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.5560525772486409, | |
| "grad_norm": 0.5763488126780687, | |
| "learning_rate": 2.2432073635459196e-06, | |
| "loss": 0.8437, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.5574289450141078, | |
| "grad_norm": 0.668939310724625, | |
| "learning_rate": 2.2318876891156356e-06, | |
| "loss": 0.8973, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.5588053127795747, | |
| "grad_norm": 0.37776371512529605, | |
| "learning_rate": 2.2205735726729023e-06, | |
| "loss": 0.8345, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.5601816805450416, | |
| "grad_norm": 0.41280705089503705, | |
| "learning_rate": 2.2092652487601675e-06, | |
| "loss": 0.8323, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.5615580483105086, | |
| "grad_norm": 0.4632509161576438, | |
| "learning_rate": 2.1979629517998027e-06, | |
| "loss": 0.8282, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.5629344160759755, | |
| "grad_norm": 0.7200713467674419, | |
| "learning_rate": 2.186666916089239e-06, | |
| "loss": 0.8379, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.5643107838414424, | |
| "grad_norm": 0.5178576662970729, | |
| "learning_rate": 2.1753773757961137e-06, | |
| "loss": 0.8261, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.5656871516069094, | |
| "grad_norm": 0.5610321336212082, | |
| "learning_rate": 2.1640945649534096e-06, | |
| "loss": 0.8309, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.5670635193723763, | |
| "grad_norm": 0.5892891551665735, | |
| "learning_rate": 2.1528187174546093e-06, | |
| "loss": 0.8297, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.5684398871378432, | |
| "grad_norm": 0.4486760776076786, | |
| "learning_rate": 2.141550067048846e-06, | |
| "loss": 0.8389, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.5698162549033101, | |
| "grad_norm": 0.7907237469372685, | |
| "learning_rate": 2.1302888473360566e-06, | |
| "loss": 0.8321, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.5711926226687771, | |
| "grad_norm": 0.566706200199042, | |
| "learning_rate": 2.119035291762136e-06, | |
| "loss": 0.8212, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.572568990434244, | |
| "grad_norm": 0.5263130247632009, | |
| "learning_rate": 2.1077896336141043e-06, | |
| "loss": 0.8042, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.5739453581997109, | |
| "grad_norm": 0.5308066511740118, | |
| "learning_rate": 2.096552106015266e-06, | |
| "loss": 0.815, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.5753217259651779, | |
| "grad_norm": 0.6957541746253699, | |
| "learning_rate": 2.0853229419203808e-06, | |
| "loss": 0.8261, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.5766980937306448, | |
| "grad_norm": 0.5966904887807215, | |
| "learning_rate": 2.0741023741108276e-06, | |
| "loss": 0.827, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.5780744614961117, | |
| "grad_norm": 0.4539597770375397, | |
| "learning_rate": 2.0628906351897885e-06, | |
| "loss": 0.8182, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.5794508292615786, | |
| "grad_norm": 0.6834026151474581, | |
| "learning_rate": 2.0516879575774203e-06, | |
| "loss": 0.8303, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.5808271970270457, | |
| "grad_norm": 0.5239200524956313, | |
| "learning_rate": 2.040494573506038e-06, | |
| "loss": 0.7741, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.5822035647925126, | |
| "grad_norm": 0.45753694417604984, | |
| "learning_rate": 2.0293107150153006e-06, | |
| "loss": 0.8397, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.5835799325579795, | |
| "grad_norm": 0.46224504028065294, | |
| "learning_rate": 2.018136613947401e-06, | |
| "loss": 0.8244, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.5849563003234465, | |
| "grad_norm": 0.4341362951474634, | |
| "learning_rate": 2.0069725019422624e-06, | |
| "loss": 0.8009, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.5863326680889134, | |
| "grad_norm": 0.4900348861826574, | |
| "learning_rate": 1.9958186104327317e-06, | |
| "loss": 0.8483, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.5877090358543803, | |
| "grad_norm": 0.6257220799254032, | |
| "learning_rate": 1.9846751706397832e-06, | |
| "loss": 0.8405, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.5890854036198472, | |
| "grad_norm": 0.522323526016304, | |
| "learning_rate": 1.9735424135677283e-06, | |
| "loss": 0.8322, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.5904617713853142, | |
| "grad_norm": 0.6328347166830222, | |
| "learning_rate": 1.9624205699994256e-06, | |
| "loss": 0.8607, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.5918381391507811, | |
| "grad_norm": 0.52757076025723, | |
| "learning_rate": 1.951309870491494e-06, | |
| "loss": 0.8003, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.593214506916248, | |
| "grad_norm": 0.4530698938004871, | |
| "learning_rate": 1.9402105453695356e-06, | |
| "loss": 0.843, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.594590874681715, | |
| "grad_norm": 0.5275287498766006, | |
| "learning_rate": 1.9291228247233607e-06, | |
| "loss": 0.7959, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.5959672424471819, | |
| "grad_norm": 0.28183154522813586, | |
| "learning_rate": 1.9180469384022203e-06, | |
| "loss": 0.7799, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.5973436102126488, | |
| "grad_norm": 0.5336877667445091, | |
| "learning_rate": 1.9069831160100338e-06, | |
| "loss": 0.7979, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.5987199779781157, | |
| "grad_norm": 0.6532051021851727, | |
| "learning_rate": 1.8959315869006405e-06, | |
| "loss": 0.8359, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.6000963457435827, | |
| "grad_norm": 0.5307514597676389, | |
| "learning_rate": 1.8848925801730344e-06, | |
| "loss": 0.8937, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.6014727135090496, | |
| "grad_norm": 0.4042895440437914, | |
| "learning_rate": 1.8738663246666234e-06, | |
| "loss": 0.8252, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.6028490812745165, | |
| "grad_norm": 0.44894176165695887, | |
| "learning_rate": 1.8628530489564771e-06, | |
| "loss": 0.7835, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.6042254490399834, | |
| "grad_norm": 0.4991539092087521, | |
| "learning_rate": 1.8518529813485973e-06, | |
| "loss": 0.814, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.6056018168054504, | |
| "grad_norm": 0.3709613327877024, | |
| "learning_rate": 1.8408663498751788e-06, | |
| "loss": 0.7757, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6056018168054504, | |
| "eval_loss": 0.8015441298484802, | |
| "eval_runtime": 37.566, | |
| "eval_samples_per_second": 133.099, | |
| "eval_steps_per_second": 2.103, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6069781845709173, | |
| "grad_norm": 0.4683158255043096, | |
| "learning_rate": 1.829893382289886e-06, | |
| "loss": 0.8097, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.6083545523363842, | |
| "grad_norm": 0.5627576336667334, | |
| "learning_rate": 1.818934306063126e-06, | |
| "loss": 0.806, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.6097309201018513, | |
| "grad_norm": 0.5488618788671827, | |
| "learning_rate": 1.8079893483773413e-06, | |
| "loss": 0.8185, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.6111072878673182, | |
| "grad_norm": 0.44918917109293605, | |
| "learning_rate": 1.7970587361222946e-06, | |
| "loss": 0.8271, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.6124836556327851, | |
| "grad_norm": 0.32979924519766196, | |
| "learning_rate": 1.786142695890367e-06, | |
| "loss": 0.7828, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.613860023398252, | |
| "grad_norm": 0.35131817470857274, | |
| "learning_rate": 1.7752414539718582e-06, | |
| "loss": 0.8191, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.615236391163719, | |
| "grad_norm": 0.4656658318567486, | |
| "learning_rate": 1.7643552363503009e-06, | |
| "loss": 0.8358, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.6166127589291859, | |
| "grad_norm": 0.570964532177242, | |
| "learning_rate": 1.7534842686977721e-06, | |
| "loss": 0.8596, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.6179891266946528, | |
| "grad_norm": 0.34212198519333364, | |
| "learning_rate": 1.742628776370216e-06, | |
| "loss": 0.818, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.6193654944601198, | |
| "grad_norm": 0.48535112289382754, | |
| "learning_rate": 1.7317889844027707e-06, | |
| "loss": 0.8623, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6207418622255867, | |
| "grad_norm": 0.49166416763933013, | |
| "learning_rate": 1.7209651175051056e-06, | |
| "loss": 0.8468, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.6221182299910536, | |
| "grad_norm": 0.4583562391030405, | |
| "learning_rate": 1.7101574000567633e-06, | |
| "loss": 0.822, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.6234945977565205, | |
| "grad_norm": 0.46794730423054154, | |
| "learning_rate": 1.6993660561025072e-06, | |
| "loss": 0.8562, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.6248709655219875, | |
| "grad_norm": 0.6099197600468227, | |
| "learning_rate": 1.6885913093476741e-06, | |
| "loss": 0.8078, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.6262473332874544, | |
| "grad_norm": 0.5249915623303978, | |
| "learning_rate": 1.677833383153542e-06, | |
| "loss": 0.8219, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.6276237010529213, | |
| "grad_norm": 0.4377376254057894, | |
| "learning_rate": 1.6670925005326977e-06, | |
| "loss": 0.8179, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.6290000688183883, | |
| "grad_norm": 0.42122297813103243, | |
| "learning_rate": 1.6563688841444137e-06, | |
| "loss": 0.8418, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 0.6303764365838552, | |
| "grad_norm": 0.5683008577226283, | |
| "learning_rate": 1.6456627562900296e-06, | |
| "loss": 0.7891, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.6317528043493221, | |
| "grad_norm": 0.6037662603976885, | |
| "learning_rate": 1.63497433890835e-06, | |
| "loss": 0.832, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 0.633129172114789, | |
| "grad_norm": 0.5247870050565184, | |
| "learning_rate": 1.6243038535710365e-06, | |
| "loss": 0.8076, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.634505539880256, | |
| "grad_norm": 0.41197503482402553, | |
| "learning_rate": 1.6136515214780227e-06, | |
| "loss": 0.7596, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 0.6358819076457229, | |
| "grad_norm": 0.4791099245291028, | |
| "learning_rate": 1.603017563452919e-06, | |
| "loss": 0.8107, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.6372582754111898, | |
| "grad_norm": 0.43592237438522047, | |
| "learning_rate": 1.592402199938443e-06, | |
| "loss": 0.8185, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 0.6386346431766569, | |
| "grad_norm": 0.5940623980499528, | |
| "learning_rate": 1.5818056509918478e-06, | |
| "loss": 0.8004, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.6400110109421238, | |
| "grad_norm": 0.5250401511659805, | |
| "learning_rate": 1.5712281362803561e-06, | |
| "loss": 0.802, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.6413873787075907, | |
| "grad_norm": 0.5915893010847437, | |
| "learning_rate": 1.5606698750766108e-06, | |
| "loss": 0.8642, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.6427637464730576, | |
| "grad_norm": 0.46778493168617225, | |
| "learning_rate": 1.550131086254129e-06, | |
| "loss": 0.8092, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 0.6441401142385246, | |
| "grad_norm": 0.491189650305899, | |
| "learning_rate": 1.5396119882827651e-06, | |
| "loss": 0.8026, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.6455164820039915, | |
| "grad_norm": 0.43582541174722744, | |
| "learning_rate": 1.5291127992241766e-06, | |
| "loss": 0.8141, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 0.6468928497694584, | |
| "grad_norm": 0.6142246346723809, | |
| "learning_rate": 1.5186337367273105e-06, | |
| "loss": 0.8008, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6482692175349253, | |
| "grad_norm": 0.35462943795501145, | |
| "learning_rate": 1.5081750180238891e-06, | |
| "loss": 0.7667, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 0.6496455853003923, | |
| "grad_norm": 0.41180209198737644, | |
| "learning_rate": 1.4977368599239061e-06, | |
| "loss": 0.8028, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.6510219530658592, | |
| "grad_norm": 0.7189067391117583, | |
| "learning_rate": 1.487319478811131e-06, | |
| "loss": 0.8339, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 0.6523983208313261, | |
| "grad_norm": 0.3806482298690338, | |
| "learning_rate": 1.4769230906386272e-06, | |
| "loss": 0.8151, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.6537746885967931, | |
| "grad_norm": 0.4983576378974423, | |
| "learning_rate": 1.4665479109242696e-06, | |
| "loss": 0.7939, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.65515105636226, | |
| "grad_norm": 0.33177024863231713, | |
| "learning_rate": 1.4561941547462855e-06, | |
| "loss": 0.8009, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.6565274241277269, | |
| "grad_norm": 0.538297674107413, | |
| "learning_rate": 1.4458620367387838e-06, | |
| "loss": 0.8025, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 0.6579037918931938, | |
| "grad_norm": 0.4901494849475784, | |
| "learning_rate": 1.4355517710873184e-06, | |
| "loss": 0.7845, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.6592801596586608, | |
| "grad_norm": 0.5486200463100078, | |
| "learning_rate": 1.4252635715244394e-06, | |
| "loss": 0.8208, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 0.6606565274241277, | |
| "grad_norm": 0.4897253532401449, | |
| "learning_rate": 1.4149976513252677e-06, | |
| "loss": 0.7818, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.6606565274241277, | |
| "eval_loss": 0.7957330942153931, | |
| "eval_runtime": 37.5749, | |
| "eval_samples_per_second": 133.068, | |
| "eval_steps_per_second": 2.102, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.6620328951895946, | |
| "grad_norm": 0.46111910295697045, | |
| "learning_rate": 1.4047542233030683e-06, | |
| "loss": 0.8258, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 0.6634092629550616, | |
| "grad_norm": 0.5784849194355406, | |
| "learning_rate": 1.3945334998048425e-06, | |
| "loss": 0.8157, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.6647856307205285, | |
| "grad_norm": 0.5455404395159691, | |
| "learning_rate": 1.3843356927069266e-06, | |
| "loss": 0.8155, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 0.6661619984859954, | |
| "grad_norm": 0.4060432121547348, | |
| "learning_rate": 1.3741610134105984e-06, | |
| "loss": 0.7862, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.6675383662514623, | |
| "grad_norm": 0.5175461237539263, | |
| "learning_rate": 1.3640096728376922e-06, | |
| "loss": 0.796, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.6689147340169294, | |
| "grad_norm": 0.5579421958353933, | |
| "learning_rate": 1.353881881426231e-06, | |
| "loss": 0.8159, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.6702911017823963, | |
| "grad_norm": 0.3938550567000424, | |
| "learning_rate": 1.3437778491260626e-06, | |
| "loss": 0.7888, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 0.6716674695478632, | |
| "grad_norm": 0.3621816668127261, | |
| "learning_rate": 1.3336977853945055e-06, | |
| "loss": 0.7831, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.6730438373133302, | |
| "grad_norm": 0.6046109892992472, | |
| "learning_rate": 1.3236418991920065e-06, | |
| "loss": 0.7899, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 0.6744202050787971, | |
| "grad_norm": 0.52626524074172, | |
| "learning_rate": 1.3136103989778138e-06, | |
| "loss": 0.7591, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.675796572844264, | |
| "grad_norm": 0.5540405574756208, | |
| "learning_rate": 1.303603492705649e-06, | |
| "loss": 0.8, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 0.6771729406097309, | |
| "grad_norm": 0.3755183771069488, | |
| "learning_rate": 1.2936213878194031e-06, | |
| "loss": 0.819, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.6785493083751979, | |
| "grad_norm": 0.585894250903544, | |
| "learning_rate": 1.2836642912488287e-06, | |
| "loss": 0.8327, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 0.6799256761406648, | |
| "grad_norm": 0.5007223315080989, | |
| "learning_rate": 1.2737324094052569e-06, | |
| "loss": 0.8055, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.6813020439061317, | |
| "grad_norm": 0.4543771383459902, | |
| "learning_rate": 1.2638259481773164e-06, | |
| "loss": 0.7892, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.6826784116715987, | |
| "grad_norm": 0.45325256663521224, | |
| "learning_rate": 1.2539451129266603e-06, | |
| "loss": 0.7904, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.6840547794370656, | |
| "grad_norm": 0.42755072695822427, | |
| "learning_rate": 1.244090108483718e-06, | |
| "loss": 0.8696, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 0.6854311472025325, | |
| "grad_norm": 0.5749380186174909, | |
| "learning_rate": 1.2342611391434424e-06, | |
| "loss": 0.7695, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.6868075149679994, | |
| "grad_norm": 0.47696517661731086, | |
| "learning_rate": 1.2244584086610783e-06, | |
| "loss": 0.8061, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 0.6881838827334664, | |
| "grad_norm": 0.47628535730003985, | |
| "learning_rate": 1.2146821202479347e-06, | |
| "loss": 0.8252, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6895602504989333, | |
| "grad_norm": 0.4918566967586663, | |
| "learning_rate": 1.204932476567175e-06, | |
| "loss": 0.8306, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 0.6909366182644002, | |
| "grad_norm": 0.45157773288125097, | |
| "learning_rate": 1.1952096797296167e-06, | |
| "loss": 0.7911, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.6923129860298671, | |
| "grad_norm": 0.5449035155568759, | |
| "learning_rate": 1.1855139312895412e-06, | |
| "loss": 0.8297, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 0.6936893537953341, | |
| "grad_norm": 0.539632313493914, | |
| "learning_rate": 1.175845432240511e-06, | |
| "loss": 0.7938, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.695065721560801, | |
| "grad_norm": 0.267579658107954, | |
| "learning_rate": 1.16620438301121e-06, | |
| "loss": 0.8326, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.6964420893262679, | |
| "grad_norm": 0.45094192311072506, | |
| "learning_rate": 1.1565909834612843e-06, | |
| "loss": 0.8183, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.697818457091735, | |
| "grad_norm": 0.45018310356306585, | |
| "learning_rate": 1.1470054328772015e-06, | |
| "loss": 0.8312, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 0.6991948248572019, | |
| "grad_norm": 0.5424670154260764, | |
| "learning_rate": 1.1374479299681144e-06, | |
| "loss": 0.8547, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.7005711926226688, | |
| "grad_norm": 0.3929689914659013, | |
| "learning_rate": 1.12791867286175e-06, | |
| "loss": 0.7631, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 0.7019475603881357, | |
| "grad_norm": 0.5189958974232449, | |
| "learning_rate": 1.1184178591002936e-06, | |
| "loss": 0.7974, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7033239281536027, | |
| "grad_norm": 0.4940843745776494, | |
| "learning_rate": 1.1089456856363023e-06, | |
| "loss": 0.7859, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 0.7047002959190696, | |
| "grad_norm": 0.3903233975311801, | |
| "learning_rate": 1.0995023488286132e-06, | |
| "loss": 0.7555, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.7060766636845365, | |
| "grad_norm": 0.44367526110003797, | |
| "learning_rate": 1.090088044438281e-06, | |
| "loss": 0.83, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 0.7074530314500035, | |
| "grad_norm": 0.3625988730276094, | |
| "learning_rate": 1.0807029676245146e-06, | |
| "loss": 0.772, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.7088293992154704, | |
| "grad_norm": 0.4378410267181844, | |
| "learning_rate": 1.0713473129406342e-06, | |
| "loss": 0.7913, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.7102057669809373, | |
| "grad_norm": 0.5659553149700242, | |
| "learning_rate": 1.062021274330035e-06, | |
| "loss": 0.8333, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.7115821347464042, | |
| "grad_norm": 0.480279896191206, | |
| "learning_rate": 1.0527250451221714e-06, | |
| "loss": 0.7924, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 0.7129585025118712, | |
| "grad_norm": 0.5794313371061417, | |
| "learning_rate": 1.043458818028546e-06, | |
| "loss": 0.8025, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.7143348702773381, | |
| "grad_norm": 0.577692103807798, | |
| "learning_rate": 1.0342227851387132e-06, | |
| "loss": 0.8102, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 0.715711238042805, | |
| "grad_norm": 0.7048819923650593, | |
| "learning_rate": 1.0250171379163035e-06, | |
| "loss": 0.8235, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.715711238042805, | |
| "eval_loss": 0.7914655208587646, | |
| "eval_runtime": 37.5636, | |
| "eval_samples_per_second": 133.108, | |
| "eval_steps_per_second": 2.103, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.717087605808272, | |
| "grad_norm": 0.5135405438687206, | |
| "learning_rate": 1.0158420671950458e-06, | |
| "loss": 0.8354, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 0.7184639735737389, | |
| "grad_norm": 0.49088601989364183, | |
| "learning_rate": 1.0066977631748192e-06, | |
| "loss": 0.8243, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.7198403413392058, | |
| "grad_norm": 0.43340747956612413, | |
| "learning_rate": 9.975844154177068e-07, | |
| "loss": 0.8082, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 0.7212167091046727, | |
| "grad_norm": 0.4929677260928928, | |
| "learning_rate": 9.88502212844063e-07, | |
| "loss": 0.8259, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.7225930768701397, | |
| "grad_norm": 0.5110702548169729, | |
| "learning_rate": 9.794513437286039e-07, | |
| "loss": 0.8231, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.7239694446356066, | |
| "grad_norm": 0.43992378048573655, | |
| "learning_rate": 9.704319956964997e-07, | |
| "loss": 0.7803, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.7253458124010735, | |
| "grad_norm": 0.3073181934194982, | |
| "learning_rate": 9.61444355719484e-07, | |
| "loss": 0.7606, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 0.7267221801665406, | |
| "grad_norm": 0.5324585928575014, | |
| "learning_rate": 9.524886101119846e-07, | |
| "loss": 0.8537, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.7280985479320075, | |
| "grad_norm": 0.45921892305675543, | |
| "learning_rate": 9.435649445272516e-07, | |
| "loss": 0.8069, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 0.7294749156974744, | |
| "grad_norm": 0.48054038898023743, | |
| "learning_rate": 9.346735439535182e-07, | |
| "loss": 0.8097, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7308512834629413, | |
| "grad_norm": 0.4380511249962659, | |
| "learning_rate": 9.25814592710158e-07, | |
| "loss": 0.7914, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 0.7322276512284083, | |
| "grad_norm": 0.337588637619449, | |
| "learning_rate": 9.16988274443871e-07, | |
| "loss": 0.7967, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.7336040189938752, | |
| "grad_norm": 0.4171603673327812, | |
| "learning_rate": 9.08194772124871e-07, | |
| "loss": 0.7909, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 0.7349803867593421, | |
| "grad_norm": 0.5340434480071784, | |
| "learning_rate": 8.994342680430971e-07, | |
| "loss": 0.7702, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.736356754524809, | |
| "grad_norm": 0.45146796076237466, | |
| "learning_rate": 8.907069438044283e-07, | |
| "loss": 0.8057, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.737733122290276, | |
| "grad_norm": 0.4216880628061979, | |
| "learning_rate": 8.820129803269272e-07, | |
| "loss": 0.8074, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.7391094900557429, | |
| "grad_norm": 0.5979240606977037, | |
| "learning_rate": 8.733525578370849e-07, | |
| "loss": 0.8162, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 0.7404858578212098, | |
| "grad_norm": 0.490528351797546, | |
| "learning_rate": 8.647258558660829e-07, | |
| "loss": 0.8103, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.7418622255866768, | |
| "grad_norm": 0.6084011328958213, | |
| "learning_rate": 8.561330532460765e-07, | |
| "loss": 0.8821, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 0.7432385933521437, | |
| "grad_norm": 0.3812636095028781, | |
| "learning_rate": 8.47574328106483e-07, | |
| "loss": 0.7365, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.7446149611176106, | |
| "grad_norm": 0.4418699971833215, | |
| "learning_rate": 8.390498578702924e-07, | |
| "loss": 0.8175, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 0.7459913288830775, | |
| "grad_norm": 0.3665034778827065, | |
| "learning_rate": 8.305598192503892e-07, | |
| "loss": 0.7635, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.7473676966485445, | |
| "grad_norm": 0.39085090708364434, | |
| "learning_rate": 8.22104388245884e-07, | |
| "loss": 0.7682, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 0.7487440644140114, | |
| "grad_norm": 0.515141732505318, | |
| "learning_rate": 8.136837401384734e-07, | |
| "loss": 0.8256, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.7501204321794783, | |
| "grad_norm": 0.4873699702775256, | |
| "learning_rate": 8.052980494887996e-07, | |
| "loss": 0.8079, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.7514967999449453, | |
| "grad_norm": 0.3657787697288285, | |
| "learning_rate": 7.969474901328359e-07, | |
| "loss": 0.78, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.7528731677104122, | |
| "grad_norm": 0.514544078198768, | |
| "learning_rate": 7.886322351782782e-07, | |
| "loss": 0.821, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 0.7542495354758791, | |
| "grad_norm": 0.31173151058089976, | |
| "learning_rate": 7.803524570009638e-07, | |
| "loss": 0.793, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.755625903241346, | |
| "grad_norm": 0.42241508592582616, | |
| "learning_rate": 7.7210832724129e-07, | |
| "loss": 0.7798, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 0.757002271006813, | |
| "grad_norm": 0.3071764331948748, | |
| "learning_rate": 7.63900016800663e-07, | |
| "loss": 0.7698, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.75837863877228, | |
| "grad_norm": 0.5439311482897147, | |
| "learning_rate": 7.55727695837949e-07, | |
| "loss": 0.8452, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 0.7597550065377469, | |
| "grad_norm": 0.3563361621522225, | |
| "learning_rate": 7.475915337659517e-07, | |
| "loss": 0.7901, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.7611313743032139, | |
| "grad_norm": 0.5754448075856651, | |
| "learning_rate": 7.394916992478982e-07, | |
| "loss": 0.7638, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 0.7625077420686808, | |
| "grad_norm": 0.511147390789392, | |
| "learning_rate": 7.314283601939432e-07, | |
| "loss": 0.7966, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.7638841098341477, | |
| "grad_norm": 0.5082660532801647, | |
| "learning_rate": 7.234016837576855e-07, | |
| "loss": 0.7977, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.7652604775996146, | |
| "grad_norm": 0.4037699385261041, | |
| "learning_rate": 7.154118363327076e-07, | |
| "loss": 0.8714, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.7666368453650816, | |
| "grad_norm": 0.4667815625876788, | |
| "learning_rate": 7.074589835491236e-07, | |
| "loss": 0.797, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 0.7680132131305485, | |
| "grad_norm": 0.4480832075891221, | |
| "learning_rate": 6.995432902701452e-07, | |
| "loss": 0.8327, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.7693895808960154, | |
| "grad_norm": 0.42380996565690365, | |
| "learning_rate": 6.916649205886639e-07, | |
| "loss": 0.7462, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 0.7707659486614824, | |
| "grad_norm": 0.33453237656086077, | |
| "learning_rate": 6.838240378238528e-07, | |
| "loss": 0.7854, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.7707659486614824, | |
| "eval_loss": 0.788250744342804, | |
| "eval_runtime": 37.5666, | |
| "eval_samples_per_second": 133.097, | |
| "eval_steps_per_second": 2.103, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.7721423164269493, | |
| "grad_norm": 0.7936762263185101, | |
| "learning_rate": 6.760208045177777e-07, | |
| "loss": 0.8265, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 0.7735186841924162, | |
| "grad_norm": 0.5449658294351013, | |
| "learning_rate": 6.68255382432027e-07, | |
| "loss": 0.8289, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.7748950519578831, | |
| "grad_norm": 0.5162826155892138, | |
| "learning_rate": 6.605279325443615e-07, | |
| "loss": 0.7767, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 0.7762714197233501, | |
| "grad_norm": 0.36993986629866255, | |
| "learning_rate": 6.528386150453747e-07, | |
| "loss": 0.7914, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.777647787488817, | |
| "grad_norm": 0.4025106600408673, | |
| "learning_rate": 6.451875893351742e-07, | |
| "loss": 0.8094, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.7790241552542839, | |
| "grad_norm": 0.3803752081384692, | |
| "learning_rate": 6.375750140200729e-07, | |
| "loss": 0.7834, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.7804005230197508, | |
| "grad_norm": 0.4647690834948723, | |
| "learning_rate": 6.300010469093085e-07, | |
| "loss": 0.7677, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 0.7817768907852178, | |
| "grad_norm": 0.4360851526308311, | |
| "learning_rate": 6.224658450117638e-07, | |
| "loss": 0.8241, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.7831532585506847, | |
| "grad_norm": 0.40784724834132835, | |
| "learning_rate": 6.149695645327197e-07, | |
| "loss": 0.7794, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 0.7845296263161516, | |
| "grad_norm": 0.3733748290573589, | |
| "learning_rate": 6.075123608706093e-07, | |
| "loss": 0.7934, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.7859059940816187, | |
| "grad_norm": 0.46257419791313253, | |
| "learning_rate": 6.000943886138039e-07, | |
| "loss": 0.8197, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 0.7872823618470856, | |
| "grad_norm": 0.3846998032798624, | |
| "learning_rate": 5.927158015374032e-07, | |
| "loss": 0.7601, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.7886587296125525, | |
| "grad_norm": 0.32739463317156486, | |
| "learning_rate": 5.853767526000506e-07, | |
| "loss": 0.7976, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 0.7900350973780194, | |
| "grad_norm": 0.38672592567017716, | |
| "learning_rate": 5.780773939407586e-07, | |
| "loss": 0.8075, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.7914114651434864, | |
| "grad_norm": 0.4386867822178414, | |
| "learning_rate": 5.708178768757594e-07, | |
| "loss": 0.8151, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.7927878329089533, | |
| "grad_norm": 0.3283397967295564, | |
| "learning_rate": 5.635983518953664e-07, | |
| "loss": 0.8467, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.7941642006744202, | |
| "grad_norm": 0.4837636826148133, | |
| "learning_rate": 5.564189686608528e-07, | |
| "loss": 0.829, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 0.7955405684398872, | |
| "grad_norm": 0.7164604908817186, | |
| "learning_rate": 5.492798760013504e-07, | |
| "loss": 0.8363, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.7969169362053541, | |
| "grad_norm": 0.5246720628848824, | |
| "learning_rate": 5.421812219107652e-07, | |
| "loss": 0.7728, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 0.798293303970821, | |
| "grad_norm": 0.48145772973666245, | |
| "learning_rate": 5.351231535447096e-07, | |
| "loss": 0.8351, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.7996696717362879, | |
| "grad_norm": 0.6428159033099584, | |
| "learning_rate": 5.2810581721745e-07, | |
| "loss": 0.7936, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 0.8010460395017549, | |
| "grad_norm": 0.42644687973421025, | |
| "learning_rate": 5.211293583988736e-07, | |
| "loss": 0.7612, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.8024224072672218, | |
| "grad_norm": 0.49844315374917036, | |
| "learning_rate": 5.141939217114761e-07, | |
| "loss": 0.9081, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 0.8037987750326887, | |
| "grad_norm": 0.45961744850293057, | |
| "learning_rate": 5.072996509273597e-07, | |
| "loss": 0.7703, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.8051751427981557, | |
| "grad_norm": 0.46944692269351923, | |
| "learning_rate": 5.004466889652568e-07, | |
| "loss": 0.8183, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.8065515105636226, | |
| "grad_norm": 0.43054493887650536, | |
| "learning_rate": 4.93635177887562e-07, | |
| "loss": 0.8182, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.8079278783290895, | |
| "grad_norm": 0.40134994197247525, | |
| "learning_rate": 4.86865258897391e-07, | |
| "loss": 0.7662, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 0.8093042460945564, | |
| "grad_norm": 0.3679569701658888, | |
| "learning_rate": 4.801370723356533e-07, | |
| "loss": 0.7397, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.8106806138600234, | |
| "grad_norm": 0.4996662512653364, | |
| "learning_rate": 4.7345075767814277e-07, | |
| "loss": 0.7655, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 0.8120569816254903, | |
| "grad_norm": 0.33008573990481305, | |
| "learning_rate": 4.668064535326433e-07, | |
| "loss": 0.7733, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8134333493909572, | |
| "grad_norm": 0.5216225381160015, | |
| "learning_rate": 4.602042976360596e-07, | |
| "loss": 0.8131, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 0.8148097171564243, | |
| "grad_norm": 0.3009799921127595, | |
| "learning_rate": 4.536444268515608e-07, | |
| "loss": 0.761, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.8161860849218912, | |
| "grad_norm": 0.4808517772368335, | |
| "learning_rate": 4.4712697716573994e-07, | |
| "loss": 0.7887, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 0.817562452687358, | |
| "grad_norm": 0.31056324010954034, | |
| "learning_rate": 4.406520836858003e-07, | |
| "loss": 0.7373, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.818938820452825, | |
| "grad_norm": 0.4026317051978566, | |
| "learning_rate": 4.342198806367512e-07, | |
| "loss": 0.8102, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.820315188218292, | |
| "grad_norm": 0.571522212386199, | |
| "learning_rate": 4.2783050135862454e-07, | |
| "loss": 0.8232, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.8216915559837589, | |
| "grad_norm": 0.6020021292977741, | |
| "learning_rate": 4.2148407830371553e-07, | |
| "loss": 0.8423, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 0.8230679237492258, | |
| "grad_norm": 0.31515971826344896, | |
| "learning_rate": 4.1518074303383006e-07, | |
| "loss": 0.7556, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.8244442915146927, | |
| "grad_norm": 0.4964415865676406, | |
| "learning_rate": 4.0892062621756436e-07, | |
| "loss": 0.8106, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 0.8258206592801597, | |
| "grad_norm": 0.5295609606861289, | |
| "learning_rate": 4.027038576275921e-07, | |
| "loss": 0.7958, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8258206592801597, | |
| "eval_loss": 0.7862712144851685, | |
| "eval_runtime": 37.5789, | |
| "eval_samples_per_second": 133.054, | |
| "eval_steps_per_second": 2.102, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8271970270456266, | |
| "grad_norm": 0.5235293567756518, | |
| "learning_rate": 3.9653056613797315e-07, | |
| "loss": 0.8119, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 0.8285733948110935, | |
| "grad_norm": 0.46096657495011545, | |
| "learning_rate": 3.904008797214867e-07, | |
| "loss": 0.7939, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.8299497625765605, | |
| "grad_norm": 0.4583761081561358, | |
| "learning_rate": 3.8431492544697384e-07, | |
| "loss": 0.8206, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 0.8313261303420274, | |
| "grad_norm": 0.4307989854498663, | |
| "learning_rate": 3.7827282947670686e-07, | |
| "loss": 0.805, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.8327024981074943, | |
| "grad_norm": 0.6458058398139984, | |
| "learning_rate": 3.722747170637703e-07, | |
| "loss": 0.8272, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.8340788658729612, | |
| "grad_norm": 0.27246801971613765, | |
| "learning_rate": 3.663207125494667e-07, | |
| "loss": 0.7188, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.8354552336384282, | |
| "grad_norm": 0.31316968826402736, | |
| "learning_rate": 3.604109393607397e-07, | |
| "loss": 0.7771, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 0.8368316014038951, | |
| "grad_norm": 0.46558859230251337, | |
| "learning_rate": 3.545455200076148e-07, | |
| "loss": 0.7697, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.838207969169362, | |
| "grad_norm": 0.4510507655849259, | |
| "learning_rate": 3.4872457608065706e-07, | |
| "loss": 0.7729, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 0.839584336934829, | |
| "grad_norm": 0.43470367010584865, | |
| "learning_rate": 3.4294822824845447e-07, | |
| "loss": 0.8024, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.8409607047002959, | |
| "grad_norm": 0.5457490936555786, | |
| "learning_rate": 3.3721659625511466e-07, | |
| "loss": 0.8288, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 0.8423370724657628, | |
| "grad_norm": 0.4460830059047153, | |
| "learning_rate": 3.315297989177829e-07, | |
| "loss": 0.7704, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.8437134402312297, | |
| "grad_norm": 0.4017748109392074, | |
| "learning_rate": 3.2588795412417715e-07, | |
| "loss": 0.8081, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 0.8450898079966968, | |
| "grad_norm": 0.43509526046571056, | |
| "learning_rate": 3.20291178830148e-07, | |
| "loss": 0.7696, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.8464661757621637, | |
| "grad_norm": 0.4968191608476648, | |
| "learning_rate": 3.1473958905725023e-07, | |
| "loss": 0.8007, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.8478425435276306, | |
| "grad_norm": 0.35779440235114646, | |
| "learning_rate": 3.092332998903416e-07, | |
| "loss": 0.7844, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.8492189112930976, | |
| "grad_norm": 0.41824405953159766, | |
| "learning_rate": 3.0377242547519224e-07, | |
| "loss": 0.8119, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 0.8505952790585645, | |
| "grad_norm": 0.3627881737322641, | |
| "learning_rate": 2.983570790161236e-07, | |
| "loss": 0.7926, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.8519716468240314, | |
| "grad_norm": 0.6401049982386239, | |
| "learning_rate": 2.9298737277365875e-07, | |
| "loss": 0.7957, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 0.8533480145894983, | |
| "grad_norm": 0.4518026617163562, | |
| "learning_rate": 2.8766341806219565e-07, | |
| "loss": 0.8071, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.8547243823549653, | |
| "grad_norm": 0.431837948798665, | |
| "learning_rate": 2.823853252476988e-07, | |
| "loss": 0.8007, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 0.8561007501204322, | |
| "grad_norm": 0.347884381355328, | |
| "learning_rate": 2.771532037454136e-07, | |
| "loss": 0.8173, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.8574771178858991, | |
| "grad_norm": 0.4330221328925689, | |
| "learning_rate": 2.719671620175968e-07, | |
| "loss": 0.7266, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 0.8588534856513661, | |
| "grad_norm": 0.3637332745725778, | |
| "learning_rate": 2.6682730757126627e-07, | |
| "loss": 0.8076, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.860229853416833, | |
| "grad_norm": 0.6109148107559905, | |
| "learning_rate": 2.6173374695597693e-07, | |
| "loss": 0.8339, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.8616062211822999, | |
| "grad_norm": 0.5076827526837399, | |
| "learning_rate": 2.566865857616066e-07, | |
| "loss": 0.8432, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.8629825889477668, | |
| "grad_norm": 0.5457670411835718, | |
| "learning_rate": 2.5168592861617216e-07, | |
| "loss": 0.7928, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 0.8643589567132338, | |
| "grad_norm": 0.41906024544255965, | |
| "learning_rate": 2.4673187918365593e-07, | |
| "loss": 0.7741, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.8657353244787007, | |
| "grad_norm": 0.5161588655724126, | |
| "learning_rate": 2.4182454016186046e-07, | |
| "loss": 0.8115, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 0.8671116922441676, | |
| "grad_norm": 0.3816011931932575, | |
| "learning_rate": 2.3696401328027806e-07, | |
| "loss": 0.7693, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.8684880600096345, | |
| "grad_norm": 0.4654707569379619, | |
| "learning_rate": 2.3215039929798205e-07, | |
| "loss": 0.8122, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 0.8698644277751015, | |
| "grad_norm": 0.306667830180016, | |
| "learning_rate": 2.2738379800153641e-07, | |
| "loss": 0.7612, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.8712407955405684, | |
| "grad_norm": 0.46545229964609797, | |
| "learning_rate": 2.226643082029309e-07, | |
| "loss": 0.7892, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 0.8726171633060353, | |
| "grad_norm": 0.1953075510516031, | |
| "learning_rate": 2.1799202773752943e-07, | |
| "loss": 0.7521, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.8739935310715023, | |
| "grad_norm": 0.381786701995952, | |
| "learning_rate": 2.1336705346204301e-07, | |
| "loss": 0.8512, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.8753698988369693, | |
| "grad_norm": 0.43933009705251314, | |
| "learning_rate": 2.087894812525218e-07, | |
| "loss": 0.8025, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.8767462666024362, | |
| "grad_norm": 0.44005358531346456, | |
| "learning_rate": 2.042594060023681e-07, | |
| "loss": 0.7756, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 0.878122634367903, | |
| "grad_norm": 0.49824718850948474, | |
| "learning_rate": 1.9977692162036876e-07, | |
| "loss": 0.7978, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.8794990021333701, | |
| "grad_norm": 0.3227198829491034, | |
| "learning_rate": 1.95342121028749e-07, | |
| "loss": 0.7738, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 0.880875369898837, | |
| "grad_norm": 0.34555908654114686, | |
| "learning_rate": 1.9095509616124385e-07, | |
| "loss": 0.8192, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.880875369898837, | |
| "eval_loss": 0.7828695774078369, | |
| "eval_runtime": 37.5732, | |
| "eval_samples_per_second": 133.074, | |
| "eval_steps_per_second": 2.103, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.8822517376643039, | |
| "grad_norm": 0.4058193409462779, | |
| "learning_rate": 1.866159379611965e-07, | |
| "loss": 0.7827, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 0.8836281054297709, | |
| "grad_norm": 0.634249713668319, | |
| "learning_rate": 1.8232473637966874e-07, | |
| "loss": 0.8316, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.8850044731952378, | |
| "grad_norm": 0.47086197421634446, | |
| "learning_rate": 1.7808158037357997e-07, | |
| "loss": 0.8106, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 0.8863808409607047, | |
| "grad_norm": 0.4923463916845965, | |
| "learning_rate": 1.7388655790385928e-07, | |
| "loss": 0.7618, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.8877572087261716, | |
| "grad_norm": 0.5706429066921409, | |
| "learning_rate": 1.6973975593362557e-07, | |
| "loss": 0.8026, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.8891335764916386, | |
| "grad_norm": 0.40654338758220415, | |
| "learning_rate": 1.656412604263824e-07, | |
| "loss": 0.805, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.8905099442571055, | |
| "grad_norm": 0.36239810433911784, | |
| "learning_rate": 1.615911563442385e-07, | |
| "loss": 0.7901, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 0.8918863120225724, | |
| "grad_norm": 0.3902839217508603, | |
| "learning_rate": 1.5758952764614254e-07, | |
| "loss": 0.772, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.8932626797880394, | |
| "grad_norm": 0.4730030214965539, | |
| "learning_rate": 1.536364572861465e-07, | |
| "loss": 0.7981, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 0.8946390475535063, | |
| "grad_norm": 0.5222497792049411, | |
| "learning_rate": 1.4973202721168452e-07, | |
| "loss": 0.774, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.8960154153189732, | |
| "grad_norm": 0.457253030971323, | |
| "learning_rate": 1.4587631836187362e-07, | |
| "loss": 0.7762, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 0.8973917830844401, | |
| "grad_norm": 0.35084396615645064, | |
| "learning_rate": 1.420694106658363e-07, | |
| "loss": 0.7956, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.8987681508499071, | |
| "grad_norm": 0.46947234334867594, | |
| "learning_rate": 1.3831138304104374e-07, | |
| "loss": 0.7488, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 0.900144518615374, | |
| "grad_norm": 0.4211703728724711, | |
| "learning_rate": 1.3460231339168018e-07, | |
| "loss": 0.7594, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.9015208863808409, | |
| "grad_norm": 0.4318480909616356, | |
| "learning_rate": 1.3094227860702636e-07, | |
| "loss": 0.7535, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.902897254146308, | |
| "grad_norm": 0.5304843144531105, | |
| "learning_rate": 1.2733135455986755e-07, | |
| "loss": 0.7631, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.9042736219117748, | |
| "grad_norm": 0.39712712425481755, | |
| "learning_rate": 1.237696161049201e-07, | |
| "loss": 0.7967, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 0.9056499896772418, | |
| "grad_norm": 0.3607138196755753, | |
| "learning_rate": 1.2025713707727954e-07, | |
| "loss": 0.7673, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.9070263574427087, | |
| "grad_norm": 0.36526857232256044, | |
| "learning_rate": 1.1679399029088878e-07, | |
| "loss": 0.8021, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 0.9084027252081757, | |
| "grad_norm": 0.4307690645765424, | |
| "learning_rate": 1.1338024753703076e-07, | |
| "loss": 0.7855, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9097790929736426, | |
| "grad_norm": 0.5370242540693829, | |
| "learning_rate": 1.1001597958283927e-07, | |
| "loss": 0.7942, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 0.9111554607391095, | |
| "grad_norm": 0.762967780858564, | |
| "learning_rate": 1.067012561698319e-07, | |
| "loss": 0.7809, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.9125318285045764, | |
| "grad_norm": 0.38146623134983393, | |
| "learning_rate": 1.0343614601246388e-07, | |
| "loss": 0.8512, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 0.9139081962700434, | |
| "grad_norm": 0.483635357463634, | |
| "learning_rate": 1.0022071679670426e-07, | |
| "loss": 0.8334, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.9152845640355103, | |
| "grad_norm": 0.45987532559150957, | |
| "learning_rate": 9.705503517863286e-08, | |
| "loss": 0.7624, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.9166609318009772, | |
| "grad_norm": 0.5146306345364831, | |
| "learning_rate": 9.393916678305831e-08, | |
| "loss": 0.781, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.9180372995664442, | |
| "grad_norm": 0.4353941113364763, | |
| "learning_rate": 9.087317620215642e-08, | |
| "loss": 0.7926, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 0.9194136673319111, | |
| "grad_norm": 0.5238188017998141, | |
| "learning_rate": 8.78571269941339e-08, | |
| "loss": 0.7944, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.920790035097378, | |
| "grad_norm": 0.39784081936662746, | |
| "learning_rate": 8.48910816819079e-08, | |
| "loss": 0.7456, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 0.9221664028628449, | |
| "grad_norm": 0.4021648902631367, | |
| "learning_rate": 8.197510175181279e-08, | |
| "loss": 0.7972, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9235427706283119, | |
| "grad_norm": 0.4535198430400062, | |
| "learning_rate": 7.910924765232169e-08, | |
| "loss": 0.7716, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 0.9249191383937788, | |
| "grad_norm": 0.4679043443981073, | |
| "learning_rate": 7.629357879279764e-08, | |
| "loss": 0.8151, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.9262955061592457, | |
| "grad_norm": 0.4138512577922224, | |
| "learning_rate": 7.352815354225856e-08, | |
| "loss": 0.7778, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 0.9276718739247127, | |
| "grad_norm": 0.4959390023966221, | |
| "learning_rate": 7.08130292281703e-08, | |
| "loss": 0.7654, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.9290482416901796, | |
| "grad_norm": 0.3635744148121797, | |
| "learning_rate": 6.8148262135255e-08, | |
| "loss": 0.7713, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.9304246094556465, | |
| "grad_norm": 0.3880051004435765, | |
| "learning_rate": 6.553390750432709e-08, | |
| "loss": 0.797, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.9318009772211134, | |
| "grad_norm": 0.376848176957449, | |
| "learning_rate": 6.297001953114696e-08, | |
| "loss": 0.7915, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 0.9331773449865804, | |
| "grad_norm": 0.5229520565025577, | |
| "learning_rate": 6.045665136529683e-08, | |
| "loss": 0.7831, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.9345537127520473, | |
| "grad_norm": 0.3749002422429637, | |
| "learning_rate": 5.799385510908029e-08, | |
| "loss": 0.813, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 0.9359300805175143, | |
| "grad_norm": 0.538377010361361, | |
| "learning_rate": 5.558168181644147e-08, | |
| "loss": 0.765, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.9359300805175143, | |
| "eval_loss": 0.7824124693870544, | |
| "eval_runtime": 37.5729, | |
| "eval_samples_per_second": 133.075, | |
| "eval_steps_per_second": 2.103, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.9373064482829813, | |
| "grad_norm": 0.40542132743231607, | |
| "learning_rate": 5.3220181491906997e-08, | |
| "loss": 0.7939, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 0.9386828160484482, | |
| "grad_norm": 0.45653448883349285, | |
| "learning_rate": 5.0909403089548504e-08, | |
| "loss": 0.7683, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.9400591838139151, | |
| "grad_norm": 0.43498449283812496, | |
| "learning_rate": 4.864939451196926e-08, | |
| "loss": 0.7706, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 0.941435551579382, | |
| "grad_norm": 0.4632642432067583, | |
| "learning_rate": 4.6440202609309983e-08, | |
| "loss": 0.847, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.942811919344849, | |
| "grad_norm": 0.398573024905695, | |
| "learning_rate": 4.428187317827848e-08, | |
| "loss": 0.8004, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.9441882871103159, | |
| "grad_norm": 0.3520961675944394, | |
| "learning_rate": 4.217445096119932e-08, | |
| "loss": 0.7768, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.9455646548757828, | |
| "grad_norm": 0.2659907213915567, | |
| "learning_rate": 4.011797964508707e-08, | |
| "loss": 0.8068, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 0.9469410226412498, | |
| "grad_norm": 0.35721378611449023, | |
| "learning_rate": 3.8112501860740893e-08, | |
| "loss": 0.7761, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.9483173904067167, | |
| "grad_norm": 0.5662605277154725, | |
| "learning_rate": 3.615805918185999e-08, | |
| "loss": 0.7956, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 0.9496937581721836, | |
| "grad_norm": 0.4708997292827784, | |
| "learning_rate": 3.4254692124181256e-08, | |
| "loss": 0.781, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.9510701259376505, | |
| "grad_norm": 0.3463983685723411, | |
| "learning_rate": 3.240244014464211e-08, | |
| "loss": 0.8038, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 0.9524464937031175, | |
| "grad_norm": 0.30914041030379646, | |
| "learning_rate": 3.060134164055928e-08, | |
| "loss": 0.7855, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.9538228614685844, | |
| "grad_norm": 0.4094929093409069, | |
| "learning_rate": 2.885143394883466e-08, | |
| "loss": 0.7922, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 0.9551992292340513, | |
| "grad_norm": 0.3550357942305495, | |
| "learning_rate": 2.7152753345181248e-08, | |
| "loss": 0.7488, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.9565755969995182, | |
| "grad_norm": 0.4969121322892503, | |
| "learning_rate": 2.5505335043370105e-08, | |
| "loss": 0.8235, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.9579519647649852, | |
| "grad_norm": 0.5594870015775546, | |
| "learning_rate": 2.3909213194501513e-08, | |
| "loss": 0.8019, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.9593283325304521, | |
| "grad_norm": 0.3982458733337497, | |
| "learning_rate": 2.2364420886297202e-08, | |
| "loss": 0.7931, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 0.960704700295919, | |
| "grad_norm": 0.40600940068470787, | |
| "learning_rate": 2.087099014241256e-08, | |
| "loss": 0.7751, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.962081068061386, | |
| "grad_norm": 0.5688667017637077, | |
| "learning_rate": 1.9428951921774687e-08, | |
| "loss": 0.8253, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 0.963457435826853, | |
| "grad_norm": 0.41507791635586866, | |
| "learning_rate": 1.8038336117940368e-08, | |
| "loss": 0.7615, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9648338035923198, | |
| "grad_norm": 0.3974167729618538, | |
| "learning_rate": 1.6699171558474946e-08, | |
| "loss": 0.7943, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 0.9662101713577868, | |
| "grad_norm": 0.6940727253100508, | |
| "learning_rate": 1.541148600435721e-08, | |
| "loss": 0.8198, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.9675865391232538, | |
| "grad_norm": 0.46568158788586206, | |
| "learning_rate": 1.4175306149400715e-08, | |
| "loss": 0.8164, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 0.9689629068887207, | |
| "grad_norm": 0.5038186258746858, | |
| "learning_rate": 1.2990657619703361e-08, | |
| "loss": 0.755, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.9703392746541876, | |
| "grad_norm": 0.44963940045587836, | |
| "learning_rate": 1.1857564973114798e-08, | |
| "loss": 0.8276, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.9717156424196546, | |
| "grad_norm": 0.3869453345256143, | |
| "learning_rate": 1.0776051698727363e-08, | |
| "loss": 0.7643, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.9730920101851215, | |
| "grad_norm": 0.4625561038516912, | |
| "learning_rate": 9.746140216388978e-09, | |
| "loss": 0.7961, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 0.9744683779505884, | |
| "grad_norm": 0.43657476353887914, | |
| "learning_rate": 8.767851876239075e-09, | |
| "loss": 0.785, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.9758447457160553, | |
| "grad_norm": 0.36886304870486564, | |
| "learning_rate": 7.841206958265901e-09, | |
| "loss": 0.8109, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 0.9772211134815223, | |
| "grad_norm": 0.48365435933116596, | |
| "learning_rate": 6.9662246718849025e-09, | |
| "loss": 0.805, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.9785974812469892, | |
| "grad_norm": 0.5676220156571874, | |
| "learning_rate": 6.142923155542379e-09, | |
| "loss": 0.8249, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 0.9799738490124561, | |
| "grad_norm": 0.5777045577708868, | |
| "learning_rate": 5.371319476338288e-09, | |
| "loss": 0.8371, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.9813502167779231, | |
| "grad_norm": 0.4285639278318182, | |
| "learning_rate": 4.651429629672077e-09, | |
| "loss": 0.8493, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 0.98272658454339, | |
| "grad_norm": 0.5788587404914071, | |
| "learning_rate": 3.9832685389123995e-09, | |
| "loss": 0.8533, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.9841029523088569, | |
| "grad_norm": 0.504857407057922, | |
| "learning_rate": 3.3668500550870787e-09, | |
| "loss": 0.8482, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.9854793200743238, | |
| "grad_norm": 0.48362838745693276, | |
| "learning_rate": 2.8021869565958427e-09, | |
| "loss": 0.7877, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.9868556878397908, | |
| "grad_norm": 0.4381509362003844, | |
| "learning_rate": 2.289290948944978e-09, | |
| "loss": 0.8337, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 0.9882320556052577, | |
| "grad_norm": 0.35883074278050137, | |
| "learning_rate": 1.8281726645061338e-09, | |
| "loss": 0.8103, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.9896084233707246, | |
| "grad_norm": 0.30791028488533356, | |
| "learning_rate": 1.4188416622945566e-09, | |
| "loss": 0.7772, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 0.9909847911361916, | |
| "grad_norm": 0.519345784734355, | |
| "learning_rate": 1.0613064277711916e-09, | |
| "loss": 0.7939, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.9909847911361916, | |
| "eval_loss": 0.7823675870895386, | |
| "eval_runtime": 37.5664, | |
| "eval_samples_per_second": 133.098, | |
| "eval_steps_per_second": 2.103, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.9923611589016585, | |
| "grad_norm": 0.5511524759279022, | |
| "learning_rate": 7.555743726675446e-10, | |
| "loss": 0.7623, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 0.9937375266671254, | |
| "grad_norm": 0.46879838306977284, | |
| "learning_rate": 5.01651834831085e-10, | |
| "loss": 0.7906, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.9951138944325923, | |
| "grad_norm": 0.5200935954838051, | |
| "learning_rate": 2.9954407809423823e-10, | |
| "loss": 0.788, | |
| "step": 3615 | |
| }, | |
| { | |
| "epoch": 0.9964902621980594, | |
| "grad_norm": 0.47789258781015914, | |
| "learning_rate": 1.4925529216558432e-10, | |
| "loss": 0.7885, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.9978666299635263, | |
| "grad_norm": 0.2942434423794945, | |
| "learning_rate": 5.078859254242785e-11, | |
| "loss": 0.8001, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.9992429977289932, | |
| "grad_norm": 0.49341815424912977, | |
| "learning_rate": 4.1460204466825526e-12, | |
| "loss": 0.8235, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.99979354483518, | |
| "step": 3632, | |
| "total_flos": 1258708224212992.0, | |
| "train_loss": 0.935843440076328, | |
| "train_runtime": 26049.5256, | |
| "train_samples_per_second": 35.699, | |
| "train_steps_per_second": 0.139 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 3632, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1258708224212992.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |