{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.996924023377423, "eval_steps": 500, "global_step": 16250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030759766225776685, "grad_norm": 16256.0, "learning_rate": 5e-06, "loss": 20.0553, "step": 5 }, { "epoch": 0.006151953245155337, "grad_norm": 22400.0, "learning_rate": 1e-05, "loss": 5.1282, "step": 10 }, { "epoch": 0.009227929867733005, "grad_norm": 116.0, "learning_rate": 1.5e-05, "loss": 0.5414, "step": 15 }, { "epoch": 0.012303906490310674, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 0.1132, "step": 20 }, { "epoch": 0.015379883112888341, "grad_norm": 0.38671875, "learning_rate": 2.5e-05, "loss": 0.0785, "step": 25 }, { "epoch": 0.01845585973546601, "grad_norm": 0.328125, "learning_rate": 3e-05, "loss": 0.0745, "step": 30 }, { "epoch": 0.02153183635804368, "grad_norm": 0.318359375, "learning_rate": 3.5e-05, "loss": 0.0691, "step": 35 }, { "epoch": 0.024607812980621348, "grad_norm": 0.314453125, "learning_rate": 4e-05, "loss": 0.0775, "step": 40 }, { "epoch": 0.027683789603199015, "grad_norm": 0.3125, "learning_rate": 4.5e-05, "loss": 0.0815, "step": 45 }, { "epoch": 0.030759766225776683, "grad_norm": 0.294921875, "learning_rate": 5e-05, "loss": 0.0817, "step": 50 }, { "epoch": 0.033835742848354354, "grad_norm": 0.2890625, "learning_rate": 4.998456790123457e-05, "loss": 0.0683, "step": 55 }, { "epoch": 0.03691171947093202, "grad_norm": 0.298828125, "learning_rate": 4.996913580246913e-05, "loss": 0.0737, "step": 60 }, { "epoch": 0.03998769609350969, "grad_norm": 0.314453125, "learning_rate": 4.995370370370371e-05, "loss": 0.0718, "step": 65 }, { "epoch": 0.04306367271608736, "grad_norm": 0.259765625, "learning_rate": 4.9938271604938276e-05, "loss": 0.0767, "step": 70 }, { "epoch": 0.046139649338665024, "grad_norm": 0.29296875, "learning_rate": 4.9922839506172845e-05, "loss": 0.0695, "step": 75 }, { "epoch": 0.049215625961242696, "grad_norm": 0.2578125, "learning_rate": 4.9907407407407406e-05, "loss": 0.0716, "step": 80 }, { "epoch": 0.05229160258382036, "grad_norm": 0.2578125, "learning_rate": 4.9891975308641975e-05, "loss": 0.0756, "step": 85 }, { "epoch": 0.05536757920639803, "grad_norm": 0.234375, "learning_rate": 4.987654320987655e-05, "loss": 0.0629, "step": 90 }, { "epoch": 0.0584435558289757, "grad_norm": 0.291015625, "learning_rate": 4.986111111111111e-05, "loss": 0.0769, "step": 95 }, { "epoch": 0.061519532451553366, "grad_norm": 0.279296875, "learning_rate": 4.984567901234568e-05, "loss": 0.0667, "step": 100 }, { "epoch": 0.06459550907413103, "grad_norm": 0.265625, "learning_rate": 4.983024691358025e-05, "loss": 0.0712, "step": 105 }, { "epoch": 0.06767148569670871, "grad_norm": 0.2353515625, "learning_rate": 4.981481481481482e-05, "loss": 0.0713, "step": 110 }, { "epoch": 0.07074746231928637, "grad_norm": 0.2578125, "learning_rate": 4.9799382716049385e-05, "loss": 0.0775, "step": 115 }, { "epoch": 0.07382343894186404, "grad_norm": 0.283203125, "learning_rate": 4.9783950617283954e-05, "loss": 0.069, "step": 120 }, { "epoch": 0.07689941556444171, "grad_norm": 0.26171875, "learning_rate": 4.976851851851852e-05, "loss": 0.0757, "step": 125 }, { "epoch": 0.07997539218701938, "grad_norm": 0.26953125, "learning_rate": 4.9753086419753084e-05, "loss": 0.0673, "step": 130 }, { "epoch": 0.08305136880959704, "grad_norm": 0.263671875, "learning_rate": 4.973765432098766e-05, "loss": 0.082, "step": 135 }, { "epoch": 0.08612734543217472, "grad_norm": 0.283203125, "learning_rate": 4.972222222222223e-05, "loss": 0.0671, "step": 140 }, { "epoch": 0.08920332205475238, "grad_norm": 0.28515625, "learning_rate": 4.970679012345679e-05, "loss": 0.076, "step": 145 }, { "epoch": 0.09227929867733005, "grad_norm": 0.28515625, "learning_rate": 4.969135802469136e-05, "loss": 0.0631, "step": 150 }, { "epoch": 0.09535527529990773, "grad_norm": 0.275390625, "learning_rate": 4.9675925925925926e-05, "loss": 0.0743, "step": 155 }, { "epoch": 0.09843125192248539, "grad_norm": 0.25390625, "learning_rate": 4.9660493827160495e-05, "loss": 0.0646, "step": 160 }, { "epoch": 0.10150722854506306, "grad_norm": 0.228515625, "learning_rate": 4.964506172839506e-05, "loss": 0.0765, "step": 165 }, { "epoch": 0.10458320516764072, "grad_norm": 0.28125, "learning_rate": 4.962962962962963e-05, "loss": 0.0684, "step": 170 }, { "epoch": 0.1076591817902184, "grad_norm": 0.22265625, "learning_rate": 4.96141975308642e-05, "loss": 0.0759, "step": 175 }, { "epoch": 0.11073515841279606, "grad_norm": 0.248046875, "learning_rate": 4.959876543209877e-05, "loss": 0.0717, "step": 180 }, { "epoch": 0.11381113503537373, "grad_norm": 0.26171875, "learning_rate": 4.958333333333334e-05, "loss": 0.0763, "step": 185 }, { "epoch": 0.1168871116579514, "grad_norm": 0.2578125, "learning_rate": 4.9567901234567905e-05, "loss": 0.07, "step": 190 }, { "epoch": 0.11996308828052907, "grad_norm": 0.302734375, "learning_rate": 4.9552469135802474e-05, "loss": 0.0695, "step": 195 }, { "epoch": 0.12303906490310673, "grad_norm": 0.26953125, "learning_rate": 4.9537037037037035e-05, "loss": 0.0686, "step": 200 }, { "epoch": 0.1261150415256844, "grad_norm": 0.2578125, "learning_rate": 4.952160493827161e-05, "loss": 0.0765, "step": 205 }, { "epoch": 0.12919101814826206, "grad_norm": 0.2275390625, "learning_rate": 4.950617283950618e-05, "loss": 0.0649, "step": 210 }, { "epoch": 0.13226699477083975, "grad_norm": 0.2412109375, "learning_rate": 4.949074074074074e-05, "loss": 0.0721, "step": 215 }, { "epoch": 0.13534297139341742, "grad_norm": 0.2470703125, "learning_rate": 4.947530864197531e-05, "loss": 0.0633, "step": 220 }, { "epoch": 0.13841894801599508, "grad_norm": 0.2490234375, "learning_rate": 4.945987654320988e-05, "loss": 0.0673, "step": 225 }, { "epoch": 0.14149492463857274, "grad_norm": 0.28125, "learning_rate": 4.9444444444444446e-05, "loss": 0.0751, "step": 230 }, { "epoch": 0.1445709012611504, "grad_norm": 0.2470703125, "learning_rate": 4.9429012345679015e-05, "loss": 0.0678, "step": 235 }, { "epoch": 0.14764687788372807, "grad_norm": 0.251953125, "learning_rate": 4.941358024691358e-05, "loss": 0.0738, "step": 240 }, { "epoch": 0.15072285450630576, "grad_norm": 0.234375, "learning_rate": 4.939814814814815e-05, "loss": 0.0689, "step": 245 }, { "epoch": 0.15379883112888343, "grad_norm": 0.236328125, "learning_rate": 4.938271604938271e-05, "loss": 0.0635, "step": 250 }, { "epoch": 0.1568748077514611, "grad_norm": 0.2578125, "learning_rate": 4.936728395061729e-05, "loss": 0.0634, "step": 255 }, { "epoch": 0.15995078437403876, "grad_norm": 0.248046875, "learning_rate": 4.935185185185186e-05, "loss": 0.0669, "step": 260 }, { "epoch": 0.16302676099661642, "grad_norm": 0.2578125, "learning_rate": 4.933641975308642e-05, "loss": 0.065, "step": 265 }, { "epoch": 0.16610273761919409, "grad_norm": 0.2490234375, "learning_rate": 4.932098765432099e-05, "loss": 0.0652, "step": 270 }, { "epoch": 0.16917871424177175, "grad_norm": 0.255859375, "learning_rate": 4.930555555555556e-05, "loss": 0.0702, "step": 275 }, { "epoch": 0.17225469086434944, "grad_norm": 0.2275390625, "learning_rate": 4.9290123456790124e-05, "loss": 0.0634, "step": 280 }, { "epoch": 0.1753306674869271, "grad_norm": 0.234375, "learning_rate": 4.927469135802469e-05, "loss": 0.0636, "step": 285 }, { "epoch": 0.17840664410950477, "grad_norm": 0.2421875, "learning_rate": 4.925925925925926e-05, "loss": 0.0661, "step": 290 }, { "epoch": 0.18148262073208243, "grad_norm": 0.263671875, "learning_rate": 4.924382716049383e-05, "loss": 0.0715, "step": 295 }, { "epoch": 0.1845585973546601, "grad_norm": 0.283203125, "learning_rate": 4.92283950617284e-05, "loss": 0.079, "step": 300 }, { "epoch": 0.18763457397723776, "grad_norm": 0.259765625, "learning_rate": 4.9212962962962966e-05, "loss": 0.0666, "step": 305 }, { "epoch": 0.19071055059981545, "grad_norm": 0.248046875, "learning_rate": 4.9197530864197535e-05, "loss": 0.0789, "step": 310 }, { "epoch": 0.19378652722239312, "grad_norm": 0.25390625, "learning_rate": 4.91820987654321e-05, "loss": 0.0677, "step": 315 }, { "epoch": 0.19686250384497078, "grad_norm": 0.236328125, "learning_rate": 4.9166666666666665e-05, "loss": 0.0709, "step": 320 }, { "epoch": 0.19993848046754845, "grad_norm": 0.2265625, "learning_rate": 4.915123456790124e-05, "loss": 0.0644, "step": 325 }, { "epoch": 0.2030144570901261, "grad_norm": 0.279296875, "learning_rate": 4.913580246913581e-05, "loss": 0.0664, "step": 330 }, { "epoch": 0.20609043371270377, "grad_norm": 0.24609375, "learning_rate": 4.912037037037037e-05, "loss": 0.0685, "step": 335 }, { "epoch": 0.20916641033528144, "grad_norm": 0.279296875, "learning_rate": 4.910493827160494e-05, "loss": 0.073, "step": 340 }, { "epoch": 0.21224238695785913, "grad_norm": 0.2197265625, "learning_rate": 4.9089506172839514e-05, "loss": 0.0648, "step": 345 }, { "epoch": 0.2153183635804368, "grad_norm": 0.25390625, "learning_rate": 4.9074074074074075e-05, "loss": 0.0699, "step": 350 }, { "epoch": 0.21839434020301446, "grad_norm": 0.23828125, "learning_rate": 4.9058641975308644e-05, "loss": 0.0656, "step": 355 }, { "epoch": 0.22147031682559212, "grad_norm": 0.23828125, "learning_rate": 4.904320987654321e-05, "loss": 0.0671, "step": 360 }, { "epoch": 0.2245462934481698, "grad_norm": 0.265625, "learning_rate": 4.902777777777778e-05, "loss": 0.0652, "step": 365 }, { "epoch": 0.22762227007074745, "grad_norm": 0.26953125, "learning_rate": 4.901234567901235e-05, "loss": 0.0729, "step": 370 }, { "epoch": 0.23069824669332514, "grad_norm": 0.22265625, "learning_rate": 4.899691358024692e-05, "loss": 0.0607, "step": 375 }, { "epoch": 0.2337742233159028, "grad_norm": 0.287109375, "learning_rate": 4.8981481481481486e-05, "loss": 0.0713, "step": 380 }, { "epoch": 0.23685019993848047, "grad_norm": 0.25390625, "learning_rate": 4.896604938271605e-05, "loss": 0.0714, "step": 385 }, { "epoch": 0.23992617656105814, "grad_norm": 0.255859375, "learning_rate": 4.8950617283950616e-05, "loss": 0.0686, "step": 390 }, { "epoch": 0.2430021531836358, "grad_norm": 0.234375, "learning_rate": 4.893518518518519e-05, "loss": 0.0661, "step": 395 }, { "epoch": 0.24607812980621346, "grad_norm": 0.208984375, "learning_rate": 4.891975308641975e-05, "loss": 0.0642, "step": 400 }, { "epoch": 0.24915410642879113, "grad_norm": 0.2294921875, "learning_rate": 4.890432098765432e-05, "loss": 0.0665, "step": 405 }, { "epoch": 0.2522300830513688, "grad_norm": 0.2412109375, "learning_rate": 4.888888888888889e-05, "loss": 0.0685, "step": 410 }, { "epoch": 0.2553060596739465, "grad_norm": 0.267578125, "learning_rate": 4.8873456790123465e-05, "loss": 0.0692, "step": 415 }, { "epoch": 0.2583820362965241, "grad_norm": 0.232421875, "learning_rate": 4.885802469135803e-05, "loss": 0.0719, "step": 420 }, { "epoch": 0.2614580129191018, "grad_norm": 0.234375, "learning_rate": 4.8842592592592595e-05, "loss": 0.0629, "step": 425 }, { "epoch": 0.2645339895416795, "grad_norm": 0.2265625, "learning_rate": 4.8827160493827164e-05, "loss": 0.0682, "step": 430 }, { "epoch": 0.26760996616425714, "grad_norm": 0.232421875, "learning_rate": 4.881172839506173e-05, "loss": 0.0616, "step": 435 }, { "epoch": 0.27068594278683483, "grad_norm": 0.25, "learning_rate": 4.87962962962963e-05, "loss": 0.061, "step": 440 }, { "epoch": 0.27376191940941247, "grad_norm": 0.23046875, "learning_rate": 4.878086419753087e-05, "loss": 0.0647, "step": 445 }, { "epoch": 0.27683789603199016, "grad_norm": 0.228515625, "learning_rate": 4.876543209876544e-05, "loss": 0.0635, "step": 450 }, { "epoch": 0.27991387265456785, "grad_norm": 0.22265625, "learning_rate": 4.875e-05, "loss": 0.0728, "step": 455 }, { "epoch": 0.2829898492771455, "grad_norm": 0.287109375, "learning_rate": 4.873456790123457e-05, "loss": 0.0681, "step": 460 }, { "epoch": 0.2860658258997232, "grad_norm": 0.259765625, "learning_rate": 4.871913580246914e-05, "loss": 0.0626, "step": 465 }, { "epoch": 0.2891418025223008, "grad_norm": 0.27734375, "learning_rate": 4.8703703703703704e-05, "loss": 0.0693, "step": 470 }, { "epoch": 0.2922177791448785, "grad_norm": 0.216796875, "learning_rate": 4.868827160493827e-05, "loss": 0.0591, "step": 475 }, { "epoch": 0.29529375576745615, "grad_norm": 0.26171875, "learning_rate": 4.867283950617284e-05, "loss": 0.07, "step": 480 }, { "epoch": 0.29836973239003384, "grad_norm": 0.236328125, "learning_rate": 4.865740740740741e-05, "loss": 0.069, "step": 485 }, { "epoch": 0.30144570901261153, "grad_norm": 0.259765625, "learning_rate": 4.864197530864198e-05, "loss": 0.0701, "step": 490 }, { "epoch": 0.30452168563518917, "grad_norm": 0.240234375, "learning_rate": 4.862654320987655e-05, "loss": 0.0688, "step": 495 }, { "epoch": 0.30759766225776686, "grad_norm": 0.224609375, "learning_rate": 4.8611111111111115e-05, "loss": 0.0663, "step": 500 }, { "epoch": 0.3106736388803445, "grad_norm": 0.2353515625, "learning_rate": 4.859567901234568e-05, "loss": 0.0682, "step": 505 }, { "epoch": 0.3137496155029222, "grad_norm": 0.224609375, "learning_rate": 4.858024691358025e-05, "loss": 0.072, "step": 510 }, { "epoch": 0.3168255921254998, "grad_norm": 0.2275390625, "learning_rate": 4.856481481481482e-05, "loss": 0.06, "step": 515 }, { "epoch": 0.3199015687480775, "grad_norm": 0.2197265625, "learning_rate": 4.854938271604938e-05, "loss": 0.0702, "step": 520 }, { "epoch": 0.3229775453706552, "grad_norm": 0.2490234375, "learning_rate": 4.853395061728395e-05, "loss": 0.0714, "step": 525 }, { "epoch": 0.32605352199323284, "grad_norm": 0.208984375, "learning_rate": 4.851851851851852e-05, "loss": 0.0619, "step": 530 }, { "epoch": 0.32912949861581053, "grad_norm": 0.2412109375, "learning_rate": 4.8503086419753094e-05, "loss": 0.079, "step": 535 }, { "epoch": 0.33220547523838817, "grad_norm": 0.2314453125, "learning_rate": 4.8487654320987656e-05, "loss": 0.0666, "step": 540 }, { "epoch": 0.33528145186096586, "grad_norm": 0.240234375, "learning_rate": 4.8472222222222224e-05, "loss": 0.0674, "step": 545 }, { "epoch": 0.3383574284835435, "grad_norm": 0.2109375, "learning_rate": 4.845679012345679e-05, "loss": 0.0659, "step": 550 }, { "epoch": 0.3414334051061212, "grad_norm": 0.2255859375, "learning_rate": 4.8441358024691354e-05, "loss": 0.0627, "step": 555 }, { "epoch": 0.3445093817286989, "grad_norm": 0.224609375, "learning_rate": 4.842592592592593e-05, "loss": 0.0637, "step": 560 }, { "epoch": 0.3475853583512765, "grad_norm": 0.2216796875, "learning_rate": 4.84104938271605e-05, "loss": 0.0643, "step": 565 }, { "epoch": 0.3506613349738542, "grad_norm": 0.2255859375, "learning_rate": 4.8395061728395067e-05, "loss": 0.0625, "step": 570 }, { "epoch": 0.35373731159643185, "grad_norm": 0.2119140625, "learning_rate": 4.837962962962963e-05, "loss": 0.0601, "step": 575 }, { "epoch": 0.35681328821900954, "grad_norm": 0.23046875, "learning_rate": 4.8364197530864204e-05, "loss": 0.0634, "step": 580 }, { "epoch": 0.35988926484158723, "grad_norm": 0.265625, "learning_rate": 4.834876543209877e-05, "loss": 0.0712, "step": 585 }, { "epoch": 0.36296524146416487, "grad_norm": 0.25390625, "learning_rate": 4.8333333333333334e-05, "loss": 0.0642, "step": 590 }, { "epoch": 0.36604121808674256, "grad_norm": 0.263671875, "learning_rate": 4.83179012345679e-05, "loss": 0.0655, "step": 595 }, { "epoch": 0.3691171947093202, "grad_norm": 0.2373046875, "learning_rate": 4.830246913580247e-05, "loss": 0.0689, "step": 600 }, { "epoch": 0.3721931713318979, "grad_norm": 0.2392578125, "learning_rate": 4.828703703703704e-05, "loss": 0.0624, "step": 605 }, { "epoch": 0.3752691479544755, "grad_norm": 0.25, "learning_rate": 4.827160493827161e-05, "loss": 0.0741, "step": 610 }, { "epoch": 0.3783451245770532, "grad_norm": 0.2314453125, "learning_rate": 4.8256172839506176e-05, "loss": 0.0704, "step": 615 }, { "epoch": 0.3814211011996309, "grad_norm": 0.259765625, "learning_rate": 4.8240740740740744e-05, "loss": 0.0658, "step": 620 }, { "epoch": 0.38449707782220854, "grad_norm": 0.2177734375, "learning_rate": 4.8225308641975306e-05, "loss": 0.0651, "step": 625 }, { "epoch": 0.38757305444478624, "grad_norm": 0.27734375, "learning_rate": 4.820987654320988e-05, "loss": 0.0667, "step": 630 }, { "epoch": 0.39064903106736387, "grad_norm": 0.2314453125, "learning_rate": 4.819444444444445e-05, "loss": 0.0668, "step": 635 }, { "epoch": 0.39372500768994156, "grad_norm": 0.2314453125, "learning_rate": 4.817901234567901e-05, "loss": 0.0624, "step": 640 }, { "epoch": 0.3968009843125192, "grad_norm": 0.236328125, "learning_rate": 4.816358024691358e-05, "loss": 0.0632, "step": 645 }, { "epoch": 0.3998769609350969, "grad_norm": 0.2275390625, "learning_rate": 4.814814814814815e-05, "loss": 0.0687, "step": 650 }, { "epoch": 0.4029529375576746, "grad_norm": 0.2265625, "learning_rate": 4.8132716049382723e-05, "loss": 0.0661, "step": 655 }, { "epoch": 0.4060289141802522, "grad_norm": 0.2353515625, "learning_rate": 4.8117283950617285e-05, "loss": 0.0666, "step": 660 }, { "epoch": 0.4091048908028299, "grad_norm": 0.21875, "learning_rate": 4.8101851851851854e-05, "loss": 0.0654, "step": 665 }, { "epoch": 0.41218086742540755, "grad_norm": 0.2333984375, "learning_rate": 4.808641975308642e-05, "loss": 0.0605, "step": 670 }, { "epoch": 0.41525684404798524, "grad_norm": 0.21484375, "learning_rate": 4.807098765432099e-05, "loss": 0.0643, "step": 675 }, { "epoch": 0.4183328206705629, "grad_norm": 0.2236328125, "learning_rate": 4.805555555555556e-05, "loss": 0.0662, "step": 680 }, { "epoch": 0.42140879729314057, "grad_norm": 0.2197265625, "learning_rate": 4.804012345679013e-05, "loss": 0.0636, "step": 685 }, { "epoch": 0.42448477391571826, "grad_norm": 0.21875, "learning_rate": 4.8024691358024696e-05, "loss": 0.0625, "step": 690 }, { "epoch": 0.4275607505382959, "grad_norm": 0.232421875, "learning_rate": 4.800925925925926e-05, "loss": 0.0715, "step": 695 }, { "epoch": 0.4306367271608736, "grad_norm": 0.232421875, "learning_rate": 4.799382716049383e-05, "loss": 0.0672, "step": 700 }, { "epoch": 0.4337127037834512, "grad_norm": 0.240234375, "learning_rate": 4.79783950617284e-05, "loss": 0.0691, "step": 705 }, { "epoch": 0.4367886804060289, "grad_norm": 0.2578125, "learning_rate": 4.796296296296296e-05, "loss": 0.0658, "step": 710 }, { "epoch": 0.4398646570286066, "grad_norm": 0.212890625, "learning_rate": 4.794753086419753e-05, "loss": 0.0598, "step": 715 }, { "epoch": 0.44294063365118425, "grad_norm": 0.232421875, "learning_rate": 4.79320987654321e-05, "loss": 0.0706, "step": 720 }, { "epoch": 0.44601661027376194, "grad_norm": 0.2109375, "learning_rate": 4.791666666666667e-05, "loss": 0.0629, "step": 725 }, { "epoch": 0.4490925868963396, "grad_norm": 0.2421875, "learning_rate": 4.7901234567901237e-05, "loss": 0.0762, "step": 730 }, { "epoch": 0.45216856351891727, "grad_norm": 0.2236328125, "learning_rate": 4.7885802469135805e-05, "loss": 0.06, "step": 735 }, { "epoch": 0.4552445401414949, "grad_norm": 0.2265625, "learning_rate": 4.7870370370370373e-05, "loss": 0.0655, "step": 740 }, { "epoch": 0.4583205167640726, "grad_norm": 0.287109375, "learning_rate": 4.785493827160494e-05, "loss": 0.0667, "step": 745 }, { "epoch": 0.4613964933866503, "grad_norm": 0.20703125, "learning_rate": 4.783950617283951e-05, "loss": 0.0679, "step": 750 }, { "epoch": 0.4644724700092279, "grad_norm": 0.240234375, "learning_rate": 4.782407407407408e-05, "loss": 0.0691, "step": 755 }, { "epoch": 0.4675484466318056, "grad_norm": 0.2119140625, "learning_rate": 4.780864197530864e-05, "loss": 0.0639, "step": 760 }, { "epoch": 0.47062442325438325, "grad_norm": 0.2333984375, "learning_rate": 4.779320987654321e-05, "loss": 0.0665, "step": 765 }, { "epoch": 0.47370039987696094, "grad_norm": 0.255859375, "learning_rate": 4.7777777777777784e-05, "loss": 0.0637, "step": 770 }, { "epoch": 0.4767763764995386, "grad_norm": 0.205078125, "learning_rate": 4.7762345679012346e-05, "loss": 0.0629, "step": 775 }, { "epoch": 0.47985235312211627, "grad_norm": 0.265625, "learning_rate": 4.7746913580246914e-05, "loss": 0.0716, "step": 780 }, { "epoch": 0.48292832974469396, "grad_norm": 0.2265625, "learning_rate": 4.773148148148148e-05, "loss": 0.0585, "step": 785 }, { "epoch": 0.4860043063672716, "grad_norm": 0.216796875, "learning_rate": 4.771604938271605e-05, "loss": 0.0662, "step": 790 }, { "epoch": 0.4890802829898493, "grad_norm": 0.23828125, "learning_rate": 4.770061728395062e-05, "loss": 0.0678, "step": 795 }, { "epoch": 0.4921562596124269, "grad_norm": 0.2158203125, "learning_rate": 4.768518518518519e-05, "loss": 0.0676, "step": 800 }, { "epoch": 0.4952322362350046, "grad_norm": 0.21875, "learning_rate": 4.7669753086419756e-05, "loss": 0.0717, "step": 805 }, { "epoch": 0.49830821285758226, "grad_norm": 0.2431640625, "learning_rate": 4.7654320987654325e-05, "loss": 0.0642, "step": 810 }, { "epoch": 0.50138418948016, "grad_norm": 0.205078125, "learning_rate": 4.7638888888888887e-05, "loss": 0.0665, "step": 815 }, { "epoch": 0.5044601661027376, "grad_norm": 0.19921875, "learning_rate": 4.762345679012346e-05, "loss": 0.0597, "step": 820 }, { "epoch": 0.5075361427253153, "grad_norm": 0.2041015625, "learning_rate": 4.760802469135803e-05, "loss": 0.0643, "step": 825 }, { "epoch": 0.510612119347893, "grad_norm": 0.228515625, "learning_rate": 4.759259259259259e-05, "loss": 0.0605, "step": 830 }, { "epoch": 0.5136880959704706, "grad_norm": 0.23046875, "learning_rate": 4.757716049382716e-05, "loss": 0.0679, "step": 835 }, { "epoch": 0.5167640725930482, "grad_norm": 0.263671875, "learning_rate": 4.7561728395061736e-05, "loss": 0.0646, "step": 840 }, { "epoch": 0.519840049215626, "grad_norm": 0.267578125, "learning_rate": 4.75462962962963e-05, "loss": 0.071, "step": 845 }, { "epoch": 0.5229160258382036, "grad_norm": 0.2177734375, "learning_rate": 4.7530864197530866e-05, "loss": 0.0617, "step": 850 }, { "epoch": 0.5259920024607813, "grad_norm": 0.1982421875, "learning_rate": 4.7515432098765434e-05, "loss": 0.0606, "step": 855 }, { "epoch": 0.529067979083359, "grad_norm": 0.2197265625, "learning_rate": 4.75e-05, "loss": 0.0625, "step": 860 }, { "epoch": 0.5321439557059366, "grad_norm": 0.23828125, "learning_rate": 4.748456790123457e-05, "loss": 0.063, "step": 865 }, { "epoch": 0.5352199323285143, "grad_norm": 0.2490234375, "learning_rate": 4.746913580246914e-05, "loss": 0.0698, "step": 870 }, { "epoch": 0.5382959089510919, "grad_norm": 0.234375, "learning_rate": 4.745370370370371e-05, "loss": 0.0636, "step": 875 }, { "epoch": 0.5413718855736697, "grad_norm": 0.2578125, "learning_rate": 4.743827160493827e-05, "loss": 0.0634, "step": 880 }, { "epoch": 0.5444478621962473, "grad_norm": 0.2158203125, "learning_rate": 4.742283950617284e-05, "loss": 0.0639, "step": 885 }, { "epoch": 0.5475238388188249, "grad_norm": 0.25390625, "learning_rate": 4.740740740740741e-05, "loss": 0.0635, "step": 890 }, { "epoch": 0.5505998154414027, "grad_norm": 0.24609375, "learning_rate": 4.7391975308641975e-05, "loss": 0.0685, "step": 895 }, { "epoch": 0.5536757920639803, "grad_norm": 0.2412109375, "learning_rate": 4.7376543209876543e-05, "loss": 0.067, "step": 900 }, { "epoch": 0.556751768686558, "grad_norm": 0.2041015625, "learning_rate": 4.736111111111111e-05, "loss": 0.0577, "step": 905 }, { "epoch": 0.5598277453091357, "grad_norm": 0.2236328125, "learning_rate": 4.734567901234569e-05, "loss": 0.0653, "step": 910 }, { "epoch": 0.5629037219317133, "grad_norm": 0.232421875, "learning_rate": 4.733024691358025e-05, "loss": 0.0741, "step": 915 }, { "epoch": 0.565979698554291, "grad_norm": 0.2001953125, "learning_rate": 4.731481481481482e-05, "loss": 0.0579, "step": 920 }, { "epoch": 0.5690556751768686, "grad_norm": 0.2021484375, "learning_rate": 4.7299382716049386e-05, "loss": 0.0693, "step": 925 }, { "epoch": 0.5721316517994464, "grad_norm": 0.2197265625, "learning_rate": 4.7283950617283954e-05, "loss": 0.0604, "step": 930 }, { "epoch": 0.575207628422024, "grad_norm": 0.2109375, "learning_rate": 4.726851851851852e-05, "loss": 0.063, "step": 935 }, { "epoch": 0.5782836050446016, "grad_norm": 0.2353515625, "learning_rate": 4.725308641975309e-05, "loss": 0.0604, "step": 940 }, { "epoch": 0.5813595816671794, "grad_norm": 0.208984375, "learning_rate": 4.723765432098766e-05, "loss": 0.0605, "step": 945 }, { "epoch": 0.584435558289757, "grad_norm": 0.2373046875, "learning_rate": 4.722222222222222e-05, "loss": 0.071, "step": 950 }, { "epoch": 0.5875115349123347, "grad_norm": 0.21875, "learning_rate": 4.720679012345679e-05, "loss": 0.0646, "step": 955 }, { "epoch": 0.5905875115349123, "grad_norm": 0.21484375, "learning_rate": 4.7191358024691365e-05, "loss": 0.0611, "step": 960 }, { "epoch": 0.59366348815749, "grad_norm": 0.2255859375, "learning_rate": 4.7175925925925926e-05, "loss": 0.0601, "step": 965 }, { "epoch": 0.5967394647800677, "grad_norm": 0.212890625, "learning_rate": 4.7160493827160495e-05, "loss": 0.0653, "step": 970 }, { "epoch": 0.5998154414026453, "grad_norm": 0.2080078125, "learning_rate": 4.714506172839506e-05, "loss": 0.0594, "step": 975 }, { "epoch": 0.6028914180252231, "grad_norm": 0.251953125, "learning_rate": 4.712962962962963e-05, "loss": 0.0646, "step": 980 }, { "epoch": 0.6059673946478007, "grad_norm": 0.2431640625, "learning_rate": 4.71141975308642e-05, "loss": 0.0649, "step": 985 }, { "epoch": 0.6090433712703783, "grad_norm": 0.21875, "learning_rate": 4.709876543209877e-05, "loss": 0.0653, "step": 990 }, { "epoch": 0.612119347892956, "grad_norm": 0.232421875, "learning_rate": 4.708333333333334e-05, "loss": 0.0621, "step": 995 }, { "epoch": 0.6151953245155337, "grad_norm": 0.2392578125, "learning_rate": 4.70679012345679e-05, "loss": 0.0641, "step": 1000 }, { "epoch": 0.6182713011381114, "grad_norm": 0.21484375, "learning_rate": 4.7052469135802474e-05, "loss": 0.068, "step": 1005 }, { "epoch": 0.621347277760689, "grad_norm": 0.23046875, "learning_rate": 4.703703703703704e-05, "loss": 0.0681, "step": 1010 }, { "epoch": 0.6244232543832667, "grad_norm": 0.2138671875, "learning_rate": 4.7021604938271604e-05, "loss": 0.0675, "step": 1015 }, { "epoch": 0.6274992310058444, "grad_norm": 0.248046875, "learning_rate": 4.700617283950617e-05, "loss": 0.0695, "step": 1020 }, { "epoch": 0.630575207628422, "grad_norm": 0.23046875, "learning_rate": 4.699074074074074e-05, "loss": 0.0627, "step": 1025 }, { "epoch": 0.6336511842509996, "grad_norm": 0.2177734375, "learning_rate": 4.6975308641975316e-05, "loss": 0.0687, "step": 1030 }, { "epoch": 0.6367271608735774, "grad_norm": 0.2392578125, "learning_rate": 4.695987654320988e-05, "loss": 0.0658, "step": 1035 }, { "epoch": 0.639803137496155, "grad_norm": 0.216796875, "learning_rate": 4.6944444444444446e-05, "loss": 0.0644, "step": 1040 }, { "epoch": 0.6428791141187327, "grad_norm": 0.234375, "learning_rate": 4.6929012345679015e-05, "loss": 0.0771, "step": 1045 }, { "epoch": 0.6459550907413104, "grad_norm": 0.2490234375, "learning_rate": 4.691358024691358e-05, "loss": 0.0635, "step": 1050 }, { "epoch": 0.649031067363888, "grad_norm": 0.212890625, "learning_rate": 4.689814814814815e-05, "loss": 0.0644, "step": 1055 }, { "epoch": 0.6521070439864657, "grad_norm": 0.220703125, "learning_rate": 4.688271604938272e-05, "loss": 0.0639, "step": 1060 }, { "epoch": 0.6551830206090433, "grad_norm": 0.2275390625, "learning_rate": 4.686728395061729e-05, "loss": 0.0678, "step": 1065 }, { "epoch": 0.6582589972316211, "grad_norm": 0.22265625, "learning_rate": 4.685185185185185e-05, "loss": 0.0621, "step": 1070 }, { "epoch": 0.6613349738541987, "grad_norm": 0.21875, "learning_rate": 4.6836419753086425e-05, "loss": 0.0635, "step": 1075 }, { "epoch": 0.6644109504767763, "grad_norm": 0.234375, "learning_rate": 4.6820987654320994e-05, "loss": 0.0656, "step": 1080 }, { "epoch": 0.6674869270993541, "grad_norm": 0.232421875, "learning_rate": 4.6805555555555556e-05, "loss": 0.0625, "step": 1085 }, { "epoch": 0.6705629037219317, "grad_norm": 0.19921875, "learning_rate": 4.6790123456790124e-05, "loss": 0.0613, "step": 1090 }, { "epoch": 0.6736388803445094, "grad_norm": 0.21484375, "learning_rate": 4.677469135802469e-05, "loss": 0.0584, "step": 1095 }, { "epoch": 0.676714856967087, "grad_norm": 0.216796875, "learning_rate": 4.675925925925926e-05, "loss": 0.0593, "step": 1100 }, { "epoch": 0.6797908335896647, "grad_norm": 0.2294921875, "learning_rate": 4.674382716049383e-05, "loss": 0.0694, "step": 1105 }, { "epoch": 0.6828668102122424, "grad_norm": 0.2431640625, "learning_rate": 4.67283950617284e-05, "loss": 0.0648, "step": 1110 }, { "epoch": 0.68594278683482, "grad_norm": 0.2314453125, "learning_rate": 4.6712962962962966e-05, "loss": 0.0661, "step": 1115 }, { "epoch": 0.6890187634573978, "grad_norm": 0.251953125, "learning_rate": 4.669753086419753e-05, "loss": 0.0642, "step": 1120 }, { "epoch": 0.6920947400799754, "grad_norm": 0.203125, "learning_rate": 4.66820987654321e-05, "loss": 0.0685, "step": 1125 }, { "epoch": 0.695170716702553, "grad_norm": 0.267578125, "learning_rate": 4.666666666666667e-05, "loss": 0.0632, "step": 1130 }, { "epoch": 0.6982466933251307, "grad_norm": 0.23046875, "learning_rate": 4.665123456790123e-05, "loss": 0.0652, "step": 1135 }, { "epoch": 0.7013226699477084, "grad_norm": 0.2265625, "learning_rate": 4.66358024691358e-05, "loss": 0.0594, "step": 1140 }, { "epoch": 0.7043986465702861, "grad_norm": 0.251953125, "learning_rate": 4.662037037037038e-05, "loss": 0.0664, "step": 1145 }, { "epoch": 0.7074746231928637, "grad_norm": 0.232421875, "learning_rate": 4.6604938271604945e-05, "loss": 0.0688, "step": 1150 }, { "epoch": 0.7105505998154414, "grad_norm": 0.21875, "learning_rate": 4.658950617283951e-05, "loss": 0.0668, "step": 1155 }, { "epoch": 0.7136265764380191, "grad_norm": 0.23828125, "learning_rate": 4.6574074074074076e-05, "loss": 0.0648, "step": 1160 }, { "epoch": 0.7167025530605967, "grad_norm": 0.224609375, "learning_rate": 4.6558641975308644e-05, "loss": 0.0667, "step": 1165 }, { "epoch": 0.7197785296831745, "grad_norm": 0.203125, "learning_rate": 4.654320987654321e-05, "loss": 0.0594, "step": 1170 }, { "epoch": 0.7228545063057521, "grad_norm": 0.23046875, "learning_rate": 4.652777777777778e-05, "loss": 0.0653, "step": 1175 }, { "epoch": 0.7259304829283297, "grad_norm": 0.2275390625, "learning_rate": 4.651234567901235e-05, "loss": 0.0672, "step": 1180 }, { "epoch": 0.7290064595509074, "grad_norm": 0.2490234375, "learning_rate": 4.649691358024692e-05, "loss": 0.0637, "step": 1185 }, { "epoch": 0.7320824361734851, "grad_norm": 0.220703125, "learning_rate": 4.648148148148148e-05, "loss": 0.0657, "step": 1190 }, { "epoch": 0.7351584127960628, "grad_norm": 0.228515625, "learning_rate": 4.6466049382716055e-05, "loss": 0.0615, "step": 1195 }, { "epoch": 0.7382343894186404, "grad_norm": 0.2294921875, "learning_rate": 4.645061728395062e-05, "loss": 0.0666, "step": 1200 }, { "epoch": 0.7413103660412181, "grad_norm": 0.1982421875, "learning_rate": 4.6435185185185185e-05, "loss": 0.0603, "step": 1205 }, { "epoch": 0.7443863426637958, "grad_norm": 0.2138671875, "learning_rate": 4.641975308641975e-05, "loss": 0.066, "step": 1210 }, { "epoch": 0.7474623192863734, "grad_norm": 0.208984375, "learning_rate": 4.640432098765432e-05, "loss": 0.0586, "step": 1215 }, { "epoch": 0.750538295908951, "grad_norm": 0.185546875, "learning_rate": 4.638888888888889e-05, "loss": 0.0583, "step": 1220 }, { "epoch": 0.7536142725315288, "grad_norm": 0.2138671875, "learning_rate": 4.637345679012346e-05, "loss": 0.0731, "step": 1225 }, { "epoch": 0.7566902491541064, "grad_norm": 0.22265625, "learning_rate": 4.635802469135803e-05, "loss": 0.0575, "step": 1230 }, { "epoch": 0.7597662257766841, "grad_norm": 0.2099609375, "learning_rate": 4.6342592592592595e-05, "loss": 0.0655, "step": 1235 }, { "epoch": 0.7628422023992618, "grad_norm": 0.20703125, "learning_rate": 4.6327160493827164e-05, "loss": 0.0619, "step": 1240 }, { "epoch": 0.7659181790218395, "grad_norm": 0.2373046875, "learning_rate": 4.631172839506173e-05, "loss": 0.0593, "step": 1245 }, { "epoch": 0.7689941556444171, "grad_norm": 0.1962890625, "learning_rate": 4.62962962962963e-05, "loss": 0.0614, "step": 1250 }, { "epoch": 0.7720701322669947, "grad_norm": 0.2119140625, "learning_rate": 4.628086419753086e-05, "loss": 0.0596, "step": 1255 }, { "epoch": 0.7751461088895725, "grad_norm": 0.2041015625, "learning_rate": 4.626543209876543e-05, "loss": 0.0607, "step": 1260 }, { "epoch": 0.7782220855121501, "grad_norm": 0.2275390625, "learning_rate": 4.6250000000000006e-05, "loss": 0.0687, "step": 1265 }, { "epoch": 0.7812980621347277, "grad_norm": 0.20703125, "learning_rate": 4.623456790123457e-05, "loss": 0.0619, "step": 1270 }, { "epoch": 0.7843740387573055, "grad_norm": 0.2353515625, "learning_rate": 4.6219135802469136e-05, "loss": 0.0689, "step": 1275 }, { "epoch": 0.7874500153798831, "grad_norm": 0.2158203125, "learning_rate": 4.6203703703703705e-05, "loss": 0.0592, "step": 1280 }, { "epoch": 0.7905259920024608, "grad_norm": 0.2197265625, "learning_rate": 4.618827160493827e-05, "loss": 0.0661, "step": 1285 }, { "epoch": 0.7936019686250384, "grad_norm": 0.228515625, "learning_rate": 4.617283950617284e-05, "loss": 0.0622, "step": 1290 }, { "epoch": 0.7966779452476161, "grad_norm": 0.2255859375, "learning_rate": 4.615740740740741e-05, "loss": 0.0618, "step": 1295 }, { "epoch": 0.7997539218701938, "grad_norm": 0.2216796875, "learning_rate": 4.614197530864198e-05, "loss": 0.0741, "step": 1300 }, { "epoch": 0.8028298984927714, "grad_norm": 0.240234375, "learning_rate": 4.612654320987655e-05, "loss": 0.0678, "step": 1305 }, { "epoch": 0.8059058751153492, "grad_norm": 0.21484375, "learning_rate": 4.6111111111111115e-05, "loss": 0.0578, "step": 1310 }, { "epoch": 0.8089818517379268, "grad_norm": 0.220703125, "learning_rate": 4.6095679012345684e-05, "loss": 0.0646, "step": 1315 }, { "epoch": 0.8120578283605044, "grad_norm": 0.263671875, "learning_rate": 4.608024691358025e-05, "loss": 0.063, "step": 1320 }, { "epoch": 0.8151338049830821, "grad_norm": 0.2314453125, "learning_rate": 4.6064814814814814e-05, "loss": 0.0661, "step": 1325 }, { "epoch": 0.8182097816056598, "grad_norm": 0.1923828125, "learning_rate": 4.604938271604938e-05, "loss": 0.0597, "step": 1330 }, { "epoch": 0.8212857582282375, "grad_norm": 0.228515625, "learning_rate": 4.603395061728396e-05, "loss": 0.0655, "step": 1335 }, { "epoch": 0.8243617348508151, "grad_norm": 0.26171875, "learning_rate": 4.601851851851852e-05, "loss": 0.0644, "step": 1340 }, { "epoch": 0.8274377114733928, "grad_norm": 0.205078125, "learning_rate": 4.600308641975309e-05, "loss": 0.0646, "step": 1345 }, { "epoch": 0.8305136880959705, "grad_norm": 0.2392578125, "learning_rate": 4.5987654320987656e-05, "loss": 0.0594, "step": 1350 }, { "epoch": 0.8335896647185481, "grad_norm": 0.23828125, "learning_rate": 4.5972222222222225e-05, "loss": 0.0622, "step": 1355 }, { "epoch": 0.8366656413411258, "grad_norm": 0.2431640625, "learning_rate": 4.595679012345679e-05, "loss": 0.0707, "step": 1360 }, { "epoch": 0.8397416179637035, "grad_norm": 0.2314453125, "learning_rate": 4.594135802469136e-05, "loss": 0.0558, "step": 1365 }, { "epoch": 0.8428175945862811, "grad_norm": 0.2314453125, "learning_rate": 4.592592592592593e-05, "loss": 0.0581, "step": 1370 }, { "epoch": 0.8458935712088588, "grad_norm": 0.25, "learning_rate": 4.591049382716049e-05, "loss": 0.066, "step": 1375 }, { "epoch": 0.8489695478314365, "grad_norm": 0.2021484375, "learning_rate": 4.589506172839506e-05, "loss": 0.0591, "step": 1380 }, { "epoch": 0.8520455244540142, "grad_norm": 0.2333984375, "learning_rate": 4.5879629629629635e-05, "loss": 0.0734, "step": 1385 }, { "epoch": 0.8551215010765918, "grad_norm": 0.2470703125, "learning_rate": 4.58641975308642e-05, "loss": 0.0693, "step": 1390 }, { "epoch": 0.8581974776991694, "grad_norm": 0.2333984375, "learning_rate": 4.5848765432098765e-05, "loss": 0.0652, "step": 1395 }, { "epoch": 0.8612734543217472, "grad_norm": 0.2333984375, "learning_rate": 4.5833333333333334e-05, "loss": 0.0626, "step": 1400 }, { "epoch": 0.8643494309443248, "grad_norm": 0.2314453125, "learning_rate": 4.581790123456791e-05, "loss": 0.0643, "step": 1405 }, { "epoch": 0.8674254075669025, "grad_norm": 0.2314453125, "learning_rate": 4.580246913580247e-05, "loss": 0.0647, "step": 1410 }, { "epoch": 0.8705013841894802, "grad_norm": 0.228515625, "learning_rate": 4.578703703703704e-05, "loss": 0.0665, "step": 1415 }, { "epoch": 0.8735773608120578, "grad_norm": 0.2197265625, "learning_rate": 4.577160493827161e-05, "loss": 0.0701, "step": 1420 }, { "epoch": 0.8766533374346355, "grad_norm": 0.1953125, "learning_rate": 4.5756172839506176e-05, "loss": 0.0589, "step": 1425 }, { "epoch": 0.8797293140572132, "grad_norm": 0.203125, "learning_rate": 4.5740740740740745e-05, "loss": 0.0633, "step": 1430 }, { "epoch": 0.8828052906797909, "grad_norm": 0.2275390625, "learning_rate": 4.572530864197531e-05, "loss": 0.0633, "step": 1435 }, { "epoch": 0.8858812673023685, "grad_norm": 0.2470703125, "learning_rate": 4.570987654320988e-05, "loss": 0.0598, "step": 1440 }, { "epoch": 0.8889572439249461, "grad_norm": 0.2265625, "learning_rate": 4.569444444444444e-05, "loss": 0.0651, "step": 1445 }, { "epoch": 0.8920332205475239, "grad_norm": 0.2119140625, "learning_rate": 4.567901234567901e-05, "loss": 0.0602, "step": 1450 }, { "epoch": 0.8951091971701015, "grad_norm": 0.2099609375, "learning_rate": 4.566358024691359e-05, "loss": 0.0586, "step": 1455 }, { "epoch": 0.8981851737926791, "grad_norm": 0.23828125, "learning_rate": 4.564814814814815e-05, "loss": 0.0683, "step": 1460 }, { "epoch": 0.9012611504152569, "grad_norm": 0.2353515625, "learning_rate": 4.563271604938272e-05, "loss": 0.0602, "step": 1465 }, { "epoch": 0.9043371270378345, "grad_norm": 0.2099609375, "learning_rate": 4.5617283950617285e-05, "loss": 0.0682, "step": 1470 }, { "epoch": 0.9074131036604122, "grad_norm": 0.21875, "learning_rate": 4.5601851851851854e-05, "loss": 0.0631, "step": 1475 }, { "epoch": 0.9104890802829898, "grad_norm": 0.24609375, "learning_rate": 4.558641975308642e-05, "loss": 0.0695, "step": 1480 }, { "epoch": 0.9135650569055676, "grad_norm": 0.2001953125, "learning_rate": 4.557098765432099e-05, "loss": 0.0624, "step": 1485 }, { "epoch": 0.9166410335281452, "grad_norm": 0.197265625, "learning_rate": 4.555555555555556e-05, "loss": 0.0598, "step": 1490 }, { "epoch": 0.9197170101507228, "grad_norm": 0.2158203125, "learning_rate": 4.554012345679012e-05, "loss": 0.065, "step": 1495 }, { "epoch": 0.9227929867733006, "grad_norm": 0.2041015625, "learning_rate": 4.5524691358024696e-05, "loss": 0.067, "step": 1500 }, { "epoch": 0.9258689633958782, "grad_norm": 0.2392578125, "learning_rate": 4.5509259259259264e-05, "loss": 0.0631, "step": 1505 }, { "epoch": 0.9289449400184558, "grad_norm": 0.1865234375, "learning_rate": 4.5493827160493826e-05, "loss": 0.061, "step": 1510 }, { "epoch": 0.9320209166410335, "grad_norm": 0.2275390625, "learning_rate": 4.5478395061728395e-05, "loss": 0.0675, "step": 1515 }, { "epoch": 0.9350968932636112, "grad_norm": 0.2451171875, "learning_rate": 4.546296296296296e-05, "loss": 0.0658, "step": 1520 }, { "epoch": 0.9381728698861889, "grad_norm": 0.232421875, "learning_rate": 4.544753086419754e-05, "loss": 0.0634, "step": 1525 }, { "epoch": 0.9412488465087665, "grad_norm": 0.2265625, "learning_rate": 4.54320987654321e-05, "loss": 0.0629, "step": 1530 }, { "epoch": 0.9443248231313442, "grad_norm": 0.2099609375, "learning_rate": 4.541666666666667e-05, "loss": 0.0593, "step": 1535 }, { "epoch": 0.9474007997539219, "grad_norm": 0.240234375, "learning_rate": 4.540123456790124e-05, "loss": 0.071, "step": 1540 }, { "epoch": 0.9504767763764995, "grad_norm": 0.240234375, "learning_rate": 4.5385802469135805e-05, "loss": 0.0628, "step": 1545 }, { "epoch": 0.9535527529990772, "grad_norm": 0.2060546875, "learning_rate": 4.5370370370370374e-05, "loss": 0.0642, "step": 1550 }, { "epoch": 0.9566287296216549, "grad_norm": 0.19140625, "learning_rate": 4.535493827160494e-05, "loss": 0.0597, "step": 1555 }, { "epoch": 0.9597047062442325, "grad_norm": 0.189453125, "learning_rate": 4.533950617283951e-05, "loss": 0.0631, "step": 1560 }, { "epoch": 0.9627806828668102, "grad_norm": 0.1904296875, "learning_rate": 4.532407407407407e-05, "loss": 0.0625, "step": 1565 }, { "epoch": 0.9658566594893879, "grad_norm": 0.212890625, "learning_rate": 4.530864197530865e-05, "loss": 0.0613, "step": 1570 }, { "epoch": 0.9689326361119656, "grad_norm": 0.18359375, "learning_rate": 4.5293209876543216e-05, "loss": 0.0656, "step": 1575 }, { "epoch": 0.9720086127345432, "grad_norm": 0.2373046875, "learning_rate": 4.527777777777778e-05, "loss": 0.0653, "step": 1580 }, { "epoch": 0.9750845893571208, "grad_norm": 0.23828125, "learning_rate": 4.5262345679012346e-05, "loss": 0.0627, "step": 1585 }, { "epoch": 0.9781605659796986, "grad_norm": 0.2294921875, "learning_rate": 4.5246913580246914e-05, "loss": 0.0665, "step": 1590 }, { "epoch": 0.9812365426022762, "grad_norm": 0.201171875, "learning_rate": 4.523148148148148e-05, "loss": 0.0682, "step": 1595 }, { "epoch": 0.9843125192248539, "grad_norm": 0.2177734375, "learning_rate": 4.521604938271605e-05, "loss": 0.0647, "step": 1600 }, { "epoch": 0.9873884958474316, "grad_norm": 0.212890625, "learning_rate": 4.520061728395062e-05, "loss": 0.0645, "step": 1605 }, { "epoch": 0.9904644724700092, "grad_norm": 0.2236328125, "learning_rate": 4.518518518518519e-05, "loss": 0.0601, "step": 1610 }, { "epoch": 0.9935404490925869, "grad_norm": 0.224609375, "learning_rate": 4.516975308641975e-05, "loss": 0.0639, "step": 1615 }, { "epoch": 0.9966164257151645, "grad_norm": 0.2236328125, "learning_rate": 4.5154320987654325e-05, "loss": 0.0583, "step": 1620 }, { "epoch": 0.9996924023377423, "grad_norm": 0.2177734375, "learning_rate": 4.5138888888888894e-05, "loss": 0.064, "step": 1625 }, { "epoch": 1.00276837896032, "grad_norm": 0.21484375, "learning_rate": 4.5123456790123455e-05, "loss": 0.0591, "step": 1630 }, { "epoch": 1.0058443555828975, "grad_norm": 0.267578125, "learning_rate": 4.5108024691358024e-05, "loss": 0.054, "step": 1635 }, { "epoch": 1.0089203322054752, "grad_norm": 0.220703125, "learning_rate": 4.50925925925926e-05, "loss": 0.0522, "step": 1640 }, { "epoch": 1.0119963088280528, "grad_norm": 0.224609375, "learning_rate": 4.507716049382717e-05, "loss": 0.0568, "step": 1645 }, { "epoch": 1.0150722854506307, "grad_norm": 0.2265625, "learning_rate": 4.506172839506173e-05, "loss": 0.0482, "step": 1650 }, { "epoch": 1.0181482620732083, "grad_norm": 0.19921875, "learning_rate": 4.50462962962963e-05, "loss": 0.0472, "step": 1655 }, { "epoch": 1.021224238695786, "grad_norm": 0.2158203125, "learning_rate": 4.5030864197530866e-05, "loss": 0.0504, "step": 1660 }, { "epoch": 1.0243002153183636, "grad_norm": 0.19140625, "learning_rate": 4.5015432098765434e-05, "loss": 0.0538, "step": 1665 }, { "epoch": 1.0273761919409412, "grad_norm": 0.1943359375, "learning_rate": 4.5e-05, "loss": 0.0505, "step": 1670 }, { "epoch": 1.0304521685635188, "grad_norm": 0.220703125, "learning_rate": 4.498456790123457e-05, "loss": 0.0609, "step": 1675 }, { "epoch": 1.0335281451860965, "grad_norm": 0.220703125, "learning_rate": 4.496913580246914e-05, "loss": 0.0509, "step": 1680 }, { "epoch": 1.0366041218086743, "grad_norm": 0.255859375, "learning_rate": 4.49537037037037e-05, "loss": 0.056, "step": 1685 }, { "epoch": 1.039680098431252, "grad_norm": 0.23046875, "learning_rate": 4.493827160493828e-05, "loss": 0.0555, "step": 1690 }, { "epoch": 1.0427560750538296, "grad_norm": 0.23046875, "learning_rate": 4.4922839506172845e-05, "loss": 0.0531, "step": 1695 }, { "epoch": 1.0458320516764072, "grad_norm": 0.22265625, "learning_rate": 4.490740740740741e-05, "loss": 0.0496, "step": 1700 }, { "epoch": 1.0489080282989849, "grad_norm": 0.21484375, "learning_rate": 4.4891975308641975e-05, "loss": 0.0529, "step": 1705 }, { "epoch": 1.0519840049215625, "grad_norm": 0.255859375, "learning_rate": 4.4876543209876544e-05, "loss": 0.0581, "step": 1710 }, { "epoch": 1.0550599815441402, "grad_norm": 0.1962890625, "learning_rate": 4.486111111111111e-05, "loss": 0.0506, "step": 1715 }, { "epoch": 1.058135958166718, "grad_norm": 0.2255859375, "learning_rate": 4.484567901234568e-05, "loss": 0.0509, "step": 1720 }, { "epoch": 1.0612119347892957, "grad_norm": 0.23828125, "learning_rate": 4.483024691358025e-05, "loss": 0.0558, "step": 1725 }, { "epoch": 1.0642879114118733, "grad_norm": 0.2021484375, "learning_rate": 4.481481481481482e-05, "loss": 0.0532, "step": 1730 }, { "epoch": 1.067363888034451, "grad_norm": 0.185546875, "learning_rate": 4.4799382716049386e-05, "loss": 0.0488, "step": 1735 }, { "epoch": 1.0704398646570286, "grad_norm": 0.2158203125, "learning_rate": 4.4783950617283954e-05, "loss": 0.0442, "step": 1740 }, { "epoch": 1.0735158412796062, "grad_norm": 0.224609375, "learning_rate": 4.476851851851852e-05, "loss": 0.0508, "step": 1745 }, { "epoch": 1.0765918179021838, "grad_norm": 0.23046875, "learning_rate": 4.4753086419753084e-05, "loss": 0.051, "step": 1750 }, { "epoch": 1.0796677945247617, "grad_norm": 0.203125, "learning_rate": 4.473765432098765e-05, "loss": 0.0558, "step": 1755 }, { "epoch": 1.0827437711473393, "grad_norm": 0.2138671875, "learning_rate": 4.472222222222223e-05, "loss": 0.05, "step": 1760 }, { "epoch": 1.085819747769917, "grad_norm": 0.2197265625, "learning_rate": 4.4706790123456797e-05, "loss": 0.0558, "step": 1765 }, { "epoch": 1.0888957243924946, "grad_norm": 0.224609375, "learning_rate": 4.469135802469136e-05, "loss": 0.0525, "step": 1770 }, { "epoch": 1.0919717010150722, "grad_norm": 0.205078125, "learning_rate": 4.467592592592593e-05, "loss": 0.0476, "step": 1775 }, { "epoch": 1.0950476776376499, "grad_norm": 0.25390625, "learning_rate": 4.4660493827160495e-05, "loss": 0.0542, "step": 1780 }, { "epoch": 1.0981236542602275, "grad_norm": 0.212890625, "learning_rate": 4.4645061728395064e-05, "loss": 0.0518, "step": 1785 }, { "epoch": 1.1011996308828054, "grad_norm": 0.2275390625, "learning_rate": 4.462962962962963e-05, "loss": 0.0468, "step": 1790 }, { "epoch": 1.104275607505383, "grad_norm": 0.224609375, "learning_rate": 4.46141975308642e-05, "loss": 0.0537, "step": 1795 }, { "epoch": 1.1073515841279606, "grad_norm": 0.23046875, "learning_rate": 4.459876543209877e-05, "loss": 0.0578, "step": 1800 }, { "epoch": 1.1104275607505383, "grad_norm": 0.2265625, "learning_rate": 4.458333333333334e-05, "loss": 0.052, "step": 1805 }, { "epoch": 1.113503537373116, "grad_norm": 0.197265625, "learning_rate": 4.4567901234567906e-05, "loss": 0.052, "step": 1810 }, { "epoch": 1.1165795139956936, "grad_norm": 0.2373046875, "learning_rate": 4.4552469135802474e-05, "loss": 0.0592, "step": 1815 }, { "epoch": 1.1196554906182712, "grad_norm": 0.2080078125, "learning_rate": 4.4537037037037036e-05, "loss": 0.0533, "step": 1820 }, { "epoch": 1.122731467240849, "grad_norm": 0.2138671875, "learning_rate": 4.4521604938271604e-05, "loss": 0.0518, "step": 1825 }, { "epoch": 1.1258074438634267, "grad_norm": 0.2294921875, "learning_rate": 4.450617283950618e-05, "loss": 0.0506, "step": 1830 }, { "epoch": 1.1288834204860043, "grad_norm": 0.1962890625, "learning_rate": 4.449074074074074e-05, "loss": 0.05, "step": 1835 }, { "epoch": 1.131959397108582, "grad_norm": 0.2353515625, "learning_rate": 4.447530864197531e-05, "loss": 0.0565, "step": 1840 }, { "epoch": 1.1350353737311596, "grad_norm": 0.212890625, "learning_rate": 4.445987654320988e-05, "loss": 0.0487, "step": 1845 }, { "epoch": 1.1381113503537372, "grad_norm": 0.193359375, "learning_rate": 4.4444444444444447e-05, "loss": 0.0506, "step": 1850 }, { "epoch": 1.1411873269763149, "grad_norm": 0.1982421875, "learning_rate": 4.4429012345679015e-05, "loss": 0.0542, "step": 1855 }, { "epoch": 1.1442633035988927, "grad_norm": 0.208984375, "learning_rate": 4.4413580246913583e-05, "loss": 0.0508, "step": 1860 }, { "epoch": 1.1473392802214704, "grad_norm": 0.1953125, "learning_rate": 4.439814814814815e-05, "loss": 0.0508, "step": 1865 }, { "epoch": 1.150415256844048, "grad_norm": 0.232421875, "learning_rate": 4.4382716049382714e-05, "loss": 0.05, "step": 1870 }, { "epoch": 1.1534912334666256, "grad_norm": 0.2490234375, "learning_rate": 4.436728395061729e-05, "loss": 0.0576, "step": 1875 }, { "epoch": 1.1565672100892033, "grad_norm": 0.255859375, "learning_rate": 4.435185185185186e-05, "loss": 0.0594, "step": 1880 }, { "epoch": 1.159643186711781, "grad_norm": 0.20703125, "learning_rate": 4.433641975308642e-05, "loss": 0.0502, "step": 1885 }, { "epoch": 1.1627191633343585, "grad_norm": 0.2265625, "learning_rate": 4.432098765432099e-05, "loss": 0.0563, "step": 1890 }, { "epoch": 1.1657951399569364, "grad_norm": 0.2392578125, "learning_rate": 4.4305555555555556e-05, "loss": 0.0605, "step": 1895 }, { "epoch": 1.168871116579514, "grad_norm": 0.25390625, "learning_rate": 4.429012345679013e-05, "loss": 0.0537, "step": 1900 }, { "epoch": 1.1719470932020917, "grad_norm": 0.22265625, "learning_rate": 4.427469135802469e-05, "loss": 0.0531, "step": 1905 }, { "epoch": 1.1750230698246693, "grad_norm": 0.212890625, "learning_rate": 4.425925925925926e-05, "loss": 0.0486, "step": 1910 }, { "epoch": 1.178099046447247, "grad_norm": 0.2275390625, "learning_rate": 4.424382716049383e-05, "loss": 0.0526, "step": 1915 }, { "epoch": 1.1811750230698246, "grad_norm": 0.216796875, "learning_rate": 4.42283950617284e-05, "loss": 0.0527, "step": 1920 }, { "epoch": 1.1842509996924022, "grad_norm": 0.240234375, "learning_rate": 4.4212962962962966e-05, "loss": 0.0527, "step": 1925 }, { "epoch": 1.18732697631498, "grad_norm": 0.2470703125, "learning_rate": 4.4197530864197535e-05, "loss": 0.0553, "step": 1930 }, { "epoch": 1.1904029529375577, "grad_norm": 0.2353515625, "learning_rate": 4.41820987654321e-05, "loss": 0.0538, "step": 1935 }, { "epoch": 1.1934789295601353, "grad_norm": 0.22265625, "learning_rate": 4.4166666666666665e-05, "loss": 0.0547, "step": 1940 }, { "epoch": 1.196554906182713, "grad_norm": 0.2255859375, "learning_rate": 4.4151234567901234e-05, "loss": 0.0525, "step": 1945 }, { "epoch": 1.1996308828052906, "grad_norm": 0.2099609375, "learning_rate": 4.413580246913581e-05, "loss": 0.0533, "step": 1950 }, { "epoch": 1.2027068594278683, "grad_norm": 0.2431640625, "learning_rate": 4.412037037037037e-05, "loss": 0.0552, "step": 1955 }, { "epoch": 1.205782836050446, "grad_norm": 0.25, "learning_rate": 4.410493827160494e-05, "loss": 0.0562, "step": 1960 }, { "epoch": 1.2088588126730238, "grad_norm": 0.216796875, "learning_rate": 4.408950617283951e-05, "loss": 0.0513, "step": 1965 }, { "epoch": 1.2119347892956014, "grad_norm": 0.1943359375, "learning_rate": 4.4074074074074076e-05, "loss": 0.0486, "step": 1970 }, { "epoch": 1.215010765918179, "grad_norm": 0.25, "learning_rate": 4.4058641975308644e-05, "loss": 0.0602, "step": 1975 }, { "epoch": 1.2180867425407567, "grad_norm": 0.2421875, "learning_rate": 4.404320987654321e-05, "loss": 0.0541, "step": 1980 }, { "epoch": 1.2211627191633343, "grad_norm": 0.2001953125, "learning_rate": 4.402777777777778e-05, "loss": 0.0481, "step": 1985 }, { "epoch": 1.224238695785912, "grad_norm": 0.2275390625, "learning_rate": 4.401234567901234e-05, "loss": 0.0529, "step": 1990 }, { "epoch": 1.2273146724084896, "grad_norm": 0.2333984375, "learning_rate": 4.399691358024692e-05, "loss": 0.0535, "step": 1995 }, { "epoch": 1.2303906490310674, "grad_norm": 0.1884765625, "learning_rate": 4.3981481481481486e-05, "loss": 0.0417, "step": 2000 }, { "epoch": 1.233466625653645, "grad_norm": 0.21875, "learning_rate": 4.396604938271605e-05, "loss": 0.0557, "step": 2005 }, { "epoch": 1.2365426022762227, "grad_norm": 0.23046875, "learning_rate": 4.3950617283950617e-05, "loss": 0.0577, "step": 2010 }, { "epoch": 1.2396185788988003, "grad_norm": 0.234375, "learning_rate": 4.3935185185185185e-05, "loss": 0.0597, "step": 2015 }, { "epoch": 1.242694555521378, "grad_norm": 0.224609375, "learning_rate": 4.391975308641976e-05, "loss": 0.0552, "step": 2020 }, { "epoch": 1.2457705321439556, "grad_norm": 0.1845703125, "learning_rate": 4.390432098765432e-05, "loss": 0.0481, "step": 2025 }, { "epoch": 1.2488465087665335, "grad_norm": 0.228515625, "learning_rate": 4.388888888888889e-05, "loss": 0.0525, "step": 2030 }, { "epoch": 1.251922485389111, "grad_norm": 0.224609375, "learning_rate": 4.387345679012346e-05, "loss": 0.0494, "step": 2035 }, { "epoch": 1.2549984620116887, "grad_norm": 0.220703125, "learning_rate": 4.385802469135803e-05, "loss": 0.0535, "step": 2040 }, { "epoch": 1.2580744386342664, "grad_norm": 0.2109375, "learning_rate": 4.3842592592592596e-05, "loss": 0.0564, "step": 2045 }, { "epoch": 1.261150415256844, "grad_norm": 0.2314453125, "learning_rate": 4.3827160493827164e-05, "loss": 0.0599, "step": 2050 }, { "epoch": 1.2642263918794217, "grad_norm": 0.251953125, "learning_rate": 4.381172839506173e-05, "loss": 0.0554, "step": 2055 }, { "epoch": 1.2673023685019995, "grad_norm": 0.2177734375, "learning_rate": 4.3796296296296294e-05, "loss": 0.0585, "step": 2060 }, { "epoch": 1.270378345124577, "grad_norm": 0.25, "learning_rate": 4.378086419753087e-05, "loss": 0.0517, "step": 2065 }, { "epoch": 1.2734543217471548, "grad_norm": 0.216796875, "learning_rate": 4.376543209876544e-05, "loss": 0.052, "step": 2070 }, { "epoch": 1.2765302983697324, "grad_norm": 0.212890625, "learning_rate": 4.375e-05, "loss": 0.0523, "step": 2075 }, { "epoch": 1.27960627499231, "grad_norm": 0.2275390625, "learning_rate": 4.373456790123457e-05, "loss": 0.0549, "step": 2080 }, { "epoch": 1.2826822516148877, "grad_norm": 0.23046875, "learning_rate": 4.3719135802469136e-05, "loss": 0.057, "step": 2085 }, { "epoch": 1.2857582282374653, "grad_norm": 0.212890625, "learning_rate": 4.3703703703703705e-05, "loss": 0.0524, "step": 2090 }, { "epoch": 1.2888342048600432, "grad_norm": 0.234375, "learning_rate": 4.368827160493827e-05, "loss": 0.0579, "step": 2095 }, { "epoch": 1.2919101814826206, "grad_norm": 0.234375, "learning_rate": 4.367283950617284e-05, "loss": 0.0547, "step": 2100 }, { "epoch": 1.2949861581051985, "grad_norm": 0.20703125, "learning_rate": 4.365740740740741e-05, "loss": 0.0494, "step": 2105 }, { "epoch": 1.298062134727776, "grad_norm": 0.2177734375, "learning_rate": 4.364197530864197e-05, "loss": 0.0516, "step": 2110 }, { "epoch": 1.3011381113503537, "grad_norm": 0.2158203125, "learning_rate": 4.362654320987655e-05, "loss": 0.0544, "step": 2115 }, { "epoch": 1.3042140879729314, "grad_norm": 0.310546875, "learning_rate": 4.3611111111111116e-05, "loss": 0.0541, "step": 2120 }, { "epoch": 1.307290064595509, "grad_norm": 0.2255859375, "learning_rate": 4.359567901234568e-05, "loss": 0.0472, "step": 2125 }, { "epoch": 1.3103660412180869, "grad_norm": 0.2099609375, "learning_rate": 4.3580246913580246e-05, "loss": 0.0561, "step": 2130 }, { "epoch": 1.3134420178406643, "grad_norm": 0.22265625, "learning_rate": 4.356481481481482e-05, "loss": 0.0598, "step": 2135 }, { "epoch": 1.3165179944632421, "grad_norm": 0.216796875, "learning_rate": 4.354938271604939e-05, "loss": 0.0537, "step": 2140 }, { "epoch": 1.3195939710858198, "grad_norm": 0.2158203125, "learning_rate": 4.353395061728395e-05, "loss": 0.0534, "step": 2145 }, { "epoch": 1.3226699477083974, "grad_norm": 0.208984375, "learning_rate": 4.351851851851852e-05, "loss": 0.0509, "step": 2150 }, { "epoch": 1.325745924330975, "grad_norm": 0.2353515625, "learning_rate": 4.350308641975309e-05, "loss": 0.0518, "step": 2155 }, { "epoch": 1.3288219009535527, "grad_norm": 0.2314453125, "learning_rate": 4.3487654320987656e-05, "loss": 0.0555, "step": 2160 }, { "epoch": 1.3318978775761305, "grad_norm": 0.2275390625, "learning_rate": 4.3472222222222225e-05, "loss": 0.0521, "step": 2165 }, { "epoch": 1.334973854198708, "grad_norm": 0.2353515625, "learning_rate": 4.345679012345679e-05, "loss": 0.0507, "step": 2170 }, { "epoch": 1.3380498308212858, "grad_norm": 0.265625, "learning_rate": 4.344135802469136e-05, "loss": 0.0529, "step": 2175 }, { "epoch": 1.3411258074438634, "grad_norm": 0.21875, "learning_rate": 4.342592592592592e-05, "loss": 0.0578, "step": 2180 }, { "epoch": 1.344201784066441, "grad_norm": 0.21484375, "learning_rate": 4.34104938271605e-05, "loss": 0.0548, "step": 2185 }, { "epoch": 1.3472777606890187, "grad_norm": 0.24609375, "learning_rate": 4.339506172839507e-05, "loss": 0.0524, "step": 2190 }, { "epoch": 1.3503537373115964, "grad_norm": 0.26953125, "learning_rate": 4.337962962962963e-05, "loss": 0.0548, "step": 2195 }, { "epoch": 1.3534297139341742, "grad_norm": 0.2294921875, "learning_rate": 4.33641975308642e-05, "loss": 0.055, "step": 2200 }, { "epoch": 1.3565056905567516, "grad_norm": 0.21875, "learning_rate": 4.334876543209877e-05, "loss": 0.0492, "step": 2205 }, { "epoch": 1.3595816671793295, "grad_norm": 0.2451171875, "learning_rate": 4.3333333333333334e-05, "loss": 0.0556, "step": 2210 }, { "epoch": 1.3626576438019071, "grad_norm": 0.224609375, "learning_rate": 4.33179012345679e-05, "loss": 0.054, "step": 2215 }, { "epoch": 1.3657336204244848, "grad_norm": 0.23046875, "learning_rate": 4.330246913580247e-05, "loss": 0.0572, "step": 2220 }, { "epoch": 1.3688095970470624, "grad_norm": 0.2216796875, "learning_rate": 4.328703703703704e-05, "loss": 0.0455, "step": 2225 }, { "epoch": 1.37188557366964, "grad_norm": 0.25, "learning_rate": 4.327160493827161e-05, "loss": 0.0541, "step": 2230 }, { "epoch": 1.374961550292218, "grad_norm": 0.240234375, "learning_rate": 4.3256172839506176e-05, "loss": 0.0564, "step": 2235 }, { "epoch": 1.3780375269147955, "grad_norm": 0.2099609375, "learning_rate": 4.3240740740740745e-05, "loss": 0.0497, "step": 2240 }, { "epoch": 1.3811135035373732, "grad_norm": 0.2353515625, "learning_rate": 4.3225308641975306e-05, "loss": 0.0525, "step": 2245 }, { "epoch": 1.3841894801599508, "grad_norm": 0.205078125, "learning_rate": 4.3209876543209875e-05, "loss": 0.0573, "step": 2250 }, { "epoch": 1.3872654567825284, "grad_norm": 0.2275390625, "learning_rate": 4.319444444444445e-05, "loss": 0.0589, "step": 2255 }, { "epoch": 1.390341433405106, "grad_norm": 0.208984375, "learning_rate": 4.317901234567902e-05, "loss": 0.048, "step": 2260 }, { "epoch": 1.3934174100276837, "grad_norm": 0.208984375, "learning_rate": 4.316358024691358e-05, "loss": 0.0531, "step": 2265 }, { "epoch": 1.3964933866502616, "grad_norm": 0.2470703125, "learning_rate": 4.314814814814815e-05, "loss": 0.0539, "step": 2270 }, { "epoch": 1.3995693632728392, "grad_norm": 0.263671875, "learning_rate": 4.313271604938272e-05, "loss": 0.0538, "step": 2275 }, { "epoch": 1.4026453398954168, "grad_norm": 0.2041015625, "learning_rate": 4.3117283950617286e-05, "loss": 0.0526, "step": 2280 }, { "epoch": 1.4057213165179945, "grad_norm": 0.2216796875, "learning_rate": 4.3101851851851854e-05, "loss": 0.0512, "step": 2285 }, { "epoch": 1.4087972931405721, "grad_norm": 0.2158203125, "learning_rate": 4.308641975308642e-05, "loss": 0.0521, "step": 2290 }, { "epoch": 1.4118732697631498, "grad_norm": 0.216796875, "learning_rate": 4.307098765432099e-05, "loss": 0.0491, "step": 2295 }, { "epoch": 1.4149492463857274, "grad_norm": 0.2265625, "learning_rate": 4.305555555555556e-05, "loss": 0.0579, "step": 2300 }, { "epoch": 1.4180252230083052, "grad_norm": 0.2080078125, "learning_rate": 4.304012345679013e-05, "loss": 0.0514, "step": 2305 }, { "epoch": 1.4211011996308829, "grad_norm": 0.2294921875, "learning_rate": 4.3024691358024696e-05, "loss": 0.0533, "step": 2310 }, { "epoch": 1.4241771762534605, "grad_norm": 0.21484375, "learning_rate": 4.300925925925926e-05, "loss": 0.0475, "step": 2315 }, { "epoch": 1.4272531528760382, "grad_norm": 0.24609375, "learning_rate": 4.2993827160493826e-05, "loss": 0.0527, "step": 2320 }, { "epoch": 1.4303291294986158, "grad_norm": 0.228515625, "learning_rate": 4.29783950617284e-05, "loss": 0.0528, "step": 2325 }, { "epoch": 1.4334051061211934, "grad_norm": 0.2001953125, "learning_rate": 4.296296296296296e-05, "loss": 0.0497, "step": 2330 }, { "epoch": 1.436481082743771, "grad_norm": 0.2119140625, "learning_rate": 4.294753086419753e-05, "loss": 0.0492, "step": 2335 }, { "epoch": 1.439557059366349, "grad_norm": 0.2255859375, "learning_rate": 4.29320987654321e-05, "loss": 0.0518, "step": 2340 }, { "epoch": 1.4426330359889266, "grad_norm": 0.216796875, "learning_rate": 4.291666666666667e-05, "loss": 0.0606, "step": 2345 }, { "epoch": 1.4457090126115042, "grad_norm": 0.236328125, "learning_rate": 4.290123456790124e-05, "loss": 0.0523, "step": 2350 }, { "epoch": 1.4487849892340818, "grad_norm": 0.2333984375, "learning_rate": 4.2885802469135805e-05, "loss": 0.0493, "step": 2355 }, { "epoch": 1.4518609658566595, "grad_norm": 0.203125, "learning_rate": 4.2870370370370374e-05, "loss": 0.0501, "step": 2360 }, { "epoch": 1.454936942479237, "grad_norm": 0.2158203125, "learning_rate": 4.2854938271604936e-05, "loss": 0.0571, "step": 2365 }, { "epoch": 1.4580129191018147, "grad_norm": 0.25390625, "learning_rate": 4.283950617283951e-05, "loss": 0.057, "step": 2370 }, { "epoch": 1.4610888957243926, "grad_norm": 0.228515625, "learning_rate": 4.282407407407408e-05, "loss": 0.0552, "step": 2375 }, { "epoch": 1.4641648723469702, "grad_norm": 0.2021484375, "learning_rate": 4.280864197530864e-05, "loss": 0.0508, "step": 2380 }, { "epoch": 1.4672408489695479, "grad_norm": 0.23046875, "learning_rate": 4.279320987654321e-05, "loss": 0.0574, "step": 2385 }, { "epoch": 1.4703168255921255, "grad_norm": 0.216796875, "learning_rate": 4.277777777777778e-05, "loss": 0.0464, "step": 2390 }, { "epoch": 1.4733928022147031, "grad_norm": 0.1982421875, "learning_rate": 4.276234567901235e-05, "loss": 0.052, "step": 2395 }, { "epoch": 1.4764687788372808, "grad_norm": 0.2412109375, "learning_rate": 4.2746913580246915e-05, "loss": 0.0532, "step": 2400 }, { "epoch": 1.4795447554598584, "grad_norm": 0.2119140625, "learning_rate": 4.273148148148148e-05, "loss": 0.0535, "step": 2405 }, { "epoch": 1.4826207320824363, "grad_norm": 0.23046875, "learning_rate": 4.271604938271605e-05, "loss": 0.0567, "step": 2410 }, { "epoch": 1.485696708705014, "grad_norm": 0.259765625, "learning_rate": 4.270061728395062e-05, "loss": 0.0559, "step": 2415 }, { "epoch": 1.4887726853275915, "grad_norm": 0.2158203125, "learning_rate": 4.268518518518519e-05, "loss": 0.0559, "step": 2420 }, { "epoch": 1.4918486619501692, "grad_norm": 0.23828125, "learning_rate": 4.266975308641976e-05, "loss": 0.0582, "step": 2425 }, { "epoch": 1.4949246385727468, "grad_norm": 0.2333984375, "learning_rate": 4.2654320987654325e-05, "loss": 0.0557, "step": 2430 }, { "epoch": 1.4980006151953245, "grad_norm": 0.2275390625, "learning_rate": 4.263888888888889e-05, "loss": 0.0568, "step": 2435 }, { "epoch": 1.501076591817902, "grad_norm": 0.2275390625, "learning_rate": 4.262345679012346e-05, "loss": 0.0546, "step": 2440 }, { "epoch": 1.50415256844048, "grad_norm": 0.248046875, "learning_rate": 4.260802469135803e-05, "loss": 0.0603, "step": 2445 }, { "epoch": 1.5072285450630574, "grad_norm": 0.236328125, "learning_rate": 4.259259259259259e-05, "loss": 0.0557, "step": 2450 }, { "epoch": 1.5103045216856352, "grad_norm": 0.2109375, "learning_rate": 4.257716049382716e-05, "loss": 0.0523, "step": 2455 }, { "epoch": 1.5133804983082129, "grad_norm": 0.2177734375, "learning_rate": 4.256172839506173e-05, "loss": 0.0579, "step": 2460 }, { "epoch": 1.5164564749307905, "grad_norm": 0.2099609375, "learning_rate": 4.25462962962963e-05, "loss": 0.0547, "step": 2465 }, { "epoch": 1.5195324515533684, "grad_norm": 0.2265625, "learning_rate": 4.2530864197530866e-05, "loss": 0.0522, "step": 2470 }, { "epoch": 1.5226084281759458, "grad_norm": 0.2001953125, "learning_rate": 4.2515432098765435e-05, "loss": 0.0497, "step": 2475 }, { "epoch": 1.5256844047985236, "grad_norm": 0.21875, "learning_rate": 4.25e-05, "loss": 0.0549, "step": 2480 }, { "epoch": 1.528760381421101, "grad_norm": 0.2041015625, "learning_rate": 4.2484567901234565e-05, "loss": 0.0467, "step": 2485 }, { "epoch": 1.531836358043679, "grad_norm": 0.259765625, "learning_rate": 4.246913580246914e-05, "loss": 0.051, "step": 2490 }, { "epoch": 1.5349123346662565, "grad_norm": 0.2431640625, "learning_rate": 4.245370370370371e-05, "loss": 0.0527, "step": 2495 }, { "epoch": 1.5379883112888342, "grad_norm": 0.25390625, "learning_rate": 4.243827160493827e-05, "loss": 0.0546, "step": 2500 }, { "epoch": 1.541064287911412, "grad_norm": 0.2236328125, "learning_rate": 4.242283950617284e-05, "loss": 0.0518, "step": 2505 }, { "epoch": 1.5441402645339894, "grad_norm": 0.1669921875, "learning_rate": 4.240740740740741e-05, "loss": 0.0399, "step": 2510 }, { "epoch": 1.5472162411565673, "grad_norm": 0.2314453125, "learning_rate": 4.239197530864198e-05, "loss": 0.056, "step": 2515 }, { "epoch": 1.5502922177791447, "grad_norm": 0.23046875, "learning_rate": 4.2376543209876544e-05, "loss": 0.0564, "step": 2520 }, { "epoch": 1.5533681944017226, "grad_norm": 0.2294921875, "learning_rate": 4.236111111111111e-05, "loss": 0.0528, "step": 2525 }, { "epoch": 1.5564441710243002, "grad_norm": 0.2490234375, "learning_rate": 4.234567901234568e-05, "loss": 0.0526, "step": 2530 }, { "epoch": 1.5595201476468779, "grad_norm": 0.2353515625, "learning_rate": 4.233024691358025e-05, "loss": 0.0536, "step": 2535 }, { "epoch": 1.5625961242694557, "grad_norm": 0.240234375, "learning_rate": 4.231481481481482e-05, "loss": 0.0536, "step": 2540 }, { "epoch": 1.5656721008920331, "grad_norm": 0.208984375, "learning_rate": 4.2299382716049386e-05, "loss": 0.0556, "step": 2545 }, { "epoch": 1.568748077514611, "grad_norm": 0.20703125, "learning_rate": 4.2283950617283955e-05, "loss": 0.0494, "step": 2550 }, { "epoch": 1.5718240541371884, "grad_norm": 0.208984375, "learning_rate": 4.2268518518518516e-05, "loss": 0.0501, "step": 2555 }, { "epoch": 1.5749000307597663, "grad_norm": 0.2265625, "learning_rate": 4.225308641975309e-05, "loss": 0.0525, "step": 2560 }, { "epoch": 1.577976007382344, "grad_norm": 0.205078125, "learning_rate": 4.223765432098766e-05, "loss": 0.0519, "step": 2565 }, { "epoch": 1.5810519840049215, "grad_norm": 0.2060546875, "learning_rate": 4.222222222222222e-05, "loss": 0.0484, "step": 2570 }, { "epoch": 1.5841279606274994, "grad_norm": 0.2236328125, "learning_rate": 4.220679012345679e-05, "loss": 0.0563, "step": 2575 }, { "epoch": 1.5872039372500768, "grad_norm": 0.2041015625, "learning_rate": 4.219135802469136e-05, "loss": 0.0532, "step": 2580 }, { "epoch": 1.5902799138726547, "grad_norm": 0.2470703125, "learning_rate": 4.217592592592593e-05, "loss": 0.0512, "step": 2585 }, { "epoch": 1.593355890495232, "grad_norm": 0.25, "learning_rate": 4.2160493827160495e-05, "loss": 0.057, "step": 2590 }, { "epoch": 1.59643186711781, "grad_norm": 0.2119140625, "learning_rate": 4.2145061728395064e-05, "loss": 0.0489, "step": 2595 }, { "epoch": 1.5995078437403876, "grad_norm": 0.2099609375, "learning_rate": 4.212962962962963e-05, "loss": 0.0515, "step": 2600 }, { "epoch": 1.6025838203629652, "grad_norm": 0.2041015625, "learning_rate": 4.21141975308642e-05, "loss": 0.0523, "step": 2605 }, { "epoch": 1.605659796985543, "grad_norm": 0.2021484375, "learning_rate": 4.209876543209877e-05, "loss": 0.0585, "step": 2610 }, { "epoch": 1.6087357736081205, "grad_norm": 0.248046875, "learning_rate": 4.208333333333334e-05, "loss": 0.0563, "step": 2615 }, { "epoch": 1.6118117502306983, "grad_norm": 0.248046875, "learning_rate": 4.20679012345679e-05, "loss": 0.055, "step": 2620 }, { "epoch": 1.6148877268532758, "grad_norm": 0.228515625, "learning_rate": 4.205246913580247e-05, "loss": 0.0566, "step": 2625 }, { "epoch": 1.6179637034758536, "grad_norm": 0.21875, "learning_rate": 4.203703703703704e-05, "loss": 0.0492, "step": 2630 }, { "epoch": 1.6210396800984312, "grad_norm": 0.2314453125, "learning_rate": 4.202160493827161e-05, "loss": 0.0578, "step": 2635 }, { "epoch": 1.6241156567210089, "grad_norm": 0.2236328125, "learning_rate": 4.200617283950617e-05, "loss": 0.0483, "step": 2640 }, { "epoch": 1.6271916333435867, "grad_norm": 0.232421875, "learning_rate": 4.199074074074074e-05, "loss": 0.0539, "step": 2645 }, { "epoch": 1.6302676099661642, "grad_norm": 0.2021484375, "learning_rate": 4.197530864197531e-05, "loss": 0.0517, "step": 2650 }, { "epoch": 1.633343586588742, "grad_norm": 0.2392578125, "learning_rate": 4.195987654320988e-05, "loss": 0.0583, "step": 2655 }, { "epoch": 1.6364195632113194, "grad_norm": 0.2392578125, "learning_rate": 4.194444444444445e-05, "loss": 0.0474, "step": 2660 }, { "epoch": 1.6394955398338973, "grad_norm": 0.220703125, "learning_rate": 4.1929012345679015e-05, "loss": 0.0543, "step": 2665 }, { "epoch": 1.642571516456475, "grad_norm": 0.2421875, "learning_rate": 4.1913580246913584e-05, "loss": 0.058, "step": 2670 }, { "epoch": 1.6456474930790526, "grad_norm": 0.232421875, "learning_rate": 4.1898148148148145e-05, "loss": 0.049, "step": 2675 }, { "epoch": 1.6487234697016304, "grad_norm": 0.2412109375, "learning_rate": 4.188271604938272e-05, "loss": 0.0558, "step": 2680 }, { "epoch": 1.6517994463242078, "grad_norm": 0.248046875, "learning_rate": 4.186728395061729e-05, "loss": 0.0533, "step": 2685 }, { "epoch": 1.6548754229467857, "grad_norm": 0.2197265625, "learning_rate": 4.185185185185185e-05, "loss": 0.0476, "step": 2690 }, { "epoch": 1.6579513995693633, "grad_norm": 0.232421875, "learning_rate": 4.183641975308642e-05, "loss": 0.0503, "step": 2695 }, { "epoch": 1.661027376191941, "grad_norm": 0.23046875, "learning_rate": 4.1820987654320994e-05, "loss": 0.0488, "step": 2700 }, { "epoch": 1.6641033528145186, "grad_norm": 0.2197265625, "learning_rate": 4.1805555555555556e-05, "loss": 0.0517, "step": 2705 }, { "epoch": 1.6671793294370962, "grad_norm": 0.2119140625, "learning_rate": 4.1790123456790124e-05, "loss": 0.056, "step": 2710 }, { "epoch": 1.670255306059674, "grad_norm": 0.2177734375, "learning_rate": 4.177469135802469e-05, "loss": 0.0531, "step": 2715 }, { "epoch": 1.6733312826822515, "grad_norm": 0.205078125, "learning_rate": 4.175925925925926e-05, "loss": 0.0491, "step": 2720 }, { "epoch": 1.6764072593048294, "grad_norm": 0.2333984375, "learning_rate": 4.174382716049383e-05, "loss": 0.0522, "step": 2725 }, { "epoch": 1.679483235927407, "grad_norm": 0.2119140625, "learning_rate": 4.17283950617284e-05, "loss": 0.0517, "step": 2730 }, { "epoch": 1.6825592125499846, "grad_norm": 0.20703125, "learning_rate": 4.171296296296297e-05, "loss": 0.0473, "step": 2735 }, { "epoch": 1.6856351891725623, "grad_norm": 0.1845703125, "learning_rate": 4.169753086419753e-05, "loss": 0.0489, "step": 2740 }, { "epoch": 1.68871116579514, "grad_norm": 0.24609375, "learning_rate": 4.16820987654321e-05, "loss": 0.0553, "step": 2745 }, { "epoch": 1.6917871424177178, "grad_norm": 0.232421875, "learning_rate": 4.166666666666667e-05, "loss": 0.061, "step": 2750 }, { "epoch": 1.6948631190402952, "grad_norm": 0.208984375, "learning_rate": 4.165123456790124e-05, "loss": 0.0523, "step": 2755 }, { "epoch": 1.697939095662873, "grad_norm": 0.2041015625, "learning_rate": 4.16358024691358e-05, "loss": 0.0506, "step": 2760 }, { "epoch": 1.7010150722854507, "grad_norm": 0.2333984375, "learning_rate": 4.162037037037037e-05, "loss": 0.0526, "step": 2765 }, { "epoch": 1.7040910489080283, "grad_norm": 0.2216796875, "learning_rate": 4.1604938271604946e-05, "loss": 0.0505, "step": 2770 }, { "epoch": 1.707167025530606, "grad_norm": 0.2392578125, "learning_rate": 4.158950617283951e-05, "loss": 0.0505, "step": 2775 }, { "epoch": 1.7102430021531836, "grad_norm": 0.2431640625, "learning_rate": 4.1574074074074076e-05, "loss": 0.0566, "step": 2780 }, { "epoch": 1.7133189787757614, "grad_norm": 0.2314453125, "learning_rate": 4.1558641975308644e-05, "loss": 0.0491, "step": 2785 }, { "epoch": 1.7163949553983389, "grad_norm": 0.267578125, "learning_rate": 4.154320987654321e-05, "loss": 0.0578, "step": 2790 }, { "epoch": 1.7194709320209167, "grad_norm": 0.18359375, "learning_rate": 4.152777777777778e-05, "loss": 0.0629, "step": 2795 }, { "epoch": 1.7225469086434944, "grad_norm": 0.2275390625, "learning_rate": 4.151234567901235e-05, "loss": 0.0544, "step": 2800 }, { "epoch": 1.725622885266072, "grad_norm": 0.2333984375, "learning_rate": 4.149691358024692e-05, "loss": 0.0534, "step": 2805 }, { "epoch": 1.7286988618886496, "grad_norm": 0.2158203125, "learning_rate": 4.148148148148148e-05, "loss": 0.0509, "step": 2810 }, { "epoch": 1.7317748385112273, "grad_norm": 0.236328125, "learning_rate": 4.146604938271605e-05, "loss": 0.0525, "step": 2815 }, { "epoch": 1.7348508151338051, "grad_norm": 0.232421875, "learning_rate": 4.1450617283950624e-05, "loss": 0.0539, "step": 2820 }, { "epoch": 1.7379267917563825, "grad_norm": 0.251953125, "learning_rate": 4.1435185185185185e-05, "loss": 0.053, "step": 2825 }, { "epoch": 1.7410027683789604, "grad_norm": 0.236328125, "learning_rate": 4.1419753086419754e-05, "loss": 0.0532, "step": 2830 }, { "epoch": 1.744078745001538, "grad_norm": 0.201171875, "learning_rate": 4.140432098765432e-05, "loss": 0.0496, "step": 2835 }, { "epoch": 1.7471547216241157, "grad_norm": 0.26953125, "learning_rate": 4.138888888888889e-05, "loss": 0.0601, "step": 2840 }, { "epoch": 1.7502306982466933, "grad_norm": 0.2080078125, "learning_rate": 4.137345679012346e-05, "loss": 0.0507, "step": 2845 }, { "epoch": 1.753306674869271, "grad_norm": 0.2294921875, "learning_rate": 4.135802469135803e-05, "loss": 0.0486, "step": 2850 }, { "epoch": 1.7563826514918488, "grad_norm": 0.2177734375, "learning_rate": 4.1342592592592596e-05, "loss": 0.0517, "step": 2855 }, { "epoch": 1.7594586281144262, "grad_norm": 0.2177734375, "learning_rate": 4.132716049382716e-05, "loss": 0.057, "step": 2860 }, { "epoch": 1.762534604737004, "grad_norm": 0.2158203125, "learning_rate": 4.131172839506173e-05, "loss": 0.0532, "step": 2865 }, { "epoch": 1.7656105813595817, "grad_norm": 0.2099609375, "learning_rate": 4.12962962962963e-05, "loss": 0.0521, "step": 2870 }, { "epoch": 1.7686865579821593, "grad_norm": 0.19140625, "learning_rate": 4.128086419753087e-05, "loss": 0.0511, "step": 2875 }, { "epoch": 1.771762534604737, "grad_norm": 0.2265625, "learning_rate": 4.126543209876543e-05, "loss": 0.054, "step": 2880 }, { "epoch": 1.7748385112273146, "grad_norm": 0.193359375, "learning_rate": 4.125e-05, "loss": 0.0488, "step": 2885 }, { "epoch": 1.7779144878498925, "grad_norm": 0.21875, "learning_rate": 4.1234567901234575e-05, "loss": 0.051, "step": 2890 }, { "epoch": 1.78099046447247, "grad_norm": 0.2236328125, "learning_rate": 4.121913580246914e-05, "loss": 0.0504, "step": 2895 }, { "epoch": 1.7840664410950478, "grad_norm": 0.259765625, "learning_rate": 4.1203703703703705e-05, "loss": 0.0587, "step": 2900 }, { "epoch": 1.7871424177176254, "grad_norm": 0.2255859375, "learning_rate": 4.1188271604938274e-05, "loss": 0.0547, "step": 2905 }, { "epoch": 1.790218394340203, "grad_norm": 0.21875, "learning_rate": 4.117283950617284e-05, "loss": 0.0506, "step": 2910 }, { "epoch": 1.7932943709627807, "grad_norm": 0.2197265625, "learning_rate": 4.115740740740741e-05, "loss": 0.0505, "step": 2915 }, { "epoch": 1.7963703475853583, "grad_norm": 0.208984375, "learning_rate": 4.114197530864198e-05, "loss": 0.0529, "step": 2920 }, { "epoch": 1.7994463242079362, "grad_norm": 0.2109375, "learning_rate": 4.112654320987655e-05, "loss": 0.0509, "step": 2925 }, { "epoch": 1.8025223008305136, "grad_norm": 0.2314453125, "learning_rate": 4.111111111111111e-05, "loss": 0.0519, "step": 2930 }, { "epoch": 1.8055982774530914, "grad_norm": 0.1962890625, "learning_rate": 4.1095679012345684e-05, "loss": 0.0516, "step": 2935 }, { "epoch": 1.808674254075669, "grad_norm": 0.2275390625, "learning_rate": 4.108024691358025e-05, "loss": 0.0564, "step": 2940 }, { "epoch": 1.8117502306982467, "grad_norm": 0.251953125, "learning_rate": 4.1064814814814814e-05, "loss": 0.0574, "step": 2945 }, { "epoch": 1.8148262073208243, "grad_norm": 0.2421875, "learning_rate": 4.104938271604938e-05, "loss": 0.0577, "step": 2950 }, { "epoch": 1.817902183943402, "grad_norm": 0.212890625, "learning_rate": 4.103395061728395e-05, "loss": 0.055, "step": 2955 }, { "epoch": 1.8209781605659798, "grad_norm": 0.23046875, "learning_rate": 4.101851851851852e-05, "loss": 0.0563, "step": 2960 }, { "epoch": 1.8240541371885572, "grad_norm": 0.2373046875, "learning_rate": 4.100308641975309e-05, "loss": 0.0539, "step": 2965 }, { "epoch": 1.827130113811135, "grad_norm": 0.2421875, "learning_rate": 4.0987654320987657e-05, "loss": 0.0535, "step": 2970 }, { "epoch": 1.8302060904337127, "grad_norm": 0.2138671875, "learning_rate": 4.0972222222222225e-05, "loss": 0.05, "step": 2975 }, { "epoch": 1.8332820670562904, "grad_norm": 0.212890625, "learning_rate": 4.095679012345679e-05, "loss": 0.0527, "step": 2980 }, { "epoch": 1.836358043678868, "grad_norm": 0.228515625, "learning_rate": 4.094135802469136e-05, "loss": 0.052, "step": 2985 }, { "epoch": 1.8394340203014456, "grad_norm": 0.2333984375, "learning_rate": 4.092592592592593e-05, "loss": 0.0563, "step": 2990 }, { "epoch": 1.8425099969240235, "grad_norm": 0.21484375, "learning_rate": 4.091049382716049e-05, "loss": 0.0543, "step": 2995 }, { "epoch": 1.845585973546601, "grad_norm": 0.1923828125, "learning_rate": 4.089506172839506e-05, "loss": 0.046, "step": 3000 }, { "epoch": 1.8486619501691788, "grad_norm": 0.2314453125, "learning_rate": 4.087962962962963e-05, "loss": 0.0555, "step": 3005 }, { "epoch": 1.8517379267917564, "grad_norm": 0.177734375, "learning_rate": 4.0864197530864204e-05, "loss": 0.0536, "step": 3010 }, { "epoch": 1.854813903414334, "grad_norm": 0.23046875, "learning_rate": 4.0848765432098766e-05, "loss": 0.0552, "step": 3015 }, { "epoch": 1.8578898800369117, "grad_norm": 0.208984375, "learning_rate": 4.0833333333333334e-05, "loss": 0.0487, "step": 3020 }, { "epoch": 1.8609658566594893, "grad_norm": 0.2080078125, "learning_rate": 4.08179012345679e-05, "loss": 0.0512, "step": 3025 }, { "epoch": 1.8640418332820672, "grad_norm": 0.248046875, "learning_rate": 4.080246913580247e-05, "loss": 0.05, "step": 3030 }, { "epoch": 1.8671178099046446, "grad_norm": 0.2373046875, "learning_rate": 4.078703703703704e-05, "loss": 0.056, "step": 3035 }, { "epoch": 1.8701937865272225, "grad_norm": 0.20703125, "learning_rate": 4.077160493827161e-05, "loss": 0.0603, "step": 3040 }, { "epoch": 1.8732697631498, "grad_norm": 0.2451171875, "learning_rate": 4.0756172839506177e-05, "loss": 0.063, "step": 3045 }, { "epoch": 1.8763457397723777, "grad_norm": 0.2197265625, "learning_rate": 4.074074074074074e-05, "loss": 0.0515, "step": 3050 }, { "epoch": 1.8794217163949554, "grad_norm": 0.2490234375, "learning_rate": 4.0725308641975313e-05, "loss": 0.057, "step": 3055 }, { "epoch": 1.882497693017533, "grad_norm": 0.2421875, "learning_rate": 4.070987654320988e-05, "loss": 0.0496, "step": 3060 }, { "epoch": 1.8855736696401109, "grad_norm": 0.23046875, "learning_rate": 4.0694444444444444e-05, "loss": 0.0594, "step": 3065 }, { "epoch": 1.8886496462626883, "grad_norm": 0.232421875, "learning_rate": 4.067901234567901e-05, "loss": 0.0567, "step": 3070 }, { "epoch": 1.8917256228852661, "grad_norm": 0.22265625, "learning_rate": 4.066358024691358e-05, "loss": 0.0488, "step": 3075 }, { "epoch": 1.8948015995078438, "grad_norm": 0.2451171875, "learning_rate": 4.064814814814815e-05, "loss": 0.0552, "step": 3080 }, { "epoch": 1.8978775761304214, "grad_norm": 0.240234375, "learning_rate": 4.063271604938272e-05, "loss": 0.0583, "step": 3085 }, { "epoch": 1.900953552752999, "grad_norm": 0.259765625, "learning_rate": 4.0617283950617286e-05, "loss": 0.0526, "step": 3090 }, { "epoch": 1.9040295293755767, "grad_norm": 0.21484375, "learning_rate": 4.0601851851851854e-05, "loss": 0.0551, "step": 3095 }, { "epoch": 1.9071055059981545, "grad_norm": 0.2109375, "learning_rate": 4.058641975308642e-05, "loss": 0.0603, "step": 3100 }, { "epoch": 1.910181482620732, "grad_norm": 0.2216796875, "learning_rate": 4.057098765432099e-05, "loss": 0.053, "step": 3105 }, { "epoch": 1.9132574592433098, "grad_norm": 0.23828125, "learning_rate": 4.055555555555556e-05, "loss": 0.0555, "step": 3110 }, { "epoch": 1.9163334358658874, "grad_norm": 0.2080078125, "learning_rate": 4.054012345679012e-05, "loss": 0.05, "step": 3115 }, { "epoch": 1.919409412488465, "grad_norm": 0.2138671875, "learning_rate": 4.052469135802469e-05, "loss": 0.055, "step": 3120 }, { "epoch": 1.9224853891110427, "grad_norm": 0.224609375, "learning_rate": 4.0509259259259265e-05, "loss": 0.0493, "step": 3125 }, { "epoch": 1.9255613657336204, "grad_norm": 0.255859375, "learning_rate": 4.049382716049383e-05, "loss": 0.0527, "step": 3130 }, { "epoch": 1.9286373423561982, "grad_norm": 0.2255859375, "learning_rate": 4.0478395061728395e-05, "loss": 0.0503, "step": 3135 }, { "epoch": 1.9317133189787756, "grad_norm": 0.197265625, "learning_rate": 4.0462962962962963e-05, "loss": 0.0554, "step": 3140 }, { "epoch": 1.9347892956013535, "grad_norm": 0.2392578125, "learning_rate": 4.044753086419753e-05, "loss": 0.0551, "step": 3145 }, { "epoch": 1.9378652722239311, "grad_norm": 0.27734375, "learning_rate": 4.04320987654321e-05, "loss": 0.057, "step": 3150 }, { "epoch": 1.9409412488465088, "grad_norm": 0.197265625, "learning_rate": 4.041666666666667e-05, "loss": 0.0492, "step": 3155 }, { "epoch": 1.9440172254690864, "grad_norm": 0.2236328125, "learning_rate": 4.040123456790124e-05, "loss": 0.0553, "step": 3160 }, { "epoch": 1.947093202091664, "grad_norm": 0.232421875, "learning_rate": 4.0385802469135806e-05, "loss": 0.0512, "step": 3165 }, { "epoch": 1.950169178714242, "grad_norm": 0.2275390625, "learning_rate": 4.0370370370370374e-05, "loss": 0.0552, "step": 3170 }, { "epoch": 1.9532451553368193, "grad_norm": 0.21875, "learning_rate": 4.035493827160494e-05, "loss": 0.0549, "step": 3175 }, { "epoch": 1.9563211319593972, "grad_norm": 0.2197265625, "learning_rate": 4.033950617283951e-05, "loss": 0.055, "step": 3180 }, { "epoch": 1.9593971085819748, "grad_norm": 0.2470703125, "learning_rate": 4.032407407407407e-05, "loss": 0.0571, "step": 3185 }, { "epoch": 1.9624730852045524, "grad_norm": 0.2216796875, "learning_rate": 4.030864197530864e-05, "loss": 0.0515, "step": 3190 }, { "epoch": 1.96554906182713, "grad_norm": 0.216796875, "learning_rate": 4.0293209876543216e-05, "loss": 0.0545, "step": 3195 }, { "epoch": 1.9686250384497077, "grad_norm": 0.212890625, "learning_rate": 4.027777777777778e-05, "loss": 0.0558, "step": 3200 }, { "epoch": 1.9717010150722856, "grad_norm": 0.24609375, "learning_rate": 4.0262345679012346e-05, "loss": 0.0552, "step": 3205 }, { "epoch": 1.974776991694863, "grad_norm": 0.2255859375, "learning_rate": 4.0246913580246915e-05, "loss": 0.053, "step": 3210 }, { "epoch": 1.9778529683174408, "grad_norm": 0.2177734375, "learning_rate": 4.023148148148148e-05, "loss": 0.0558, "step": 3215 }, { "epoch": 1.9809289449400185, "grad_norm": 0.2392578125, "learning_rate": 4.021604938271605e-05, "loss": 0.0563, "step": 3220 }, { "epoch": 1.9840049215625961, "grad_norm": 0.255859375, "learning_rate": 4.020061728395062e-05, "loss": 0.0608, "step": 3225 }, { "epoch": 1.9870808981851737, "grad_norm": 0.2001953125, "learning_rate": 4.018518518518519e-05, "loss": 0.0539, "step": 3230 }, { "epoch": 1.9901568748077514, "grad_norm": 0.251953125, "learning_rate": 4.016975308641975e-05, "loss": 0.054, "step": 3235 }, { "epoch": 1.9932328514303292, "grad_norm": 0.2353515625, "learning_rate": 4.015432098765432e-05, "loss": 0.0573, "step": 3240 }, { "epoch": 1.9963088280529067, "grad_norm": 0.2216796875, "learning_rate": 4.0138888888888894e-05, "loss": 0.0496, "step": 3245 }, { "epoch": 1.9993848046754845, "grad_norm": 0.224609375, "learning_rate": 4.012345679012346e-05, "loss": 0.0528, "step": 3250 }, { "epoch": 2.002460781298062, "grad_norm": 0.21484375, "learning_rate": 4.0108024691358024e-05, "loss": 0.0478, "step": 3255 }, { "epoch": 2.00553675792064, "grad_norm": 0.23046875, "learning_rate": 4.009259259259259e-05, "loss": 0.0453, "step": 3260 }, { "epoch": 2.0086127345432176, "grad_norm": 0.25390625, "learning_rate": 4.007716049382717e-05, "loss": 0.0456, "step": 3265 }, { "epoch": 2.011688711165795, "grad_norm": 0.21875, "learning_rate": 4.006172839506173e-05, "loss": 0.0404, "step": 3270 }, { "epoch": 2.014764687788373, "grad_norm": 0.2021484375, "learning_rate": 4.00462962962963e-05, "loss": 0.0377, "step": 3275 }, { "epoch": 2.0178406644109503, "grad_norm": 0.216796875, "learning_rate": 4.0030864197530866e-05, "loss": 0.0415, "step": 3280 }, { "epoch": 2.020916641033528, "grad_norm": 0.236328125, "learning_rate": 4.0015432098765435e-05, "loss": 0.0418, "step": 3285 }, { "epoch": 2.0239926176561056, "grad_norm": 0.2216796875, "learning_rate": 4e-05, "loss": 0.0511, "step": 3290 }, { "epoch": 2.0270685942786835, "grad_norm": 0.2294921875, "learning_rate": 3.998456790123457e-05, "loss": 0.0445, "step": 3295 }, { "epoch": 2.0301445709012613, "grad_norm": 0.220703125, "learning_rate": 3.996913580246914e-05, "loss": 0.0447, "step": 3300 }, { "epoch": 2.0332205475238387, "grad_norm": 0.203125, "learning_rate": 3.99537037037037e-05, "loss": 0.0431, "step": 3305 }, { "epoch": 2.0362965241464166, "grad_norm": 0.2177734375, "learning_rate": 3.993827160493827e-05, "loss": 0.0447, "step": 3310 }, { "epoch": 2.039372500768994, "grad_norm": 0.236328125, "learning_rate": 3.9922839506172846e-05, "loss": 0.0397, "step": 3315 }, { "epoch": 2.042448477391572, "grad_norm": 0.2060546875, "learning_rate": 3.990740740740741e-05, "loss": 0.0413, "step": 3320 }, { "epoch": 2.0455244540141493, "grad_norm": 0.220703125, "learning_rate": 3.9891975308641976e-05, "loss": 0.0436, "step": 3325 }, { "epoch": 2.048600430636727, "grad_norm": 0.2353515625, "learning_rate": 3.9876543209876544e-05, "loss": 0.0413, "step": 3330 }, { "epoch": 2.051676407259305, "grad_norm": 0.265625, "learning_rate": 3.986111111111111e-05, "loss": 0.0476, "step": 3335 }, { "epoch": 2.0547523838818824, "grad_norm": 0.279296875, "learning_rate": 3.984567901234568e-05, "loss": 0.0467, "step": 3340 }, { "epoch": 2.0578283605044603, "grad_norm": 0.251953125, "learning_rate": 3.983024691358025e-05, "loss": 0.0395, "step": 3345 }, { "epoch": 2.0609043371270377, "grad_norm": 0.26171875, "learning_rate": 3.981481481481482e-05, "loss": 0.0431, "step": 3350 }, { "epoch": 2.0639803137496155, "grad_norm": 0.2431640625, "learning_rate": 3.979938271604938e-05, "loss": 0.0417, "step": 3355 }, { "epoch": 2.067056290372193, "grad_norm": 0.21484375, "learning_rate": 3.9783950617283955e-05, "loss": 0.0434, "step": 3360 }, { "epoch": 2.070132266994771, "grad_norm": 0.201171875, "learning_rate": 3.976851851851852e-05, "loss": 0.042, "step": 3365 }, { "epoch": 2.0732082436173487, "grad_norm": 0.2255859375, "learning_rate": 3.975308641975309e-05, "loss": 0.0463, "step": 3370 }, { "epoch": 2.076284220239926, "grad_norm": 0.21875, "learning_rate": 3.973765432098765e-05, "loss": 0.046, "step": 3375 }, { "epoch": 2.079360196862504, "grad_norm": 0.203125, "learning_rate": 3.972222222222222e-05, "loss": 0.0454, "step": 3380 }, { "epoch": 2.0824361734850814, "grad_norm": 0.24609375, "learning_rate": 3.97067901234568e-05, "loss": 0.0421, "step": 3385 }, { "epoch": 2.0855121501076592, "grad_norm": 0.2578125, "learning_rate": 3.969135802469136e-05, "loss": 0.0416, "step": 3390 }, { "epoch": 2.0885881267302366, "grad_norm": 0.2373046875, "learning_rate": 3.967592592592593e-05, "loss": 0.0463, "step": 3395 }, { "epoch": 2.0916641033528145, "grad_norm": 0.2431640625, "learning_rate": 3.9660493827160496e-05, "loss": 0.0448, "step": 3400 }, { "epoch": 2.0947400799753924, "grad_norm": 0.21484375, "learning_rate": 3.9645061728395064e-05, "loss": 0.0397, "step": 3405 }, { "epoch": 2.0978160565979698, "grad_norm": 0.2255859375, "learning_rate": 3.962962962962963e-05, "loss": 0.0464, "step": 3410 }, { "epoch": 2.1008920332205476, "grad_norm": 0.2216796875, "learning_rate": 3.96141975308642e-05, "loss": 0.042, "step": 3415 }, { "epoch": 2.103968009843125, "grad_norm": 0.2353515625, "learning_rate": 3.959876543209877e-05, "loss": 0.0453, "step": 3420 }, { "epoch": 2.107043986465703, "grad_norm": 0.240234375, "learning_rate": 3.958333333333333e-05, "loss": 0.0421, "step": 3425 }, { "epoch": 2.1101199630882803, "grad_norm": 0.22265625, "learning_rate": 3.9567901234567906e-05, "loss": 0.0456, "step": 3430 }, { "epoch": 2.113195939710858, "grad_norm": 0.2119140625, "learning_rate": 3.9552469135802475e-05, "loss": 0.0435, "step": 3435 }, { "epoch": 2.116271916333436, "grad_norm": 0.2265625, "learning_rate": 3.9537037037037036e-05, "loss": 0.041, "step": 3440 }, { "epoch": 2.1193478929560134, "grad_norm": 0.2314453125, "learning_rate": 3.9521604938271605e-05, "loss": 0.0435, "step": 3445 }, { "epoch": 2.1224238695785913, "grad_norm": 0.2470703125, "learning_rate": 3.950617283950617e-05, "loss": 0.0443, "step": 3450 }, { "epoch": 2.1254998462011687, "grad_norm": 0.20703125, "learning_rate": 3.949074074074074e-05, "loss": 0.0429, "step": 3455 }, { "epoch": 2.1285758228237466, "grad_norm": 0.236328125, "learning_rate": 3.947530864197531e-05, "loss": 0.0424, "step": 3460 }, { "epoch": 2.131651799446324, "grad_norm": 0.21484375, "learning_rate": 3.945987654320988e-05, "loss": 0.0421, "step": 3465 }, { "epoch": 2.134727776068902, "grad_norm": 0.2470703125, "learning_rate": 3.944444444444445e-05, "loss": 0.0447, "step": 3470 }, { "epoch": 2.1378037526914797, "grad_norm": 0.255859375, "learning_rate": 3.942901234567901e-05, "loss": 0.0482, "step": 3475 }, { "epoch": 2.140879729314057, "grad_norm": 0.234375, "learning_rate": 3.9413580246913584e-05, "loss": 0.0439, "step": 3480 }, { "epoch": 2.143955705936635, "grad_norm": 0.2412109375, "learning_rate": 3.939814814814815e-05, "loss": 0.0453, "step": 3485 }, { "epoch": 2.1470316825592124, "grad_norm": 0.2138671875, "learning_rate": 3.938271604938272e-05, "loss": 0.0392, "step": 3490 }, { "epoch": 2.1501076591817903, "grad_norm": 0.232421875, "learning_rate": 3.936728395061728e-05, "loss": 0.0443, "step": 3495 }, { "epoch": 2.1531836358043677, "grad_norm": 0.2421875, "learning_rate": 3.935185185185186e-05, "loss": 0.044, "step": 3500 }, { "epoch": 2.1562596124269455, "grad_norm": 0.25390625, "learning_rate": 3.9336419753086426e-05, "loss": 0.047, "step": 3505 }, { "epoch": 2.1593355890495234, "grad_norm": 0.263671875, "learning_rate": 3.932098765432099e-05, "loss": 0.0434, "step": 3510 }, { "epoch": 2.162411565672101, "grad_norm": 0.2314453125, "learning_rate": 3.9305555555555556e-05, "loss": 0.0465, "step": 3515 }, { "epoch": 2.1654875422946787, "grad_norm": 0.185546875, "learning_rate": 3.9290123456790125e-05, "loss": 0.0396, "step": 3520 }, { "epoch": 2.168563518917256, "grad_norm": 0.22265625, "learning_rate": 3.927469135802469e-05, "loss": 0.0432, "step": 3525 }, { "epoch": 2.171639495539834, "grad_norm": 0.1923828125, "learning_rate": 3.925925925925926e-05, "loss": 0.0455, "step": 3530 }, { "epoch": 2.1747154721624113, "grad_norm": 0.25390625, "learning_rate": 3.924382716049383e-05, "loss": 0.0453, "step": 3535 }, { "epoch": 2.177791448784989, "grad_norm": 0.25, "learning_rate": 3.92283950617284e-05, "loss": 0.0445, "step": 3540 }, { "epoch": 2.180867425407567, "grad_norm": 0.228515625, "learning_rate": 3.921296296296296e-05, "loss": 0.0445, "step": 3545 }, { "epoch": 2.1839434020301445, "grad_norm": 0.2275390625, "learning_rate": 3.9197530864197535e-05, "loss": 0.0459, "step": 3550 }, { "epoch": 2.1870193786527223, "grad_norm": 0.2197265625, "learning_rate": 3.9182098765432104e-05, "loss": 0.0426, "step": 3555 }, { "epoch": 2.1900953552752997, "grad_norm": 0.240234375, "learning_rate": 3.9166666666666665e-05, "loss": 0.0459, "step": 3560 }, { "epoch": 2.1931713318978776, "grad_norm": 0.244140625, "learning_rate": 3.9151234567901234e-05, "loss": 0.0452, "step": 3565 }, { "epoch": 2.196247308520455, "grad_norm": 0.259765625, "learning_rate": 3.91358024691358e-05, "loss": 0.0465, "step": 3570 }, { "epoch": 2.199323285143033, "grad_norm": 0.263671875, "learning_rate": 3.912037037037037e-05, "loss": 0.0458, "step": 3575 }, { "epoch": 2.2023992617656107, "grad_norm": 0.23046875, "learning_rate": 3.910493827160494e-05, "loss": 0.0442, "step": 3580 }, { "epoch": 2.205475238388188, "grad_norm": 0.236328125, "learning_rate": 3.908950617283951e-05, "loss": 0.0445, "step": 3585 }, { "epoch": 2.208551215010766, "grad_norm": 0.275390625, "learning_rate": 3.9074074074074076e-05, "loss": 0.0478, "step": 3590 }, { "epoch": 2.2116271916333434, "grad_norm": 0.2431640625, "learning_rate": 3.9058641975308645e-05, "loss": 0.0415, "step": 3595 }, { "epoch": 2.2147031682559213, "grad_norm": 0.2412109375, "learning_rate": 3.904320987654321e-05, "loss": 0.0439, "step": 3600 }, { "epoch": 2.2177791448784987, "grad_norm": 0.2734375, "learning_rate": 3.902777777777778e-05, "loss": 0.0459, "step": 3605 }, { "epoch": 2.2208551215010766, "grad_norm": 0.2177734375, "learning_rate": 3.901234567901234e-05, "loss": 0.0411, "step": 3610 }, { "epoch": 2.2239310981236544, "grad_norm": 0.193359375, "learning_rate": 3.899691358024691e-05, "loss": 0.0404, "step": 3615 }, { "epoch": 2.227007074746232, "grad_norm": 0.2236328125, "learning_rate": 3.898148148148149e-05, "loss": 0.0399, "step": 3620 }, { "epoch": 2.2300830513688097, "grad_norm": 0.220703125, "learning_rate": 3.8966049382716055e-05, "loss": 0.044, "step": 3625 }, { "epoch": 2.233159027991387, "grad_norm": 0.255859375, "learning_rate": 3.895061728395062e-05, "loss": 0.0429, "step": 3630 }, { "epoch": 2.236235004613965, "grad_norm": 0.2255859375, "learning_rate": 3.8935185185185185e-05, "loss": 0.0443, "step": 3635 }, { "epoch": 2.2393109812365424, "grad_norm": 0.2080078125, "learning_rate": 3.8919753086419754e-05, "loss": 0.0405, "step": 3640 }, { "epoch": 2.2423869578591202, "grad_norm": 0.259765625, "learning_rate": 3.890432098765432e-05, "loss": 0.049, "step": 3645 }, { "epoch": 2.245462934481698, "grad_norm": 0.232421875, "learning_rate": 3.888888888888889e-05, "loss": 0.0425, "step": 3650 }, { "epoch": 2.2485389111042755, "grad_norm": 0.228515625, "learning_rate": 3.887345679012346e-05, "loss": 0.0426, "step": 3655 }, { "epoch": 2.2516148877268534, "grad_norm": 0.2490234375, "learning_rate": 3.885802469135803e-05, "loss": 0.0415, "step": 3660 }, { "epoch": 2.254690864349431, "grad_norm": 0.21484375, "learning_rate": 3.8842592592592596e-05, "loss": 0.0395, "step": 3665 }, { "epoch": 2.2577668409720086, "grad_norm": 0.27734375, "learning_rate": 3.8827160493827165e-05, "loss": 0.0465, "step": 3670 }, { "epoch": 2.260842817594586, "grad_norm": 0.23828125, "learning_rate": 3.881172839506173e-05, "loss": 0.0415, "step": 3675 }, { "epoch": 2.263918794217164, "grad_norm": 0.248046875, "learning_rate": 3.8796296296296295e-05, "loss": 0.0483, "step": 3680 }, { "epoch": 2.2669947708397418, "grad_norm": 0.2255859375, "learning_rate": 3.878086419753086e-05, "loss": 0.0433, "step": 3685 }, { "epoch": 2.270070747462319, "grad_norm": 0.265625, "learning_rate": 3.876543209876544e-05, "loss": 0.0489, "step": 3690 }, { "epoch": 2.273146724084897, "grad_norm": 0.267578125, "learning_rate": 3.875e-05, "loss": 0.0429, "step": 3695 }, { "epoch": 2.2762227007074745, "grad_norm": 0.2294921875, "learning_rate": 3.873456790123457e-05, "loss": 0.0459, "step": 3700 }, { "epoch": 2.2792986773300523, "grad_norm": 0.2578125, "learning_rate": 3.871913580246914e-05, "loss": 0.0458, "step": 3705 }, { "epoch": 2.2823746539526297, "grad_norm": 0.2265625, "learning_rate": 3.8703703703703705e-05, "loss": 0.0453, "step": 3710 }, { "epoch": 2.2854506305752076, "grad_norm": 0.2578125, "learning_rate": 3.8688271604938274e-05, "loss": 0.0425, "step": 3715 }, { "epoch": 2.2885266071977854, "grad_norm": 0.248046875, "learning_rate": 3.867283950617284e-05, "loss": 0.0446, "step": 3720 }, { "epoch": 2.291602583820363, "grad_norm": 0.228515625, "learning_rate": 3.865740740740741e-05, "loss": 0.0415, "step": 3725 }, { "epoch": 2.2946785604429407, "grad_norm": 0.2158203125, "learning_rate": 3.864197530864197e-05, "loss": 0.0462, "step": 3730 }, { "epoch": 2.297754537065518, "grad_norm": 0.2158203125, "learning_rate": 3.862654320987654e-05, "loss": 0.0436, "step": 3735 }, { "epoch": 2.300830513688096, "grad_norm": 0.203125, "learning_rate": 3.8611111111111116e-05, "loss": 0.0455, "step": 3740 }, { "epoch": 2.3039064903106734, "grad_norm": 0.263671875, "learning_rate": 3.8595679012345684e-05, "loss": 0.0445, "step": 3745 }, { "epoch": 2.3069824669332513, "grad_norm": 0.240234375, "learning_rate": 3.8580246913580246e-05, "loss": 0.0462, "step": 3750 }, { "epoch": 2.310058443555829, "grad_norm": 0.3046875, "learning_rate": 3.8564814814814815e-05, "loss": 0.0513, "step": 3755 }, { "epoch": 2.3131344201784065, "grad_norm": 0.255859375, "learning_rate": 3.854938271604939e-05, "loss": 0.0444, "step": 3760 }, { "epoch": 2.3162103968009844, "grad_norm": 0.25, "learning_rate": 3.853395061728395e-05, "loss": 0.0435, "step": 3765 }, { "epoch": 2.319286373423562, "grad_norm": 0.22265625, "learning_rate": 3.851851851851852e-05, "loss": 0.0456, "step": 3770 }, { "epoch": 2.3223623500461397, "grad_norm": 0.2236328125, "learning_rate": 3.850308641975309e-05, "loss": 0.0439, "step": 3775 }, { "epoch": 2.325438326668717, "grad_norm": 0.224609375, "learning_rate": 3.848765432098766e-05, "loss": 0.0423, "step": 3780 }, { "epoch": 2.328514303291295, "grad_norm": 0.2060546875, "learning_rate": 3.8472222222222225e-05, "loss": 0.0391, "step": 3785 }, { "epoch": 2.331590279913873, "grad_norm": 0.240234375, "learning_rate": 3.8456790123456794e-05, "loss": 0.0376, "step": 3790 }, { "epoch": 2.33466625653645, "grad_norm": 0.189453125, "learning_rate": 3.844135802469136e-05, "loss": 0.0416, "step": 3795 }, { "epoch": 2.337742233159028, "grad_norm": 0.2412109375, "learning_rate": 3.8425925925925924e-05, "loss": 0.0468, "step": 3800 }, { "epoch": 2.3408182097816055, "grad_norm": 0.2275390625, "learning_rate": 3.841049382716049e-05, "loss": 0.043, "step": 3805 }, { "epoch": 2.3438941864041833, "grad_norm": 0.220703125, "learning_rate": 3.839506172839507e-05, "loss": 0.0454, "step": 3810 }, { "epoch": 2.3469701630267608, "grad_norm": 0.26953125, "learning_rate": 3.837962962962963e-05, "loss": 0.0473, "step": 3815 }, { "epoch": 2.3500461396493386, "grad_norm": 0.2294921875, "learning_rate": 3.83641975308642e-05, "loss": 0.0441, "step": 3820 }, { "epoch": 2.3531221162719165, "grad_norm": 0.251953125, "learning_rate": 3.8348765432098766e-05, "loss": 0.0478, "step": 3825 }, { "epoch": 2.356198092894494, "grad_norm": 0.23046875, "learning_rate": 3.8333333333333334e-05, "loss": 0.0426, "step": 3830 }, { "epoch": 2.3592740695170717, "grad_norm": 0.2275390625, "learning_rate": 3.83179012345679e-05, "loss": 0.0444, "step": 3835 }, { "epoch": 2.362350046139649, "grad_norm": 0.23828125, "learning_rate": 3.830246913580247e-05, "loss": 0.0462, "step": 3840 }, { "epoch": 2.365426022762227, "grad_norm": 0.2490234375, "learning_rate": 3.828703703703704e-05, "loss": 0.0411, "step": 3845 }, { "epoch": 2.3685019993848044, "grad_norm": 0.2265625, "learning_rate": 3.82716049382716e-05, "loss": 0.0402, "step": 3850 }, { "epoch": 2.3715779760073823, "grad_norm": 0.251953125, "learning_rate": 3.825617283950618e-05, "loss": 0.0475, "step": 3855 }, { "epoch": 2.37465395262996, "grad_norm": 0.228515625, "learning_rate": 3.8240740740740745e-05, "loss": 0.0459, "step": 3860 }, { "epoch": 2.3777299292525376, "grad_norm": 0.263671875, "learning_rate": 3.8225308641975314e-05, "loss": 0.043, "step": 3865 }, { "epoch": 2.3808059058751154, "grad_norm": 0.2412109375, "learning_rate": 3.8209876543209875e-05, "loss": 0.0448, "step": 3870 }, { "epoch": 2.383881882497693, "grad_norm": 0.265625, "learning_rate": 3.8194444444444444e-05, "loss": 0.0444, "step": 3875 }, { "epoch": 2.3869578591202707, "grad_norm": 0.23828125, "learning_rate": 3.817901234567902e-05, "loss": 0.0435, "step": 3880 }, { "epoch": 2.390033835742848, "grad_norm": 0.2080078125, "learning_rate": 3.816358024691358e-05, "loss": 0.04, "step": 3885 }, { "epoch": 2.393109812365426, "grad_norm": 0.2314453125, "learning_rate": 3.814814814814815e-05, "loss": 0.0432, "step": 3890 }, { "epoch": 2.396185788988004, "grad_norm": 0.2060546875, "learning_rate": 3.813271604938272e-05, "loss": 0.0417, "step": 3895 }, { "epoch": 2.3992617656105812, "grad_norm": 0.2490234375, "learning_rate": 3.8117283950617286e-05, "loss": 0.0486, "step": 3900 }, { "epoch": 2.402337742233159, "grad_norm": 0.2421875, "learning_rate": 3.8101851851851854e-05, "loss": 0.0452, "step": 3905 }, { "epoch": 2.4054137188557365, "grad_norm": 0.2490234375, "learning_rate": 3.808641975308642e-05, "loss": 0.0463, "step": 3910 }, { "epoch": 2.4084896954783144, "grad_norm": 0.2353515625, "learning_rate": 3.807098765432099e-05, "loss": 0.0422, "step": 3915 }, { "epoch": 2.411565672100892, "grad_norm": 0.232421875, "learning_rate": 3.805555555555555e-05, "loss": 0.0403, "step": 3920 }, { "epoch": 2.4146416487234696, "grad_norm": 0.2314453125, "learning_rate": 3.804012345679013e-05, "loss": 0.041, "step": 3925 }, { "epoch": 2.4177176253460475, "grad_norm": 0.23046875, "learning_rate": 3.80246913580247e-05, "loss": 0.0397, "step": 3930 }, { "epoch": 2.420793601968625, "grad_norm": 0.2353515625, "learning_rate": 3.800925925925926e-05, "loss": 0.0471, "step": 3935 }, { "epoch": 2.423869578591203, "grad_norm": 0.2119140625, "learning_rate": 3.799382716049383e-05, "loss": 0.0433, "step": 3940 }, { "epoch": 2.42694555521378, "grad_norm": 0.232421875, "learning_rate": 3.7978395061728395e-05, "loss": 0.043, "step": 3945 }, { "epoch": 2.430021531836358, "grad_norm": 0.23046875, "learning_rate": 3.7962962962962964e-05, "loss": 0.0415, "step": 3950 }, { "epoch": 2.4330975084589355, "grad_norm": 0.2255859375, "learning_rate": 3.794753086419753e-05, "loss": 0.0449, "step": 3955 }, { "epoch": 2.4361734850815133, "grad_norm": 0.2177734375, "learning_rate": 3.79320987654321e-05, "loss": 0.0439, "step": 3960 }, { "epoch": 2.439249461704091, "grad_norm": 0.2373046875, "learning_rate": 3.791666666666667e-05, "loss": 0.042, "step": 3965 }, { "epoch": 2.4423254383266686, "grad_norm": 0.2236328125, "learning_rate": 3.790123456790123e-05, "loss": 0.0425, "step": 3970 }, { "epoch": 2.4454014149492465, "grad_norm": 0.2197265625, "learning_rate": 3.7885802469135806e-05, "loss": 0.0419, "step": 3975 }, { "epoch": 2.448477391571824, "grad_norm": 0.259765625, "learning_rate": 3.7870370370370374e-05, "loss": 0.0471, "step": 3980 }, { "epoch": 2.4515533681944017, "grad_norm": 0.2109375, "learning_rate": 3.785493827160494e-05, "loss": 0.0407, "step": 3985 }, { "epoch": 2.454629344816979, "grad_norm": 0.244140625, "learning_rate": 3.7839506172839504e-05, "loss": 0.0434, "step": 3990 }, { "epoch": 2.457705321439557, "grad_norm": 0.2373046875, "learning_rate": 3.782407407407408e-05, "loss": 0.0458, "step": 3995 }, { "epoch": 2.460781298062135, "grad_norm": 0.18359375, "learning_rate": 3.780864197530865e-05, "loss": 0.0431, "step": 4000 }, { "epoch": 2.4638572746847123, "grad_norm": 0.2099609375, "learning_rate": 3.779320987654321e-05, "loss": 0.0445, "step": 4005 }, { "epoch": 2.46693325130729, "grad_norm": 0.2236328125, "learning_rate": 3.777777777777778e-05, "loss": 0.0452, "step": 4010 }, { "epoch": 2.4700092279298675, "grad_norm": 0.220703125, "learning_rate": 3.776234567901235e-05, "loss": 0.0408, "step": 4015 }, { "epoch": 2.4730852045524454, "grad_norm": 0.232421875, "learning_rate": 3.7746913580246915e-05, "loss": 0.0466, "step": 4020 }, { "epoch": 2.4761611811750233, "grad_norm": 0.2412109375, "learning_rate": 3.7731481481481484e-05, "loss": 0.046, "step": 4025 }, { "epoch": 2.4792371577976007, "grad_norm": 0.2412109375, "learning_rate": 3.771604938271605e-05, "loss": 0.0449, "step": 4030 }, { "epoch": 2.4823131344201785, "grad_norm": 0.28515625, "learning_rate": 3.770061728395062e-05, "loss": 0.0512, "step": 4035 }, { "epoch": 2.485389111042756, "grad_norm": 0.251953125, "learning_rate": 3.768518518518518e-05, "loss": 0.0444, "step": 4040 }, { "epoch": 2.488465087665334, "grad_norm": 0.259765625, "learning_rate": 3.766975308641976e-05, "loss": 0.0488, "step": 4045 }, { "epoch": 2.4915410642879112, "grad_norm": 0.302734375, "learning_rate": 3.7654320987654326e-05, "loss": 0.0505, "step": 4050 }, { "epoch": 2.494617040910489, "grad_norm": 0.25390625, "learning_rate": 3.763888888888889e-05, "loss": 0.0444, "step": 4055 }, { "epoch": 2.497693017533067, "grad_norm": 0.275390625, "learning_rate": 3.7623456790123456e-05, "loss": 0.05, "step": 4060 }, { "epoch": 2.5007689941556444, "grad_norm": 0.232421875, "learning_rate": 3.760802469135803e-05, "loss": 0.0423, "step": 4065 }, { "epoch": 2.503844970778222, "grad_norm": 0.2275390625, "learning_rate": 3.759259259259259e-05, "loss": 0.0427, "step": 4070 }, { "epoch": 2.5069209474007996, "grad_norm": 0.2392578125, "learning_rate": 3.757716049382716e-05, "loss": 0.0456, "step": 4075 }, { "epoch": 2.5099969240233775, "grad_norm": 0.2236328125, "learning_rate": 3.756172839506173e-05, "loss": 0.0405, "step": 4080 }, { "epoch": 2.5130729006459553, "grad_norm": 0.22265625, "learning_rate": 3.75462962962963e-05, "loss": 0.0436, "step": 4085 }, { "epoch": 2.5161488772685328, "grad_norm": 0.2470703125, "learning_rate": 3.7530864197530867e-05, "loss": 0.0434, "step": 4090 }, { "epoch": 2.51922485389111, "grad_norm": 0.236328125, "learning_rate": 3.7515432098765435e-05, "loss": 0.0481, "step": 4095 }, { "epoch": 2.522300830513688, "grad_norm": 0.236328125, "learning_rate": 3.7500000000000003e-05, "loss": 0.0454, "step": 4100 }, { "epoch": 2.525376807136266, "grad_norm": 0.2578125, "learning_rate": 3.7484567901234565e-05, "loss": 0.0467, "step": 4105 }, { "epoch": 2.5284527837588433, "grad_norm": 0.21484375, "learning_rate": 3.7469135802469134e-05, "loss": 0.0402, "step": 4110 }, { "epoch": 2.531528760381421, "grad_norm": 0.275390625, "learning_rate": 3.745370370370371e-05, "loss": 0.049, "step": 4115 }, { "epoch": 2.534604737003999, "grad_norm": 0.25390625, "learning_rate": 3.743827160493828e-05, "loss": 0.0445, "step": 4120 }, { "epoch": 2.5376807136265764, "grad_norm": 0.251953125, "learning_rate": 3.742283950617284e-05, "loss": 0.0451, "step": 4125 }, { "epoch": 2.540756690249154, "grad_norm": 0.29296875, "learning_rate": 3.740740740740741e-05, "loss": 0.0453, "step": 4130 }, { "epoch": 2.5438326668717317, "grad_norm": 0.88671875, "learning_rate": 3.7391975308641976e-05, "loss": 0.0462, "step": 4135 }, { "epoch": 2.5469086434943096, "grad_norm": 0.26171875, "learning_rate": 3.7376543209876544e-05, "loss": 0.046, "step": 4140 }, { "epoch": 2.549984620116887, "grad_norm": 0.232421875, "learning_rate": 3.736111111111111e-05, "loss": 0.0476, "step": 4145 }, { "epoch": 2.553060596739465, "grad_norm": 0.25390625, "learning_rate": 3.734567901234568e-05, "loss": 0.0481, "step": 4150 }, { "epoch": 2.5561365733620427, "grad_norm": 0.2431640625, "learning_rate": 3.733024691358025e-05, "loss": 0.0492, "step": 4155 }, { "epoch": 2.55921254998462, "grad_norm": 0.2451171875, "learning_rate": 3.731481481481482e-05, "loss": 0.0455, "step": 4160 }, { "epoch": 2.5622885266071975, "grad_norm": 0.2431640625, "learning_rate": 3.7299382716049387e-05, "loss": 0.0425, "step": 4165 }, { "epoch": 2.5653645032297754, "grad_norm": 0.259765625, "learning_rate": 3.7283950617283955e-05, "loss": 0.0485, "step": 4170 }, { "epoch": 2.5684404798523532, "grad_norm": 0.267578125, "learning_rate": 3.726851851851852e-05, "loss": 0.0448, "step": 4175 }, { "epoch": 2.5715164564749307, "grad_norm": 0.2119140625, "learning_rate": 3.7253086419753085e-05, "loss": 0.0441, "step": 4180 }, { "epoch": 2.5745924330975085, "grad_norm": 0.224609375, "learning_rate": 3.723765432098766e-05, "loss": 0.0455, "step": 4185 }, { "epoch": 2.5776684097200864, "grad_norm": 0.263671875, "learning_rate": 3.722222222222222e-05, "loss": 0.047, "step": 4190 }, { "epoch": 2.580744386342664, "grad_norm": 0.220703125, "learning_rate": 3.720679012345679e-05, "loss": 0.0507, "step": 4195 }, { "epoch": 2.583820362965241, "grad_norm": 0.255859375, "learning_rate": 3.719135802469136e-05, "loss": 0.0442, "step": 4200 }, { "epoch": 2.586896339587819, "grad_norm": 0.244140625, "learning_rate": 3.717592592592593e-05, "loss": 0.0459, "step": 4205 }, { "epoch": 2.589972316210397, "grad_norm": 0.2314453125, "learning_rate": 3.7160493827160496e-05, "loss": 0.0451, "step": 4210 }, { "epoch": 2.5930482928329743, "grad_norm": 0.216796875, "learning_rate": 3.7145061728395064e-05, "loss": 0.047, "step": 4215 }, { "epoch": 2.596124269455552, "grad_norm": 0.2412109375, "learning_rate": 3.712962962962963e-05, "loss": 0.0461, "step": 4220 }, { "epoch": 2.59920024607813, "grad_norm": 0.212890625, "learning_rate": 3.7114197530864194e-05, "loss": 0.0436, "step": 4225 }, { "epoch": 2.6022762227007075, "grad_norm": 0.2421875, "learning_rate": 3.709876543209877e-05, "loss": 0.0476, "step": 4230 }, { "epoch": 2.605352199323285, "grad_norm": 0.265625, "learning_rate": 3.708333333333334e-05, "loss": 0.0431, "step": 4235 }, { "epoch": 2.6084281759458627, "grad_norm": 0.265625, "learning_rate": 3.7067901234567906e-05, "loss": 0.0442, "step": 4240 }, { "epoch": 2.6115041525684406, "grad_norm": 0.2138671875, "learning_rate": 3.705246913580247e-05, "loss": 0.0433, "step": 4245 }, { "epoch": 2.614580129191018, "grad_norm": 0.228515625, "learning_rate": 3.7037037037037037e-05, "loss": 0.0457, "step": 4250 }, { "epoch": 2.617656105813596, "grad_norm": 0.2421875, "learning_rate": 3.702160493827161e-05, "loss": 0.0441, "step": 4255 }, { "epoch": 2.6207320824361737, "grad_norm": 0.2890625, "learning_rate": 3.7006172839506173e-05, "loss": 0.0454, "step": 4260 }, { "epoch": 2.623808059058751, "grad_norm": 0.21875, "learning_rate": 3.699074074074074e-05, "loss": 0.0425, "step": 4265 }, { "epoch": 2.6268840356813286, "grad_norm": 0.2216796875, "learning_rate": 3.697530864197531e-05, "loss": 0.043, "step": 4270 }, { "epoch": 2.6299600123039064, "grad_norm": 0.21875, "learning_rate": 3.695987654320988e-05, "loss": 0.0443, "step": 4275 }, { "epoch": 2.6330359889264843, "grad_norm": 0.236328125, "learning_rate": 3.694444444444445e-05, "loss": 0.0487, "step": 4280 }, { "epoch": 2.6361119655490617, "grad_norm": 0.21875, "learning_rate": 3.6929012345679016e-05, "loss": 0.0459, "step": 4285 }, { "epoch": 2.6391879421716395, "grad_norm": 0.2734375, "learning_rate": 3.6913580246913584e-05, "loss": 0.0452, "step": 4290 }, { "epoch": 2.6422639187942174, "grad_norm": 0.2470703125, "learning_rate": 3.6898148148148146e-05, "loss": 0.0473, "step": 4295 }, { "epoch": 2.645339895416795, "grad_norm": 0.2578125, "learning_rate": 3.6882716049382714e-05, "loss": 0.0498, "step": 4300 }, { "epoch": 2.6484158720393722, "grad_norm": 0.2216796875, "learning_rate": 3.686728395061729e-05, "loss": 0.0434, "step": 4305 }, { "epoch": 2.65149184866195, "grad_norm": 0.2177734375, "learning_rate": 3.685185185185185e-05, "loss": 0.0441, "step": 4310 }, { "epoch": 2.654567825284528, "grad_norm": 0.251953125, "learning_rate": 3.683641975308642e-05, "loss": 0.0476, "step": 4315 }, { "epoch": 2.6576438019071054, "grad_norm": 0.2412109375, "learning_rate": 3.682098765432099e-05, "loss": 0.0425, "step": 4320 }, { "epoch": 2.660719778529683, "grad_norm": 0.263671875, "learning_rate": 3.6805555555555556e-05, "loss": 0.0496, "step": 4325 }, { "epoch": 2.663795755152261, "grad_norm": 0.2412109375, "learning_rate": 3.6790123456790125e-05, "loss": 0.0439, "step": 4330 }, { "epoch": 2.6668717317748385, "grad_norm": 0.2451171875, "learning_rate": 3.677469135802469e-05, "loss": 0.0453, "step": 4335 }, { "epoch": 2.669947708397416, "grad_norm": 0.25390625, "learning_rate": 3.675925925925926e-05, "loss": 0.0469, "step": 4340 }, { "epoch": 2.6730236850199938, "grad_norm": 0.2392578125, "learning_rate": 3.6743827160493823e-05, "loss": 0.0459, "step": 4345 }, { "epoch": 2.6760996616425716, "grad_norm": 0.2470703125, "learning_rate": 3.67283950617284e-05, "loss": 0.0471, "step": 4350 }, { "epoch": 2.679175638265149, "grad_norm": 0.265625, "learning_rate": 3.671296296296297e-05, "loss": 0.046, "step": 4355 }, { "epoch": 2.682251614887727, "grad_norm": 0.212890625, "learning_rate": 3.6697530864197536e-05, "loss": 0.0432, "step": 4360 }, { "epoch": 2.6853275915103048, "grad_norm": 0.236328125, "learning_rate": 3.66820987654321e-05, "loss": 0.046, "step": 4365 }, { "epoch": 2.688403568132882, "grad_norm": 0.236328125, "learning_rate": 3.6666666666666666e-05, "loss": 0.0438, "step": 4370 }, { "epoch": 2.6914795447554596, "grad_norm": 0.2236328125, "learning_rate": 3.665123456790124e-05, "loss": 0.0492, "step": 4375 }, { "epoch": 2.6945555213780374, "grad_norm": 0.234375, "learning_rate": 3.66358024691358e-05, "loss": 0.0445, "step": 4380 }, { "epoch": 2.6976314980006153, "grad_norm": 0.2490234375, "learning_rate": 3.662037037037037e-05, "loss": 0.0436, "step": 4385 }, { "epoch": 2.7007074746231927, "grad_norm": 0.224609375, "learning_rate": 3.660493827160494e-05, "loss": 0.043, "step": 4390 }, { "epoch": 2.7037834512457706, "grad_norm": 0.26171875, "learning_rate": 3.658950617283951e-05, "loss": 0.0402, "step": 4395 }, { "epoch": 2.7068594278683484, "grad_norm": 0.248046875, "learning_rate": 3.6574074074074076e-05, "loss": 0.0456, "step": 4400 }, { "epoch": 2.709935404490926, "grad_norm": 0.2119140625, "learning_rate": 3.6558641975308645e-05, "loss": 0.0491, "step": 4405 }, { "epoch": 2.7130113811135033, "grad_norm": 0.212890625, "learning_rate": 3.654320987654321e-05, "loss": 0.0441, "step": 4410 }, { "epoch": 2.716087357736081, "grad_norm": 0.2255859375, "learning_rate": 3.6527777777777775e-05, "loss": 0.0413, "step": 4415 }, { "epoch": 2.719163334358659, "grad_norm": 0.240234375, "learning_rate": 3.651234567901235e-05, "loss": 0.0462, "step": 4420 }, { "epoch": 2.7222393109812364, "grad_norm": 0.2333984375, "learning_rate": 3.649691358024692e-05, "loss": 0.0417, "step": 4425 }, { "epoch": 2.7253152876038143, "grad_norm": 0.240234375, "learning_rate": 3.648148148148148e-05, "loss": 0.0422, "step": 4430 }, { "epoch": 2.728391264226392, "grad_norm": 0.236328125, "learning_rate": 3.646604938271605e-05, "loss": 0.045, "step": 4435 }, { "epoch": 2.7314672408489695, "grad_norm": 0.255859375, "learning_rate": 3.645061728395062e-05, "loss": 0.0453, "step": 4440 }, { "epoch": 2.7345432174715474, "grad_norm": 0.2392578125, "learning_rate": 3.6435185185185186e-05, "loss": 0.0451, "step": 4445 }, { "epoch": 2.737619194094125, "grad_norm": 0.2216796875, "learning_rate": 3.6419753086419754e-05, "loss": 0.0452, "step": 4450 }, { "epoch": 2.7406951707167027, "grad_norm": 0.255859375, "learning_rate": 3.640432098765432e-05, "loss": 0.0529, "step": 4455 }, { "epoch": 2.74377114733928, "grad_norm": 0.240234375, "learning_rate": 3.638888888888889e-05, "loss": 0.0427, "step": 4460 }, { "epoch": 2.746847123961858, "grad_norm": 0.2333984375, "learning_rate": 3.637345679012346e-05, "loss": 0.0435, "step": 4465 }, { "epoch": 2.749923100584436, "grad_norm": 0.25, "learning_rate": 3.635802469135803e-05, "loss": 0.0429, "step": 4470 }, { "epoch": 2.752999077207013, "grad_norm": 0.2451171875, "learning_rate": 3.6342592592592596e-05, "loss": 0.047, "step": 4475 }, { "epoch": 2.756075053829591, "grad_norm": 0.22265625, "learning_rate": 3.6327160493827165e-05, "loss": 0.0468, "step": 4480 }, { "epoch": 2.7591510304521685, "grad_norm": 0.234375, "learning_rate": 3.6311728395061726e-05, "loss": 0.0479, "step": 4485 }, { "epoch": 2.7622270070747463, "grad_norm": 0.22265625, "learning_rate": 3.62962962962963e-05, "loss": 0.0435, "step": 4490 }, { "epoch": 2.7653029836973237, "grad_norm": 0.21484375, "learning_rate": 3.628086419753087e-05, "loss": 0.044, "step": 4495 }, { "epoch": 2.7683789603199016, "grad_norm": 0.25390625, "learning_rate": 3.626543209876543e-05, "loss": 0.0434, "step": 4500 }, { "epoch": 2.7714549369424795, "grad_norm": 0.234375, "learning_rate": 3.625e-05, "loss": 0.0473, "step": 4505 }, { "epoch": 2.774530913565057, "grad_norm": 0.25, "learning_rate": 3.623456790123457e-05, "loss": 0.0486, "step": 4510 }, { "epoch": 2.7776068901876347, "grad_norm": 0.2333984375, "learning_rate": 3.621913580246914e-05, "loss": 0.0464, "step": 4515 }, { "epoch": 2.780682866810212, "grad_norm": 0.26171875, "learning_rate": 3.6203703703703706e-05, "loss": 0.0486, "step": 4520 }, { "epoch": 2.78375884343279, "grad_norm": 0.2412109375, "learning_rate": 3.6188271604938274e-05, "loss": 0.0456, "step": 4525 }, { "epoch": 2.7868348200553674, "grad_norm": 0.267578125, "learning_rate": 3.617283950617284e-05, "loss": 0.0454, "step": 4530 }, { "epoch": 2.7899107966779453, "grad_norm": 0.2470703125, "learning_rate": 3.6157407407407404e-05, "loss": 0.045, "step": 4535 }, { "epoch": 2.792986773300523, "grad_norm": 0.2001953125, "learning_rate": 3.614197530864198e-05, "loss": 0.0466, "step": 4540 }, { "epoch": 2.7960627499231006, "grad_norm": 0.251953125, "learning_rate": 3.612654320987655e-05, "loss": 0.0469, "step": 4545 }, { "epoch": 2.7991387265456784, "grad_norm": 0.2392578125, "learning_rate": 3.611111111111111e-05, "loss": 0.0439, "step": 4550 }, { "epoch": 2.802214703168256, "grad_norm": 0.212890625, "learning_rate": 3.609567901234568e-05, "loss": 0.0469, "step": 4555 }, { "epoch": 2.8052906797908337, "grad_norm": 0.23828125, "learning_rate": 3.608024691358025e-05, "loss": 0.0518, "step": 4560 }, { "epoch": 2.808366656413411, "grad_norm": 0.224609375, "learning_rate": 3.6064814814814815e-05, "loss": 0.0425, "step": 4565 }, { "epoch": 2.811442633035989, "grad_norm": 0.2138671875, "learning_rate": 3.604938271604938e-05, "loss": 0.0463, "step": 4570 }, { "epoch": 2.814518609658567, "grad_norm": 0.2080078125, "learning_rate": 3.603395061728395e-05, "loss": 0.0429, "step": 4575 }, { "epoch": 2.8175945862811442, "grad_norm": 0.2314453125, "learning_rate": 3.601851851851852e-05, "loss": 0.0438, "step": 4580 }, { "epoch": 2.820670562903722, "grad_norm": 0.259765625, "learning_rate": 3.600308641975309e-05, "loss": 0.0494, "step": 4585 }, { "epoch": 2.8237465395262995, "grad_norm": 0.2470703125, "learning_rate": 3.598765432098766e-05, "loss": 0.0425, "step": 4590 }, { "epoch": 2.8268225161488774, "grad_norm": 0.251953125, "learning_rate": 3.5972222222222225e-05, "loss": 0.0467, "step": 4595 }, { "epoch": 2.8298984927714548, "grad_norm": 0.2236328125, "learning_rate": 3.5956790123456794e-05, "loss": 0.0439, "step": 4600 }, { "epoch": 2.8329744693940326, "grad_norm": 0.2734375, "learning_rate": 3.5941358024691356e-05, "loss": 0.0485, "step": 4605 }, { "epoch": 2.8360504460166105, "grad_norm": 0.25, "learning_rate": 3.592592592592593e-05, "loss": 0.0467, "step": 4610 }, { "epoch": 2.839126422639188, "grad_norm": 0.228515625, "learning_rate": 3.59104938271605e-05, "loss": 0.0452, "step": 4615 }, { "epoch": 2.8422023992617658, "grad_norm": 0.2275390625, "learning_rate": 3.589506172839506e-05, "loss": 0.046, "step": 4620 }, { "epoch": 2.845278375884343, "grad_norm": 0.236328125, "learning_rate": 3.587962962962963e-05, "loss": 0.0433, "step": 4625 }, { "epoch": 2.848354352506921, "grad_norm": 0.255859375, "learning_rate": 3.5864197530864205e-05, "loss": 0.043, "step": 4630 }, { "epoch": 2.8514303291294985, "grad_norm": 0.2041015625, "learning_rate": 3.5848765432098766e-05, "loss": 0.0394, "step": 4635 }, { "epoch": 2.8545063057520763, "grad_norm": 0.271484375, "learning_rate": 3.5833333333333335e-05, "loss": 0.0468, "step": 4640 }, { "epoch": 2.857582282374654, "grad_norm": 0.259765625, "learning_rate": 3.58179012345679e-05, "loss": 0.0464, "step": 4645 }, { "epoch": 2.8606582589972316, "grad_norm": 0.228515625, "learning_rate": 3.580246913580247e-05, "loss": 0.0426, "step": 4650 }, { "epoch": 2.8637342356198094, "grad_norm": 0.25390625, "learning_rate": 3.578703703703704e-05, "loss": 0.0469, "step": 4655 }, { "epoch": 2.866810212242387, "grad_norm": 0.265625, "learning_rate": 3.577160493827161e-05, "loss": 0.0421, "step": 4660 }, { "epoch": 2.8698861888649647, "grad_norm": 0.287109375, "learning_rate": 3.575617283950618e-05, "loss": 0.0459, "step": 4665 }, { "epoch": 2.872962165487542, "grad_norm": 0.2421875, "learning_rate": 3.574074074074074e-05, "loss": 0.0467, "step": 4670 }, { "epoch": 2.87603814211012, "grad_norm": 0.2119140625, "learning_rate": 3.572530864197531e-05, "loss": 0.0453, "step": 4675 }, { "epoch": 2.879114118732698, "grad_norm": 0.2333984375, "learning_rate": 3.570987654320988e-05, "loss": 0.0473, "step": 4680 }, { "epoch": 2.8821900953552753, "grad_norm": 0.232421875, "learning_rate": 3.5694444444444444e-05, "loss": 0.044, "step": 4685 }, { "epoch": 2.885266071977853, "grad_norm": 0.2392578125, "learning_rate": 3.567901234567901e-05, "loss": 0.0433, "step": 4690 }, { "epoch": 2.8883420486004305, "grad_norm": 0.2158203125, "learning_rate": 3.566358024691358e-05, "loss": 0.0467, "step": 4695 }, { "epoch": 2.8914180252230084, "grad_norm": 0.2373046875, "learning_rate": 3.564814814814815e-05, "loss": 0.0458, "step": 4700 }, { "epoch": 2.894494001845586, "grad_norm": 0.2578125, "learning_rate": 3.563271604938272e-05, "loss": 0.0452, "step": 4705 }, { "epoch": 2.8975699784681637, "grad_norm": 0.2451171875, "learning_rate": 3.5617283950617286e-05, "loss": 0.0422, "step": 4710 }, { "epoch": 2.9006459550907415, "grad_norm": 0.2197265625, "learning_rate": 3.5601851851851855e-05, "loss": 0.0434, "step": 4715 }, { "epoch": 2.903721931713319, "grad_norm": 0.26171875, "learning_rate": 3.5586419753086416e-05, "loss": 0.0475, "step": 4720 }, { "epoch": 2.906797908335897, "grad_norm": 0.2373046875, "learning_rate": 3.557098765432099e-05, "loss": 0.0454, "step": 4725 }, { "epoch": 2.909873884958474, "grad_norm": 0.26953125, "learning_rate": 3.555555555555556e-05, "loss": 0.0471, "step": 4730 }, { "epoch": 2.912949861581052, "grad_norm": 0.2333984375, "learning_rate": 3.554012345679013e-05, "loss": 0.0457, "step": 4735 }, { "epoch": 2.9160258382036295, "grad_norm": 0.2353515625, "learning_rate": 3.552469135802469e-05, "loss": 0.0442, "step": 4740 }, { "epoch": 2.9191018148262073, "grad_norm": 0.2314453125, "learning_rate": 3.550925925925926e-05, "loss": 0.0445, "step": 4745 }, { "epoch": 2.922177791448785, "grad_norm": 0.2470703125, "learning_rate": 3.5493827160493834e-05, "loss": 0.0475, "step": 4750 }, { "epoch": 2.9252537680713626, "grad_norm": 0.26171875, "learning_rate": 3.5478395061728395e-05, "loss": 0.0481, "step": 4755 }, { "epoch": 2.9283297446939405, "grad_norm": 0.2490234375, "learning_rate": 3.5462962962962964e-05, "loss": 0.0452, "step": 4760 }, { "epoch": 2.931405721316518, "grad_norm": 0.240234375, "learning_rate": 3.544753086419753e-05, "loss": 0.0411, "step": 4765 }, { "epoch": 2.9344816979390957, "grad_norm": 0.263671875, "learning_rate": 3.54320987654321e-05, "loss": 0.0488, "step": 4770 }, { "epoch": 2.937557674561673, "grad_norm": 0.2294921875, "learning_rate": 3.541666666666667e-05, "loss": 0.0464, "step": 4775 }, { "epoch": 2.940633651184251, "grad_norm": 0.224609375, "learning_rate": 3.540123456790124e-05, "loss": 0.0406, "step": 4780 }, { "epoch": 2.943709627806829, "grad_norm": 0.21484375, "learning_rate": 3.5385802469135806e-05, "loss": 0.0482, "step": 4785 }, { "epoch": 2.9467856044294063, "grad_norm": 0.2470703125, "learning_rate": 3.537037037037037e-05, "loss": 0.0501, "step": 4790 }, { "epoch": 2.949861581051984, "grad_norm": 0.24609375, "learning_rate": 3.535493827160494e-05, "loss": 0.0458, "step": 4795 }, { "epoch": 2.9529375576745616, "grad_norm": 0.2236328125, "learning_rate": 3.533950617283951e-05, "loss": 0.0488, "step": 4800 }, { "epoch": 2.9560135342971394, "grad_norm": 0.234375, "learning_rate": 3.532407407407407e-05, "loss": 0.0454, "step": 4805 }, { "epoch": 2.959089510919717, "grad_norm": 0.216796875, "learning_rate": 3.530864197530864e-05, "loss": 0.0435, "step": 4810 }, { "epoch": 2.9621654875422947, "grad_norm": 0.240234375, "learning_rate": 3.529320987654321e-05, "loss": 0.0483, "step": 4815 }, { "epoch": 2.9652414641648726, "grad_norm": 0.25, "learning_rate": 3.527777777777778e-05, "loss": 0.0494, "step": 4820 }, { "epoch": 2.96831744078745, "grad_norm": 0.2255859375, "learning_rate": 3.526234567901235e-05, "loss": 0.0467, "step": 4825 }, { "epoch": 2.971393417410028, "grad_norm": 0.2373046875, "learning_rate": 3.5246913580246915e-05, "loss": 0.044, "step": 4830 }, { "epoch": 2.9744693940326052, "grad_norm": 0.2333984375, "learning_rate": 3.5231481481481484e-05, "loss": 0.0452, "step": 4835 }, { "epoch": 2.977545370655183, "grad_norm": 0.2255859375, "learning_rate": 3.5216049382716045e-05, "loss": 0.046, "step": 4840 }, { "epoch": 2.9806213472777605, "grad_norm": 0.26171875, "learning_rate": 3.520061728395062e-05, "loss": 0.0486, "step": 4845 }, { "epoch": 2.9836973239003384, "grad_norm": 0.29296875, "learning_rate": 3.518518518518519e-05, "loss": 0.0502, "step": 4850 }, { "epoch": 2.9867733005229162, "grad_norm": 0.24609375, "learning_rate": 3.516975308641976e-05, "loss": 0.0478, "step": 4855 }, { "epoch": 2.9898492771454936, "grad_norm": 0.251953125, "learning_rate": 3.515432098765432e-05, "loss": 0.0469, "step": 4860 }, { "epoch": 2.9929252537680715, "grad_norm": 0.228515625, "learning_rate": 3.513888888888889e-05, "loss": 0.0449, "step": 4865 }, { "epoch": 2.996001230390649, "grad_norm": 0.2177734375, "learning_rate": 3.512345679012346e-05, "loss": 0.0431, "step": 4870 }, { "epoch": 2.9990772070132268, "grad_norm": 0.236328125, "learning_rate": 3.5108024691358025e-05, "loss": 0.0466, "step": 4875 }, { "epoch": 3.002153183635804, "grad_norm": 0.208984375, "learning_rate": 3.509259259259259e-05, "loss": 0.0379, "step": 4880 }, { "epoch": 3.005229160258382, "grad_norm": 0.2431640625, "learning_rate": 3.507716049382716e-05, "loss": 0.0378, "step": 4885 }, { "epoch": 3.00830513688096, "grad_norm": 0.357421875, "learning_rate": 3.506172839506173e-05, "loss": 0.0418, "step": 4890 }, { "epoch": 3.0113811135035373, "grad_norm": 0.234375, "learning_rate": 3.50462962962963e-05, "loss": 0.0377, "step": 4895 }, { "epoch": 3.014457090126115, "grad_norm": 0.263671875, "learning_rate": 3.503086419753087e-05, "loss": 0.0345, "step": 4900 }, { "epoch": 3.0175330667486926, "grad_norm": 0.2255859375, "learning_rate": 3.5015432098765435e-05, "loss": 0.0378, "step": 4905 }, { "epoch": 3.0206090433712705, "grad_norm": 0.2412109375, "learning_rate": 3.5e-05, "loss": 0.0374, "step": 4910 }, { "epoch": 3.023685019993848, "grad_norm": 0.2099609375, "learning_rate": 3.498456790123457e-05, "loss": 0.0368, "step": 4915 }, { "epoch": 3.0267609966164257, "grad_norm": 0.21875, "learning_rate": 3.496913580246914e-05, "loss": 0.0342, "step": 4920 }, { "epoch": 3.0298369732390036, "grad_norm": 0.2490234375, "learning_rate": 3.49537037037037e-05, "loss": 0.0399, "step": 4925 }, { "epoch": 3.032912949861581, "grad_norm": 0.2431640625, "learning_rate": 3.493827160493827e-05, "loss": 0.0375, "step": 4930 }, { "epoch": 3.035988926484159, "grad_norm": 0.259765625, "learning_rate": 3.492283950617284e-05, "loss": 0.0376, "step": 4935 }, { "epoch": 3.0390649031067363, "grad_norm": 0.2236328125, "learning_rate": 3.490740740740741e-05, "loss": 0.0358, "step": 4940 }, { "epoch": 3.042140879729314, "grad_norm": 0.23828125, "learning_rate": 3.4891975308641976e-05, "loss": 0.0371, "step": 4945 }, { "epoch": 3.0452168563518915, "grad_norm": 0.248046875, "learning_rate": 3.4876543209876545e-05, "loss": 0.0383, "step": 4950 }, { "epoch": 3.0482928329744694, "grad_norm": 0.263671875, "learning_rate": 3.486111111111111e-05, "loss": 0.0379, "step": 4955 }, { "epoch": 3.0513688095970473, "grad_norm": 0.2412109375, "learning_rate": 3.484567901234568e-05, "loss": 0.037, "step": 4960 }, { "epoch": 3.0544447862196247, "grad_norm": 0.255859375, "learning_rate": 3.483024691358025e-05, "loss": 0.0353, "step": 4965 }, { "epoch": 3.0575207628422025, "grad_norm": 0.2392578125, "learning_rate": 3.481481481481482e-05, "loss": 0.0375, "step": 4970 }, { "epoch": 3.06059673946478, "grad_norm": 0.279296875, "learning_rate": 3.479938271604939e-05, "loss": 0.0412, "step": 4975 }, { "epoch": 3.063672716087358, "grad_norm": 0.2421875, "learning_rate": 3.478395061728395e-05, "loss": 0.0412, "step": 4980 }, { "epoch": 3.066748692709935, "grad_norm": 0.275390625, "learning_rate": 3.4768518518518524e-05, "loss": 0.0344, "step": 4985 }, { "epoch": 3.069824669332513, "grad_norm": 0.2431640625, "learning_rate": 3.475308641975309e-05, "loss": 0.0363, "step": 4990 }, { "epoch": 3.072900645955091, "grad_norm": 0.2421875, "learning_rate": 3.4737654320987654e-05, "loss": 0.0392, "step": 4995 }, { "epoch": 3.0759766225776684, "grad_norm": 0.275390625, "learning_rate": 3.472222222222222e-05, "loss": 0.0361, "step": 5000 }, { "epoch": 3.079052599200246, "grad_norm": 0.244140625, "learning_rate": 3.470679012345679e-05, "loss": 0.0369, "step": 5005 }, { "epoch": 3.0821285758228236, "grad_norm": 0.2216796875, "learning_rate": 3.469135802469136e-05, "loss": 0.0339, "step": 5010 }, { "epoch": 3.0852045524454015, "grad_norm": 0.2255859375, "learning_rate": 3.467592592592593e-05, "loss": 0.0321, "step": 5015 }, { "epoch": 3.088280529067979, "grad_norm": 0.25, "learning_rate": 3.4660493827160496e-05, "loss": 0.0373, "step": 5020 }, { "epoch": 3.0913565056905568, "grad_norm": 0.2265625, "learning_rate": 3.4645061728395064e-05, "loss": 0.0372, "step": 5025 }, { "epoch": 3.0944324823131346, "grad_norm": 0.2451171875, "learning_rate": 3.4629629629629626e-05, "loss": 0.0334, "step": 5030 }, { "epoch": 3.097508458935712, "grad_norm": 0.21875, "learning_rate": 3.46141975308642e-05, "loss": 0.0325, "step": 5035 }, { "epoch": 3.10058443555829, "grad_norm": 0.228515625, "learning_rate": 3.459876543209877e-05, "loss": 0.0335, "step": 5040 }, { "epoch": 3.1036604121808673, "grad_norm": 0.2451171875, "learning_rate": 3.458333333333333e-05, "loss": 0.0323, "step": 5045 }, { "epoch": 3.106736388803445, "grad_norm": 0.25390625, "learning_rate": 3.45679012345679e-05, "loss": 0.0359, "step": 5050 }, { "epoch": 3.1098123654260226, "grad_norm": 0.2353515625, "learning_rate": 3.4552469135802475e-05, "loss": 0.0365, "step": 5055 }, { "epoch": 3.1128883420486004, "grad_norm": 0.28515625, "learning_rate": 3.453703703703704e-05, "loss": 0.0361, "step": 5060 }, { "epoch": 3.1159643186711783, "grad_norm": 0.259765625, "learning_rate": 3.4521604938271605e-05, "loss": 0.0362, "step": 5065 }, { "epoch": 3.1190402952937557, "grad_norm": 0.228515625, "learning_rate": 3.4506172839506174e-05, "loss": 0.0429, "step": 5070 }, { "epoch": 3.1221162719163336, "grad_norm": 0.212890625, "learning_rate": 3.449074074074074e-05, "loss": 0.0351, "step": 5075 }, { "epoch": 3.125192248538911, "grad_norm": 0.275390625, "learning_rate": 3.447530864197531e-05, "loss": 0.0384, "step": 5080 }, { "epoch": 3.128268225161489, "grad_norm": 0.23828125, "learning_rate": 3.445987654320988e-05, "loss": 0.0361, "step": 5085 }, { "epoch": 3.1313442017840663, "grad_norm": 0.24609375, "learning_rate": 3.444444444444445e-05, "loss": 0.0369, "step": 5090 }, { "epoch": 3.134420178406644, "grad_norm": 0.25390625, "learning_rate": 3.4429012345679016e-05, "loss": 0.0346, "step": 5095 }, { "epoch": 3.137496155029222, "grad_norm": 0.28515625, "learning_rate": 3.441358024691358e-05, "loss": 0.0434, "step": 5100 }, { "epoch": 3.1405721316517994, "grad_norm": 0.23046875, "learning_rate": 3.439814814814815e-05, "loss": 0.0327, "step": 5105 }, { "epoch": 3.1436481082743772, "grad_norm": 0.24609375, "learning_rate": 3.438271604938272e-05, "loss": 0.0367, "step": 5110 }, { "epoch": 3.1467240848969547, "grad_norm": 0.2177734375, "learning_rate": 3.436728395061728e-05, "loss": 0.0376, "step": 5115 }, { "epoch": 3.1498000615195325, "grad_norm": 0.251953125, "learning_rate": 3.435185185185185e-05, "loss": 0.0359, "step": 5120 }, { "epoch": 3.15287603814211, "grad_norm": 0.48828125, "learning_rate": 3.4336419753086427e-05, "loss": 0.0379, "step": 5125 }, { "epoch": 3.155952014764688, "grad_norm": 0.2236328125, "learning_rate": 3.432098765432099e-05, "loss": 0.0368, "step": 5130 }, { "epoch": 3.1590279913872656, "grad_norm": 0.251953125, "learning_rate": 3.430555555555556e-05, "loss": 0.0398, "step": 5135 }, { "epoch": 3.162103968009843, "grad_norm": 0.2734375, "learning_rate": 3.4290123456790125e-05, "loss": 0.0401, "step": 5140 }, { "epoch": 3.165179944632421, "grad_norm": 0.240234375, "learning_rate": 3.4274691358024694e-05, "loss": 0.0356, "step": 5145 }, { "epoch": 3.1682559212549983, "grad_norm": 0.265625, "learning_rate": 3.425925925925926e-05, "loss": 0.0383, "step": 5150 }, { "epoch": 3.171331897877576, "grad_norm": 0.2197265625, "learning_rate": 3.424382716049383e-05, "loss": 0.0343, "step": 5155 }, { "epoch": 3.1744078745001536, "grad_norm": 0.2490234375, "learning_rate": 3.42283950617284e-05, "loss": 0.0356, "step": 5160 }, { "epoch": 3.1774838511227315, "grad_norm": 0.2373046875, "learning_rate": 3.421296296296296e-05, "loss": 0.0403, "step": 5165 }, { "epoch": 3.1805598277453093, "grad_norm": 0.2158203125, "learning_rate": 3.419753086419753e-05, "loss": 0.0354, "step": 5170 }, { "epoch": 3.1836358043678867, "grad_norm": 0.263671875, "learning_rate": 3.4182098765432104e-05, "loss": 0.0408, "step": 5175 }, { "epoch": 3.1867117809904646, "grad_norm": 0.2490234375, "learning_rate": 3.4166666666666666e-05, "loss": 0.0346, "step": 5180 }, { "epoch": 3.189787757613042, "grad_norm": 0.2470703125, "learning_rate": 3.4151234567901234e-05, "loss": 0.0358, "step": 5185 }, { "epoch": 3.19286373423562, "grad_norm": 0.2353515625, "learning_rate": 3.41358024691358e-05, "loss": 0.036, "step": 5190 }, { "epoch": 3.1959397108581973, "grad_norm": 0.263671875, "learning_rate": 3.412037037037038e-05, "loss": 0.0352, "step": 5195 }, { "epoch": 3.199015687480775, "grad_norm": 0.251953125, "learning_rate": 3.410493827160494e-05, "loss": 0.0358, "step": 5200 }, { "epoch": 3.202091664103353, "grad_norm": 0.2490234375, "learning_rate": 3.408950617283951e-05, "loss": 0.0376, "step": 5205 }, { "epoch": 3.2051676407259304, "grad_norm": 0.2490234375, "learning_rate": 3.4074074074074077e-05, "loss": 0.035, "step": 5210 }, { "epoch": 3.2082436173485083, "grad_norm": 0.228515625, "learning_rate": 3.405864197530864e-05, "loss": 0.0365, "step": 5215 }, { "epoch": 3.2113195939710857, "grad_norm": 0.236328125, "learning_rate": 3.4043209876543214e-05, "loss": 0.0368, "step": 5220 }, { "epoch": 3.2143955705936635, "grad_norm": 0.24609375, "learning_rate": 3.402777777777778e-05, "loss": 0.0368, "step": 5225 }, { "epoch": 3.217471547216241, "grad_norm": 0.2216796875, "learning_rate": 3.401234567901235e-05, "loss": 0.0347, "step": 5230 }, { "epoch": 3.220547523838819, "grad_norm": 0.2421875, "learning_rate": 3.399691358024691e-05, "loss": 0.0345, "step": 5235 }, { "epoch": 3.2236235004613967, "grad_norm": 0.255859375, "learning_rate": 3.398148148148148e-05, "loss": 0.0379, "step": 5240 }, { "epoch": 3.226699477083974, "grad_norm": 0.228515625, "learning_rate": 3.3966049382716056e-05, "loss": 0.032, "step": 5245 }, { "epoch": 3.229775453706552, "grad_norm": 0.2373046875, "learning_rate": 3.395061728395062e-05, "loss": 0.0387, "step": 5250 }, { "epoch": 3.2328514303291294, "grad_norm": 0.265625, "learning_rate": 3.3935185185185186e-05, "loss": 0.0346, "step": 5255 }, { "epoch": 3.235927406951707, "grad_norm": 0.275390625, "learning_rate": 3.3919753086419754e-05, "loss": 0.0396, "step": 5260 }, { "epoch": 3.2390033835742846, "grad_norm": 0.2470703125, "learning_rate": 3.390432098765432e-05, "loss": 0.0393, "step": 5265 }, { "epoch": 3.2420793601968625, "grad_norm": 0.25390625, "learning_rate": 3.388888888888889e-05, "loss": 0.0381, "step": 5270 }, { "epoch": 3.2451553368194404, "grad_norm": 0.23046875, "learning_rate": 3.387345679012346e-05, "loss": 0.0378, "step": 5275 }, { "epoch": 3.2482313134420178, "grad_norm": 0.267578125, "learning_rate": 3.385802469135803e-05, "loss": 0.0348, "step": 5280 }, { "epoch": 3.2513072900645956, "grad_norm": 0.23046875, "learning_rate": 3.384259259259259e-05, "loss": 0.0372, "step": 5285 }, { "epoch": 3.254383266687173, "grad_norm": 0.220703125, "learning_rate": 3.3827160493827165e-05, "loss": 0.0353, "step": 5290 }, { "epoch": 3.257459243309751, "grad_norm": 0.265625, "learning_rate": 3.3811728395061733e-05, "loss": 0.0401, "step": 5295 }, { "epoch": 3.2605352199323283, "grad_norm": 0.259765625, "learning_rate": 3.3796296296296295e-05, "loss": 0.0332, "step": 5300 }, { "epoch": 3.263611196554906, "grad_norm": 0.26953125, "learning_rate": 3.3780864197530864e-05, "loss": 0.0372, "step": 5305 }, { "epoch": 3.266687173177484, "grad_norm": 0.26171875, "learning_rate": 3.376543209876543e-05, "loss": 0.0371, "step": 5310 }, { "epoch": 3.2697631498000614, "grad_norm": 0.296875, "learning_rate": 3.375000000000001e-05, "loss": 0.0394, "step": 5315 }, { "epoch": 3.2728391264226393, "grad_norm": 0.24609375, "learning_rate": 3.373456790123457e-05, "loss": 0.0384, "step": 5320 }, { "epoch": 3.2759151030452167, "grad_norm": 0.26953125, "learning_rate": 3.371913580246914e-05, "loss": 0.0336, "step": 5325 }, { "epoch": 3.2789910796677946, "grad_norm": 0.2373046875, "learning_rate": 3.3703703703703706e-05, "loss": 0.0368, "step": 5330 }, { "epoch": 3.282067056290372, "grad_norm": 0.2451171875, "learning_rate": 3.368827160493827e-05, "loss": 0.0363, "step": 5335 }, { "epoch": 3.28514303291295, "grad_norm": 0.25, "learning_rate": 3.367283950617284e-05, "loss": 0.0352, "step": 5340 }, { "epoch": 3.2882190095355277, "grad_norm": 0.203125, "learning_rate": 3.365740740740741e-05, "loss": 0.0378, "step": 5345 }, { "epoch": 3.291294986158105, "grad_norm": 0.2412109375, "learning_rate": 3.364197530864198e-05, "loss": 0.0343, "step": 5350 }, { "epoch": 3.294370962780683, "grad_norm": 0.236328125, "learning_rate": 3.362654320987654e-05, "loss": 0.0318, "step": 5355 }, { "epoch": 3.2974469394032604, "grad_norm": 0.2236328125, "learning_rate": 3.3611111111111116e-05, "loss": 0.038, "step": 5360 }, { "epoch": 3.3005229160258382, "grad_norm": 0.2265625, "learning_rate": 3.3595679012345685e-05, "loss": 0.0366, "step": 5365 }, { "epoch": 3.3035988926484157, "grad_norm": 0.23046875, "learning_rate": 3.3580246913580247e-05, "loss": 0.0355, "step": 5370 }, { "epoch": 3.3066748692709935, "grad_norm": 0.228515625, "learning_rate": 3.3564814814814815e-05, "loss": 0.0372, "step": 5375 }, { "epoch": 3.3097508458935714, "grad_norm": 0.26953125, "learning_rate": 3.3549382716049383e-05, "loss": 0.0387, "step": 5380 }, { "epoch": 3.312826822516149, "grad_norm": 0.251953125, "learning_rate": 3.353395061728395e-05, "loss": 0.0399, "step": 5385 }, { "epoch": 3.3159027991387267, "grad_norm": 0.259765625, "learning_rate": 3.351851851851852e-05, "loss": 0.0415, "step": 5390 }, { "epoch": 3.318978775761304, "grad_norm": 0.2578125, "learning_rate": 3.350308641975309e-05, "loss": 0.0381, "step": 5395 }, { "epoch": 3.322054752383882, "grad_norm": 0.2451171875, "learning_rate": 3.348765432098766e-05, "loss": 0.0391, "step": 5400 }, { "epoch": 3.3251307290064593, "grad_norm": 0.25390625, "learning_rate": 3.347222222222222e-05, "loss": 0.0361, "step": 5405 }, { "epoch": 3.328206705629037, "grad_norm": 0.2412109375, "learning_rate": 3.3456790123456794e-05, "loss": 0.0351, "step": 5410 }, { "epoch": 3.331282682251615, "grad_norm": 0.2734375, "learning_rate": 3.344135802469136e-05, "loss": 0.0388, "step": 5415 }, { "epoch": 3.3343586588741925, "grad_norm": 0.3046875, "learning_rate": 3.3425925925925924e-05, "loss": 0.0381, "step": 5420 }, { "epoch": 3.3374346354967703, "grad_norm": 0.2119140625, "learning_rate": 3.341049382716049e-05, "loss": 0.0378, "step": 5425 }, { "epoch": 3.3405106121193477, "grad_norm": 0.248046875, "learning_rate": 3.339506172839506e-05, "loss": 0.0385, "step": 5430 }, { "epoch": 3.3435865887419256, "grad_norm": 0.283203125, "learning_rate": 3.337962962962963e-05, "loss": 0.0436, "step": 5435 }, { "epoch": 3.346662565364503, "grad_norm": 0.234375, "learning_rate": 3.33641975308642e-05, "loss": 0.0357, "step": 5440 }, { "epoch": 3.349738541987081, "grad_norm": 0.2578125, "learning_rate": 3.3348765432098766e-05, "loss": 0.0391, "step": 5445 }, { "epoch": 3.3528145186096587, "grad_norm": 0.265625, "learning_rate": 3.3333333333333335e-05, "loss": 0.0397, "step": 5450 }, { "epoch": 3.355890495232236, "grad_norm": 0.271484375, "learning_rate": 3.33179012345679e-05, "loss": 0.0384, "step": 5455 }, { "epoch": 3.358966471854814, "grad_norm": 0.25, "learning_rate": 3.330246913580247e-05, "loss": 0.038, "step": 5460 }, { "epoch": 3.3620424484773914, "grad_norm": 0.26171875, "learning_rate": 3.328703703703704e-05, "loss": 0.0403, "step": 5465 }, { "epoch": 3.3651184250999693, "grad_norm": 0.275390625, "learning_rate": 3.327160493827161e-05, "loss": 0.0373, "step": 5470 }, { "epoch": 3.3681944017225467, "grad_norm": 0.244140625, "learning_rate": 3.325617283950617e-05, "loss": 0.0359, "step": 5475 }, { "epoch": 3.3712703783451246, "grad_norm": 0.27734375, "learning_rate": 3.3240740740740746e-05, "loss": 0.0377, "step": 5480 }, { "epoch": 3.3743463549677024, "grad_norm": 0.259765625, "learning_rate": 3.3225308641975314e-05, "loss": 0.0382, "step": 5485 }, { "epoch": 3.37742233159028, "grad_norm": 0.259765625, "learning_rate": 3.3209876543209876e-05, "loss": 0.0404, "step": 5490 }, { "epoch": 3.3804983082128577, "grad_norm": 0.259765625, "learning_rate": 3.3194444444444444e-05, "loss": 0.0395, "step": 5495 }, { "epoch": 3.383574284835435, "grad_norm": 0.271484375, "learning_rate": 3.317901234567901e-05, "loss": 0.0388, "step": 5500 }, { "epoch": 3.386650261458013, "grad_norm": 0.2197265625, "learning_rate": 3.316358024691358e-05, "loss": 0.0371, "step": 5505 }, { "epoch": 3.3897262380805904, "grad_norm": 0.25, "learning_rate": 3.314814814814815e-05, "loss": 0.0366, "step": 5510 }, { "epoch": 3.3928022147031682, "grad_norm": 0.263671875, "learning_rate": 3.313271604938272e-05, "loss": 0.0365, "step": 5515 }, { "epoch": 3.395878191325746, "grad_norm": 0.2314453125, "learning_rate": 3.3117283950617286e-05, "loss": 0.0416, "step": 5520 }, { "epoch": 3.3989541679483235, "grad_norm": 0.28125, "learning_rate": 3.3101851851851855e-05, "loss": 0.0376, "step": 5525 }, { "epoch": 3.4020301445709014, "grad_norm": 0.224609375, "learning_rate": 3.308641975308642e-05, "loss": 0.0357, "step": 5530 }, { "epoch": 3.4051061211934788, "grad_norm": 0.2734375, "learning_rate": 3.307098765432099e-05, "loss": 0.0404, "step": 5535 }, { "epoch": 3.4081820978160566, "grad_norm": 0.2197265625, "learning_rate": 3.3055555555555553e-05, "loss": 0.0351, "step": 5540 }, { "epoch": 3.411258074438634, "grad_norm": 0.25390625, "learning_rate": 3.304012345679012e-05, "loss": 0.0379, "step": 5545 }, { "epoch": 3.414334051061212, "grad_norm": 0.287109375, "learning_rate": 3.30246913580247e-05, "loss": 0.0409, "step": 5550 }, { "epoch": 3.4174100276837898, "grad_norm": 0.26171875, "learning_rate": 3.300925925925926e-05, "loss": 0.0383, "step": 5555 }, { "epoch": 3.420486004306367, "grad_norm": 0.255859375, "learning_rate": 3.299382716049383e-05, "loss": 0.0405, "step": 5560 }, { "epoch": 3.423561980928945, "grad_norm": 0.23828125, "learning_rate": 3.2978395061728396e-05, "loss": 0.0371, "step": 5565 }, { "epoch": 3.4266379575515225, "grad_norm": 0.234375, "learning_rate": 3.2962962962962964e-05, "loss": 0.035, "step": 5570 }, { "epoch": 3.4297139341741003, "grad_norm": 0.2275390625, "learning_rate": 3.294753086419753e-05, "loss": 0.038, "step": 5575 }, { "epoch": 3.4327899107966777, "grad_norm": 0.26953125, "learning_rate": 3.29320987654321e-05, "loss": 0.0392, "step": 5580 }, { "epoch": 3.4358658874192556, "grad_norm": 0.2333984375, "learning_rate": 3.291666666666667e-05, "loss": 0.0376, "step": 5585 }, { "epoch": 3.4389418640418334, "grad_norm": 0.271484375, "learning_rate": 3.290123456790124e-05, "loss": 0.0405, "step": 5590 }, { "epoch": 3.442017840664411, "grad_norm": 0.251953125, "learning_rate": 3.28858024691358e-05, "loss": 0.0376, "step": 5595 }, { "epoch": 3.4450938172869887, "grad_norm": 0.267578125, "learning_rate": 3.2870370370370375e-05, "loss": 0.0385, "step": 5600 }, { "epoch": 3.448169793909566, "grad_norm": 0.232421875, "learning_rate": 3.285493827160494e-05, "loss": 0.0387, "step": 5605 }, { "epoch": 3.451245770532144, "grad_norm": 0.279296875, "learning_rate": 3.2839506172839505e-05, "loss": 0.0378, "step": 5610 }, { "epoch": 3.4543217471547214, "grad_norm": 0.291015625, "learning_rate": 3.282407407407407e-05, "loss": 0.0382, "step": 5615 }, { "epoch": 3.4573977237772993, "grad_norm": 0.2734375, "learning_rate": 3.280864197530865e-05, "loss": 0.0412, "step": 5620 }, { "epoch": 3.460473700399877, "grad_norm": 0.27734375, "learning_rate": 3.279320987654321e-05, "loss": 0.0367, "step": 5625 }, { "epoch": 3.4635496770224545, "grad_norm": 0.259765625, "learning_rate": 3.277777777777778e-05, "loss": 0.0389, "step": 5630 }, { "epoch": 3.4666256536450324, "grad_norm": 0.2421875, "learning_rate": 3.276234567901235e-05, "loss": 0.0352, "step": 5635 }, { "epoch": 3.46970163026761, "grad_norm": 0.26171875, "learning_rate": 3.2746913580246916e-05, "loss": 0.0384, "step": 5640 }, { "epoch": 3.4727776068901877, "grad_norm": 0.2333984375, "learning_rate": 3.2731481481481484e-05, "loss": 0.0385, "step": 5645 }, { "epoch": 3.475853583512765, "grad_norm": 0.271484375, "learning_rate": 3.271604938271605e-05, "loss": 0.0388, "step": 5650 }, { "epoch": 3.478929560135343, "grad_norm": 0.2353515625, "learning_rate": 3.270061728395062e-05, "loss": 0.0384, "step": 5655 }, { "epoch": 3.482005536757921, "grad_norm": 0.24609375, "learning_rate": 3.268518518518518e-05, "loss": 0.038, "step": 5660 }, { "epoch": 3.485081513380498, "grad_norm": 0.23828125, "learning_rate": 3.266975308641975e-05, "loss": 0.0373, "step": 5665 }, { "epoch": 3.488157490003076, "grad_norm": 0.283203125, "learning_rate": 3.2654320987654326e-05, "loss": 0.0367, "step": 5670 }, { "epoch": 3.4912334666256535, "grad_norm": 0.251953125, "learning_rate": 3.263888888888889e-05, "loss": 0.042, "step": 5675 }, { "epoch": 3.4943094432482313, "grad_norm": 0.28125, "learning_rate": 3.2623456790123456e-05, "loss": 0.0401, "step": 5680 }, { "epoch": 3.4973854198708088, "grad_norm": 0.251953125, "learning_rate": 3.2608024691358025e-05, "loss": 0.0347, "step": 5685 }, { "epoch": 3.5004613964933866, "grad_norm": 0.2353515625, "learning_rate": 3.25925925925926e-05, "loss": 0.0364, "step": 5690 }, { "epoch": 3.5035373731159645, "grad_norm": 0.28125, "learning_rate": 3.257716049382716e-05, "loss": 0.0412, "step": 5695 }, { "epoch": 3.506613349738542, "grad_norm": 0.2158203125, "learning_rate": 3.256172839506173e-05, "loss": 0.0371, "step": 5700 }, { "epoch": 3.5096893263611197, "grad_norm": 0.25, "learning_rate": 3.25462962962963e-05, "loss": 0.0359, "step": 5705 }, { "epoch": 3.512765302983697, "grad_norm": 0.25390625, "learning_rate": 3.253086419753087e-05, "loss": 0.0425, "step": 5710 }, { "epoch": 3.515841279606275, "grad_norm": 0.2353515625, "learning_rate": 3.2515432098765435e-05, "loss": 0.0353, "step": 5715 }, { "epoch": 3.5189172562288524, "grad_norm": 0.251953125, "learning_rate": 3.2500000000000004e-05, "loss": 0.0354, "step": 5720 }, { "epoch": 3.5219932328514303, "grad_norm": 0.265625, "learning_rate": 3.248456790123457e-05, "loss": 0.0384, "step": 5725 }, { "epoch": 3.525069209474008, "grad_norm": 0.271484375, "learning_rate": 3.2469135802469134e-05, "loss": 0.0408, "step": 5730 }, { "epoch": 3.5281451860965856, "grad_norm": 0.263671875, "learning_rate": 3.24537037037037e-05, "loss": 0.0395, "step": 5735 }, { "epoch": 3.5312211627191634, "grad_norm": 0.2314453125, "learning_rate": 3.243827160493828e-05, "loss": 0.0402, "step": 5740 }, { "epoch": 3.5342971393417413, "grad_norm": 0.302734375, "learning_rate": 3.242283950617284e-05, "loss": 0.0368, "step": 5745 }, { "epoch": 3.5373731159643187, "grad_norm": 0.291015625, "learning_rate": 3.240740740740741e-05, "loss": 0.0395, "step": 5750 }, { "epoch": 3.540449092586896, "grad_norm": 0.25390625, "learning_rate": 3.2391975308641976e-05, "loss": 0.038, "step": 5755 }, { "epoch": 3.543525069209474, "grad_norm": 0.2294921875, "learning_rate": 3.2376543209876545e-05, "loss": 0.0381, "step": 5760 }, { "epoch": 3.546601045832052, "grad_norm": 0.212890625, "learning_rate": 3.236111111111111e-05, "loss": 0.0333, "step": 5765 }, { "epoch": 3.5496770224546292, "grad_norm": 0.25, "learning_rate": 3.234567901234568e-05, "loss": 0.0359, "step": 5770 }, { "epoch": 3.552752999077207, "grad_norm": 0.232421875, "learning_rate": 3.233024691358025e-05, "loss": 0.0403, "step": 5775 }, { "epoch": 3.555828975699785, "grad_norm": 0.25390625, "learning_rate": 3.231481481481481e-05, "loss": 0.0328, "step": 5780 }, { "epoch": 3.5589049523223624, "grad_norm": 0.248046875, "learning_rate": 3.229938271604939e-05, "loss": 0.0412, "step": 5785 }, { "epoch": 3.56198092894494, "grad_norm": 0.2890625, "learning_rate": 3.2283950617283955e-05, "loss": 0.0389, "step": 5790 }, { "epoch": 3.5650569055675176, "grad_norm": 0.28515625, "learning_rate": 3.226851851851852e-05, "loss": 0.0403, "step": 5795 }, { "epoch": 3.5681328821900955, "grad_norm": 0.26953125, "learning_rate": 3.2253086419753086e-05, "loss": 0.0387, "step": 5800 }, { "epoch": 3.571208858812673, "grad_norm": 0.2275390625, "learning_rate": 3.2237654320987654e-05, "loss": 0.0325, "step": 5805 }, { "epoch": 3.5742848354352508, "grad_norm": 0.2412109375, "learning_rate": 3.222222222222223e-05, "loss": 0.0363, "step": 5810 }, { "epoch": 3.5773608120578286, "grad_norm": 0.3046875, "learning_rate": 3.220679012345679e-05, "loss": 0.0408, "step": 5815 }, { "epoch": 3.580436788680406, "grad_norm": 0.2421875, "learning_rate": 3.219135802469136e-05, "loss": 0.0357, "step": 5820 }, { "epoch": 3.5835127653029835, "grad_norm": 0.271484375, "learning_rate": 3.217592592592593e-05, "loss": 0.0434, "step": 5825 }, { "epoch": 3.5865887419255613, "grad_norm": 0.283203125, "learning_rate": 3.216049382716049e-05, "loss": 0.0384, "step": 5830 }, { "epoch": 3.589664718548139, "grad_norm": 0.2333984375, "learning_rate": 3.2145061728395065e-05, "loss": 0.0386, "step": 5835 }, { "epoch": 3.5927406951707166, "grad_norm": 0.26953125, "learning_rate": 3.212962962962963e-05, "loss": 0.0394, "step": 5840 }, { "epoch": 3.5958166717932945, "grad_norm": 0.232421875, "learning_rate": 3.21141975308642e-05, "loss": 0.0391, "step": 5845 }, { "epoch": 3.5988926484158723, "grad_norm": 0.24609375, "learning_rate": 3.209876543209876e-05, "loss": 0.0371, "step": 5850 }, { "epoch": 3.6019686250384497, "grad_norm": 0.228515625, "learning_rate": 3.208333333333334e-05, "loss": 0.0386, "step": 5855 }, { "epoch": 3.605044601661027, "grad_norm": 0.255859375, "learning_rate": 3.206790123456791e-05, "loss": 0.0367, "step": 5860 }, { "epoch": 3.608120578283605, "grad_norm": 0.259765625, "learning_rate": 3.205246913580247e-05, "loss": 0.0368, "step": 5865 }, { "epoch": 3.611196554906183, "grad_norm": 0.216796875, "learning_rate": 3.203703703703704e-05, "loss": 0.0356, "step": 5870 }, { "epoch": 3.6142725315287603, "grad_norm": 0.2451171875, "learning_rate": 3.2021604938271605e-05, "loss": 0.0419, "step": 5875 }, { "epoch": 3.617348508151338, "grad_norm": 0.2275390625, "learning_rate": 3.2006172839506174e-05, "loss": 0.0421, "step": 5880 }, { "epoch": 3.620424484773916, "grad_norm": 0.244140625, "learning_rate": 3.199074074074074e-05, "loss": 0.0404, "step": 5885 }, { "epoch": 3.6235004613964934, "grad_norm": 0.224609375, "learning_rate": 3.197530864197531e-05, "loss": 0.0385, "step": 5890 }, { "epoch": 3.626576438019071, "grad_norm": 0.287109375, "learning_rate": 3.195987654320988e-05, "loss": 0.0393, "step": 5895 }, { "epoch": 3.6296524146416487, "grad_norm": 0.2421875, "learning_rate": 3.194444444444444e-05, "loss": 0.0343, "step": 5900 }, { "epoch": 3.6327283912642265, "grad_norm": 0.2353515625, "learning_rate": 3.1929012345679016e-05, "loss": 0.0377, "step": 5905 }, { "epoch": 3.635804367886804, "grad_norm": 0.2451171875, "learning_rate": 3.1913580246913585e-05, "loss": 0.034, "step": 5910 }, { "epoch": 3.638880344509382, "grad_norm": 0.2373046875, "learning_rate": 3.1898148148148146e-05, "loss": 0.0395, "step": 5915 }, { "epoch": 3.6419563211319597, "grad_norm": 0.3359375, "learning_rate": 3.1882716049382715e-05, "loss": 0.0406, "step": 5920 }, { "epoch": 3.645032297754537, "grad_norm": 0.2392578125, "learning_rate": 3.186728395061729e-05, "loss": 0.0388, "step": 5925 }, { "epoch": 3.6481082743771145, "grad_norm": 0.251953125, "learning_rate": 3.185185185185185e-05, "loss": 0.0367, "step": 5930 }, { "epoch": 3.6511842509996923, "grad_norm": 0.2578125, "learning_rate": 3.183641975308642e-05, "loss": 0.0368, "step": 5935 }, { "epoch": 3.65426022762227, "grad_norm": 0.294921875, "learning_rate": 3.182098765432099e-05, "loss": 0.0376, "step": 5940 }, { "epoch": 3.6573362042448476, "grad_norm": 0.251953125, "learning_rate": 3.180555555555556e-05, "loss": 0.0342, "step": 5945 }, { "epoch": 3.6604121808674255, "grad_norm": 0.259765625, "learning_rate": 3.1790123456790125e-05, "loss": 0.0375, "step": 5950 }, { "epoch": 3.6634881574900033, "grad_norm": 0.26171875, "learning_rate": 3.1774691358024694e-05, "loss": 0.0394, "step": 5955 }, { "epoch": 3.6665641341125808, "grad_norm": 0.2578125, "learning_rate": 3.175925925925926e-05, "loss": 0.0388, "step": 5960 }, { "epoch": 3.669640110735158, "grad_norm": 0.271484375, "learning_rate": 3.174382716049383e-05, "loss": 0.0399, "step": 5965 }, { "epoch": 3.672716087357736, "grad_norm": 0.25, "learning_rate": 3.172839506172839e-05, "loss": 0.0388, "step": 5970 }, { "epoch": 3.675792063980314, "grad_norm": 0.27734375, "learning_rate": 3.171296296296297e-05, "loss": 0.0389, "step": 5975 }, { "epoch": 3.6788680406028913, "grad_norm": 0.2197265625, "learning_rate": 3.1697530864197536e-05, "loss": 0.036, "step": 5980 }, { "epoch": 3.681944017225469, "grad_norm": 0.306640625, "learning_rate": 3.16820987654321e-05, "loss": 0.0402, "step": 5985 }, { "epoch": 3.685019993848047, "grad_norm": 0.259765625, "learning_rate": 3.1666666666666666e-05, "loss": 0.0379, "step": 5990 }, { "epoch": 3.6880959704706244, "grad_norm": 0.2578125, "learning_rate": 3.1651234567901235e-05, "loss": 0.038, "step": 5995 }, { "epoch": 3.691171947093202, "grad_norm": 0.2412109375, "learning_rate": 3.16358024691358e-05, "loss": 0.0371, "step": 6000 }, { "epoch": 3.6942479237157797, "grad_norm": 0.23828125, "learning_rate": 3.162037037037037e-05, "loss": 0.0382, "step": 6005 }, { "epoch": 3.6973239003383576, "grad_norm": 0.23828125, "learning_rate": 3.160493827160494e-05, "loss": 0.0425, "step": 6010 }, { "epoch": 3.700399876960935, "grad_norm": 0.259765625, "learning_rate": 3.158950617283951e-05, "loss": 0.0348, "step": 6015 }, { "epoch": 3.703475853583513, "grad_norm": 0.267578125, "learning_rate": 3.157407407407408e-05, "loss": 0.0381, "step": 6020 }, { "epoch": 3.7065518302060907, "grad_norm": 0.2734375, "learning_rate": 3.1558641975308645e-05, "loss": 0.0414, "step": 6025 }, { "epoch": 3.709627806828668, "grad_norm": 0.25390625, "learning_rate": 3.1543209876543214e-05, "loss": 0.0377, "step": 6030 }, { "epoch": 3.7127037834512455, "grad_norm": 0.205078125, "learning_rate": 3.1527777777777775e-05, "loss": 0.0386, "step": 6035 }, { "epoch": 3.7157797600738234, "grad_norm": 0.2734375, "learning_rate": 3.1512345679012344e-05, "loss": 0.0415, "step": 6040 }, { "epoch": 3.7188557366964012, "grad_norm": 0.259765625, "learning_rate": 3.149691358024692e-05, "loss": 0.0407, "step": 6045 }, { "epoch": 3.7219317133189787, "grad_norm": 0.22265625, "learning_rate": 3.148148148148148e-05, "loss": 0.0366, "step": 6050 }, { "epoch": 3.7250076899415565, "grad_norm": 0.25, "learning_rate": 3.146604938271605e-05, "loss": 0.0356, "step": 6055 }, { "epoch": 3.7280836665641344, "grad_norm": 0.244140625, "learning_rate": 3.145061728395062e-05, "loss": 0.0377, "step": 6060 }, { "epoch": 3.731159643186712, "grad_norm": 0.234375, "learning_rate": 3.1435185185185186e-05, "loss": 0.041, "step": 6065 }, { "epoch": 3.734235619809289, "grad_norm": 0.2490234375, "learning_rate": 3.1419753086419755e-05, "loss": 0.0339, "step": 6070 }, { "epoch": 3.737311596431867, "grad_norm": 0.29296875, "learning_rate": 3.140432098765432e-05, "loss": 0.0354, "step": 6075 }, { "epoch": 3.740387573054445, "grad_norm": 0.267578125, "learning_rate": 3.138888888888889e-05, "loss": 0.0393, "step": 6080 }, { "epoch": 3.7434635496770223, "grad_norm": 0.234375, "learning_rate": 3.137345679012346e-05, "loss": 0.037, "step": 6085 }, { "epoch": 3.7465395262996, "grad_norm": 0.220703125, "learning_rate": 3.135802469135803e-05, "loss": 0.0381, "step": 6090 }, { "epoch": 3.749615502922178, "grad_norm": 0.26171875, "learning_rate": 3.13425925925926e-05, "loss": 0.0333, "step": 6095 }, { "epoch": 3.7526914795447555, "grad_norm": 0.24609375, "learning_rate": 3.1327160493827165e-05, "loss": 0.0394, "step": 6100 }, { "epoch": 3.755767456167333, "grad_norm": 0.220703125, "learning_rate": 3.131172839506173e-05, "loss": 0.0423, "step": 6105 }, { "epoch": 3.7588434327899107, "grad_norm": 0.279296875, "learning_rate": 3.1296296296296295e-05, "loss": 0.0397, "step": 6110 }, { "epoch": 3.7619194094124886, "grad_norm": 0.255859375, "learning_rate": 3.128086419753087e-05, "loss": 0.0363, "step": 6115 }, { "epoch": 3.764995386035066, "grad_norm": 0.2412109375, "learning_rate": 3.126543209876543e-05, "loss": 0.04, "step": 6120 }, { "epoch": 3.768071362657644, "grad_norm": 0.2333984375, "learning_rate": 3.125e-05, "loss": 0.0394, "step": 6125 }, { "epoch": 3.7711473392802217, "grad_norm": 0.2451171875, "learning_rate": 3.123456790123457e-05, "loss": 0.0379, "step": 6130 }, { "epoch": 3.774223315902799, "grad_norm": 0.2470703125, "learning_rate": 3.121913580246914e-05, "loss": 0.0392, "step": 6135 }, { "epoch": 3.7772992925253766, "grad_norm": 0.283203125, "learning_rate": 3.1203703703703706e-05, "loss": 0.04, "step": 6140 }, { "epoch": 3.7803752691479544, "grad_norm": 0.279296875, "learning_rate": 3.1188271604938274e-05, "loss": 0.0419, "step": 6145 }, { "epoch": 3.7834512457705323, "grad_norm": 0.259765625, "learning_rate": 3.117283950617284e-05, "loss": 0.0371, "step": 6150 }, { "epoch": 3.7865272223931097, "grad_norm": 0.244140625, "learning_rate": 3.1157407407407405e-05, "loss": 0.0362, "step": 6155 }, { "epoch": 3.7896031990156875, "grad_norm": 0.279296875, "learning_rate": 3.114197530864197e-05, "loss": 0.0384, "step": 6160 }, { "epoch": 3.7926791756382654, "grad_norm": 0.263671875, "learning_rate": 3.112654320987655e-05, "loss": 0.0349, "step": 6165 }, { "epoch": 3.795755152260843, "grad_norm": 0.2734375, "learning_rate": 3.111111111111111e-05, "loss": 0.038, "step": 6170 }, { "epoch": 3.7988311288834202, "grad_norm": 0.255859375, "learning_rate": 3.109567901234568e-05, "loss": 0.037, "step": 6175 }, { "epoch": 3.801907105505998, "grad_norm": 0.265625, "learning_rate": 3.108024691358025e-05, "loss": 0.0391, "step": 6180 }, { "epoch": 3.804983082128576, "grad_norm": 0.240234375, "learning_rate": 3.106481481481482e-05, "loss": 0.0379, "step": 6185 }, { "epoch": 3.8080590587511534, "grad_norm": 0.25, "learning_rate": 3.1049382716049384e-05, "loss": 0.0363, "step": 6190 }, { "epoch": 3.811135035373731, "grad_norm": 0.30859375, "learning_rate": 3.103395061728395e-05, "loss": 0.0394, "step": 6195 }, { "epoch": 3.814211011996309, "grad_norm": 0.421875, "learning_rate": 3.101851851851852e-05, "loss": 0.0344, "step": 6200 }, { "epoch": 3.8172869886188865, "grad_norm": 0.267578125, "learning_rate": 3.100308641975309e-05, "loss": 0.0379, "step": 6205 }, { "epoch": 3.820362965241464, "grad_norm": 0.267578125, "learning_rate": 3.098765432098766e-05, "loss": 0.0385, "step": 6210 }, { "epoch": 3.8234389418640418, "grad_norm": 0.25, "learning_rate": 3.0972222222222226e-05, "loss": 0.0372, "step": 6215 }, { "epoch": 3.8265149184866196, "grad_norm": 0.26171875, "learning_rate": 3.0956790123456794e-05, "loss": 0.0352, "step": 6220 }, { "epoch": 3.829590895109197, "grad_norm": 0.244140625, "learning_rate": 3.0941358024691356e-05, "loss": 0.0341, "step": 6225 }, { "epoch": 3.832666871731775, "grad_norm": 0.29296875, "learning_rate": 3.0925925925925924e-05, "loss": 0.0388, "step": 6230 }, { "epoch": 3.8357428483543528, "grad_norm": 0.255859375, "learning_rate": 3.09104938271605e-05, "loss": 0.0401, "step": 6235 }, { "epoch": 3.83881882497693, "grad_norm": 0.2421875, "learning_rate": 3.089506172839506e-05, "loss": 0.0347, "step": 6240 }, { "epoch": 3.8418948015995076, "grad_norm": 0.2734375, "learning_rate": 3.087962962962963e-05, "loss": 0.0436, "step": 6245 }, { "epoch": 3.8449707782220854, "grad_norm": 0.259765625, "learning_rate": 3.08641975308642e-05, "loss": 0.0398, "step": 6250 }, { "epoch": 3.8480467548446633, "grad_norm": 0.2451171875, "learning_rate": 3.084876543209877e-05, "loss": 0.0412, "step": 6255 }, { "epoch": 3.8511227314672407, "grad_norm": 0.2890625, "learning_rate": 3.0833333333333335e-05, "loss": 0.0407, "step": 6260 }, { "epoch": 3.8541987080898186, "grad_norm": 0.2421875, "learning_rate": 3.0817901234567904e-05, "loss": 0.034, "step": 6265 }, { "epoch": 3.8572746847123964, "grad_norm": 0.30078125, "learning_rate": 3.080246913580247e-05, "loss": 0.0421, "step": 6270 }, { "epoch": 3.860350661334974, "grad_norm": 0.279296875, "learning_rate": 3.0787037037037034e-05, "loss": 0.0379, "step": 6275 }, { "epoch": 3.8634266379575513, "grad_norm": 0.224609375, "learning_rate": 3.077160493827161e-05, "loss": 0.0359, "step": 6280 }, { "epoch": 3.866502614580129, "grad_norm": 0.2470703125, "learning_rate": 3.075617283950618e-05, "loss": 0.0382, "step": 6285 }, { "epoch": 3.869578591202707, "grad_norm": 0.271484375, "learning_rate": 3.074074074074074e-05, "loss": 0.0413, "step": 6290 }, { "epoch": 3.8726545678252844, "grad_norm": 0.26953125, "learning_rate": 3.072530864197531e-05, "loss": 0.0406, "step": 6295 }, { "epoch": 3.8757305444478622, "grad_norm": 0.265625, "learning_rate": 3.0709876543209876e-05, "loss": 0.0389, "step": 6300 }, { "epoch": 3.87880652107044, "grad_norm": 0.251953125, "learning_rate": 3.069444444444445e-05, "loss": 0.0382, "step": 6305 }, { "epoch": 3.8818824976930175, "grad_norm": 0.294921875, "learning_rate": 3.067901234567901e-05, "loss": 0.0437, "step": 6310 }, { "epoch": 3.884958474315595, "grad_norm": 0.2734375, "learning_rate": 3.066358024691358e-05, "loss": 0.0362, "step": 6315 }, { "epoch": 3.888034450938173, "grad_norm": 0.2578125, "learning_rate": 3.064814814814815e-05, "loss": 0.0392, "step": 6320 }, { "epoch": 3.8911104275607507, "grad_norm": 0.263671875, "learning_rate": 3.063271604938271e-05, "loss": 0.0412, "step": 6325 }, { "epoch": 3.894186404183328, "grad_norm": 0.2470703125, "learning_rate": 3.061728395061729e-05, "loss": 0.037, "step": 6330 }, { "epoch": 3.897262380805906, "grad_norm": 0.259765625, "learning_rate": 3.0601851851851855e-05, "loss": 0.0411, "step": 6335 }, { "epoch": 3.900338357428484, "grad_norm": 0.234375, "learning_rate": 3.0586419753086424e-05, "loss": 0.0378, "step": 6340 }, { "epoch": 3.903414334051061, "grad_norm": 0.259765625, "learning_rate": 3.0570987654320985e-05, "loss": 0.0378, "step": 6345 }, { "epoch": 3.9064903106736386, "grad_norm": 0.26171875, "learning_rate": 3.055555555555556e-05, "loss": 0.0371, "step": 6350 }, { "epoch": 3.9095662872962165, "grad_norm": 0.2431640625, "learning_rate": 3.054012345679013e-05, "loss": 0.0388, "step": 6355 }, { "epoch": 3.9126422639187943, "grad_norm": 0.2353515625, "learning_rate": 3.052469135802469e-05, "loss": 0.0413, "step": 6360 }, { "epoch": 3.9157182405413717, "grad_norm": 0.27734375, "learning_rate": 3.050925925925926e-05, "loss": 0.0417, "step": 6365 }, { "epoch": 3.9187942171639496, "grad_norm": 0.306640625, "learning_rate": 3.0493827160493827e-05, "loss": 0.0416, "step": 6370 }, { "epoch": 3.9218701937865275, "grad_norm": 0.2470703125, "learning_rate": 3.04783950617284e-05, "loss": 0.0368, "step": 6375 }, { "epoch": 3.924946170409105, "grad_norm": 0.298828125, "learning_rate": 3.0462962962962964e-05, "loss": 0.0428, "step": 6380 }, { "epoch": 3.9280221470316823, "grad_norm": 0.265625, "learning_rate": 3.0447530864197533e-05, "loss": 0.0388, "step": 6385 }, { "epoch": 3.93109812365426, "grad_norm": 0.259765625, "learning_rate": 3.0432098765432098e-05, "loss": 0.0396, "step": 6390 }, { "epoch": 3.934174100276838, "grad_norm": 0.25390625, "learning_rate": 3.0416666666666666e-05, "loss": 0.0366, "step": 6395 }, { "epoch": 3.9372500768994154, "grad_norm": 0.2421875, "learning_rate": 3.0401234567901238e-05, "loss": 0.0397, "step": 6400 }, { "epoch": 3.9403260535219933, "grad_norm": 0.265625, "learning_rate": 3.0385802469135803e-05, "loss": 0.0378, "step": 6405 }, { "epoch": 3.943402030144571, "grad_norm": 0.251953125, "learning_rate": 3.037037037037037e-05, "loss": 0.038, "step": 6410 }, { "epoch": 3.9464780067671485, "grad_norm": 0.2158203125, "learning_rate": 3.0354938271604937e-05, "loss": 0.037, "step": 6415 }, { "epoch": 3.949553983389726, "grad_norm": 0.287109375, "learning_rate": 3.0339506172839512e-05, "loss": 0.0377, "step": 6420 }, { "epoch": 3.952629960012304, "grad_norm": 0.296875, "learning_rate": 3.0324074074074077e-05, "loss": 0.035, "step": 6425 }, { "epoch": 3.9557059366348817, "grad_norm": 0.2373046875, "learning_rate": 3.0308641975308642e-05, "loss": 0.0398, "step": 6430 }, { "epoch": 3.958781913257459, "grad_norm": 0.2578125, "learning_rate": 3.029320987654321e-05, "loss": 0.0352, "step": 6435 }, { "epoch": 3.961857889880037, "grad_norm": 0.283203125, "learning_rate": 3.0277777777777776e-05, "loss": 0.0367, "step": 6440 }, { "epoch": 3.964933866502615, "grad_norm": 0.2470703125, "learning_rate": 3.026234567901235e-05, "loss": 0.0371, "step": 6445 }, { "epoch": 3.9680098431251922, "grad_norm": 0.267578125, "learning_rate": 3.0246913580246916e-05, "loss": 0.0392, "step": 6450 }, { "epoch": 3.9710858197477696, "grad_norm": 0.263671875, "learning_rate": 3.0231481481481484e-05, "loss": 0.0382, "step": 6455 }, { "epoch": 3.9741617963703475, "grad_norm": 0.26953125, "learning_rate": 3.021604938271605e-05, "loss": 0.0408, "step": 6460 }, { "epoch": 3.9772377729929254, "grad_norm": 0.263671875, "learning_rate": 3.0200617283950618e-05, "loss": 0.0371, "step": 6465 }, { "epoch": 3.9803137496155028, "grad_norm": 0.2451171875, "learning_rate": 3.018518518518519e-05, "loss": 0.0357, "step": 6470 }, { "epoch": 3.9833897262380806, "grad_norm": 0.28125, "learning_rate": 3.0169753086419755e-05, "loss": 0.0427, "step": 6475 }, { "epoch": 3.9864657028606585, "grad_norm": 0.2734375, "learning_rate": 3.0154320987654323e-05, "loss": 0.0411, "step": 6480 }, { "epoch": 3.989541679483236, "grad_norm": 0.265625, "learning_rate": 3.0138888888888888e-05, "loss": 0.0384, "step": 6485 }, { "epoch": 3.9926176561058138, "grad_norm": 0.283203125, "learning_rate": 3.012345679012346e-05, "loss": 0.0416, "step": 6490 }, { "epoch": 3.995693632728391, "grad_norm": 0.271484375, "learning_rate": 3.010802469135803e-05, "loss": 0.0393, "step": 6495 }, { "epoch": 3.998769609350969, "grad_norm": 0.2421875, "learning_rate": 3.0092592592592593e-05, "loss": 0.036, "step": 6500 }, { "epoch": 4.0018455859735464, "grad_norm": 0.251953125, "learning_rate": 3.0077160493827162e-05, "loss": 0.0349, "step": 6505 }, { "epoch": 4.004921562596124, "grad_norm": 0.26953125, "learning_rate": 3.0061728395061727e-05, "loss": 0.0301, "step": 6510 }, { "epoch": 4.007997539218702, "grad_norm": 0.271484375, "learning_rate": 3.00462962962963e-05, "loss": 0.0313, "step": 6515 }, { "epoch": 4.01107351584128, "grad_norm": 0.279296875, "learning_rate": 3.0030864197530867e-05, "loss": 0.0306, "step": 6520 }, { "epoch": 4.014149492463857, "grad_norm": 0.244140625, "learning_rate": 3.0015432098765432e-05, "loss": 0.0304, "step": 6525 }, { "epoch": 4.017225469086435, "grad_norm": 0.2177734375, "learning_rate": 3e-05, "loss": 0.0292, "step": 6530 }, { "epoch": 4.020301445709013, "grad_norm": 0.2197265625, "learning_rate": 2.9984567901234566e-05, "loss": 0.029, "step": 6535 }, { "epoch": 4.02337742233159, "grad_norm": 0.279296875, "learning_rate": 2.9969135802469138e-05, "loss": 0.0316, "step": 6540 }, { "epoch": 4.0264533989541675, "grad_norm": 0.23046875, "learning_rate": 2.9953703703703706e-05, "loss": 0.0308, "step": 6545 }, { "epoch": 4.029529375576746, "grad_norm": 0.287109375, "learning_rate": 2.993827160493827e-05, "loss": 0.0321, "step": 6550 }, { "epoch": 4.032605352199323, "grad_norm": 0.251953125, "learning_rate": 2.992283950617284e-05, "loss": 0.0318, "step": 6555 }, { "epoch": 4.035681328821901, "grad_norm": 0.296875, "learning_rate": 2.9907407407407405e-05, "loss": 0.0333, "step": 6560 }, { "epoch": 4.038757305444479, "grad_norm": 0.271484375, "learning_rate": 2.989197530864198e-05, "loss": 0.0307, "step": 6565 }, { "epoch": 4.041833282067056, "grad_norm": 0.263671875, "learning_rate": 2.9876543209876545e-05, "loss": 0.0312, "step": 6570 }, { "epoch": 4.044909258689634, "grad_norm": 0.26171875, "learning_rate": 2.9861111111111113e-05, "loss": 0.0292, "step": 6575 }, { "epoch": 4.047985235312211, "grad_norm": 0.2578125, "learning_rate": 2.984567901234568e-05, "loss": 0.0314, "step": 6580 }, { "epoch": 4.0510612119347895, "grad_norm": 0.2490234375, "learning_rate": 2.983024691358025e-05, "loss": 0.0338, "step": 6585 }, { "epoch": 4.054137188557367, "grad_norm": 0.263671875, "learning_rate": 2.981481481481482e-05, "loss": 0.0292, "step": 6590 }, { "epoch": 4.057213165179944, "grad_norm": 0.326171875, "learning_rate": 2.9799382716049384e-05, "loss": 0.0295, "step": 6595 }, { "epoch": 4.060289141802523, "grad_norm": 0.283203125, "learning_rate": 2.9783950617283952e-05, "loss": 0.033, "step": 6600 }, { "epoch": 4.0633651184251, "grad_norm": 0.26953125, "learning_rate": 2.9768518518518517e-05, "loss": 0.0337, "step": 6605 }, { "epoch": 4.0664410950476775, "grad_norm": 0.255859375, "learning_rate": 2.975308641975309e-05, "loss": 0.0335, "step": 6610 }, { "epoch": 4.069517071670255, "grad_norm": 0.27734375, "learning_rate": 2.9737654320987658e-05, "loss": 0.03, "step": 6615 }, { "epoch": 4.072593048292833, "grad_norm": 0.248046875, "learning_rate": 2.9722222222222223e-05, "loss": 0.0336, "step": 6620 }, { "epoch": 4.075669024915411, "grad_norm": 0.279296875, "learning_rate": 2.970679012345679e-05, "loss": 0.0324, "step": 6625 }, { "epoch": 4.078745001537988, "grad_norm": 0.291015625, "learning_rate": 2.9691358024691356e-05, "loss": 0.037, "step": 6630 }, { "epoch": 4.081820978160566, "grad_norm": 0.26953125, "learning_rate": 2.9675925925925928e-05, "loss": 0.0311, "step": 6635 }, { "epoch": 4.084896954783144, "grad_norm": 0.25390625, "learning_rate": 2.9660493827160496e-05, "loss": 0.0312, "step": 6640 }, { "epoch": 4.087972931405721, "grad_norm": 0.2490234375, "learning_rate": 2.964506172839506e-05, "loss": 0.0306, "step": 6645 }, { "epoch": 4.091048908028299, "grad_norm": 0.263671875, "learning_rate": 2.962962962962963e-05, "loss": 0.0294, "step": 6650 }, { "epoch": 4.094124884650877, "grad_norm": 0.259765625, "learning_rate": 2.9614197530864202e-05, "loss": 0.0284, "step": 6655 }, { "epoch": 4.097200861273454, "grad_norm": 0.28125, "learning_rate": 2.9598765432098767e-05, "loss": 0.0318, "step": 6660 }, { "epoch": 4.100276837896032, "grad_norm": 0.28125, "learning_rate": 2.9583333333333335e-05, "loss": 0.0342, "step": 6665 }, { "epoch": 4.10335281451861, "grad_norm": 0.265625, "learning_rate": 2.95679012345679e-05, "loss": 0.0318, "step": 6670 }, { "epoch": 4.106428791141187, "grad_norm": 0.251953125, "learning_rate": 2.955246913580247e-05, "loss": 0.0311, "step": 6675 }, { "epoch": 4.109504767763765, "grad_norm": 0.259765625, "learning_rate": 2.953703703703704e-05, "loss": 0.035, "step": 6680 }, { "epoch": 4.112580744386342, "grad_norm": 0.2197265625, "learning_rate": 2.952160493827161e-05, "loss": 0.0288, "step": 6685 }, { "epoch": 4.1156567210089205, "grad_norm": 0.26171875, "learning_rate": 2.9506172839506174e-05, "loss": 0.0302, "step": 6690 }, { "epoch": 4.118732697631498, "grad_norm": 0.275390625, "learning_rate": 2.9490740740740743e-05, "loss": 0.0343, "step": 6695 }, { "epoch": 4.121808674254075, "grad_norm": 0.287109375, "learning_rate": 2.9475308641975308e-05, "loss": 0.0344, "step": 6700 }, { "epoch": 4.124884650876654, "grad_norm": 0.244140625, "learning_rate": 2.945987654320988e-05, "loss": 0.031, "step": 6705 }, { "epoch": 4.127960627499231, "grad_norm": 0.310546875, "learning_rate": 2.9444444444444448e-05, "loss": 0.032, "step": 6710 }, { "epoch": 4.1310366041218085, "grad_norm": 0.267578125, "learning_rate": 2.9429012345679013e-05, "loss": 0.0317, "step": 6715 }, { "epoch": 4.134112580744386, "grad_norm": 0.267578125, "learning_rate": 2.941358024691358e-05, "loss": 0.0327, "step": 6720 }, { "epoch": 4.137188557366964, "grad_norm": 0.25390625, "learning_rate": 2.9398148148148146e-05, "loss": 0.0334, "step": 6725 }, { "epoch": 4.140264533989542, "grad_norm": 0.271484375, "learning_rate": 2.9382716049382718e-05, "loss": 0.0314, "step": 6730 }, { "epoch": 4.143340510612119, "grad_norm": 0.255859375, "learning_rate": 2.9367283950617287e-05, "loss": 0.0321, "step": 6735 }, { "epoch": 4.146416487234697, "grad_norm": 0.24609375, "learning_rate": 2.9351851851851852e-05, "loss": 0.0293, "step": 6740 }, { "epoch": 4.149492463857275, "grad_norm": 0.267578125, "learning_rate": 2.933641975308642e-05, "loss": 0.0318, "step": 6745 }, { "epoch": 4.152568440479852, "grad_norm": 0.2734375, "learning_rate": 2.9320987654320992e-05, "loss": 0.0344, "step": 6750 }, { "epoch": 4.15564441710243, "grad_norm": 0.255859375, "learning_rate": 2.9305555555555557e-05, "loss": 0.0312, "step": 6755 }, { "epoch": 4.158720393725008, "grad_norm": 0.24609375, "learning_rate": 2.9290123456790126e-05, "loss": 0.0274, "step": 6760 }, { "epoch": 4.161796370347585, "grad_norm": 0.259765625, "learning_rate": 2.927469135802469e-05, "loss": 0.0326, "step": 6765 }, { "epoch": 4.164872346970163, "grad_norm": 0.283203125, "learning_rate": 2.925925925925926e-05, "loss": 0.0292, "step": 6770 }, { "epoch": 4.167948323592741, "grad_norm": 0.25390625, "learning_rate": 2.924382716049383e-05, "loss": 0.0297, "step": 6775 }, { "epoch": 4.1710243002153184, "grad_norm": 0.27734375, "learning_rate": 2.9228395061728396e-05, "loss": 0.0291, "step": 6780 }, { "epoch": 4.174100276837896, "grad_norm": 0.28125, "learning_rate": 2.9212962962962964e-05, "loss": 0.0309, "step": 6785 }, { "epoch": 4.177176253460473, "grad_norm": 0.23828125, "learning_rate": 2.919753086419753e-05, "loss": 0.0338, "step": 6790 }, { "epoch": 4.180252230083052, "grad_norm": 0.279296875, "learning_rate": 2.9182098765432098e-05, "loss": 0.0322, "step": 6795 }, { "epoch": 4.183328206705629, "grad_norm": 0.251953125, "learning_rate": 2.916666666666667e-05, "loss": 0.0314, "step": 6800 }, { "epoch": 4.186404183328206, "grad_norm": 0.265625, "learning_rate": 2.9151234567901238e-05, "loss": 0.0335, "step": 6805 }, { "epoch": 4.189480159950785, "grad_norm": 0.283203125, "learning_rate": 2.9135802469135803e-05, "loss": 0.033, "step": 6810 }, { "epoch": 4.192556136573362, "grad_norm": 0.267578125, "learning_rate": 2.9120370370370372e-05, "loss": 0.0283, "step": 6815 }, { "epoch": 4.1956321131959395, "grad_norm": 0.26171875, "learning_rate": 2.9104938271604944e-05, "loss": 0.0307, "step": 6820 }, { "epoch": 4.198708089818517, "grad_norm": 0.2734375, "learning_rate": 2.908950617283951e-05, "loss": 0.0318, "step": 6825 }, { "epoch": 4.201784066441095, "grad_norm": 0.271484375, "learning_rate": 2.9074074074074077e-05, "loss": 0.0314, "step": 6830 }, { "epoch": 4.204860043063673, "grad_norm": 0.275390625, "learning_rate": 2.9058641975308642e-05, "loss": 0.0322, "step": 6835 }, { "epoch": 4.20793601968625, "grad_norm": 0.263671875, "learning_rate": 2.904320987654321e-05, "loss": 0.0304, "step": 6840 }, { "epoch": 4.211011996308828, "grad_norm": 0.263671875, "learning_rate": 2.9027777777777782e-05, "loss": 0.032, "step": 6845 }, { "epoch": 4.214087972931406, "grad_norm": 0.2421875, "learning_rate": 2.9012345679012347e-05, "loss": 0.0328, "step": 6850 }, { "epoch": 4.217163949553983, "grad_norm": 0.2734375, "learning_rate": 2.8996913580246916e-05, "loss": 0.0329, "step": 6855 }, { "epoch": 4.220239926176561, "grad_norm": 0.27734375, "learning_rate": 2.898148148148148e-05, "loss": 0.0301, "step": 6860 }, { "epoch": 4.223315902799139, "grad_norm": 0.2578125, "learning_rate": 2.896604938271605e-05, "loss": 0.03, "step": 6865 }, { "epoch": 4.226391879421716, "grad_norm": 0.287109375, "learning_rate": 2.895061728395062e-05, "loss": 0.034, "step": 6870 }, { "epoch": 4.229467856044294, "grad_norm": 0.244140625, "learning_rate": 2.8935185185185186e-05, "loss": 0.0339, "step": 6875 }, { "epoch": 4.232543832666872, "grad_norm": 0.26171875, "learning_rate": 2.8919753086419755e-05, "loss": 0.0316, "step": 6880 }, { "epoch": 4.2356198092894495, "grad_norm": 0.27734375, "learning_rate": 2.890432098765432e-05, "loss": 0.0309, "step": 6885 }, { "epoch": 4.238695785912027, "grad_norm": 0.25390625, "learning_rate": 2.8888888888888888e-05, "loss": 0.0321, "step": 6890 }, { "epoch": 4.241771762534604, "grad_norm": 0.251953125, "learning_rate": 2.887345679012346e-05, "loss": 0.0347, "step": 6895 }, { "epoch": 4.244847739157183, "grad_norm": 0.267578125, "learning_rate": 2.8858024691358025e-05, "loss": 0.0337, "step": 6900 }, { "epoch": 4.24792371577976, "grad_norm": 0.244140625, "learning_rate": 2.8842592592592594e-05, "loss": 0.0317, "step": 6905 }, { "epoch": 4.250999692402337, "grad_norm": 0.3046875, "learning_rate": 2.882716049382716e-05, "loss": 0.0309, "step": 6910 }, { "epoch": 4.254075669024916, "grad_norm": 0.302734375, "learning_rate": 2.8811728395061734e-05, "loss": 0.0358, "step": 6915 }, { "epoch": 4.257151645647493, "grad_norm": 0.3125, "learning_rate": 2.87962962962963e-05, "loss": 0.0346, "step": 6920 }, { "epoch": 4.260227622270071, "grad_norm": 0.255859375, "learning_rate": 2.8780864197530867e-05, "loss": 0.0316, "step": 6925 }, { "epoch": 4.263303598892648, "grad_norm": 0.25, "learning_rate": 2.8765432098765432e-05, "loss": 0.032, "step": 6930 }, { "epoch": 4.266379575515226, "grad_norm": 0.26953125, "learning_rate": 2.8749999999999997e-05, "loss": 0.0308, "step": 6935 }, { "epoch": 4.269455552137804, "grad_norm": 0.2578125, "learning_rate": 2.8734567901234573e-05, "loss": 0.0328, "step": 6940 }, { "epoch": 4.272531528760381, "grad_norm": 0.2890625, "learning_rate": 2.8719135802469138e-05, "loss": 0.0329, "step": 6945 }, { "epoch": 4.275607505382959, "grad_norm": 0.265625, "learning_rate": 2.8703703703703706e-05, "loss": 0.0312, "step": 6950 }, { "epoch": 4.278683482005537, "grad_norm": 0.263671875, "learning_rate": 2.868827160493827e-05, "loss": 0.0305, "step": 6955 }, { "epoch": 4.281759458628114, "grad_norm": 0.2431640625, "learning_rate": 2.867283950617284e-05, "loss": 0.0304, "step": 6960 }, { "epoch": 4.284835435250692, "grad_norm": 0.271484375, "learning_rate": 2.865740740740741e-05, "loss": 0.0303, "step": 6965 }, { "epoch": 4.28791141187327, "grad_norm": 0.265625, "learning_rate": 2.8641975308641977e-05, "loss": 0.0316, "step": 6970 }, { "epoch": 4.290987388495847, "grad_norm": 0.291015625, "learning_rate": 2.8626543209876545e-05, "loss": 0.0313, "step": 6975 }, { "epoch": 4.294063365118425, "grad_norm": 0.265625, "learning_rate": 2.861111111111111e-05, "loss": 0.0313, "step": 6980 }, { "epoch": 4.297139341741003, "grad_norm": 0.2578125, "learning_rate": 2.8595679012345682e-05, "loss": 0.0343, "step": 6985 }, { "epoch": 4.3002153183635805, "grad_norm": 0.267578125, "learning_rate": 2.858024691358025e-05, "loss": 0.0338, "step": 6990 }, { "epoch": 4.303291294986158, "grad_norm": 0.255859375, "learning_rate": 2.8564814814814815e-05, "loss": 0.0307, "step": 6995 }, { "epoch": 4.306367271608735, "grad_norm": 0.2578125, "learning_rate": 2.8549382716049384e-05, "loss": 0.0304, "step": 7000 }, { "epoch": 4.309443248231314, "grad_norm": 0.267578125, "learning_rate": 2.853395061728395e-05, "loss": 0.0293, "step": 7005 }, { "epoch": 4.312519224853891, "grad_norm": 0.294921875, "learning_rate": 2.851851851851852e-05, "loss": 0.0304, "step": 7010 }, { "epoch": 4.3155952014764685, "grad_norm": 0.287109375, "learning_rate": 2.850308641975309e-05, "loss": 0.0314, "step": 7015 }, { "epoch": 4.318671178099047, "grad_norm": 0.2451171875, "learning_rate": 2.8487654320987654e-05, "loss": 0.0342, "step": 7020 }, { "epoch": 4.321747154721624, "grad_norm": 0.33203125, "learning_rate": 2.8472222222222223e-05, "loss": 0.0352, "step": 7025 }, { "epoch": 4.324823131344202, "grad_norm": 0.271484375, "learning_rate": 2.8456790123456788e-05, "loss": 0.0334, "step": 7030 }, { "epoch": 4.327899107966779, "grad_norm": 0.283203125, "learning_rate": 2.8441358024691363e-05, "loss": 0.0333, "step": 7035 }, { "epoch": 4.330975084589357, "grad_norm": 0.2890625, "learning_rate": 2.8425925925925928e-05, "loss": 0.0327, "step": 7040 }, { "epoch": 4.334051061211935, "grad_norm": 0.236328125, "learning_rate": 2.8410493827160493e-05, "loss": 0.0295, "step": 7045 }, { "epoch": 4.337127037834512, "grad_norm": 0.244140625, "learning_rate": 2.839506172839506e-05, "loss": 0.0283, "step": 7050 }, { "epoch": 4.3402030144570904, "grad_norm": 0.2236328125, "learning_rate": 2.8379629629629627e-05, "loss": 0.0314, "step": 7055 }, { "epoch": 4.343278991079668, "grad_norm": 0.263671875, "learning_rate": 2.8364197530864202e-05, "loss": 0.0327, "step": 7060 }, { "epoch": 4.346354967702245, "grad_norm": 0.28515625, "learning_rate": 2.8348765432098767e-05, "loss": 0.0352, "step": 7065 }, { "epoch": 4.349430944324823, "grad_norm": 0.259765625, "learning_rate": 2.8333333333333335e-05, "loss": 0.0307, "step": 7070 }, { "epoch": 4.352506920947401, "grad_norm": 0.255859375, "learning_rate": 2.83179012345679e-05, "loss": 0.0337, "step": 7075 }, { "epoch": 4.355582897569978, "grad_norm": 0.26171875, "learning_rate": 2.8302469135802472e-05, "loss": 0.0309, "step": 7080 }, { "epoch": 4.358658874192556, "grad_norm": 0.259765625, "learning_rate": 2.828703703703704e-05, "loss": 0.0323, "step": 7085 }, { "epoch": 4.361734850815134, "grad_norm": 0.283203125, "learning_rate": 2.8271604938271606e-05, "loss": 0.0307, "step": 7090 }, { "epoch": 4.3648108274377115, "grad_norm": 0.341796875, "learning_rate": 2.8256172839506174e-05, "loss": 0.0347, "step": 7095 }, { "epoch": 4.367886804060289, "grad_norm": 0.298828125, "learning_rate": 2.824074074074074e-05, "loss": 0.0329, "step": 7100 }, { "epoch": 4.370962780682866, "grad_norm": 0.2392578125, "learning_rate": 2.822530864197531e-05, "loss": 0.0307, "step": 7105 }, { "epoch": 4.374038757305445, "grad_norm": 0.255859375, "learning_rate": 2.820987654320988e-05, "loss": 0.0313, "step": 7110 }, { "epoch": 4.377114733928022, "grad_norm": 0.2578125, "learning_rate": 2.8194444444444445e-05, "loss": 0.0333, "step": 7115 }, { "epoch": 4.3801907105505995, "grad_norm": 0.25390625, "learning_rate": 2.8179012345679013e-05, "loss": 0.0321, "step": 7120 }, { "epoch": 4.383266687173178, "grad_norm": 0.244140625, "learning_rate": 2.8163580246913578e-05, "loss": 0.0297, "step": 7125 }, { "epoch": 4.386342663795755, "grad_norm": 0.279296875, "learning_rate": 2.814814814814815e-05, "loss": 0.031, "step": 7130 }, { "epoch": 4.389418640418333, "grad_norm": 0.2421875, "learning_rate": 2.813271604938272e-05, "loss": 0.0304, "step": 7135 }, { "epoch": 4.39249461704091, "grad_norm": 0.28515625, "learning_rate": 2.8117283950617283e-05, "loss": 0.0319, "step": 7140 }, { "epoch": 4.395570593663488, "grad_norm": 0.248046875, "learning_rate": 2.8101851851851852e-05, "loss": 0.0351, "step": 7145 }, { "epoch": 4.398646570286066, "grad_norm": 0.283203125, "learning_rate": 2.8086419753086424e-05, "loss": 0.0312, "step": 7150 }, { "epoch": 4.401722546908643, "grad_norm": 0.31640625, "learning_rate": 2.807098765432099e-05, "loss": 0.0337, "step": 7155 }, { "epoch": 4.4047985235312215, "grad_norm": 0.2431640625, "learning_rate": 2.8055555555555557e-05, "loss": 0.0317, "step": 7160 }, { "epoch": 4.407874500153799, "grad_norm": 0.267578125, "learning_rate": 2.8040123456790122e-05, "loss": 0.0316, "step": 7165 }, { "epoch": 4.410950476776376, "grad_norm": 0.294921875, "learning_rate": 2.802469135802469e-05, "loss": 0.036, "step": 7170 }, { "epoch": 4.414026453398954, "grad_norm": 0.287109375, "learning_rate": 2.8009259259259263e-05, "loss": 0.0328, "step": 7175 }, { "epoch": 4.417102430021532, "grad_norm": 0.296875, "learning_rate": 2.799382716049383e-05, "loss": 0.0328, "step": 7180 }, { "epoch": 4.420178406644109, "grad_norm": 0.2734375, "learning_rate": 2.7978395061728396e-05, "loss": 0.0321, "step": 7185 }, { "epoch": 4.423254383266687, "grad_norm": 0.279296875, "learning_rate": 2.7962962962962965e-05, "loss": 0.0336, "step": 7190 }, { "epoch": 4.426330359889265, "grad_norm": 0.244140625, "learning_rate": 2.794753086419753e-05, "loss": 0.0312, "step": 7195 }, { "epoch": 4.429406336511843, "grad_norm": 0.28515625, "learning_rate": 2.79320987654321e-05, "loss": 0.031, "step": 7200 }, { "epoch": 4.43248231313442, "grad_norm": 0.267578125, "learning_rate": 2.791666666666667e-05, "loss": 0.0328, "step": 7205 }, { "epoch": 4.435558289756997, "grad_norm": 0.306640625, "learning_rate": 2.7901234567901235e-05, "loss": 0.0291, "step": 7210 }, { "epoch": 4.438634266379576, "grad_norm": 0.224609375, "learning_rate": 2.7885802469135803e-05, "loss": 0.0307, "step": 7215 }, { "epoch": 4.441710243002153, "grad_norm": 0.296875, "learning_rate": 2.7870370370370375e-05, "loss": 0.0317, "step": 7220 }, { "epoch": 4.4447862196247305, "grad_norm": 0.259765625, "learning_rate": 2.785493827160494e-05, "loss": 0.0338, "step": 7225 }, { "epoch": 4.447862196247309, "grad_norm": 0.296875, "learning_rate": 2.783950617283951e-05, "loss": 0.0344, "step": 7230 }, { "epoch": 4.450938172869886, "grad_norm": 0.2578125, "learning_rate": 2.7824074074074074e-05, "loss": 0.0284, "step": 7235 }, { "epoch": 4.454014149492464, "grad_norm": 0.2373046875, "learning_rate": 2.7808641975308642e-05, "loss": 0.0316, "step": 7240 }, { "epoch": 4.457090126115041, "grad_norm": 0.3125, "learning_rate": 2.7793209876543214e-05, "loss": 0.0328, "step": 7245 }, { "epoch": 4.460166102737619, "grad_norm": 0.28515625, "learning_rate": 2.777777777777778e-05, "loss": 0.0302, "step": 7250 }, { "epoch": 4.463242079360197, "grad_norm": 0.2353515625, "learning_rate": 2.7762345679012348e-05, "loss": 0.0297, "step": 7255 }, { "epoch": 4.466318055982774, "grad_norm": 0.259765625, "learning_rate": 2.7746913580246913e-05, "loss": 0.0327, "step": 7260 }, { "epoch": 4.4693940326053525, "grad_norm": 0.25390625, "learning_rate": 2.773148148148148e-05, "loss": 0.034, "step": 7265 }, { "epoch": 4.47247000922793, "grad_norm": 0.265625, "learning_rate": 2.7716049382716053e-05, "loss": 0.0342, "step": 7270 }, { "epoch": 4.475545985850507, "grad_norm": 0.271484375, "learning_rate": 2.7700617283950618e-05, "loss": 0.0334, "step": 7275 }, { "epoch": 4.478621962473085, "grad_norm": 0.24609375, "learning_rate": 2.7685185185185186e-05, "loss": 0.03, "step": 7280 }, { "epoch": 4.481697939095663, "grad_norm": 0.259765625, "learning_rate": 2.766975308641975e-05, "loss": 0.0329, "step": 7285 }, { "epoch": 4.4847739157182405, "grad_norm": 0.30078125, "learning_rate": 2.765432098765432e-05, "loss": 0.0337, "step": 7290 }, { "epoch": 4.487849892340818, "grad_norm": 0.25390625, "learning_rate": 2.7638888888888892e-05, "loss": 0.0347, "step": 7295 }, { "epoch": 4.490925868963396, "grad_norm": 0.255859375, "learning_rate": 2.762345679012346e-05, "loss": 0.031, "step": 7300 }, { "epoch": 4.494001845585974, "grad_norm": 0.259765625, "learning_rate": 2.7608024691358025e-05, "loss": 0.036, "step": 7305 }, { "epoch": 4.497077822208551, "grad_norm": 0.263671875, "learning_rate": 2.7592592592592594e-05, "loss": 0.0309, "step": 7310 }, { "epoch": 4.500153798831128, "grad_norm": 0.279296875, "learning_rate": 2.7577160493827166e-05, "loss": 0.0321, "step": 7315 }, { "epoch": 4.503229775453707, "grad_norm": 0.2734375, "learning_rate": 2.756172839506173e-05, "loss": 0.0368, "step": 7320 }, { "epoch": 4.506305752076284, "grad_norm": 0.337890625, "learning_rate": 2.75462962962963e-05, "loss": 0.0351, "step": 7325 }, { "epoch": 4.509381728698862, "grad_norm": 0.328125, "learning_rate": 2.7530864197530864e-05, "loss": 0.0337, "step": 7330 }, { "epoch": 4.51245770532144, "grad_norm": 0.267578125, "learning_rate": 2.7515432098765433e-05, "loss": 0.0353, "step": 7335 }, { "epoch": 4.515533681944017, "grad_norm": 0.2890625, "learning_rate": 2.7500000000000004e-05, "loss": 0.0301, "step": 7340 }, { "epoch": 4.518609658566595, "grad_norm": 0.26171875, "learning_rate": 2.748456790123457e-05, "loss": 0.0337, "step": 7345 }, { "epoch": 4.521685635189172, "grad_norm": 0.271484375, "learning_rate": 2.7469135802469138e-05, "loss": 0.034, "step": 7350 }, { "epoch": 4.52476161181175, "grad_norm": 0.2412109375, "learning_rate": 2.7453703703703703e-05, "loss": 0.0292, "step": 7355 }, { "epoch": 4.527837588434328, "grad_norm": 0.310546875, "learning_rate": 2.743827160493827e-05, "loss": 0.0322, "step": 7360 }, { "epoch": 4.530913565056905, "grad_norm": 0.251953125, "learning_rate": 2.7422839506172843e-05, "loss": 0.0326, "step": 7365 }, { "epoch": 4.5339895416794835, "grad_norm": 0.267578125, "learning_rate": 2.7407407407407408e-05, "loss": 0.0315, "step": 7370 }, { "epoch": 4.537065518302061, "grad_norm": 0.283203125, "learning_rate": 2.7391975308641977e-05, "loss": 0.0332, "step": 7375 }, { "epoch": 4.540141494924638, "grad_norm": 0.28515625, "learning_rate": 2.7376543209876542e-05, "loss": 0.0308, "step": 7380 }, { "epoch": 4.543217471547216, "grad_norm": 0.2373046875, "learning_rate": 2.7361111111111114e-05, "loss": 0.03, "step": 7385 }, { "epoch": 4.546293448169794, "grad_norm": 0.267578125, "learning_rate": 2.7345679012345682e-05, "loss": 0.0318, "step": 7390 }, { "epoch": 4.5493694247923715, "grad_norm": 0.255859375, "learning_rate": 2.7330246913580247e-05, "loss": 0.0354, "step": 7395 }, { "epoch": 4.552445401414949, "grad_norm": 0.31640625, "learning_rate": 2.7314814814814816e-05, "loss": 0.035, "step": 7400 }, { "epoch": 4.555521378037527, "grad_norm": 0.2890625, "learning_rate": 2.729938271604938e-05, "loss": 0.0333, "step": 7405 }, { "epoch": 4.558597354660105, "grad_norm": 0.32421875, "learning_rate": 2.7283950617283956e-05, "loss": 0.0365, "step": 7410 }, { "epoch": 4.561673331282682, "grad_norm": 0.28515625, "learning_rate": 2.726851851851852e-05, "loss": 0.0303, "step": 7415 }, { "epoch": 4.5647493079052595, "grad_norm": 0.28515625, "learning_rate": 2.725308641975309e-05, "loss": 0.0339, "step": 7420 }, { "epoch": 4.567825284527838, "grad_norm": 0.271484375, "learning_rate": 2.7237654320987654e-05, "loss": 0.0354, "step": 7425 }, { "epoch": 4.570901261150415, "grad_norm": 0.267578125, "learning_rate": 2.7222222222222223e-05, "loss": 0.0358, "step": 7430 }, { "epoch": 4.573977237772993, "grad_norm": 0.271484375, "learning_rate": 2.7206790123456795e-05, "loss": 0.0308, "step": 7435 }, { "epoch": 4.577053214395571, "grad_norm": 0.275390625, "learning_rate": 2.719135802469136e-05, "loss": 0.0331, "step": 7440 }, { "epoch": 4.580129191018148, "grad_norm": 0.302734375, "learning_rate": 2.7175925925925928e-05, "loss": 0.0327, "step": 7445 }, { "epoch": 4.583205167640726, "grad_norm": 0.287109375, "learning_rate": 2.7160493827160493e-05, "loss": 0.0331, "step": 7450 }, { "epoch": 4.586281144263303, "grad_norm": 0.271484375, "learning_rate": 2.714506172839506e-05, "loss": 0.0309, "step": 7455 }, { "epoch": 4.589357120885881, "grad_norm": 0.23046875, "learning_rate": 2.7129629629629634e-05, "loss": 0.0313, "step": 7460 }, { "epoch": 4.592433097508459, "grad_norm": 0.29296875, "learning_rate": 2.71141975308642e-05, "loss": 0.0364, "step": 7465 }, { "epoch": 4.595509074131036, "grad_norm": 0.279296875, "learning_rate": 2.7098765432098767e-05, "loss": 0.034, "step": 7470 }, { "epoch": 4.598585050753615, "grad_norm": 0.2197265625, "learning_rate": 2.7083333333333332e-05, "loss": 0.0309, "step": 7475 }, { "epoch": 4.601661027376192, "grad_norm": 0.267578125, "learning_rate": 2.7067901234567904e-05, "loss": 0.032, "step": 7480 }, { "epoch": 4.604737003998769, "grad_norm": 0.255859375, "learning_rate": 2.7052469135802472e-05, "loss": 0.0339, "step": 7485 }, { "epoch": 4.607812980621347, "grad_norm": 0.25, "learning_rate": 2.7037037037037037e-05, "loss": 0.0321, "step": 7490 }, { "epoch": 4.610888957243925, "grad_norm": 0.296875, "learning_rate": 2.7021604938271606e-05, "loss": 0.0359, "step": 7495 }, { "epoch": 4.6139649338665025, "grad_norm": 0.255859375, "learning_rate": 2.700617283950617e-05, "loss": 0.0323, "step": 7500 }, { "epoch": 4.61704091048908, "grad_norm": 0.244140625, "learning_rate": 2.6990740740740743e-05, "loss": 0.0339, "step": 7505 }, { "epoch": 4.620116887111658, "grad_norm": 0.265625, "learning_rate": 2.697530864197531e-05, "loss": 0.0329, "step": 7510 }, { "epoch": 4.623192863734236, "grad_norm": 0.26953125, "learning_rate": 2.6959876543209876e-05, "loss": 0.0329, "step": 7515 }, { "epoch": 4.626268840356813, "grad_norm": 0.26171875, "learning_rate": 2.6944444444444445e-05, "loss": 0.0333, "step": 7520 }, { "epoch": 4.6293448169793905, "grad_norm": 0.27734375, "learning_rate": 2.692901234567901e-05, "loss": 0.0332, "step": 7525 }, { "epoch": 4.632420793601969, "grad_norm": 0.25, "learning_rate": 2.6913580246913585e-05, "loss": 0.0319, "step": 7530 }, { "epoch": 4.635496770224546, "grad_norm": 0.240234375, "learning_rate": 2.689814814814815e-05, "loss": 0.0333, "step": 7535 }, { "epoch": 4.638572746847124, "grad_norm": 0.31640625, "learning_rate": 2.688271604938272e-05, "loss": 0.0324, "step": 7540 }, { "epoch": 4.641648723469702, "grad_norm": 0.296875, "learning_rate": 2.6867283950617284e-05, "loss": 0.0311, "step": 7545 }, { "epoch": 4.644724700092279, "grad_norm": 0.279296875, "learning_rate": 2.6851851851851855e-05, "loss": 0.0334, "step": 7550 }, { "epoch": 4.647800676714857, "grad_norm": 0.27734375, "learning_rate": 2.6836419753086424e-05, "loss": 0.0303, "step": 7555 }, { "epoch": 4.650876653337434, "grad_norm": 0.228515625, "learning_rate": 2.682098765432099e-05, "loss": 0.0296, "step": 7560 }, { "epoch": 4.6539526299600125, "grad_norm": 0.287109375, "learning_rate": 2.6805555555555557e-05, "loss": 0.0305, "step": 7565 }, { "epoch": 4.65702860658259, "grad_norm": 0.25390625, "learning_rate": 2.6790123456790122e-05, "loss": 0.0351, "step": 7570 }, { "epoch": 4.660104583205167, "grad_norm": 0.259765625, "learning_rate": 2.6774691358024694e-05, "loss": 0.0305, "step": 7575 }, { "epoch": 4.663180559827746, "grad_norm": 0.3125, "learning_rate": 2.6759259259259263e-05, "loss": 0.0361, "step": 7580 }, { "epoch": 4.666256536450323, "grad_norm": 0.35546875, "learning_rate": 2.6743827160493828e-05, "loss": 0.0316, "step": 7585 }, { "epoch": 4.6693325130729, "grad_norm": 0.28515625, "learning_rate": 2.6728395061728396e-05, "loss": 0.0295, "step": 7590 }, { "epoch": 4.672408489695478, "grad_norm": 0.26953125, "learning_rate": 2.671296296296296e-05, "loss": 0.0305, "step": 7595 }, { "epoch": 4.675484466318056, "grad_norm": 0.251953125, "learning_rate": 2.6697530864197533e-05, "loss": 0.0293, "step": 7600 }, { "epoch": 4.678560442940634, "grad_norm": 0.310546875, "learning_rate": 2.66820987654321e-05, "loss": 0.033, "step": 7605 }, { "epoch": 4.681636419563211, "grad_norm": 0.291015625, "learning_rate": 2.6666666666666667e-05, "loss": 0.0309, "step": 7610 }, { "epoch": 4.684712396185789, "grad_norm": 0.27734375, "learning_rate": 2.6651234567901235e-05, "loss": 0.0312, "step": 7615 }, { "epoch": 4.687788372808367, "grad_norm": 0.259765625, "learning_rate": 2.66358024691358e-05, "loss": 0.0324, "step": 7620 }, { "epoch": 4.690864349430944, "grad_norm": 0.302734375, "learning_rate": 2.6620370370370372e-05, "loss": 0.0337, "step": 7625 }, { "epoch": 4.6939403260535215, "grad_norm": 0.22265625, "learning_rate": 2.660493827160494e-05, "loss": 0.0313, "step": 7630 }, { "epoch": 4.6970163026761, "grad_norm": 0.294921875, "learning_rate": 2.6589506172839505e-05, "loss": 0.0327, "step": 7635 }, { "epoch": 4.700092279298677, "grad_norm": 0.294921875, "learning_rate": 2.6574074074074074e-05, "loss": 0.0347, "step": 7640 }, { "epoch": 4.703168255921255, "grad_norm": 0.244140625, "learning_rate": 2.6558641975308646e-05, "loss": 0.0349, "step": 7645 }, { "epoch": 4.706244232543833, "grad_norm": 0.287109375, "learning_rate": 2.654320987654321e-05, "loss": 0.0294, "step": 7650 }, { "epoch": 4.70932020916641, "grad_norm": 0.2314453125, "learning_rate": 2.652777777777778e-05, "loss": 0.032, "step": 7655 }, { "epoch": 4.712396185788988, "grad_norm": 0.294921875, "learning_rate": 2.6512345679012344e-05, "loss": 0.0335, "step": 7660 }, { "epoch": 4.715472162411565, "grad_norm": 0.2412109375, "learning_rate": 2.6496913580246913e-05, "loss": 0.0328, "step": 7665 }, { "epoch": 4.7185481390341435, "grad_norm": 0.2890625, "learning_rate": 2.6481481481481485e-05, "loss": 0.0326, "step": 7670 }, { "epoch": 4.721624115656721, "grad_norm": 0.2890625, "learning_rate": 2.6466049382716053e-05, "loss": 0.0327, "step": 7675 }, { "epoch": 4.724700092279298, "grad_norm": 0.263671875, "learning_rate": 2.6450617283950618e-05, "loss": 0.0321, "step": 7680 }, { "epoch": 4.727776068901877, "grad_norm": 0.263671875, "learning_rate": 2.6435185185185187e-05, "loss": 0.0306, "step": 7685 }, { "epoch": 4.730852045524454, "grad_norm": 0.263671875, "learning_rate": 2.641975308641975e-05, "loss": 0.031, "step": 7690 }, { "epoch": 4.7339280221470315, "grad_norm": 0.298828125, "learning_rate": 2.6404320987654323e-05, "loss": 0.034, "step": 7695 }, { "epoch": 4.737003998769609, "grad_norm": 0.2890625, "learning_rate": 2.6388888888888892e-05, "loss": 0.0329, "step": 7700 }, { "epoch": 4.740079975392187, "grad_norm": 0.26953125, "learning_rate": 2.6373456790123457e-05, "loss": 0.0344, "step": 7705 }, { "epoch": 4.743155952014765, "grad_norm": 0.271484375, "learning_rate": 2.6358024691358025e-05, "loss": 0.0309, "step": 7710 }, { "epoch": 4.746231928637342, "grad_norm": 0.306640625, "learning_rate": 2.6342592592592597e-05, "loss": 0.0319, "step": 7715 }, { "epoch": 4.74930790525992, "grad_norm": 0.2734375, "learning_rate": 2.6327160493827162e-05, "loss": 0.03, "step": 7720 }, { "epoch": 4.752383881882498, "grad_norm": 0.3125, "learning_rate": 2.631172839506173e-05, "loss": 0.0406, "step": 7725 }, { "epoch": 4.755459858505075, "grad_norm": 0.275390625, "learning_rate": 2.6296296296296296e-05, "loss": 0.0344, "step": 7730 }, { "epoch": 4.7585358351276525, "grad_norm": 0.30078125, "learning_rate": 2.6280864197530864e-05, "loss": 0.0347, "step": 7735 }, { "epoch": 4.761611811750231, "grad_norm": 0.25390625, "learning_rate": 2.6265432098765436e-05, "loss": 0.0345, "step": 7740 }, { "epoch": 4.764687788372808, "grad_norm": 0.30859375, "learning_rate": 2.625e-05, "loss": 0.0316, "step": 7745 }, { "epoch": 4.767763764995386, "grad_norm": 0.263671875, "learning_rate": 2.623456790123457e-05, "loss": 0.0297, "step": 7750 }, { "epoch": 4.770839741617964, "grad_norm": 0.287109375, "learning_rate": 2.6219135802469135e-05, "loss": 0.0328, "step": 7755 }, { "epoch": 4.773915718240541, "grad_norm": 0.294921875, "learning_rate": 2.6203703703703703e-05, "loss": 0.034, "step": 7760 }, { "epoch": 4.776991694863119, "grad_norm": 0.3203125, "learning_rate": 2.6188271604938275e-05, "loss": 0.035, "step": 7765 }, { "epoch": 4.780067671485696, "grad_norm": 0.275390625, "learning_rate": 2.617283950617284e-05, "loss": 0.0309, "step": 7770 }, { "epoch": 4.7831436481082745, "grad_norm": 0.25390625, "learning_rate": 2.615740740740741e-05, "loss": 0.0332, "step": 7775 }, { "epoch": 4.786219624730852, "grad_norm": 0.30859375, "learning_rate": 2.6141975308641973e-05, "loss": 0.0313, "step": 7780 }, { "epoch": 4.789295601353429, "grad_norm": 0.322265625, "learning_rate": 2.6126543209876542e-05, "loss": 0.033, "step": 7785 }, { "epoch": 4.792371577976008, "grad_norm": 0.271484375, "learning_rate": 2.6111111111111114e-05, "loss": 0.034, "step": 7790 }, { "epoch": 4.795447554598585, "grad_norm": 0.2578125, "learning_rate": 2.6095679012345682e-05, "loss": 0.0314, "step": 7795 }, { "epoch": 4.7985235312211625, "grad_norm": 0.287109375, "learning_rate": 2.6080246913580247e-05, "loss": 0.0348, "step": 7800 }, { "epoch": 4.80159950784374, "grad_norm": 0.29296875, "learning_rate": 2.6064814814814816e-05, "loss": 0.032, "step": 7805 }, { "epoch": 4.804675484466318, "grad_norm": 0.2275390625, "learning_rate": 2.6049382716049388e-05, "loss": 0.0331, "step": 7810 }, { "epoch": 4.807751461088896, "grad_norm": 0.2216796875, "learning_rate": 2.6033950617283953e-05, "loss": 0.0309, "step": 7815 }, { "epoch": 4.810827437711473, "grad_norm": 0.240234375, "learning_rate": 2.601851851851852e-05, "loss": 0.0328, "step": 7820 }, { "epoch": 4.813903414334051, "grad_norm": 0.259765625, "learning_rate": 2.6003086419753086e-05, "loss": 0.0341, "step": 7825 }, { "epoch": 4.816979390956629, "grad_norm": 0.271484375, "learning_rate": 2.5987654320987655e-05, "loss": 0.0306, "step": 7830 }, { "epoch": 4.820055367579206, "grad_norm": 0.265625, "learning_rate": 2.5972222222222226e-05, "loss": 0.0323, "step": 7835 }, { "epoch": 4.823131344201784, "grad_norm": 0.326171875, "learning_rate": 2.595679012345679e-05, "loss": 0.0344, "step": 7840 }, { "epoch": 4.826207320824362, "grad_norm": 0.29296875, "learning_rate": 2.594135802469136e-05, "loss": 0.0335, "step": 7845 }, { "epoch": 4.829283297446939, "grad_norm": 0.25390625, "learning_rate": 2.5925925925925925e-05, "loss": 0.0331, "step": 7850 }, { "epoch": 4.832359274069517, "grad_norm": 0.296875, "learning_rate": 2.5910493827160493e-05, "loss": 0.0313, "step": 7855 }, { "epoch": 4.835435250692095, "grad_norm": 0.263671875, "learning_rate": 2.5895061728395065e-05, "loss": 0.0332, "step": 7860 }, { "epoch": 4.838511227314672, "grad_norm": 0.298828125, "learning_rate": 2.587962962962963e-05, "loss": 0.0322, "step": 7865 }, { "epoch": 4.84158720393725, "grad_norm": 0.2314453125, "learning_rate": 2.58641975308642e-05, "loss": 0.0307, "step": 7870 }, { "epoch": 4.844663180559827, "grad_norm": 0.251953125, "learning_rate": 2.5848765432098764e-05, "loss": 0.0318, "step": 7875 }, { "epoch": 4.847739157182406, "grad_norm": 0.259765625, "learning_rate": 2.5833333333333336e-05, "loss": 0.0302, "step": 7880 }, { "epoch": 4.850815133804983, "grad_norm": 0.251953125, "learning_rate": 2.5817901234567904e-05, "loss": 0.0332, "step": 7885 }, { "epoch": 4.85389111042756, "grad_norm": 0.259765625, "learning_rate": 2.580246913580247e-05, "loss": 0.0315, "step": 7890 }, { "epoch": 4.856967087050139, "grad_norm": 0.2470703125, "learning_rate": 2.5787037037037038e-05, "loss": 0.0322, "step": 7895 }, { "epoch": 4.860043063672716, "grad_norm": 0.2578125, "learning_rate": 2.5771604938271603e-05, "loss": 0.0322, "step": 7900 }, { "epoch": 4.8631190402952935, "grad_norm": 0.271484375, "learning_rate": 2.5756172839506178e-05, "loss": 0.0305, "step": 7905 }, { "epoch": 4.866195016917871, "grad_norm": 0.287109375, "learning_rate": 2.5740740740740743e-05, "loss": 0.0335, "step": 7910 }, { "epoch": 4.869270993540449, "grad_norm": 0.26171875, "learning_rate": 2.572530864197531e-05, "loss": 0.0361, "step": 7915 }, { "epoch": 4.872346970163027, "grad_norm": 0.291015625, "learning_rate": 2.5709876543209876e-05, "loss": 0.0327, "step": 7920 }, { "epoch": 4.875422946785604, "grad_norm": 0.267578125, "learning_rate": 2.5694444444444445e-05, "loss": 0.0308, "step": 7925 }, { "epoch": 4.878498923408182, "grad_norm": 0.275390625, "learning_rate": 2.5679012345679017e-05, "loss": 0.0358, "step": 7930 }, { "epoch": 4.88157490003076, "grad_norm": 0.298828125, "learning_rate": 2.5663580246913582e-05, "loss": 0.0314, "step": 7935 }, { "epoch": 4.884650876653337, "grad_norm": 0.291015625, "learning_rate": 2.564814814814815e-05, "loss": 0.0332, "step": 7940 }, { "epoch": 4.887726853275915, "grad_norm": 0.263671875, "learning_rate": 2.5632716049382715e-05, "loss": 0.0331, "step": 7945 }, { "epoch": 4.890802829898493, "grad_norm": 0.259765625, "learning_rate": 2.5617283950617287e-05, "loss": 0.0322, "step": 7950 }, { "epoch": 4.89387880652107, "grad_norm": 0.2314453125, "learning_rate": 2.5601851851851856e-05, "loss": 0.0285, "step": 7955 }, { "epoch": 4.896954783143648, "grad_norm": 0.29296875, "learning_rate": 2.558641975308642e-05, "loss": 0.032, "step": 7960 }, { "epoch": 4.900030759766226, "grad_norm": 0.255859375, "learning_rate": 2.557098765432099e-05, "loss": 0.0329, "step": 7965 }, { "epoch": 4.9031067363888035, "grad_norm": 0.267578125, "learning_rate": 2.5555555555555554e-05, "loss": 0.0354, "step": 7970 }, { "epoch": 4.906182713011381, "grad_norm": 0.263671875, "learning_rate": 2.5540123456790126e-05, "loss": 0.0339, "step": 7975 }, { "epoch": 4.909258689633958, "grad_norm": 0.27734375, "learning_rate": 2.5524691358024694e-05, "loss": 0.034, "step": 7980 }, { "epoch": 4.912334666256537, "grad_norm": 0.259765625, "learning_rate": 2.550925925925926e-05, "loss": 0.0332, "step": 7985 }, { "epoch": 4.915410642879114, "grad_norm": 0.259765625, "learning_rate": 2.5493827160493828e-05, "loss": 0.0312, "step": 7990 }, { "epoch": 4.918486619501691, "grad_norm": 0.279296875, "learning_rate": 2.5478395061728393e-05, "loss": 0.0317, "step": 7995 }, { "epoch": 4.92156259612427, "grad_norm": 0.2392578125, "learning_rate": 2.5462962962962965e-05, "loss": 0.0313, "step": 8000 }, { "epoch": 4.924638572746847, "grad_norm": 0.244140625, "learning_rate": 2.5447530864197533e-05, "loss": 0.0354, "step": 8005 }, { "epoch": 4.9277145493694245, "grad_norm": 0.28125, "learning_rate": 2.5432098765432098e-05, "loss": 0.0316, "step": 8010 }, { "epoch": 4.930790525992003, "grad_norm": 0.2734375, "learning_rate": 2.5416666666666667e-05, "loss": 0.0337, "step": 8015 }, { "epoch": 4.93386650261458, "grad_norm": 0.263671875, "learning_rate": 2.5401234567901232e-05, "loss": 0.031, "step": 8020 }, { "epoch": 4.936942479237158, "grad_norm": 0.2578125, "learning_rate": 2.5385802469135807e-05, "loss": 0.0338, "step": 8025 }, { "epoch": 4.940018455859735, "grad_norm": 0.30859375, "learning_rate": 2.5370370370370372e-05, "loss": 0.0322, "step": 8030 }, { "epoch": 4.943094432482313, "grad_norm": 0.24609375, "learning_rate": 2.535493827160494e-05, "loss": 0.0324, "step": 8035 }, { "epoch": 4.946170409104891, "grad_norm": 0.279296875, "learning_rate": 2.5339506172839506e-05, "loss": 0.0322, "step": 8040 }, { "epoch": 4.949246385727468, "grad_norm": 0.28125, "learning_rate": 2.5324074074074077e-05, "loss": 0.031, "step": 8045 }, { "epoch": 4.9523223623500465, "grad_norm": 0.28515625, "learning_rate": 2.5308641975308646e-05, "loss": 0.033, "step": 8050 }, { "epoch": 4.955398338972624, "grad_norm": 0.26953125, "learning_rate": 2.529320987654321e-05, "loss": 0.0316, "step": 8055 }, { "epoch": 4.958474315595201, "grad_norm": 0.2890625, "learning_rate": 2.527777777777778e-05, "loss": 0.0326, "step": 8060 }, { "epoch": 4.961550292217779, "grad_norm": 0.28515625, "learning_rate": 2.5262345679012344e-05, "loss": 0.0336, "step": 8065 }, { "epoch": 4.964626268840357, "grad_norm": 0.291015625, "learning_rate": 2.5246913580246916e-05, "loss": 0.0362, "step": 8070 }, { "epoch": 4.9677022454629345, "grad_norm": 0.2373046875, "learning_rate": 2.5231481481481485e-05, "loss": 0.0325, "step": 8075 }, { "epoch": 4.970778222085512, "grad_norm": 0.26171875, "learning_rate": 2.521604938271605e-05, "loss": 0.0325, "step": 8080 }, { "epoch": 4.97385419870809, "grad_norm": 0.27734375, "learning_rate": 2.5200617283950618e-05, "loss": 0.0345, "step": 8085 }, { "epoch": 4.976930175330668, "grad_norm": 0.294921875, "learning_rate": 2.5185185185185183e-05, "loss": 0.0319, "step": 8090 }, { "epoch": 4.980006151953245, "grad_norm": 0.2392578125, "learning_rate": 2.5169753086419755e-05, "loss": 0.0324, "step": 8095 }, { "epoch": 4.9830821285758224, "grad_norm": 0.236328125, "learning_rate": 2.5154320987654324e-05, "loss": 0.0308, "step": 8100 }, { "epoch": 4.986158105198401, "grad_norm": 0.302734375, "learning_rate": 2.513888888888889e-05, "loss": 0.0318, "step": 8105 }, { "epoch": 4.989234081820978, "grad_norm": 0.28515625, "learning_rate": 2.5123456790123457e-05, "loss": 0.0331, "step": 8110 }, { "epoch": 4.992310058443556, "grad_norm": 0.3125, "learning_rate": 2.510802469135803e-05, "loss": 0.0347, "step": 8115 }, { "epoch": 4.995386035066134, "grad_norm": 0.236328125, "learning_rate": 2.5092592592592594e-05, "loss": 0.0333, "step": 8120 }, { "epoch": 4.998462011688711, "grad_norm": 0.271484375, "learning_rate": 2.5077160493827162e-05, "loss": 0.0312, "step": 8125 }, { "epoch": 5.001537988311289, "grad_norm": 0.240234375, "learning_rate": 2.5061728395061727e-05, "loss": 0.0313, "step": 8130 }, { "epoch": 5.004613964933866, "grad_norm": 0.2734375, "learning_rate": 2.5046296296296296e-05, "loss": 0.0279, "step": 8135 }, { "epoch": 5.007689941556444, "grad_norm": 0.298828125, "learning_rate": 2.5030864197530868e-05, "loss": 0.0267, "step": 8140 }, { "epoch": 5.010765918179022, "grad_norm": 0.26953125, "learning_rate": 2.5015432098765436e-05, "loss": 0.0271, "step": 8145 }, { "epoch": 5.013841894801599, "grad_norm": 0.298828125, "learning_rate": 2.5e-05, "loss": 0.0304, "step": 8150 }, { "epoch": 5.0169178714241776, "grad_norm": 0.279296875, "learning_rate": 2.4984567901234566e-05, "loss": 0.0278, "step": 8155 }, { "epoch": 5.019993848046755, "grad_norm": 0.26953125, "learning_rate": 2.4969135802469138e-05, "loss": 0.0307, "step": 8160 }, { "epoch": 5.023069824669332, "grad_norm": 0.279296875, "learning_rate": 2.4953703703703703e-05, "loss": 0.0287, "step": 8165 }, { "epoch": 5.02614580129191, "grad_norm": 0.373046875, "learning_rate": 2.4938271604938275e-05, "loss": 0.0299, "step": 8170 }, { "epoch": 5.029221777914488, "grad_norm": 0.2578125, "learning_rate": 2.492283950617284e-05, "loss": 0.0255, "step": 8175 }, { "epoch": 5.0322977545370655, "grad_norm": 0.294921875, "learning_rate": 2.490740740740741e-05, "loss": 0.0274, "step": 8180 }, { "epoch": 5.035373731159643, "grad_norm": 0.271484375, "learning_rate": 2.4891975308641977e-05, "loss": 0.0261, "step": 8185 }, { "epoch": 5.038449707782221, "grad_norm": 0.28515625, "learning_rate": 2.4876543209876542e-05, "loss": 0.0291, "step": 8190 }, { "epoch": 5.041525684404799, "grad_norm": 0.279296875, "learning_rate": 2.4861111111111114e-05, "loss": 0.0285, "step": 8195 }, { "epoch": 5.044601661027376, "grad_norm": 0.2578125, "learning_rate": 2.484567901234568e-05, "loss": 0.0274, "step": 8200 }, { "epoch": 5.0476776376499535, "grad_norm": 0.2451171875, "learning_rate": 2.4830246913580247e-05, "loss": 0.0303, "step": 8205 }, { "epoch": 5.050753614272532, "grad_norm": 0.337890625, "learning_rate": 2.4814814814814816e-05, "loss": 0.0264, "step": 8210 }, { "epoch": 5.053829590895109, "grad_norm": 0.23828125, "learning_rate": 2.4799382716049384e-05, "loss": 0.0264, "step": 8215 }, { "epoch": 5.056905567517687, "grad_norm": 0.30078125, "learning_rate": 2.4783950617283953e-05, "loss": 0.0297, "step": 8220 }, { "epoch": 5.059981544140265, "grad_norm": 0.2275390625, "learning_rate": 2.4768518518518518e-05, "loss": 0.028, "step": 8225 }, { "epoch": 5.063057520762842, "grad_norm": 0.306640625, "learning_rate": 2.475308641975309e-05, "loss": 0.0316, "step": 8230 }, { "epoch": 5.06613349738542, "grad_norm": 0.322265625, "learning_rate": 2.4737654320987655e-05, "loss": 0.0278, "step": 8235 }, { "epoch": 5.069209474007997, "grad_norm": 0.2470703125, "learning_rate": 2.4722222222222223e-05, "loss": 0.0253, "step": 8240 }, { "epoch": 5.0722854506305755, "grad_norm": 0.283203125, "learning_rate": 2.470679012345679e-05, "loss": 0.0252, "step": 8245 }, { "epoch": 5.075361427253153, "grad_norm": 0.259765625, "learning_rate": 2.4691358024691357e-05, "loss": 0.026, "step": 8250 }, { "epoch": 5.07843740387573, "grad_norm": 0.2373046875, "learning_rate": 2.467592592592593e-05, "loss": 0.0265, "step": 8255 }, { "epoch": 5.081513380498309, "grad_norm": 0.259765625, "learning_rate": 2.4660493827160493e-05, "loss": 0.0259, "step": 8260 }, { "epoch": 5.084589357120886, "grad_norm": 0.255859375, "learning_rate": 2.4645061728395062e-05, "loss": 0.0275, "step": 8265 }, { "epoch": 5.087665333743463, "grad_norm": 0.2734375, "learning_rate": 2.462962962962963e-05, "loss": 0.0273, "step": 8270 }, { "epoch": 5.090741310366041, "grad_norm": 0.28125, "learning_rate": 2.46141975308642e-05, "loss": 0.0293, "step": 8275 }, { "epoch": 5.093817286988619, "grad_norm": 0.2314453125, "learning_rate": 2.4598765432098767e-05, "loss": 0.0271, "step": 8280 }, { "epoch": 5.0968932636111965, "grad_norm": 0.244140625, "learning_rate": 2.4583333333333332e-05, "loss": 0.0274, "step": 8285 }, { "epoch": 5.099969240233774, "grad_norm": 0.263671875, "learning_rate": 2.4567901234567904e-05, "loss": 0.0232, "step": 8290 }, { "epoch": 5.103045216856352, "grad_norm": 0.291015625, "learning_rate": 2.455246913580247e-05, "loss": 0.0289, "step": 8295 }, { "epoch": 5.10612119347893, "grad_norm": 0.271484375, "learning_rate": 2.4537037037037038e-05, "loss": 0.0285, "step": 8300 }, { "epoch": 5.109197170101507, "grad_norm": 0.28515625, "learning_rate": 2.4521604938271606e-05, "loss": 0.0294, "step": 8305 }, { "epoch": 5.1122731467240845, "grad_norm": 0.318359375, "learning_rate": 2.4506172839506175e-05, "loss": 0.0288, "step": 8310 }, { "epoch": 5.115349123346663, "grad_norm": 0.28125, "learning_rate": 2.4490740740740743e-05, "loss": 0.0255, "step": 8315 }, { "epoch": 5.11842509996924, "grad_norm": 0.291015625, "learning_rate": 2.4475308641975308e-05, "loss": 0.0302, "step": 8320 }, { "epoch": 5.121501076591818, "grad_norm": 0.2470703125, "learning_rate": 2.4459876543209876e-05, "loss": 0.0263, "step": 8325 }, { "epoch": 5.124577053214396, "grad_norm": 0.291015625, "learning_rate": 2.4444444444444445e-05, "loss": 0.0295, "step": 8330 }, { "epoch": 5.127653029836973, "grad_norm": 0.248046875, "learning_rate": 2.4429012345679013e-05, "loss": 0.0278, "step": 8335 }, { "epoch": 5.130729006459551, "grad_norm": 0.2890625, "learning_rate": 2.4413580246913582e-05, "loss": 0.0277, "step": 8340 }, { "epoch": 5.133804983082128, "grad_norm": 0.287109375, "learning_rate": 2.439814814814815e-05, "loss": 0.0297, "step": 8345 }, { "epoch": 5.1368809597047065, "grad_norm": 0.2080078125, "learning_rate": 2.438271604938272e-05, "loss": 0.0258, "step": 8350 }, { "epoch": 5.139956936327284, "grad_norm": 0.267578125, "learning_rate": 2.4367283950617284e-05, "loss": 0.0283, "step": 8355 }, { "epoch": 5.143032912949861, "grad_norm": 0.28515625, "learning_rate": 2.4351851851851852e-05, "loss": 0.0285, "step": 8360 }, { "epoch": 5.14610888957244, "grad_norm": 0.3046875, "learning_rate": 2.433641975308642e-05, "loss": 0.0279, "step": 8365 }, { "epoch": 5.149184866195017, "grad_norm": 0.314453125, "learning_rate": 2.432098765432099e-05, "loss": 0.0289, "step": 8370 }, { "epoch": 5.152260842817594, "grad_norm": 0.2578125, "learning_rate": 2.4305555555555558e-05, "loss": 0.0275, "step": 8375 }, { "epoch": 5.155336819440172, "grad_norm": 0.287109375, "learning_rate": 2.4290123456790126e-05, "loss": 0.0288, "step": 8380 }, { "epoch": 5.15841279606275, "grad_norm": 0.263671875, "learning_rate": 2.427469135802469e-05, "loss": 0.0304, "step": 8385 }, { "epoch": 5.161488772685328, "grad_norm": 0.296875, "learning_rate": 2.425925925925926e-05, "loss": 0.0287, "step": 8390 }, { "epoch": 5.164564749307905, "grad_norm": 0.23828125, "learning_rate": 2.4243827160493828e-05, "loss": 0.0272, "step": 8395 }, { "epoch": 5.167640725930483, "grad_norm": 0.3203125, "learning_rate": 2.4228395061728396e-05, "loss": 0.025, "step": 8400 }, { "epoch": 5.170716702553061, "grad_norm": 0.2890625, "learning_rate": 2.4212962962962965e-05, "loss": 0.0278, "step": 8405 }, { "epoch": 5.173792679175638, "grad_norm": 0.32421875, "learning_rate": 2.4197530864197533e-05, "loss": 0.0281, "step": 8410 }, { "epoch": 5.1768686557982155, "grad_norm": 0.263671875, "learning_rate": 2.4182098765432102e-05, "loss": 0.027, "step": 8415 }, { "epoch": 5.179944632420794, "grad_norm": 0.298828125, "learning_rate": 2.4166666666666667e-05, "loss": 0.0276, "step": 8420 }, { "epoch": 5.183020609043371, "grad_norm": 0.314453125, "learning_rate": 2.4151234567901235e-05, "loss": 0.0289, "step": 8425 }, { "epoch": 5.186096585665949, "grad_norm": 0.275390625, "learning_rate": 2.4135802469135804e-05, "loss": 0.0255, "step": 8430 }, { "epoch": 5.189172562288527, "grad_norm": 0.24609375, "learning_rate": 2.4120370370370372e-05, "loss": 0.0249, "step": 8435 }, { "epoch": 5.192248538911104, "grad_norm": 0.2578125, "learning_rate": 2.410493827160494e-05, "loss": 0.028, "step": 8440 }, { "epoch": 5.195324515533682, "grad_norm": 0.275390625, "learning_rate": 2.4089506172839506e-05, "loss": 0.0265, "step": 8445 }, { "epoch": 5.198400492156259, "grad_norm": 0.28125, "learning_rate": 2.4074074074074074e-05, "loss": 0.0275, "step": 8450 }, { "epoch": 5.2014764687788375, "grad_norm": 0.283203125, "learning_rate": 2.4058641975308643e-05, "loss": 0.0275, "step": 8455 }, { "epoch": 5.204552445401415, "grad_norm": 0.259765625, "learning_rate": 2.404320987654321e-05, "loss": 0.0296, "step": 8460 }, { "epoch": 5.207628422023992, "grad_norm": 0.2421875, "learning_rate": 2.402777777777778e-05, "loss": 0.0266, "step": 8465 }, { "epoch": 5.210704398646571, "grad_norm": 0.2392578125, "learning_rate": 2.4012345679012348e-05, "loss": 0.0264, "step": 8470 }, { "epoch": 5.213780375269148, "grad_norm": 0.287109375, "learning_rate": 2.3996913580246916e-05, "loss": 0.0294, "step": 8475 }, { "epoch": 5.2168563518917255, "grad_norm": 0.341796875, "learning_rate": 2.398148148148148e-05, "loss": 0.0288, "step": 8480 }, { "epoch": 5.219932328514303, "grad_norm": 0.318359375, "learning_rate": 2.396604938271605e-05, "loss": 0.0301, "step": 8485 }, { "epoch": 5.223008305136881, "grad_norm": 0.30078125, "learning_rate": 2.3950617283950618e-05, "loss": 0.0276, "step": 8490 }, { "epoch": 5.226084281759459, "grad_norm": 0.306640625, "learning_rate": 2.3935185185185187e-05, "loss": 0.0286, "step": 8495 }, { "epoch": 5.229160258382036, "grad_norm": 0.310546875, "learning_rate": 2.3919753086419755e-05, "loss": 0.0296, "step": 8500 }, { "epoch": 5.232236235004614, "grad_norm": 0.28515625, "learning_rate": 2.390432098765432e-05, "loss": 0.0269, "step": 8505 }, { "epoch": 5.235312211627192, "grad_norm": 0.314453125, "learning_rate": 2.3888888888888892e-05, "loss": 0.0284, "step": 8510 }, { "epoch": 5.238388188249769, "grad_norm": 0.26953125, "learning_rate": 2.3873456790123457e-05, "loss": 0.0265, "step": 8515 }, { "epoch": 5.241464164872347, "grad_norm": 0.310546875, "learning_rate": 2.3858024691358026e-05, "loss": 0.0296, "step": 8520 }, { "epoch": 5.244540141494925, "grad_norm": 0.255859375, "learning_rate": 2.3842592592592594e-05, "loss": 0.0279, "step": 8525 }, { "epoch": 5.247616118117502, "grad_norm": 0.271484375, "learning_rate": 2.3827160493827162e-05, "loss": 0.0256, "step": 8530 }, { "epoch": 5.25069209474008, "grad_norm": 0.248046875, "learning_rate": 2.381172839506173e-05, "loss": 0.0253, "step": 8535 }, { "epoch": 5.253768071362658, "grad_norm": 0.298828125, "learning_rate": 2.3796296296296296e-05, "loss": 0.0287, "step": 8540 }, { "epoch": 5.256844047985235, "grad_norm": 0.2890625, "learning_rate": 2.3780864197530868e-05, "loss": 0.0277, "step": 8545 }, { "epoch": 5.259920024607813, "grad_norm": 0.2890625, "learning_rate": 2.3765432098765433e-05, "loss": 0.0297, "step": 8550 }, { "epoch": 5.26299600123039, "grad_norm": 0.296875, "learning_rate": 2.375e-05, "loss": 0.0295, "step": 8555 }, { "epoch": 5.2660719778529685, "grad_norm": 0.2578125, "learning_rate": 2.373456790123457e-05, "loss": 0.0311, "step": 8560 }, { "epoch": 5.269147954475546, "grad_norm": 0.2236328125, "learning_rate": 2.3719135802469135e-05, "loss": 0.0257, "step": 8565 }, { "epoch": 5.272223931098123, "grad_norm": 0.296875, "learning_rate": 2.3703703703703707e-05, "loss": 0.0272, "step": 8570 }, { "epoch": 5.275299907720702, "grad_norm": 0.26953125, "learning_rate": 2.3688271604938272e-05, "loss": 0.0274, "step": 8575 }, { "epoch": 5.278375884343279, "grad_norm": 0.263671875, "learning_rate": 2.3672839506172844e-05, "loss": 0.0296, "step": 8580 }, { "epoch": 5.2814518609658565, "grad_norm": 0.259765625, "learning_rate": 2.365740740740741e-05, "loss": 0.026, "step": 8585 }, { "epoch": 5.284527837588434, "grad_norm": 0.25390625, "learning_rate": 2.3641975308641977e-05, "loss": 0.0281, "step": 8590 }, { "epoch": 5.287603814211012, "grad_norm": 0.259765625, "learning_rate": 2.3626543209876545e-05, "loss": 0.0316, "step": 8595 }, { "epoch": 5.29067979083359, "grad_norm": 0.298828125, "learning_rate": 2.361111111111111e-05, "loss": 0.0272, "step": 8600 }, { "epoch": 5.293755767456167, "grad_norm": 0.25390625, "learning_rate": 2.3595679012345682e-05, "loss": 0.0285, "step": 8605 }, { "epoch": 5.296831744078745, "grad_norm": 0.302734375, "learning_rate": 2.3580246913580247e-05, "loss": 0.0283, "step": 8610 }, { "epoch": 5.299907720701323, "grad_norm": 0.296875, "learning_rate": 2.3564814814814816e-05, "loss": 0.0281, "step": 8615 }, { "epoch": 5.3029836973239, "grad_norm": 0.25390625, "learning_rate": 2.3549382716049384e-05, "loss": 0.0271, "step": 8620 }, { "epoch": 5.306059673946478, "grad_norm": 0.3828125, "learning_rate": 2.353395061728395e-05, "loss": 0.0287, "step": 8625 }, { "epoch": 5.309135650569056, "grad_norm": 0.31640625, "learning_rate": 2.351851851851852e-05, "loss": 0.0294, "step": 8630 }, { "epoch": 5.312211627191633, "grad_norm": 0.298828125, "learning_rate": 2.3503086419753086e-05, "loss": 0.0259, "step": 8635 }, { "epoch": 5.315287603814211, "grad_norm": 0.279296875, "learning_rate": 2.3487654320987658e-05, "loss": 0.0277, "step": 8640 }, { "epoch": 5.318363580436789, "grad_norm": 0.265625, "learning_rate": 2.3472222222222223e-05, "loss": 0.0247, "step": 8645 }, { "epoch": 5.321439557059366, "grad_norm": 0.251953125, "learning_rate": 2.345679012345679e-05, "loss": 0.027, "step": 8650 }, { "epoch": 5.324515533681944, "grad_norm": 0.267578125, "learning_rate": 2.344135802469136e-05, "loss": 0.031, "step": 8655 }, { "epoch": 5.327591510304521, "grad_norm": 0.33984375, "learning_rate": 2.3425925925925925e-05, "loss": 0.0292, "step": 8660 }, { "epoch": 5.3306674869271, "grad_norm": 0.271484375, "learning_rate": 2.3410493827160497e-05, "loss": 0.0269, "step": 8665 }, { "epoch": 5.333743463549677, "grad_norm": 0.28515625, "learning_rate": 2.3395061728395062e-05, "loss": 0.0272, "step": 8670 }, { "epoch": 5.336819440172254, "grad_norm": 0.255859375, "learning_rate": 2.337962962962963e-05, "loss": 0.0278, "step": 8675 }, { "epoch": 5.339895416794833, "grad_norm": 0.2734375, "learning_rate": 2.33641975308642e-05, "loss": 0.03, "step": 8680 }, { "epoch": 5.34297139341741, "grad_norm": 0.2578125, "learning_rate": 2.3348765432098764e-05, "loss": 0.0286, "step": 8685 }, { "epoch": 5.3460473700399875, "grad_norm": 0.294921875, "learning_rate": 2.3333333333333336e-05, "loss": 0.03, "step": 8690 }, { "epoch": 5.349123346662565, "grad_norm": 0.23046875, "learning_rate": 2.33179012345679e-05, "loss": 0.0271, "step": 8695 }, { "epoch": 5.352199323285143, "grad_norm": 0.2470703125, "learning_rate": 2.3302469135802473e-05, "loss": 0.029, "step": 8700 }, { "epoch": 5.355275299907721, "grad_norm": 0.25390625, "learning_rate": 2.3287037037037038e-05, "loss": 0.028, "step": 8705 }, { "epoch": 5.358351276530298, "grad_norm": 0.2578125, "learning_rate": 2.3271604938271606e-05, "loss": 0.0308, "step": 8710 }, { "epoch": 5.361427253152876, "grad_norm": 0.24609375, "learning_rate": 2.3256172839506175e-05, "loss": 0.0259, "step": 8715 }, { "epoch": 5.364503229775454, "grad_norm": 0.275390625, "learning_rate": 2.324074074074074e-05, "loss": 0.0285, "step": 8720 }, { "epoch": 5.367579206398031, "grad_norm": 0.29296875, "learning_rate": 2.322530864197531e-05, "loss": 0.0264, "step": 8725 }, { "epoch": 5.3706551830206095, "grad_norm": 0.3046875, "learning_rate": 2.3209876543209877e-05, "loss": 0.0272, "step": 8730 }, { "epoch": 5.373731159643187, "grad_norm": 0.28125, "learning_rate": 2.3194444444444445e-05, "loss": 0.0288, "step": 8735 }, { "epoch": 5.376807136265764, "grad_norm": 0.27734375, "learning_rate": 2.3179012345679013e-05, "loss": 0.0277, "step": 8740 }, { "epoch": 5.379883112888342, "grad_norm": 0.302734375, "learning_rate": 2.3163580246913582e-05, "loss": 0.029, "step": 8745 }, { "epoch": 5.38295908951092, "grad_norm": 0.27734375, "learning_rate": 2.314814814814815e-05, "loss": 0.0294, "step": 8750 }, { "epoch": 5.3860350661334975, "grad_norm": 0.279296875, "learning_rate": 2.3132716049382715e-05, "loss": 0.0289, "step": 8755 }, { "epoch": 5.389111042756075, "grad_norm": 0.294921875, "learning_rate": 2.3117283950617284e-05, "loss": 0.0296, "step": 8760 }, { "epoch": 5.392187019378653, "grad_norm": 0.259765625, "learning_rate": 2.3101851851851852e-05, "loss": 0.0259, "step": 8765 }, { "epoch": 5.395262996001231, "grad_norm": 0.265625, "learning_rate": 2.308641975308642e-05, "loss": 0.0301, "step": 8770 }, { "epoch": 5.398338972623808, "grad_norm": 0.25390625, "learning_rate": 2.307098765432099e-05, "loss": 0.0258, "step": 8775 }, { "epoch": 5.401414949246385, "grad_norm": 0.26953125, "learning_rate": 2.3055555555555558e-05, "loss": 0.0289, "step": 8780 }, { "epoch": 5.404490925868964, "grad_norm": 0.283203125, "learning_rate": 2.3040123456790126e-05, "loss": 0.0263, "step": 8785 }, { "epoch": 5.407566902491541, "grad_norm": 0.267578125, "learning_rate": 2.302469135802469e-05, "loss": 0.0266, "step": 8790 }, { "epoch": 5.410642879114119, "grad_norm": 0.27734375, "learning_rate": 2.300925925925926e-05, "loss": 0.0299, "step": 8795 }, { "epoch": 5.413718855736697, "grad_norm": 0.275390625, "learning_rate": 2.2993827160493828e-05, "loss": 0.0263, "step": 8800 }, { "epoch": 5.416794832359274, "grad_norm": 0.27734375, "learning_rate": 2.2978395061728397e-05, "loss": 0.0272, "step": 8805 }, { "epoch": 5.419870808981852, "grad_norm": 0.287109375, "learning_rate": 2.2962962962962965e-05, "loss": 0.0295, "step": 8810 }, { "epoch": 5.422946785604429, "grad_norm": 0.259765625, "learning_rate": 2.294753086419753e-05, "loss": 0.0297, "step": 8815 }, { "epoch": 5.426022762227007, "grad_norm": 0.275390625, "learning_rate": 2.29320987654321e-05, "loss": 0.0272, "step": 8820 }, { "epoch": 5.429098738849585, "grad_norm": 0.306640625, "learning_rate": 2.2916666666666667e-05, "loss": 0.0287, "step": 8825 }, { "epoch": 5.432174715472162, "grad_norm": 0.294921875, "learning_rate": 2.2901234567901235e-05, "loss": 0.028, "step": 8830 }, { "epoch": 5.4352506920947405, "grad_norm": 0.26953125, "learning_rate": 2.2885802469135804e-05, "loss": 0.0281, "step": 8835 }, { "epoch": 5.438326668717318, "grad_norm": 0.255859375, "learning_rate": 2.2870370370370372e-05, "loss": 0.0274, "step": 8840 }, { "epoch": 5.441402645339895, "grad_norm": 0.2470703125, "learning_rate": 2.285493827160494e-05, "loss": 0.0252, "step": 8845 }, { "epoch": 5.444478621962473, "grad_norm": 0.310546875, "learning_rate": 2.2839506172839506e-05, "loss": 0.0305, "step": 8850 }, { "epoch": 5.447554598585051, "grad_norm": 0.27734375, "learning_rate": 2.2824074074074074e-05, "loss": 0.0286, "step": 8855 }, { "epoch": 5.4506305752076285, "grad_norm": 0.302734375, "learning_rate": 2.2808641975308643e-05, "loss": 0.0285, "step": 8860 }, { "epoch": 5.453706551830206, "grad_norm": 0.248046875, "learning_rate": 2.279320987654321e-05, "loss": 0.026, "step": 8865 }, { "epoch": 5.456782528452784, "grad_norm": 0.29296875, "learning_rate": 2.277777777777778e-05, "loss": 0.0273, "step": 8870 }, { "epoch": 5.459858505075362, "grad_norm": 0.28125, "learning_rate": 2.2762345679012348e-05, "loss": 0.0288, "step": 8875 }, { "epoch": 5.462934481697939, "grad_norm": 0.26171875, "learning_rate": 2.2746913580246913e-05, "loss": 0.0264, "step": 8880 }, { "epoch": 5.4660104583205165, "grad_norm": 0.302734375, "learning_rate": 2.273148148148148e-05, "loss": 0.0275, "step": 8885 }, { "epoch": 5.469086434943095, "grad_norm": 0.287109375, "learning_rate": 2.271604938271605e-05, "loss": 0.0284, "step": 8890 }, { "epoch": 5.472162411565672, "grad_norm": 0.28515625, "learning_rate": 2.270061728395062e-05, "loss": 0.0266, "step": 8895 }, { "epoch": 5.47523838818825, "grad_norm": 0.216796875, "learning_rate": 2.2685185185185187e-05, "loss": 0.0274, "step": 8900 }, { "epoch": 5.478314364810828, "grad_norm": 0.271484375, "learning_rate": 2.2669753086419755e-05, "loss": 0.0264, "step": 8905 }, { "epoch": 5.481390341433405, "grad_norm": 0.263671875, "learning_rate": 2.2654320987654324e-05, "loss": 0.0269, "step": 8910 }, { "epoch": 5.484466318055983, "grad_norm": 0.26171875, "learning_rate": 2.263888888888889e-05, "loss": 0.0263, "step": 8915 }, { "epoch": 5.48754229467856, "grad_norm": 0.294921875, "learning_rate": 2.2623456790123457e-05, "loss": 0.0293, "step": 8920 }, { "epoch": 5.490618271301138, "grad_norm": 0.31640625, "learning_rate": 2.2608024691358026e-05, "loss": 0.0299, "step": 8925 }, { "epoch": 5.493694247923716, "grad_norm": 0.2890625, "learning_rate": 2.2592592592592594e-05, "loss": 0.0287, "step": 8930 }, { "epoch": 5.496770224546293, "grad_norm": 0.3515625, "learning_rate": 2.2577160493827163e-05, "loss": 0.0309, "step": 8935 }, { "epoch": 5.499846201168872, "grad_norm": 0.318359375, "learning_rate": 2.2561728395061728e-05, "loss": 0.0285, "step": 8940 }, { "epoch": 5.502922177791449, "grad_norm": 0.291015625, "learning_rate": 2.25462962962963e-05, "loss": 0.0282, "step": 8945 }, { "epoch": 5.505998154414026, "grad_norm": 0.2890625, "learning_rate": 2.2530864197530865e-05, "loss": 0.0276, "step": 8950 }, { "epoch": 5.509074131036604, "grad_norm": 0.298828125, "learning_rate": 2.2515432098765433e-05, "loss": 0.0311, "step": 8955 }, { "epoch": 5.512150107659182, "grad_norm": 0.265625, "learning_rate": 2.25e-05, "loss": 0.0269, "step": 8960 }, { "epoch": 5.5152260842817595, "grad_norm": 0.287109375, "learning_rate": 2.248456790123457e-05, "loss": 0.0285, "step": 8965 }, { "epoch": 5.518302060904337, "grad_norm": 0.2490234375, "learning_rate": 2.246913580246914e-05, "loss": 0.0273, "step": 8970 }, { "epoch": 5.521378037526915, "grad_norm": 0.3125, "learning_rate": 2.2453703703703703e-05, "loss": 0.0278, "step": 8975 }, { "epoch": 5.524454014149493, "grad_norm": 0.2734375, "learning_rate": 2.2438271604938272e-05, "loss": 0.0291, "step": 8980 }, { "epoch": 5.52752999077207, "grad_norm": 0.251953125, "learning_rate": 2.242283950617284e-05, "loss": 0.0285, "step": 8985 }, { "epoch": 5.5306059673946475, "grad_norm": 0.271484375, "learning_rate": 2.240740740740741e-05, "loss": 0.0281, "step": 8990 }, { "epoch": 5.533681944017226, "grad_norm": 0.314453125, "learning_rate": 2.2391975308641977e-05, "loss": 0.0301, "step": 8995 }, { "epoch": 5.536757920639803, "grad_norm": 0.2734375, "learning_rate": 2.2376543209876542e-05, "loss": 0.0271, "step": 9000 }, { "epoch": 5.539833897262381, "grad_norm": 0.30078125, "learning_rate": 2.2361111111111114e-05, "loss": 0.0316, "step": 9005 }, { "epoch": 5.542909873884959, "grad_norm": 0.3125, "learning_rate": 2.234567901234568e-05, "loss": 0.0256, "step": 9010 }, { "epoch": 5.545985850507536, "grad_norm": 0.27734375, "learning_rate": 2.2330246913580248e-05, "loss": 0.0286, "step": 9015 }, { "epoch": 5.549061827130114, "grad_norm": 0.298828125, "learning_rate": 2.2314814814814816e-05, "loss": 0.0287, "step": 9020 }, { "epoch": 5.552137803752691, "grad_norm": 0.3125, "learning_rate": 2.2299382716049384e-05, "loss": 0.028, "step": 9025 }, { "epoch": 5.5552137803752695, "grad_norm": 0.310546875, "learning_rate": 2.2283950617283953e-05, "loss": 0.0297, "step": 9030 }, { "epoch": 5.558289756997847, "grad_norm": 0.271484375, "learning_rate": 2.2268518518518518e-05, "loss": 0.0287, "step": 9035 }, { "epoch": 5.561365733620424, "grad_norm": 0.341796875, "learning_rate": 2.225308641975309e-05, "loss": 0.0289, "step": 9040 }, { "epoch": 5.564441710243003, "grad_norm": 0.26953125, "learning_rate": 2.2237654320987655e-05, "loss": 0.0274, "step": 9045 }, { "epoch": 5.56751768686558, "grad_norm": 0.30859375, "learning_rate": 2.2222222222222223e-05, "loss": 0.0295, "step": 9050 }, { "epoch": 5.570593663488157, "grad_norm": 0.294921875, "learning_rate": 2.2206790123456792e-05, "loss": 0.0308, "step": 9055 }, { "epoch": 5.573669640110735, "grad_norm": 0.265625, "learning_rate": 2.2191358024691357e-05, "loss": 0.0284, "step": 9060 }, { "epoch": 5.576745616733313, "grad_norm": 0.3125, "learning_rate": 2.217592592592593e-05, "loss": 0.0291, "step": 9065 }, { "epoch": 5.579821593355891, "grad_norm": 0.279296875, "learning_rate": 2.2160493827160494e-05, "loss": 0.0292, "step": 9070 }, { "epoch": 5.582897569978468, "grad_norm": 0.2734375, "learning_rate": 2.2145061728395066e-05, "loss": 0.0253, "step": 9075 }, { "epoch": 5.585973546601046, "grad_norm": 0.283203125, "learning_rate": 2.212962962962963e-05, "loss": 0.0286, "step": 9080 }, { "epoch": 5.589049523223624, "grad_norm": 0.287109375, "learning_rate": 2.21141975308642e-05, "loss": 0.0278, "step": 9085 }, { "epoch": 5.592125499846201, "grad_norm": 0.271484375, "learning_rate": 2.2098765432098767e-05, "loss": 0.0288, "step": 9090 }, { "epoch": 5.5952014764687785, "grad_norm": 0.296875, "learning_rate": 2.2083333333333333e-05, "loss": 0.0287, "step": 9095 }, { "epoch": 5.598277453091357, "grad_norm": 0.291015625, "learning_rate": 2.2067901234567904e-05, "loss": 0.0277, "step": 9100 }, { "epoch": 5.601353429713934, "grad_norm": 0.30078125, "learning_rate": 2.205246913580247e-05, "loss": 0.0297, "step": 9105 }, { "epoch": 5.604429406336512, "grad_norm": 0.27734375, "learning_rate": 2.2037037037037038e-05, "loss": 0.0277, "step": 9110 }, { "epoch": 5.60750538295909, "grad_norm": 0.294921875, "learning_rate": 2.2021604938271606e-05, "loss": 0.0277, "step": 9115 }, { "epoch": 5.610581359581667, "grad_norm": 0.30078125, "learning_rate": 2.200617283950617e-05, "loss": 0.0265, "step": 9120 }, { "epoch": 5.613657336204245, "grad_norm": 0.296875, "learning_rate": 2.1990740740740743e-05, "loss": 0.0293, "step": 9125 }, { "epoch": 5.616733312826822, "grad_norm": 0.248046875, "learning_rate": 2.1975308641975308e-05, "loss": 0.0297, "step": 9130 }, { "epoch": 5.6198092894494005, "grad_norm": 0.310546875, "learning_rate": 2.195987654320988e-05, "loss": 0.0299, "step": 9135 }, { "epoch": 5.622885266071978, "grad_norm": 0.263671875, "learning_rate": 2.1944444444444445e-05, "loss": 0.0285, "step": 9140 }, { "epoch": 5.625961242694555, "grad_norm": 0.2294921875, "learning_rate": 2.1929012345679014e-05, "loss": 0.0258, "step": 9145 }, { "epoch": 5.629037219317134, "grad_norm": 0.27734375, "learning_rate": 2.1913580246913582e-05, "loss": 0.0268, "step": 9150 }, { "epoch": 5.632113195939711, "grad_norm": 0.275390625, "learning_rate": 2.1898148148148147e-05, "loss": 0.0277, "step": 9155 }, { "epoch": 5.6351891725622885, "grad_norm": 0.322265625, "learning_rate": 2.188271604938272e-05, "loss": 0.0308, "step": 9160 }, { "epoch": 5.638265149184866, "grad_norm": 0.265625, "learning_rate": 2.1867283950617284e-05, "loss": 0.0287, "step": 9165 }, { "epoch": 5.641341125807444, "grad_norm": 0.291015625, "learning_rate": 2.1851851851851852e-05, "loss": 0.0279, "step": 9170 }, { "epoch": 5.644417102430022, "grad_norm": 0.3046875, "learning_rate": 2.183641975308642e-05, "loss": 0.0294, "step": 9175 }, { "epoch": 5.647493079052599, "grad_norm": 0.3046875, "learning_rate": 2.1820987654320986e-05, "loss": 0.0317, "step": 9180 }, { "epoch": 5.650569055675177, "grad_norm": 0.314453125, "learning_rate": 2.1805555555555558e-05, "loss": 0.0273, "step": 9185 }, { "epoch": 5.653645032297755, "grad_norm": 0.27734375, "learning_rate": 2.1790123456790123e-05, "loss": 0.0284, "step": 9190 }, { "epoch": 5.656721008920332, "grad_norm": 0.3125, "learning_rate": 2.1774691358024695e-05, "loss": 0.0309, "step": 9195 }, { "epoch": 5.6597969855429096, "grad_norm": 0.29296875, "learning_rate": 2.175925925925926e-05, "loss": 0.03, "step": 9200 }, { "epoch": 5.662872962165488, "grad_norm": 0.30078125, "learning_rate": 2.1743827160493828e-05, "loss": 0.0287, "step": 9205 }, { "epoch": 5.665948938788065, "grad_norm": 0.302734375, "learning_rate": 2.1728395061728397e-05, "loss": 0.0279, "step": 9210 }, { "epoch": 5.669024915410643, "grad_norm": 0.275390625, "learning_rate": 2.171296296296296e-05, "loss": 0.0259, "step": 9215 }, { "epoch": 5.672100892033221, "grad_norm": 0.287109375, "learning_rate": 2.1697530864197534e-05, "loss": 0.028, "step": 9220 }, { "epoch": 5.675176868655798, "grad_norm": 0.255859375, "learning_rate": 2.16820987654321e-05, "loss": 0.028, "step": 9225 }, { "epoch": 5.678252845278376, "grad_norm": 0.255859375, "learning_rate": 2.1666666666666667e-05, "loss": 0.0245, "step": 9230 }, { "epoch": 5.681328821900953, "grad_norm": 0.2578125, "learning_rate": 2.1651234567901235e-05, "loss": 0.0258, "step": 9235 }, { "epoch": 5.6844047985235315, "grad_norm": 0.2734375, "learning_rate": 2.1635802469135804e-05, "loss": 0.0291, "step": 9240 }, { "epoch": 5.687480775146109, "grad_norm": 0.26953125, "learning_rate": 2.1620370370370372e-05, "loss": 0.0312, "step": 9245 }, { "epoch": 5.690556751768686, "grad_norm": 0.283203125, "learning_rate": 2.1604938271604937e-05, "loss": 0.0268, "step": 9250 }, { "epoch": 5.693632728391265, "grad_norm": 0.267578125, "learning_rate": 2.158950617283951e-05, "loss": 0.0269, "step": 9255 }, { "epoch": 5.696708705013842, "grad_norm": 0.28515625, "learning_rate": 2.1574074074074074e-05, "loss": 0.0301, "step": 9260 }, { "epoch": 5.6997846816364195, "grad_norm": 0.27734375, "learning_rate": 2.1558641975308643e-05, "loss": 0.0285, "step": 9265 }, { "epoch": 5.702860658258997, "grad_norm": 0.259765625, "learning_rate": 2.154320987654321e-05, "loss": 0.026, "step": 9270 }, { "epoch": 5.705936634881575, "grad_norm": 0.318359375, "learning_rate": 2.152777777777778e-05, "loss": 0.0304, "step": 9275 }, { "epoch": 5.709012611504153, "grad_norm": 0.271484375, "learning_rate": 2.1512345679012348e-05, "loss": 0.0266, "step": 9280 }, { "epoch": 5.71208858812673, "grad_norm": 0.263671875, "learning_rate": 2.1496913580246913e-05, "loss": 0.0281, "step": 9285 }, { "epoch": 5.715164564749308, "grad_norm": 0.275390625, "learning_rate": 2.148148148148148e-05, "loss": 0.0286, "step": 9290 }, { "epoch": 5.718240541371886, "grad_norm": 0.2578125, "learning_rate": 2.146604938271605e-05, "loss": 0.0287, "step": 9295 }, { "epoch": 5.721316517994463, "grad_norm": 0.27734375, "learning_rate": 2.145061728395062e-05, "loss": 0.0282, "step": 9300 }, { "epoch": 5.724392494617041, "grad_norm": 0.25390625, "learning_rate": 2.1435185185185187e-05, "loss": 0.0296, "step": 9305 }, { "epoch": 5.727468471239619, "grad_norm": 0.265625, "learning_rate": 2.1419753086419755e-05, "loss": 0.0275, "step": 9310 }, { "epoch": 5.730544447862196, "grad_norm": 0.279296875, "learning_rate": 2.140432098765432e-05, "loss": 0.0282, "step": 9315 }, { "epoch": 5.733620424484774, "grad_norm": 0.294921875, "learning_rate": 2.138888888888889e-05, "loss": 0.0289, "step": 9320 }, { "epoch": 5.736696401107352, "grad_norm": 0.2890625, "learning_rate": 2.1373456790123457e-05, "loss": 0.0271, "step": 9325 }, { "epoch": 5.739772377729929, "grad_norm": 0.28515625, "learning_rate": 2.1358024691358026e-05, "loss": 0.0275, "step": 9330 }, { "epoch": 5.742848354352507, "grad_norm": 0.310546875, "learning_rate": 2.1342592592592594e-05, "loss": 0.0314, "step": 9335 }, { "epoch": 5.745924330975084, "grad_norm": 0.24609375, "learning_rate": 2.1327160493827163e-05, "loss": 0.0267, "step": 9340 }, { "epoch": 5.749000307597663, "grad_norm": 0.287109375, "learning_rate": 2.131172839506173e-05, "loss": 0.0279, "step": 9345 }, { "epoch": 5.75207628422024, "grad_norm": 0.2734375, "learning_rate": 2.1296296296296296e-05, "loss": 0.0286, "step": 9350 }, { "epoch": 5.755152260842817, "grad_norm": 0.3203125, "learning_rate": 2.1280864197530865e-05, "loss": 0.028, "step": 9355 }, { "epoch": 5.758228237465396, "grad_norm": 0.279296875, "learning_rate": 2.1265432098765433e-05, "loss": 0.0267, "step": 9360 }, { "epoch": 5.761304214087973, "grad_norm": 0.306640625, "learning_rate": 2.125e-05, "loss": 0.0271, "step": 9365 }, { "epoch": 5.7643801907105505, "grad_norm": 0.275390625, "learning_rate": 2.123456790123457e-05, "loss": 0.0298, "step": 9370 }, { "epoch": 5.767456167333128, "grad_norm": 0.28515625, "learning_rate": 2.1219135802469135e-05, "loss": 0.0267, "step": 9375 }, { "epoch": 5.770532143955706, "grad_norm": 0.3203125, "learning_rate": 2.1203703703703703e-05, "loss": 0.0306, "step": 9380 }, { "epoch": 5.773608120578284, "grad_norm": 0.275390625, "learning_rate": 2.1188271604938272e-05, "loss": 0.0269, "step": 9385 }, { "epoch": 5.776684097200861, "grad_norm": 0.25, "learning_rate": 2.117283950617284e-05, "loss": 0.0309, "step": 9390 }, { "epoch": 5.779760073823439, "grad_norm": 0.265625, "learning_rate": 2.115740740740741e-05, "loss": 0.0268, "step": 9395 }, { "epoch": 5.782836050446017, "grad_norm": 0.26171875, "learning_rate": 2.1141975308641977e-05, "loss": 0.0302, "step": 9400 }, { "epoch": 5.785912027068594, "grad_norm": 0.31640625, "learning_rate": 2.1126543209876546e-05, "loss": 0.0284, "step": 9405 }, { "epoch": 5.788988003691172, "grad_norm": 0.2734375, "learning_rate": 2.111111111111111e-05, "loss": 0.0276, "step": 9410 }, { "epoch": 5.79206398031375, "grad_norm": 0.28515625, "learning_rate": 2.109567901234568e-05, "loss": 0.0291, "step": 9415 }, { "epoch": 5.795139956936327, "grad_norm": 0.306640625, "learning_rate": 2.1080246913580248e-05, "loss": 0.0284, "step": 9420 }, { "epoch": 5.798215933558905, "grad_norm": 0.255859375, "learning_rate": 2.1064814814814816e-05, "loss": 0.0254, "step": 9425 }, { "epoch": 5.801291910181483, "grad_norm": 0.2490234375, "learning_rate": 2.1049382716049385e-05, "loss": 0.0261, "step": 9430 }, { "epoch": 5.8043678868040605, "grad_norm": 0.287109375, "learning_rate": 2.103395061728395e-05, "loss": 0.0307, "step": 9435 }, { "epoch": 5.807443863426638, "grad_norm": 0.4921875, "learning_rate": 2.101851851851852e-05, "loss": 0.0286, "step": 9440 }, { "epoch": 5.810519840049215, "grad_norm": 0.275390625, "learning_rate": 2.1003086419753087e-05, "loss": 0.0307, "step": 9445 }, { "epoch": 5.813595816671794, "grad_norm": 0.37109375, "learning_rate": 2.0987654320987655e-05, "loss": 0.0303, "step": 9450 }, { "epoch": 5.816671793294371, "grad_norm": 0.34765625, "learning_rate": 2.0972222222222223e-05, "loss": 0.0269, "step": 9455 }, { "epoch": 5.819747769916948, "grad_norm": 0.28125, "learning_rate": 2.0956790123456792e-05, "loss": 0.0245, "step": 9460 }, { "epoch": 5.822823746539527, "grad_norm": 0.27734375, "learning_rate": 2.094135802469136e-05, "loss": 0.0295, "step": 9465 }, { "epoch": 5.825899723162104, "grad_norm": 0.271484375, "learning_rate": 2.0925925925925925e-05, "loss": 0.0291, "step": 9470 }, { "epoch": 5.8289756997846816, "grad_norm": 0.2470703125, "learning_rate": 2.0910493827160497e-05, "loss": 0.0275, "step": 9475 }, { "epoch": 5.832051676407259, "grad_norm": 0.2890625, "learning_rate": 2.0895061728395062e-05, "loss": 0.0285, "step": 9480 }, { "epoch": 5.835127653029837, "grad_norm": 0.294921875, "learning_rate": 2.087962962962963e-05, "loss": 0.0289, "step": 9485 }, { "epoch": 5.838203629652415, "grad_norm": 0.2294921875, "learning_rate": 2.08641975308642e-05, "loss": 0.0266, "step": 9490 }, { "epoch": 5.841279606274992, "grad_norm": 0.275390625, "learning_rate": 2.0848765432098764e-05, "loss": 0.0295, "step": 9495 }, { "epoch": 5.84435558289757, "grad_norm": 0.3125, "learning_rate": 2.0833333333333336e-05, "loss": 0.0317, "step": 9500 }, { "epoch": 5.847431559520148, "grad_norm": 0.26953125, "learning_rate": 2.08179012345679e-05, "loss": 0.0297, "step": 9505 }, { "epoch": 5.850507536142725, "grad_norm": 0.3359375, "learning_rate": 2.0802469135802473e-05, "loss": 0.0289, "step": 9510 }, { "epoch": 5.853583512765303, "grad_norm": 0.310546875, "learning_rate": 2.0787037037037038e-05, "loss": 0.0278, "step": 9515 }, { "epoch": 5.856659489387881, "grad_norm": 0.251953125, "learning_rate": 2.0771604938271606e-05, "loss": 0.0272, "step": 9520 }, { "epoch": 5.859735466010458, "grad_norm": 0.30078125, "learning_rate": 2.0756172839506175e-05, "loss": 0.0276, "step": 9525 }, { "epoch": 5.862811442633036, "grad_norm": 0.275390625, "learning_rate": 2.074074074074074e-05, "loss": 0.0308, "step": 9530 }, { "epoch": 5.865887419255614, "grad_norm": 0.302734375, "learning_rate": 2.0725308641975312e-05, "loss": 0.03, "step": 9535 }, { "epoch": 5.8689633958781915, "grad_norm": 0.29296875, "learning_rate": 2.0709876543209877e-05, "loss": 0.0288, "step": 9540 }, { "epoch": 5.872039372500769, "grad_norm": 0.2734375, "learning_rate": 2.0694444444444445e-05, "loss": 0.0279, "step": 9545 }, { "epoch": 5.875115349123346, "grad_norm": 0.265625, "learning_rate": 2.0679012345679014e-05, "loss": 0.0291, "step": 9550 }, { "epoch": 5.878191325745925, "grad_norm": 0.279296875, "learning_rate": 2.066358024691358e-05, "loss": 0.0307, "step": 9555 }, { "epoch": 5.881267302368502, "grad_norm": 0.291015625, "learning_rate": 2.064814814814815e-05, "loss": 0.0306, "step": 9560 }, { "epoch": 5.8843432789910795, "grad_norm": 0.248046875, "learning_rate": 2.0632716049382716e-05, "loss": 0.0294, "step": 9565 }, { "epoch": 5.887419255613658, "grad_norm": 0.32421875, "learning_rate": 2.0617283950617287e-05, "loss": 0.0324, "step": 9570 }, { "epoch": 5.890495232236235, "grad_norm": 0.33203125, "learning_rate": 2.0601851851851853e-05, "loss": 0.0338, "step": 9575 }, { "epoch": 5.893571208858813, "grad_norm": 0.294921875, "learning_rate": 2.058641975308642e-05, "loss": 0.0277, "step": 9580 }, { "epoch": 5.89664718548139, "grad_norm": 0.26171875, "learning_rate": 2.057098765432099e-05, "loss": 0.0274, "step": 9585 }, { "epoch": 5.899723162103968, "grad_norm": 0.2353515625, "learning_rate": 2.0555555555555555e-05, "loss": 0.0273, "step": 9590 }, { "epoch": 5.902799138726546, "grad_norm": 0.30859375, "learning_rate": 2.0540123456790126e-05, "loss": 0.0257, "step": 9595 }, { "epoch": 5.905875115349123, "grad_norm": 0.27734375, "learning_rate": 2.052469135802469e-05, "loss": 0.0289, "step": 9600 }, { "epoch": 5.908951091971701, "grad_norm": 0.29296875, "learning_rate": 2.050925925925926e-05, "loss": 0.0291, "step": 9605 }, { "epoch": 5.912027068594279, "grad_norm": 0.296875, "learning_rate": 2.0493827160493828e-05, "loss": 0.029, "step": 9610 }, { "epoch": 5.915103045216856, "grad_norm": 0.279296875, "learning_rate": 2.0478395061728393e-05, "loss": 0.0295, "step": 9615 }, { "epoch": 5.918179021839434, "grad_norm": 0.265625, "learning_rate": 2.0462962962962965e-05, "loss": 0.0266, "step": 9620 }, { "epoch": 5.921254998462012, "grad_norm": 0.25390625, "learning_rate": 2.044753086419753e-05, "loss": 0.0286, "step": 9625 }, { "epoch": 5.924330975084589, "grad_norm": 0.25390625, "learning_rate": 2.0432098765432102e-05, "loss": 0.029, "step": 9630 }, { "epoch": 5.927406951707167, "grad_norm": 0.302734375, "learning_rate": 2.0416666666666667e-05, "loss": 0.0288, "step": 9635 }, { "epoch": 5.930482928329745, "grad_norm": 0.26171875, "learning_rate": 2.0401234567901236e-05, "loss": 0.0274, "step": 9640 }, { "epoch": 5.9335589049523225, "grad_norm": 0.296875, "learning_rate": 2.0385802469135804e-05, "loss": 0.0295, "step": 9645 }, { "epoch": 5.9366348815749, "grad_norm": 0.287109375, "learning_rate": 2.037037037037037e-05, "loss": 0.0279, "step": 9650 }, { "epoch": 5.939710858197477, "grad_norm": 0.283203125, "learning_rate": 2.035493827160494e-05, "loss": 0.028, "step": 9655 }, { "epoch": 5.942786834820056, "grad_norm": 0.2333984375, "learning_rate": 2.0339506172839506e-05, "loss": 0.0269, "step": 9660 }, { "epoch": 5.945862811442633, "grad_norm": 0.287109375, "learning_rate": 2.0324074074074074e-05, "loss": 0.0273, "step": 9665 }, { "epoch": 5.9489387880652105, "grad_norm": 0.259765625, "learning_rate": 2.0308641975308643e-05, "loss": 0.0258, "step": 9670 }, { "epoch": 5.952014764687789, "grad_norm": 0.26953125, "learning_rate": 2.029320987654321e-05, "loss": 0.0283, "step": 9675 }, { "epoch": 5.955090741310366, "grad_norm": 0.2578125, "learning_rate": 2.027777777777778e-05, "loss": 0.027, "step": 9680 }, { "epoch": 5.958166717932944, "grad_norm": 0.330078125, "learning_rate": 2.0262345679012345e-05, "loss": 0.0293, "step": 9685 }, { "epoch": 5.961242694555521, "grad_norm": 0.2421875, "learning_rate": 2.0246913580246917e-05, "loss": 0.0251, "step": 9690 }, { "epoch": 5.964318671178099, "grad_norm": 0.306640625, "learning_rate": 2.0231481481481482e-05, "loss": 0.0286, "step": 9695 }, { "epoch": 5.967394647800677, "grad_norm": 0.365234375, "learning_rate": 2.021604938271605e-05, "loss": 0.0307, "step": 9700 }, { "epoch": 5.970470624423254, "grad_norm": 0.2578125, "learning_rate": 2.020061728395062e-05, "loss": 0.03, "step": 9705 }, { "epoch": 5.9735466010458325, "grad_norm": 0.236328125, "learning_rate": 2.0185185185185187e-05, "loss": 0.0259, "step": 9710 }, { "epoch": 5.97662257766841, "grad_norm": 0.263671875, "learning_rate": 2.0169753086419756e-05, "loss": 0.0302, "step": 9715 }, { "epoch": 5.979698554290987, "grad_norm": 0.26171875, "learning_rate": 2.015432098765432e-05, "loss": 0.0268, "step": 9720 }, { "epoch": 5.982774530913565, "grad_norm": 0.291015625, "learning_rate": 2.013888888888889e-05, "loss": 0.0292, "step": 9725 }, { "epoch": 5.985850507536143, "grad_norm": 0.2890625, "learning_rate": 2.0123456790123457e-05, "loss": 0.0291, "step": 9730 }, { "epoch": 5.98892648415872, "grad_norm": 0.263671875, "learning_rate": 2.0108024691358026e-05, "loss": 0.0275, "step": 9735 }, { "epoch": 5.992002460781298, "grad_norm": 0.275390625, "learning_rate": 2.0092592592592594e-05, "loss": 0.0266, "step": 9740 }, { "epoch": 5.995078437403876, "grad_norm": 0.2490234375, "learning_rate": 2.007716049382716e-05, "loss": 0.0271, "step": 9745 }, { "epoch": 5.9981544140264536, "grad_norm": 0.251953125, "learning_rate": 2.006172839506173e-05, "loss": 0.0269, "step": 9750 }, { "epoch": 6.001230390649031, "grad_norm": 0.26171875, "learning_rate": 2.0046296296296296e-05, "loss": 0.0258, "step": 9755 }, { "epoch": 6.004306367271608, "grad_norm": 0.248046875, "learning_rate": 2.0030864197530865e-05, "loss": 0.0247, "step": 9760 }, { "epoch": 6.007382343894187, "grad_norm": 0.263671875, "learning_rate": 2.0015432098765433e-05, "loss": 0.0252, "step": 9765 }, { "epoch": 6.010458320516764, "grad_norm": 0.345703125, "learning_rate": 2e-05, "loss": 0.0264, "step": 9770 }, { "epoch": 6.0135342971393415, "grad_norm": 0.259765625, "learning_rate": 1.998456790123457e-05, "loss": 0.0255, "step": 9775 }, { "epoch": 6.01661027376192, "grad_norm": 0.3046875, "learning_rate": 1.9969135802469135e-05, "loss": 0.0242, "step": 9780 }, { "epoch": 6.019686250384497, "grad_norm": 0.2451171875, "learning_rate": 1.9953703703703704e-05, "loss": 0.0237, "step": 9785 }, { "epoch": 6.022762227007075, "grad_norm": 0.24609375, "learning_rate": 1.9938271604938272e-05, "loss": 0.0233, "step": 9790 }, { "epoch": 6.025838203629652, "grad_norm": 0.28125, "learning_rate": 1.992283950617284e-05, "loss": 0.0225, "step": 9795 }, { "epoch": 6.02891418025223, "grad_norm": 0.259765625, "learning_rate": 1.990740740740741e-05, "loss": 0.0239, "step": 9800 }, { "epoch": 6.031990156874808, "grad_norm": 0.283203125, "learning_rate": 1.9891975308641977e-05, "loss": 0.0257, "step": 9805 }, { "epoch": 6.035066133497385, "grad_norm": 0.294921875, "learning_rate": 1.9876543209876546e-05, "loss": 0.0266, "step": 9810 }, { "epoch": 6.0381421101199635, "grad_norm": 0.251953125, "learning_rate": 1.986111111111111e-05, "loss": 0.0263, "step": 9815 }, { "epoch": 6.041218086742541, "grad_norm": 0.2734375, "learning_rate": 1.984567901234568e-05, "loss": 0.0256, "step": 9820 }, { "epoch": 6.044294063365118, "grad_norm": 0.23046875, "learning_rate": 1.9830246913580248e-05, "loss": 0.0228, "step": 9825 }, { "epoch": 6.047370039987696, "grad_norm": 0.234375, "learning_rate": 1.9814814814814816e-05, "loss": 0.0235, "step": 9830 }, { "epoch": 6.050446016610274, "grad_norm": 0.2314453125, "learning_rate": 1.9799382716049385e-05, "loss": 0.0261, "step": 9835 }, { "epoch": 6.0535219932328515, "grad_norm": 0.27734375, "learning_rate": 1.9783950617283953e-05, "loss": 0.0244, "step": 9840 }, { "epoch": 6.056597969855429, "grad_norm": 0.287109375, "learning_rate": 1.9768518518518518e-05, "loss": 0.0255, "step": 9845 }, { "epoch": 6.059673946478007, "grad_norm": 0.2890625, "learning_rate": 1.9753086419753087e-05, "loss": 0.0249, "step": 9850 }, { "epoch": 6.062749923100585, "grad_norm": 0.275390625, "learning_rate": 1.9737654320987655e-05, "loss": 0.0259, "step": 9855 }, { "epoch": 6.065825899723162, "grad_norm": 0.302734375, "learning_rate": 1.9722222222222224e-05, "loss": 0.0238, "step": 9860 }, { "epoch": 6.068901876345739, "grad_norm": 0.265625, "learning_rate": 1.9706790123456792e-05, "loss": 0.0255, "step": 9865 }, { "epoch": 6.071977852968318, "grad_norm": 0.271484375, "learning_rate": 1.969135802469136e-05, "loss": 0.0242, "step": 9870 }, { "epoch": 6.075053829590895, "grad_norm": 0.29296875, "learning_rate": 1.967592592592593e-05, "loss": 0.0253, "step": 9875 }, { "epoch": 6.0781298062134725, "grad_norm": 0.267578125, "learning_rate": 1.9660493827160494e-05, "loss": 0.0248, "step": 9880 }, { "epoch": 6.081205782836051, "grad_norm": 0.25390625, "learning_rate": 1.9645061728395062e-05, "loss": 0.025, "step": 9885 }, { "epoch": 6.084281759458628, "grad_norm": 0.2578125, "learning_rate": 1.962962962962963e-05, "loss": 0.026, "step": 9890 }, { "epoch": 6.087357736081206, "grad_norm": 0.2470703125, "learning_rate": 1.96141975308642e-05, "loss": 0.0243, "step": 9895 }, { "epoch": 6.090433712703783, "grad_norm": 0.271484375, "learning_rate": 1.9598765432098768e-05, "loss": 0.0246, "step": 9900 }, { "epoch": 6.093509689326361, "grad_norm": 0.291015625, "learning_rate": 1.9583333333333333e-05, "loss": 0.0272, "step": 9905 }, { "epoch": 6.096585665948939, "grad_norm": 0.259765625, "learning_rate": 1.95679012345679e-05, "loss": 0.0232, "step": 9910 }, { "epoch": 6.099661642571516, "grad_norm": 0.2890625, "learning_rate": 1.955246913580247e-05, "loss": 0.0271, "step": 9915 }, { "epoch": 6.1027376191940945, "grad_norm": 0.234375, "learning_rate": 1.9537037037037038e-05, "loss": 0.0211, "step": 9920 }, { "epoch": 6.105813595816672, "grad_norm": 0.25390625, "learning_rate": 1.9521604938271607e-05, "loss": 0.0223, "step": 9925 }, { "epoch": 6.108889572439249, "grad_norm": 0.251953125, "learning_rate": 1.950617283950617e-05, "loss": 0.0245, "step": 9930 }, { "epoch": 6.111965549061827, "grad_norm": 0.296875, "learning_rate": 1.9490740740740743e-05, "loss": 0.0264, "step": 9935 }, { "epoch": 6.115041525684405, "grad_norm": 0.267578125, "learning_rate": 1.947530864197531e-05, "loss": 0.0248, "step": 9940 }, { "epoch": 6.1181175023069825, "grad_norm": 0.263671875, "learning_rate": 1.9459876543209877e-05, "loss": 0.0261, "step": 9945 }, { "epoch": 6.12119347892956, "grad_norm": 0.287109375, "learning_rate": 1.9444444444444445e-05, "loss": 0.0256, "step": 9950 }, { "epoch": 6.124269455552138, "grad_norm": 0.390625, "learning_rate": 1.9429012345679014e-05, "loss": 0.0249, "step": 9955 }, { "epoch": 6.127345432174716, "grad_norm": 0.2392578125, "learning_rate": 1.9413580246913582e-05, "loss": 0.0253, "step": 9960 }, { "epoch": 6.130421408797293, "grad_norm": 0.302734375, "learning_rate": 1.9398148148148147e-05, "loss": 0.0256, "step": 9965 }, { "epoch": 6.13349738541987, "grad_norm": 0.2470703125, "learning_rate": 1.938271604938272e-05, "loss": 0.022, "step": 9970 }, { "epoch": 6.136573362042449, "grad_norm": 0.248046875, "learning_rate": 1.9367283950617284e-05, "loss": 0.0243, "step": 9975 }, { "epoch": 6.139649338665026, "grad_norm": 0.2890625, "learning_rate": 1.9351851851851853e-05, "loss": 0.0259, "step": 9980 }, { "epoch": 6.142725315287604, "grad_norm": 0.25, "learning_rate": 1.933641975308642e-05, "loss": 0.0241, "step": 9985 }, { "epoch": 6.145801291910182, "grad_norm": 0.2451171875, "learning_rate": 1.9320987654320986e-05, "loss": 0.0248, "step": 9990 }, { "epoch": 6.148877268532759, "grad_norm": 0.3828125, "learning_rate": 1.9305555555555558e-05, "loss": 0.0279, "step": 9995 }, { "epoch": 6.151953245155337, "grad_norm": 0.2734375, "learning_rate": 1.9290123456790123e-05, "loss": 0.0248, "step": 10000 }, { "epoch": 6.155029221777914, "grad_norm": 0.263671875, "learning_rate": 1.9274691358024695e-05, "loss": 0.0251, "step": 10005 }, { "epoch": 6.158105198400492, "grad_norm": 0.2734375, "learning_rate": 1.925925925925926e-05, "loss": 0.0238, "step": 10010 }, { "epoch": 6.16118117502307, "grad_norm": 0.3046875, "learning_rate": 1.924382716049383e-05, "loss": 0.0271, "step": 10015 }, { "epoch": 6.164257151645647, "grad_norm": 0.22265625, "learning_rate": 1.9228395061728397e-05, "loss": 0.0251, "step": 10020 }, { "epoch": 6.1673331282682256, "grad_norm": 0.306640625, "learning_rate": 1.9212962962962962e-05, "loss": 0.0286, "step": 10025 }, { "epoch": 6.170409104890803, "grad_norm": 0.2734375, "learning_rate": 1.9197530864197534e-05, "loss": 0.0251, "step": 10030 }, { "epoch": 6.17348508151338, "grad_norm": 0.283203125, "learning_rate": 1.91820987654321e-05, "loss": 0.024, "step": 10035 }, { "epoch": 6.176561058135958, "grad_norm": 0.318359375, "learning_rate": 1.9166666666666667e-05, "loss": 0.0248, "step": 10040 }, { "epoch": 6.179637034758536, "grad_norm": 0.25, "learning_rate": 1.9151234567901236e-05, "loss": 0.0249, "step": 10045 }, { "epoch": 6.1827130113811135, "grad_norm": 0.3125, "learning_rate": 1.91358024691358e-05, "loss": 0.0257, "step": 10050 }, { "epoch": 6.185788988003691, "grad_norm": 0.28515625, "learning_rate": 1.9120370370370373e-05, "loss": 0.0257, "step": 10055 }, { "epoch": 6.188864964626269, "grad_norm": 0.267578125, "learning_rate": 1.9104938271604938e-05, "loss": 0.0224, "step": 10060 }, { "epoch": 6.191940941248847, "grad_norm": 0.271484375, "learning_rate": 1.908950617283951e-05, "loss": 0.0215, "step": 10065 }, { "epoch": 6.195016917871424, "grad_norm": 0.291015625, "learning_rate": 1.9074074074074075e-05, "loss": 0.0263, "step": 10070 }, { "epoch": 6.1980928944940015, "grad_norm": 0.255859375, "learning_rate": 1.9058641975308643e-05, "loss": 0.0259, "step": 10075 }, { "epoch": 6.20116887111658, "grad_norm": 0.240234375, "learning_rate": 1.904320987654321e-05, "loss": 0.0257, "step": 10080 }, { "epoch": 6.204244847739157, "grad_norm": 0.283203125, "learning_rate": 1.9027777777777776e-05, "loss": 0.0247, "step": 10085 }, { "epoch": 6.207320824361735, "grad_norm": 0.3203125, "learning_rate": 1.901234567901235e-05, "loss": 0.0295, "step": 10090 }, { "epoch": 6.210396800984313, "grad_norm": 0.2490234375, "learning_rate": 1.8996913580246913e-05, "loss": 0.0268, "step": 10095 }, { "epoch": 6.21347277760689, "grad_norm": 0.298828125, "learning_rate": 1.8981481481481482e-05, "loss": 0.0282, "step": 10100 }, { "epoch": 6.216548754229468, "grad_norm": 0.271484375, "learning_rate": 1.896604938271605e-05, "loss": 0.0253, "step": 10105 }, { "epoch": 6.219624730852045, "grad_norm": 0.251953125, "learning_rate": 1.8950617283950615e-05, "loss": 0.0275, "step": 10110 }, { "epoch": 6.2227007074746234, "grad_norm": 0.248046875, "learning_rate": 1.8935185185185187e-05, "loss": 0.0232, "step": 10115 }, { "epoch": 6.225776684097201, "grad_norm": 0.310546875, "learning_rate": 1.8919753086419752e-05, "loss": 0.0261, "step": 10120 }, { "epoch": 6.228852660719778, "grad_norm": 0.248046875, "learning_rate": 1.8904320987654324e-05, "loss": 0.0256, "step": 10125 }, { "epoch": 6.231928637342357, "grad_norm": 0.279296875, "learning_rate": 1.888888888888889e-05, "loss": 0.0241, "step": 10130 }, { "epoch": 6.235004613964934, "grad_norm": 0.291015625, "learning_rate": 1.8873456790123458e-05, "loss": 0.024, "step": 10135 }, { "epoch": 6.238080590587511, "grad_norm": 0.30078125, "learning_rate": 1.8858024691358026e-05, "loss": 0.025, "step": 10140 }, { "epoch": 6.241156567210089, "grad_norm": 0.2578125, "learning_rate": 1.884259259259259e-05, "loss": 0.025, "step": 10145 }, { "epoch": 6.244232543832667, "grad_norm": 0.2373046875, "learning_rate": 1.8827160493827163e-05, "loss": 0.0239, "step": 10150 }, { "epoch": 6.2473085204552445, "grad_norm": 0.310546875, "learning_rate": 1.8811728395061728e-05, "loss": 0.0257, "step": 10155 }, { "epoch": 6.250384497077822, "grad_norm": 0.306640625, "learning_rate": 1.8796296296296296e-05, "loss": 0.0288, "step": 10160 }, { "epoch": 6.2534604737004, "grad_norm": 0.2578125, "learning_rate": 1.8780864197530865e-05, "loss": 0.0247, "step": 10165 }, { "epoch": 6.256536450322978, "grad_norm": 0.330078125, "learning_rate": 1.8765432098765433e-05, "loss": 0.0274, "step": 10170 }, { "epoch": 6.259612426945555, "grad_norm": 0.2490234375, "learning_rate": 1.8750000000000002e-05, "loss": 0.0251, "step": 10175 }, { "epoch": 6.2626884035681325, "grad_norm": 0.2431640625, "learning_rate": 1.8734567901234567e-05, "loss": 0.0261, "step": 10180 }, { "epoch": 6.265764380190711, "grad_norm": 0.28515625, "learning_rate": 1.871913580246914e-05, "loss": 0.026, "step": 10185 }, { "epoch": 6.268840356813288, "grad_norm": 0.2734375, "learning_rate": 1.8703703703703704e-05, "loss": 0.0242, "step": 10190 }, { "epoch": 6.271916333435866, "grad_norm": 0.255859375, "learning_rate": 1.8688271604938272e-05, "loss": 0.0252, "step": 10195 }, { "epoch": 6.274992310058444, "grad_norm": 0.3203125, "learning_rate": 1.867283950617284e-05, "loss": 0.0257, "step": 10200 }, { "epoch": 6.278068286681021, "grad_norm": 0.2890625, "learning_rate": 1.865740740740741e-05, "loss": 0.0247, "step": 10205 }, { "epoch": 6.281144263303599, "grad_norm": 0.2578125, "learning_rate": 1.8641975308641977e-05, "loss": 0.0235, "step": 10210 }, { "epoch": 6.284220239926176, "grad_norm": 0.28125, "learning_rate": 1.8626543209876543e-05, "loss": 0.0244, "step": 10215 }, { "epoch": 6.2872962165487545, "grad_norm": 0.2255859375, "learning_rate": 1.861111111111111e-05, "loss": 0.0239, "step": 10220 }, { "epoch": 6.290372193171332, "grad_norm": 0.341796875, "learning_rate": 1.859567901234568e-05, "loss": 0.025, "step": 10225 }, { "epoch": 6.293448169793909, "grad_norm": 0.310546875, "learning_rate": 1.8580246913580248e-05, "loss": 0.0253, "step": 10230 }, { "epoch": 6.296524146416488, "grad_norm": 0.3359375, "learning_rate": 1.8564814814814816e-05, "loss": 0.0264, "step": 10235 }, { "epoch": 6.299600123039065, "grad_norm": 0.298828125, "learning_rate": 1.8549382716049385e-05, "loss": 0.025, "step": 10240 }, { "epoch": 6.302676099661642, "grad_norm": 0.255859375, "learning_rate": 1.8533950617283953e-05, "loss": 0.0267, "step": 10245 }, { "epoch": 6.30575207628422, "grad_norm": 0.26171875, "learning_rate": 1.8518518518518518e-05, "loss": 0.0248, "step": 10250 }, { "epoch": 6.308828052906798, "grad_norm": 0.2890625, "learning_rate": 1.8503086419753087e-05, "loss": 0.028, "step": 10255 }, { "epoch": 6.311904029529376, "grad_norm": 0.298828125, "learning_rate": 1.8487654320987655e-05, "loss": 0.0236, "step": 10260 }, { "epoch": 6.314980006151953, "grad_norm": 0.265625, "learning_rate": 1.8472222222222224e-05, "loss": 0.0256, "step": 10265 }, { "epoch": 6.318055982774531, "grad_norm": 0.259765625, "learning_rate": 1.8456790123456792e-05, "loss": 0.0254, "step": 10270 }, { "epoch": 6.321131959397109, "grad_norm": 0.34765625, "learning_rate": 1.8441358024691357e-05, "loss": 0.0283, "step": 10275 }, { "epoch": 6.324207936019686, "grad_norm": 0.265625, "learning_rate": 1.8425925925925926e-05, "loss": 0.0233, "step": 10280 }, { "epoch": 6.3272839126422635, "grad_norm": 0.296875, "learning_rate": 1.8410493827160494e-05, "loss": 0.024, "step": 10285 }, { "epoch": 6.330359889264842, "grad_norm": 0.298828125, "learning_rate": 1.8395061728395062e-05, "loss": 0.0297, "step": 10290 }, { "epoch": 6.333435865887419, "grad_norm": 0.27734375, "learning_rate": 1.837962962962963e-05, "loss": 0.0231, "step": 10295 }, { "epoch": 6.336511842509997, "grad_norm": 0.265625, "learning_rate": 1.83641975308642e-05, "loss": 0.0261, "step": 10300 }, { "epoch": 6.339587819132575, "grad_norm": 0.271484375, "learning_rate": 1.8348765432098768e-05, "loss": 0.026, "step": 10305 }, { "epoch": 6.342663795755152, "grad_norm": 0.25, "learning_rate": 1.8333333333333333e-05, "loss": 0.0263, "step": 10310 }, { "epoch": 6.34573977237773, "grad_norm": 0.26171875, "learning_rate": 1.83179012345679e-05, "loss": 0.0219, "step": 10315 }, { "epoch": 6.348815749000307, "grad_norm": 0.3203125, "learning_rate": 1.830246913580247e-05, "loss": 0.0277, "step": 10320 }, { "epoch": 6.3518917256228855, "grad_norm": 0.28515625, "learning_rate": 1.8287037037037038e-05, "loss": 0.024, "step": 10325 }, { "epoch": 6.354967702245463, "grad_norm": 0.255859375, "learning_rate": 1.8271604938271607e-05, "loss": 0.0246, "step": 10330 }, { "epoch": 6.35804367886804, "grad_norm": 0.2890625, "learning_rate": 1.8256172839506175e-05, "loss": 0.0267, "step": 10335 }, { "epoch": 6.361119655490619, "grad_norm": 0.3203125, "learning_rate": 1.824074074074074e-05, "loss": 0.0238, "step": 10340 }, { "epoch": 6.364195632113196, "grad_norm": 0.267578125, "learning_rate": 1.822530864197531e-05, "loss": 0.028, "step": 10345 }, { "epoch": 6.3672716087357735, "grad_norm": 0.265625, "learning_rate": 1.8209876543209877e-05, "loss": 0.0256, "step": 10350 }, { "epoch": 6.370347585358351, "grad_norm": 0.255859375, "learning_rate": 1.8194444444444445e-05, "loss": 0.0257, "step": 10355 }, { "epoch": 6.373423561980929, "grad_norm": 0.275390625, "learning_rate": 1.8179012345679014e-05, "loss": 0.0236, "step": 10360 }, { "epoch": 6.376499538603507, "grad_norm": 0.25390625, "learning_rate": 1.8163580246913582e-05, "loss": 0.0259, "step": 10365 }, { "epoch": 6.379575515226084, "grad_norm": 0.259765625, "learning_rate": 1.814814814814815e-05, "loss": 0.0267, "step": 10370 }, { "epoch": 6.382651491848662, "grad_norm": 0.28515625, "learning_rate": 1.8132716049382716e-05, "loss": 0.0248, "step": 10375 }, { "epoch": 6.38572746847124, "grad_norm": 0.265625, "learning_rate": 1.8117283950617284e-05, "loss": 0.0257, "step": 10380 }, { "epoch": 6.388803445093817, "grad_norm": 0.32421875, "learning_rate": 1.8101851851851853e-05, "loss": 0.0274, "step": 10385 }, { "epoch": 6.391879421716395, "grad_norm": 0.2333984375, "learning_rate": 1.808641975308642e-05, "loss": 0.022, "step": 10390 }, { "epoch": 6.394955398338973, "grad_norm": 0.279296875, "learning_rate": 1.807098765432099e-05, "loss": 0.0227, "step": 10395 }, { "epoch": 6.39803137496155, "grad_norm": 0.2490234375, "learning_rate": 1.8055555555555555e-05, "loss": 0.0259, "step": 10400 }, { "epoch": 6.401107351584128, "grad_norm": 0.31640625, "learning_rate": 1.8040123456790127e-05, "loss": 0.0259, "step": 10405 }, { "epoch": 6.404183328206706, "grad_norm": 0.287109375, "learning_rate": 1.802469135802469e-05, "loss": 0.0239, "step": 10410 }, { "epoch": 6.407259304829283, "grad_norm": 0.279296875, "learning_rate": 1.800925925925926e-05, "loss": 0.0261, "step": 10415 }, { "epoch": 6.410335281451861, "grad_norm": 0.328125, "learning_rate": 1.799382716049383e-05, "loss": 0.0276, "step": 10420 }, { "epoch": 6.413411258074438, "grad_norm": 0.271484375, "learning_rate": 1.7978395061728397e-05, "loss": 0.0245, "step": 10425 }, { "epoch": 6.4164872346970165, "grad_norm": 0.2734375, "learning_rate": 1.7962962962962965e-05, "loss": 0.0238, "step": 10430 }, { "epoch": 6.419563211319594, "grad_norm": 0.291015625, "learning_rate": 1.794753086419753e-05, "loss": 0.0239, "step": 10435 }, { "epoch": 6.422639187942171, "grad_norm": 0.259765625, "learning_rate": 1.7932098765432102e-05, "loss": 0.0245, "step": 10440 }, { "epoch": 6.42571516456475, "grad_norm": 0.26171875, "learning_rate": 1.7916666666666667e-05, "loss": 0.0257, "step": 10445 }, { "epoch": 6.428791141187327, "grad_norm": 0.30859375, "learning_rate": 1.7901234567901236e-05, "loss": 0.0244, "step": 10450 }, { "epoch": 6.4318671178099045, "grad_norm": 0.2890625, "learning_rate": 1.7885802469135804e-05, "loss": 0.0248, "step": 10455 }, { "epoch": 6.434943094432482, "grad_norm": 0.30859375, "learning_rate": 1.787037037037037e-05, "loss": 0.0254, "step": 10460 }, { "epoch": 6.43801907105506, "grad_norm": 0.27734375, "learning_rate": 1.785493827160494e-05, "loss": 0.0243, "step": 10465 }, { "epoch": 6.441095047677638, "grad_norm": 0.28515625, "learning_rate": 1.7839506172839506e-05, "loss": 0.0238, "step": 10470 }, { "epoch": 6.444171024300215, "grad_norm": 0.259765625, "learning_rate": 1.7824074074074075e-05, "loss": 0.0259, "step": 10475 }, { "epoch": 6.447247000922793, "grad_norm": 0.26953125, "learning_rate": 1.7808641975308643e-05, "loss": 0.0245, "step": 10480 }, { "epoch": 6.450322977545371, "grad_norm": 0.2578125, "learning_rate": 1.7793209876543208e-05, "loss": 0.0226, "step": 10485 }, { "epoch": 6.453398954167948, "grad_norm": 0.333984375, "learning_rate": 1.777777777777778e-05, "loss": 0.0253, "step": 10490 }, { "epoch": 6.456474930790526, "grad_norm": 0.298828125, "learning_rate": 1.7762345679012345e-05, "loss": 0.0256, "step": 10495 }, { "epoch": 6.459550907413104, "grad_norm": 0.29296875, "learning_rate": 1.7746913580246917e-05, "loss": 0.028, "step": 10500 }, { "epoch": 6.462626884035681, "grad_norm": 0.23046875, "learning_rate": 1.7731481481481482e-05, "loss": 0.0223, "step": 10505 }, { "epoch": 6.465702860658259, "grad_norm": 0.294921875, "learning_rate": 1.771604938271605e-05, "loss": 0.0259, "step": 10510 }, { "epoch": 6.468778837280837, "grad_norm": 0.29296875, "learning_rate": 1.770061728395062e-05, "loss": 0.0276, "step": 10515 }, { "epoch": 6.471854813903414, "grad_norm": 0.30859375, "learning_rate": 1.7685185185185184e-05, "loss": 0.0242, "step": 10520 }, { "epoch": 6.474930790525992, "grad_norm": 0.29296875, "learning_rate": 1.7669753086419756e-05, "loss": 0.0265, "step": 10525 }, { "epoch": 6.478006767148569, "grad_norm": 0.32421875, "learning_rate": 1.765432098765432e-05, "loss": 0.0263, "step": 10530 }, { "epoch": 6.481082743771148, "grad_norm": 0.25, "learning_rate": 1.763888888888889e-05, "loss": 0.0255, "step": 10535 }, { "epoch": 6.484158720393725, "grad_norm": 0.3046875, "learning_rate": 1.7623456790123458e-05, "loss": 0.0244, "step": 10540 }, { "epoch": 6.487234697016302, "grad_norm": 0.27734375, "learning_rate": 1.7608024691358023e-05, "loss": 0.0233, "step": 10545 }, { "epoch": 6.490310673638881, "grad_norm": 0.287109375, "learning_rate": 1.7592592592592595e-05, "loss": 0.0252, "step": 10550 }, { "epoch": 6.493386650261458, "grad_norm": 0.2734375, "learning_rate": 1.757716049382716e-05, "loss": 0.0241, "step": 10555 }, { "epoch": 6.4964626268840355, "grad_norm": 0.275390625, "learning_rate": 1.756172839506173e-05, "loss": 0.0243, "step": 10560 }, { "epoch": 6.499538603506613, "grad_norm": 0.287109375, "learning_rate": 1.7546296296296297e-05, "loss": 0.0275, "step": 10565 }, { "epoch": 6.502614580129191, "grad_norm": 0.287109375, "learning_rate": 1.7530864197530865e-05, "loss": 0.0261, "step": 10570 }, { "epoch": 6.505690556751769, "grad_norm": 0.279296875, "learning_rate": 1.7515432098765433e-05, "loss": 0.0272, "step": 10575 }, { "epoch": 6.508766533374346, "grad_norm": 0.2578125, "learning_rate": 1.75e-05, "loss": 0.0215, "step": 10580 }, { "epoch": 6.511842509996924, "grad_norm": 0.283203125, "learning_rate": 1.748456790123457e-05, "loss": 0.0248, "step": 10585 }, { "epoch": 6.514918486619502, "grad_norm": 0.275390625, "learning_rate": 1.7469135802469135e-05, "loss": 0.0258, "step": 10590 }, { "epoch": 6.517994463242079, "grad_norm": 0.2490234375, "learning_rate": 1.7453703703703704e-05, "loss": 0.0233, "step": 10595 }, { "epoch": 6.521070439864657, "grad_norm": 0.279296875, "learning_rate": 1.7438271604938272e-05, "loss": 0.026, "step": 10600 }, { "epoch": 6.524146416487235, "grad_norm": 0.298828125, "learning_rate": 1.742283950617284e-05, "loss": 0.0244, "step": 10605 }, { "epoch": 6.527222393109812, "grad_norm": 0.265625, "learning_rate": 1.740740740740741e-05, "loss": 0.0248, "step": 10610 }, { "epoch": 6.53029836973239, "grad_norm": 0.259765625, "learning_rate": 1.7391975308641974e-05, "loss": 0.0245, "step": 10615 }, { "epoch": 6.533374346354968, "grad_norm": 0.275390625, "learning_rate": 1.7376543209876546e-05, "loss": 0.0252, "step": 10620 }, { "epoch": 6.5364503229775455, "grad_norm": 0.275390625, "learning_rate": 1.736111111111111e-05, "loss": 0.0243, "step": 10625 }, { "epoch": 6.539526299600123, "grad_norm": 0.310546875, "learning_rate": 1.734567901234568e-05, "loss": 0.0275, "step": 10630 }, { "epoch": 6.5426022762227, "grad_norm": 0.259765625, "learning_rate": 1.7330246913580248e-05, "loss": 0.0255, "step": 10635 }, { "epoch": 6.545678252845279, "grad_norm": 0.24609375, "learning_rate": 1.7314814814814813e-05, "loss": 0.0269, "step": 10640 }, { "epoch": 6.548754229467856, "grad_norm": 0.302734375, "learning_rate": 1.7299382716049385e-05, "loss": 0.0284, "step": 10645 }, { "epoch": 6.551830206090433, "grad_norm": 0.306640625, "learning_rate": 1.728395061728395e-05, "loss": 0.0274, "step": 10650 }, { "epoch": 6.554906182713012, "grad_norm": 0.291015625, "learning_rate": 1.726851851851852e-05, "loss": 0.028, "step": 10655 }, { "epoch": 6.557982159335589, "grad_norm": 0.267578125, "learning_rate": 1.7253086419753087e-05, "loss": 0.0241, "step": 10660 }, { "epoch": 6.561058135958167, "grad_norm": 0.255859375, "learning_rate": 1.7237654320987655e-05, "loss": 0.0248, "step": 10665 }, { "epoch": 6.564134112580744, "grad_norm": 0.32421875, "learning_rate": 1.7222222222222224e-05, "loss": 0.0254, "step": 10670 }, { "epoch": 6.567210089203322, "grad_norm": 0.30859375, "learning_rate": 1.720679012345679e-05, "loss": 0.0259, "step": 10675 }, { "epoch": 6.5702860658259, "grad_norm": 0.287109375, "learning_rate": 1.719135802469136e-05, "loss": 0.0264, "step": 10680 }, { "epoch": 6.573362042448477, "grad_norm": 0.27734375, "learning_rate": 1.7175925925925926e-05, "loss": 0.0244, "step": 10685 }, { "epoch": 6.576438019071055, "grad_norm": 0.255859375, "learning_rate": 1.7160493827160494e-05, "loss": 0.0259, "step": 10690 }, { "epoch": 6.579513995693633, "grad_norm": 0.240234375, "learning_rate": 1.7145061728395063e-05, "loss": 0.0236, "step": 10695 }, { "epoch": 6.58258997231621, "grad_norm": 0.234375, "learning_rate": 1.712962962962963e-05, "loss": 0.024, "step": 10700 }, { "epoch": 6.585665948938788, "grad_norm": 0.23828125, "learning_rate": 1.71141975308642e-05, "loss": 0.024, "step": 10705 }, { "epoch": 6.588741925561366, "grad_norm": 0.283203125, "learning_rate": 1.7098765432098765e-05, "loss": 0.0237, "step": 10710 }, { "epoch": 6.591817902183943, "grad_norm": 0.291015625, "learning_rate": 1.7083333333333333e-05, "loss": 0.0237, "step": 10715 }, { "epoch": 6.594893878806521, "grad_norm": 0.28515625, "learning_rate": 1.70679012345679e-05, "loss": 0.0292, "step": 10720 }, { "epoch": 6.597969855429099, "grad_norm": 0.25390625, "learning_rate": 1.705246913580247e-05, "loss": 0.0227, "step": 10725 }, { "epoch": 6.6010458320516765, "grad_norm": 0.30859375, "learning_rate": 1.7037037037037038e-05, "loss": 0.0258, "step": 10730 }, { "epoch": 6.604121808674254, "grad_norm": 0.27734375, "learning_rate": 1.7021604938271607e-05, "loss": 0.0252, "step": 10735 }, { "epoch": 6.607197785296831, "grad_norm": 0.3125, "learning_rate": 1.7006172839506175e-05, "loss": 0.0238, "step": 10740 }, { "epoch": 6.61027376191941, "grad_norm": 0.28515625, "learning_rate": 1.699074074074074e-05, "loss": 0.0258, "step": 10745 }, { "epoch": 6.613349738541987, "grad_norm": 0.330078125, "learning_rate": 1.697530864197531e-05, "loss": 0.0261, "step": 10750 }, { "epoch": 6.6164257151645645, "grad_norm": 0.29296875, "learning_rate": 1.6959876543209877e-05, "loss": 0.026, "step": 10755 }, { "epoch": 6.619501691787143, "grad_norm": 0.310546875, "learning_rate": 1.6944444444444446e-05, "loss": 0.0242, "step": 10760 }, { "epoch": 6.62257766840972, "grad_norm": 0.271484375, "learning_rate": 1.6929012345679014e-05, "loss": 0.0261, "step": 10765 }, { "epoch": 6.625653645032298, "grad_norm": 0.3125, "learning_rate": 1.6913580246913582e-05, "loss": 0.0269, "step": 10770 }, { "epoch": 6.628729621654875, "grad_norm": 0.26953125, "learning_rate": 1.6898148148148148e-05, "loss": 0.029, "step": 10775 }, { "epoch": 6.631805598277453, "grad_norm": 0.27734375, "learning_rate": 1.6882716049382716e-05, "loss": 0.0255, "step": 10780 }, { "epoch": 6.634881574900031, "grad_norm": 0.337890625, "learning_rate": 1.6867283950617284e-05, "loss": 0.0259, "step": 10785 }, { "epoch": 6.637957551522608, "grad_norm": 0.3203125, "learning_rate": 1.6851851851851853e-05, "loss": 0.0253, "step": 10790 }, { "epoch": 6.641033528145186, "grad_norm": 0.310546875, "learning_rate": 1.683641975308642e-05, "loss": 0.0243, "step": 10795 }, { "epoch": 6.644109504767764, "grad_norm": 0.302734375, "learning_rate": 1.682098765432099e-05, "loss": 0.0266, "step": 10800 }, { "epoch": 6.647185481390341, "grad_norm": 0.244140625, "learning_rate": 1.6805555555555558e-05, "loss": 0.0248, "step": 10805 }, { "epoch": 6.650261458012919, "grad_norm": 0.37109375, "learning_rate": 1.6790123456790123e-05, "loss": 0.0285, "step": 10810 }, { "epoch": 6.653337434635497, "grad_norm": 0.2431640625, "learning_rate": 1.6774691358024692e-05, "loss": 0.0256, "step": 10815 }, { "epoch": 6.656413411258074, "grad_norm": 0.474609375, "learning_rate": 1.675925925925926e-05, "loss": 0.0276, "step": 10820 }, { "epoch": 6.659489387880652, "grad_norm": 0.263671875, "learning_rate": 1.674382716049383e-05, "loss": 0.0247, "step": 10825 }, { "epoch": 6.66256536450323, "grad_norm": 0.29296875, "learning_rate": 1.6728395061728397e-05, "loss": 0.0239, "step": 10830 }, { "epoch": 6.6656413411258075, "grad_norm": 0.259765625, "learning_rate": 1.6712962962962962e-05, "loss": 0.0244, "step": 10835 }, { "epoch": 6.668717317748385, "grad_norm": 0.3125, "learning_rate": 1.669753086419753e-05, "loss": 0.0283, "step": 10840 }, { "epoch": 6.671793294370962, "grad_norm": 0.287109375, "learning_rate": 1.66820987654321e-05, "loss": 0.0273, "step": 10845 }, { "epoch": 6.674869270993541, "grad_norm": 0.2294921875, "learning_rate": 1.6666666666666667e-05, "loss": 0.0249, "step": 10850 }, { "epoch": 6.677945247616118, "grad_norm": 0.263671875, "learning_rate": 1.6651234567901236e-05, "loss": 0.0244, "step": 10855 }, { "epoch": 6.6810212242386955, "grad_norm": 0.259765625, "learning_rate": 1.6635802469135804e-05, "loss": 0.0261, "step": 10860 }, { "epoch": 6.684097200861274, "grad_norm": 0.3203125, "learning_rate": 1.6620370370370373e-05, "loss": 0.0274, "step": 10865 }, { "epoch": 6.687173177483851, "grad_norm": 0.267578125, "learning_rate": 1.6604938271604938e-05, "loss": 0.0244, "step": 10870 }, { "epoch": 6.690249154106429, "grad_norm": 0.28515625, "learning_rate": 1.6589506172839506e-05, "loss": 0.0251, "step": 10875 }, { "epoch": 6.693325130729006, "grad_norm": 0.28515625, "learning_rate": 1.6574074074074075e-05, "loss": 0.0248, "step": 10880 }, { "epoch": 6.696401107351584, "grad_norm": 0.34765625, "learning_rate": 1.6558641975308643e-05, "loss": 0.0262, "step": 10885 }, { "epoch": 6.699477083974162, "grad_norm": 0.30859375, "learning_rate": 1.654320987654321e-05, "loss": 0.025, "step": 10890 }, { "epoch": 6.702553060596739, "grad_norm": 0.279296875, "learning_rate": 1.6527777777777777e-05, "loss": 0.0259, "step": 10895 }, { "epoch": 6.7056290372193175, "grad_norm": 0.296875, "learning_rate": 1.651234567901235e-05, "loss": 0.0254, "step": 10900 }, { "epoch": 6.708705013841895, "grad_norm": 0.2451171875, "learning_rate": 1.6496913580246914e-05, "loss": 0.0241, "step": 10905 }, { "epoch": 6.711780990464472, "grad_norm": 0.24609375, "learning_rate": 1.6481481481481482e-05, "loss": 0.0233, "step": 10910 }, { "epoch": 6.71485696708705, "grad_norm": 0.318359375, "learning_rate": 1.646604938271605e-05, "loss": 0.0275, "step": 10915 }, { "epoch": 6.717932943709628, "grad_norm": 0.2734375, "learning_rate": 1.645061728395062e-05, "loss": 0.0241, "step": 10920 }, { "epoch": 6.721008920332205, "grad_norm": 0.30859375, "learning_rate": 1.6435185185185187e-05, "loss": 0.0227, "step": 10925 }, { "epoch": 6.724084896954783, "grad_norm": 0.26953125, "learning_rate": 1.6419753086419752e-05, "loss": 0.0242, "step": 10930 }, { "epoch": 6.727160873577361, "grad_norm": 0.279296875, "learning_rate": 1.6404320987654324e-05, "loss": 0.0253, "step": 10935 }, { "epoch": 6.730236850199939, "grad_norm": 0.283203125, "learning_rate": 1.638888888888889e-05, "loss": 0.0254, "step": 10940 }, { "epoch": 6.733312826822516, "grad_norm": 0.30859375, "learning_rate": 1.6373456790123458e-05, "loss": 0.0268, "step": 10945 }, { "epoch": 6.736388803445093, "grad_norm": 0.255859375, "learning_rate": 1.6358024691358026e-05, "loss": 0.0259, "step": 10950 }, { "epoch": 6.739464780067672, "grad_norm": 0.287109375, "learning_rate": 1.634259259259259e-05, "loss": 0.0256, "step": 10955 }, { "epoch": 6.742540756690249, "grad_norm": 0.275390625, "learning_rate": 1.6327160493827163e-05, "loss": 0.0244, "step": 10960 }, { "epoch": 6.7456167333128265, "grad_norm": 0.255859375, "learning_rate": 1.6311728395061728e-05, "loss": 0.0257, "step": 10965 }, { "epoch": 6.748692709935405, "grad_norm": 0.2890625, "learning_rate": 1.62962962962963e-05, "loss": 0.0247, "step": 10970 }, { "epoch": 6.751768686557982, "grad_norm": 0.267578125, "learning_rate": 1.6280864197530865e-05, "loss": 0.0256, "step": 10975 }, { "epoch": 6.75484466318056, "grad_norm": 0.3203125, "learning_rate": 1.6265432098765434e-05, "loss": 0.0252, "step": 10980 }, { "epoch": 6.757920639803137, "grad_norm": 0.283203125, "learning_rate": 1.6250000000000002e-05, "loss": 0.0261, "step": 10985 }, { "epoch": 6.760996616425715, "grad_norm": 0.287109375, "learning_rate": 1.6234567901234567e-05, "loss": 0.0261, "step": 10990 }, { "epoch": 6.764072593048293, "grad_norm": 0.2890625, "learning_rate": 1.621913580246914e-05, "loss": 0.0276, "step": 10995 }, { "epoch": 6.76714856967087, "grad_norm": 0.279296875, "learning_rate": 1.6203703703703704e-05, "loss": 0.0257, "step": 11000 }, { "epoch": 6.7702245462934485, "grad_norm": 0.2314453125, "learning_rate": 1.6188271604938272e-05, "loss": 0.0246, "step": 11005 }, { "epoch": 6.773300522916026, "grad_norm": 0.25390625, "learning_rate": 1.617283950617284e-05, "loss": 0.024, "step": 11010 }, { "epoch": 6.776376499538603, "grad_norm": 0.302734375, "learning_rate": 1.6157407407407406e-05, "loss": 0.0257, "step": 11015 }, { "epoch": 6.779452476161181, "grad_norm": 0.490234375, "learning_rate": 1.6141975308641978e-05, "loss": 0.0261, "step": 11020 }, { "epoch": 6.782528452783759, "grad_norm": 0.3046875, "learning_rate": 1.6126543209876543e-05, "loss": 0.0262, "step": 11025 }, { "epoch": 6.7856044294063365, "grad_norm": 0.275390625, "learning_rate": 1.6111111111111115e-05, "loss": 0.0268, "step": 11030 }, { "epoch": 6.788680406028914, "grad_norm": 0.255859375, "learning_rate": 1.609567901234568e-05, "loss": 0.0283, "step": 11035 }, { "epoch": 6.791756382651492, "grad_norm": 0.255859375, "learning_rate": 1.6080246913580245e-05, "loss": 0.0248, "step": 11040 }, { "epoch": 6.79483235927407, "grad_norm": 0.298828125, "learning_rate": 1.6064814814814817e-05, "loss": 0.025, "step": 11045 }, { "epoch": 6.797908335896647, "grad_norm": 0.26953125, "learning_rate": 1.604938271604938e-05, "loss": 0.0232, "step": 11050 }, { "epoch": 6.800984312519224, "grad_norm": 0.2490234375, "learning_rate": 1.6033950617283953e-05, "loss": 0.0242, "step": 11055 }, { "epoch": 6.804060289141803, "grad_norm": 0.306640625, "learning_rate": 1.601851851851852e-05, "loss": 0.0258, "step": 11060 }, { "epoch": 6.80713626576438, "grad_norm": 0.3046875, "learning_rate": 1.6003086419753087e-05, "loss": 0.0227, "step": 11065 }, { "epoch": 6.8102122423869575, "grad_norm": 0.296875, "learning_rate": 1.5987654320987655e-05, "loss": 0.0255, "step": 11070 }, { "epoch": 6.813288219009536, "grad_norm": 0.322265625, "learning_rate": 1.597222222222222e-05, "loss": 0.0232, "step": 11075 }, { "epoch": 6.816364195632113, "grad_norm": 0.37890625, "learning_rate": 1.5956790123456792e-05, "loss": 0.0236, "step": 11080 }, { "epoch": 6.819440172254691, "grad_norm": 0.25, "learning_rate": 1.5941358024691357e-05, "loss": 0.024, "step": 11085 }, { "epoch": 6.822516148877268, "grad_norm": 0.25390625, "learning_rate": 1.5925925925925926e-05, "loss": 0.0241, "step": 11090 }, { "epoch": 6.825592125499846, "grad_norm": 0.306640625, "learning_rate": 1.5910493827160494e-05, "loss": 0.0269, "step": 11095 }, { "epoch": 6.828668102122424, "grad_norm": 0.275390625, "learning_rate": 1.5895061728395063e-05, "loss": 0.0271, "step": 11100 }, { "epoch": 6.831744078745001, "grad_norm": 0.28515625, "learning_rate": 1.587962962962963e-05, "loss": 0.0247, "step": 11105 }, { "epoch": 6.8348200553675795, "grad_norm": 0.25390625, "learning_rate": 1.5864197530864196e-05, "loss": 0.0259, "step": 11110 }, { "epoch": 6.837896031990157, "grad_norm": 0.30078125, "learning_rate": 1.5848765432098768e-05, "loss": 0.025, "step": 11115 }, { "epoch": 6.840972008612734, "grad_norm": 0.287109375, "learning_rate": 1.5833333333333333e-05, "loss": 0.0256, "step": 11120 }, { "epoch": 6.844047985235312, "grad_norm": 0.267578125, "learning_rate": 1.58179012345679e-05, "loss": 0.0249, "step": 11125 }, { "epoch": 6.84712396185789, "grad_norm": 0.259765625, "learning_rate": 1.580246913580247e-05, "loss": 0.0255, "step": 11130 }, { "epoch": 6.8501999384804675, "grad_norm": 0.29296875, "learning_rate": 1.578703703703704e-05, "loss": 0.029, "step": 11135 }, { "epoch": 6.853275915103045, "grad_norm": 0.2734375, "learning_rate": 1.5771604938271607e-05, "loss": 0.0242, "step": 11140 }, { "epoch": 6.856351891725623, "grad_norm": 0.25390625, "learning_rate": 1.5756172839506172e-05, "loss": 0.0275, "step": 11145 }, { "epoch": 6.859427868348201, "grad_norm": 0.31640625, "learning_rate": 1.574074074074074e-05, "loss": 0.0264, "step": 11150 }, { "epoch": 6.862503844970778, "grad_norm": 0.310546875, "learning_rate": 1.572530864197531e-05, "loss": 0.0255, "step": 11155 }, { "epoch": 6.8655798215933554, "grad_norm": 0.2255859375, "learning_rate": 1.5709876543209877e-05, "loss": 0.026, "step": 11160 }, { "epoch": 6.868655798215934, "grad_norm": 0.291015625, "learning_rate": 1.5694444444444446e-05, "loss": 0.0253, "step": 11165 }, { "epoch": 6.871731774838511, "grad_norm": 0.279296875, "learning_rate": 1.5679012345679014e-05, "loss": 0.0242, "step": 11170 }, { "epoch": 6.874807751461089, "grad_norm": 0.2734375, "learning_rate": 1.5663580246913583e-05, "loss": 0.0253, "step": 11175 }, { "epoch": 6.877883728083667, "grad_norm": 0.302734375, "learning_rate": 1.5648148148148148e-05, "loss": 0.0272, "step": 11180 }, { "epoch": 6.880959704706244, "grad_norm": 0.345703125, "learning_rate": 1.5632716049382716e-05, "loss": 0.0268, "step": 11185 }, { "epoch": 6.884035681328822, "grad_norm": 0.26171875, "learning_rate": 1.5617283950617285e-05, "loss": 0.024, "step": 11190 }, { "epoch": 6.887111657951399, "grad_norm": 0.294921875, "learning_rate": 1.5601851851851853e-05, "loss": 0.0253, "step": 11195 }, { "epoch": 6.890187634573977, "grad_norm": 0.265625, "learning_rate": 1.558641975308642e-05, "loss": 0.0241, "step": 11200 }, { "epoch": 6.893263611196555, "grad_norm": 0.271484375, "learning_rate": 1.5570987654320986e-05, "loss": 0.027, "step": 11205 }, { "epoch": 6.896339587819132, "grad_norm": 0.33203125, "learning_rate": 1.5555555555555555e-05, "loss": 0.0265, "step": 11210 }, { "epoch": 6.899415564441711, "grad_norm": 0.29296875, "learning_rate": 1.5540123456790123e-05, "loss": 0.0254, "step": 11215 }, { "epoch": 6.902491541064288, "grad_norm": 0.28125, "learning_rate": 1.5524691358024692e-05, "loss": 0.0249, "step": 11220 }, { "epoch": 6.905567517686865, "grad_norm": 0.25390625, "learning_rate": 1.550925925925926e-05, "loss": 0.0248, "step": 11225 }, { "epoch": 6.908643494309443, "grad_norm": 0.28515625, "learning_rate": 1.549382716049383e-05, "loss": 0.0248, "step": 11230 }, { "epoch": 6.911719470932021, "grad_norm": 0.251953125, "learning_rate": 1.5478395061728397e-05, "loss": 0.0218, "step": 11235 }, { "epoch": 6.9147954475545985, "grad_norm": 0.287109375, "learning_rate": 1.5462962962962962e-05, "loss": 0.0272, "step": 11240 }, { "epoch": 6.917871424177176, "grad_norm": 0.2431640625, "learning_rate": 1.544753086419753e-05, "loss": 0.0262, "step": 11245 }, { "epoch": 6.920947400799754, "grad_norm": 0.27734375, "learning_rate": 1.54320987654321e-05, "loss": 0.0248, "step": 11250 }, { "epoch": 6.924023377422332, "grad_norm": 0.267578125, "learning_rate": 1.5416666666666668e-05, "loss": 0.0243, "step": 11255 }, { "epoch": 6.927099354044909, "grad_norm": 0.275390625, "learning_rate": 1.5401234567901236e-05, "loss": 0.0259, "step": 11260 }, { "epoch": 6.9301753306674865, "grad_norm": 0.287109375, "learning_rate": 1.5385802469135804e-05, "loss": 0.0264, "step": 11265 }, { "epoch": 6.933251307290065, "grad_norm": 0.31640625, "learning_rate": 1.537037037037037e-05, "loss": 0.0284, "step": 11270 }, { "epoch": 6.936327283912642, "grad_norm": 0.283203125, "learning_rate": 1.5354938271604938e-05, "loss": 0.025, "step": 11275 }, { "epoch": 6.93940326053522, "grad_norm": 0.2890625, "learning_rate": 1.5339506172839506e-05, "loss": 0.0246, "step": 11280 }, { "epoch": 6.942479237157798, "grad_norm": 0.296875, "learning_rate": 1.5324074074074075e-05, "loss": 0.0276, "step": 11285 }, { "epoch": 6.945555213780375, "grad_norm": 0.2470703125, "learning_rate": 1.5308641975308643e-05, "loss": 0.0261, "step": 11290 }, { "epoch": 6.948631190402953, "grad_norm": 0.26953125, "learning_rate": 1.5293209876543212e-05, "loss": 0.0249, "step": 11295 }, { "epoch": 6.95170716702553, "grad_norm": 0.30859375, "learning_rate": 1.527777777777778e-05, "loss": 0.0257, "step": 11300 }, { "epoch": 6.9547831436481085, "grad_norm": 0.298828125, "learning_rate": 1.5262345679012345e-05, "loss": 0.0256, "step": 11305 }, { "epoch": 6.957859120270686, "grad_norm": 0.25, "learning_rate": 1.5246913580246914e-05, "loss": 0.0249, "step": 11310 }, { "epoch": 6.960935096893263, "grad_norm": 0.27734375, "learning_rate": 1.5231481481481482e-05, "loss": 0.0258, "step": 11315 }, { "epoch": 6.964011073515842, "grad_norm": 0.291015625, "learning_rate": 1.5216049382716049e-05, "loss": 0.0277, "step": 11320 }, { "epoch": 6.967087050138419, "grad_norm": 0.267578125, "learning_rate": 1.5200617283950619e-05, "loss": 0.0271, "step": 11325 }, { "epoch": 6.970163026760996, "grad_norm": 0.2890625, "learning_rate": 1.5185185185185186e-05, "loss": 0.0276, "step": 11330 }, { "epoch": 6.973239003383574, "grad_norm": 0.27734375, "learning_rate": 1.5169753086419756e-05, "loss": 0.0272, "step": 11335 }, { "epoch": 6.976314980006152, "grad_norm": 0.2255859375, "learning_rate": 1.5154320987654321e-05, "loss": 0.0261, "step": 11340 }, { "epoch": 6.9793909566287295, "grad_norm": 0.30859375, "learning_rate": 1.5138888888888888e-05, "loss": 0.0249, "step": 11345 }, { "epoch": 6.982466933251307, "grad_norm": 0.267578125, "learning_rate": 1.5123456790123458e-05, "loss": 0.024, "step": 11350 }, { "epoch": 6.985542909873885, "grad_norm": 0.2734375, "learning_rate": 1.5108024691358025e-05, "loss": 0.0262, "step": 11355 }, { "epoch": 6.988618886496463, "grad_norm": 0.326171875, "learning_rate": 1.5092592592592595e-05, "loss": 0.0262, "step": 11360 }, { "epoch": 6.99169486311904, "grad_norm": 0.271484375, "learning_rate": 1.5077160493827162e-05, "loss": 0.0253, "step": 11365 }, { "epoch": 6.9947708397416175, "grad_norm": 0.31640625, "learning_rate": 1.506172839506173e-05, "loss": 0.0264, "step": 11370 }, { "epoch": 6.997846816364196, "grad_norm": 0.28515625, "learning_rate": 1.5046296296296297e-05, "loss": 0.0259, "step": 11375 }, { "epoch": 7.000922792986773, "grad_norm": 0.25390625, "learning_rate": 1.5030864197530863e-05, "loss": 0.0247, "step": 11380 }, { "epoch": 7.003998769609351, "grad_norm": 0.236328125, "learning_rate": 1.5015432098765434e-05, "loss": 0.0217, "step": 11385 }, { "epoch": 7.007074746231929, "grad_norm": 0.265625, "learning_rate": 1.5e-05, "loss": 0.0218, "step": 11390 }, { "epoch": 7.010150722854506, "grad_norm": 0.2490234375, "learning_rate": 1.4984567901234569e-05, "loss": 0.0213, "step": 11395 }, { "epoch": 7.013226699477084, "grad_norm": 0.25, "learning_rate": 1.4969135802469136e-05, "loss": 0.026, "step": 11400 }, { "epoch": 7.016302676099662, "grad_norm": 0.244140625, "learning_rate": 1.4953703703703702e-05, "loss": 0.0228, "step": 11405 }, { "epoch": 7.0193786527222395, "grad_norm": 0.306640625, "learning_rate": 1.4938271604938272e-05, "loss": 0.0241, "step": 11410 }, { "epoch": 7.022454629344817, "grad_norm": 0.2431640625, "learning_rate": 1.492283950617284e-05, "loss": 0.0231, "step": 11415 }, { "epoch": 7.025530605967394, "grad_norm": 0.265625, "learning_rate": 1.490740740740741e-05, "loss": 0.0215, "step": 11420 }, { "epoch": 7.028606582589973, "grad_norm": 0.294921875, "learning_rate": 1.4891975308641976e-05, "loss": 0.0249, "step": 11425 }, { "epoch": 7.03168255921255, "grad_norm": 0.265625, "learning_rate": 1.4876543209876545e-05, "loss": 0.0225, "step": 11430 }, { "epoch": 7.0347585358351274, "grad_norm": 0.2890625, "learning_rate": 1.4861111111111111e-05, "loss": 0.0244, "step": 11435 }, { "epoch": 7.037834512457706, "grad_norm": 0.30859375, "learning_rate": 1.4845679012345678e-05, "loss": 0.025, "step": 11440 }, { "epoch": 7.040910489080283, "grad_norm": 0.271484375, "learning_rate": 1.4830246913580248e-05, "loss": 0.0245, "step": 11445 }, { "epoch": 7.043986465702861, "grad_norm": 0.275390625, "learning_rate": 1.4814814814814815e-05, "loss": 0.0236, "step": 11450 }, { "epoch": 7.047062442325438, "grad_norm": 0.28515625, "learning_rate": 1.4799382716049383e-05, "loss": 0.0228, "step": 11455 }, { "epoch": 7.050138418948016, "grad_norm": 0.240234375, "learning_rate": 1.478395061728395e-05, "loss": 0.024, "step": 11460 }, { "epoch": 7.053214395570594, "grad_norm": 0.294921875, "learning_rate": 1.476851851851852e-05, "loss": 0.0229, "step": 11465 }, { "epoch": 7.056290372193171, "grad_norm": 0.259765625, "learning_rate": 1.4753086419753087e-05, "loss": 0.0231, "step": 11470 }, { "epoch": 7.059366348815749, "grad_norm": 0.296875, "learning_rate": 1.4737654320987654e-05, "loss": 0.0221, "step": 11475 }, { "epoch": 7.062442325438327, "grad_norm": 0.28125, "learning_rate": 1.4722222222222224e-05, "loss": 0.0228, "step": 11480 }, { "epoch": 7.065518302060904, "grad_norm": 0.25390625, "learning_rate": 1.470679012345679e-05, "loss": 0.0222, "step": 11485 }, { "epoch": 7.068594278683482, "grad_norm": 0.267578125, "learning_rate": 1.4691358024691359e-05, "loss": 0.0232, "step": 11490 }, { "epoch": 7.07167025530606, "grad_norm": 0.263671875, "learning_rate": 1.4675925925925926e-05, "loss": 0.0257, "step": 11495 }, { "epoch": 7.074746231928637, "grad_norm": 0.275390625, "learning_rate": 1.4660493827160496e-05, "loss": 0.0225, "step": 11500 }, { "epoch": 7.077822208551215, "grad_norm": 0.2578125, "learning_rate": 1.4645061728395063e-05, "loss": 0.0228, "step": 11505 }, { "epoch": 7.080898185173793, "grad_norm": 0.2265625, "learning_rate": 1.462962962962963e-05, "loss": 0.0229, "step": 11510 }, { "epoch": 7.0839741617963705, "grad_norm": 0.263671875, "learning_rate": 1.4614197530864198e-05, "loss": 0.0249, "step": 11515 }, { "epoch": 7.087050138418948, "grad_norm": 0.271484375, "learning_rate": 1.4598765432098765e-05, "loss": 0.0234, "step": 11520 }, { "epoch": 7.090126115041525, "grad_norm": 0.255859375, "learning_rate": 1.4583333333333335e-05, "loss": 0.0222, "step": 11525 }, { "epoch": 7.093202091664104, "grad_norm": 0.255859375, "learning_rate": 1.4567901234567902e-05, "loss": 0.0228, "step": 11530 }, { "epoch": 7.096278068286681, "grad_norm": 0.283203125, "learning_rate": 1.4552469135802472e-05, "loss": 0.0253, "step": 11535 }, { "epoch": 7.0993540449092585, "grad_norm": 0.26171875, "learning_rate": 1.4537037037037039e-05, "loss": 0.0234, "step": 11540 }, { "epoch": 7.102430021531837, "grad_norm": 0.349609375, "learning_rate": 1.4521604938271605e-05, "loss": 0.0226, "step": 11545 }, { "epoch": 7.105505998154414, "grad_norm": 0.234375, "learning_rate": 1.4506172839506174e-05, "loss": 0.0231, "step": 11550 }, { "epoch": 7.108581974776992, "grad_norm": 0.265625, "learning_rate": 1.449074074074074e-05, "loss": 0.0217, "step": 11555 }, { "epoch": 7.111657951399569, "grad_norm": 0.263671875, "learning_rate": 1.447530864197531e-05, "loss": 0.023, "step": 11560 }, { "epoch": 7.114733928022147, "grad_norm": 0.263671875, "learning_rate": 1.4459876543209877e-05, "loss": 0.0257, "step": 11565 }, { "epoch": 7.117809904644725, "grad_norm": 0.28515625, "learning_rate": 1.4444444444444444e-05, "loss": 0.022, "step": 11570 }, { "epoch": 7.120885881267302, "grad_norm": 0.263671875, "learning_rate": 1.4429012345679013e-05, "loss": 0.0222, "step": 11575 }, { "epoch": 7.1239618578898805, "grad_norm": 0.26953125, "learning_rate": 1.441358024691358e-05, "loss": 0.024, "step": 11580 }, { "epoch": 7.127037834512458, "grad_norm": 0.291015625, "learning_rate": 1.439814814814815e-05, "loss": 0.0255, "step": 11585 }, { "epoch": 7.130113811135035, "grad_norm": 0.275390625, "learning_rate": 1.4382716049382716e-05, "loss": 0.0237, "step": 11590 }, { "epoch": 7.133189787757613, "grad_norm": 0.302734375, "learning_rate": 1.4367283950617286e-05, "loss": 0.0234, "step": 11595 }, { "epoch": 7.136265764380191, "grad_norm": 0.298828125, "learning_rate": 1.4351851851851853e-05, "loss": 0.0235, "step": 11600 }, { "epoch": 7.139341741002768, "grad_norm": 0.29296875, "learning_rate": 1.433641975308642e-05, "loss": 0.0253, "step": 11605 }, { "epoch": 7.142417717625346, "grad_norm": 0.232421875, "learning_rate": 1.4320987654320988e-05, "loss": 0.0198, "step": 11610 }, { "epoch": 7.145493694247924, "grad_norm": 0.275390625, "learning_rate": 1.4305555555555555e-05, "loss": 0.0225, "step": 11615 }, { "epoch": 7.1485696708705015, "grad_norm": 0.248046875, "learning_rate": 1.4290123456790125e-05, "loss": 0.0225, "step": 11620 }, { "epoch": 7.151645647493079, "grad_norm": 0.302734375, "learning_rate": 1.4274691358024692e-05, "loss": 0.026, "step": 11625 }, { "epoch": 7.154721624115656, "grad_norm": 0.27734375, "learning_rate": 1.425925925925926e-05, "loss": 0.0258, "step": 11630 }, { "epoch": 7.157797600738235, "grad_norm": 0.26953125, "learning_rate": 1.4243827160493827e-05, "loss": 0.0228, "step": 11635 }, { "epoch": 7.160873577360812, "grad_norm": 0.26171875, "learning_rate": 1.4228395061728394e-05, "loss": 0.0235, "step": 11640 }, { "epoch": 7.1639495539833895, "grad_norm": 0.28125, "learning_rate": 1.4212962962962964e-05, "loss": 0.0249, "step": 11645 }, { "epoch": 7.167025530605968, "grad_norm": 0.255859375, "learning_rate": 1.419753086419753e-05, "loss": 0.0232, "step": 11650 }, { "epoch": 7.170101507228545, "grad_norm": 0.283203125, "learning_rate": 1.4182098765432101e-05, "loss": 0.0227, "step": 11655 }, { "epoch": 7.173177483851123, "grad_norm": 0.34375, "learning_rate": 1.4166666666666668e-05, "loss": 0.0244, "step": 11660 }, { "epoch": 7.1762534604737, "grad_norm": 0.263671875, "learning_rate": 1.4151234567901236e-05, "loss": 0.0221, "step": 11665 }, { "epoch": 7.179329437096278, "grad_norm": 0.265625, "learning_rate": 1.4135802469135803e-05, "loss": 0.0227, "step": 11670 }, { "epoch": 7.182405413718856, "grad_norm": 0.251953125, "learning_rate": 1.412037037037037e-05, "loss": 0.0232, "step": 11675 }, { "epoch": 7.185481390341433, "grad_norm": 0.283203125, "learning_rate": 1.410493827160494e-05, "loss": 0.0233, "step": 11680 }, { "epoch": 7.1885573669640115, "grad_norm": 0.263671875, "learning_rate": 1.4089506172839507e-05, "loss": 0.0239, "step": 11685 }, { "epoch": 7.191633343586589, "grad_norm": 0.30859375, "learning_rate": 1.4074074074074075e-05, "loss": 0.0245, "step": 11690 }, { "epoch": 7.194709320209166, "grad_norm": 0.251953125, "learning_rate": 1.4058641975308642e-05, "loss": 0.0223, "step": 11695 }, { "epoch": 7.197785296831744, "grad_norm": 0.287109375, "learning_rate": 1.4043209876543212e-05, "loss": 0.0236, "step": 11700 }, { "epoch": 7.200861273454322, "grad_norm": 0.234375, "learning_rate": 1.4027777777777779e-05, "loss": 0.023, "step": 11705 }, { "epoch": 7.2039372500768994, "grad_norm": 0.302734375, "learning_rate": 1.4012345679012345e-05, "loss": 0.0236, "step": 11710 }, { "epoch": 7.207013226699477, "grad_norm": 0.25390625, "learning_rate": 1.3996913580246916e-05, "loss": 0.0245, "step": 11715 }, { "epoch": 7.210089203322055, "grad_norm": 0.265625, "learning_rate": 1.3981481481481482e-05, "loss": 0.0239, "step": 11720 }, { "epoch": 7.213165179944633, "grad_norm": 0.279296875, "learning_rate": 1.396604938271605e-05, "loss": 0.0211, "step": 11725 }, { "epoch": 7.21624115656721, "grad_norm": 0.27734375, "learning_rate": 1.3950617283950617e-05, "loss": 0.0238, "step": 11730 }, { "epoch": 7.219317133189787, "grad_norm": 0.263671875, "learning_rate": 1.3935185185185188e-05, "loss": 0.0245, "step": 11735 }, { "epoch": 7.222393109812366, "grad_norm": 0.265625, "learning_rate": 1.3919753086419754e-05, "loss": 0.0223, "step": 11740 }, { "epoch": 7.225469086434943, "grad_norm": 0.2734375, "learning_rate": 1.3904320987654321e-05, "loss": 0.0221, "step": 11745 }, { "epoch": 7.2285450630575205, "grad_norm": 0.259765625, "learning_rate": 1.388888888888889e-05, "loss": 0.0232, "step": 11750 }, { "epoch": 7.231621039680099, "grad_norm": 0.294921875, "learning_rate": 1.3873456790123456e-05, "loss": 0.0253, "step": 11755 }, { "epoch": 7.234697016302676, "grad_norm": 0.27734375, "learning_rate": 1.3858024691358026e-05, "loss": 0.0225, "step": 11760 }, { "epoch": 7.237772992925254, "grad_norm": 0.26171875, "learning_rate": 1.3842592592592593e-05, "loss": 0.0237, "step": 11765 }, { "epoch": 7.240848969547831, "grad_norm": 0.314453125, "learning_rate": 1.382716049382716e-05, "loss": 0.0234, "step": 11770 }, { "epoch": 7.243924946170409, "grad_norm": 0.279296875, "learning_rate": 1.381172839506173e-05, "loss": 0.0242, "step": 11775 }, { "epoch": 7.247000922792987, "grad_norm": 0.25390625, "learning_rate": 1.3796296296296297e-05, "loss": 0.0235, "step": 11780 }, { "epoch": 7.250076899415564, "grad_norm": 0.259765625, "learning_rate": 1.3780864197530865e-05, "loss": 0.0229, "step": 11785 }, { "epoch": 7.2531528760381425, "grad_norm": 0.267578125, "learning_rate": 1.3765432098765432e-05, "loss": 0.0223, "step": 11790 }, { "epoch": 7.25622885266072, "grad_norm": 0.3046875, "learning_rate": 1.3750000000000002e-05, "loss": 0.0259, "step": 11795 }, { "epoch": 7.259304829283297, "grad_norm": 0.236328125, "learning_rate": 1.3734567901234569e-05, "loss": 0.0212, "step": 11800 }, { "epoch": 7.262380805905875, "grad_norm": 0.298828125, "learning_rate": 1.3719135802469136e-05, "loss": 0.0228, "step": 11805 }, { "epoch": 7.265456782528453, "grad_norm": 0.2734375, "learning_rate": 1.3703703703703704e-05, "loss": 0.0236, "step": 11810 }, { "epoch": 7.2685327591510305, "grad_norm": 0.28125, "learning_rate": 1.3688271604938271e-05, "loss": 0.0226, "step": 11815 }, { "epoch": 7.271608735773608, "grad_norm": 0.314453125, "learning_rate": 1.3672839506172841e-05, "loss": 0.0229, "step": 11820 }, { "epoch": 7.274684712396186, "grad_norm": 0.248046875, "learning_rate": 1.3657407407407408e-05, "loss": 0.0226, "step": 11825 }, { "epoch": 7.277760689018764, "grad_norm": 0.34375, "learning_rate": 1.3641975308641978e-05, "loss": 0.0234, "step": 11830 }, { "epoch": 7.280836665641341, "grad_norm": 0.275390625, "learning_rate": 1.3626543209876545e-05, "loss": 0.0241, "step": 11835 }, { "epoch": 7.283912642263918, "grad_norm": 0.271484375, "learning_rate": 1.3611111111111111e-05, "loss": 0.0231, "step": 11840 }, { "epoch": 7.286988618886497, "grad_norm": 0.28515625, "learning_rate": 1.359567901234568e-05, "loss": 0.0264, "step": 11845 }, { "epoch": 7.290064595509074, "grad_norm": 0.283203125, "learning_rate": 1.3580246913580247e-05, "loss": 0.0262, "step": 11850 }, { "epoch": 7.293140572131652, "grad_norm": 0.287109375, "learning_rate": 1.3564814814814817e-05, "loss": 0.025, "step": 11855 }, { "epoch": 7.29621654875423, "grad_norm": 0.302734375, "learning_rate": 1.3549382716049384e-05, "loss": 0.0246, "step": 11860 }, { "epoch": 7.299292525376807, "grad_norm": 0.2578125, "learning_rate": 1.3533950617283952e-05, "loss": 0.022, "step": 11865 }, { "epoch": 7.302368501999385, "grad_norm": 0.287109375, "learning_rate": 1.3518518518518519e-05, "loss": 0.0243, "step": 11870 }, { "epoch": 7.305444478621962, "grad_norm": 0.265625, "learning_rate": 1.3503086419753085e-05, "loss": 0.0248, "step": 11875 }, { "epoch": 7.30852045524454, "grad_norm": 0.3203125, "learning_rate": 1.3487654320987656e-05, "loss": 0.0254, "step": 11880 }, { "epoch": 7.311596431867118, "grad_norm": 0.27734375, "learning_rate": 1.3472222222222222e-05, "loss": 0.0226, "step": 11885 }, { "epoch": 7.314672408489695, "grad_norm": 0.2490234375, "learning_rate": 1.3456790123456793e-05, "loss": 0.0239, "step": 11890 }, { "epoch": 7.3177483851122735, "grad_norm": 0.29296875, "learning_rate": 1.344135802469136e-05, "loss": 0.0249, "step": 11895 }, { "epoch": 7.320824361734851, "grad_norm": 0.306640625, "learning_rate": 1.3425925925925928e-05, "loss": 0.0229, "step": 11900 }, { "epoch": 7.323900338357428, "grad_norm": 0.3203125, "learning_rate": 1.3410493827160494e-05, "loss": 0.0248, "step": 11905 }, { "epoch": 7.326976314980006, "grad_norm": 0.259765625, "learning_rate": 1.3395061728395061e-05, "loss": 0.0246, "step": 11910 }, { "epoch": 7.330052291602584, "grad_norm": 0.2470703125, "learning_rate": 1.3379629629629631e-05, "loss": 0.0211, "step": 11915 }, { "epoch": 7.3331282682251615, "grad_norm": 0.298828125, "learning_rate": 1.3364197530864198e-05, "loss": 0.0234, "step": 11920 }, { "epoch": 7.336204244847739, "grad_norm": 0.31640625, "learning_rate": 1.3348765432098767e-05, "loss": 0.0243, "step": 11925 }, { "epoch": 7.339280221470317, "grad_norm": 0.23046875, "learning_rate": 1.3333333333333333e-05, "loss": 0.0251, "step": 11930 }, { "epoch": 7.342356198092895, "grad_norm": 0.2099609375, "learning_rate": 1.33179012345679e-05, "loss": 0.0238, "step": 11935 }, { "epoch": 7.345432174715472, "grad_norm": 0.33984375, "learning_rate": 1.330246913580247e-05, "loss": 0.024, "step": 11940 }, { "epoch": 7.3485081513380495, "grad_norm": 0.3046875, "learning_rate": 1.3287037037037037e-05, "loss": 0.0241, "step": 11945 }, { "epoch": 7.351584127960628, "grad_norm": 0.294921875, "learning_rate": 1.3271604938271605e-05, "loss": 0.0247, "step": 11950 }, { "epoch": 7.354660104583205, "grad_norm": 0.25390625, "learning_rate": 1.3256172839506172e-05, "loss": 0.0218, "step": 11955 }, { "epoch": 7.357736081205783, "grad_norm": 0.248046875, "learning_rate": 1.3240740740740742e-05, "loss": 0.0246, "step": 11960 }, { "epoch": 7.360812057828361, "grad_norm": 0.30078125, "learning_rate": 1.3225308641975309e-05, "loss": 0.0242, "step": 11965 }, { "epoch": 7.363888034450938, "grad_norm": 0.267578125, "learning_rate": 1.3209876543209876e-05, "loss": 0.0226, "step": 11970 }, { "epoch": 7.366964011073516, "grad_norm": 0.2431640625, "learning_rate": 1.3194444444444446e-05, "loss": 0.022, "step": 11975 }, { "epoch": 7.370039987696093, "grad_norm": 0.2734375, "learning_rate": 1.3179012345679013e-05, "loss": 0.0234, "step": 11980 }, { "epoch": 7.3731159643186714, "grad_norm": 0.267578125, "learning_rate": 1.3163580246913581e-05, "loss": 0.0219, "step": 11985 }, { "epoch": 7.376191940941249, "grad_norm": 0.25390625, "learning_rate": 1.3148148148148148e-05, "loss": 0.0227, "step": 11990 }, { "epoch": 7.379267917563826, "grad_norm": 0.263671875, "learning_rate": 1.3132716049382718e-05, "loss": 0.0223, "step": 11995 }, { "epoch": 7.382343894186405, "grad_norm": 0.28515625, "learning_rate": 1.3117283950617285e-05, "loss": 0.0244, "step": 12000 }, { "epoch": 7.385419870808982, "grad_norm": 0.357421875, "learning_rate": 1.3101851851851852e-05, "loss": 0.0253, "step": 12005 }, { "epoch": 7.388495847431559, "grad_norm": 0.271484375, "learning_rate": 1.308641975308642e-05, "loss": 0.0228, "step": 12010 }, { "epoch": 7.391571824054137, "grad_norm": 0.291015625, "learning_rate": 1.3070987654320987e-05, "loss": 0.023, "step": 12015 }, { "epoch": 7.394647800676715, "grad_norm": 0.29296875, "learning_rate": 1.3055555555555557e-05, "loss": 0.0234, "step": 12020 }, { "epoch": 7.3977237772992925, "grad_norm": 0.28515625, "learning_rate": 1.3040123456790124e-05, "loss": 0.0238, "step": 12025 }, { "epoch": 7.40079975392187, "grad_norm": 0.298828125, "learning_rate": 1.3024691358024694e-05, "loss": 0.0218, "step": 12030 }, { "epoch": 7.403875730544448, "grad_norm": 0.294921875, "learning_rate": 1.300925925925926e-05, "loss": 0.0247, "step": 12035 }, { "epoch": 7.406951707167026, "grad_norm": 0.27734375, "learning_rate": 1.2993827160493827e-05, "loss": 0.0249, "step": 12040 }, { "epoch": 7.410027683789603, "grad_norm": 0.255859375, "learning_rate": 1.2978395061728396e-05, "loss": 0.023, "step": 12045 }, { "epoch": 7.4131036604121805, "grad_norm": 0.2451171875, "learning_rate": 1.2962962962962962e-05, "loss": 0.0242, "step": 12050 }, { "epoch": 7.416179637034759, "grad_norm": 0.2734375, "learning_rate": 1.2947530864197533e-05, "loss": 0.0238, "step": 12055 }, { "epoch": 7.419255613657336, "grad_norm": 0.294921875, "learning_rate": 1.29320987654321e-05, "loss": 0.0256, "step": 12060 }, { "epoch": 7.422331590279914, "grad_norm": 0.302734375, "learning_rate": 1.2916666666666668e-05, "loss": 0.0246, "step": 12065 }, { "epoch": 7.425407566902492, "grad_norm": 0.26171875, "learning_rate": 1.2901234567901235e-05, "loss": 0.0245, "step": 12070 }, { "epoch": 7.428483543525069, "grad_norm": 0.30078125, "learning_rate": 1.2885802469135801e-05, "loss": 0.0239, "step": 12075 }, { "epoch": 7.431559520147647, "grad_norm": 0.271484375, "learning_rate": 1.2870370370370371e-05, "loss": 0.0236, "step": 12080 }, { "epoch": 7.434635496770224, "grad_norm": 0.2734375, "learning_rate": 1.2854938271604938e-05, "loss": 0.0217, "step": 12085 }, { "epoch": 7.4377114733928025, "grad_norm": 0.2431640625, "learning_rate": 1.2839506172839508e-05, "loss": 0.022, "step": 12090 }, { "epoch": 7.44078745001538, "grad_norm": 0.271484375, "learning_rate": 1.2824074074074075e-05, "loss": 0.0251, "step": 12095 }, { "epoch": 7.443863426637957, "grad_norm": 0.298828125, "learning_rate": 1.2808641975308644e-05, "loss": 0.0223, "step": 12100 }, { "epoch": 7.446939403260536, "grad_norm": 0.302734375, "learning_rate": 1.279320987654321e-05, "loss": 0.0243, "step": 12105 }, { "epoch": 7.450015379883113, "grad_norm": 0.2578125, "learning_rate": 1.2777777777777777e-05, "loss": 0.024, "step": 12110 }, { "epoch": 7.45309135650569, "grad_norm": 0.2490234375, "learning_rate": 1.2762345679012347e-05, "loss": 0.0227, "step": 12115 }, { "epoch": 7.456167333128268, "grad_norm": 0.2890625, "learning_rate": 1.2746913580246914e-05, "loss": 0.0235, "step": 12120 }, { "epoch": 7.459243309750846, "grad_norm": 0.279296875, "learning_rate": 1.2731481481481482e-05, "loss": 0.0224, "step": 12125 }, { "epoch": 7.462319286373424, "grad_norm": 0.2890625, "learning_rate": 1.2716049382716049e-05, "loss": 0.0236, "step": 12130 }, { "epoch": 7.465395262996001, "grad_norm": 0.2890625, "learning_rate": 1.2700617283950616e-05, "loss": 0.0227, "step": 12135 }, { "epoch": 7.468471239618579, "grad_norm": 0.26171875, "learning_rate": 1.2685185185185186e-05, "loss": 0.025, "step": 12140 }, { "epoch": 7.471547216241157, "grad_norm": 0.26171875, "learning_rate": 1.2669753086419753e-05, "loss": 0.0236, "step": 12145 }, { "epoch": 7.474623192863734, "grad_norm": 0.28515625, "learning_rate": 1.2654320987654323e-05, "loss": 0.0239, "step": 12150 }, { "epoch": 7.4776991694863115, "grad_norm": 0.28125, "learning_rate": 1.263888888888889e-05, "loss": 0.0236, "step": 12155 }, { "epoch": 7.48077514610889, "grad_norm": 0.30859375, "learning_rate": 1.2623456790123458e-05, "loss": 0.024, "step": 12160 }, { "epoch": 7.483851122731467, "grad_norm": 0.296875, "learning_rate": 1.2608024691358025e-05, "loss": 0.0259, "step": 12165 }, { "epoch": 7.486927099354045, "grad_norm": 0.30859375, "learning_rate": 1.2592592592592592e-05, "loss": 0.0245, "step": 12170 }, { "epoch": 7.490003075976623, "grad_norm": 0.244140625, "learning_rate": 1.2577160493827162e-05, "loss": 0.0214, "step": 12175 }, { "epoch": 7.4930790525992, "grad_norm": 0.36328125, "learning_rate": 1.2561728395061729e-05, "loss": 0.0277, "step": 12180 }, { "epoch": 7.496155029221778, "grad_norm": 0.28515625, "learning_rate": 1.2546296296296297e-05, "loss": 0.024, "step": 12185 }, { "epoch": 7.499231005844355, "grad_norm": 0.30078125, "learning_rate": 1.2530864197530864e-05, "loss": 0.0255, "step": 12190 }, { "epoch": 7.5023069824669335, "grad_norm": 0.28125, "learning_rate": 1.2515432098765434e-05, "loss": 0.0232, "step": 12195 }, { "epoch": 7.505382959089511, "grad_norm": 0.310546875, "learning_rate": 1.25e-05, "loss": 0.0248, "step": 12200 }, { "epoch": 7.508458935712088, "grad_norm": 0.302734375, "learning_rate": 1.2484567901234569e-05, "loss": 0.0238, "step": 12205 }, { "epoch": 7.511534912334667, "grad_norm": 0.27734375, "learning_rate": 1.2469135802469137e-05, "loss": 0.0229, "step": 12210 }, { "epoch": 7.514610888957244, "grad_norm": 0.287109375, "learning_rate": 1.2453703703703704e-05, "loss": 0.0229, "step": 12215 }, { "epoch": 7.5176868655798215, "grad_norm": 0.2890625, "learning_rate": 1.2438271604938271e-05, "loss": 0.0244, "step": 12220 }, { "epoch": 7.520762842202399, "grad_norm": 0.2412109375, "learning_rate": 1.242283950617284e-05, "loss": 0.0233, "step": 12225 }, { "epoch": 7.523838818824977, "grad_norm": 0.287109375, "learning_rate": 1.2407407407407408e-05, "loss": 0.0245, "step": 12230 }, { "epoch": 7.526914795447555, "grad_norm": 0.2734375, "learning_rate": 1.2391975308641976e-05, "loss": 0.0279, "step": 12235 }, { "epoch": 7.529990772070132, "grad_norm": 0.28515625, "learning_rate": 1.2376543209876545e-05, "loss": 0.0223, "step": 12240 }, { "epoch": 7.53306674869271, "grad_norm": 0.271484375, "learning_rate": 1.2361111111111112e-05, "loss": 0.0235, "step": 12245 }, { "epoch": 7.536142725315288, "grad_norm": 0.291015625, "learning_rate": 1.2345679012345678e-05, "loss": 0.0227, "step": 12250 }, { "epoch": 7.539218701937865, "grad_norm": 0.28515625, "learning_rate": 1.2330246913580247e-05, "loss": 0.0235, "step": 12255 }, { "epoch": 7.5422946785604426, "grad_norm": 0.234375, "learning_rate": 1.2314814814814815e-05, "loss": 0.0193, "step": 12260 }, { "epoch": 7.545370655183021, "grad_norm": 0.306640625, "learning_rate": 1.2299382716049384e-05, "loss": 0.0239, "step": 12265 }, { "epoch": 7.548446631805598, "grad_norm": 0.279296875, "learning_rate": 1.2283950617283952e-05, "loss": 0.0237, "step": 12270 }, { "epoch": 7.551522608428176, "grad_norm": 0.28125, "learning_rate": 1.2268518518518519e-05, "loss": 0.0229, "step": 12275 }, { "epoch": 7.554598585050754, "grad_norm": 0.298828125, "learning_rate": 1.2253086419753087e-05, "loss": 0.0256, "step": 12280 }, { "epoch": 7.557674561673331, "grad_norm": 0.265625, "learning_rate": 1.2237654320987654e-05, "loss": 0.024, "step": 12285 }, { "epoch": 7.560750538295909, "grad_norm": 0.302734375, "learning_rate": 1.2222222222222222e-05, "loss": 0.0237, "step": 12290 }, { "epoch": 7.563826514918486, "grad_norm": 0.283203125, "learning_rate": 1.2206790123456791e-05, "loss": 0.0259, "step": 12295 }, { "epoch": 7.5669024915410645, "grad_norm": 0.248046875, "learning_rate": 1.219135802469136e-05, "loss": 0.0241, "step": 12300 }, { "epoch": 7.569978468163642, "grad_norm": 0.298828125, "learning_rate": 1.2175925925925926e-05, "loss": 0.0245, "step": 12305 }, { "epoch": 7.573054444786219, "grad_norm": 0.2421875, "learning_rate": 1.2160493827160495e-05, "loss": 0.0225, "step": 12310 }, { "epoch": 7.576130421408798, "grad_norm": 0.24609375, "learning_rate": 1.2145061728395063e-05, "loss": 0.0237, "step": 12315 }, { "epoch": 7.579206398031375, "grad_norm": 0.30859375, "learning_rate": 1.212962962962963e-05, "loss": 0.0259, "step": 12320 }, { "epoch": 7.5822823746539525, "grad_norm": 0.259765625, "learning_rate": 1.2114197530864198e-05, "loss": 0.023, "step": 12325 }, { "epoch": 7.585358351276531, "grad_norm": 0.310546875, "learning_rate": 1.2098765432098767e-05, "loss": 0.0269, "step": 12330 }, { "epoch": 7.588434327899108, "grad_norm": 0.26953125, "learning_rate": 1.2083333333333333e-05, "loss": 0.0213, "step": 12335 }, { "epoch": 7.591510304521686, "grad_norm": 0.2470703125, "learning_rate": 1.2067901234567902e-05, "loss": 0.0214, "step": 12340 }, { "epoch": 7.594586281144263, "grad_norm": 0.2734375, "learning_rate": 1.205246913580247e-05, "loss": 0.0225, "step": 12345 }, { "epoch": 7.597662257766841, "grad_norm": 0.25390625, "learning_rate": 1.2037037037037037e-05, "loss": 0.0249, "step": 12350 }, { "epoch": 7.600738234389419, "grad_norm": 0.296875, "learning_rate": 1.2021604938271605e-05, "loss": 0.0245, "step": 12355 }, { "epoch": 7.603814211011996, "grad_norm": 0.24609375, "learning_rate": 1.2006172839506174e-05, "loss": 0.0238, "step": 12360 }, { "epoch": 7.6068901876345745, "grad_norm": 0.30078125, "learning_rate": 1.199074074074074e-05, "loss": 0.0247, "step": 12365 }, { "epoch": 7.609966164257152, "grad_norm": 0.318359375, "learning_rate": 1.1975308641975309e-05, "loss": 0.0235, "step": 12370 }, { "epoch": 7.613042140879729, "grad_norm": 0.279296875, "learning_rate": 1.1959876543209878e-05, "loss": 0.0239, "step": 12375 }, { "epoch": 7.616118117502307, "grad_norm": 0.337890625, "learning_rate": 1.1944444444444446e-05, "loss": 0.0268, "step": 12380 }, { "epoch": 7.619194094124885, "grad_norm": 0.2255859375, "learning_rate": 1.1929012345679013e-05, "loss": 0.0219, "step": 12385 }, { "epoch": 7.622270070747462, "grad_norm": 0.28515625, "learning_rate": 1.1913580246913581e-05, "loss": 0.023, "step": 12390 }, { "epoch": 7.62534604737004, "grad_norm": 0.2890625, "learning_rate": 1.1898148148148148e-05, "loss": 0.0245, "step": 12395 }, { "epoch": 7.628422023992618, "grad_norm": 0.287109375, "learning_rate": 1.1882716049382716e-05, "loss": 0.0278, "step": 12400 }, { "epoch": 7.631498000615196, "grad_norm": 0.302734375, "learning_rate": 1.1867283950617285e-05, "loss": 0.023, "step": 12405 }, { "epoch": 7.634573977237773, "grad_norm": 0.265625, "learning_rate": 1.1851851851851853e-05, "loss": 0.0233, "step": 12410 }, { "epoch": 7.63764995386035, "grad_norm": 0.36328125, "learning_rate": 1.1836419753086422e-05, "loss": 0.023, "step": 12415 }, { "epoch": 7.640725930482929, "grad_norm": 0.30859375, "learning_rate": 1.1820987654320989e-05, "loss": 0.0237, "step": 12420 }, { "epoch": 7.643801907105506, "grad_norm": 0.275390625, "learning_rate": 1.1805555555555555e-05, "loss": 0.0242, "step": 12425 }, { "epoch": 7.6468778837280835, "grad_norm": 0.298828125, "learning_rate": 1.1790123456790124e-05, "loss": 0.0255, "step": 12430 }, { "epoch": 7.649953860350662, "grad_norm": 0.2734375, "learning_rate": 1.1774691358024692e-05, "loss": 0.0239, "step": 12435 }, { "epoch": 7.653029836973239, "grad_norm": 0.28515625, "learning_rate": 1.175925925925926e-05, "loss": 0.0259, "step": 12440 }, { "epoch": 7.656105813595817, "grad_norm": 0.265625, "learning_rate": 1.1743827160493829e-05, "loss": 0.0217, "step": 12445 }, { "epoch": 7.659181790218394, "grad_norm": 0.259765625, "learning_rate": 1.1728395061728396e-05, "loss": 0.0216, "step": 12450 }, { "epoch": 7.662257766840972, "grad_norm": 0.353515625, "learning_rate": 1.1712962962962963e-05, "loss": 0.0241, "step": 12455 }, { "epoch": 7.66533374346355, "grad_norm": 0.2890625, "learning_rate": 1.1697530864197531e-05, "loss": 0.0244, "step": 12460 }, { "epoch": 7.668409720086127, "grad_norm": 0.25390625, "learning_rate": 1.16820987654321e-05, "loss": 0.0239, "step": 12465 }, { "epoch": 7.6714856967087055, "grad_norm": 0.30078125, "learning_rate": 1.1666666666666668e-05, "loss": 0.0244, "step": 12470 }, { "epoch": 7.674561673331283, "grad_norm": 0.302734375, "learning_rate": 1.1651234567901236e-05, "loss": 0.0245, "step": 12475 }, { "epoch": 7.67763764995386, "grad_norm": 0.29296875, "learning_rate": 1.1635802469135803e-05, "loss": 0.024, "step": 12480 }, { "epoch": 7.680713626576438, "grad_norm": 0.3046875, "learning_rate": 1.162037037037037e-05, "loss": 0.026, "step": 12485 }, { "epoch": 7.683789603199016, "grad_norm": 0.263671875, "learning_rate": 1.1604938271604938e-05, "loss": 0.023, "step": 12490 }, { "epoch": 7.6868655798215935, "grad_norm": 0.318359375, "learning_rate": 1.1589506172839507e-05, "loss": 0.025, "step": 12495 }, { "epoch": 7.689941556444171, "grad_norm": 0.275390625, "learning_rate": 1.1574074074074075e-05, "loss": 0.0216, "step": 12500 }, { "epoch": 7.693017533066749, "grad_norm": 0.26171875, "learning_rate": 1.1558641975308642e-05, "loss": 0.0255, "step": 12505 }, { "epoch": 7.696093509689327, "grad_norm": 0.2890625, "learning_rate": 1.154320987654321e-05, "loss": 0.0251, "step": 12510 }, { "epoch": 7.699169486311904, "grad_norm": 0.263671875, "learning_rate": 1.1527777777777779e-05, "loss": 0.0227, "step": 12515 }, { "epoch": 7.702245462934481, "grad_norm": 0.306640625, "learning_rate": 1.1512345679012346e-05, "loss": 0.0242, "step": 12520 }, { "epoch": 7.70532143955706, "grad_norm": 0.33203125, "learning_rate": 1.1496913580246914e-05, "loss": 0.0254, "step": 12525 }, { "epoch": 7.708397416179637, "grad_norm": 0.28125, "learning_rate": 1.1481481481481482e-05, "loss": 0.0245, "step": 12530 }, { "epoch": 7.7114733928022146, "grad_norm": 0.26953125, "learning_rate": 1.146604938271605e-05, "loss": 0.024, "step": 12535 }, { "epoch": 7.714549369424793, "grad_norm": 0.3046875, "learning_rate": 1.1450617283950618e-05, "loss": 0.0237, "step": 12540 }, { "epoch": 7.71762534604737, "grad_norm": 0.310546875, "learning_rate": 1.1435185185185186e-05, "loss": 0.0253, "step": 12545 }, { "epoch": 7.720701322669948, "grad_norm": 0.26171875, "learning_rate": 1.1419753086419753e-05, "loss": 0.0236, "step": 12550 }, { "epoch": 7.723777299292525, "grad_norm": 0.2734375, "learning_rate": 1.1404320987654321e-05, "loss": 0.0248, "step": 12555 }, { "epoch": 7.726853275915103, "grad_norm": 0.302734375, "learning_rate": 1.138888888888889e-05, "loss": 0.0244, "step": 12560 }, { "epoch": 7.729929252537681, "grad_norm": 0.2734375, "learning_rate": 1.1373456790123457e-05, "loss": 0.0234, "step": 12565 }, { "epoch": 7.733005229160258, "grad_norm": 0.2412109375, "learning_rate": 1.1358024691358025e-05, "loss": 0.0225, "step": 12570 }, { "epoch": 7.7360812057828365, "grad_norm": 0.271484375, "learning_rate": 1.1342592592592593e-05, "loss": 0.0247, "step": 12575 }, { "epoch": 7.739157182405414, "grad_norm": 0.330078125, "learning_rate": 1.1327160493827162e-05, "loss": 0.0244, "step": 12580 }, { "epoch": 7.742233159027991, "grad_norm": 0.291015625, "learning_rate": 1.1311728395061729e-05, "loss": 0.0264, "step": 12585 }, { "epoch": 7.745309135650569, "grad_norm": 0.27734375, "learning_rate": 1.1296296296296297e-05, "loss": 0.0225, "step": 12590 }, { "epoch": 7.748385112273147, "grad_norm": 0.29296875, "learning_rate": 1.1280864197530864e-05, "loss": 0.0237, "step": 12595 }, { "epoch": 7.7514610888957245, "grad_norm": 0.3125, "learning_rate": 1.1265432098765432e-05, "loss": 0.0265, "step": 12600 }, { "epoch": 7.754537065518302, "grad_norm": 0.26953125, "learning_rate": 1.125e-05, "loss": 0.0235, "step": 12605 }, { "epoch": 7.75761304214088, "grad_norm": 0.2578125, "learning_rate": 1.123456790123457e-05, "loss": 0.0224, "step": 12610 }, { "epoch": 7.760689018763458, "grad_norm": 0.296875, "learning_rate": 1.1219135802469136e-05, "loss": 0.0259, "step": 12615 }, { "epoch": 7.763764995386035, "grad_norm": 0.30078125, "learning_rate": 1.1203703703703704e-05, "loss": 0.0251, "step": 12620 }, { "epoch": 7.7668409720086125, "grad_norm": 0.263671875, "learning_rate": 1.1188271604938271e-05, "loss": 0.0235, "step": 12625 }, { "epoch": 7.769916948631191, "grad_norm": 0.2490234375, "learning_rate": 1.117283950617284e-05, "loss": 0.0239, "step": 12630 }, { "epoch": 7.772992925253768, "grad_norm": 0.25390625, "learning_rate": 1.1157407407407408e-05, "loss": 0.0226, "step": 12635 }, { "epoch": 7.776068901876346, "grad_norm": 0.22265625, "learning_rate": 1.1141975308641976e-05, "loss": 0.021, "step": 12640 }, { "epoch": 7.779144878498924, "grad_norm": 0.28515625, "learning_rate": 1.1126543209876545e-05, "loss": 0.023, "step": 12645 }, { "epoch": 7.782220855121501, "grad_norm": 0.2734375, "learning_rate": 1.1111111111111112e-05, "loss": 0.0247, "step": 12650 }, { "epoch": 7.785296831744079, "grad_norm": 0.326171875, "learning_rate": 1.1095679012345678e-05, "loss": 0.0262, "step": 12655 }, { "epoch": 7.788372808366656, "grad_norm": 0.265625, "learning_rate": 1.1080246913580247e-05, "loss": 0.0249, "step": 12660 }, { "epoch": 7.791448784989234, "grad_norm": 0.265625, "learning_rate": 1.1064814814814815e-05, "loss": 0.0226, "step": 12665 }, { "epoch": 7.794524761611812, "grad_norm": 0.287109375, "learning_rate": 1.1049382716049384e-05, "loss": 0.0242, "step": 12670 }, { "epoch": 7.797600738234389, "grad_norm": 0.306640625, "learning_rate": 1.1033950617283952e-05, "loss": 0.0245, "step": 12675 }, { "epoch": 7.800676714856968, "grad_norm": 0.283203125, "learning_rate": 1.1018518518518519e-05, "loss": 0.0219, "step": 12680 }, { "epoch": 7.803752691479545, "grad_norm": 0.283203125, "learning_rate": 1.1003086419753086e-05, "loss": 0.0246, "step": 12685 }, { "epoch": 7.806828668102122, "grad_norm": 0.296875, "learning_rate": 1.0987654320987654e-05, "loss": 0.0235, "step": 12690 }, { "epoch": 7.8099046447247, "grad_norm": 0.244140625, "learning_rate": 1.0972222222222223e-05, "loss": 0.0222, "step": 12695 }, { "epoch": 7.812980621347278, "grad_norm": 0.296875, "learning_rate": 1.0956790123456791e-05, "loss": 0.0249, "step": 12700 }, { "epoch": 7.8160565979698555, "grad_norm": 0.330078125, "learning_rate": 1.094135802469136e-05, "loss": 0.0254, "step": 12705 }, { "epoch": 7.819132574592433, "grad_norm": 0.26171875, "learning_rate": 1.0925925925925926e-05, "loss": 0.0231, "step": 12710 }, { "epoch": 7.822208551215011, "grad_norm": 0.271484375, "learning_rate": 1.0910493827160493e-05, "loss": 0.0228, "step": 12715 }, { "epoch": 7.825284527837589, "grad_norm": 0.2294921875, "learning_rate": 1.0895061728395061e-05, "loss": 0.0226, "step": 12720 }, { "epoch": 7.828360504460166, "grad_norm": 0.287109375, "learning_rate": 1.087962962962963e-05, "loss": 0.0239, "step": 12725 }, { "epoch": 7.8314364810827435, "grad_norm": 0.2890625, "learning_rate": 1.0864197530864198e-05, "loss": 0.0249, "step": 12730 }, { "epoch": 7.834512457705322, "grad_norm": 0.26953125, "learning_rate": 1.0848765432098767e-05, "loss": 0.0222, "step": 12735 }, { "epoch": 7.837588434327899, "grad_norm": 0.265625, "learning_rate": 1.0833333333333334e-05, "loss": 0.0221, "step": 12740 }, { "epoch": 7.840664410950477, "grad_norm": 0.283203125, "learning_rate": 1.0817901234567902e-05, "loss": 0.0233, "step": 12745 }, { "epoch": 7.843740387573055, "grad_norm": 0.275390625, "learning_rate": 1.0802469135802469e-05, "loss": 0.0243, "step": 12750 }, { "epoch": 7.846816364195632, "grad_norm": 0.326171875, "learning_rate": 1.0787037037037037e-05, "loss": 0.0265, "step": 12755 }, { "epoch": 7.84989234081821, "grad_norm": 0.2578125, "learning_rate": 1.0771604938271606e-05, "loss": 0.0247, "step": 12760 }, { "epoch": 7.852968317440787, "grad_norm": 0.271484375, "learning_rate": 1.0756172839506174e-05, "loss": 0.0246, "step": 12765 }, { "epoch": 7.8560442940633655, "grad_norm": 0.412109375, "learning_rate": 1.074074074074074e-05, "loss": 0.0262, "step": 12770 }, { "epoch": 7.859120270685943, "grad_norm": 0.26953125, "learning_rate": 1.072530864197531e-05, "loss": 0.0233, "step": 12775 }, { "epoch": 7.86219624730852, "grad_norm": 0.275390625, "learning_rate": 1.0709876543209878e-05, "loss": 0.0246, "step": 12780 }, { "epoch": 7.865272223931099, "grad_norm": 0.25, "learning_rate": 1.0694444444444444e-05, "loss": 0.0237, "step": 12785 }, { "epoch": 7.868348200553676, "grad_norm": 0.267578125, "learning_rate": 1.0679012345679013e-05, "loss": 0.0227, "step": 12790 }, { "epoch": 7.871424177176253, "grad_norm": 0.333984375, "learning_rate": 1.0663580246913581e-05, "loss": 0.0223, "step": 12795 }, { "epoch": 7.874500153798831, "grad_norm": 0.30078125, "learning_rate": 1.0648148148148148e-05, "loss": 0.0239, "step": 12800 }, { "epoch": 7.877576130421409, "grad_norm": 0.259765625, "learning_rate": 1.0632716049382717e-05, "loss": 0.0248, "step": 12805 }, { "epoch": 7.8806521070439866, "grad_norm": 0.28515625, "learning_rate": 1.0617283950617285e-05, "loss": 0.0231, "step": 12810 }, { "epoch": 7.883728083666564, "grad_norm": 0.291015625, "learning_rate": 1.0601851851851852e-05, "loss": 0.0248, "step": 12815 }, { "epoch": 7.886804060289142, "grad_norm": 0.2890625, "learning_rate": 1.058641975308642e-05, "loss": 0.0251, "step": 12820 }, { "epoch": 7.88988003691172, "grad_norm": 0.2890625, "learning_rate": 1.0570987654320989e-05, "loss": 0.0238, "step": 12825 }, { "epoch": 7.892956013534297, "grad_norm": 0.28125, "learning_rate": 1.0555555555555555e-05, "loss": 0.0257, "step": 12830 }, { "epoch": 7.8960319901568745, "grad_norm": 0.306640625, "learning_rate": 1.0540123456790124e-05, "loss": 0.024, "step": 12835 }, { "epoch": 7.899107966779453, "grad_norm": 0.27734375, "learning_rate": 1.0524691358024692e-05, "loss": 0.0228, "step": 12840 }, { "epoch": 7.90218394340203, "grad_norm": 0.26953125, "learning_rate": 1.050925925925926e-05, "loss": 0.0249, "step": 12845 }, { "epoch": 7.905259920024608, "grad_norm": 0.265625, "learning_rate": 1.0493827160493827e-05, "loss": 0.0244, "step": 12850 }, { "epoch": 7.908335896647186, "grad_norm": 0.291015625, "learning_rate": 1.0478395061728396e-05, "loss": 0.0254, "step": 12855 }, { "epoch": 7.911411873269763, "grad_norm": 0.283203125, "learning_rate": 1.0462962962962963e-05, "loss": 0.0227, "step": 12860 }, { "epoch": 7.914487849892341, "grad_norm": 0.27734375, "learning_rate": 1.0447530864197531e-05, "loss": 0.0236, "step": 12865 }, { "epoch": 7.917563826514918, "grad_norm": 0.28125, "learning_rate": 1.04320987654321e-05, "loss": 0.0248, "step": 12870 }, { "epoch": 7.9206398031374965, "grad_norm": 0.31640625, "learning_rate": 1.0416666666666668e-05, "loss": 0.0235, "step": 12875 }, { "epoch": 7.923715779760074, "grad_norm": 0.26171875, "learning_rate": 1.0401234567901236e-05, "loss": 0.0227, "step": 12880 }, { "epoch": 7.926791756382651, "grad_norm": 0.271484375, "learning_rate": 1.0385802469135803e-05, "loss": 0.0238, "step": 12885 }, { "epoch": 7.92986773300523, "grad_norm": 0.25, "learning_rate": 1.037037037037037e-05, "loss": 0.0241, "step": 12890 }, { "epoch": 7.932943709627807, "grad_norm": 0.2451171875, "learning_rate": 1.0354938271604938e-05, "loss": 0.0232, "step": 12895 }, { "epoch": 7.9360196862503845, "grad_norm": 0.298828125, "learning_rate": 1.0339506172839507e-05, "loss": 0.0285, "step": 12900 }, { "epoch": 7.939095662872962, "grad_norm": 0.26953125, "learning_rate": 1.0324074074074075e-05, "loss": 0.0235, "step": 12905 }, { "epoch": 7.94217163949554, "grad_norm": 0.291015625, "learning_rate": 1.0308641975308644e-05, "loss": 0.0234, "step": 12910 }, { "epoch": 7.945247616118118, "grad_norm": 0.322265625, "learning_rate": 1.029320987654321e-05, "loss": 0.0233, "step": 12915 }, { "epoch": 7.948323592740695, "grad_norm": 0.27734375, "learning_rate": 1.0277777777777777e-05, "loss": 0.0224, "step": 12920 }, { "epoch": 7.951399569363273, "grad_norm": 0.267578125, "learning_rate": 1.0262345679012346e-05, "loss": 0.0248, "step": 12925 }, { "epoch": 7.954475545985851, "grad_norm": 0.271484375, "learning_rate": 1.0246913580246914e-05, "loss": 0.024, "step": 12930 }, { "epoch": 7.957551522608428, "grad_norm": 0.298828125, "learning_rate": 1.0231481481481483e-05, "loss": 0.0223, "step": 12935 }, { "epoch": 7.9606274992310055, "grad_norm": 0.283203125, "learning_rate": 1.0216049382716051e-05, "loss": 0.0212, "step": 12940 }, { "epoch": 7.963703475853584, "grad_norm": 0.2578125, "learning_rate": 1.0200617283950618e-05, "loss": 0.0233, "step": 12945 }, { "epoch": 7.966779452476161, "grad_norm": 0.302734375, "learning_rate": 1.0185185185185185e-05, "loss": 0.0243, "step": 12950 }, { "epoch": 7.969855429098739, "grad_norm": 0.2734375, "learning_rate": 1.0169753086419753e-05, "loss": 0.0237, "step": 12955 }, { "epoch": 7.972931405721317, "grad_norm": 0.302734375, "learning_rate": 1.0154320987654321e-05, "loss": 0.0238, "step": 12960 }, { "epoch": 7.976007382343894, "grad_norm": 0.310546875, "learning_rate": 1.013888888888889e-05, "loss": 0.0237, "step": 12965 }, { "epoch": 7.979083358966472, "grad_norm": 0.279296875, "learning_rate": 1.0123456790123458e-05, "loss": 0.0238, "step": 12970 }, { "epoch": 7.982159335589049, "grad_norm": 0.2353515625, "learning_rate": 1.0108024691358025e-05, "loss": 0.0229, "step": 12975 }, { "epoch": 7.9852353122116275, "grad_norm": 0.291015625, "learning_rate": 1.0092592592592594e-05, "loss": 0.0239, "step": 12980 }, { "epoch": 7.988311288834205, "grad_norm": 0.236328125, "learning_rate": 1.007716049382716e-05, "loss": 0.0226, "step": 12985 }, { "epoch": 7.991387265456782, "grad_norm": 0.294921875, "learning_rate": 1.0061728395061729e-05, "loss": 0.0251, "step": 12990 }, { "epoch": 7.994463242079361, "grad_norm": 0.287109375, "learning_rate": 1.0046296296296297e-05, "loss": 0.0251, "step": 12995 }, { "epoch": 7.997539218701938, "grad_norm": 0.26953125, "learning_rate": 1.0030864197530866e-05, "loss": 0.0245, "step": 13000 }, { "epoch": 8.000615195324515, "grad_norm": 0.287109375, "learning_rate": 1.0015432098765432e-05, "loss": 0.0257, "step": 13005 }, { "epoch": 8.003691171947093, "grad_norm": 0.302734375, "learning_rate": 1e-05, "loss": 0.0255, "step": 13010 }, { "epoch": 8.00676714856967, "grad_norm": 0.25390625, "learning_rate": 9.984567901234568e-06, "loss": 0.0237, "step": 13015 }, { "epoch": 8.009843125192248, "grad_norm": 0.291015625, "learning_rate": 9.969135802469136e-06, "loss": 0.0222, "step": 13020 }, { "epoch": 8.012919101814827, "grad_norm": 0.279296875, "learning_rate": 9.953703703703704e-06, "loss": 0.021, "step": 13025 }, { "epoch": 8.015995078437404, "grad_norm": 0.2734375, "learning_rate": 9.938271604938273e-06, "loss": 0.0246, "step": 13030 }, { "epoch": 8.019071055059982, "grad_norm": 0.259765625, "learning_rate": 9.92283950617284e-06, "loss": 0.0221, "step": 13035 }, { "epoch": 8.02214703168256, "grad_norm": 0.27734375, "learning_rate": 9.907407407407408e-06, "loss": 0.0232, "step": 13040 }, { "epoch": 8.025223008305137, "grad_norm": 0.259765625, "learning_rate": 9.891975308641977e-06, "loss": 0.0228, "step": 13045 }, { "epoch": 8.028298984927714, "grad_norm": 0.2734375, "learning_rate": 9.876543209876543e-06, "loss": 0.0214, "step": 13050 }, { "epoch": 8.031374961550291, "grad_norm": 0.267578125, "learning_rate": 9.861111111111112e-06, "loss": 0.0214, "step": 13055 }, { "epoch": 8.03445093817287, "grad_norm": 0.271484375, "learning_rate": 9.84567901234568e-06, "loss": 0.0219, "step": 13060 }, { "epoch": 8.037526914795448, "grad_norm": 0.279296875, "learning_rate": 9.830246913580247e-06, "loss": 0.0205, "step": 13065 }, { "epoch": 8.040602891418025, "grad_norm": 0.28125, "learning_rate": 9.814814814814815e-06, "loss": 0.0234, "step": 13070 }, { "epoch": 8.043678868040603, "grad_norm": 0.2451171875, "learning_rate": 9.799382716049384e-06, "loss": 0.0233, "step": 13075 }, { "epoch": 8.04675484466318, "grad_norm": 0.28125, "learning_rate": 9.78395061728395e-06, "loss": 0.0235, "step": 13080 }, { "epoch": 8.049830821285758, "grad_norm": 0.26953125, "learning_rate": 9.768518518518519e-06, "loss": 0.0216, "step": 13085 }, { "epoch": 8.052906797908335, "grad_norm": 0.234375, "learning_rate": 9.753086419753086e-06, "loss": 0.0213, "step": 13090 }, { "epoch": 8.055982774530914, "grad_norm": 0.283203125, "learning_rate": 9.737654320987654e-06, "loss": 0.0232, "step": 13095 }, { "epoch": 8.059058751153492, "grad_norm": 0.25, "learning_rate": 9.722222222222223e-06, "loss": 0.0209, "step": 13100 }, { "epoch": 8.062134727776069, "grad_norm": 0.28515625, "learning_rate": 9.706790123456791e-06, "loss": 0.0231, "step": 13105 }, { "epoch": 8.065210704398647, "grad_norm": 0.287109375, "learning_rate": 9.69135802469136e-06, "loss": 0.0237, "step": 13110 }, { "epoch": 8.068286681021224, "grad_norm": 0.26953125, "learning_rate": 9.675925925925926e-06, "loss": 0.0213, "step": 13115 }, { "epoch": 8.071362657643801, "grad_norm": 0.271484375, "learning_rate": 9.660493827160493e-06, "loss": 0.0235, "step": 13120 }, { "epoch": 8.074438634266379, "grad_norm": 0.25, "learning_rate": 9.645061728395062e-06, "loss": 0.0225, "step": 13125 }, { "epoch": 8.077514610888958, "grad_norm": 0.255859375, "learning_rate": 9.62962962962963e-06, "loss": 0.0225, "step": 13130 }, { "epoch": 8.080590587511535, "grad_norm": 0.26171875, "learning_rate": 9.614197530864198e-06, "loss": 0.0235, "step": 13135 }, { "epoch": 8.083666564134113, "grad_norm": 0.24609375, "learning_rate": 9.598765432098767e-06, "loss": 0.0232, "step": 13140 }, { "epoch": 8.08674254075669, "grad_norm": 0.28125, "learning_rate": 9.583333333333334e-06, "loss": 0.0236, "step": 13145 }, { "epoch": 8.089818517379268, "grad_norm": 0.232421875, "learning_rate": 9.5679012345679e-06, "loss": 0.0211, "step": 13150 }, { "epoch": 8.092894494001845, "grad_norm": 0.279296875, "learning_rate": 9.552469135802469e-06, "loss": 0.0234, "step": 13155 }, { "epoch": 8.095970470624422, "grad_norm": 0.30859375, "learning_rate": 9.537037037037037e-06, "loss": 0.0241, "step": 13160 }, { "epoch": 8.099046447247002, "grad_norm": 0.296875, "learning_rate": 9.521604938271606e-06, "loss": 0.0235, "step": 13165 }, { "epoch": 8.102122423869579, "grad_norm": 0.3203125, "learning_rate": 9.506172839506174e-06, "loss": 0.0226, "step": 13170 }, { "epoch": 8.105198400492156, "grad_norm": 0.27734375, "learning_rate": 9.490740740740741e-06, "loss": 0.0238, "step": 13175 }, { "epoch": 8.108274377114734, "grad_norm": 0.291015625, "learning_rate": 9.475308641975308e-06, "loss": 0.0252, "step": 13180 }, { "epoch": 8.111350353737311, "grad_norm": 0.27734375, "learning_rate": 9.459876543209876e-06, "loss": 0.0236, "step": 13185 }, { "epoch": 8.114426330359889, "grad_norm": 0.2578125, "learning_rate": 9.444444444444445e-06, "loss": 0.0225, "step": 13190 }, { "epoch": 8.117502306982466, "grad_norm": 0.248046875, "learning_rate": 9.429012345679013e-06, "loss": 0.0221, "step": 13195 }, { "epoch": 8.120578283605045, "grad_norm": 0.291015625, "learning_rate": 9.413580246913581e-06, "loss": 0.0234, "step": 13200 }, { "epoch": 8.123654260227623, "grad_norm": 0.24609375, "learning_rate": 9.398148148148148e-06, "loss": 0.0231, "step": 13205 }, { "epoch": 8.1267302368502, "grad_norm": 0.26953125, "learning_rate": 9.382716049382717e-06, "loss": 0.0238, "step": 13210 }, { "epoch": 8.129806213472778, "grad_norm": 0.30078125, "learning_rate": 9.367283950617283e-06, "loss": 0.0228, "step": 13215 }, { "epoch": 8.132882190095355, "grad_norm": 0.32421875, "learning_rate": 9.351851851851852e-06, "loss": 0.0221, "step": 13220 }, { "epoch": 8.135958166717932, "grad_norm": 0.30859375, "learning_rate": 9.33641975308642e-06, "loss": 0.0238, "step": 13225 }, { "epoch": 8.13903414334051, "grad_norm": 0.259765625, "learning_rate": 9.320987654320989e-06, "loss": 0.0204, "step": 13230 }, { "epoch": 8.142110119963089, "grad_norm": 0.2578125, "learning_rate": 9.305555555555555e-06, "loss": 0.0227, "step": 13235 }, { "epoch": 8.145186096585666, "grad_norm": 0.2431640625, "learning_rate": 9.290123456790124e-06, "loss": 0.0245, "step": 13240 }, { "epoch": 8.148262073208244, "grad_norm": 0.265625, "learning_rate": 9.274691358024692e-06, "loss": 0.0225, "step": 13245 }, { "epoch": 8.151338049830821, "grad_norm": 0.279296875, "learning_rate": 9.259259259259259e-06, "loss": 0.0212, "step": 13250 }, { "epoch": 8.154414026453399, "grad_norm": 0.328125, "learning_rate": 9.243827160493828e-06, "loss": 0.0227, "step": 13255 }, { "epoch": 8.157490003075976, "grad_norm": 0.296875, "learning_rate": 9.228395061728396e-06, "loss": 0.0235, "step": 13260 }, { "epoch": 8.160565979698553, "grad_norm": 0.31640625, "learning_rate": 9.212962962962963e-06, "loss": 0.0279, "step": 13265 }, { "epoch": 8.163641956321133, "grad_norm": 0.283203125, "learning_rate": 9.197530864197531e-06, "loss": 0.0227, "step": 13270 }, { "epoch": 8.16671793294371, "grad_norm": 0.25390625, "learning_rate": 9.1820987654321e-06, "loss": 0.0215, "step": 13275 }, { "epoch": 8.169793909566287, "grad_norm": 0.255859375, "learning_rate": 9.166666666666666e-06, "loss": 0.0211, "step": 13280 }, { "epoch": 8.172869886188865, "grad_norm": 0.26171875, "learning_rate": 9.151234567901235e-06, "loss": 0.0248, "step": 13285 }, { "epoch": 8.175945862811442, "grad_norm": 0.294921875, "learning_rate": 9.135802469135803e-06, "loss": 0.0224, "step": 13290 }, { "epoch": 8.17902183943402, "grad_norm": 0.294921875, "learning_rate": 9.12037037037037e-06, "loss": 0.0219, "step": 13295 }, { "epoch": 8.182097816056597, "grad_norm": 0.28125, "learning_rate": 9.104938271604939e-06, "loss": 0.0234, "step": 13300 }, { "epoch": 8.185173792679176, "grad_norm": 0.251953125, "learning_rate": 9.089506172839507e-06, "loss": 0.0221, "step": 13305 }, { "epoch": 8.188249769301754, "grad_norm": 0.28125, "learning_rate": 9.074074074074075e-06, "loss": 0.0237, "step": 13310 }, { "epoch": 8.191325745924331, "grad_norm": 0.28125, "learning_rate": 9.058641975308642e-06, "loss": 0.0228, "step": 13315 }, { "epoch": 8.194401722546909, "grad_norm": 0.294921875, "learning_rate": 9.04320987654321e-06, "loss": 0.0229, "step": 13320 }, { "epoch": 8.197477699169486, "grad_norm": 0.29296875, "learning_rate": 9.027777777777777e-06, "loss": 0.0229, "step": 13325 }, { "epoch": 8.200553675792063, "grad_norm": 0.275390625, "learning_rate": 9.012345679012346e-06, "loss": 0.0213, "step": 13330 }, { "epoch": 8.20362965241464, "grad_norm": 0.291015625, "learning_rate": 8.996913580246914e-06, "loss": 0.0238, "step": 13335 }, { "epoch": 8.20670562903722, "grad_norm": 0.275390625, "learning_rate": 8.981481481481483e-06, "loss": 0.0234, "step": 13340 }, { "epoch": 8.209781605659797, "grad_norm": 0.2890625, "learning_rate": 8.966049382716051e-06, "loss": 0.024, "step": 13345 }, { "epoch": 8.212857582282375, "grad_norm": 0.267578125, "learning_rate": 8.950617283950618e-06, "loss": 0.0215, "step": 13350 }, { "epoch": 8.215933558904952, "grad_norm": 0.287109375, "learning_rate": 8.935185185185185e-06, "loss": 0.0233, "step": 13355 }, { "epoch": 8.21900953552753, "grad_norm": 0.2294921875, "learning_rate": 8.919753086419753e-06, "loss": 0.0214, "step": 13360 }, { "epoch": 8.222085512150107, "grad_norm": 0.265625, "learning_rate": 8.904320987654322e-06, "loss": 0.0223, "step": 13365 }, { "epoch": 8.225161488772684, "grad_norm": 0.275390625, "learning_rate": 8.88888888888889e-06, "loss": 0.0236, "step": 13370 }, { "epoch": 8.228237465395264, "grad_norm": 0.310546875, "learning_rate": 8.873456790123458e-06, "loss": 0.025, "step": 13375 }, { "epoch": 8.231313442017841, "grad_norm": 0.259765625, "learning_rate": 8.858024691358025e-06, "loss": 0.0222, "step": 13380 }, { "epoch": 8.234389418640419, "grad_norm": 0.30078125, "learning_rate": 8.842592592592592e-06, "loss": 0.0244, "step": 13385 }, { "epoch": 8.237465395262996, "grad_norm": 0.259765625, "learning_rate": 8.82716049382716e-06, "loss": 0.0216, "step": 13390 }, { "epoch": 8.240541371885573, "grad_norm": 0.27734375, "learning_rate": 8.811728395061729e-06, "loss": 0.0237, "step": 13395 }, { "epoch": 8.24361734850815, "grad_norm": 0.265625, "learning_rate": 8.796296296296297e-06, "loss": 0.0235, "step": 13400 }, { "epoch": 8.246693325130728, "grad_norm": 0.302734375, "learning_rate": 8.780864197530866e-06, "loss": 0.0217, "step": 13405 }, { "epoch": 8.249769301753307, "grad_norm": 0.263671875, "learning_rate": 8.765432098765432e-06, "loss": 0.0215, "step": 13410 }, { "epoch": 8.252845278375885, "grad_norm": 0.27734375, "learning_rate": 8.75e-06, "loss": 0.0219, "step": 13415 }, { "epoch": 8.255921254998462, "grad_norm": 0.275390625, "learning_rate": 8.734567901234568e-06, "loss": 0.0234, "step": 13420 }, { "epoch": 8.25899723162104, "grad_norm": 0.3046875, "learning_rate": 8.719135802469136e-06, "loss": 0.0248, "step": 13425 }, { "epoch": 8.262073208243617, "grad_norm": 0.2734375, "learning_rate": 8.703703703703705e-06, "loss": 0.0239, "step": 13430 }, { "epoch": 8.265149184866194, "grad_norm": 0.28125, "learning_rate": 8.688271604938273e-06, "loss": 0.021, "step": 13435 }, { "epoch": 8.268225161488772, "grad_norm": 0.255859375, "learning_rate": 8.67283950617284e-06, "loss": 0.0237, "step": 13440 }, { "epoch": 8.271301138111351, "grad_norm": 0.29296875, "learning_rate": 8.657407407407407e-06, "loss": 0.0224, "step": 13445 }, { "epoch": 8.274377114733928, "grad_norm": 0.287109375, "learning_rate": 8.641975308641975e-06, "loss": 0.0213, "step": 13450 }, { "epoch": 8.277453091356506, "grad_norm": 0.287109375, "learning_rate": 8.626543209876543e-06, "loss": 0.0232, "step": 13455 }, { "epoch": 8.280529067979083, "grad_norm": 0.283203125, "learning_rate": 8.611111111111112e-06, "loss": 0.0221, "step": 13460 }, { "epoch": 8.28360504460166, "grad_norm": 0.2734375, "learning_rate": 8.59567901234568e-06, "loss": 0.0234, "step": 13465 }, { "epoch": 8.286681021224238, "grad_norm": 0.27734375, "learning_rate": 8.580246913580247e-06, "loss": 0.0235, "step": 13470 }, { "epoch": 8.289756997846816, "grad_norm": 0.255859375, "learning_rate": 8.564814814814816e-06, "loss": 0.0233, "step": 13475 }, { "epoch": 8.292832974469395, "grad_norm": 0.29296875, "learning_rate": 8.549382716049382e-06, "loss": 0.0245, "step": 13480 }, { "epoch": 8.295908951091972, "grad_norm": 0.255859375, "learning_rate": 8.53395061728395e-06, "loss": 0.0208, "step": 13485 }, { "epoch": 8.29898492771455, "grad_norm": 0.298828125, "learning_rate": 8.518518518518519e-06, "loss": 0.024, "step": 13490 }, { "epoch": 8.302060904337127, "grad_norm": 0.294921875, "learning_rate": 8.503086419753088e-06, "loss": 0.0227, "step": 13495 }, { "epoch": 8.305136880959704, "grad_norm": 0.267578125, "learning_rate": 8.487654320987654e-06, "loss": 0.0214, "step": 13500 }, { "epoch": 8.308212857582282, "grad_norm": 0.27734375, "learning_rate": 8.472222222222223e-06, "loss": 0.0215, "step": 13505 }, { "epoch": 8.31128883420486, "grad_norm": 0.2890625, "learning_rate": 8.456790123456791e-06, "loss": 0.0231, "step": 13510 }, { "epoch": 8.314364810827438, "grad_norm": 0.279296875, "learning_rate": 8.441358024691358e-06, "loss": 0.0235, "step": 13515 }, { "epoch": 8.317440787450016, "grad_norm": 0.255859375, "learning_rate": 8.425925925925926e-06, "loss": 0.0209, "step": 13520 }, { "epoch": 8.320516764072593, "grad_norm": 0.2734375, "learning_rate": 8.410493827160495e-06, "loss": 0.0226, "step": 13525 }, { "epoch": 8.32359274069517, "grad_norm": 0.26953125, "learning_rate": 8.395061728395062e-06, "loss": 0.0221, "step": 13530 }, { "epoch": 8.326668717317748, "grad_norm": 0.30078125, "learning_rate": 8.37962962962963e-06, "loss": 0.0255, "step": 13535 }, { "epoch": 8.329744693940325, "grad_norm": 0.2734375, "learning_rate": 8.364197530864199e-06, "loss": 0.0237, "step": 13540 }, { "epoch": 8.332820670562903, "grad_norm": 0.28515625, "learning_rate": 8.348765432098765e-06, "loss": 0.0232, "step": 13545 }, { "epoch": 8.335896647185482, "grad_norm": 0.279296875, "learning_rate": 8.333333333333334e-06, "loss": 0.0248, "step": 13550 }, { "epoch": 8.33897262380806, "grad_norm": 0.298828125, "learning_rate": 8.317901234567902e-06, "loss": 0.0216, "step": 13555 }, { "epoch": 8.342048600430637, "grad_norm": 0.26953125, "learning_rate": 8.302469135802469e-06, "loss": 0.0228, "step": 13560 }, { "epoch": 8.345124577053214, "grad_norm": 0.251953125, "learning_rate": 8.287037037037037e-06, "loss": 0.0207, "step": 13565 }, { "epoch": 8.348200553675792, "grad_norm": 0.2890625, "learning_rate": 8.271604938271606e-06, "loss": 0.0231, "step": 13570 }, { "epoch": 8.35127653029837, "grad_norm": 0.2578125, "learning_rate": 8.256172839506174e-06, "loss": 0.0223, "step": 13575 }, { "epoch": 8.354352506920947, "grad_norm": 0.294921875, "learning_rate": 8.240740740740741e-06, "loss": 0.0238, "step": 13580 }, { "epoch": 8.357428483543526, "grad_norm": 0.24609375, "learning_rate": 8.22530864197531e-06, "loss": 0.0225, "step": 13585 }, { "epoch": 8.360504460166103, "grad_norm": 0.306640625, "learning_rate": 8.209876543209876e-06, "loss": 0.0238, "step": 13590 }, { "epoch": 8.36358043678868, "grad_norm": 0.287109375, "learning_rate": 8.194444444444445e-06, "loss": 0.0222, "step": 13595 }, { "epoch": 8.366656413411258, "grad_norm": 0.251953125, "learning_rate": 8.179012345679013e-06, "loss": 0.0219, "step": 13600 }, { "epoch": 8.369732390033835, "grad_norm": 0.296875, "learning_rate": 8.163580246913582e-06, "loss": 0.0239, "step": 13605 }, { "epoch": 8.372808366656413, "grad_norm": 0.291015625, "learning_rate": 8.14814814814815e-06, "loss": 0.0236, "step": 13610 }, { "epoch": 8.37588434327899, "grad_norm": 0.2734375, "learning_rate": 8.132716049382717e-06, "loss": 0.0247, "step": 13615 }, { "epoch": 8.37896031990157, "grad_norm": 0.25390625, "learning_rate": 8.117283950617284e-06, "loss": 0.0229, "step": 13620 }, { "epoch": 8.382036296524147, "grad_norm": 0.244140625, "learning_rate": 8.101851851851852e-06, "loss": 0.0227, "step": 13625 }, { "epoch": 8.385112273146724, "grad_norm": 0.279296875, "learning_rate": 8.08641975308642e-06, "loss": 0.0238, "step": 13630 }, { "epoch": 8.388188249769302, "grad_norm": 0.255859375, "learning_rate": 8.070987654320989e-06, "loss": 0.0215, "step": 13635 }, { "epoch": 8.391264226391879, "grad_norm": 0.283203125, "learning_rate": 8.055555555555557e-06, "loss": 0.0252, "step": 13640 }, { "epoch": 8.394340203014456, "grad_norm": 0.2734375, "learning_rate": 8.040123456790122e-06, "loss": 0.0235, "step": 13645 }, { "epoch": 8.397416179637034, "grad_norm": 0.275390625, "learning_rate": 8.02469135802469e-06, "loss": 0.0229, "step": 13650 }, { "epoch": 8.400492156259613, "grad_norm": 0.2734375, "learning_rate": 8.00925925925926e-06, "loss": 0.0221, "step": 13655 }, { "epoch": 8.40356813288219, "grad_norm": 0.318359375, "learning_rate": 7.993827160493828e-06, "loss": 0.0241, "step": 13660 }, { "epoch": 8.406644109504768, "grad_norm": 0.28125, "learning_rate": 7.978395061728396e-06, "loss": 0.0218, "step": 13665 }, { "epoch": 8.409720086127345, "grad_norm": 0.28125, "learning_rate": 7.962962962962963e-06, "loss": 0.0239, "step": 13670 }, { "epoch": 8.412796062749923, "grad_norm": 0.251953125, "learning_rate": 7.947530864197531e-06, "loss": 0.0232, "step": 13675 }, { "epoch": 8.4158720393725, "grad_norm": 0.3203125, "learning_rate": 7.932098765432098e-06, "loss": 0.026, "step": 13680 }, { "epoch": 8.418948015995078, "grad_norm": 0.25, "learning_rate": 7.916666666666667e-06, "loss": 0.0233, "step": 13685 }, { "epoch": 8.422023992617657, "grad_norm": 0.2578125, "learning_rate": 7.901234567901235e-06, "loss": 0.0221, "step": 13690 }, { "epoch": 8.425099969240234, "grad_norm": 0.33984375, "learning_rate": 7.885802469135803e-06, "loss": 0.0239, "step": 13695 }, { "epoch": 8.428175945862812, "grad_norm": 0.265625, "learning_rate": 7.87037037037037e-06, "loss": 0.0235, "step": 13700 }, { "epoch": 8.431251922485389, "grad_norm": 0.232421875, "learning_rate": 7.854938271604939e-06, "loss": 0.0222, "step": 13705 }, { "epoch": 8.434327899107966, "grad_norm": 0.25390625, "learning_rate": 7.839506172839507e-06, "loss": 0.0226, "step": 13710 }, { "epoch": 8.437403875730544, "grad_norm": 0.291015625, "learning_rate": 7.824074074074074e-06, "loss": 0.0244, "step": 13715 }, { "epoch": 8.440479852353121, "grad_norm": 0.271484375, "learning_rate": 7.808641975308642e-06, "loss": 0.022, "step": 13720 }, { "epoch": 8.4435558289757, "grad_norm": 0.298828125, "learning_rate": 7.79320987654321e-06, "loss": 0.0234, "step": 13725 }, { "epoch": 8.446631805598278, "grad_norm": 0.271484375, "learning_rate": 7.777777777777777e-06, "loss": 0.0236, "step": 13730 }, { "epoch": 8.449707782220855, "grad_norm": 0.279296875, "learning_rate": 7.762345679012346e-06, "loss": 0.0237, "step": 13735 }, { "epoch": 8.452783758843433, "grad_norm": 0.28515625, "learning_rate": 7.746913580246914e-06, "loss": 0.0237, "step": 13740 }, { "epoch": 8.45585973546601, "grad_norm": 0.279296875, "learning_rate": 7.731481481481481e-06, "loss": 0.0232, "step": 13745 }, { "epoch": 8.458935712088588, "grad_norm": 0.279296875, "learning_rate": 7.71604938271605e-06, "loss": 0.0228, "step": 13750 }, { "epoch": 8.462011688711165, "grad_norm": 0.3515625, "learning_rate": 7.700617283950618e-06, "loss": 0.0225, "step": 13755 }, { "epoch": 8.465087665333744, "grad_norm": 0.271484375, "learning_rate": 7.685185185185185e-06, "loss": 0.025, "step": 13760 }, { "epoch": 8.468163641956322, "grad_norm": 0.28515625, "learning_rate": 7.669753086419753e-06, "loss": 0.0208, "step": 13765 }, { "epoch": 8.471239618578899, "grad_norm": 0.279296875, "learning_rate": 7.654320987654322e-06, "loss": 0.023, "step": 13770 }, { "epoch": 8.474315595201476, "grad_norm": 0.2734375, "learning_rate": 7.63888888888889e-06, "loss": 0.0233, "step": 13775 }, { "epoch": 8.477391571824054, "grad_norm": 0.28515625, "learning_rate": 7.623456790123457e-06, "loss": 0.023, "step": 13780 }, { "epoch": 8.480467548446631, "grad_norm": 0.2578125, "learning_rate": 7.6080246913580245e-06, "loss": 0.0238, "step": 13785 }, { "epoch": 8.483543525069209, "grad_norm": 0.2578125, "learning_rate": 7.592592592592593e-06, "loss": 0.0243, "step": 13790 }, { "epoch": 8.486619501691788, "grad_norm": 0.32421875, "learning_rate": 7.5771604938271605e-06, "loss": 0.0249, "step": 13795 }, { "epoch": 8.489695478314365, "grad_norm": 0.2734375, "learning_rate": 7.561728395061729e-06, "loss": 0.0244, "step": 13800 }, { "epoch": 8.492771454936943, "grad_norm": 0.271484375, "learning_rate": 7.546296296296297e-06, "loss": 0.0227, "step": 13805 }, { "epoch": 8.49584743155952, "grad_norm": 0.287109375, "learning_rate": 7.530864197530865e-06, "loss": 0.0215, "step": 13810 }, { "epoch": 8.498923408182097, "grad_norm": 0.30859375, "learning_rate": 7.515432098765432e-06, "loss": 0.0234, "step": 13815 }, { "epoch": 8.501999384804675, "grad_norm": 0.298828125, "learning_rate": 7.5e-06, "loss": 0.0229, "step": 13820 }, { "epoch": 8.505075361427252, "grad_norm": 0.275390625, "learning_rate": 7.484567901234568e-06, "loss": 0.0234, "step": 13825 }, { "epoch": 8.508151338049831, "grad_norm": 0.30078125, "learning_rate": 7.469135802469136e-06, "loss": 0.0244, "step": 13830 }, { "epoch": 8.511227314672409, "grad_norm": 0.26171875, "learning_rate": 7.453703703703705e-06, "loss": 0.0218, "step": 13835 }, { "epoch": 8.514303291294986, "grad_norm": 0.271484375, "learning_rate": 7.438271604938272e-06, "loss": 0.0223, "step": 13840 }, { "epoch": 8.517379267917564, "grad_norm": 0.27734375, "learning_rate": 7.422839506172839e-06, "loss": 0.0242, "step": 13845 }, { "epoch": 8.520455244540141, "grad_norm": 0.244140625, "learning_rate": 7.4074074074074075e-06, "loss": 0.022, "step": 13850 }, { "epoch": 8.523531221162719, "grad_norm": 0.271484375, "learning_rate": 7.391975308641975e-06, "loss": 0.0249, "step": 13855 }, { "epoch": 8.526607197785296, "grad_norm": 0.30859375, "learning_rate": 7.3765432098765435e-06, "loss": 0.0233, "step": 13860 }, { "epoch": 8.529683174407875, "grad_norm": 0.275390625, "learning_rate": 7.361111111111112e-06, "loss": 0.0243, "step": 13865 }, { "epoch": 8.532759151030453, "grad_norm": 0.30078125, "learning_rate": 7.3456790123456796e-06, "loss": 0.0237, "step": 13870 }, { "epoch": 8.53583512765303, "grad_norm": 0.3046875, "learning_rate": 7.330246913580248e-06, "loss": 0.0235, "step": 13875 }, { "epoch": 8.538911104275607, "grad_norm": 0.28125, "learning_rate": 7.314814814814815e-06, "loss": 0.0266, "step": 13880 }, { "epoch": 8.541987080898185, "grad_norm": 0.24609375, "learning_rate": 7.299382716049382e-06, "loss": 0.0227, "step": 13885 }, { "epoch": 8.545063057520762, "grad_norm": 0.291015625, "learning_rate": 7.283950617283951e-06, "loss": 0.0245, "step": 13890 }, { "epoch": 8.54813903414334, "grad_norm": 0.27734375, "learning_rate": 7.268518518518519e-06, "loss": 0.024, "step": 13895 }, { "epoch": 8.551215010765919, "grad_norm": 0.2431640625, "learning_rate": 7.253086419753087e-06, "loss": 0.0226, "step": 13900 }, { "epoch": 8.554290987388496, "grad_norm": 0.28125, "learning_rate": 7.237654320987655e-06, "loss": 0.0246, "step": 13905 }, { "epoch": 8.557366964011074, "grad_norm": 0.267578125, "learning_rate": 7.222222222222222e-06, "loss": 0.0213, "step": 13910 }, { "epoch": 8.560442940633651, "grad_norm": 0.259765625, "learning_rate": 7.20679012345679e-06, "loss": 0.0231, "step": 13915 }, { "epoch": 8.563518917256228, "grad_norm": 0.291015625, "learning_rate": 7.191358024691358e-06, "loss": 0.0224, "step": 13920 }, { "epoch": 8.566594893878806, "grad_norm": 0.30078125, "learning_rate": 7.1759259259259266e-06, "loss": 0.023, "step": 13925 }, { "epoch": 8.569670870501383, "grad_norm": 0.287109375, "learning_rate": 7.160493827160494e-06, "loss": 0.0221, "step": 13930 }, { "epoch": 8.572746847123963, "grad_norm": 0.271484375, "learning_rate": 7.145061728395063e-06, "loss": 0.0221, "step": 13935 }, { "epoch": 8.57582282374654, "grad_norm": 0.3203125, "learning_rate": 7.12962962962963e-06, "loss": 0.023, "step": 13940 }, { "epoch": 8.578898800369117, "grad_norm": 0.27734375, "learning_rate": 7.114197530864197e-06, "loss": 0.0242, "step": 13945 }, { "epoch": 8.581974776991695, "grad_norm": 0.279296875, "learning_rate": 7.098765432098765e-06, "loss": 0.0224, "step": 13950 }, { "epoch": 8.585050753614272, "grad_norm": 0.26953125, "learning_rate": 7.083333333333334e-06, "loss": 0.0217, "step": 13955 }, { "epoch": 8.58812673023685, "grad_norm": 0.25390625, "learning_rate": 7.0679012345679014e-06, "loss": 0.0231, "step": 13960 }, { "epoch": 8.591202706859427, "grad_norm": 0.3203125, "learning_rate": 7.05246913580247e-06, "loss": 0.0243, "step": 13965 }, { "epoch": 8.594278683482006, "grad_norm": 0.26953125, "learning_rate": 7.0370370370370375e-06, "loss": 0.0207, "step": 13970 }, { "epoch": 8.597354660104584, "grad_norm": 0.251953125, "learning_rate": 7.021604938271606e-06, "loss": 0.0227, "step": 13975 }, { "epoch": 8.600430636727161, "grad_norm": 0.275390625, "learning_rate": 7.006172839506173e-06, "loss": 0.0214, "step": 13980 }, { "epoch": 8.603506613349738, "grad_norm": 0.296875, "learning_rate": 6.990740740740741e-06, "loss": 0.021, "step": 13985 }, { "epoch": 8.606582589972316, "grad_norm": 0.251953125, "learning_rate": 6.975308641975309e-06, "loss": 0.0233, "step": 13990 }, { "epoch": 8.609658566594893, "grad_norm": 0.28515625, "learning_rate": 6.959876543209877e-06, "loss": 0.0217, "step": 13995 }, { "epoch": 8.61273454321747, "grad_norm": 0.263671875, "learning_rate": 6.944444444444445e-06, "loss": 0.023, "step": 14000 }, { "epoch": 8.61581051984005, "grad_norm": 0.2421875, "learning_rate": 6.929012345679013e-06, "loss": 0.0249, "step": 14005 }, { "epoch": 8.618886496462627, "grad_norm": 0.291015625, "learning_rate": 6.91358024691358e-06, "loss": 0.0222, "step": 14010 }, { "epoch": 8.621962473085205, "grad_norm": 0.302734375, "learning_rate": 6.898148148148148e-06, "loss": 0.0236, "step": 14015 }, { "epoch": 8.625038449707782, "grad_norm": 0.271484375, "learning_rate": 6.882716049382716e-06, "loss": 0.0219, "step": 14020 }, { "epoch": 8.62811442633036, "grad_norm": 0.310546875, "learning_rate": 6.8672839506172845e-06, "loss": 0.0261, "step": 14025 }, { "epoch": 8.631190402952937, "grad_norm": 0.291015625, "learning_rate": 6.851851851851852e-06, "loss": 0.0244, "step": 14030 }, { "epoch": 8.634266379575514, "grad_norm": 0.287109375, "learning_rate": 6.8364197530864205e-06, "loss": 0.022, "step": 14035 }, { "epoch": 8.637342356198094, "grad_norm": 0.310546875, "learning_rate": 6.820987654320989e-06, "loss": 0.0232, "step": 14040 }, { "epoch": 8.640418332820671, "grad_norm": 0.2236328125, "learning_rate": 6.805555555555556e-06, "loss": 0.0218, "step": 14045 }, { "epoch": 8.643494309443248, "grad_norm": 0.271484375, "learning_rate": 6.790123456790123e-06, "loss": 0.0242, "step": 14050 }, { "epoch": 8.646570286065826, "grad_norm": 0.24609375, "learning_rate": 6.774691358024692e-06, "loss": 0.0229, "step": 14055 }, { "epoch": 8.649646262688403, "grad_norm": 0.30078125, "learning_rate": 6.759259259259259e-06, "loss": 0.0235, "step": 14060 }, { "epoch": 8.65272223931098, "grad_norm": 0.2734375, "learning_rate": 6.743827160493828e-06, "loss": 0.022, "step": 14065 }, { "epoch": 8.655798215933558, "grad_norm": 0.2158203125, "learning_rate": 6.728395061728396e-06, "loss": 0.0218, "step": 14070 }, { "epoch": 8.658874192556137, "grad_norm": 0.263671875, "learning_rate": 6.712962962962964e-06, "loss": 0.0223, "step": 14075 }, { "epoch": 8.661950169178715, "grad_norm": 0.28515625, "learning_rate": 6.697530864197531e-06, "loss": 0.0233, "step": 14080 }, { "epoch": 8.665026145801292, "grad_norm": 0.2451171875, "learning_rate": 6.682098765432099e-06, "loss": 0.0213, "step": 14085 }, { "epoch": 8.66810212242387, "grad_norm": 0.29296875, "learning_rate": 6.666666666666667e-06, "loss": 0.0229, "step": 14090 }, { "epoch": 8.671178099046447, "grad_norm": 0.294921875, "learning_rate": 6.651234567901235e-06, "loss": 0.0221, "step": 14095 }, { "epoch": 8.674254075669024, "grad_norm": 0.291015625, "learning_rate": 6.635802469135803e-06, "loss": 0.0234, "step": 14100 }, { "epoch": 8.677330052291602, "grad_norm": 0.27734375, "learning_rate": 6.620370370370371e-06, "loss": 0.0239, "step": 14105 }, { "epoch": 8.680406028914181, "grad_norm": 0.28125, "learning_rate": 6.604938271604938e-06, "loss": 0.0238, "step": 14110 }, { "epoch": 8.683482005536758, "grad_norm": 0.2734375, "learning_rate": 6.589506172839506e-06, "loss": 0.0231, "step": 14115 }, { "epoch": 8.686557982159336, "grad_norm": 0.265625, "learning_rate": 6.574074074074074e-06, "loss": 0.0215, "step": 14120 }, { "epoch": 8.689633958781913, "grad_norm": 0.2578125, "learning_rate": 6.558641975308642e-06, "loss": 0.0223, "step": 14125 }, { "epoch": 8.69270993540449, "grad_norm": 0.259765625, "learning_rate": 6.54320987654321e-06, "loss": 0.0225, "step": 14130 }, { "epoch": 8.695785912027068, "grad_norm": 0.318359375, "learning_rate": 6.5277777777777784e-06, "loss": 0.0277, "step": 14135 }, { "epoch": 8.698861888649645, "grad_norm": 0.255859375, "learning_rate": 6.512345679012347e-06, "loss": 0.024, "step": 14140 }, { "epoch": 8.701937865272225, "grad_norm": 0.291015625, "learning_rate": 6.496913580246914e-06, "loss": 0.0226, "step": 14145 }, { "epoch": 8.705013841894802, "grad_norm": 0.29296875, "learning_rate": 6.481481481481481e-06, "loss": 0.0216, "step": 14150 }, { "epoch": 8.70808981851738, "grad_norm": 0.28125, "learning_rate": 6.46604938271605e-06, "loss": 0.0217, "step": 14155 }, { "epoch": 8.711165795139957, "grad_norm": 0.25390625, "learning_rate": 6.450617283950617e-06, "loss": 0.0222, "step": 14160 }, { "epoch": 8.714241771762534, "grad_norm": 0.271484375, "learning_rate": 6.435185185185186e-06, "loss": 0.0218, "step": 14165 }, { "epoch": 8.717317748385112, "grad_norm": 0.228515625, "learning_rate": 6.419753086419754e-06, "loss": 0.0216, "step": 14170 }, { "epoch": 8.720393725007689, "grad_norm": 0.251953125, "learning_rate": 6.404320987654322e-06, "loss": 0.023, "step": 14175 }, { "epoch": 8.723469701630268, "grad_norm": 0.263671875, "learning_rate": 6.3888888888888885e-06, "loss": 0.0227, "step": 14180 }, { "epoch": 8.726545678252846, "grad_norm": 0.30078125, "learning_rate": 6.373456790123457e-06, "loss": 0.0236, "step": 14185 }, { "epoch": 8.729621654875423, "grad_norm": 0.283203125, "learning_rate": 6.3580246913580246e-06, "loss": 0.0251, "step": 14190 }, { "epoch": 8.732697631498, "grad_norm": 0.2578125, "learning_rate": 6.342592592592593e-06, "loss": 0.0228, "step": 14195 }, { "epoch": 8.735773608120578, "grad_norm": 0.291015625, "learning_rate": 6.3271604938271615e-06, "loss": 0.0228, "step": 14200 }, { "epoch": 8.738849584743155, "grad_norm": 0.275390625, "learning_rate": 6.311728395061729e-06, "loss": 0.024, "step": 14205 }, { "epoch": 8.741925561365733, "grad_norm": 0.2421875, "learning_rate": 6.296296296296296e-06, "loss": 0.0231, "step": 14210 }, { "epoch": 8.745001537988312, "grad_norm": 0.26171875, "learning_rate": 6.280864197530864e-06, "loss": 0.0202, "step": 14215 }, { "epoch": 8.74807751461089, "grad_norm": 0.271484375, "learning_rate": 6.265432098765432e-06, "loss": 0.0201, "step": 14220 }, { "epoch": 8.751153491233467, "grad_norm": 0.232421875, "learning_rate": 6.25e-06, "loss": 0.0213, "step": 14225 }, { "epoch": 8.754229467856044, "grad_norm": 0.25390625, "learning_rate": 6.234567901234569e-06, "loss": 0.0215, "step": 14230 }, { "epoch": 8.757305444478622, "grad_norm": 0.2578125, "learning_rate": 6.2191358024691355e-06, "loss": 0.0233, "step": 14235 }, { "epoch": 8.760381421101199, "grad_norm": 0.251953125, "learning_rate": 6.203703703703704e-06, "loss": 0.0223, "step": 14240 }, { "epoch": 8.763457397723776, "grad_norm": 0.296875, "learning_rate": 6.188271604938272e-06, "loss": 0.0229, "step": 14245 }, { "epoch": 8.766533374346356, "grad_norm": 0.29296875, "learning_rate": 6.172839506172839e-06, "loss": 0.0232, "step": 14250 }, { "epoch": 8.769609350968933, "grad_norm": 0.310546875, "learning_rate": 6.157407407407408e-06, "loss": 0.0215, "step": 14255 }, { "epoch": 8.77268532759151, "grad_norm": 0.30078125, "learning_rate": 6.141975308641976e-06, "loss": 0.0243, "step": 14260 }, { "epoch": 8.775761304214088, "grad_norm": 0.283203125, "learning_rate": 6.126543209876544e-06, "loss": 0.0243, "step": 14265 }, { "epoch": 8.778837280836665, "grad_norm": 0.263671875, "learning_rate": 6.111111111111111e-06, "loss": 0.0261, "step": 14270 }, { "epoch": 8.781913257459243, "grad_norm": 0.306640625, "learning_rate": 6.09567901234568e-06, "loss": 0.0225, "step": 14275 }, { "epoch": 8.78498923408182, "grad_norm": 0.279296875, "learning_rate": 6.080246913580247e-06, "loss": 0.022, "step": 14280 }, { "epoch": 8.7880652107044, "grad_norm": 0.345703125, "learning_rate": 6.064814814814815e-06, "loss": 0.026, "step": 14285 }, { "epoch": 8.791141187326977, "grad_norm": 0.255859375, "learning_rate": 6.049382716049383e-06, "loss": 0.0217, "step": 14290 }, { "epoch": 8.794217163949554, "grad_norm": 0.2578125, "learning_rate": 6.033950617283951e-06, "loss": 0.0225, "step": 14295 }, { "epoch": 8.797293140572132, "grad_norm": 0.26171875, "learning_rate": 6.0185185185185185e-06, "loss": 0.0252, "step": 14300 }, { "epoch": 8.800369117194709, "grad_norm": 0.25, "learning_rate": 6.003086419753087e-06, "loss": 0.0229, "step": 14305 }, { "epoch": 8.803445093817286, "grad_norm": 0.275390625, "learning_rate": 5.9876543209876546e-06, "loss": 0.0225, "step": 14310 }, { "epoch": 8.806521070439864, "grad_norm": 0.271484375, "learning_rate": 5.972222222222223e-06, "loss": 0.0224, "step": 14315 }, { "epoch": 8.809597047062443, "grad_norm": 0.27734375, "learning_rate": 5.956790123456791e-06, "loss": 0.0224, "step": 14320 }, { "epoch": 8.81267302368502, "grad_norm": 0.267578125, "learning_rate": 5.941358024691358e-06, "loss": 0.0238, "step": 14325 }, { "epoch": 8.815749000307598, "grad_norm": 0.271484375, "learning_rate": 5.925925925925927e-06, "loss": 0.021, "step": 14330 }, { "epoch": 8.818824976930175, "grad_norm": 0.26171875, "learning_rate": 5.910493827160494e-06, "loss": 0.0224, "step": 14335 }, { "epoch": 8.821900953552753, "grad_norm": 0.26171875, "learning_rate": 5.895061728395062e-06, "loss": 0.023, "step": 14340 }, { "epoch": 8.82497693017533, "grad_norm": 0.302734375, "learning_rate": 5.87962962962963e-06, "loss": 0.0235, "step": 14345 }, { "epoch": 8.828052906797907, "grad_norm": 0.267578125, "learning_rate": 5.864197530864198e-06, "loss": 0.0238, "step": 14350 }, { "epoch": 8.831128883420487, "grad_norm": 0.275390625, "learning_rate": 5.8487654320987655e-06, "loss": 0.0247, "step": 14355 }, { "epoch": 8.834204860043064, "grad_norm": 0.31640625, "learning_rate": 5.833333333333334e-06, "loss": 0.0235, "step": 14360 }, { "epoch": 8.837280836665641, "grad_norm": 0.287109375, "learning_rate": 5.8179012345679016e-06, "loss": 0.023, "step": 14365 }, { "epoch": 8.840356813288219, "grad_norm": 0.2578125, "learning_rate": 5.802469135802469e-06, "loss": 0.0234, "step": 14370 }, { "epoch": 8.843432789910796, "grad_norm": 0.283203125, "learning_rate": 5.787037037037038e-06, "loss": 0.0241, "step": 14375 }, { "epoch": 8.846508766533374, "grad_norm": 0.283203125, "learning_rate": 5.771604938271605e-06, "loss": 0.0234, "step": 14380 }, { "epoch": 8.849584743155951, "grad_norm": 0.263671875, "learning_rate": 5.756172839506173e-06, "loss": 0.0213, "step": 14385 }, { "epoch": 8.85266071977853, "grad_norm": 0.267578125, "learning_rate": 5.740740740740741e-06, "loss": 0.022, "step": 14390 }, { "epoch": 8.855736696401108, "grad_norm": 0.279296875, "learning_rate": 5.725308641975309e-06, "loss": 0.0243, "step": 14395 }, { "epoch": 8.858812673023685, "grad_norm": 0.259765625, "learning_rate": 5.7098765432098764e-06, "loss": 0.0221, "step": 14400 }, { "epoch": 8.861888649646263, "grad_norm": 0.279296875, "learning_rate": 5.694444444444445e-06, "loss": 0.0235, "step": 14405 }, { "epoch": 8.86496462626884, "grad_norm": 0.28125, "learning_rate": 5.6790123456790125e-06, "loss": 0.0233, "step": 14410 }, { "epoch": 8.868040602891417, "grad_norm": 0.2373046875, "learning_rate": 5.663580246913581e-06, "loss": 0.022, "step": 14415 }, { "epoch": 8.871116579513995, "grad_norm": 0.267578125, "learning_rate": 5.6481481481481485e-06, "loss": 0.0238, "step": 14420 }, { "epoch": 8.874192556136574, "grad_norm": 0.265625, "learning_rate": 5.632716049382716e-06, "loss": 0.0226, "step": 14425 }, { "epoch": 8.877268532759151, "grad_norm": 0.275390625, "learning_rate": 5.617283950617285e-06, "loss": 0.0222, "step": 14430 }, { "epoch": 8.880344509381729, "grad_norm": 0.275390625, "learning_rate": 5.601851851851852e-06, "loss": 0.0223, "step": 14435 }, { "epoch": 8.883420486004306, "grad_norm": 0.298828125, "learning_rate": 5.58641975308642e-06, "loss": 0.0242, "step": 14440 }, { "epoch": 8.886496462626884, "grad_norm": 0.3125, "learning_rate": 5.570987654320988e-06, "loss": 0.0231, "step": 14445 }, { "epoch": 8.889572439249461, "grad_norm": 0.283203125, "learning_rate": 5.555555555555556e-06, "loss": 0.0231, "step": 14450 }, { "epoch": 8.892648415872038, "grad_norm": 0.263671875, "learning_rate": 5.540123456790123e-06, "loss": 0.0258, "step": 14455 }, { "epoch": 8.895724392494618, "grad_norm": 0.30859375, "learning_rate": 5.524691358024692e-06, "loss": 0.0234, "step": 14460 }, { "epoch": 8.898800369117195, "grad_norm": 0.28515625, "learning_rate": 5.5092592592592595e-06, "loss": 0.0235, "step": 14465 }, { "epoch": 8.901876345739772, "grad_norm": 0.26953125, "learning_rate": 5.493827160493827e-06, "loss": 0.0252, "step": 14470 }, { "epoch": 8.90495232236235, "grad_norm": 0.287109375, "learning_rate": 5.4783950617283955e-06, "loss": 0.0229, "step": 14475 }, { "epoch": 8.908028298984927, "grad_norm": 0.302734375, "learning_rate": 5.462962962962963e-06, "loss": 0.0241, "step": 14480 }, { "epoch": 8.911104275607505, "grad_norm": 0.255859375, "learning_rate": 5.447530864197531e-06, "loss": 0.0224, "step": 14485 }, { "epoch": 8.914180252230082, "grad_norm": 0.283203125, "learning_rate": 5.432098765432099e-06, "loss": 0.0224, "step": 14490 }, { "epoch": 8.917256228852661, "grad_norm": 0.259765625, "learning_rate": 5.416666666666667e-06, "loss": 0.0239, "step": 14495 }, { "epoch": 8.920332205475239, "grad_norm": 0.291015625, "learning_rate": 5.401234567901234e-06, "loss": 0.0225, "step": 14500 }, { "epoch": 8.923408182097816, "grad_norm": 0.271484375, "learning_rate": 5.385802469135803e-06, "loss": 0.0219, "step": 14505 }, { "epoch": 8.926484158720394, "grad_norm": 0.30078125, "learning_rate": 5.37037037037037e-06, "loss": 0.0239, "step": 14510 }, { "epoch": 8.929560135342971, "grad_norm": 0.298828125, "learning_rate": 5.354938271604939e-06, "loss": 0.0215, "step": 14515 }, { "epoch": 8.932636111965548, "grad_norm": 0.2734375, "learning_rate": 5.3395061728395064e-06, "loss": 0.0238, "step": 14520 }, { "epoch": 8.935712088588126, "grad_norm": 0.287109375, "learning_rate": 5.324074074074074e-06, "loss": 0.0224, "step": 14525 }, { "epoch": 8.938788065210705, "grad_norm": 0.265625, "learning_rate": 5.3086419753086425e-06, "loss": 0.0252, "step": 14530 }, { "epoch": 8.941864041833282, "grad_norm": 0.255859375, "learning_rate": 5.29320987654321e-06, "loss": 0.021, "step": 14535 }, { "epoch": 8.94494001845586, "grad_norm": 0.291015625, "learning_rate": 5.277777777777778e-06, "loss": 0.0222, "step": 14540 }, { "epoch": 8.948015995078437, "grad_norm": 0.27734375, "learning_rate": 5.262345679012346e-06, "loss": 0.0218, "step": 14545 }, { "epoch": 8.951091971701015, "grad_norm": 0.29296875, "learning_rate": 5.246913580246914e-06, "loss": 0.0234, "step": 14550 }, { "epoch": 8.954167948323592, "grad_norm": 0.26171875, "learning_rate": 5.231481481481481e-06, "loss": 0.0235, "step": 14555 }, { "epoch": 8.95724392494617, "grad_norm": 0.3046875, "learning_rate": 5.21604938271605e-06, "loss": 0.0222, "step": 14560 }, { "epoch": 8.960319901568749, "grad_norm": 0.275390625, "learning_rate": 5.200617283950618e-06, "loss": 0.0223, "step": 14565 }, { "epoch": 8.963395878191326, "grad_norm": 0.271484375, "learning_rate": 5.185185185185185e-06, "loss": 0.0231, "step": 14570 }, { "epoch": 8.966471854813904, "grad_norm": 0.27734375, "learning_rate": 5.1697530864197534e-06, "loss": 0.0233, "step": 14575 }, { "epoch": 8.969547831436481, "grad_norm": 0.2890625, "learning_rate": 5.154320987654322e-06, "loss": 0.0224, "step": 14580 }, { "epoch": 8.972623808059058, "grad_norm": 0.3046875, "learning_rate": 5.138888888888889e-06, "loss": 0.0213, "step": 14585 }, { "epoch": 8.975699784681636, "grad_norm": 0.259765625, "learning_rate": 5.123456790123457e-06, "loss": 0.0216, "step": 14590 }, { "epoch": 8.978775761304213, "grad_norm": 0.25390625, "learning_rate": 5.1080246913580255e-06, "loss": 0.0222, "step": 14595 }, { "epoch": 8.981851737926792, "grad_norm": 0.298828125, "learning_rate": 5.092592592592592e-06, "loss": 0.0257, "step": 14600 }, { "epoch": 8.98492771454937, "grad_norm": 0.306640625, "learning_rate": 5.077160493827161e-06, "loss": 0.0226, "step": 14605 }, { "epoch": 8.988003691171947, "grad_norm": 0.251953125, "learning_rate": 5.061728395061729e-06, "loss": 0.0209, "step": 14610 }, { "epoch": 8.991079667794525, "grad_norm": 0.32421875, "learning_rate": 5.046296296296297e-06, "loss": 0.0253, "step": 14615 }, { "epoch": 8.994155644417102, "grad_norm": 0.28125, "learning_rate": 5.030864197530864e-06, "loss": 0.0223, "step": 14620 }, { "epoch": 8.99723162103968, "grad_norm": 0.318359375, "learning_rate": 5.015432098765433e-06, "loss": 0.0228, "step": 14625 }, { "epoch": 9.000307597662259, "grad_norm": 0.5703125, "learning_rate": 5e-06, "loss": 0.0233, "step": 14630 }, { "epoch": 9.003383574284836, "grad_norm": 0.251953125, "learning_rate": 4.984567901234568e-06, "loss": 0.0219, "step": 14635 }, { "epoch": 9.006459550907413, "grad_norm": 0.2470703125, "learning_rate": 4.9691358024691365e-06, "loss": 0.0219, "step": 14640 }, { "epoch": 9.00953552752999, "grad_norm": 0.3046875, "learning_rate": 4.953703703703704e-06, "loss": 0.0237, "step": 14645 }, { "epoch": 9.012611504152568, "grad_norm": 0.2490234375, "learning_rate": 4.938271604938272e-06, "loss": 0.0228, "step": 14650 }, { "epoch": 9.015687480775146, "grad_norm": 0.26171875, "learning_rate": 4.92283950617284e-06, "loss": 0.0234, "step": 14655 }, { "epoch": 9.018763457397723, "grad_norm": 0.2578125, "learning_rate": 4.907407407407408e-06, "loss": 0.0219, "step": 14660 }, { "epoch": 9.021839434020302, "grad_norm": 0.248046875, "learning_rate": 4.891975308641975e-06, "loss": 0.0232, "step": 14665 }, { "epoch": 9.02491541064288, "grad_norm": 0.25390625, "learning_rate": 4.876543209876543e-06, "loss": 0.0233, "step": 14670 }, { "epoch": 9.027991387265457, "grad_norm": 0.28515625, "learning_rate": 4.861111111111111e-06, "loss": 0.0222, "step": 14675 }, { "epoch": 9.031067363888035, "grad_norm": 0.298828125, "learning_rate": 4.84567901234568e-06, "loss": 0.0226, "step": 14680 }, { "epoch": 9.034143340510612, "grad_norm": 0.251953125, "learning_rate": 4.8302469135802465e-06, "loss": 0.0227, "step": 14685 }, { "epoch": 9.03721931713319, "grad_norm": 0.310546875, "learning_rate": 4.814814814814815e-06, "loss": 0.0236, "step": 14690 }, { "epoch": 9.040295293755767, "grad_norm": 0.25390625, "learning_rate": 4.7993827160493834e-06, "loss": 0.0224, "step": 14695 }, { "epoch": 9.043371270378346, "grad_norm": 0.26171875, "learning_rate": 4.78395061728395e-06, "loss": 0.0227, "step": 14700 }, { "epoch": 9.046447247000923, "grad_norm": 0.283203125, "learning_rate": 4.768518518518519e-06, "loss": 0.0224, "step": 14705 }, { "epoch": 9.0495232236235, "grad_norm": 0.251953125, "learning_rate": 4.753086419753087e-06, "loss": 0.0229, "step": 14710 }, { "epoch": 9.052599200246078, "grad_norm": 0.3125, "learning_rate": 4.737654320987654e-06, "loss": 0.0242, "step": 14715 }, { "epoch": 9.055675176868656, "grad_norm": 0.28125, "learning_rate": 4.722222222222222e-06, "loss": 0.0233, "step": 14720 }, { "epoch": 9.058751153491233, "grad_norm": 0.27734375, "learning_rate": 4.706790123456791e-06, "loss": 0.0217, "step": 14725 }, { "epoch": 9.06182713011381, "grad_norm": 0.28125, "learning_rate": 4.691358024691358e-06, "loss": 0.0227, "step": 14730 }, { "epoch": 9.06490310673639, "grad_norm": 0.2392578125, "learning_rate": 4.675925925925926e-06, "loss": 0.0223, "step": 14735 }, { "epoch": 9.067979083358967, "grad_norm": 0.27734375, "learning_rate": 4.660493827160494e-06, "loss": 0.0214, "step": 14740 }, { "epoch": 9.071055059981544, "grad_norm": 0.248046875, "learning_rate": 4.645061728395062e-06, "loss": 0.0223, "step": 14745 }, { "epoch": 9.074131036604122, "grad_norm": 0.28125, "learning_rate": 4.6296296296296296e-06, "loss": 0.0218, "step": 14750 }, { "epoch": 9.0772070132267, "grad_norm": 0.279296875, "learning_rate": 4.614197530864198e-06, "loss": 0.023, "step": 14755 }, { "epoch": 9.080282989849277, "grad_norm": 0.28125, "learning_rate": 4.598765432098766e-06, "loss": 0.0234, "step": 14760 }, { "epoch": 9.083358966471854, "grad_norm": 0.3046875, "learning_rate": 4.583333333333333e-06, "loss": 0.0216, "step": 14765 }, { "epoch": 9.086434943094433, "grad_norm": 0.255859375, "learning_rate": 4.567901234567902e-06, "loss": 0.023, "step": 14770 }, { "epoch": 9.08951091971701, "grad_norm": 0.2578125, "learning_rate": 4.552469135802469e-06, "loss": 0.0198, "step": 14775 }, { "epoch": 9.092586896339588, "grad_norm": 0.2734375, "learning_rate": 4.537037037037038e-06, "loss": 0.0244, "step": 14780 }, { "epoch": 9.095662872962166, "grad_norm": 0.291015625, "learning_rate": 4.521604938271605e-06, "loss": 0.0217, "step": 14785 }, { "epoch": 9.098738849584743, "grad_norm": 0.265625, "learning_rate": 4.506172839506173e-06, "loss": 0.0238, "step": 14790 }, { "epoch": 9.10181482620732, "grad_norm": 0.248046875, "learning_rate": 4.490740740740741e-06, "loss": 0.0213, "step": 14795 }, { "epoch": 9.104890802829898, "grad_norm": 0.248046875, "learning_rate": 4.475308641975309e-06, "loss": 0.0221, "step": 14800 }, { "epoch": 9.107966779452477, "grad_norm": 0.255859375, "learning_rate": 4.4598765432098765e-06, "loss": 0.0227, "step": 14805 }, { "epoch": 9.111042756075054, "grad_norm": 0.296875, "learning_rate": 4.444444444444445e-06, "loss": 0.0225, "step": 14810 }, { "epoch": 9.114118732697632, "grad_norm": 0.2578125, "learning_rate": 4.429012345679013e-06, "loss": 0.0215, "step": 14815 }, { "epoch": 9.11719470932021, "grad_norm": 0.2490234375, "learning_rate": 4.41358024691358e-06, "loss": 0.0214, "step": 14820 }, { "epoch": 9.120270685942787, "grad_norm": 0.265625, "learning_rate": 4.398148148148149e-06, "loss": 0.0225, "step": 14825 }, { "epoch": 9.123346662565364, "grad_norm": 0.314453125, "learning_rate": 4.382716049382716e-06, "loss": 0.0249, "step": 14830 }, { "epoch": 9.126422639187941, "grad_norm": 0.267578125, "learning_rate": 4.367283950617284e-06, "loss": 0.023, "step": 14835 }, { "epoch": 9.12949861581052, "grad_norm": 0.2734375, "learning_rate": 4.351851851851852e-06, "loss": 0.0222, "step": 14840 }, { "epoch": 9.132574592433098, "grad_norm": 0.2734375, "learning_rate": 4.33641975308642e-06, "loss": 0.0221, "step": 14845 }, { "epoch": 9.135650569055676, "grad_norm": 0.244140625, "learning_rate": 4.3209876543209875e-06, "loss": 0.025, "step": 14850 }, { "epoch": 9.138726545678253, "grad_norm": 0.244140625, "learning_rate": 4.305555555555556e-06, "loss": 0.0211, "step": 14855 }, { "epoch": 9.14180252230083, "grad_norm": 0.287109375, "learning_rate": 4.2901234567901235e-06, "loss": 0.0234, "step": 14860 }, { "epoch": 9.144878498923408, "grad_norm": 0.279296875, "learning_rate": 4.274691358024691e-06, "loss": 0.0218, "step": 14865 }, { "epoch": 9.147954475545985, "grad_norm": 0.236328125, "learning_rate": 4.2592592592592596e-06, "loss": 0.0228, "step": 14870 }, { "epoch": 9.151030452168564, "grad_norm": 0.27734375, "learning_rate": 4.243827160493827e-06, "loss": 0.0245, "step": 14875 }, { "epoch": 9.154106428791142, "grad_norm": 0.294921875, "learning_rate": 4.228395061728396e-06, "loss": 0.0213, "step": 14880 }, { "epoch": 9.15718240541372, "grad_norm": 0.2890625, "learning_rate": 4.212962962962963e-06, "loss": 0.0251, "step": 14885 }, { "epoch": 9.160258382036297, "grad_norm": 0.234375, "learning_rate": 4.197530864197531e-06, "loss": 0.0222, "step": 14890 }, { "epoch": 9.163334358658874, "grad_norm": 0.255859375, "learning_rate": 4.182098765432099e-06, "loss": 0.0228, "step": 14895 }, { "epoch": 9.166410335281451, "grad_norm": 0.291015625, "learning_rate": 4.166666666666667e-06, "loss": 0.0209, "step": 14900 }, { "epoch": 9.169486311904029, "grad_norm": 0.2734375, "learning_rate": 4.1512345679012345e-06, "loss": 0.0225, "step": 14905 }, { "epoch": 9.172562288526608, "grad_norm": 0.310546875, "learning_rate": 4.135802469135803e-06, "loss": 0.0233, "step": 14910 }, { "epoch": 9.175638265149185, "grad_norm": 0.31640625, "learning_rate": 4.1203703703703705e-06, "loss": 0.0224, "step": 14915 }, { "epoch": 9.178714241771763, "grad_norm": 0.310546875, "learning_rate": 4.104938271604938e-06, "loss": 0.0217, "step": 14920 }, { "epoch": 9.18179021839434, "grad_norm": 0.255859375, "learning_rate": 4.0895061728395066e-06, "loss": 0.0232, "step": 14925 }, { "epoch": 9.184866195016918, "grad_norm": 0.296875, "learning_rate": 4.074074074074075e-06, "loss": 0.0231, "step": 14930 }, { "epoch": 9.187942171639495, "grad_norm": 0.23828125, "learning_rate": 4.058641975308642e-06, "loss": 0.0214, "step": 14935 }, { "epoch": 9.191018148262073, "grad_norm": 0.259765625, "learning_rate": 4.04320987654321e-06, "loss": 0.0227, "step": 14940 }, { "epoch": 9.194094124884652, "grad_norm": 0.27734375, "learning_rate": 4.027777777777779e-06, "loss": 0.0242, "step": 14945 }, { "epoch": 9.19717010150723, "grad_norm": 0.318359375, "learning_rate": 4.012345679012345e-06, "loss": 0.0239, "step": 14950 }, { "epoch": 9.200246078129807, "grad_norm": 0.2412109375, "learning_rate": 3.996913580246914e-06, "loss": 0.0201, "step": 14955 }, { "epoch": 9.203322054752384, "grad_norm": 0.263671875, "learning_rate": 3.9814814814814814e-06, "loss": 0.0241, "step": 14960 }, { "epoch": 9.206398031374961, "grad_norm": 0.29296875, "learning_rate": 3.966049382716049e-06, "loss": 0.0239, "step": 14965 }, { "epoch": 9.209474007997539, "grad_norm": 0.287109375, "learning_rate": 3.9506172839506175e-06, "loss": 0.0217, "step": 14970 }, { "epoch": 9.212549984620116, "grad_norm": 0.322265625, "learning_rate": 3.935185185185185e-06, "loss": 0.0226, "step": 14975 }, { "epoch": 9.215625961242695, "grad_norm": 0.27734375, "learning_rate": 3.9197530864197535e-06, "loss": 0.0235, "step": 14980 }, { "epoch": 9.218701937865273, "grad_norm": 0.265625, "learning_rate": 3.904320987654321e-06, "loss": 0.0225, "step": 14985 }, { "epoch": 9.22177791448785, "grad_norm": 0.263671875, "learning_rate": 3.888888888888889e-06, "loss": 0.023, "step": 14990 }, { "epoch": 9.224853891110428, "grad_norm": 0.255859375, "learning_rate": 3.873456790123457e-06, "loss": 0.0221, "step": 14995 }, { "epoch": 9.227929867733005, "grad_norm": 0.244140625, "learning_rate": 3.858024691358025e-06, "loss": 0.0199, "step": 15000 }, { "epoch": 9.231005844355582, "grad_norm": 0.265625, "learning_rate": 3.842592592592592e-06, "loss": 0.024, "step": 15005 }, { "epoch": 9.23408182097816, "grad_norm": 0.3203125, "learning_rate": 3.827160493827161e-06, "loss": 0.0237, "step": 15010 }, { "epoch": 9.237157797600739, "grad_norm": 0.2734375, "learning_rate": 3.8117283950617284e-06, "loss": 0.0218, "step": 15015 }, { "epoch": 9.240233774223316, "grad_norm": 0.291015625, "learning_rate": 3.7962962962962964e-06, "loss": 0.0253, "step": 15020 }, { "epoch": 9.243309750845894, "grad_norm": 0.28125, "learning_rate": 3.7808641975308645e-06, "loss": 0.0232, "step": 15025 }, { "epoch": 9.246385727468471, "grad_norm": 0.26171875, "learning_rate": 3.7654320987654325e-06, "loss": 0.0205, "step": 15030 }, { "epoch": 9.249461704091049, "grad_norm": 0.2578125, "learning_rate": 3.75e-06, "loss": 0.0241, "step": 15035 }, { "epoch": 9.252537680713626, "grad_norm": 0.287109375, "learning_rate": 3.734567901234568e-06, "loss": 0.0234, "step": 15040 }, { "epoch": 9.255613657336204, "grad_norm": 0.296875, "learning_rate": 3.719135802469136e-06, "loss": 0.0244, "step": 15045 }, { "epoch": 9.258689633958783, "grad_norm": 0.28125, "learning_rate": 3.7037037037037037e-06, "loss": 0.0225, "step": 15050 }, { "epoch": 9.26176561058136, "grad_norm": 0.279296875, "learning_rate": 3.6882716049382718e-06, "loss": 0.022, "step": 15055 }, { "epoch": 9.264841587203938, "grad_norm": 0.23046875, "learning_rate": 3.6728395061728398e-06, "loss": 0.0234, "step": 15060 }, { "epoch": 9.267917563826515, "grad_norm": 0.294921875, "learning_rate": 3.6574074074074074e-06, "loss": 0.0253, "step": 15065 }, { "epoch": 9.270993540449092, "grad_norm": 0.279296875, "learning_rate": 3.6419753086419754e-06, "loss": 0.0216, "step": 15070 }, { "epoch": 9.27406951707167, "grad_norm": 0.2734375, "learning_rate": 3.6265432098765434e-06, "loss": 0.0234, "step": 15075 }, { "epoch": 9.277145493694247, "grad_norm": 0.294921875, "learning_rate": 3.611111111111111e-06, "loss": 0.025, "step": 15080 }, { "epoch": 9.280221470316826, "grad_norm": 0.27734375, "learning_rate": 3.595679012345679e-06, "loss": 0.0258, "step": 15085 }, { "epoch": 9.283297446939404, "grad_norm": 0.26953125, "learning_rate": 3.580246913580247e-06, "loss": 0.0232, "step": 15090 }, { "epoch": 9.286373423561981, "grad_norm": 0.279296875, "learning_rate": 3.564814814814815e-06, "loss": 0.0211, "step": 15095 }, { "epoch": 9.289449400184559, "grad_norm": 0.3046875, "learning_rate": 3.5493827160493827e-06, "loss": 0.022, "step": 15100 }, { "epoch": 9.292525376807136, "grad_norm": 0.275390625, "learning_rate": 3.5339506172839507e-06, "loss": 0.0232, "step": 15105 }, { "epoch": 9.295601353429713, "grad_norm": 0.27734375, "learning_rate": 3.5185185185185187e-06, "loss": 0.0224, "step": 15110 }, { "epoch": 9.298677330052291, "grad_norm": 0.26953125, "learning_rate": 3.5030864197530863e-06, "loss": 0.0228, "step": 15115 }, { "epoch": 9.30175330667487, "grad_norm": 0.2490234375, "learning_rate": 3.4876543209876544e-06, "loss": 0.0223, "step": 15120 }, { "epoch": 9.304829283297448, "grad_norm": 0.27734375, "learning_rate": 3.4722222222222224e-06, "loss": 0.0233, "step": 15125 }, { "epoch": 9.307905259920025, "grad_norm": 0.265625, "learning_rate": 3.45679012345679e-06, "loss": 0.0233, "step": 15130 }, { "epoch": 9.310981236542602, "grad_norm": 0.26953125, "learning_rate": 3.441358024691358e-06, "loss": 0.0254, "step": 15135 }, { "epoch": 9.31405721316518, "grad_norm": 0.265625, "learning_rate": 3.425925925925926e-06, "loss": 0.0236, "step": 15140 }, { "epoch": 9.317133189787757, "grad_norm": 0.251953125, "learning_rate": 3.4104938271604945e-06, "loss": 0.021, "step": 15145 }, { "epoch": 9.320209166410335, "grad_norm": 0.275390625, "learning_rate": 3.3950617283950617e-06, "loss": 0.0223, "step": 15150 }, { "epoch": 9.323285143032914, "grad_norm": 0.26171875, "learning_rate": 3.3796296296296297e-06, "loss": 0.0218, "step": 15155 }, { "epoch": 9.326361119655491, "grad_norm": 0.2421875, "learning_rate": 3.364197530864198e-06, "loss": 0.0242, "step": 15160 }, { "epoch": 9.329437096278069, "grad_norm": 0.306640625, "learning_rate": 3.3487654320987653e-06, "loss": 0.0214, "step": 15165 }, { "epoch": 9.332513072900646, "grad_norm": 0.27734375, "learning_rate": 3.3333333333333333e-06, "loss": 0.0238, "step": 15170 }, { "epoch": 9.335589049523223, "grad_norm": 0.287109375, "learning_rate": 3.3179012345679013e-06, "loss": 0.0218, "step": 15175 }, { "epoch": 9.3386650261458, "grad_norm": 0.306640625, "learning_rate": 3.302469135802469e-06, "loss": 0.0259, "step": 15180 }, { "epoch": 9.341741002768378, "grad_norm": 0.28125, "learning_rate": 3.287037037037037e-06, "loss": 0.0232, "step": 15185 }, { "epoch": 9.344816979390957, "grad_norm": 0.27734375, "learning_rate": 3.271604938271605e-06, "loss": 0.0226, "step": 15190 }, { "epoch": 9.347892956013535, "grad_norm": 0.328125, "learning_rate": 3.2561728395061734e-06, "loss": 0.0234, "step": 15195 }, { "epoch": 9.350968932636112, "grad_norm": 0.298828125, "learning_rate": 3.2407407407407406e-06, "loss": 0.0231, "step": 15200 }, { "epoch": 9.35404490925869, "grad_norm": 0.2451171875, "learning_rate": 3.2253086419753086e-06, "loss": 0.0227, "step": 15205 }, { "epoch": 9.357120885881267, "grad_norm": 0.2578125, "learning_rate": 3.209876543209877e-06, "loss": 0.0223, "step": 15210 }, { "epoch": 9.360196862503845, "grad_norm": 0.275390625, "learning_rate": 3.1944444444444443e-06, "loss": 0.0235, "step": 15215 }, { "epoch": 9.363272839126422, "grad_norm": 0.2412109375, "learning_rate": 3.1790123456790123e-06, "loss": 0.0199, "step": 15220 }, { "epoch": 9.366348815749001, "grad_norm": 0.251953125, "learning_rate": 3.1635802469135807e-06, "loss": 0.0225, "step": 15225 }, { "epoch": 9.369424792371579, "grad_norm": 0.27734375, "learning_rate": 3.148148148148148e-06, "loss": 0.0223, "step": 15230 }, { "epoch": 9.372500768994156, "grad_norm": 0.232421875, "learning_rate": 3.132716049382716e-06, "loss": 0.0225, "step": 15235 }, { "epoch": 9.375576745616733, "grad_norm": 0.25390625, "learning_rate": 3.1172839506172844e-06, "loss": 0.0225, "step": 15240 }, { "epoch": 9.37865272223931, "grad_norm": 0.265625, "learning_rate": 3.101851851851852e-06, "loss": 0.0225, "step": 15245 }, { "epoch": 9.381728698861888, "grad_norm": 0.279296875, "learning_rate": 3.0864197530864196e-06, "loss": 0.0241, "step": 15250 }, { "epoch": 9.384804675484466, "grad_norm": 0.259765625, "learning_rate": 3.070987654320988e-06, "loss": 0.0227, "step": 15255 }, { "epoch": 9.387880652107045, "grad_norm": 0.30078125, "learning_rate": 3.0555555555555556e-06, "loss": 0.0211, "step": 15260 }, { "epoch": 9.390956628729622, "grad_norm": 0.279296875, "learning_rate": 3.0401234567901236e-06, "loss": 0.022, "step": 15265 }, { "epoch": 9.3940326053522, "grad_norm": 0.275390625, "learning_rate": 3.0246913580246917e-06, "loss": 0.0209, "step": 15270 }, { "epoch": 9.397108581974777, "grad_norm": 0.328125, "learning_rate": 3.0092592592592593e-06, "loss": 0.0255, "step": 15275 }, { "epoch": 9.400184558597354, "grad_norm": 0.279296875, "learning_rate": 2.9938271604938273e-06, "loss": 0.0213, "step": 15280 }, { "epoch": 9.403260535219932, "grad_norm": 0.318359375, "learning_rate": 2.9783950617283953e-06, "loss": 0.0226, "step": 15285 }, { "epoch": 9.40633651184251, "grad_norm": 0.265625, "learning_rate": 2.9629629629629633e-06, "loss": 0.0225, "step": 15290 }, { "epoch": 9.409412488465088, "grad_norm": 0.271484375, "learning_rate": 2.947530864197531e-06, "loss": 0.022, "step": 15295 }, { "epoch": 9.412488465087666, "grad_norm": 0.3125, "learning_rate": 2.932098765432099e-06, "loss": 0.0233, "step": 15300 }, { "epoch": 9.415564441710243, "grad_norm": 0.2578125, "learning_rate": 2.916666666666667e-06, "loss": 0.0226, "step": 15305 }, { "epoch": 9.41864041833282, "grad_norm": 0.27734375, "learning_rate": 2.9012345679012346e-06, "loss": 0.0224, "step": 15310 }, { "epoch": 9.421716394955398, "grad_norm": 0.240234375, "learning_rate": 2.8858024691358026e-06, "loss": 0.0202, "step": 15315 }, { "epoch": 9.424792371577976, "grad_norm": 0.3046875, "learning_rate": 2.8703703703703706e-06, "loss": 0.0231, "step": 15320 }, { "epoch": 9.427868348200553, "grad_norm": 0.2314453125, "learning_rate": 2.8549382716049382e-06, "loss": 0.0223, "step": 15325 }, { "epoch": 9.430944324823132, "grad_norm": 0.26171875, "learning_rate": 2.8395061728395062e-06, "loss": 0.019, "step": 15330 }, { "epoch": 9.43402030144571, "grad_norm": 0.27734375, "learning_rate": 2.8240740740740743e-06, "loss": 0.022, "step": 15335 }, { "epoch": 9.437096278068287, "grad_norm": 0.28125, "learning_rate": 2.8086419753086423e-06, "loss": 0.0229, "step": 15340 }, { "epoch": 9.440172254690864, "grad_norm": 0.23828125, "learning_rate": 2.79320987654321e-06, "loss": 0.0219, "step": 15345 }, { "epoch": 9.443248231313442, "grad_norm": 0.314453125, "learning_rate": 2.777777777777778e-06, "loss": 0.0244, "step": 15350 }, { "epoch": 9.44632420793602, "grad_norm": 0.265625, "learning_rate": 2.762345679012346e-06, "loss": 0.0226, "step": 15355 }, { "epoch": 9.449400184558597, "grad_norm": 0.30078125, "learning_rate": 2.7469135802469135e-06, "loss": 0.0235, "step": 15360 }, { "epoch": 9.452476161181176, "grad_norm": 0.26953125, "learning_rate": 2.7314814814814816e-06, "loss": 0.0216, "step": 15365 }, { "epoch": 9.455552137803753, "grad_norm": 0.28125, "learning_rate": 2.7160493827160496e-06, "loss": 0.0238, "step": 15370 }, { "epoch": 9.45862811442633, "grad_norm": 0.296875, "learning_rate": 2.700617283950617e-06, "loss": 0.0229, "step": 15375 }, { "epoch": 9.461704091048908, "grad_norm": 0.302734375, "learning_rate": 2.685185185185185e-06, "loss": 0.0215, "step": 15380 }, { "epoch": 9.464780067671485, "grad_norm": 0.255859375, "learning_rate": 2.6697530864197532e-06, "loss": 0.0199, "step": 15385 }, { "epoch": 9.467856044294063, "grad_norm": 0.28125, "learning_rate": 2.6543209876543212e-06, "loss": 0.0241, "step": 15390 }, { "epoch": 9.47093202091664, "grad_norm": 0.25390625, "learning_rate": 2.638888888888889e-06, "loss": 0.0206, "step": 15395 }, { "epoch": 9.47400799753922, "grad_norm": 0.283203125, "learning_rate": 2.623456790123457e-06, "loss": 0.0215, "step": 15400 }, { "epoch": 9.477083974161797, "grad_norm": 0.2421875, "learning_rate": 2.608024691358025e-06, "loss": 0.0231, "step": 15405 }, { "epoch": 9.480159950784374, "grad_norm": 0.2294921875, "learning_rate": 2.5925925925925925e-06, "loss": 0.0203, "step": 15410 }, { "epoch": 9.483235927406952, "grad_norm": 0.30859375, "learning_rate": 2.577160493827161e-06, "loss": 0.0248, "step": 15415 }, { "epoch": 9.48631190402953, "grad_norm": 0.29296875, "learning_rate": 2.5617283950617285e-06, "loss": 0.0243, "step": 15420 }, { "epoch": 9.489387880652107, "grad_norm": 0.25390625, "learning_rate": 2.546296296296296e-06, "loss": 0.0209, "step": 15425 }, { "epoch": 9.492463857274684, "grad_norm": 0.255859375, "learning_rate": 2.5308641975308646e-06, "loss": 0.0211, "step": 15430 }, { "epoch": 9.495539833897263, "grad_norm": 0.279296875, "learning_rate": 2.515432098765432e-06, "loss": 0.0245, "step": 15435 }, { "epoch": 9.49861581051984, "grad_norm": 0.251953125, "learning_rate": 2.5e-06, "loss": 0.0213, "step": 15440 }, { "epoch": 9.501691787142418, "grad_norm": 0.22265625, "learning_rate": 2.4845679012345682e-06, "loss": 0.0231, "step": 15445 }, { "epoch": 9.504767763764995, "grad_norm": 0.296875, "learning_rate": 2.469135802469136e-06, "loss": 0.0251, "step": 15450 }, { "epoch": 9.507843740387573, "grad_norm": 0.2578125, "learning_rate": 2.453703703703704e-06, "loss": 0.0213, "step": 15455 }, { "epoch": 9.51091971701015, "grad_norm": 0.26171875, "learning_rate": 2.4382716049382714e-06, "loss": 0.0213, "step": 15460 }, { "epoch": 9.513995693632728, "grad_norm": 0.275390625, "learning_rate": 2.42283950617284e-06, "loss": 0.0226, "step": 15465 }, { "epoch": 9.517071670255307, "grad_norm": 0.291015625, "learning_rate": 2.4074074074074075e-06, "loss": 0.0229, "step": 15470 }, { "epoch": 9.520147646877884, "grad_norm": 0.2490234375, "learning_rate": 2.391975308641975e-06, "loss": 0.0217, "step": 15475 }, { "epoch": 9.523223623500462, "grad_norm": 0.248046875, "learning_rate": 2.3765432098765435e-06, "loss": 0.0217, "step": 15480 }, { "epoch": 9.52629960012304, "grad_norm": 0.287109375, "learning_rate": 2.361111111111111e-06, "loss": 0.023, "step": 15485 }, { "epoch": 9.529375576745617, "grad_norm": 0.291015625, "learning_rate": 2.345679012345679e-06, "loss": 0.0233, "step": 15490 }, { "epoch": 9.532451553368194, "grad_norm": 0.3046875, "learning_rate": 2.330246913580247e-06, "loss": 0.0227, "step": 15495 }, { "epoch": 9.535527529990771, "grad_norm": 0.271484375, "learning_rate": 2.3148148148148148e-06, "loss": 0.0227, "step": 15500 }, { "epoch": 9.53860350661335, "grad_norm": 0.294921875, "learning_rate": 2.299382716049383e-06, "loss": 0.0259, "step": 15505 }, { "epoch": 9.541679483235928, "grad_norm": 0.26953125, "learning_rate": 2.283950617283951e-06, "loss": 0.023, "step": 15510 }, { "epoch": 9.544755459858505, "grad_norm": 0.283203125, "learning_rate": 2.268518518518519e-06, "loss": 0.0208, "step": 15515 }, { "epoch": 9.547831436481083, "grad_norm": 0.2734375, "learning_rate": 2.2530864197530865e-06, "loss": 0.0233, "step": 15520 }, { "epoch": 9.55090741310366, "grad_norm": 0.28515625, "learning_rate": 2.2376543209876545e-06, "loss": 0.0214, "step": 15525 }, { "epoch": 9.553983389726238, "grad_norm": 0.283203125, "learning_rate": 2.2222222222222225e-06, "loss": 0.0217, "step": 15530 }, { "epoch": 9.557059366348815, "grad_norm": 0.275390625, "learning_rate": 2.20679012345679e-06, "loss": 0.0233, "step": 15535 }, { "epoch": 9.560135342971394, "grad_norm": 0.265625, "learning_rate": 2.191358024691358e-06, "loss": 0.0213, "step": 15540 }, { "epoch": 9.563211319593972, "grad_norm": 0.26953125, "learning_rate": 2.175925925925926e-06, "loss": 0.0219, "step": 15545 }, { "epoch": 9.566287296216549, "grad_norm": 0.294921875, "learning_rate": 2.1604938271604937e-06, "loss": 0.0246, "step": 15550 }, { "epoch": 9.569363272839126, "grad_norm": 0.2421875, "learning_rate": 2.1450617283950618e-06, "loss": 0.0223, "step": 15555 }, { "epoch": 9.572439249461704, "grad_norm": 0.27734375, "learning_rate": 2.1296296296296298e-06, "loss": 0.0223, "step": 15560 }, { "epoch": 9.575515226084281, "grad_norm": 0.296875, "learning_rate": 2.114197530864198e-06, "loss": 0.023, "step": 15565 }, { "epoch": 9.578591202706859, "grad_norm": 0.26171875, "learning_rate": 2.0987654320987654e-06, "loss": 0.0213, "step": 15570 }, { "epoch": 9.581667179329438, "grad_norm": 0.279296875, "learning_rate": 2.0833333333333334e-06, "loss": 0.025, "step": 15575 }, { "epoch": 9.584743155952015, "grad_norm": 0.259765625, "learning_rate": 2.0679012345679015e-06, "loss": 0.0215, "step": 15580 }, { "epoch": 9.587819132574593, "grad_norm": 0.259765625, "learning_rate": 2.052469135802469e-06, "loss": 0.0231, "step": 15585 }, { "epoch": 9.59089510919717, "grad_norm": 0.271484375, "learning_rate": 2.0370370370370375e-06, "loss": 0.0234, "step": 15590 }, { "epoch": 9.593971085819748, "grad_norm": 0.259765625, "learning_rate": 2.021604938271605e-06, "loss": 0.0225, "step": 15595 }, { "epoch": 9.597047062442325, "grad_norm": 0.2578125, "learning_rate": 2.0061728395061727e-06, "loss": 0.0241, "step": 15600 }, { "epoch": 9.600123039064902, "grad_norm": 0.2216796875, "learning_rate": 1.9907407407407407e-06, "loss": 0.0208, "step": 15605 }, { "epoch": 9.603199015687482, "grad_norm": 0.28515625, "learning_rate": 1.9753086419753087e-06, "loss": 0.022, "step": 15610 }, { "epoch": 9.606274992310059, "grad_norm": 0.28515625, "learning_rate": 1.9598765432098768e-06, "loss": 0.0219, "step": 15615 }, { "epoch": 9.609350968932636, "grad_norm": 0.263671875, "learning_rate": 1.9444444444444444e-06, "loss": 0.0209, "step": 15620 }, { "epoch": 9.612426945555214, "grad_norm": 0.28125, "learning_rate": 1.9290123456790124e-06, "loss": 0.0246, "step": 15625 }, { "epoch": 9.615502922177791, "grad_norm": 0.291015625, "learning_rate": 1.9135802469135804e-06, "loss": 0.023, "step": 15630 }, { "epoch": 9.618578898800369, "grad_norm": 0.26171875, "learning_rate": 1.8981481481481482e-06, "loss": 0.0246, "step": 15635 }, { "epoch": 9.621654875422946, "grad_norm": 0.275390625, "learning_rate": 1.8827160493827162e-06, "loss": 0.0202, "step": 15640 }, { "epoch": 9.624730852045525, "grad_norm": 0.267578125, "learning_rate": 1.867283950617284e-06, "loss": 0.0222, "step": 15645 }, { "epoch": 9.627806828668103, "grad_norm": 0.25, "learning_rate": 1.8518518518518519e-06, "loss": 0.0219, "step": 15650 }, { "epoch": 9.63088280529068, "grad_norm": 0.318359375, "learning_rate": 1.8364197530864199e-06, "loss": 0.0216, "step": 15655 }, { "epoch": 9.633958781913257, "grad_norm": 0.28125, "learning_rate": 1.8209876543209877e-06, "loss": 0.0236, "step": 15660 }, { "epoch": 9.637034758535835, "grad_norm": 0.279296875, "learning_rate": 1.8055555555555555e-06, "loss": 0.0229, "step": 15665 }, { "epoch": 9.640110735158412, "grad_norm": 0.26953125, "learning_rate": 1.7901234567901235e-06, "loss": 0.0235, "step": 15670 }, { "epoch": 9.64318671178099, "grad_norm": 0.28125, "learning_rate": 1.7746913580246913e-06, "loss": 0.0242, "step": 15675 }, { "epoch": 9.646262688403569, "grad_norm": 0.27734375, "learning_rate": 1.7592592592592594e-06, "loss": 0.022, "step": 15680 }, { "epoch": 9.649338665026146, "grad_norm": 0.318359375, "learning_rate": 1.7438271604938272e-06, "loss": 0.0242, "step": 15685 }, { "epoch": 9.652414641648724, "grad_norm": 0.259765625, "learning_rate": 1.728395061728395e-06, "loss": 0.0227, "step": 15690 }, { "epoch": 9.655490618271301, "grad_norm": 0.27734375, "learning_rate": 1.712962962962963e-06, "loss": 0.0234, "step": 15695 }, { "epoch": 9.658566594893879, "grad_norm": 0.279296875, "learning_rate": 1.6975308641975308e-06, "loss": 0.0236, "step": 15700 }, { "epoch": 9.661642571516456, "grad_norm": 0.228515625, "learning_rate": 1.682098765432099e-06, "loss": 0.0214, "step": 15705 }, { "epoch": 9.664718548139033, "grad_norm": 0.27734375, "learning_rate": 1.6666666666666667e-06, "loss": 0.0235, "step": 15710 }, { "epoch": 9.667794524761613, "grad_norm": 0.27734375, "learning_rate": 1.6512345679012345e-06, "loss": 0.0241, "step": 15715 }, { "epoch": 9.67087050138419, "grad_norm": 0.30078125, "learning_rate": 1.6358024691358025e-06, "loss": 0.0224, "step": 15720 }, { "epoch": 9.673946478006767, "grad_norm": 0.263671875, "learning_rate": 1.6203703703703703e-06, "loss": 0.0209, "step": 15725 }, { "epoch": 9.677022454629345, "grad_norm": 0.3046875, "learning_rate": 1.6049382716049385e-06, "loss": 0.0231, "step": 15730 }, { "epoch": 9.680098431251922, "grad_norm": 0.2578125, "learning_rate": 1.5895061728395061e-06, "loss": 0.0239, "step": 15735 }, { "epoch": 9.6831744078745, "grad_norm": 0.2578125, "learning_rate": 1.574074074074074e-06, "loss": 0.0217, "step": 15740 }, { "epoch": 9.686250384497077, "grad_norm": 0.248046875, "learning_rate": 1.5586419753086422e-06, "loss": 0.0247, "step": 15745 }, { "epoch": 9.689326361119656, "grad_norm": 0.2431640625, "learning_rate": 1.5432098765432098e-06, "loss": 0.0205, "step": 15750 }, { "epoch": 9.692402337742234, "grad_norm": 0.2255859375, "learning_rate": 1.5277777777777778e-06, "loss": 0.0215, "step": 15755 }, { "epoch": 9.695478314364811, "grad_norm": 0.30859375, "learning_rate": 1.5123456790123458e-06, "loss": 0.0244, "step": 15760 }, { "epoch": 9.698554290987389, "grad_norm": 0.26953125, "learning_rate": 1.4969135802469136e-06, "loss": 0.0216, "step": 15765 }, { "epoch": 9.701630267609966, "grad_norm": 0.294921875, "learning_rate": 1.4814814814814817e-06, "loss": 0.0247, "step": 15770 }, { "epoch": 9.704706244232543, "grad_norm": 0.330078125, "learning_rate": 1.4660493827160495e-06, "loss": 0.025, "step": 15775 }, { "epoch": 9.70778222085512, "grad_norm": 0.275390625, "learning_rate": 1.4506172839506173e-06, "loss": 0.0239, "step": 15780 }, { "epoch": 9.7108581974777, "grad_norm": 0.3125, "learning_rate": 1.4351851851851853e-06, "loss": 0.0247, "step": 15785 }, { "epoch": 9.713934174100277, "grad_norm": 0.326171875, "learning_rate": 1.4197530864197531e-06, "loss": 0.0229, "step": 15790 }, { "epoch": 9.717010150722855, "grad_norm": 0.287109375, "learning_rate": 1.4043209876543211e-06, "loss": 0.0237, "step": 15795 }, { "epoch": 9.720086127345432, "grad_norm": 0.29296875, "learning_rate": 1.388888888888889e-06, "loss": 0.0243, "step": 15800 }, { "epoch": 9.72316210396801, "grad_norm": 0.302734375, "learning_rate": 1.3734567901234568e-06, "loss": 0.0231, "step": 15805 }, { "epoch": 9.726238080590587, "grad_norm": 0.291015625, "learning_rate": 1.3580246913580248e-06, "loss": 0.0249, "step": 15810 }, { "epoch": 9.729314057213164, "grad_norm": 0.29296875, "learning_rate": 1.3425925925925926e-06, "loss": 0.0235, "step": 15815 }, { "epoch": 9.732390033835744, "grad_norm": 0.296875, "learning_rate": 1.3271604938271606e-06, "loss": 0.0219, "step": 15820 }, { "epoch": 9.735466010458321, "grad_norm": 0.310546875, "learning_rate": 1.3117283950617284e-06, "loss": 0.0247, "step": 15825 }, { "epoch": 9.738541987080898, "grad_norm": 0.28515625, "learning_rate": 1.2962962962962962e-06, "loss": 0.0231, "step": 15830 }, { "epoch": 9.741617963703476, "grad_norm": 0.28515625, "learning_rate": 1.2808641975308643e-06, "loss": 0.0228, "step": 15835 }, { "epoch": 9.744693940326053, "grad_norm": 0.23828125, "learning_rate": 1.2654320987654323e-06, "loss": 0.0214, "step": 15840 }, { "epoch": 9.74776991694863, "grad_norm": 0.296875, "learning_rate": 1.25e-06, "loss": 0.0237, "step": 15845 }, { "epoch": 9.750845893571208, "grad_norm": 0.298828125, "learning_rate": 1.234567901234568e-06, "loss": 0.0222, "step": 15850 }, { "epoch": 9.753921870193787, "grad_norm": 0.296875, "learning_rate": 1.2191358024691357e-06, "loss": 0.0226, "step": 15855 }, { "epoch": 9.756997846816365, "grad_norm": 0.279296875, "learning_rate": 1.2037037037037037e-06, "loss": 0.0251, "step": 15860 }, { "epoch": 9.760073823438942, "grad_norm": 0.25390625, "learning_rate": 1.1882716049382718e-06, "loss": 0.0223, "step": 15865 }, { "epoch": 9.76314980006152, "grad_norm": 0.2373046875, "learning_rate": 1.1728395061728396e-06, "loss": 0.0224, "step": 15870 }, { "epoch": 9.766225776684097, "grad_norm": 0.28125, "learning_rate": 1.1574074074074074e-06, "loss": 0.0203, "step": 15875 }, { "epoch": 9.769301753306674, "grad_norm": 0.275390625, "learning_rate": 1.1419753086419754e-06, "loss": 0.0219, "step": 15880 }, { "epoch": 9.772377729929252, "grad_norm": 0.2490234375, "learning_rate": 1.1265432098765432e-06, "loss": 0.0231, "step": 15885 }, { "epoch": 9.775453706551831, "grad_norm": 0.263671875, "learning_rate": 1.1111111111111112e-06, "loss": 0.0235, "step": 15890 }, { "epoch": 9.778529683174408, "grad_norm": 0.28125, "learning_rate": 1.095679012345679e-06, "loss": 0.0233, "step": 15895 }, { "epoch": 9.781605659796986, "grad_norm": 0.26953125, "learning_rate": 1.0802469135802469e-06, "loss": 0.0218, "step": 15900 }, { "epoch": 9.784681636419563, "grad_norm": 0.3046875, "learning_rate": 1.0648148148148149e-06, "loss": 0.0226, "step": 15905 }, { "epoch": 9.78775761304214, "grad_norm": 0.275390625, "learning_rate": 1.0493827160493827e-06, "loss": 0.0214, "step": 15910 }, { "epoch": 9.790833589664718, "grad_norm": 0.2421875, "learning_rate": 1.0339506172839507e-06, "loss": 0.0236, "step": 15915 }, { "epoch": 9.793909566287295, "grad_norm": 0.2578125, "learning_rate": 1.0185185185185188e-06, "loss": 0.0223, "step": 15920 }, { "epoch": 9.796985542909875, "grad_norm": 0.2734375, "learning_rate": 1.0030864197530864e-06, "loss": 0.0238, "step": 15925 }, { "epoch": 9.800061519532452, "grad_norm": 0.26953125, "learning_rate": 9.876543209876544e-07, "loss": 0.0236, "step": 15930 }, { "epoch": 9.80313749615503, "grad_norm": 0.259765625, "learning_rate": 9.722222222222222e-07, "loss": 0.0224, "step": 15935 }, { "epoch": 9.806213472777607, "grad_norm": 0.251953125, "learning_rate": 9.567901234567902e-07, "loss": 0.0236, "step": 15940 }, { "epoch": 9.809289449400184, "grad_norm": 0.302734375, "learning_rate": 9.413580246913581e-07, "loss": 0.0249, "step": 15945 }, { "epoch": 9.812365426022762, "grad_norm": 0.294921875, "learning_rate": 9.259259259259259e-07, "loss": 0.0249, "step": 15950 }, { "epoch": 9.81544140264534, "grad_norm": 0.259765625, "learning_rate": 9.104938271604939e-07, "loss": 0.0219, "step": 15955 }, { "epoch": 9.818517379267918, "grad_norm": 0.263671875, "learning_rate": 8.950617283950618e-07, "loss": 0.022, "step": 15960 }, { "epoch": 9.821593355890496, "grad_norm": 0.255859375, "learning_rate": 8.796296296296297e-07, "loss": 0.0245, "step": 15965 }, { "epoch": 9.824669332513073, "grad_norm": 0.294921875, "learning_rate": 8.641975308641975e-07, "loss": 0.0215, "step": 15970 }, { "epoch": 9.82774530913565, "grad_norm": 0.271484375, "learning_rate": 8.487654320987654e-07, "loss": 0.0215, "step": 15975 }, { "epoch": 9.830821285758228, "grad_norm": 0.2890625, "learning_rate": 8.333333333333333e-07, "loss": 0.0251, "step": 15980 }, { "epoch": 9.833897262380805, "grad_norm": 0.291015625, "learning_rate": 8.179012345679012e-07, "loss": 0.0221, "step": 15985 }, { "epoch": 9.836973239003383, "grad_norm": 0.2294921875, "learning_rate": 8.024691358024693e-07, "loss": 0.0217, "step": 15990 }, { "epoch": 9.840049215625962, "grad_norm": 0.275390625, "learning_rate": 7.87037037037037e-07, "loss": 0.0246, "step": 15995 }, { "epoch": 9.84312519224854, "grad_norm": 0.318359375, "learning_rate": 7.716049382716049e-07, "loss": 0.0238, "step": 16000 }, { "epoch": 9.846201168871117, "grad_norm": 0.25, "learning_rate": 7.561728395061729e-07, "loss": 0.0217, "step": 16005 }, { "epoch": 9.849277145493694, "grad_norm": 0.263671875, "learning_rate": 7.407407407407408e-07, "loss": 0.0226, "step": 16010 }, { "epoch": 9.852353122116272, "grad_norm": 0.26953125, "learning_rate": 7.253086419753086e-07, "loss": 0.0228, "step": 16015 }, { "epoch": 9.855429098738849, "grad_norm": 0.28515625, "learning_rate": 7.098765432098766e-07, "loss": 0.0237, "step": 16020 }, { "epoch": 9.858505075361427, "grad_norm": 0.314453125, "learning_rate": 6.944444444444445e-07, "loss": 0.0234, "step": 16025 }, { "epoch": 9.861581051984006, "grad_norm": 0.251953125, "learning_rate": 6.790123456790124e-07, "loss": 0.0207, "step": 16030 }, { "epoch": 9.864657028606583, "grad_norm": 0.3125, "learning_rate": 6.635802469135803e-07, "loss": 0.0231, "step": 16035 }, { "epoch": 9.86773300522916, "grad_norm": 0.25390625, "learning_rate": 6.481481481481481e-07, "loss": 0.0226, "step": 16040 }, { "epoch": 9.870808981851738, "grad_norm": 0.248046875, "learning_rate": 6.327160493827161e-07, "loss": 0.0225, "step": 16045 }, { "epoch": 9.873884958474315, "grad_norm": 0.283203125, "learning_rate": 6.17283950617284e-07, "loss": 0.0212, "step": 16050 }, { "epoch": 9.876960935096893, "grad_norm": 0.24609375, "learning_rate": 6.018518518518519e-07, "loss": 0.0228, "step": 16055 }, { "epoch": 9.88003691171947, "grad_norm": 0.263671875, "learning_rate": 5.864197530864198e-07, "loss": 0.0231, "step": 16060 }, { "epoch": 9.88311288834205, "grad_norm": 0.259765625, "learning_rate": 5.709876543209877e-07, "loss": 0.0218, "step": 16065 }, { "epoch": 9.886188864964627, "grad_norm": 0.28515625, "learning_rate": 5.555555555555556e-07, "loss": 0.0234, "step": 16070 }, { "epoch": 9.889264841587204, "grad_norm": 0.29296875, "learning_rate": 5.401234567901234e-07, "loss": 0.0222, "step": 16075 }, { "epoch": 9.892340818209782, "grad_norm": 0.2314453125, "learning_rate": 5.246913580246914e-07, "loss": 0.0217, "step": 16080 }, { "epoch": 9.895416794832359, "grad_norm": 0.26953125, "learning_rate": 5.092592592592594e-07, "loss": 0.0232, "step": 16085 }, { "epoch": 9.898492771454936, "grad_norm": 0.27734375, "learning_rate": 4.938271604938272e-07, "loss": 0.0237, "step": 16090 }, { "epoch": 9.901568748077514, "grad_norm": 0.275390625, "learning_rate": 4.783950617283951e-07, "loss": 0.0236, "step": 16095 }, { "epoch": 9.904644724700093, "grad_norm": 0.287109375, "learning_rate": 4.6296296296296297e-07, "loss": 0.0227, "step": 16100 }, { "epoch": 9.90772070132267, "grad_norm": 0.28125, "learning_rate": 4.475308641975309e-07, "loss": 0.0239, "step": 16105 }, { "epoch": 9.910796677945248, "grad_norm": 0.26171875, "learning_rate": 4.3209876543209875e-07, "loss": 0.0219, "step": 16110 }, { "epoch": 9.913872654567825, "grad_norm": 0.22265625, "learning_rate": 4.1666666666666667e-07, "loss": 0.0224, "step": 16115 }, { "epoch": 9.916948631190403, "grad_norm": 0.2373046875, "learning_rate": 4.0123456790123464e-07, "loss": 0.0219, "step": 16120 }, { "epoch": 9.92002460781298, "grad_norm": 0.2578125, "learning_rate": 3.8580246913580245e-07, "loss": 0.0227, "step": 16125 }, { "epoch": 9.923100584435558, "grad_norm": 0.27734375, "learning_rate": 3.703703703703704e-07, "loss": 0.0211, "step": 16130 }, { "epoch": 9.926176561058137, "grad_norm": 0.23046875, "learning_rate": 3.549382716049383e-07, "loss": 0.0197, "step": 16135 }, { "epoch": 9.929252537680714, "grad_norm": 0.298828125, "learning_rate": 3.395061728395062e-07, "loss": 0.022, "step": 16140 }, { "epoch": 9.932328514303292, "grad_norm": 0.298828125, "learning_rate": 3.2407407407407406e-07, "loss": 0.0232, "step": 16145 }, { "epoch": 9.935404490925869, "grad_norm": 0.2470703125, "learning_rate": 3.08641975308642e-07, "loss": 0.021, "step": 16150 }, { "epoch": 9.938480467548446, "grad_norm": 0.26171875, "learning_rate": 2.932098765432099e-07, "loss": 0.0237, "step": 16155 }, { "epoch": 9.941556444171024, "grad_norm": 0.26953125, "learning_rate": 2.777777777777778e-07, "loss": 0.0221, "step": 16160 }, { "epoch": 9.944632420793601, "grad_norm": 0.28125, "learning_rate": 2.623456790123457e-07, "loss": 0.0224, "step": 16165 }, { "epoch": 9.94770839741618, "grad_norm": 0.27734375, "learning_rate": 2.469135802469136e-07, "loss": 0.0215, "step": 16170 }, { "epoch": 9.950784374038758, "grad_norm": 0.2451171875, "learning_rate": 2.3148148148148148e-07, "loss": 0.0227, "step": 16175 }, { "epoch": 9.953860350661335, "grad_norm": 0.2314453125, "learning_rate": 2.1604938271604937e-07, "loss": 0.022, "step": 16180 }, { "epoch": 9.956936327283913, "grad_norm": 0.291015625, "learning_rate": 2.0061728395061732e-07, "loss": 0.0226, "step": 16185 }, { "epoch": 9.96001230390649, "grad_norm": 0.294921875, "learning_rate": 1.851851851851852e-07, "loss": 0.0218, "step": 16190 }, { "epoch": 9.963088280529067, "grad_norm": 0.294921875, "learning_rate": 1.697530864197531e-07, "loss": 0.0226, "step": 16195 }, { "epoch": 9.966164257151645, "grad_norm": 0.2373046875, "learning_rate": 1.54320987654321e-07, "loss": 0.0229, "step": 16200 }, { "epoch": 9.969240233774224, "grad_norm": 0.279296875, "learning_rate": 1.388888888888889e-07, "loss": 0.0221, "step": 16205 }, { "epoch": 9.972316210396801, "grad_norm": 0.318359375, "learning_rate": 1.234567901234568e-07, "loss": 0.0241, "step": 16210 }, { "epoch": 9.975392187019379, "grad_norm": 0.3046875, "learning_rate": 1.0802469135802469e-07, "loss": 0.0225, "step": 16215 }, { "epoch": 9.978468163641956, "grad_norm": 0.28125, "learning_rate": 9.25925925925926e-08, "loss": 0.0221, "step": 16220 }, { "epoch": 9.981544140264534, "grad_norm": 0.287109375, "learning_rate": 7.71604938271605e-08, "loss": 0.0225, "step": 16225 }, { "epoch": 9.984620116887111, "grad_norm": 0.294921875, "learning_rate": 6.17283950617284e-08, "loss": 0.0235, "step": 16230 }, { "epoch": 9.987696093509689, "grad_norm": 0.2490234375, "learning_rate": 4.62962962962963e-08, "loss": 0.0244, "step": 16235 }, { "epoch": 9.990772070132268, "grad_norm": 0.251953125, "learning_rate": 3.08641975308642e-08, "loss": 0.0229, "step": 16240 }, { "epoch": 9.993848046754845, "grad_norm": 0.26953125, "learning_rate": 1.54320987654321e-08, "loss": 0.0212, "step": 16245 }, { "epoch": 9.996924023377423, "grad_norm": 0.25, "learning_rate": 0.0, "loss": 0.0208, "step": 16250 } ], "logging_steps": 5, "max_steps": 16250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.6200174364980675e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }